using System.Net;
using Microsoft.Extensions.Options;
using ScrapperAPI.Interfaces;
using ScrapperAPI.Options;

namespace ScrapperAPI.Services;

/// <summary>
/// Downloads page content over HTTP GET, applying a per-host rate limit before
/// every attempt and retrying transient failures with exponential backoff and jitter.
/// </summary>
public sealed class ScraperHttpClient : IScraperHttpClient
{
    private readonly IHttpClientFactory _httpClientFactory;
    private readonly IDomainRateLimiter _rateLimiter;
    private readonly ILogger<ScraperHttpClient> _logger;
    private readonly ScraperOptions _opts;

    /// <summary>
    /// Creates the client from its collaborators; retry settings are read from
    /// <paramref name="options"/> once, at construction time.
    /// </summary>
    public ScraperHttpClient(
        IHttpClientFactory httpClientFactory,
        IDomainRateLimiter rateLimiter,
        ILogger<ScraperHttpClient> logger,
        IOptions<ScraperOptions> options)
    {
        _httpClientFactory = httpClientFactory;
        _rateLimiter = rateLimiter;
        _logger = logger;
        _opts = options.Value;
    }

    /// <summary>
    /// Issues a GET for <paramref name="url"/> and returns the response body as a string.
    /// Transient failures (see <see cref="IsTransientStatus"/> and
    /// <see cref="IsTransientException"/>) are retried up to the configured number of attempts.
    /// </summary>
    /// <param name="url">Absolute http/https URL to fetch.</param>
    /// <param name="ct">Token that cancels the rate-limit wait, the request and the backoff delay.</param>
    /// <returns>The response body.</returns>
    /// <exception cref="ArgumentException"><paramref name="url"/> is not an absolute http/https URL.</exception>
    /// <exception cref="HttpRequestException">
    /// The request failed with a non-transient error, or the last attempt still failed.
    /// </exception>
    /// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled.</exception>
    public async Task<string> GetStringWithRetryAsync(string url, CancellationToken ct)
    {
        // On Unix a rooted path such as "/foo" parses as an absolute file: URI, so the
        // scheme has to be checked explicitly; otherwise the failure would only surface
        // inside the HTTP handler, after the rate-limiter wait.
        if (!Uri.TryCreate(url, UriKind.Absolute, out var uri)
            || (uri.Scheme != Uri.UriSchemeHttp && uri.Scheme != Uri.UriSchemeHttps))
        {
            throw new ArgumentException("Invalid URL", nameof(url));
        }

        var host = uri.Host;
        var http = _httpClientFactory.CreateClient("scraper");

        // Normalize configuration so the loop always runs at least once and the
        // delay bounds are non-negative and ordered.
        var maxAttempts = Math.Max(1, _opts.Retry.MaxAttempts);
        var baseDelay = Math.Max(0, _opts.Retry.BaseDelayMs);
        var maxDelay = Math.Max(baseDelay, _opts.Retry.MaxDelayMs);

        for (var attempt = 1; attempt <= maxAttempts; attempt++)
        {
            ct.ThrowIfCancellationRequested();

            var isLastAttempt = attempt >= maxAttempts;

            // Per-host rate limit before starting the request.
            var before = DateTimeOffset.UtcNow;
            await _rateLimiter.WaitAsync(host, ct);
            var waitedMs = (int)(DateTimeOffset.UtcNow - before).TotalMilliseconds;
            if (waitedMs > 0)
            {
                _logger.LogDebug("RateLimit applied: waited {WaitedMs}ms for host {Host}", waitedMs, host);
            }

            Exception failure;
            HttpStatusCode? failureStatus = null;

            try
            {
                using var req = new HttpRequestMessage(HttpMethod.Get, uri);
                req.Headers.UserAgent.ParseAdd("webscrapper/1.0");
                req.Headers.Accept.ParseAdd("text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");

                using var resp = await http.SendAsync(req, HttpCompletionOption.ResponseHeadersRead, ct);

                if (!IsTransientStatus(resp.StatusCode))
                {
                    resp.EnsureSuccessStatusCode();
                    return await resp.Content.ReadAsStringAsync(ct);
                }

                failureStatus = resp.StatusCode;
                failure = new HttpRequestException(
                    $"Transient status code {(int)resp.StatusCode} ({resp.StatusCode})",
                    null,
                    resp.StatusCode);

                if (isLastAttempt)
                {
                    // No retry left: fail right away instead of logging "Retrying" and sleeping.
                    throw failure;
                }
            }
            catch (Exception ex) when (!isLastAttempt && IsTransientException(ex, ct))
            {
                // On the last attempt the filter is false, so the exception propagates
                // from where it was thrown with its stack trace intact.
                failure = ex;
            }

            // The response has been disposed by now, so nothing is held open during the backoff.
            await LogAndDelayRetryAsync(url, host, attempt, maxAttempts, failure, baseDelay, maxDelay, ct, failureStatus);
        }

        // Unreachable: the last iteration always returns or throws.
        throw new InvalidOperationException("Request failed");
    }

    /// <summary>
    /// Returns true for status codes worth retrying:
    /// 408 Request Timeout, 429 Too Many Requests and any 5xx server error.
    /// </summary>
    private static bool IsTransientStatus(HttpStatusCode statusCode)
    {
        var code = (int)statusCode;
        return code == 408 || code == 429 || (code >= 500 && code <= 599);
    }

    /// <summary>
    /// Decides whether <paramref name="ex"/> should trigger a retry.
    /// </summary>
    /// <remarks>
    /// A cancellation requested through <paramref name="ct"/> is never retried.
    /// An <see cref="HttpRequestException"/> that carries a status code (for example the
    /// one thrown by <c>EnsureSuccessStatusCode</c>) is retried only when that status is
    /// transient, so responses such as 404 fail immediately. An
    /// <see cref="HttpRequestException"/> without a status code (DNS, socket, etc.) and a
    /// <see cref="TaskCanceledException"/> not caused by <paramref name="ct"/> (typically an
    /// HttpClient timeout) are retried.
    /// </remarks>
    private static bool IsTransientException(Exception ex, CancellationToken ct)
    {
        if (ex is OperationCanceledException && ct.IsCancellationRequested)
        {
            return false;
        }

        return ex switch
        {
            HttpRequestException { StatusCode: { } status } => IsTransientStatus(status),
            HttpRequestException => true,
            TaskCanceledException => true,
            _ => false,
        };
    }

    /// <summary>
    /// Logs a warning describing the upcoming retry and then waits for the computed backoff.
    /// </summary>
    /// <param name="statusCode">
    /// The transient HTTP status that caused the retry, or null when it was caused by an exception.
    /// </param>
    private async Task LogAndDelayRetryAsync(
        string url,
        string host,
        int attempt,
        int maxAttempts,
        Exception ex,
        int baseDelayMs,
        int maxDelayMs,
        CancellationToken ct,
        HttpStatusCode? statusCode)
    {
        var delayMs = ComputeBackoffWithJitterMs(attempt, baseDelayMs, maxDelayMs);

        if (statusCode is not null)
        {
            _logger.LogWarning(
                ex,
                "Retrying ({Attempt}/{MaxAttempts}) in {DelayMs}ms due to status {StatusCode} for host {Host}. Url={Url}",
                attempt, maxAttempts, delayMs, (int)statusCode.Value, host, url);
        }
        else
        {
            _logger.LogWarning(
                ex,
                "Retrying ({Attempt}/{MaxAttempts}) in {DelayMs}ms due to transient error for host {Host}. Url={Url}",
                attempt, maxAttempts, delayMs, host, url);
        }

        await Task.Delay(delayMs, ct);
    }

    /// <summary>
    /// Exponential backoff: base * 2^(attempt-1) plus a random jitter in [0..base),
    /// with the result clamped to [0..maxDelayMs].
    /// </summary>
    private static int ComputeBackoffWithJitterMs(int attempt, int baseDelayMs, int maxDelayMs)
    {
        var shift = Math.Clamp(attempt - 1, 0, 30);

        // 64-bit arithmetic: an int base shifted by up to 30 bits always fits in a long,
        // whereas the int product overflows and can turn negative.
        long exponential = (long)baseDelayMs << shift;
        long ceiling = Math.Max(0L, maxDelayMs);
        long capped = Math.Min(exponential, ceiling);
        int jitter = Random.Shared.Next(0, Math.Max(1, baseDelayMs));

        // The result never exceeds maxDelayMs, so narrowing back to int is safe.
        return (int)Math.Clamp(capped + jitter, 0L, ceiling);
    }
}