1
0
voyager-api/ScrapperAPI/Services/ScraperHttpClient.cs

146 lines
5.2 KiB
C#

using System.Net;
using Microsoft.Extensions.Options;
using ScrapperAPI.Interfaces;
using ScrapperAPI.Options;
namespace ScrapperAPI.Services;
/// <summary>
/// Fetches page content over HTTP for the scraper. Each attempt first waits on the
/// per-host rate limiter, and transient failures are retried with exponential
/// backoff plus jitter, driven by <see cref="ScraperOptions"/>.
/// </summary>
public sealed class ScraperHttpClient : IScraperHttpClient
{
    private readonly IHttpClientFactory _httpClientFactory;
    private readonly IDomainRateLimiter _rateLimiter;
    private readonly ILogger<ScraperHttpClient> _logger;
    private readonly ScraperOptions _opts;

    /// <summary>
    /// Creates the client from its injected dependencies.
    /// </summary>
    /// <param name="httpClientFactory">Factory used to obtain the named "scraper" <see cref="HttpClient"/>.</param>
    /// <param name="rateLimiter">Limiter awaited (keyed by host) before every attempt.</param>
    /// <param name="logger">Logger for rate-limit and retry diagnostics.</param>
    /// <param name="options">Scraper options; the retry settings are read from <c>Retry</c>.</param>
    public ScraperHttpClient(
        IHttpClientFactory httpClientFactory,
        IDomainRateLimiter rateLimiter,
        ILogger<ScraperHttpClient> logger,
        IOptions<ScraperOptions> options)
    {
        _httpClientFactory = httpClientFactory;
        _rateLimiter = rateLimiter;
        _logger = logger;
        _opts = options.Value;
    }

    /// <summary>
    /// Issues a GET for <paramref name="url"/> and returns the response body as a string,
    /// retrying transient failures up to the configured number of attempts.
    /// </summary>
    /// <param name="url">Absolute URL to fetch.</param>
    /// <param name="ct">Token that cancels rate-limit waits, the request and backoff delays.</param>
    /// <returns>The response body.</returns>
    /// <exception cref="ArgumentException"><paramref name="url"/> is not an absolute URI.</exception>
    /// <exception cref="HttpRequestException">
    /// The request failed with a non-transient error or status, or still failed on the last attempt.
    /// </exception>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="ct"/> was cancelled, or the last attempt timed out.
    /// </exception>
    public async Task<string> GetStringWithRetryAsync(string url, CancellationToken ct)
    {
        if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
        {
            throw new ArgumentException("Invalid URL", nameof(url));
        }

        var host = uri.Host;
        var http = _httpClientFactory.CreateClient("scraper");

        // Sanitize configuration: at least one attempt, non-negative delays, max >= base.
        var maxAttempts = Math.Max(1, _opts.Retry.MaxAttempts);
        var baseDelay = Math.Max(0, _opts.Retry.BaseDelayMs);
        var maxDelay = Math.Max(baseDelay, _opts.Retry.MaxDelayMs);

        // The loop has no exit condition on purpose: the last attempt always ends by
        // returning the body or by letting its exception propagate to the caller.
        for (var attempt = 1; ; attempt++)
        {
            ct.ThrowIfCancellationRequested();

            // Per-host rate limit before starting the request.
            var before = DateTimeOffset.UtcNow;
            await _rateLimiter.WaitAsync(host, ct);
            var waitedMs = (int)(DateTimeOffset.UtcNow - before).TotalMilliseconds;
            if (waitedMs > 0)
            {
                _logger.LogDebug("RateLimit applied: waited {WaitedMs}ms for host {Host}", waitedMs, host);
            }

            var isLastAttempt = attempt >= maxAttempts;
            Exception failure;
            HttpStatusCode? failureStatus = null;

            try
            {
                using var req = new HttpRequestMessage(HttpMethod.Get, uri);
                req.Headers.UserAgent.ParseAdd("webscrapper/1.0");
                req.Headers.Accept.ParseAdd("text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");

                using var resp = await http.SendAsync(req, HttpCompletionOption.ResponseHeadersRead, ct);

                if (!IsTransientStatus(resp.StatusCode))
                {
                    // Non-transient failures (e.g. 404) throw here and are NOT retried,
                    // because IsTransientException inspects the attached status code.
                    resp.EnsureSuccessStatusCode();
                    return await resp.Content.ReadAsStringAsync(ct);
                }

                failureStatus = resp.StatusCode;
                failure = new HttpRequestException(
                    $"Transient status code {(int)resp.StatusCode} ({resp.StatusCode})",
                    inner: null,
                    statusCode: resp.StatusCode);

                if (isLastAttempt)
                {
                    // No retry left: fail immediately instead of logging "Retrying" and sleeping.
                    throw failure;
                }
            }
            catch (Exception ex) when (!isLastAttempt && IsTransientException(ex, ct))
            {
                // On the last attempt the filter declines the exception, so it propagates
                // with its original stack trace intact.
                failure = ex;
            }

            // Back off outside the try block so the response (and its connection) has
            // already been disposed while we wait.
            await LogAndDelayRetryAsync(url, host, attempt, maxAttempts, failure, baseDelay, maxDelay, ct, failureStatus);
        }
    }

    /// <summary>
    /// Returns <c>true</c> for status codes worth retrying:
    /// 408 Request Timeout, 429 Too Many Requests and any 5xx server error.
    /// </summary>
    private static bool IsTransientStatus(HttpStatusCode statusCode)
    {
        var code = (int)statusCode;
        return code == 408 || code == 429 || (code >= 500 && code <= 599);
    }

    /// <summary>
    /// Decides whether <paramref name="ex"/> represents a failure that may succeed on retry.
    /// </summary>
    /// <param name="ex">The exception raised by the attempt.</param>
    /// <param name="ct">The caller's token; a cancellation it requested is never retried.</param>
    private static bool IsTransientException(Exception ex, CancellationToken ct)
    {
        // Cancellation requested by the caller is final, not a timeout.
        if (ex is OperationCanceledException && ct.IsCancellationRequested)
        {
            return false;
        }

        return ex switch
        {
            // Raised for an HTTP status (e.g. by EnsureSuccessStatusCode): retry only
            // when the status itself is transient.
            HttpRequestException { StatusCode: { } status } => IsTransientStatus(status),
            // No status attached: connection-level failure (DNS, socket, etc.).
            HttpRequestException => true,
            // HttpClient timeouts usually surface as TaskCanceledException.
            TaskCanceledException => true,
            _ => false,
        };
    }

    /// <summary>
    /// Logs a warning describing the upcoming retry and then waits for the computed backoff.
    /// </summary>
    /// <param name="url">URL being fetched (logged only).</param>
    /// <param name="host">Host of the URL (logged only).</param>
    /// <param name="attempt">1-based number of the attempt that just failed.</param>
    /// <param name="maxAttempts">Total number of attempts allowed.</param>
    /// <param name="ex">The failure that triggered the retry.</param>
    /// <param name="baseDelayMs">Base backoff delay in milliseconds.</param>
    /// <param name="maxDelayMs">Upper bound for the delay in milliseconds.</param>
    /// <param name="ct">Token that cancels the delay.</param>
    /// <param name="statusCode">The transient HTTP status, or <c>null</c> when the failure was an exception.</param>
    private async Task LogAndDelayRetryAsync(
        string url,
        string host,
        int attempt,
        int maxAttempts,
        Exception ex,
        int baseDelayMs,
        int maxDelayMs,
        CancellationToken ct,
        HttpStatusCode? statusCode)
    {
        var delayMs = ComputeBackoffWithJitterMs(attempt, baseDelayMs, maxDelayMs);

        if (statusCode is not null)
        {
            _logger.LogWarning(
                ex,
                "Retrying ({Attempt}/{MaxAttempts}) in {DelayMs}ms due to status {StatusCode} for host {Host}. Url={Url}",
                attempt, maxAttempts, delayMs, (int)statusCode.Value, host, url);
        }
        else
        {
            _logger.LogWarning(
                ex,
                "Retrying ({Attempt}/{MaxAttempts}) in {DelayMs}ms due to transient error for host {Host}. Url={Url}",
                attempt, maxAttempts, delayMs, host, url);
        }

        await Task.Delay(delayMs, ct);
    }

    /// <summary>
    /// Computes <c>base * 2^(attempt-1)</c> plus a random jitter in <c>[0, base)</c>,
    /// with the result clamped to <c>[0, maxDelayMs]</c>.
    /// </summary>
    private static int ComputeBackoffWithJitterMs(int attempt, int baseDelayMs, int maxDelayMs)
    {
        var shift = Math.Clamp(attempt - 1, 0, 30);

        // Use 64-bit arithmetic: base << 30 cannot overflow a long, whereas the 32-bit
        // product wraps negative for large attempts and would make Task.Delay throw
        // (or wait forever if it landed on -1).
        long exponential = (long)baseDelayMs << shift;
        long capped = Math.Min(exponential, maxDelayMs);
        long jitter = Random.Shared.Next(0, Math.Max(1, baseDelayMs));

        var total = Math.Min(capped + jitter, maxDelayMs);
        return (int)Math.Max(0L, total);
    }
}