146 lines
5.2 KiB
C#
146 lines
5.2 KiB
C#
using System.Net;
|
|
using Microsoft.Extensions.Options;
|
|
using ScrapperAPI.Interfaces;
|
|
using ScrapperAPI.Options;
|
|
|
|
namespace ScrapperAPI.Services;
|
|
|
|
/// <summary>
/// HTTP client used by the scraper. Sends GET requests through the named "scraper"
/// <see cref="HttpClient"/>, waits on the domain rate limiter (keyed by host) before every
/// attempt, and retries transient failures with exponential backoff plus jitter.
/// </summary>
public sealed class ScraperHttpClient : IScraperHttpClient
{
    private readonly IHttpClientFactory _httpClientFactory;
    private readonly IDomainRateLimiter _rateLimiter;
    private readonly ILogger<ScraperHttpClient> _logger;
    private readonly ScraperOptions _opts;

    /// <summary>Creates the client from its injected collaborators and retry options.</summary>
    public ScraperHttpClient(
        IHttpClientFactory httpClientFactory,
        IDomainRateLimiter rateLimiter,
        ILogger<ScraperHttpClient> logger,
        IOptions<ScraperOptions> options)
    {
        _httpClientFactory = httpClientFactory;
        _rateLimiter = rateLimiter;
        _logger = logger;
        _opts = options.Value;
    }

    /// <summary>
    /// Downloads <paramref name="url"/> as a string, retrying transient failures
    /// (HTTP 408/429/5xx, connection-level errors, client timeouts).
    /// </summary>
    /// <param name="url">Absolute URL to fetch.</param>
    /// <param name="ct">Token that cancels rate-limit waits, the request and backoff delays.</param>
    /// <returns>The response body.</returns>
    /// <exception cref="ArgumentException"><paramref name="url"/> is not an absolute URI.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled.</exception>
    /// <exception cref="HttpRequestException">
    /// The response status was not successful and not retryable, or every attempt failed.
    /// </exception>
    public async Task<string> GetStringWithRetryAsync(string url, CancellationToken ct)
    {
        if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
        {
            throw new ArgumentException("Invalid URL", nameof(url));
        }

        var host = uri.Host;
        var http = _httpClientFactory.CreateClient("scraper");

        // Normalize the configured values so a bad configuration cannot disable the loop
        // or produce a negative delay.
        var maxAttempts = Math.Max(1, _opts.Retry.MaxAttempts);
        var baseDelay = Math.Max(0, _opts.Retry.BaseDelayMs);
        var maxDelay = Math.Max(baseDelay, _opts.Retry.MaxDelayMs);

        // The loop has no exit condition on purpose: every iteration either returns the body,
        // throws, or schedules another attempt, and the last attempt can only return or throw.
        for (var attempt = 1; ; attempt++)
        {
            ct.ThrowIfCancellationRequested();

            // Per-host rate limit before starting the request.
            var before = DateTimeOffset.UtcNow;
            await _rateLimiter.WaitAsync(host, ct);
            var waitedMs = (int)(DateTimeOffset.UtcNow - before).TotalMilliseconds;

            if (waitedMs > 0)
            {
                _logger.LogDebug("RateLimit applied: waited {WaitedMs}ms for host {Host}", waitedMs, host);
            }

            var isLastAttempt = attempt >= maxAttempts;
            HttpStatusCode? transientStatus = null;
            Exception failure;

            try
            {
                using var req = new HttpRequestMessage(HttpMethod.Get, uri);
                req.Headers.UserAgent.ParseAdd("webscrapper/1.0");
                req.Headers.Accept.ParseAdd("text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");

                using var resp = await http.SendAsync(req, HttpCompletionOption.ResponseHeadersRead, ct);

                if (!IsTransientStatus(resp.StatusCode))
                {
                    // Non-retryable failures (e.g. 404) throw here and, thanks to the status
                    // check in IsTransientException, propagate straight to the caller.
                    resp.EnsureSuccessStatusCode();
                    return await resp.Content.ReadAsStringAsync(ct);
                }

                transientStatus = resp.StatusCode;
                failure = new HttpRequestException(
                    $"Transient status code {(int)resp.StatusCode} ({resp.StatusCode})",
                    inner: null,
                    statusCode: resp.StatusCode);
            }
            catch (Exception ex) when (!isLastAttempt && IsTransientException(ex, ct))
            {
                // On the last attempt the filter rejects the exception, so it reaches the
                // caller untouched (original stack trace preserved).
                failure = ex;
            }

            // Request and response are disposed at this point, so nothing is held open
            // while backing off.
            if (isLastAttempt)
            {
                // Only reachable when the final attempt ended with a transient status.
                throw failure;
            }

            await LogAndDelayRetryAsync(url, host, attempt, maxAttempts, failure, baseDelay, maxDelay, ct, transientStatus);
        }
    }

    /// <summary>
    /// Returns <c>true</c> for status codes treated as retryable:
    /// 408 Request Timeout, 429 Too Many Requests and any 5xx server error.
    /// </summary>
    private static bool IsTransientStatus(HttpStatusCode statusCode)
    {
        return (int)statusCode is 408 or 429 or (>= 500 and <= 599);
    }

    /// <summary>
    /// Decides whether <paramref name="ex"/> is worth retrying.
    /// </summary>
    /// <remarks>
    /// A cancellation requested through <paramref name="ct"/> is never retried. An
    /// <see cref="HttpRequestException"/> that carries a status code is retried only when
    /// that status is transient; without a status code it is a connection-level failure
    /// (DNS, socket, ...) and is retried. A <see cref="TaskCanceledException"/> that was not
    /// triggered by <paramref name="ct"/> is how an <see cref="HttpClient"/> timeout
    /// usually surfaces, so it is retried as well.
    /// </remarks>
    private static bool IsTransientException(Exception ex, CancellationToken ct)
    {
        return ex switch
        {
            OperationCanceledException when ct.IsCancellationRequested => false,
            HttpRequestException { StatusCode: { } status } => IsTransientStatus(status),
            HttpRequestException => true,
            TaskCanceledException => true,
            _ => false,
        };
    }

    /// <summary>
    /// Logs a warning describing the upcoming retry and then waits for the backoff delay.
    /// </summary>
    /// <param name="url">URL being fetched (logged only).</param>
    /// <param name="host">Host of <paramref name="url"/> (logged only).</param>
    /// <param name="attempt">1-based number of the attempt that just failed.</param>
    /// <param name="maxAttempts">Total number of attempts allowed.</param>
    /// <param name="ex">Failure that triggered the retry; attached to the log entry.</param>
    /// <param name="baseDelayMs">Base backoff delay in milliseconds.</param>
    /// <param name="maxDelayMs">Upper bound for the delay in milliseconds.</param>
    /// <param name="ct">Token that cancels the delay.</param>
    /// <param name="statusCode">
    /// Status code that caused the retry, or <c>null</c> when the failure was an exception.
    /// </param>
    private async Task LogAndDelayRetryAsync(
        string url,
        string host,
        int attempt,
        int maxAttempts,
        Exception ex,
        int baseDelayMs,
        int maxDelayMs,
        CancellationToken ct,
        HttpStatusCode? statusCode)
    {
        var delayMs = ComputeBackoffWithJitterMs(attempt, baseDelayMs, maxDelayMs);

        if (statusCode is not null)
        {
            _logger.LogWarning(
                ex,
                "Retrying ({Attempt}/{MaxAttempts}) in {DelayMs}ms due to status {StatusCode} for host {Host}. Url={Url}",
                attempt, maxAttempts, delayMs, (int)statusCode.Value, host, url);
        }
        else
        {
            _logger.LogWarning(
                ex,
                "Retrying ({Attempt}/{MaxAttempts}) in {DelayMs}ms due to transient error for host {Host}. Url={Url}",
                attempt, maxAttempts, delayMs, host, url);
        }

        await Task.Delay(delayMs, ct);
    }

    /// <summary>
    /// Computes the delay before the next attempt: <c>base * 2^(attempt-1)</c> capped at
    /// <paramref name="maxDelayMs"/>, plus a random jitter in <c>[0, base)</c>, with the sum
    /// capped at <paramref name="maxDelayMs"/> again.
    /// </summary>
    /// <remarks>
    /// The caller passes non-negative values with <c>maxDelayMs &gt;= baseDelayMs</c>.
    /// </remarks>
    private static int ComputeBackoffWithJitterMs(int attempt, int baseDelayMs, int maxDelayMs)
    {
        // 64-bit arithmetic: base << 30 can exceed int.MaxValue, and a wrapped (negative)
        // value would make Task.Delay throw or wait forever.
        var shift = Math.Clamp(attempt - 1, 0, 30);
        long exponential = (long)baseDelayMs << shift;
        long capped = Math.Min(exponential, maxDelayMs);
        var jitter = Random.Shared.Next(0, Math.Max(1, baseDelayMs));

        // The sum is bounded by maxDelayMs, so the narrowing cast cannot overflow.
        return (int)Math.Min(capped + jitter, maxDelayMs);
    }
}