Add distributed scraping architecture with agent-based support via gRPC
This commit is contained in:
parent
c7a6d5f938
commit
fe448405ec
48
.idea/.idea.WebScrapperPro/.idea/dataSources.local.xml
generated
Normal file
48
.idea/.idea.WebScrapperPro/.idea/dataSources.local.xml
generated
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="dataSourceStorageLocal" created-in="RD-253.29346.144">
|
||||||
|
<data-source name="postgres@localhost" uuid="7be83bba-6576-475f-a181-b1cb9bab100c">
|
||||||
|
<database-info product="PostgreSQL" version="17.6 (Debian 17.6-1.pgdg13+1)" jdbc-version="4.2" driver-name="PostgreSQL JDBC Driver" driver-version="42.7.3" dbms="POSTGRES" exact-version="17.6" exact-driver-version="42.7">
|
||||||
|
<identifier-quote-string>"</identifier-quote-string>
|
||||||
|
</database-info>
|
||||||
|
<case-sensitivity plain-identifiers="lower" quoted-identifiers="exact" />
|
||||||
|
<secret-storage>master_key</secret-storage>
|
||||||
|
<user-name>postgres</user-name>
|
||||||
|
<schema-mapping>
|
||||||
|
<introspection-scope>
|
||||||
|
<node negative="1">
|
||||||
|
<node kind="database" negative="1" />
|
||||||
|
<node kind="database" qname="@">
|
||||||
|
<node kind="schema" qname="@" />
|
||||||
|
</node>
|
||||||
|
<node kind="database" qname="webscrapper_dev">
|
||||||
|
<node kind="schema" negative="1" />
|
||||||
|
</node>
|
||||||
|
</node>
|
||||||
|
</introspection-scope>
|
||||||
|
</schema-mapping>
|
||||||
|
</data-source>
|
||||||
|
<data-source name="@192.168.3.35" uuid="5a27a41e-f460-4160-b185-262eb7acd4db">
|
||||||
|
<database-info product="PostgreSQL" version="18.1 (Ubuntu 18.1-1.pgdg24.04+2)" jdbc-version="4.2" driver-name="PostgreSQL JDBC Driver" driver-version="42.7.3" dbms="POSTGRES" exact-version="18.1" exact-driver-version="42.7">
|
||||||
|
<identifier-quote-string>"</identifier-quote-string>
|
||||||
|
</database-info>
|
||||||
|
<case-sensitivity plain-identifiers="lower" quoted-identifiers="exact" />
|
||||||
|
<secret-storage>master_key</secret-storage>
|
||||||
|
<user-name>admin</user-name>
|
||||||
|
<schema-mapping>
|
||||||
|
<introspection-scope>
|
||||||
|
<node negative="1">
|
||||||
|
<node kind="database" negative="1" />
|
||||||
|
<node kind="database">
|
||||||
|
<name qname="authentik" />
|
||||||
|
<name qname="giteadb" />
|
||||||
|
<name qname="postgres" />
|
||||||
|
<name qname="voyager" />
|
||||||
|
<node kind="schema" negative="1" />
|
||||||
|
</node>
|
||||||
|
</node>
|
||||||
|
</introspection-scope>
|
||||||
|
</schema-mapping>
|
||||||
|
</data-source>
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
121
README.md
Normal file
121
README.md
Normal file
@ -0,0 +1,121 @@
|
|||||||
|
# Voyager (central) + Agentes (opcionais)
|
||||||
|
|
||||||
|
Este ZIP adiciona uma arquitetura **distribuída e opcional** para o Voyager:
|
||||||
|
|
||||||
|
- **Central (ScrapperAPI)** continua capaz de fazer o scrape localmente.
|
||||||
|
- **Agentes (VoyagerAgent)** são *opcionais*: quando habilitados, eles pegam lotes de URLs do central via **gRPC** e devolvem o conteúdo.
|
||||||
|
- A coordenação é feita por **lease** no banco (PostgreSQL). Se um agente morrer, o lease expira e outro worker pode recuperar o item.
|
||||||
|
|
||||||
|
## Visão geral
|
||||||
|
|
||||||
|
- Banco: a tabela `queue` ganhou colunas `leased_by` e `lease_expires_at`, além de `attempts` e `last_error`.
|
||||||
|
- Central expõe um gRPC `AgentService` para:
|
||||||
|
- `RegisterAgent` (registro + thumbprint do cert)
|
||||||
|
- `Heartbeat`
|
||||||
|
- `LeaseWork` (lote de URLs)
|
||||||
|
- `SubmitResult` (conteúdo + status)
|
||||||
|
- Segurança: recomendado **mTLS** (TLS mútuo) no endpoint gRPC.
|
||||||
|
|
||||||
|
## Modos (Workers.Mode)
|
||||||
|
|
||||||
|
Em `ScrapperAPI/appsettings*.json`:
|
||||||
|
|
||||||
|
- `LocalOnly`: **somente** worker local.
|
||||||
|
- `Hybrid` (padrão): local + agentes ao mesmo tempo.
|
||||||
|
- `PreferAgents`: local só trabalha quando não há agentes ativos (por uma janela de graça).
|
||||||
|
- `PreferLocal`: (reservado) mantém o worker local sempre ativo.
|
||||||
|
|
||||||
|
## Como rodar (dev)
|
||||||
|
|
||||||
|
1) Rode o banco e aplique o script:
|
||||||
|
|
||||||
|
- `ScrapperAPI/Scripts/database.sql`
|
||||||
|
|
||||||
|
2) Rode o central:
|
||||||
|
|
||||||
|
- `dotnet run --project ScrapperAPI`
|
||||||
|
|
||||||
|
3) (Opcional) Rode um agente:
|
||||||
|
|
||||||
|
- ajuste `VoyagerAgent/appsettings.json` com `CentralGrpcAddress` e `SessionIds`
|
||||||
|
- `dotnet run --project VoyagerAgent`
|
||||||
|
|
||||||
|
> Em dev, você pode deixar `Workers:Agents:RequireMutualTls=false` para testar sem cert.
|
||||||
|
|
||||||
|
## Como habilitar mTLS (produção)
|
||||||
|
|
||||||
|
### 1) Gere uma CA local e certs (exemplo)
|
||||||
|
|
||||||
|
> Ajuste paths conforme seu ambiente.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# CA
|
||||||
|
openssl genrsa -out ca.key 4096
|
||||||
|
openssl req -x509 -new -nodes -key ca.key -sha256 -days 3650 -subj "/CN=Voyager-CA" -out ca.crt
|
||||||
|
|
||||||
|
# Servidor (central)
|
||||||
|
openssl genrsa -out server.key 2048
|
||||||
|
openssl req -new -key server.key -subj "/CN=voyager-grpc" -out server.csr
|
||||||
|
openssl x509 -req -in server.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out server.crt -days 825 -sha256
|
||||||
|
|
||||||
|
# Agente
|
||||||
|
openssl genrsa -out agent01.key 2048
|
||||||
|
openssl req -new -key agent01.key -subj "/CN=agent-01" -out agent01.csr
|
||||||
|
openssl x509 -req -in agent01.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out agent01.crt -days 825 -sha256
|
||||||
|
|
||||||
|
# PFX do agente
|
||||||
|
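openssl pkcs12 -export -out agent-01.pfx -inkey agent01.key -in agent01.crt -certfile ca.crt

# PFX do servidor (é o arquivo referenciado como ./certs/server.pfx na configuração do Kestrel abaixo)
openssl pkcs12 -export -out server.pfx -inkey server.key -in server.crt -certfile ca.crt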
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2) Configure o Kestrel do central
|
||||||
|
|
||||||
|
A forma mais comum é via `appsettings.Production.json` (exemplo):
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"Kestrel": {
|
||||||
|
"Endpoints": {
|
||||||
|
"HttpsGrpc": {
|
||||||
|
"Url": "https://0.0.0.0:7443",
|
||||||
|
"Certificate": {
|
||||||
|
"Path": "./certs/server.pfx",
|
||||||
|
"Password": "change-me"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"Workers": {
|
||||||
|
"Agents": {
|
||||||
|
"Enabled": true,
|
||||||
|
"RequireMutualTls": true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
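> Observação: o código do gRPC **exige cert do cliente** quando `RequireMutualTls=true`. Para que o certificado chegue à aplicação, o Kestrel precisa solicitá-lo no handshake TLS (por exemplo, `"ClientCertificateMode": "RequireCertificate"` no endpoint acima, caso isso não seja configurado em código); sem isso, as chamadas dos agentes falham com `Unauthenticated` ("Client certificate is required").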
|
||||||
|
|
||||||
|
### 3) Configure o agente
|
||||||
|
|
||||||
|
Em `VoyagerAgent/appsettings.json`:
|
||||||
|
|
||||||
|
- `ClientCertificatePath` -> `./certs/agent-01.pfx`
|
||||||
|
- `ClientCertificatePassword` -> senha do PFX
|
||||||
|
- `CentralGrpcAddress` -> https do central (porta 7443, por exemplo)
|
||||||
|
|
||||||
|
### 4) Registro do agente
|
||||||
|
|
||||||
|
Ao iniciar, o agente chama `RegisterAgent` e o central grava:
|
||||||
|
- `agent.id`
|
||||||
|
- `agent.cert_thumbprint`
|
||||||
|
|
||||||
|
Depois disso, os requests são validados pelo thumbprint.
|
||||||
|
|
||||||
|
## O que foi adicionado/alterado
|
||||||
|
|
||||||
|
- `queue`: lease + tentativas
|
||||||
|
- `agent`: tabela para registrar agentes (thumbprint)
|
||||||
|
- `IQueueRepository`: lease batch, renew, mark done/failed validando `leased_by`
|
||||||
|
- `ScrapperAPI`: gRPC `AgentServiceImpl`
|
||||||
|
- `VoyagerAgent`: Worker Service que faz lease + scrape + submit
|
||||||
|
|
||||||
199
ScrapperAPI/AgentGrpc/AgentServiceImpl.cs
Normal file
199
ScrapperAPI/AgentGrpc/AgentServiceImpl.cs
Normal file
@ -0,0 +1,199 @@
|
|||||||
|
using Grpc.Core;
|
||||||
|
using Grpc.AspNetCore.Server;
|
||||||
|
using Microsoft.Extensions.Options;
|
||||||
|
using ScrapperAPI.AgentGrpc;
|
||||||
|
using ScrapperAPI.Interfaces;
|
||||||
|
using ScrapperAPI.Options;
|
||||||
|
|
||||||
|
namespace ScrapperAPI.AgentGrpc;
|
||||||
|
|
||||||
|
/// <summary>
/// gRPC service exposed by the central node to remote agents: registration, heartbeat,
/// leasing of queue items and submission of scrape results.
/// </summary>
/// <remarks>
/// Every RPC fails with <see cref="StatusCode.Unavailable"/> while agents are disabled in the
/// worker options. Apart from <c>RegisterAgent</c> (and the early <c>LocalOnly</c> return of
/// <c>LeaseWork</c>), every RPC also requires the caller to be a registered, enabled agent whose
/// client-certificate thumbprint matches the stored one.
/// </remarks>
public sealed class AgentServiceImpl : AgentService.AgentServiceBase
{
    private const string AgentIdRequiredMessage = "agent_id is required";
    private const string LeaseNotValidMessage = "Lease is not valid for this agent";

    private readonly IAgentRepository _agents;
    private readonly IQueueRepository _queue;
    private readonly IContentRepository _content;
    private readonly WorkerOptions _opts;

    public AgentServiceImpl(
        IAgentRepository agents,
        IQueueRepository queue,
        IContentRepository content,
        IOptions<WorkerOptions> options)
    {
        _agents = agents;
        _queue = queue;
        _content = content;
        _opts = options.Value;
    }

    /// <summary>
    /// Registers (or re-registers) an agent and binds it to the presented client certificate.
    /// </summary>
    /// <exception cref="RpcException">
    /// <c>InvalidArgument</c> when <c>agent_id</c> is blank; <c>Unauthenticated</c> when mutual TLS
    /// is required and no client certificate was presented; <c>PermissionDenied</c> when the agent
    /// id is already bound to a different certificate.
    /// </exception>
    public override async Task<RegisterAgentResponse> RegisterAgent(RegisterAgentRequest request, ServerCallContext context)
    {
        EnsureAgentsEnabled();

        var agentId = RequireAgentId(request.AgentId);
        var displayName = request.DisplayName?.Trim();
        var thumbprint = GetClientCertThumbprint(context);

        if (_opts.Agents.RequireMutualTls)
        {
            // Without this guard any caller holding *some* accepted client certificate could
            // re-register an existing agent id and replace its thumbprint, which would defeat the
            // per-request thumbprint validation. Rows without a stored thumbprint (e.g. created
            // while mutual TLS was off) may still be claimed.
            var existing = await _agents.GetAsync(agentId, context.CancellationToken);
            if (existing is not null
                && !string.IsNullOrEmpty(existing.CertThumbprint)
                && !string.Equals(existing.CertThumbprint, thumbprint, StringComparison.OrdinalIgnoreCase))
            {
                throw new RpcException(new Status(
                    StatusCode.PermissionDenied,
                    "Agent is already registered with a different client certificate"));
            }
        }

        await _agents.UpsertAsync(
            agentId,
            string.IsNullOrWhiteSpace(displayName) ? null : displayName,
            thumbprint,
            context.CancellationToken);

        return new RegisterAgentResponse { Ok = true };
    }

    /// <summary>
    /// Validates the calling agent and records that it was seen.
    /// </summary>
    public override async Task<HeartbeatResponse> Heartbeat(HeartbeatRequest request, ServerCallContext context)
    {
        EnsureAgentsEnabled();

        var agentId = RequireAgentId(request.AgentId);
        await ValidateAgentAsync(agentId, context);
        await _agents.TouchAsync(agentId, context.CancellationToken);

        return new HeartbeatResponse { Ok = true };
    }

    /// <summary>
    /// Leases up to <c>Capacity</c> (clamped to 0..1000) queue items of the requested session to the
    /// calling agent. Returns an empty response in <c>LocalOnly</c> mode or when the capacity is 0.
    /// </summary>
    public override async Task<LeaseWorkResponse> LeaseWork(LeaseWorkRequest request, ServerCallContext context)
    {
        EnsureAgentsEnabled();

        if (_opts.Mode == DistributedMode.LocalOnly)
            return EmptyLeaseResponse();

        var agentId = RequireAgentId(request.AgentId);
        await ValidateAgentAsync(agentId, context);
        await _agents.TouchAsync(agentId, context.CancellationToken);

        var capacity = Math.Clamp(request.Capacity, 0, 1000);
        if (capacity == 0)
            return EmptyLeaseResponse();

        var leaseFor = LeaseDuration();

        // Captured *before* the repository call so the expiry reported to the agent is never later
        // than the lease actually written (assumes DB and app clocks roughly agree - TODO confirm).
        var leaseRequestedAt = DateTimeOffset.UtcNow;
        var batch = await _queue.LeaseBatchAsync(
            request.SessionId, WorkerIdFor(agentId), capacity, leaseFor, context.CancellationToken);
        var leaseExpiresUtcMs = leaseRequestedAt.Add(leaseFor).ToUnixTimeMilliseconds();

        var resp = EmptyLeaseResponse();
        foreach (var it in batch)
        {
            resp.Items.Add(new WorkItem
            {
                QueueId = it.Id,
                SessionId = it.SessionId,
                Url = it.Url,
                LeaseExpiresUtcMs = leaseExpiresUtcMs
            });
        }

        return resp;
    }

    /// <summary>
    /// Stores the result of a leased item: on success the content is saved and the item marked done,
    /// otherwise the item is marked failed with the reported error.
    /// </summary>
    /// <returns>
    /// <c>Ok = false</c> with "Lease is not valid for this agent" when the repository reports that
    /// the item is not leased to the caller; otherwise <c>Ok = true</c>.
    /// </returns>
    /// <exception cref="RpcException">
    /// <c>InvalidArgument</c> for a blank <c>agent_id</c> or non-positive <c>queue_id</c>;
    /// <c>Internal</c> when storing the result fails unexpectedly.
    /// </exception>
    public override async Task<SubmitResultResponse> SubmitResult(SubmitResultRequest request, ServerCallContext context)
    {
        EnsureAgentsEnabled();

        var agentId = RequireAgentId(request.AgentId);
        await ValidateAgentAsync(agentId, context);
        await _agents.TouchAsync(agentId, context.CancellationToken);

        if (request.QueueId <= 0)
            throw new RpcException(new Status(StatusCode.InvalidArgument, "queue_id must be > 0"));

        var workerId = WorkerIdFor(agentId);
        var ct = context.CancellationToken;

        try
        {
            if (!request.Success)
            {
                var error = string.IsNullOrWhiteSpace(request.Error) ? "unknown error" : request.Error;
                var failed = await _queue.MarkFailedAsync(request.QueueId, workerId, error, ct);
                return failed
                    ? new SubmitResultResponse { Ok = true, Message = "Marked failed" }
                    : new SubmitResultResponse { Ok = false, Message = LeaseNotValidMessage };
            }

            // Confirm ownership BEFORE writing content; otherwise an agent whose lease was lost
            // (or that never held it) could overwrite content of an arbitrary queue item and only
            // afterwards be told the lease is invalid. Renewing also keeps the lease alive while
            // the content is being stored.
            // NOTE(review): relies on RenewLeaseAsync returning false when the item is not leased
            // to this worker - confirm against the repository implementation.
            var stillOwned = await _queue.RenewLeaseAsync(request.QueueId, workerId, LeaseDuration(), ct);
            if (!stillOwned)
                return new SubmitResultResponse { Ok = false, Message = LeaseNotValidMessage };

            if (request.ContentBytes is { Length: > 0 })
            {
                // Fall back to sensible defaults when the agent omitted the metadata.
                var encoding = string.IsNullOrWhiteSpace(request.ContentEncoding) ? "gzip" : request.ContentEncoding;
                var origLen = request.OriginalLength > 0 ? request.OriginalLength : 0;
                var compLen = request.CompressedLength > 0 ? request.CompressedLength : request.ContentBytes.Length;

                await _content.SaveCompressedAsync(
                    request.QueueId,
                    encoding,
                    request.ContentBytes.ToByteArray(),
                    origLen,
                    compLen,
                    ct);
            }
            else
            {
                await _content.SaveAsync(request.QueueId, request.ContentText ?? string.Empty, ct);
            }

            var ok = await _queue.MarkDoneAsync(request.QueueId, workerId, ct);
            return ok
                ? new SubmitResultResponse { Ok = true, Message = "Stored" }
                : new SubmitResultResponse { Ok = false, Message = LeaseNotValidMessage };
        }
        // Let deliberate gRPC statuses and cancellations propagate unchanged instead of
        // rewrapping them as Internal.
        catch (Exception ex) when (ex is not RpcException and not OperationCanceledException)
        {
            throw new RpcException(new Status(StatusCode.Internal, ex.Message));
        }
    }

    /// <summary>Returns the trimmed agent id or throws <c>InvalidArgument</c> when it is blank.</summary>
    private static string RequireAgentId(string? rawAgentId)
    {
        var agentId = rawAgentId?.Trim();
        if (string.IsNullOrWhiteSpace(agentId))
            throw new RpcException(new Status(StatusCode.InvalidArgument, AgentIdRequiredMessage));

        return agentId;
    }

    /// <summary>Worker id under which queue leases of an agent are recorded.</summary>
    private static string WorkerIdFor(string agentId) => $"agent:{agentId}";

    /// <summary>Configured lease duration, never shorter than 5 seconds.</summary>
    private TimeSpan LeaseDuration() => TimeSpan.FromSeconds(Math.Max(5, _opts.LeaseSeconds));

    /// <summary>Response carrying only the current server time and no work items.</summary>
    private static LeaseWorkResponse EmptyLeaseResponse() => new()
    {
        ServerTimeUtcMs = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds()
    };

    /// <summary>Rejects the call with <c>Unavailable</c> when agent support is switched off.</summary>
    private void EnsureAgentsEnabled()
    {
        if (!_opts.Agents.Enabled)
            throw new RpcException(new Status(StatusCode.Unavailable, "Agents are disabled"));
    }

    /// <summary>
    /// Ensures the agent is registered, enabled and calling with the certificate it registered with.
    /// Throws <c>PermissionDenied</c> otherwise.
    /// </summary>
    private async Task ValidateAgentAsync(string agentId, ServerCallContext context)
    {
        var row = await _agents.GetAsync(agentId, context.CancellationToken);
        if (row is null)
            throw new RpcException(new Status(StatusCode.PermissionDenied, "Agent not registered"));

        if (!row.IsEnabled)
            throw new RpcException(new Status(StatusCode.PermissionDenied, "Agent disabled"));

        var thumbprint = GetClientCertThumbprint(context);
        if (!string.Equals(row.CertThumbprint, thumbprint, StringComparison.OrdinalIgnoreCase))
            throw new RpcException(new Status(StatusCode.PermissionDenied, "Client certificate does not match agent"));
    }

    /// <summary>
    /// Thumbprint of the caller's client certificate (spaces removed), or an empty string when
    /// mutual TLS is not required. Throws <c>Unauthenticated</c> when it is required but missing.
    /// </summary>
    private string GetClientCertThumbprint(ServerCallContext context)
    {
        if (!_opts.Agents.RequireMutualTls)
            return "";

        var http = context.GetHttpContext();
        var cert = http.Connection.ClientCertificate;
        if (cert is null)
            throw new RpcException(new Status(StatusCode.Unauthenticated, "Client certificate is required"));

        return (cert.Thumbprint ?? string.Empty).Replace(" ", string.Empty);
    }
}
|
||||||
@ -24,10 +24,10 @@ public sealed class ExtractionModelsController : ControllerBase
|
|||||||
public async Task<IActionResult> Create([FromBody] CreateExtractionModelRequest req, CancellationToken ct)
|
public async Task<IActionResult> Create([FromBody] CreateExtractionModelRequest req, CancellationToken ct)
|
||||||
{
|
{
|
||||||
var id = await _models.CreateAsync(new CreateExtractionModelDto(
|
var id = await _models.CreateAsync(new CreateExtractionModelDto(
|
||||||
Name: req.Name,
|
name: req.Name,
|
||||||
Version: req.Version <= 0 ? 1 : req.Version,
|
version: req.Version <= 0 ? 1 : req.Version,
|
||||||
Description: req.Description,
|
description: req.Description,
|
||||||
Definition: req.Definition
|
definition: req.Definition
|
||||||
), ct);
|
), ct);
|
||||||
|
|
||||||
return Created($"/extraction-models/{id}", new { id });
|
return Created($"/extraction-models/{id}", new { id });
|
||||||
|
|||||||
10
ScrapperAPI/Dtos/AgentRow.cs
Normal file
10
ScrapperAPI/Dtos/AgentRow.cs
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
namespace ScrapperAPI.Dtos;
|
||||||
|
|
||||||
|
/// <summary>
/// Registration record of a remote agent as returned by the agent repository.
/// </summary>
/// <param name="Id">Agent identifier supplied by the agent at registration.</param>
/// <param name="DisplayName">Optional human-readable name; <c>null</c> when none was given.</param>
/// <param name="CertThumbprint">
/// Thumbprint of the client certificate stored at registration; the gRPC service compares it
/// case-insensitively on each call and stores an empty string when mutual TLS is not required.
/// </param>
/// <param name="CreatedAt">Presumably the time of first registration - confirm against the repository.</param>
/// <param name="LastSeenAt">Presumably the time of the last call from the agent - confirm against the repository.</param>
/// <param name="IsEnabled">When <c>false</c>, the gRPC service rejects the agent's calls.</param>
public sealed record AgentRow(string Id,
                              string? DisplayName,
                              string CertThumbprint,
                              DateTimeOffset CreatedAt,
                              DateTimeOffset LastSeenAt,
                              bool IsEnabled);
|
||||||
@ -2,19 +2,57 @@ using System.Text.Json;
|
|||||||
|
|
||||||
namespace ScrapperAPI.Dtos;
|
namespace ScrapperAPI.Dtos;
|
||||||
|
|
||||||
public sealed record CreateExtractionModelDto(
|
public sealed class CreateExtractionModelDto
|
||||||
string Name,
|
{
|
||||||
int Version,
|
public string Name { get; init; } = null!;
|
||||||
string? Description,
|
public int Version { get; init; }
|
||||||
JsonDocument Definition
|
public string? Description { get; init; }
|
||||||
);
|
public JsonDocument Definition { get; init; } = null!;
|
||||||
|
|
||||||
public sealed record ExtractionModelRow(
|
public CreateExtractionModelDto()
|
||||||
long Id,
|
{
|
||||||
string Name,
|
|
||||||
int Version,
|
}
|
||||||
string? Description,
|
|
||||||
JsonDocument Definition,
|
public CreateExtractionModelDto(string name, int version, string? description, JsonDocument definition)
|
||||||
DateTimeOffset CreatedAt,
|
{
|
||||||
DateTimeOffset UpdatedAt
|
Name = name;
|
||||||
);
|
Version = version;
|
||||||
|
Description = description;
|
||||||
|
Definition = definition;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Extraction model together with its identifier and timestamps.
/// All members are init-only.
/// </summary>
public sealed class ExtractionModelRow
{
    /// <summary>Creates an empty instance; members are expected to be supplied via initializers.</summary>
    public ExtractionModelRow() { }

    /// <summary>Creates a fully populated instance.</summary>
    public ExtractionModelRow(
        long id,
        string name,
        int version,
        string? description,
        JsonDocument definition,
        DateTimeOffset createdAt,
        DateTimeOffset updatedAt)
    {
        this.Id = id;
        this.Name = name;
        this.Version = version;
        this.Description = description;
        this.Definition = definition;
        this.CreatedAt = createdAt;
        this.UpdatedAt = updatedAt;
    }

    public long Id { get; init; }

    public string Name { get; init; } = null!;

    public int Version { get; init; }

    public string? Description { get; init; }

    public JsonDocument Definition { get; init; } = null!;

    public DateTimeOffset CreatedAt { get; init; }

    public DateTimeOffset UpdatedAt { get; init; }
}
|
||||||
|
|||||||
@ -3,68 +3,211 @@ using System.Text.Json;
|
|||||||
|
|
||||||
namespace ScrapperAPI.Dtos;
|
namespace ScrapperAPI.Dtos;
|
||||||
|
|
||||||
public sealed record StartExtractionRequest(
|
public sealed class StartExtractionRequest
|
||||||
[Required] int SessionId,
|
{
|
||||||
[Required] long ModelId,
|
[Required]
|
||||||
bool OnlyDone = true
|
public int SessionId { get; set; }
|
||||||
);
|
|
||||||
|
[Required]
|
||||||
|
public long ModelId { get; set; }
|
||||||
|
|
||||||
|
/// <summary>
/// Defaults to <c>true</c>, the same default as the <c>onlyDone</c> constructor parameter, so an
/// instance created through the parameterless constructor (e.g. by a deserializer when the field
/// is omitted from the request body) does not silently fall back to <c>false</c>.
/// </summary>
public bool OnlyDone { get; set; } = true;
|
||||||
|
|
||||||
|
public StartExtractionRequest()
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
public StartExtractionRequest(int sessionId, long modelId, bool onlyDone = true)
|
||||||
|
{
|
||||||
|
SessionId = sessionId;
|
||||||
|
ModelId = modelId;
|
||||||
|
OnlyDone = onlyDone;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public sealed class BulkStartExtractionRequest
|
||||||
|
{
|
||||||
|
[Required]
|
||||||
|
public long ModelId { get; set; }
|
||||||
|
|
||||||
public sealed record BulkStartExtractionRequest(
|
|
||||||
[Required] long ModelId,
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Se vazio/nulo, roda para todas as sessions existentes.
|
/// Se vazio/nulo, roda para todas as sessions existentes.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
int[]? SessionIds = null,
|
public int[]? SessionIds { get; set; }
|
||||||
bool OnlyDone = true
|
|
||||||
);
|
|
||||||
|
|
||||||
public sealed record CreateExtractionRunDto(
|
/// <summary>
/// Defaults to <c>true</c>, the same default as the <c>onlyDone</c> constructor parameter, so an
/// instance created through the parameterless constructor (e.g. by a deserializer when the field
/// is omitted from the request body) does not silently fall back to <c>false</c>.
/// </summary>
public bool OnlyDone { get; set; } = true;
|
||||||
long ModelId,
|
|
||||||
int SessionId
|
|
||||||
);
|
|
||||||
|
|
||||||
public sealed record ExtractionRunRow(
|
public BulkStartExtractionRequest()
|
||||||
long Id,
|
{
|
||||||
long ModelId,
|
}
|
||||||
int SessionId,
|
|
||||||
short Status,
|
|
||||||
DateTimeOffset CreatedAt,
|
|
||||||
DateTimeOffset? StartedAt,
|
|
||||||
DateTimeOffset? FinishedAt,
|
|
||||||
int Total,
|
|
||||||
int Succeeded,
|
|
||||||
int Failed,
|
|
||||||
string? Error
|
|
||||||
);
|
|
||||||
|
|
||||||
public sealed record ExtractionRuntimeStatus(
|
public BulkStartExtractionRequest(long modelId, int[]? sessionIds = null, bool onlyDone = true)
|
||||||
long RunId,
|
{
|
||||||
bool IsRunning,
|
ModelId = modelId;
|
||||||
int Processed,
|
SessionIds = sessionIds;
|
||||||
int Total,
|
OnlyDone = onlyDone;
|
||||||
int Succeeded,
|
}
|
||||||
int Failed,
|
}
|
||||||
int? CurrentQueueId
|
|
||||||
);
|
|
||||||
|
|
||||||
public sealed record UpsertExtractedDataDto(
|
public sealed class CreateExtractionRunDto
|
||||||
long RunId,
|
{
|
||||||
long ModelId,
|
public long ModelId { get; set; }
|
||||||
int SessionId,
|
|
||||||
int QueueId,
|
|
||||||
JsonDocument ExtractedJson,
|
|
||||||
bool Success,
|
|
||||||
string? Error
|
|
||||||
);
|
|
||||||
|
|
||||||
public sealed record ExtractedDataRow(
|
public int SessionId { get; set; }
|
||||||
long Id,
|
|
||||||
long RunId,
|
public CreateExtractionRunDto()
|
||||||
long ModelId,
|
{
|
||||||
int SessionId,
|
}
|
||||||
int QueueId,
|
|
||||||
JsonDocument ExtractedJson,
|
public CreateExtractionRunDto(long modelId, int sessionId)
|
||||||
bool Success,
|
{
|
||||||
string? Error,
|
ModelId = modelId;
|
||||||
DateTimeOffset ExtractedAt
|
SessionId = sessionId;
|
||||||
);
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Mutable data holder describing one extraction run: which model and session it belongs to,
/// its status, timestamps, counters and an optional error text.
/// </summary>
public sealed class ExtractionRunRow
{
    /// <summary>Creates an instance with all members at their default values.</summary>
    public ExtractionRunRow() { }

    /// <summary>Creates a fully populated instance.</summary>
    public ExtractionRunRow(
        long id,
        long modelId,
        int sessionId,
        short status,
        DateTimeOffset createdAt,
        DateTimeOffset? startedAt,
        DateTimeOffset? finishedAt,
        int total,
        int succeeded,
        int failed,
        string? error)
    {
        (Id, ModelId, SessionId, Status) = (id, modelId, sessionId, status);
        (CreatedAt, StartedAt, FinishedAt) = (createdAt, startedAt, finishedAt);
        (Total, Succeeded, Failed, Error) = (total, succeeded, failed, error);
    }

    public long Id { get; set; }
    public long ModelId { get; set; }
    public int SessionId { get; set; }
    public short Status { get; set; }

    public DateTimeOffset CreatedAt { get; set; }
    public DateTimeOffset? StartedAt { get; set; }
    public DateTimeOffset? FinishedAt { get; set; }

    public int Total { get; set; }
    public int Succeeded { get; set; }
    public int Failed { get; set; }

    public string? Error { get; set; }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Mutable snapshot of an extraction run's progress: running flag, counters and the queue item
/// currently being handled (if any).
/// </summary>
public sealed class ExtractionRuntimeStatus
{
    /// <summary>Creates an instance with all members at their default values.</summary>
    public ExtractionRuntimeStatus() { }

    /// <summary>Creates a fully populated instance.</summary>
    public ExtractionRuntimeStatus(
        long runId,
        bool isRunning,
        int processed,
        int total,
        int succeeded,
        int failed,
        int? currentQueueId)
    {
        (RunId, IsRunning, CurrentQueueId) = (runId, isRunning, currentQueueId);
        (Processed, Total, Succeeded, Failed) = (processed, total, succeeded, failed);
    }

    public long RunId { get; set; }
    public bool IsRunning { get; set; }

    public int Processed { get; set; }
    public int Total { get; set; }
    public int Succeeded { get; set; }
    public int Failed { get; set; }

    public int? CurrentQueueId { get; set; }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Input for storing the extraction result of a single queue item within a run.
/// </summary>
public sealed class UpsertExtractedDataDto
{
    public long RunId { get; set; }

    public long ModelId { get; set; }

    public int SessionId { get; set; }

    public int QueueId { get; set; }

    /// <summary>
    /// Extracted payload. Declared with <c>= null!</c> like the other non-nullable
    /// <see cref="JsonDocument"/> members of the sibling DTOs, so the parameterless constructor
    /// does not raise a nullable-reference warning; callers using that constructor must assign it.
    /// </summary>
    public JsonDocument ExtractedJson { get; set; } = null!;

    public bool Success { get; set; }

    public string? Error { get; set; }

    /// <summary>Creates an empty instance; members are expected to be assigned afterwards.</summary>
    public UpsertExtractedDataDto()
    {
    }

    /// <summary>Creates a fully populated instance.</summary>
    public UpsertExtractedDataDto(long runId, long modelId, int sessionId, int queueId, JsonDocument extractedJson, bool success, string? error)
    {
        RunId = runId;
        ModelId = modelId;
        SessionId = sessionId;
        QueueId = queueId;
        ExtractedJson = extractedJson;
        Success = success;
        Error = error;
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Stored extraction result of a single queue item, including its identifier and extraction time.
/// </summary>
public sealed class ExtractedDataRow
{
    public long Id { get; set; }

    public long RunId { get; set; }

    public long ModelId { get; set; }

    public int SessionId { get; set; }

    public int QueueId { get; set; }

    /// <summary>
    /// Extracted payload. Declared with <c>= null!</c> like the other non-nullable
    /// <see cref="JsonDocument"/> members of the sibling DTOs, so the parameterless constructor
    /// does not raise a nullable-reference warning; callers using that constructor must assign it.
    /// </summary>
    public JsonDocument ExtractedJson { get; set; } = null!;

    public bool Success { get; set; }

    public string? Error { get; set; }

    public DateTimeOffset ExtractedAt { get; set; }

    /// <summary>Creates an empty instance; members are expected to be assigned afterwards.</summary>
    public ExtractedDataRow()
    {
    }

    /// <summary>Creates a fully populated instance.</summary>
    public ExtractedDataRow(long id, long runId, long modelId, int sessionId, int queueId, JsonDocument extractedJson, bool success, string? error, DateTimeOffset extractedAt)
    {
        Id = id;
        RunId = runId;
        ModelId = modelId;
        SessionId = sessionId;
        QueueId = queueId;
        ExtractedJson = extractedJson;
        Success = success;
        Error = error;
        ExtractedAt = extractedAt;
    }
}
|
||||||
|
|||||||
13
ScrapperAPI/Interfaces/IAgentRepository.cs
Normal file
13
ScrapperAPI/Interfaces/IAgentRepository.cs
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
using ScrapperAPI.Dtos;
|
||||||
|
|
||||||
|
namespace ScrapperAPI.Interfaces;
|
||||||
|
|
||||||
|
/// <summary>
/// Persistence operations for remote agent registrations.
/// </summary>
public interface IAgentRepository
{
    // ---- Queries ----

    /// <summary>
    /// Loads the registration of <paramref name="agentId"/>; <c>null</c> when the agent is not registered.
    /// </summary>
    Task<AgentRow?> GetAsync(string agentId, CancellationToken ct);

    /// <summary>Presumably reports whether the agent exists and is enabled - confirm against the implementation.</summary>
    Task<bool> IsEnabledAsync(string agentId, CancellationToken ct);

    /// <summary>Presumably returns the stored certificate thumbprint, or <c>null</c> when unknown - confirm against the implementation.</summary>
    Task<string?> GetThumbprintAsync(string agentId, CancellationToken ct);

    /// <summary>
    /// Presumably counts agents seen within the last <paramref name="seenWithin"/> - confirm against the implementation.
    /// </summary>
    Task<int> CountActiveAsync(TimeSpan seenWithin, CancellationToken ct);

    // ---- Commands ----

    /// <summary>
    /// Stores the registration of an agent together with its display name and certificate thumbprint.
    /// Presumably insert-or-update, as the name suggests - confirm against the implementation.
    /// </summary>
    Task UpsertAsync(string agentId, string? displayName, string certThumbprint, CancellationToken ct);

    /// <summary>
    /// Called by the gRPC service after each validated agent call; presumably refreshes the agent's
    /// last-seen time - confirm against the implementation.
    /// </summary>
    Task TouchAsync(string agentId, CancellationToken ct);
}
|
||||||
@ -6,6 +6,17 @@ namespace ScrapperAPI.Interfaces;
|
|||||||
public interface IContentRepository
|
public interface IContentRepository
|
||||||
{
|
{
|
||||||
Task<int> SaveAsync(int queueId, string content, CancellationToken ct);
|
Task<int> SaveAsync(int queueId, string content, CancellationToken ct);
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Saves already-compressed content (e.g. from a remote agent) without recompressing.
|
||||||
|
/// </summary>
|
||||||
|
Task<int> SaveCompressedAsync(
|
||||||
|
int queueId,
|
||||||
|
string contentEncoding,
|
||||||
|
byte[] contentBytes,
|
||||||
|
int originalLength,
|
||||||
|
int compressedLength,
|
||||||
|
CancellationToken ct);
|
||||||
Task<ContentRow?> GetByQueueIdAsync(int queueId, CancellationToken ct);
|
Task<ContentRow?> GetByQueueIdAsync(int queueId, CancellationToken ct);
|
||||||
Task<CompressedContent?> GetCompressedByQueueIdAsync(int queueId, CancellationToken ct);
|
Task<CompressedContent?> GetCompressedByQueueIdAsync(int queueId, CancellationToken ct);
|
||||||
|
|
||||||
|
|||||||
@ -8,13 +8,24 @@ public interface IQueueRepository
|
|||||||
Task<QueueCounts> GetCountsAsync(int sessionId, CancellationToken ct);
|
Task<QueueCounts> GetCountsAsync(int sessionId, CancellationToken ct);
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Pega 1 item pendente e muda para Processing atomica/seguramente.
|
/// Pega 1 item pendente e faz "lease" atomico (Processing) para um worker.
|
||||||
/// Retorna null se não houver itens pendentes.
|
/// Retorna null se não houver itens disponíveis.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
Task<QueueItem?> TryDequeueAsync(int sessionId, CancellationToken ct);
|
Task<QueueItem?> TryDequeueAsync(int sessionId, string workerId, TimeSpan leaseFor, CancellationToken ct);
|
||||||
|
|
||||||
Task MarkDoneAsync(int queueId, CancellationToken ct);
|
/// <summary>
|
||||||
Task MarkFailedAsync(int queueId, string error, CancellationToken ct);
|
/// Pega um lote de itens pendentes e faz "lease" atomico (Processing) para um worker.
|
||||||
|
/// Itens com lease expirado também podem ser reprocessados.
|
||||||
|
/// </summary>
|
||||||
|
Task<IReadOnlyList<QueueItem>> LeaseBatchAsync(int sessionId, string workerId, int take, TimeSpan leaseFor, CancellationToken ct);
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Renova o lease de um item (se ele ainda pertence ao mesmo worker).
|
||||||
|
/// </summary>
|
||||||
|
Task<bool> RenewLeaseAsync(int queueId, string workerId, TimeSpan leaseFor, CancellationToken ct);
|
||||||
|
|
||||||
|
Task<bool> MarkDoneAsync(int queueId, string workerId, CancellationToken ct);
|
||||||
|
Task<bool> MarkFailedAsync(int queueId, string workerId, string error, CancellationToken ct);
|
||||||
|
|
||||||
// Opcional: resetar stuck processing (se quiser depois)
|
// Opcional: resetar stuck processing (se quiser depois)
|
||||||
Task<int> RequeueStuckProcessingAsync(int sessionId, TimeSpan olderThan, CancellationToken ct);
|
Task<int> RequeueStuckProcessingAsync(int sessionId, TimeSpan olderThan, CancellationToken ct);
|
||||||
|
|||||||
39
ScrapperAPI/Options/WorkerOptions.cs
Normal file
39
ScrapperAPI/Options/WorkerOptions.cs
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
namespace ScrapperAPI.Options;
|
||||||
|
|
||||||
|
public enum DistributedMode
|
||||||
|
{
|
||||||
|
LocalOnly = 0,
|
||||||
|
Hybrid = 1,
|
||||||
|
PreferAgents = 2,
|
||||||
|
PreferLocal = 3
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Root options object for worker distribution: the distribution mode, the queue-item
/// lease duration, and the nested local-worker and agent settings.
/// </summary>
public sealed class WorkerOptions
{
    /// <summary>How work is shared between the local worker and agents. Defaults to <see cref="DistributedMode.Hybrid"/>.</summary>
    public DistributedMode Mode { get; set; } = DistributedMode.Hybrid;

    /// <summary>
    /// Lease duration, in seconds, for a queue item before it can be recovered by another worker.
    /// Defaults to 120.
    /// </summary>
    public int LeaseSeconds { get; set; } = 120;

    /// <summary>Settings for the in-process worker.</summary>
    public LocalWorkerOptions Local { get; set; } = new LocalWorkerOptions();

    /// <summary>Settings for remote agents.</summary>
    public AgentOptions Agents { get; set; } = new AgentOptions();
}
|
||||||
|
|
||||||
|
/// <summary>
/// Settings for the in-process (local) scraping worker.
/// </summary>
public sealed class LocalWorkerOptions
{
    /// <summary>Whether the local worker may process queue items at all. Defaults to <c>true</c>.</summary>
    public bool Enabled { get; set; } = true;

    /// <summary>Number of local worker loops to run in parallel. Defaults to 1.</summary>
    public int Concurrency { get; set; } = 1;

    /// <summary>
    /// When Mode=PreferAgents, the local worker runs only if no agent was seen within this
    /// window (in seconds). Defaults to 30.
    /// </summary>
    public int PreferAgentsGraceSeconds { get; set; } = 30;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Settings for remote scraping agents.
/// </summary>
public sealed class AgentOptions
{
    /// <summary>Whether agent support is turned on. Defaults to <c>true</c>.</summary>
    public bool Enabled { get; set; } = true;

    /// <summary>
    /// Whether agents must authenticate with mutual TLS. Defaults to <c>true</c>.
    /// NOTE(review): the enforcement point is not visible in this file — confirm where it is checked.
    /// </summary>
    public bool RequireMutualTls { get; set; } = true;
}
|
||||||
@ -10,12 +10,14 @@ using ScrapperAPI.Repositories;
|
|||||||
using ScrapperAPI.Services;
|
using ScrapperAPI.Services;
|
||||||
using ScrapperAPI.Utils;
|
using ScrapperAPI.Utils;
|
||||||
using ScrapperAPI.Workers;
|
using ScrapperAPI.Workers;
|
||||||
|
using ScrapperAPI.AgentGrpc;
|
||||||
|
|
||||||
var builder = WebApplication.CreateBuilder(args);
|
var builder = WebApplication.CreateBuilder(args);
|
||||||
|
|
||||||
builder.Services.AddOpenApi();
|
builder.Services.AddOpenApi();
|
||||||
builder.Services.AddSignalR();
|
builder.Services.AddSignalR();
|
||||||
builder.Services.AddControllers();
|
builder.Services.AddControllers();
|
||||||
|
builder.Services.AddGrpc();
|
||||||
|
|
||||||
// Authentik (OIDC) - JWT Bearer validation for API + SignalR
|
// Authentik (OIDC) - JWT Bearer validation for API + SignalR
|
||||||
builder.Services.AddAuthentication(JwtBearerDefaults.AuthenticationScheme)
|
builder.Services.AddAuthentication(JwtBearerDefaults.AuthenticationScheme)
|
||||||
@ -63,6 +65,7 @@ builder.Services.AddAuthorization(options =>
|
|||||||
|
|
||||||
builder.Services.Configure<ScraperOptions>(builder.Configuration.GetSection("Scraper"));
|
builder.Services.Configure<ScraperOptions>(builder.Configuration.GetSection("Scraper"));
|
||||||
builder.Services.Configure<ExtractionOptions>(builder.Configuration.GetSection("Extraction"));
|
builder.Services.Configure<ExtractionOptions>(builder.Configuration.GetSection("Extraction"));
|
||||||
|
builder.Services.Configure<WorkerOptions>(builder.Configuration.GetSection("Workers"));
|
||||||
|
|
||||||
builder.Services.AddSingleton<IDomainRateLimiter>(sp =>
|
builder.Services.AddSingleton<IDomainRateLimiter>(sp =>
|
||||||
{
|
{
|
||||||
@ -76,6 +79,7 @@ builder.Services.AddSingleton<IDbConnectionFactory, NpgsqlConnectionFactory>();
|
|||||||
builder.Services.AddScoped<ISessionRepository, SessionRepository>();
|
builder.Services.AddScoped<ISessionRepository, SessionRepository>();
|
||||||
builder.Services.AddScoped<IQueueRepository, QueueRepository>();
|
builder.Services.AddScoped<IQueueRepository, QueueRepository>();
|
||||||
builder.Services.AddScoped<IContentRepository, ContentRepository>();
|
builder.Services.AddScoped<IContentRepository, ContentRepository>();
|
||||||
|
builder.Services.AddScoped<IAgentRepository, AgentRepository>();
|
||||||
|
|
||||||
// Extraction
|
// Extraction
|
||||||
builder.Services.AddSingleton<ExtractionEngine>();
|
builder.Services.AddSingleton<ExtractionEngine>();
|
||||||
@ -115,8 +119,10 @@ if (app.Environment.IsDevelopment())
|
|||||||
app.MapOpenApi().AllowAnonymous();
|
app.MapOpenApi().AllowAnonymous();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
app.MapGrpcService<VoyagerAgentService>();
|
||||||
app.MapControllers();
|
app.MapControllers();
|
||||||
app.MapHub<ScrapeHub>("/ws/scrape").RequireAuthorization();
|
app.MapHub<ScrapeHub>("/ws/scrape").RequireAuthorization();
|
||||||
|
app.MapGrpcService<AgentServiceImpl>().AllowAnonymous();
|
||||||
|
|
||||||
// app.UseHttpsRedirection();
|
// app.UseHttpsRedirection();
|
||||||
|
|
||||||
|
|||||||
66
ScrapperAPI/Protos/agent.proto
Normal file
66
ScrapperAPI/Protos/agent.proto
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
// gRPC contract between the scraper API and remote agents:
// registration, heartbeat, leasing work items and submitting results.
syntax = "proto3";

option csharp_namespace = "ScrapperAPI.AgentGrpc";

package voyager.agent;

// ---------------------------------------------------------------------------
// Registration / liveness
// ---------------------------------------------------------------------------

// Announces an agent to the server.
message RegisterAgentRequest {
  string agent_id = 1;      // Identifier chosen by the agent.
  string display_name = 2;  // Human-readable label for the agent.
}

message RegisterAgentResponse {
  bool ok = 1;
}

// Periodic liveness signal from an agent.
message HeartbeatRequest {
  string agent_id = 1;
}

message HeartbeatResponse {
  bool ok = 1;
}

// ---------------------------------------------------------------------------
// Work leasing
// ---------------------------------------------------------------------------

// Asks the server for work items of one session.
message LeaseWorkRequest {
  int32 session_id = 1;
  string agent_id = 2;
  int32 capacity = 3;  // Presumably the maximum number of items wanted — confirm in the service.
}

message LeaseWorkResponse {
  repeated WorkItem items = 1;
  int64 server_time_utc_ms = 2;  // Server clock in UTC milliseconds (epoch presumably Unix — confirm).
}

// One leased queue entry.
message WorkItem {
  int32 queue_id = 1;
  int32 session_id = 2;
  string url = 3;
  int64 lease_expires_utc_ms = 4;  // Lease expiry in UTC milliseconds (same clock as server_time_utc_ms, presumably).
}

// ---------------------------------------------------------------------------
// Result submission
// ---------------------------------------------------------------------------

// Outcome of processing a single work item.
message SubmitResultRequest {
  int32 queue_id = 1;
  string agent_id = 2;
  bool success = 3;
  string error = 4;

  // Content: either plain text (content_text) or compressed bytes (content_bytes).
  string content_text = 5;
  bytes content_bytes = 6;
  string content_encoding = 7;  // e.g. "gzip"
  int32 original_length = 8;
  int32 compressed_length = 9;
}

message SubmitResultResponse {
  bool ok = 1;
  string message = 2;
}

// Unary RPCs exposed by the server to agents.
service AgentService {
  rpc RegisterAgent(RegisterAgentRequest) returns (RegisterAgentResponse);
  rpc Heartbeat(HeartbeatRequest) returns (HeartbeatResponse);
  rpc LeaseWork(LeaseWorkRequest) returns (LeaseWorkResponse);
  rpc SubmitResult(SubmitResultRequest) returns (SubmitResultResponse);
}
|
||||||
84
ScrapperAPI/Repositories/AgentRepository.cs
Normal file
84
ScrapperAPI/Repositories/AgentRepository.cs
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
using Dapper;
|
||||||
|
using ScrapperAPI.Dtos;
|
||||||
|
using ScrapperAPI.Interfaces;
|
||||||
|
|
||||||
|
namespace ScrapperAPI.Repositories;
|
||||||
|
|
||||||
|
/// <summary>
/// Dapper-based data access for the <c>agent</c> table.
/// Each method opens its own connection through the injected factory and disposes it on exit.
/// </summary>
public sealed class AgentRepository : IAgentRepository
{
    private const string UpsertSql = """
        insert into agent(id, display_name, cert_thumbprint, last_seen_at, is_enabled)
        values (@agentId, @displayName, @certThumbprint, now(), true)
        on conflict (id)
        do update set
        display_name = excluded.display_name,
        cert_thumbprint = excluded.cert_thumbprint,
        last_seen_at = now();
        """;

    private const string IsEnabledSql = """
        select is_enabled from agent where id = @agentId;
        """;

    private const string ThumbprintSql = """
        select cert_thumbprint from agent where id = @agentId;
        """;

    private const string TouchSql = """
        update agent set last_seen_at = now() where id = @agentId;
        """;

    private const string CountActiveSql = """
        select count(*)
        from agent
        where is_enabled = true
        and last_seen_at > now() - (@seenSeconds * interval '1 second');
        """;

    private const string GetSql = """
        select
        id as Id,
        display_name as DisplayName,
        cert_thumbprint as CertThumbprint,
        created_at as CreatedAt,
        last_seen_at as LastSeenAt,
        is_enabled as IsEnabled
        from agent
        where id = @agentId;
        """;

    private readonly IDbConnectionFactory _db;

    public AgentRepository(IDbConnectionFactory db)
    {
        _db = db;
    }

    /// <summary>
    /// Inserts the agent or, when the id already exists, overwrites its display name and
    /// certificate thumbprint and refreshes <c>last_seen_at</c>. New rows are created enabled;
    /// the update path does not touch <c>is_enabled</c>.
    /// </summary>
    public async Task UpsertAsync(string agentId, string? displayName, string certThumbprint, CancellationToken ct)
    {
        var args = new { agentId, displayName, certThumbprint };

        using var connection = await _db.CreateOpenConnectionAsync(ct);
        await connection.ExecuteAsync(Command(UpsertSql, args, ct));
    }

    /// <summary>
    /// Reads the <c>is_enabled</c> flag of the given agent.
    /// NOTE(review): for an unknown id this presumably yields <c>false</c> (Dapper scalar default) — confirm.
    /// </summary>
    public async Task<bool> IsEnabledAsync(string agentId, CancellationToken ct)
    {
        using var connection = await _db.CreateOpenConnectionAsync(ct);
        var enabled = await connection.ExecuteScalarAsync<bool>(Command(IsEnabledSql, new { agentId }, ct));
        return enabled;
    }

    /// <summary>Reads the stored certificate thumbprint of the given agent.</summary>
    public async Task<string?> GetThumbprintAsync(string agentId, CancellationToken ct)
    {
        using var connection = await _db.CreateOpenConnectionAsync(ct);
        var thumbprint = await connection.ExecuteScalarAsync<string?>(Command(ThumbprintSql, new { agentId }, ct));
        return thumbprint;
    }

    /// <summary>Sets <c>last_seen_at</c> of the given agent to the database's current time.</summary>
    public async Task TouchAsync(string agentId, CancellationToken ct)
    {
        using var connection = await _db.CreateOpenConnectionAsync(ct);
        await connection.ExecuteAsync(Command(TouchSql, new { agentId }, ct));
    }

    /// <summary>
    /// Counts enabled agents whose <c>last_seen_at</c> falls within the given window.
    /// The window is sent to the database as whole seconds (fractional part truncated).
    /// </summary>
    public async Task<int> CountActiveAsync(TimeSpan seenWithin, CancellationToken ct)
    {
        var args = new { seenSeconds = (int)seenWithin.TotalSeconds };

        using var connection = await _db.CreateOpenConnectionAsync(ct);
        var active = await connection.ExecuteScalarAsync<int>(Command(CountActiveSql, args, ct));
        return active;
    }

    /// <summary>Loads the full agent row, or <c>null</c> when no row matches the id.</summary>
    public async Task<AgentRow?> GetAsync(string agentId, CancellationToken ct)
    {
        using var connection = await _db.CreateOpenConnectionAsync(ct);
        var row = await connection.QuerySingleOrDefaultAsync<AgentRow>(Command(GetSql, new { agentId }, ct));
        return row;
    }

    // Builds a Dapper command carrying the SQL text, its parameters and the cancellation token.
    private static CommandDefinition Command(string sql, object args, CancellationToken ct)
    {
        return new CommandDefinition(sql, args, cancellationToken: ct);
    }
}
|
||||||
@ -33,6 +33,31 @@ public sealed class ContentRepository : IContentRepository
|
|||||||
}, cancellationToken: ct));
|
}, cancellationToken: ct));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Inserts a content row holding an already-compressed payload for a queue item.
/// </summary>
/// <param name="queueId">Queue item the content belongs to.</param>
/// <param name="contentEncoding">Name of the encoding applied to <paramref name="contentBytes"/>.</param>
/// <param name="contentBytes">Encoded payload, stored as given.</param>
/// <param name="originalLength">Length stored in <c>original_length</c>.</param>
/// <param name="compressedLength">Length stored in <c>compressed_length</c>.</param>
/// <param name="ct">Cancellation token passed to the database command.</param>
/// <returns>The id of the inserted content row.</returns>
public async Task<int> SaveCompressedAsync(
    int queueId,
    string contentEncoding,
    byte[] contentBytes,
    int originalLength,
    int compressedLength,
    CancellationToken ct)
{
    const string insertSql = """
        insert into content(queue_id, content_encoding, content_bytes, original_length, compressed_length)
        values (@queueId, @contentEncoding, @bytes, @origLen, @compLen)
        returning id;
        """;

    // Property names must match the @parameters in the SQL above.
    var args = new
    {
        queueId,
        contentEncoding,
        bytes = contentBytes,
        origLen = originalLength,
        compLen = compressedLength
    };

    using var connection = await _db.CreateOpenConnectionAsync(ct);
    var command = new CommandDefinition(insertSql, args, cancellationToken: ct);
    var newId = await connection.ExecuteScalarAsync<int>(command);
    return newId;
}
|
||||||
|
|
||||||
public async Task<ContentRow?> GetByQueueIdAsync(int queueId, CancellationToken ct)
|
public async Task<ContentRow?> GetByQueueIdAsync(int queueId, CancellationToken ct)
|
||||||
{
|
{
|
||||||
const string sql = """
|
const string sql = """
|
||||||
|
|||||||
@ -47,7 +47,7 @@ public sealed class ExtractedDataRepository : IExtractedDataRepository
|
|||||||
model_id as ModelId,
|
model_id as ModelId,
|
||||||
session_id as SessionId,
|
session_id as SessionId,
|
||||||
queue_id as QueueId,
|
queue_id as QueueId,
|
||||||
extracted_json::text as extracted_json,
|
extracted_json::text as extractedJson,
|
||||||
success,
|
success,
|
||||||
error,
|
error,
|
||||||
extracted_at as ExtractedAt
|
extracted_at as ExtractedAt
|
||||||
@ -71,7 +71,7 @@ public sealed class ExtractedDataRepository : IExtractedDataRepository
|
|||||||
model_id as ModelId,
|
model_id as ModelId,
|
||||||
session_id as SessionId,
|
session_id as SessionId,
|
||||||
queue_id as QueueId,
|
queue_id as QueueId,
|
||||||
extracted_json::text as extracted_json,
|
extracted_json::text as extractedJson,
|
||||||
success,
|
success,
|
||||||
error,
|
error,
|
||||||
extracted_at as ExtractedAt
|
extracted_at as ExtractedAt
|
||||||
@ -86,16 +86,7 @@ public sealed class ExtractedDataRepository : IExtractedDataRepository
|
|||||||
return row?.ToDto();
|
return row?.ToDto();
|
||||||
}
|
}
|
||||||
|
|
||||||
private sealed record RowRaw(
|
private sealed class RowRaw
|
||||||
long Id,
|
|
||||||
long RunId,
|
|
||||||
long ModelId,
|
|
||||||
int SessionId,
|
|
||||||
int QueueId,
|
|
||||||
string Extracted_Json,
|
|
||||||
bool Success,
|
|
||||||
string? Error,
|
|
||||||
DateTimeOffset ExtractedAt)
|
|
||||||
{
|
{
|
||||||
public ExtractedDataRow ToDto() => new(
|
public ExtractedDataRow ToDto() => new(
|
||||||
Id,
|
Id,
|
||||||
@ -103,10 +94,38 @@ public sealed class ExtractedDataRepository : IExtractedDataRepository
|
|||||||
ModelId,
|
ModelId,
|
||||||
SessionId,
|
SessionId,
|
||||||
QueueId,
|
QueueId,
|
||||||
JsonDocument.Parse(Extracted_Json),
|
JsonDocument.Parse(ExtractedJson ?? "{}"),
|
||||||
Success,
|
Success,
|
||||||
Error,
|
Error,
|
||||||
ExtractedAt
|
ExtractedAt
|
||||||
);
|
);
|
||||||
|
|
||||||
|
public long Id { get; init; }
|
||||||
|
public long RunId { get; init; }
|
||||||
|
public long ModelId { get; init; }
|
||||||
|
public int SessionId { get; init; }
|
||||||
|
public int QueueId { get; init; }
|
||||||
|
public string? ExtractedJson { get; init; }
|
||||||
|
public bool Success { get; init; }
|
||||||
|
public string? Error { get; init; }
|
||||||
|
public DateTimeOffset ExtractedAt { get; init; }
|
||||||
|
|
||||||
|
public RowRaw()
|
||||||
|
{
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public RowRaw(long id, long runId, long modelId, int sessionId, int queueId, string? extractedJson, bool success, string? error, DateTimeOffset extractedAt)
|
||||||
|
{
|
||||||
|
Id = id;
|
||||||
|
RunId = runId;
|
||||||
|
ModelId = modelId;
|
||||||
|
SessionId = sessionId;
|
||||||
|
QueueId = queueId;
|
||||||
|
ExtractedJson = extractedJson;
|
||||||
|
Success = success;
|
||||||
|
Error = error;
|
||||||
|
ExtractedAt = extractedAt;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -37,7 +37,7 @@ public sealed class ExtractionModelRepository : IExtractionModelRepository
|
|||||||
name,
|
name,
|
||||||
version,
|
version,
|
||||||
description,
|
description,
|
||||||
definition::text as definition_json,
|
definition::text as definitionJson,
|
||||||
created_at,
|
created_at,
|
||||||
updated_at
|
updated_at
|
||||||
from extraction_model
|
from extraction_model
|
||||||
|
|||||||
@ -41,9 +41,16 @@ public sealed class QueueRepository : IQueueRepository
|
|||||||
new CommandDefinition(sql, new { sessionId }, cancellationToken: ct));
|
new CommandDefinition(sql, new { sessionId }, cancellationToken: ct));
|
||||||
}
|
}
|
||||||
|
|
||||||
public async Task<QueueItem?> TryDequeueAsync(int sessionId, CancellationToken ct)
|
public async Task<QueueItem?> TryDequeueAsync(int sessionId, string workerId, TimeSpan leaseFor, CancellationToken ct)
|
||||||
{
|
{
|
||||||
// Importante: 1 transação + SKIP LOCKED (permite multi-worker no futuro)
|
var batch = await LeaseBatchAsync(sessionId, workerId, take: 1, leaseFor, ct);
|
||||||
|
return batch.FirstOrDefault();
|
||||||
|
}
|
||||||
|
|
||||||
|
public async Task<IReadOnlyList<QueueItem>> LeaseBatchAsync(int sessionId, string workerId, int take, TimeSpan leaseFor, CancellationToken ct)
|
||||||
|
{
|
||||||
|
if (take <= 0) return Array.Empty<QueueItem>();
|
||||||
|
|
||||||
using var conn = await _db.CreateOpenConnectionAsync(ct);
|
using var conn = await _db.CreateOpenConnectionAsync(ct);
|
||||||
using var tx = conn.BeginTransaction();
|
using var tx = conn.BeginTransaction();
|
||||||
|
|
||||||
@ -52,15 +59,20 @@ public sealed class QueueRepository : IQueueRepository
|
|||||||
select id
|
select id
|
||||||
from queue
|
from queue
|
||||||
where session_id = @sessionId
|
where session_id = @sessionId
|
||||||
and status = 0
|
and (
|
||||||
|
status = 0
|
||||||
|
or (status = 1 and lease_expires_at is not null and lease_expires_at < now())
|
||||||
|
)
|
||||||
order by id
|
order by id
|
||||||
for update skip locked
|
for update skip locked
|
||||||
limit 1
|
limit @take
|
||||||
)
|
)
|
||||||
update queue q
|
update queue q
|
||||||
set status = 1,
|
set status = 1,
|
||||||
started_date = now(),
|
started_date = coalesce(q.started_date, now()),
|
||||||
attempts = attempts + 1
|
attempts = q.attempts + 1,
|
||||||
|
leased_by = @workerId,
|
||||||
|
lease_expires_at = now() + (@leaseSeconds * interval '1 second')
|
||||||
from next
|
from next
|
||||||
where q.id = next.id
|
where q.id = next.id
|
||||||
returning
|
returning
|
||||||
@ -75,39 +87,72 @@ public sealed class QueueRepository : IQueueRepository
|
|||||||
q.last_error as LastError;
|
q.last_error as LastError;
|
||||||
""";
|
""";
|
||||||
|
|
||||||
var item = await conn.QuerySingleOrDefaultAsync<QueueItem>(
|
var rows = await conn.QueryAsync<QueueItem>(
|
||||||
new CommandDefinition(sql, new { sessionId }, transaction: tx, cancellationToken: ct));
|
new CommandDefinition(sql,
|
||||||
|
new
|
||||||
|
{
|
||||||
|
sessionId,
|
||||||
|
workerId,
|
||||||
|
take,
|
||||||
|
leaseSeconds = Math.Max(1, (int)leaseFor.TotalSeconds)
|
||||||
|
},
|
||||||
|
transaction: tx,
|
||||||
|
cancellationToken: ct));
|
||||||
|
|
||||||
tx.Commit();
|
tx.Commit();
|
||||||
return item;
|
return rows.ToList();
|
||||||
}
|
}
|
||||||
|
|
||||||
public async Task MarkDoneAsync(int queueId, CancellationToken ct)
|
/// <summary>
/// Extends the lease of a queue item that is still in status 1 and leased by <paramref name="workerId"/>.
/// </summary>
/// <param name="queueId">Queue item whose lease is renewed.</param>
/// <param name="workerId">Worker that must currently hold the lease.</param>
/// <param name="leaseFor">New lease length, counted from the database's current time.</param>
/// <param name="ct">Cancellation token passed to the database command.</param>
/// <returns><c>true</c> when a row was updated; <c>false</c> when no row matched the conditions.</returns>
public async Task<bool> RenewLeaseAsync(int queueId, string workerId, TimeSpan leaseFor, CancellationToken ct)
{
    // The last condition still accepts a lease that expired less than five minutes ago
    // (or has no expiry), provided the item is still assigned to this worker.
    const string renewSql = """
        update queue
        set lease_expires_at = now() + (@leaseSeconds * interval '1 second')
        where id = @queueId
        and status = 1
        and leased_by = @workerId
        and (lease_expires_at is null or lease_expires_at > now() - interval '5 minutes');
        """;

    // Whole seconds only, never less than one.
    var leaseSeconds = Math.Max(1, (int)leaseFor.TotalSeconds);
    var args = new { queueId, workerId, leaseSeconds };

    using var connection = await _db.CreateOpenConnectionAsync(ct);
    var affected = await connection.ExecuteAsync(new CommandDefinition(renewSql, args, cancellationToken: ct));
    return affected > 0;
}
|
||||||
|
|
||||||
|
public async Task<bool> MarkDoneAsync(int queueId, string workerId, CancellationToken ct)
|
||||||
{
|
{
|
||||||
const string sql = """
|
const string sql = """
|
||||||
update queue
|
update queue
|
||||||
set status = 2,
|
set status = 2,
|
||||||
finished_date = now(),
|
finished_date = now(),
|
||||||
last_error = null
|
last_error = null,
|
||||||
where id = @queueId;
|
lease_expires_at = null
|
||||||
|
where id = @queueId
|
||||||
|
and leased_by = @workerId;
|
||||||
""";
|
""";
|
||||||
|
|
||||||
using var conn = await _db.CreateOpenConnectionAsync(ct);
|
using var conn = await _db.CreateOpenConnectionAsync(ct);
|
||||||
await conn.ExecuteAsync(new CommandDefinition(sql, new { queueId }, cancellationToken: ct));
|
var rows = await conn.ExecuteAsync(new CommandDefinition(sql, new { queueId, workerId }, cancellationToken: ct));
|
||||||
|
return rows > 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
public async Task MarkFailedAsync(int queueId, string error, CancellationToken ct)
|
public async Task<bool> MarkFailedAsync(int queueId, string workerId, string error, CancellationToken ct)
|
||||||
{
|
{
|
||||||
const string sql = """
|
const string sql = """
|
||||||
update queue
|
update queue
|
||||||
set status = 3,
|
set status = 3,
|
||||||
finished_date = now(),
|
finished_date = now(),
|
||||||
last_error = @error
|
last_error = @error,
|
||||||
where id = @queueId;
|
lease_expires_at = null
|
||||||
|
where id = @queueId
|
||||||
|
and leased_by = @workerId;
|
||||||
""";
|
""";
|
||||||
|
|
||||||
using var conn = await _db.CreateOpenConnectionAsync(ct);
|
using var conn = await _db.CreateOpenConnectionAsync(ct);
|
||||||
await conn.ExecuteAsync(new CommandDefinition(sql, new { queueId, error }, cancellationToken: ct));
|
var rows = await conn.ExecuteAsync(new CommandDefinition(sql, new { queueId, workerId, error }, cancellationToken: ct));
|
||||||
|
return rows > 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
public async Task<int> RequeueStuckProcessingAsync(int sessionId, TimeSpan olderThan, CancellationToken ct)
|
public async Task<int> RequeueStuckProcessingAsync(int sessionId, TimeSpan olderThan, CancellationToken ct)
|
||||||
|
|||||||
@ -7,12 +7,21 @@
|
|||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="Microsoft.AspNetCore.OpenApi" Version="10.0.1" />
|
<PackageReference Include="Microsoft.AspNetCore.OpenApi" Version="10.0.1" />
|
||||||
|
<PackageReference Include="Grpc.AspNetCore" Version="2.67.0" />
|
||||||
|
<PackageReference Include="Google.Protobuf" Version="3.29.3" />
|
||||||
|
<PackageReference Include="Grpc.Tools" Version="2.69.0">
|
||||||
|
<PrivateAssets>all</PrivateAssets>
|
||||||
|
</PackageReference>
|
||||||
<PackageReference Include="Dapper" Version="2.1.66" />
|
<PackageReference Include="Dapper" Version="2.1.66" />
|
||||||
<PackageReference Include="Npgsql" Version="10.0.0" />
|
<PackageReference Include="Npgsql" Version="10.0.0" />
|
||||||
<PackageReference Include="Microsoft.AspNet.SignalR" Version="2.4.3" />
|
<PackageReference Include="Microsoft.AspNet.SignalR" Version="2.4.3" />
|
||||||
<PackageReference Include="Microsoft.AspNetCore.Authentication.JwtBearer" Version="10.0.1" />
|
<PackageReference Include="Microsoft.AspNetCore.Authentication.JwtBearer" Version="10.0.1" />
|
||||||
<PackageReference Include="AngleSharp" Version="1.3.0" />
|
<PackageReference Include="AngleSharp" Version="1.3.0" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<Protobuf Include="Protos/agent.proto" GrpcServices="Server" />
|
||||||
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<Content Include="..\.dockerignore">
|
<Content Include="..\.dockerignore">
|
||||||
<Link>.dockerignore</Link>
|
<Link>.dockerignore</Link>
|
||||||
|
|||||||
@ -16,12 +16,30 @@ create table queue(
|
|||||||
status smallint not null default 0,
|
status smallint not null default 0,
|
||||||
started_date timestamp null,
|
started_date timestamp null,
|
||||||
finished_date timestamp null,
|
finished_date timestamp null,
|
||||||
|
leased_by text null,
|
||||||
|
lease_expires_at timestamptz null,
|
||||||
attempts int not null default 0,
|
attempts int not null default 0,
|
||||||
last_error text null,
|
last_error text null,
|
||||||
created_date timestamp default now()
|
created_date timestamp default now()
|
||||||
);
|
);
|
||||||
|
|
||||||
create index idx_queue_session_status on queue(session_id, status);
|
create index idx_queue_session_status on queue(session_id, status);
|
||||||
|
create index idx_queue_lease on queue(session_id, status, lease_expires_at);
|
||||||
|
|
||||||
|
-- ------------------------------------------------------------
|
||||||
|
-- Agents (optional distributed workers)
|
||||||
|
-- ------------------------------------------------------------
|
||||||
|
|
||||||
|
drop table if exists agent;
|
||||||
|
|
||||||
|
-- One row per registered agent, keyed by the agent's own identifier.
CREATE TABLE agent (
    id              TEXT        PRIMARY KEY,
    display_name    TEXT        NULL,
    cert_thumbprint TEXT        NOT NULL,
    created_at      TIMESTAMPTZ NOT NULL DEFAULT now(),
    last_seen_at    TIMESTAMPTZ NOT NULL DEFAULT now(),  -- refreshed by the agent repository
    is_enabled      BOOLEAN     NOT NULL DEFAULT TRUE
);
|
||||||
|
|
||||||
create table content(
|
create table content(
|
||||||
id serial primary key,
|
id serial primary key,
|
||||||
|
|||||||
@ -98,13 +98,13 @@ public sealed class ExtractionCoordinator : BackgroundService, IExtractionCoordi
|
|||||||
return new ExtractionRuntimeStatus(runId, false, 0, 0, 0, 0, null);
|
return new ExtractionRuntimeStatus(runId, false, 0, 0, 0, 0, null);
|
||||||
|
|
||||||
return new ExtractionRuntimeStatus(
|
return new ExtractionRuntimeStatus(
|
||||||
RunId: r.RunId,
|
runId: r.RunId,
|
||||||
IsRunning: r.IsRunning,
|
isRunning: r.IsRunning,
|
||||||
Processed: r.Processed,
|
processed: r.Processed,
|
||||||
Total: r.Total,
|
total: r.Total,
|
||||||
Succeeded: r.Succeeded,
|
succeeded: r.Succeeded,
|
||||||
Failed: r.Failed,
|
failed: r.Failed,
|
||||||
CurrentQueueId: r.CurrentQueueId
|
currentQueueId: r.CurrentQueueId
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -194,13 +194,13 @@ public sealed class ExtractionCoordinator : BackgroundService, IExtractionCoordi
|
|||||||
using var json = _engine.Extract(html, modelRow.Definition.RootElement);
|
using var json = _engine.Extract(html, modelRow.Definition.RootElement);
|
||||||
|
|
||||||
await extracted.UpsertAsync(new UpsertExtractedDataDto(
|
await extracted.UpsertAsync(new UpsertExtractedDataDto(
|
||||||
RunId: runtime.RunId,
|
runId: runtime.RunId,
|
||||||
ModelId: runtime.ModelId,
|
modelId: runtime.ModelId,
|
||||||
SessionId: runtime.SessionId,
|
sessionId: runtime.SessionId,
|
||||||
QueueId: qid,
|
queueId: qid,
|
||||||
ExtractedJson: json,
|
extractedJson: json,
|
||||||
Success: true,
|
success: true,
|
||||||
Error: null
|
error: null
|
||||||
), hostToken);
|
), hostToken);
|
||||||
|
|
||||||
runtime.Succeeded++;
|
runtime.Succeeded++;
|
||||||
@ -210,13 +210,13 @@ public sealed class ExtractionCoordinator : BackgroundService, IExtractionCoordi
|
|||||||
using var errJson = JsonDocument.Parse("{}");
|
using var errJson = JsonDocument.Parse("{}");
|
||||||
|
|
||||||
await extracted.UpsertAsync(new UpsertExtractedDataDto(
|
await extracted.UpsertAsync(new UpsertExtractedDataDto(
|
||||||
RunId: runtime.RunId,
|
runId: runtime.RunId,
|
||||||
ModelId: runtime.ModelId,
|
modelId: runtime.ModelId,
|
||||||
SessionId: runtime.SessionId,
|
sessionId: runtime.SessionId,
|
||||||
QueueId: qid,
|
queueId: qid,
|
||||||
ExtractedJson: errJson,
|
extractedJson: errJson,
|
||||||
Success: false,
|
success: false,
|
||||||
Error: Truncate(ex.Message, 2000)
|
error: Truncate(ex.Message, 2000)
|
||||||
), hostToken);
|
), hostToken);
|
||||||
|
|
||||||
runtime.Failed++;
|
runtime.Failed++;
|
||||||
|
|||||||
@ -16,6 +16,7 @@ public sealed class ScrapeCoordinator : BackgroundService, IScrapeCoordinator
|
|||||||
private readonly IScraperHttpClient _scraperHttp;
|
private readonly IScraperHttpClient _scraperHttp;
|
||||||
private readonly IScrapeEventBus _events;
|
private readonly IScrapeEventBus _events;
|
||||||
private readonly ScraperOptions _opts;
|
private readonly ScraperOptions _opts;
|
||||||
|
private readonly WorkerOptions _workerOpts;
|
||||||
|
|
||||||
private readonly Channel<int> _startRequests = Channel.CreateUnbounded<int>(
|
private readonly Channel<int> _startRequests = Channel.CreateUnbounded<int>(
|
||||||
new UnboundedChannelOptions { SingleReader = true, SingleWriter = false });
|
new UnboundedChannelOptions { SingleReader = true, SingleWriter = false });
|
||||||
@ -30,6 +31,7 @@ public sealed class ScrapeCoordinator : BackgroundService, IScrapeCoordinator
|
|||||||
IHttpClientFactory httpClientFactory,
|
IHttpClientFactory httpClientFactory,
|
||||||
ILogger<ScrapeCoordinator> logger,
|
ILogger<ScrapeCoordinator> logger,
|
||||||
IOptions<ScraperOptions> options,
|
IOptions<ScraperOptions> options,
|
||||||
|
IOptions<WorkerOptions> workerOptions,
|
||||||
IScraperHttpClient scraperHttp,
|
IScraperHttpClient scraperHttp,
|
||||||
IScrapeEventBus events)
|
IScrapeEventBus events)
|
||||||
{
|
{
|
||||||
@ -37,6 +39,7 @@ public sealed class ScrapeCoordinator : BackgroundService, IScrapeCoordinator
|
|||||||
_httpClientFactory = httpClientFactory;
|
_httpClientFactory = httpClientFactory;
|
||||||
_logger = logger;
|
_logger = logger;
|
||||||
_opts = options.Value;
|
_opts = options.Value;
|
||||||
|
_workerOpts = workerOptions.Value;
|
||||||
_scraperHttp = scraperHttp;
|
_scraperHttp = scraperHttp;
|
||||||
_events = events;
|
_events = events;
|
||||||
}
|
}
|
||||||
@ -120,84 +123,36 @@ public sealed class ScrapeCoordinator : BackgroundService, IScrapeCoordinator
|
|||||||
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
|
if (!_workerOpts.Local.Enabled)
|
||||||
|
return;
|
||||||
|
|
||||||
var http = _httpClientFactory.CreateClient("scraper");
|
var http = _httpClientFactory.CreateClient("scraper");
|
||||||
|
|
||||||
|
// When PreferAgents: only run local if no agent was recently seen.
|
||||||
while (!hostToken.IsCancellationRequested)
|
while (!hostToken.IsCancellationRequested)
|
||||||
{
|
{
|
||||||
|
if (_workerOpts.Mode == DistributedMode.PreferAgents)
|
||||||
|
{
|
||||||
|
var noAgents = await NoAgentsRecentlySeenAsync(_workerOpts.Local.PreferAgentsGraceSeconds, hostToken);
|
||||||
|
if (!noAgents)
|
||||||
|
{
|
||||||
|
await Task.Delay(TimeSpan.FromSeconds(2), hostToken);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// STOP GRACIOSO: não pega próxima URL
|
// STOP GRACIOSO: não pega próxima URL
|
||||||
if (runner.StopRequested)
|
if (runner.StopRequested)
|
||||||
break;
|
break;
|
||||||
|
var concurrency = Math.Max(1, _workerOpts.Local.Concurrency);
|
||||||
|
var leaseFor = TimeSpan.FromSeconds(Math.Max(5, _workerOpts.LeaseSeconds));
|
||||||
|
|
||||||
// cria scope (repos scoped vivem aqui dentro)
|
var tasks = Enumerable.Range(0, concurrency)
|
||||||
using var scope = _scopeFactory.CreateScope();
|
.Select(i => RunLocalWorkerLoopAsync(runner, workerId: $"local:{Environment.MachineName}:{i}", leaseFor, hostToken))
|
||||||
var queue = scope.ServiceProvider.GetRequiredService<IQueueRepository>();
|
.ToArray();
|
||||||
var content = scope.ServiceProvider.GetRequiredService<IContentRepository>();
|
|
||||||
|
|
||||||
var item = await queue.TryDequeueAsync(runner.SessionId, hostToken);
|
await Task.WhenAll(tasks);
|
||||||
if (item is null)
|
break; // no more work (or stop requested)
|
||||||
break;
|
|
||||||
|
|
||||||
runner.SetCurrent(item.Id, item.Url);
|
|
||||||
|
|
||||||
await _events.PublishAsync(new ScrapeEvent(
|
|
||||||
ScrapeEventType.ItemStarted,
|
|
||||||
runner.SessionId,
|
|
||||||
DateTimeOffset.UtcNow,
|
|
||||||
QueueId: item.Id,
|
|
||||||
Url: item.Url
|
|
||||||
), hostToken);
|
|
||||||
|
|
||||||
try
|
|
||||||
{
|
|
||||||
var html = await _scraperHttp.GetStringWithRetryAsync(item.Url, hostToken);
|
|
||||||
|
|
||||||
await content.SaveAsync(item.Id, html, hostToken);
|
|
||||||
await queue.MarkDoneAsync(item.Id, hostToken);
|
|
||||||
|
|
||||||
await _events.PublishAsync(new ScrapeEvent(
|
|
||||||
ScrapeEventType.ItemSucceeded,
|
|
||||||
runner.SessionId,
|
|
||||||
DateTimeOffset.UtcNow,
|
|
||||||
QueueId: item.Id,
|
|
||||||
Url: item.Url
|
|
||||||
), hostToken);
|
|
||||||
}
|
|
||||||
catch (Exception ex)
|
|
||||||
{
|
|
||||||
await queue.MarkFailedAsync(item.Id, Truncate(ex.ToString(), 8000), hostToken);
|
|
||||||
|
|
||||||
await _events.PublishAsync(new ScrapeEvent(
|
|
||||||
ScrapeEventType.ItemFailed,
|
|
||||||
runner.SessionId,
|
|
||||||
DateTimeOffset.UtcNow,
|
|
||||||
QueueId: item.Id,
|
|
||||||
Url: item.Url,
|
|
||||||
Error: ex.Message
|
|
||||||
), hostToken);
|
|
||||||
}
|
|
||||||
finally
|
|
||||||
{
|
|
||||||
// progresso (snapshot do DB) + percent
|
|
||||||
var counts = await queue.GetCountsAsync(runner.SessionId, hostToken);
|
|
||||||
var percent = counts.Total == 0 ? 0 : (double)counts.Done * 100.0 / (double)counts.Total;
|
|
||||||
|
|
||||||
await _events.PublishAsync(new ScrapeEvent(
|
|
||||||
ScrapeEventType.Progress,
|
|
||||||
runner.SessionId,
|
|
||||||
DateTimeOffset.UtcNow,
|
|
||||||
Total: counts.Total,
|
|
||||||
Done: counts.Done,
|
|
||||||
Pending: counts.Pending,
|
|
||||||
Processing: counts.Processing,
|
|
||||||
Failed: counts.Failed,
|
|
||||||
Percent: percent
|
|
||||||
), hostToken);
|
|
||||||
|
|
||||||
runner.ClearCurrent();
|
|
||||||
|
|
||||||
if (!runner.StopRequested && !hostToken.IsCancellationRequested)
|
|
||||||
await PoliteDelayAsync(hostToken);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
finally
|
finally
|
||||||
@ -212,6 +167,97 @@ public sealed class ScrapeCoordinator : BackgroundService, IScrapeCoordinator
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Processes queue items for the runner's session on the local host, one at a time, until the
/// queue returns no item, a stop is requested on the runner, or the host token is cancelled.
/// </summary>
/// <param name="runner">Session state: supplies <c>SessionId</c> and <c>StopRequested</c>, and tracks the item in flight.</param>
/// <param name="workerId">Worker identity passed to the queue's dequeue, done and failed calls.</param>
/// <param name="leaseFor">Lease duration passed to <c>TryDequeueAsync</c>.</param>
/// <param name="hostToken">Host shutdown token; forwarded to every awaited call.</param>
private async Task RunLocalWorkerLoopAsync(Runner runner, string workerId, TimeSpan leaseFor, CancellationToken hostToken)
{
    while (!hostToken.IsCancellationRequested && !runner.StopRequested)
    {
        // A fresh DI scope per item; the repositories are resolved from it and disposed with it.
        using var scope = _scopeFactory.CreateScope();
        var queue = scope.ServiceProvider.GetRequiredService<IQueueRepository>();
        var content = scope.ServiceProvider.GetRequiredService<IContentRepository>();

        var item = await queue.TryDequeueAsync(runner.SessionId, workerId, leaseFor, hostToken);
        if (item is null)
            return;

        runner.SetCurrent(item.Id, item.Url);

        try
        {
            await _events.PublishAsync(new ScrapeEvent(
                ScrapeEventType.ItemStarted,
                runner.SessionId,
                DateTimeOffset.UtcNow,
                QueueId: item.Id,
                Url: item.Url
            ), hostToken);

            try
            {
                var html = await _scraperHttp.GetStringWithRetryAsync(item.Url, hostToken);

                await content.SaveAsync(item.Id, html, hostToken);
                await queue.MarkDoneAsync(item.Id, workerId, hostToken);

                await _events.PublishAsync(new ScrapeEvent(
                    ScrapeEventType.ItemSucceeded,
                    runner.SessionId,
                    DateTimeOffset.UtcNow,
                    QueueId: item.Id,
                    Url: item.Url
                ), hostToken);
            }
            // Host shutdown is not a scrape failure: let that cancellation propagate instead of
            // recording the item as failed. Timeouts (cancellation without a host stop) still fail the item.
            // NOTE(review): assumes an unfinished item becomes available again once its lease
            // expires - confirm against IQueueRepository.
            catch (Exception ex) when (ex is not OperationCanceledException || !hostToken.IsCancellationRequested)
            {
                await queue.MarkFailedAsync(item.Id, workerId, Truncate(ex.ToString(), 8000), hostToken);

                await _events.PublishAsync(new ScrapeEvent(
                    ScrapeEventType.ItemFailed,
                    runner.SessionId,
                    DateTimeOffset.UtcNow,
                    QueueId: item.Id,
                    Url: item.Url,
                    Error: ex.Message
                ), hostToken);
            }
            finally
            {
                // Progress snapshot read back from the queue, plus the completion percentage.
                var counts = await queue.GetCountsAsync(runner.SessionId, hostToken);
                var percent = counts.Total == 0 ? 0 : (double)counts.Done * 100.0 / (double)counts.Total;

                await _events.PublishAsync(new ScrapeEvent(
                    ScrapeEventType.Progress,
                    runner.SessionId,
                    DateTimeOffset.UtcNow,
                    Total: counts.Total,
                    Done: counts.Done,
                    Pending: counts.Pending,
                    Processing: counts.Processing,
                    Failed: counts.Failed,
                    Percent: percent
                ), hostToken);
            }
        }
        finally
        {
            // Always clear the in-flight marker, even when a publish or the progress query throws,
            // so the runner never keeps reporting a stale current item.
            runner.ClearCurrent();
        }

        if (!runner.StopRequested && !hostToken.IsCancellationRequested)
            await PoliteDelayAsync(hostToken);
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Reports whether the agent repository counts zero active agents within the given window.
/// </summary>
/// <param name="withinSeconds">Look-back window in seconds; values below 1 are raised to 1.</param>
/// <param name="ct">Token forwarded to the repository call.</param>
/// <returns>
/// <c>true</c> when the active count is zero, and also when the lookup throws for any reason;
/// otherwise <c>false</c>.
/// </returns>
private async Task<bool> NoAgentsRecentlySeenAsync(int withinSeconds, CancellationToken ct)
{
    try
    {
        using var lookupScope = _scopeFactory.CreateScope();
        var agentRepository = lookupScope.ServiceProvider.GetRequiredService<IAgentRepository>();

        var window = TimeSpan.FromSeconds(Math.Max(1, withinSeconds));
        var activeCount = await agentRepository.CountActiveAsync(window, ct);

        return activeCount == 0;
    }
    catch
    {
        // Best effort: if the agents table is not configured yet, behave as if no agent exists.
        return true;
    }
}
|
||||||
|
|
||||||
private static async Task<string> FetchHtmlAsync(HttpClient http, string url, CancellationToken ct)
|
private static async Task<string> FetchHtmlAsync(HttpClient http, string url, CancellationToken ct)
|
||||||
{
|
{
|
||||||
using var req = new HttpRequestMessage(HttpMethod.Get, url);
|
using var req = new HttpRequestMessage(HttpMethod.Get, url);
|
||||||
|
|||||||
@ -1,4 +1,16 @@
|
|||||||
{
|
{
|
||||||
|
"Kestrel": {
|
||||||
|
"Endpoints": {
|
||||||
|
"Http": {
|
||||||
|
"Url": "http://0.0.0.0:5123",
|
||||||
|
"Protocols": "Http1"
|
||||||
|
},
|
||||||
|
"Grpc": {
|
||||||
|
"Url": "https://0.0.0.0:5001",
|
||||||
|
"Protocols": "Http2"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
"Logging": {
|
"Logging": {
|
||||||
"LogLevel": {
|
"LogLevel": {
|
||||||
"Default": "Information",
|
"Default": "Information",
|
||||||
@ -12,5 +24,18 @@
|
|||||||
},
|
},
|
||||||
"Extraction": {
|
"Extraction": {
|
||||||
"MaxParallelRuns": 3
|
"MaxParallelRuns": 3
|
||||||
|
},
|
||||||
|
"Workers": {
|
||||||
|
"Mode": "Hybrid",
|
||||||
|
"LeaseSeconds": 60,
|
||||||
|
"Local": {
|
||||||
|
"Enabled": true,
|
||||||
|
"Concurrency": 1,
|
||||||
|
"PreferAgentsGraceSeconds": 15
|
||||||
|
},
|
||||||
|
"Agents": {
|
||||||
|
"Enabled": true,
|
||||||
|
"RequireMutualTls": false
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -23,6 +23,19 @@
|
|||||||
"Extraction": {
|
"Extraction": {
|
||||||
"MaxParallelRuns": 3
|
"MaxParallelRuns": 3
|
||||||
},
|
},
|
||||||
|
"Workers": {
|
||||||
|
"Mode": "Hybrid",
|
||||||
|
"LeaseSeconds": 120,
|
||||||
|
"Local": {
|
||||||
|
"Enabled": true,
|
||||||
|
"Concurrency": 1,
|
||||||
|
"PreferAgentsGraceSeconds": 30
|
||||||
|
},
|
||||||
|
"Agents": {
|
||||||
|
"Enabled": true,
|
||||||
|
"RequireMutualTls": true
|
||||||
|
}
|
||||||
|
},
|
||||||
"AllowedHosts": "*",
|
"AllowedHosts": "*",
|
||||||
"Authentication": {
|
"Authentication": {
|
||||||
"Authority": "https://auth.evolucao.io/application/o/web-scrapper/",
|
"Authority": "https://auth.evolucao.io/application/o/web-scrapper/",
|
||||||
|
|||||||
@ -2,6 +2,8 @@
|
|||||||
Microsoft Visual Studio Solution File, Format Version 12.00
|
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ScrapperAPI", "ScrapperAPI\ScrapperAPI.csproj", "{206F88EA-2109-4DC0-B1E1-45AA8D3D092F}"
|
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ScrapperAPI", "ScrapperAPI\ScrapperAPI.csproj", "{206F88EA-2109-4DC0-B1E1-45AA8D3D092F}"
|
||||||
EndProject
|
EndProject
|
||||||
|
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "VoyagerAgent", "VoyagerAgent\VoyagerAgent.csproj", "{29EADEEB-C9EE-483C-80EC-DFDBA98B23FE}"
|
||||||
|
EndProject
|
||||||
Global
|
Global
|
||||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
Debug|Any CPU = Debug|Any CPU
|
Debug|Any CPU = Debug|Any CPU
|
||||||
@ -12,5 +14,9 @@ Global
|
|||||||
{206F88EA-2109-4DC0-B1E1-45AA8D3D092F}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
{206F88EA-2109-4DC0-B1E1-45AA8D3D092F}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||||
{206F88EA-2109-4DC0-B1E1-45AA8D3D092F}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
{206F88EA-2109-4DC0-B1E1-45AA8D3D092F}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||||
{206F88EA-2109-4DC0-B1E1-45AA8D3D092F}.Release|Any CPU.Build.0 = Release|Any CPU
|
{206F88EA-2109-4DC0-B1E1-45AA8D3D092F}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||||
|
{29EADEEB-C9EE-483C-80EC-DFDBA98B23FE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||||
|
{29EADEEB-C9EE-483C-80EC-DFDBA98B23FE}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||||
|
{29EADEEB-C9EE-483C-80EC-DFDBA98B23FE}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||||
|
{29EADEEB-C9EE-483C-80EC-DFDBA98B23FE}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||||
EndGlobalSection
|
EndGlobalSection
|
||||||
EndGlobal
|
EndGlobal
|
||||||
|
|||||||
28
VoyagerAgent/AgentClientOptions.cs
Normal file
28
VoyagerAgent/AgentClientOptions.cs
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
namespace VoyagerAgent;
|
||||||
|
|
||||||
|
/// <summary>
/// Settings for a Voyager agent process.
/// </summary>
public sealed class AgentClientOptions
{
    /// <summary>Identifier that distinguishes this agent from the others (for example "agent-01").</summary>
    public string AgentId { get; set; } = "agent-01";

    /// <summary>Optional human-friendly name for the agent.</summary>
    public string? DisplayName { get; set; }

    /// <summary>Address of the central Voyager gRPC endpoint, e.g. "https://voyager.example.com:7443".</summary>
    public string CentralGrpcAddress { get; set; } = "https://localhost:7443";

    /// <summary>Ids of the sessions this agent pulls work from.</summary>
    public int[] SessionIds { get; set; } = [];

    /// <summary>Number of URLs requested in each lease batch.</summary>
    public int Capacity { get; set; } = 10;

    /// <summary>Path to the client certificate (PFX) used for mTLS.</summary>
    public string ClientCertificatePath { get; set; } = string.Empty;

    /// <summary>Password for the client certificate file.</summary>
    public string ClientCertificatePassword { get; set; } = string.Empty;

    /// <summary>When true, strict validation of the server certificate is skipped (development only).</summary>
    public bool InsecureSkipServerCertificateValidation { get; set; }

    /// <summary>Pause, in milliseconds, between polls when no work is available.</summary>
    public int PollDelayMs { get; set; } = 1500;
}
|
||||||
154
VoyagerAgent/AgentWorker.cs
Normal file
154
VoyagerAgent/AgentWorker.cs
Normal file
@ -0,0 +1,154 @@
|
|||||||
|
using System.IO.Compression;
|
||||||
|
using System.Text;
|
||||||
|
using Grpc.Core;
|
||||||
|
using Microsoft.Extensions.Options;
|
||||||
|
using ScrapperAPI.AgentGrpc;
|
||||||
|
|
||||||
|
namespace VoyagerAgent;
|
||||||
|
|
||||||
|
/// <summary>
/// Background service that leases URLs from the central server over gRPC, downloads each page
/// and submits the gzip-compressed HTML (or the failure message) back to the server.
/// </summary>
public sealed class AgentWorker : BackgroundService
{
    private readonly ILogger<AgentWorker> _logger;
    private readonly AgentClientOptions _opts;
    private readonly GrpcAgentClient _grpc;

    public AgentWorker(ILogger<AgentWorker> logger, IOptions<AgentClientOptions> opts, GrpcAgentClient grpc)
    {
        _logger = logger;
        _opts = opts.Value;
        _grpc = grpc;
    }

    /// <summary>
    /// Registers the agent, then loops over the configured sessions: lease a batch, scrape every
    /// leased URL, submit each result, send a heartbeat, and pause when no session had work.
    /// </summary>
    /// <param name="stoppingToken">Signalled when the host is shutting down.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (_opts.SessionIds.Length == 0)
        {
            _logger.LogWarning("No Agent:SessionIds configured. Agent will idle.");
        }

        var client = _grpc.CreateClient();

        await TryRegisterAsync(client, stoppingToken);

        using var http = new HttpClient { Timeout = TimeSpan.FromSeconds(30) };
        http.DefaultRequestHeaders.UserAgent.ParseAdd("voyager-agent/1.0");

        // Task.Delay rejects values below -1 and treats -1 as "wait forever"; clamp so a bad
        // configuration value can neither crash nor hang the polling loop.
        var pollDelay = TimeSpan.FromMilliseconds(Math.Max(0, _opts.PollDelayMs));

        while (!stoppingToken.IsCancellationRequested)
        {
            var didWork = false;

            foreach (var sessionId in _opts.SessionIds)
            {
                if (stoppingToken.IsCancellationRequested) break;

                try
                {
                    var lease = await client.LeaseWorkAsync(new LeaseWorkRequest
                    {
                        AgentId = _opts.AgentId,
                        SessionId = sessionId,
                        Capacity = _opts.Capacity
                    }, cancellationToken: stoppingToken);

                    if (lease.Items.Count == 0)
                        continue;

                    didWork = true;

                    foreach (var item in lease.Items)
                    {
                        if (stoppingToken.IsCancellationRequested) break;

                        try
                        {
                            var html = await http.GetStringAsync(item.Url, stoppingToken);
                            var compressed = GzipCompressUtf8(html, CompressionLevel.Fastest, out var origLen);

                            var submit = await client.SubmitResultAsync(new SubmitResultRequest
                            {
                                QueueId = item.QueueId,
                                AgentId = _opts.AgentId,
                                Success = true,
                                ContentEncoding = "gzip",
                                ContentBytes = Google.Protobuf.ByteString.CopyFrom(compressed),
                                OriginalLength = origLen,
                                CompressedLength = compressed.Length
                            }, cancellationToken: stoppingToken);

                            if (!submit.Ok)
                                _logger.LogWarning("SubmitResult not ok for queue {QueueId}: {Message}", item.QueueId, submit.Message);
                        }
                        catch (Exception ex) when (IsShutdownCancellation(ex, stoppingToken))
                        {
                            // The host is stopping: the interrupted item is not a scrape failure,
                            // so do not report it as one and stop working through this batch.
                            break;
                        }
                        catch (Exception ex)
                        {
                            _logger.LogError(ex, "Scrape failed for {Url}", item.Url);

                            try
                            {
                                await client.SubmitResultAsync(new SubmitResultRequest
                                {
                                    QueueId = item.QueueId,
                                    AgentId = _opts.AgentId,
                                    Success = false,
                                    Error = ex.Message
                                }, cancellationToken: stoppingToken);
                            }
                            // A shutdown cancellation raised here is left to the outer handler below.
                            catch (Exception inner) when (!IsShutdownCancellation(inner, stoppingToken))
                            {
                                _logger.LogError(inner, "Failed to submit failure status for queue {QueueId}", item.QueueId);
                            }
                        }
                    }
                }
                catch (Exception ex) when (IsShutdownCancellation(ex, stoppingToken))
                {
                    // Shutdown requested while leasing or reporting: leave quietly.
                    break;
                }
                catch (RpcException rpc)
                {
                    _logger.LogWarning("gRPC error: {Status} {Detail}", rpc.StatusCode, rpc.Status.Detail);
                }
                catch (Exception ex)
                {
                    _logger.LogError(ex, "Unhandled error while leasing work.");
                }
            }

            // heartbeat (best-effort): a failure never stops the loop, but it is no longer invisible.
            try
            {
                await client.HeartbeatAsync(new HeartbeatRequest { AgentId = _opts.AgentId }, cancellationToken: stoppingToken);
            }
            catch (Exception ex)
            {
                if (!stoppingToken.IsCancellationRequested)
                    _logger.LogDebug(ex, "Heartbeat failed; continuing.");
            }

            if (!didWork)
                await Task.Delay(pollDelay, stoppingToken);
        }
    }

    /// <summary>
    /// Announces this agent to the central server. An <see cref="RpcException"/> is logged as a
    /// warning and does not stop the worker.
    /// </summary>
    private async Task TryRegisterAsync(AgentService.AgentServiceClient client, CancellationToken ct)
    {
        try
        {
            await client.RegisterAgentAsync(new RegisterAgentRequest
            {
                AgentId = _opts.AgentId,
                DisplayName = _opts.DisplayName ?? string.Empty
            }, cancellationToken: ct);

            _logger.LogInformation("Agent registered as {AgentId}", _opts.AgentId);
        }
        catch (RpcException rpc)
        {
            _logger.LogWarning("RegisterAgent failed: {Status} {Detail}", rpc.StatusCode, rpc.Status.Detail);
        }
    }

    /// <summary>
    /// True when <paramref name="ex"/> is a cancellation (an <see cref="OperationCanceledException"/>
    /// or a gRPC call ending with <see cref="StatusCode.Cancelled"/>) and the host stop token is signalled.
    /// </summary>
    private static bool IsShutdownCancellation(Exception ex, CancellationToken stoppingToken) =>
        stoppingToken.IsCancellationRequested &&
        (ex is OperationCanceledException || ex is RpcException { StatusCode: StatusCode.Cancelled });

    /// <summary>
    /// Encodes <paramref name="content"/> as UTF-8 and gzip-compresses it.
    /// </summary>
    /// <param name="content">Text to compress.</param>
    /// <param name="level">Compression level handed to <see cref="GZipStream"/>.</param>
    /// <param name="originalLength">Receives the UTF-8 byte count before compression.</param>
    /// <returns>The gzip-compressed bytes.</returns>
    private static byte[] GzipCompressUtf8(string content, CompressionLevel level, out int originalLength)
    {
        var bytes = Encoding.UTF8.GetBytes(content);
        originalLength = bytes.Length;

        using var ms = new MemoryStream();
        // The gzip stream must be disposed before reading the buffer so its trailer is flushed.
        using (var gzip = new GZipStream(ms, level, leaveOpen: true))
        {
            gzip.Write(bytes, 0, bytes.Length);
        }
        return ms.ToArray();
    }
}
|
||||||
37
VoyagerAgent/GrpcAgentClient.cs
Normal file
37
VoyagerAgent/GrpcAgentClient.cs
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
using System.Net.Http;
|
||||||
|
using System.Security.Cryptography.X509Certificates;
|
||||||
|
using Grpc.Net.Client;
|
||||||
|
using Microsoft.Extensions.Options;
|
||||||
|
using ScrapperAPI.AgentGrpc;
|
||||||
|
|
||||||
|
namespace VoyagerAgent;
|
||||||
|
|
||||||
|
/// <summary>
/// Builds gRPC clients for the central agent service from <see cref="AgentClientOptions"/>.
/// </summary>
public sealed class GrpcAgentClient
{
    private readonly AgentClientOptions _opts;

    public GrpcAgentClient(IOptions<AgentClientOptions> options)
    {
        _opts = options.Value;
    }

    /// <summary>
    /// Creates a client over a new channel to <c>CentralGrpcAddress</c>. A client certificate is
    /// attached when <c>ClientCertificatePath</c> is set, and server certificate validation is
    /// disabled when <c>InsecureSkipServerCertificateValidation</c> is true.
    /// </summary>
    /// <returns>A client bound to a freshly created channel; every call creates a new channel.</returns>
    public AgentService.AgentServiceClient CreateClient()
    {
        var handler = new HttpClientHandler();

        if (!string.IsNullOrWhiteSpace(_opts.ClientCertificatePath))
        {
            // The X509Certificate2(path, password) constructor is obsolete (SYSLIB0057) on the
            // framework this project targets; the loader is its replacement for PFX/PKCS#12 files.
            var cert = X509CertificateLoader.LoadPkcs12FromFile(_opts.ClientCertificatePath, _opts.ClientCertificatePassword);
            handler.ClientCertificates.Add(cert);
        }

        if (_opts.InsecureSkipServerCertificateValidation)
        {
            // Accepts any server certificate - intended for development only.
            handler.ServerCertificateCustomValidationCallback = HttpClientHandler.DangerousAcceptAnyServerCertificateValidator;
        }

        var httpClient = new HttpClient(handler);
        var channel = GrpcChannel.ForAddress(_opts.CentralGrpcAddress, new GrpcChannelOptions { HttpClient = httpClient });
        return new AgentService.AgentServiceClient(channel);
    }
}
|
||||||
19
VoyagerAgent/Program.cs
Normal file
19
VoyagerAgent/Program.cs
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
using Microsoft.Extensions.DependencyInjection;
|
||||||
|
using Microsoft.Extensions.Hosting;
|
||||||
|
|
||||||
|
namespace VoyagerAgent;
|
||||||
|
|
||||||
|
/// <summary>
/// Entry point of the agent process.
/// </summary>
public static class Program
{
    /// <summary>
    /// Binds the "Agent" configuration section to <see cref="AgentClientOptions"/>, registers the
    /// gRPC client factory and the worker, then builds and runs the host.
    /// </summary>
    /// <param name="args">Command-line arguments handed to the host builder.</param>
    public static void Main(string[] args)
    {
        var appBuilder = Host.CreateApplicationBuilder(args);
        var agentSection = appBuilder.Configuration.GetSection("Agent");

        appBuilder.Services
            .Configure<AgentClientOptions>(agentSection)
            .AddSingleton<GrpcAgentClient>()
            .AddHostedService<AgentWorker>();

        appBuilder.Build().Run();
    }
}
|
||||||
30
VoyagerAgent/VoyagerAgent.csproj
Normal file
30
VoyagerAgent/VoyagerAgent.csproj
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
<Project Sdk="Microsoft.NET.Sdk.Worker">
  <PropertyGroup>
    <TargetFramework>net10.0</TargetFramework>
    <Nullable>enable</Nullable>
    <ImplicitUsings>enable</ImplicitUsings>
  </PropertyGroup>

  <ItemGroup>
    <!-- gRPC client stack; each package is listed once (duplicate PackageReference items trigger NuGet NU1504). -->
    <PackageReference Include="Grpc.Net.Client" Version="2.67.0" />
    <PackageReference Include="Google.Protobuf" Version="3.29.3" />
    <PackageReference Include="Grpc.Tools" Version="2.69.0">
      <PrivateAssets>all</PrivateAssets>
    </PackageReference>

    <!-- NOTE(review): the packages below look server-side; Microsoft.AspNet.SignalR is the classic
         ASP.NET (non-Core) SignalR package. Confirm the agent needs them before trimming. -->
    <PackageReference Include="Microsoft.AspNetCore.OpenApi" Version="10.0.1" />
    <PackageReference Include="Grpc.AspNetCore" Version="2.67.0" />
    <PackageReference Include="Dapper" Version="2.1.66" />
    <PackageReference Include="Npgsql" Version="10.0.0" />
    <PackageReference Include="Microsoft.AspNet.SignalR" Version="2.4.3" />
    <PackageReference Include="Microsoft.AspNetCore.Authentication.JwtBearer" Version="10.0.1" />
    <PackageReference Include="AngleSharp" Version="1.3.0" />
  </ItemGroup>

  <ItemGroup>
    <!-- Client stubs are generated from the proto file kept in the ScrapperAPI project. -->
    <Protobuf Include="../ScrapperAPI/Protos/agent.proto" GrpcServices="Client" />
  </ItemGroup>
</Project>
|
||||||
13
VoyagerAgent/appsettings.json
Normal file
13
VoyagerAgent/appsettings.json
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
{
|
||||||
|
"Agent": {
|
||||||
|
"AgentId": "agent-01",
|
||||||
|
"DisplayName": "Edge Worker 01",
|
||||||
|
"CentralGrpcAddress": "http://localhost:5001",
|
||||||
|
"SessionIds": [1],
|
||||||
|
"Capacity": 25,
|
||||||
|
"ClientCertificatePath": "",
|
||||||
|
"ClientCertificatePassword": "",
|
||||||
|
"InsecureSkipServerCertificateValidation": true,
|
||||||
|
"PollDelayMs": 1500
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue
Block a user