1
0

Add extraction models, runs, and extracted data implementation

This commit is contained in:
Márcio Eric 2026-01-15 21:28:31 -03:00
parent 3e182baf7e
commit 1ae977b3f1
18 changed files with 1196 additions and 1 deletions

View File

@ -0,0 +1,46 @@
using System.ComponentModel.DataAnnotations;
using System.Text.Json;
using Microsoft.AspNetCore.Mvc;
using ScrapperAPI.Dtos;
using ScrapperAPI.Interfaces;
namespace ScrapperAPI.Controllers;
/// <summary>
/// Request body for POST /extraction-models.
/// Version defaults to 1; the controller normalizes non-positive versions to 1.
/// </summary>
public sealed record CreateExtractionModelRequest(
    [Required] string Name,
    [Required] JsonDocument Definition,
    int Version = 1,
    string? Description = null);
/// <summary>
/// CRUD-style endpoints for extraction model definitions.
/// </summary>
[ApiController]
[Route("extraction-models")]
public sealed class ExtractionModelsController : ControllerBase
{
    private readonly IExtractionModelRepository _models;

    public ExtractionModelsController(IExtractionModelRepository models)
    {
        _models = models;
    }

    /// <summary>Creates a new extraction model and returns 201 with its id.</summary>
    [HttpPost]
    public async Task<IActionResult> Create([FromBody] CreateExtractionModelRequest req, CancellationToken ct)
    {
        // Non-positive versions are normalized to 1.
        var version = req.Version;
        if (version <= 0)
        {
            version = 1;
        }

        var dto = new CreateExtractionModelDto(req.Name, version, req.Description, req.Definition);
        var id = await _models.CreateAsync(dto, ct);

        return Created($"/extraction-models/{id}", new { id });
    }

    /// <summary>Lists every extraction model.</summary>
    [HttpGet]
    public async Task<IActionResult> List(CancellationToken ct)
    {
        var all = await _models.GetAllAsync(ct);
        return Ok(all);
    }

    /// <summary>Fetches one extraction model by id; 404 when it does not exist.</summary>
    [HttpGet("{id:long}")]
    public async Task<IActionResult> GetById(long id, CancellationToken ct)
    {
        var row = await _models.GetByIdAsync(id, ct);
        if (row is null)
        {
            return NotFound();
        }

        return Ok(row);
    }
}

View File

@ -0,0 +1,66 @@
using Microsoft.AspNetCore.Mvc;
using ScrapperAPI.Dtos;
using ScrapperAPI.Interfaces;
namespace ScrapperAPI.Controllers;
/// <summary>
/// Endpoints for starting extraction runs and reading their results.
/// </summary>
[ApiController]
[Route("extraction-runs")]
public sealed class ExtractionRunsController : ControllerBase
{
    private readonly IExtractionCoordinator _coord;
    private readonly IExtractionRunRepository _runs;
    private readonly IExtractedDataRepository _extracted;

    public ExtractionRunsController(
        IExtractionCoordinator coord,
        IExtractionRunRepository runs,
        IExtractedDataRepository extracted)
    {
        _coord = coord;
        _runs = runs;
        _extracted = extracted;
    }

    /// <summary>
    /// Starts an extraction in the background (creates a run) and returns 202.
    /// </summary>
    [HttpPost]
    public async Task<IActionResult> Start([FromBody] StartExtractionRequest req, CancellationToken ct)
    {
        var runId = await _coord.StartRunAsync(req, ct);
        return Accepted(new { runId });
    }

    /// <summary>Returns the persisted run row plus its in-memory runtime status.</summary>
    [HttpGet("{runId:long}")]
    public async Task<IActionResult> GetRun(long runId, CancellationToken ct)
    {
        var row = await _runs.GetByIdAsync(runId, ct);
        if (row is null)
        {
            return NotFound();
        }

        var runtime = _coord.GetRuntimeStatus(runId);
        return Ok(new { run = row, runtime });
    }

    /// <summary>
    /// Lists the data extracted for a session by a given model.
    /// GET /extraction-runs/session/1/model/10
    /// </summary>
    [HttpGet("session/{sessionId:int}/model/{modelId:long}")]
    public async Task<IActionResult> ListExtracted(int sessionId, long modelId, CancellationToken ct)
    {
        var rows = await _extracted.ListBySessionAsync(sessionId, modelId, ct);
        return Ok(rows);
    }

    /// <summary>
    /// Gets the extracted JSON of one specific queue item.
    /// GET /extraction-runs/queue/123/model/10
    /// </summary>
    [HttpGet("queue/{queueId:int}/model/{modelId:long}")]
    public async Task<IActionResult> GetByQueue(int queueId, long modelId, CancellationToken ct)
    {
        var row = await _extracted.GetByQueueIdAsync(queueId, modelId, ct);
        if (row is null)
        {
            return NotFound();
        }

        return Ok(row);
    }
}

View File

@ -0,0 +1,20 @@
using System.Text.Json;
namespace ScrapperAPI.Dtos;
/// <summary>Insert payload for a new extraction_model row.</summary>
public sealed record CreateExtractionModelDto(
    string Name,
    int Version,
    string? Description,
    JsonDocument Definition
);

/// <summary>One row of the extraction_model table.</summary>
public sealed record ExtractionModelRow(
    long Id,
    string Name,
    int Version,
    string? Description,
    JsonDocument Definition,
    DateTimeOffset CreatedAt,
    DateTimeOffset UpdatedAt
);

View File

@ -0,0 +1,61 @@
using System.ComponentModel.DataAnnotations;
using System.Text.Json;
namespace ScrapperAPI.Dtos;
/// <summary>
/// Request body for POST /extraction-runs.
/// </summary>
/// <remarks>
/// FIX: the original used [Required] on non-nullable value types, which can never
/// fail model validation (the value always exists, defaulting to 0). [Range]
/// actually rejects non-positive ids.
/// </remarks>
public sealed record StartExtractionRequest(
    [Range(1, int.MaxValue)] int SessionId,
    [Range(1, double.MaxValue)] long ModelId,
    bool OnlyDone = true
);
/// <summary>Insert payload for a new extraction_run row.</summary>
public sealed record CreateExtractionRunDto(
    long ModelId,
    int SessionId
);

/// <summary>
/// One row of the extraction_run table.
/// Status values: 0=queued, 1=running, 2=done, 3=failed.
/// </summary>
public sealed record ExtractionRunRow(
    long Id,
    long ModelId,
    int SessionId,
    short Status,
    DateTimeOffset CreatedAt,
    DateTimeOffset? StartedAt,   // stamped when the run transitions to running
    DateTimeOffset? FinishedAt,  // stamped when the run finishes (done or failed)
    int Total,
    int Succeeded,
    int Failed,
    string? Error
);

/// <summary>In-memory progress snapshot of a run, served while (and after) it executes.</summary>
public sealed record ExtractionRuntimeStatus(
    long RunId,
    bool IsRunning,
    int Processed,
    int Total,
    int Succeeded,
    int Failed,
    int? CurrentQueueId
);

/// <summary>
/// Insert/update payload for one extracted_data row
/// (rows are keyed by the unique (model_id, queue_id) pair).
/// </summary>
public sealed record UpsertExtractedDataDto(
    long RunId,
    long ModelId,
    int SessionId,
    int QueueId,
    JsonDocument ExtractedJson,
    bool Success,
    string? Error
);

/// <summary>One row of the extracted_data table.</summary>
public sealed record ExtractedDataRow(
    long Id,
    long RunId,
    long ModelId,
    int SessionId,
    int QueueId,
    JsonDocument ExtractedJson,
    bool Success,
    string? Error,
    DateTimeOffset ExtractedAt
);

View File

@ -0,0 +1,10 @@
using ScrapperAPI.Dtos;
namespace ScrapperAPI.Interfaces;
/// <summary>Persistence for per-item extraction results (extracted_data table).</summary>
public interface IExtractedDataRepository
{
    /// <summary>Inserts or replaces the result for a (model, queue item) pair.</summary>
    Task UpsertAsync(UpsertExtractedDataDto dto, CancellationToken ct);

    /// <summary>Lists all extracted rows for a session and model, ordered by queue id.</summary>
    Task<IReadOnlyList<ExtractedDataRow>> ListBySessionAsync(int sessionId, long modelId, CancellationToken ct);

    /// <summary>Gets the extracted row for one queue item and model, or null.</summary>
    Task<ExtractedDataRow?> GetByQueueIdAsync(int queueId, long modelId, CancellationToken ct);
}

View File

@ -0,0 +1,16 @@
using ScrapperAPI.Dtos;
namespace ScrapperAPI.Interfaces;
/// <summary>Starts extraction runs and exposes their live progress.</summary>
public interface IExtractionCoordinator
{
    /// <summary>
    /// Creates a run and starts processing it in the background.
    /// </summary>
    Task<long> StartRunAsync(StartExtractionRequest request, CancellationToken ct);

    /// <summary>
    /// Returns real-time status (if the run is currently executing).
    /// </summary>
    ExtractionRuntimeStatus GetRuntimeStatus(long runId);
}

View File

@ -0,0 +1,10 @@
using ScrapperAPI.Dtos;
namespace ScrapperAPI.Interfaces;
/// <summary>Persistence for extraction model definitions (extraction_model table).</summary>
public interface IExtractionModelRepository
{
    /// <summary>Inserts a model and returns its generated id.</summary>
    Task<long> CreateAsync(CreateExtractionModelDto dto, CancellationToken ct);

    /// <summary>Lists every model.</summary>
    Task<IReadOnlyList<ExtractionModelRow>> GetAllAsync(CancellationToken ct);

    /// <summary>Gets one model by id, or null when not found.</summary>
    Task<ExtractionModelRow?> GetByIdAsync(long id, CancellationToken ct);
}

View File

@ -0,0 +1,12 @@
using ScrapperAPI.Dtos;
namespace ScrapperAPI.Interfaces;
/// <summary>Persistence for extraction run bookkeeping (extraction_run table).</summary>
public interface IExtractionRunRepository
{
    /// <summary>Creates a run in the queued state and returns its id.</summary>
    Task<long> CreateAsync(CreateExtractionRunDto dto, CancellationToken ct);

    /// <summary>Gets one run by id, or null when not found.</summary>
    Task<ExtractionRunRow?> GetByIdAsync(long id, CancellationToken ct);

    /// <summary>Marks the run as running and stamps started_at.</summary>
    Task MarkRunningAsync(long runId, CancellationToken ct);

    /// <summary>Marks the run as done with its final counters.</summary>
    Task MarkDoneAsync(long runId, int total, int succeeded, int failed, CancellationToken ct);

    /// <summary>Marks the run as failed and records the error text.</summary>
    Task MarkFailedAsync(long runId, string error, CancellationToken ct);
}

View File

@ -21,4 +21,9 @@ public interface IQueueRepository
Task<bool> RemovePendingByIdAsync(int sessionId, int queueId, CancellationToken ct);
Task<int> RemovePendingByUrlAsync(int sessionId, string url, CancellationToken ct);
/// <summary>
/// Lists queue ids for a session filtered by status (e.g. status=2 -> DONE).
/// A null or empty status collection returns every queue id for the session.
/// </summary>
Task<IReadOnlyList<int>> ListQueueIdsAsync(int sessionId, IReadOnlyCollection<short>? statuses, CancellationToken ct);
}

View File

@ -76,11 +76,20 @@ builder.Services.AddScoped<ISessionRepository, SessionRepository>();
builder.Services.AddScoped<IQueueRepository, QueueRepository>();
builder.Services.AddScoped<IContentRepository, ContentRepository>();
// Extraction
// Single shared engine instance. NOTE(review): its internal HtmlParser is shared
// across concurrent runs — confirm AngleSharp's parser is thread-safe for this use.
builder.Services.AddSingleton<ExtractionEngine>();
builder.Services.AddScoped<IExtractionModelRepository, ExtractionModelRepository>();
builder.Services.AddScoped<IExtractionRunRepository, ExtractionRunRepository>();
builder.Services.AddScoped<IExtractedDataRepository, ExtractedDataRepository>();
// Named client with a 30-second per-request timeout.
builder.Services.AddHttpClient("scraper", c => c.Timeout = TimeSpan.FromSeconds(30));
// Coordinators are registered once as their interface (for controllers) and again
// as hosted services (for the background loop); the factory lambda guarantees both
// registrations resolve to the same singleton instance.
builder.Services.AddSingleton<IScrapeCoordinator, ScrapeCoordinator>();
builder.Services.AddHostedService(sp => (ScrapeCoordinator)sp.GetRequiredService<IScrapeCoordinator>());
builder.Services.AddSingleton<IExtractionCoordinator, ExtractionCoordinator>();
builder.Services.AddHostedService(sp => (ExtractionCoordinator)sp.GetRequiredService<IExtractionCoordinator>());
builder.Services.AddCors(options =>
{
options.AddPolicy("AllowReact",

View File

@ -0,0 +1,112 @@
using System.Text.Json;
using Dapper;
using ScrapperAPI.Dtos;
using ScrapperAPI.Interfaces;
namespace ScrapperAPI.Repositories;
/// <summary>
/// Dapper/PostgreSQL persistence for the extracted_data table.
/// </summary>
public sealed class ExtractedDataRepository : IExtractedDataRepository
{
    private readonly IDbConnectionFactory _db;

    public ExtractedDataRepository(IDbConnectionFactory db) => _db = db;

    /// <summary>
    /// Inserts the result for one (model, queue item) pair, replacing any prior
    /// row via the unique (model_id, queue_id) key and refreshing extracted_at.
    /// </summary>
    public async Task UpsertAsync(UpsertExtractedDataDto dto, CancellationToken ct)
    {
        const string sql = """
        insert into extracted_data(run_id, model_id, session_id, queue_id, extracted_json, success, error)
        values (@runId, @modelId, @sessionId, @queueId, @json::jsonb, @success, @error)
        on conflict (model_id, queue_id)
        do update set
        run_id = excluded.run_id,
        session_id = excluded.session_id,
        extracted_json = excluded.extracted_json,
        success = excluded.success,
        error = excluded.error,
        extracted_at = now();
        """;
        using var conn = await _db.CreateOpenConnectionAsync(ct);
        await conn.ExecuteAsync(new CommandDefinition(sql, new
        {
            runId = dto.RunId,
            modelId = dto.ModelId,
            sessionId = dto.SessionId,
            queueId = dto.QueueId,
            // JSON is sent as text and cast to jsonb inside the statement.
            json = dto.ExtractedJson.RootElement.GetRawText(),
            success = dto.Success,
            error = dto.Error
        }, cancellationToken: ct));
    }

    /// <summary>Lists all extracted rows for a session and model, ordered by queue id.</summary>
    public async Task<IReadOnlyList<ExtractedDataRow>> ListBySessionAsync(int sessionId, long modelId, CancellationToken ct)
    {
        // extracted_json is cast to text so Dapper materializes it as a string;
        // RowRaw.ToDto re-parses it into a JsonDocument.
        const string sql = """
        select
        id,
        run_id as RunId,
        model_id as ModelId,
        session_id as SessionId,
        queue_id as QueueId,
        extracted_json::text as extracted_json,
        success,
        error,
        extracted_at as ExtractedAt
        from extracted_data
        where session_id = @sessionId
        and model_id = @modelId
        order by queue_id;
        """;
        using var conn = await _db.CreateOpenConnectionAsync(ct);
        var rows = await conn.QueryAsync<RowRaw>(new CommandDefinition(sql, new { sessionId, modelId }, cancellationToken: ct));
        return rows.Select(r => r.ToDto()).ToList();
    }

    /// <summary>Gets the row for one queue item and model, or null when absent.</summary>
    public async Task<ExtractedDataRow?> GetByQueueIdAsync(int queueId, long modelId, CancellationToken ct)
    {
        const string sql = """
        select
        id,
        run_id as RunId,
        model_id as ModelId,
        session_id as SessionId,
        queue_id as QueueId,
        extracted_json::text as extracted_json,
        success,
        error,
        extracted_at as ExtractedAt
        from extracted_data
        where queue_id = @queueId
        and model_id = @modelId
        limit 1;
        """;
        using var conn = await _db.CreateOpenConnectionAsync(ct);
        var row = await conn.QuerySingleOrDefaultAsync<RowRaw>(new CommandDefinition(sql, new { queueId, modelId }, cancellationToken: ct));
        return row?.ToDto();
    }

    // Intermediate row shape. The Extracted_Json parameter matches the
    // extracted_json column case-insensitively, so Dapper binds the raw JSON
    // text, which ToDto parses into the public DTO's JsonDocument.
    private sealed record RowRaw(
        long Id,
        long RunId,
        long ModelId,
        int SessionId,
        int QueueId,
        string Extracted_Json,
        bool Success,
        string? Error,
        DateTimeOffset ExtractedAt)
    {
        public ExtractedDataRow ToDto() => new(
            Id,
            RunId,
            ModelId,
            SessionId,
            QueueId,
            JsonDocument.Parse(Extracted_Json),
            Success,
            Error,
            ExtractedAt
        );
    }
}

View File

@ -0,0 +1,92 @@
using System.Text.Json;
using Dapper;
using ScrapperAPI.Dtos;
using ScrapperAPI.Interfaces;
namespace ScrapperAPI.Repositories;
/// <summary>
/// Dapper/PostgreSQL persistence for the extraction_model table.
/// </summary>
public sealed class ExtractionModelRepository : IExtractionModelRepository
{
    private readonly IDbConnectionFactory _db;

    public ExtractionModelRepository(IDbConnectionFactory db) => _db = db;

    /// <summary>Inserts a new model and returns its generated id.</summary>
    public async Task<long> CreateAsync(CreateExtractionModelDto dto, CancellationToken ct)
    {
        const string sql = """
        insert into extraction_model(name, version, description, definition)
        values (@name, @version, @description, @definition::jsonb)
        returning id;
        """;
        using var conn = await _db.CreateOpenConnectionAsync(ct);
        return await conn.ExecuteScalarAsync<long>(new CommandDefinition(sql, new
        {
            name = dto.Name,
            version = dto.Version,
            description = dto.Description,
            // JSON is sent as text and cast to jsonb inside the statement.
            definition = dto.Definition.RootElement.GetRawText()
        }, cancellationToken: ct));
    }

    /// <summary>Lists every model ordered by name, version, id.</summary>
    public async Task<IReadOnlyList<ExtractionModelRow>> GetAllAsync(CancellationToken ct)
    {
        // FIX: columns were aliased as snake_case (definition_json, created_at,
        // updated_at), which Dapper does NOT map to the PascalCase properties of
        // ModelRaw by default — DefinitionJson stayed null and ToDto crashed in
        // JsonDocument.Parse. Aliases now match the property names exactly.
        const string sql = """
        select
        id as Id,
        name as Name,
        version as Version,
        description as Description,
        definition::text as DefinitionJson,
        created_at as CreatedAt,
        updated_at as UpdatedAt
        from extraction_model
        order by name, version, id;
        """;
        using var conn = await _db.CreateOpenConnectionAsync(ct);
        var rows = await conn.QueryAsync<ModelRaw>(new CommandDefinition(sql, cancellationToken: ct));
        return rows.Select(r => r.ToDto()).ToList();
    }

    /// <summary>Fetches one model by id, or null when not found.</summary>
    public async Task<ExtractionModelRow?> GetByIdAsync(long id, CancellationToken ct)
    {
        // FIX: definition is now cast to text explicitly (consistent with
        // GetAllAsync) so Dapper always receives a string for DefinitionJson.
        const string sql = """
        select
        id as Id,
        name as Name,
        version as Version,
        description as Description,
        definition::text as DefinitionJson,
        created_at as CreatedAt,
        updated_at as UpdatedAt
        from extraction_model
        where id = @id
        limit 1;
        """;
        using var conn = await _db.CreateOpenConnectionAsync(ct);
        var row = await conn.QuerySingleOrDefaultAsync<ModelRaw>(new CommandDefinition(sql, new { id }, cancellationToken: ct));
        return row?.ToDto();
    }

    /// <summary>Mutable row shape matching the SQL aliases; converted to the public DTO.</summary>
    private class ModelRaw
    {
        public long Id { get; set; }
        public string Name { get; set; } = default!;
        public int Version { get; set; }
        public string? Description { get; set; }
        public string DefinitionJson { get; set; } = default!;
        // timestamptz columns read as DateTime. NOTE(review): the implicit
        // conversion to DateTimeOffset in ToDto assumes Kind=Utc — confirm the
        // Npgsql configuration in use returns UTC kinds here.
        public DateTime CreatedAt { get; set; }
        public DateTime UpdatedAt { get; set; }

        public ExtractionModelRow ToDto() => new(
            Id,
            Name,
            Version,
            Description,
            JsonDocument.Parse(DefinitionJson),
            CreatedAt,
            UpdatedAt
        );
    }
}

View File

@ -0,0 +1,96 @@
using Dapper;
using ScrapperAPI.Dtos;
using ScrapperAPI.Interfaces;
namespace ScrapperAPI.Repositories;
/// <summary>
/// Dapper/PostgreSQL persistence for the extraction_run table.
/// Status codes: 0=queued, 1=running, 2=done, 3=failed.
/// </summary>
public sealed class ExtractionRunRepository : IExtractionRunRepository
{
    private readonly IDbConnectionFactory _db;

    public ExtractionRunRepository(IDbConnectionFactory db) => _db = db;

    /// <summary>Inserts a run with status 0 (queued) and returns its id.</summary>
    public async Task<long> CreateAsync(CreateExtractionRunDto dto, CancellationToken ct)
    {
        const string sql = """
        insert into extraction_run(model_id, session_id, status)
        values (@modelId, @sessionId, 0)
        returning id;
        """;
        using var conn = await _db.CreateOpenConnectionAsync(ct);
        return await conn.ExecuteScalarAsync<long>(new CommandDefinition(sql, new
        {
            modelId = dto.ModelId,
            sessionId = dto.SessionId
        }, cancellationToken: ct));
    }

    /// <summary>Fetches one run by id, or null when not found.</summary>
    public async Task<ExtractionRunRow?> GetByIdAsync(long id, CancellationToken ct)
    {
        // snake_case columns are aliased to the record's property names for Dapper.
        const string sql = """
        select
        id,
        model_id as ModelId,
        session_id as SessionId,
        status,
        created_at as CreatedAt,
        started_at as StartedAt,
        finished_at as FinishedAt,
        total,
        succeeded,
        failed,
        error
        from extraction_run
        where id = @id
        limit 1;
        """;
        using var conn = await _db.CreateOpenConnectionAsync(ct);
        return await conn.QuerySingleOrDefaultAsync<ExtractionRunRow>(new CommandDefinition(sql, new { id }, cancellationToken: ct));
    }

    /// <summary>Transitions the run to running (status 1), stamping started_at and clearing any error.</summary>
    public async Task MarkRunningAsync(long runId, CancellationToken ct)
    {
        const string sql = """
        update extraction_run
        set status = 1,
        started_at = now(),
        error = null
        where id = @runId;
        """;
        using var conn = await _db.CreateOpenConnectionAsync(ct);
        await conn.ExecuteAsync(new CommandDefinition(sql, new { runId }, cancellationToken: ct));
    }

    /// <summary>Finishes the run successfully (status 2), recording the final counters.</summary>
    public async Task MarkDoneAsync(long runId, int total, int succeeded, int failed, CancellationToken ct)
    {
        const string sql = """
        update extraction_run
        set status = 2,
        finished_at = now(),
        total = @total,
        succeeded = @succeeded,
        failed = @failed,
        error = null
        where id = @runId;
        """;
        using var conn = await _db.CreateOpenConnectionAsync(ct);
        await conn.ExecuteAsync(new CommandDefinition(sql, new { runId, total, succeeded, failed }, cancellationToken: ct));
    }

    /// <summary>Marks the run as failed (status 3) and records the error text.</summary>
    public async Task MarkFailedAsync(long runId, string error, CancellationToken ct)
    {
        const string sql = """
        update extraction_run
        set status = 3,
        finished_at = now(),
        error = @error
        where id = @runId;
        """;
        using var conn = await _db.CreateOpenConnectionAsync(ct);
        await conn.ExecuteAsync(new CommandDefinition(sql, new { runId, error }, cancellationToken: ct));
    }
}

View File

@ -155,4 +155,31 @@ public sealed class QueueRepository : IQueueRepository
return await conn.ExecuteAsync(new CommandDefinition(sql, new { sessionId, url }, cancellationToken: ct));
}
/// <summary>
/// Lists queue ids for a session, optionally filtered by status
/// (e.g. statuses = [2] -> DONE). A null or empty collection returns all ids.
/// </summary>
public async Task<IReadOnlyList<int>> ListQueueIdsAsync(int sessionId, IReadOnlyCollection<short>? statuses, CancellationToken ct)
{
    // e.g. statuses = [2] -> DONE
    var statusFilter = statuses is { Count: > 0 };
    var sql = statusFilter
    ? """
    select id
    from queue
    where session_id = @sessionId
    and status = any(@statuses)
    order by id;
    """
    : """
    select id
    from queue
    where session_id = @sessionId
    order by id;
    """;
    using var conn = await _db.CreateOpenConnectionAsync(ct);
    // statuses is materialized to an array so it can be bound into any(...).
    var rows = await conn.QueryAsync<int>(new CommandDefinition(sql,
    new { sessionId, statuses = statuses?.ToArray() },
    cancellationToken: ct));
    return rows.ToList();
}
}

View File

@ -11,6 +11,7 @@
<PackageReference Include="Npgsql" Version="10.0.0" />
<PackageReference Include="Microsoft.AspNet.SignalR" Version="2.4.3" />
<PackageReference Include="Microsoft.AspNetCore.Authentication.JwtBearer" Version="10.0.1" />
<PackageReference Include="AngleSharp" Version="1.3.0" />
</ItemGroup>
<ItemGroup>
<Content Include="..\.dockerignore">

View File

@ -37,4 +37,56 @@ alter table content
add column content_encoding varchar(20) not null default 'gzip',
add column content_bytes bytea null,
add column original_length int null,
add column compressed_length int null;
add column compressed_length int null;
-- ------------------------------------------------------------
-- Extraction models + runs + extracted json
-- ------------------------------------------------------------
-- NOTE(review): drop + recreate wipes any existing extraction data if this
-- script is re-applied — confirm the migration only ever runs once.
drop table if exists extracted_data;
drop table if exists extraction_run;
drop table if exists extraction_model;

-- A reusable extraction definition (selector model stored as jsonb).
create table extraction_model (
id bigserial primary key,
name varchar(200) not null,
version int not null default 1,
description text null,
definition jsonb not null,
created_at timestamptz not null default now(),
updated_at timestamptz not null default now(),
unique(name, version)  -- one row per (name, version) pair
);

-- One execution of a model over a session's captured pages.
create table extraction_run (
id bigserial primary key,
model_id bigint not null references extraction_model(id),
session_id int not null references session(id),
status smallint not null default 0, -- 0=queued 1=running 2=done 3=failed
started_at timestamptz null,
finished_at timestamptz null,
total int not null default 0,
succeeded int not null default 0,
failed int not null default 0,
error text null,
created_at timestamptz not null default now()
);
create index idx_extraction_run_session on extraction_run(session_id);

-- The JSON extracted from a single queue item by a single model.
-- Re-running a model overwrites the row via the (model_id, queue_id) key.
create table extracted_data (
id bigserial primary key,
run_id bigint not null references extraction_run(id),
model_id bigint not null references extraction_model(id),
session_id int not null references session(id),
queue_id int not null references queue(id),
extracted_json jsonb not null,
success boolean not null default true,
error text null,
extracted_at timestamptz not null default now(),
unique(model_id, queue_id)
);
create index idx_extracted_data_session on extracted_data(session_id);
create index idx_extracted_data_queue on extracted_data(queue_id);
-- GIN index supports containment/path queries on the extracted jsonb.
create index idx_extracted_data_json on extracted_data using gin (extracted_json);

View File

@ -0,0 +1,341 @@
using System.Globalization;
using System.Text.Json;
using AngleSharp.Dom;
using AngleSharp.Html.Parser;
namespace ScrapperAPI.Services;
/// <summary>
/// Generic extraction engine driven by CSS selectors (AngleSharp).
/// The model definition comes in as JSON (extraction_model.definition):
/// an optional "rootSelector" string and a required "fields" array, where each
/// field has "key", "type" (string/number/date/boolean/object/array), an
/// optional "selector", "source" and "transforms".
/// </summary>
public sealed class ExtractionEngine
{
    private readonly HtmlParser _parser = new();

    /// <summary>
    /// Applies <paramref name="modelDefinition"/> to <paramref name="html"/> and
    /// returns the extracted values as a JSON object document (caller disposes).
    /// </summary>
    /// <exception cref="InvalidOperationException">When the definition is malformed.</exception>
    public JsonDocument Extract(string html, JsonElement modelDefinition)
    {
        // FIX: AngleSharp documents are IDisposable; the parsed document was leaked before.
        using var doc = _parser.ParseDocument(html);

        var rootSelector = modelDefinition.TryGetProperty("rootSelector", out var rs) && rs.ValueKind == JsonValueKind.String
            ? rs.GetString()
            : null;

        // Default context is the whole document; fall back to it when the root
        // selector matches nothing.
        IElement root = doc.DocumentElement;
        if (!string.IsNullOrWhiteSpace(rootSelector))
        {
            root = doc.QuerySelector(rootSelector!) ?? root;
        }

        if (!modelDefinition.TryGetProperty("fields", out var fields) || fields.ValueKind != JsonValueKind.Array)
            throw new InvalidOperationException("Model definition must contain an array property 'fields'.");

        using var stream = new MemoryStream();
        using (var writer = new Utf8JsonWriter(stream, new JsonWriterOptions { Indented = false }))
        {
            writer.WriteStartObject();
            foreach (var field in fields.EnumerateArray())
            {
                WriteField(writer, root, field);
            }
            writer.WriteEndObject();
        }

        stream.Position = 0;
        return JsonDocument.Parse(stream);
    }

    /// <summary>Writes one field (primitive, object or array) as a JSON property.</summary>
    private static void WriteField(Utf8JsonWriter writer, IElement context, JsonElement field)
    {
        var key = field.GetProperty("key").GetString();
        if (string.IsNullOrWhiteSpace(key))
            throw new InvalidOperationException("Field 'key' is required.");

        var type = field.GetProperty("type").GetString()?.ToLowerInvariant();
        if (string.IsNullOrWhiteSpace(type))
            throw new InvalidOperationException($"Field '{key}' missing 'type'.");

        writer.WritePropertyName(key);
        switch (type)
        {
            case "object":
                WriteObject(writer, context, field);
                break;
            case "array":
                WriteArray(writer, context, field);
                break;
            default:
                WritePrimitive(writer, context, field, type);
                break;
        }
    }

    /// <summary>Writes a nested object, optionally narrowing the element context via 'selector'.</summary>
    private static void WriteObject(Utf8JsonWriter writer, IElement context, JsonElement field)
    {
        var objContext = ResolveContext(context, field);
        if (!field.TryGetProperty("fields", out var fields) || fields.ValueKind != JsonValueKind.Array)
            throw new InvalidOperationException("Object field must contain an array property 'fields'.");

        writer.WriteStartObject();
        foreach (var sub in fields.EnumerateArray())
        {
            WriteField(writer, objContext, sub);
        }
        writer.WriteEndObject();
    }

    /// <summary>
    /// Writes an array by selecting every node matching 'selector' and emitting
    /// one item per node according to 'items'. A missing selector yields [].
    /// </summary>
    private static void WriteArray(Utf8JsonWriter writer, IElement context, JsonElement field)
    {
        var selector = field.TryGetProperty("selector", out var s) && s.ValueKind == JsonValueKind.String
            ? s.GetString()
            : null;

        if (string.IsNullOrWhiteSpace(selector))
        {
            writer.WriteStartArray();
            writer.WriteEndArray();
            return;
        }

        if (!field.TryGetProperty("items", out var items))
            throw new InvalidOperationException("Array field must contain 'items'.");

        var nodes = context.QuerySelectorAll(selector!);
        writer.WriteStartArray();
        foreach (var node in nodes)
        {
            WriteArrayItem(writer, node, items);
        }
        writer.WriteEndArray();
    }

    /// <summary>Writes one array item (object, nested array, or primitive).</summary>
    private static void WriteArrayItem(Utf8JsonWriter writer, IElement itemContext, JsonElement items)
    {
        var type = items.GetProperty("type").GetString()?.ToLowerInvariant();
        if (string.IsNullOrWhiteSpace(type))
            throw new InvalidOperationException("Array 'items.type' is required.");

        switch (type)
        {
            case "object":
                if (!items.TryGetProperty("fields", out var fields) || fields.ValueKind != JsonValueKind.Array)
                    throw new InvalidOperationException("Array items of type 'object' must contain 'fields'.");
                writer.WriteStartObject();
                foreach (var sub in fields.EnumerateArray())
                {
                    WriteField(writer, itemContext, sub);
                }
                writer.WriteEndObject();
                break;
            case "array":
                // Array-of-arrays: items.selector locates the sub-items inside
                // each itemContext.
                // FIX: validate the nested properties explicitly (previously an
                // unchecked GetProperty could throw KeyNotFoundException) and
                // dispose the synthesized JsonDocument, which was leaked before.
                if (!items.TryGetProperty("selector", out var subSelector) || subSelector.ValueKind != JsonValueKind.String)
                    throw new InvalidOperationException("Array items of type 'array' must contain a string 'selector'.");
                if (!items.TryGetProperty("items", out var subItems))
                    throw new InvalidOperationException("Array items of type 'array' must contain 'items'.");
                using (var tmpDoc = JsonDocument.Parse(
                    $"{{\"type\":\"array\",\"selector\":{JsonSerializer.Serialize(subSelector.GetString())},\"items\":{subItems.GetRawText()}}}"))
                {
                    WriteArray(writer, itemContext, tmpDoc.RootElement);
                }
                break;
            default:
                WritePrimitive(writer, itemContext, items, type);
                break;
        }
    }

    /// <summary>Resolves the node, reads its raw string, applies transforms and writes a typed value.</summary>
    private static void WritePrimitive(Utf8JsonWriter writer, IElement context, JsonElement field, string type)
    {
        var node = ResolveNode(context, field);
        var raw = ReadRawValue(node, field);
        raw = ApplyTransforms(raw, field);

        if (raw is null)
        {
            writer.WriteNullValue();
            return;
        }

        switch (type)
        {
            case "string":
                writer.WriteStringValue(raw);
                break;
            case "number":
                if (TryParseNumber(raw, field, out var dec))
                    writer.WriteNumberValue(dec);
                else
                    writer.WriteNullValue();
                break;
            case "date":
                if (TryParseDate(raw, field, out var date))
                    writer.WriteStringValue(date);
                else
                    writer.WriteNullValue();
                break;
            case "boolean":
            case "bool":
                if (TryParseBool(raw, out var b))
                    writer.WriteBooleanValue(b);
                else
                    writer.WriteNullValue();
                break;
            default:
                // Unknown types fall back to plain string output.
                writer.WriteStringValue(raw);
                break;
        }
    }

    /// <summary>Narrows the context via 'selector'; keeps the current context when absent or unmatched.</summary>
    private static IElement ResolveContext(IElement context, JsonElement field)
    {
        var selector = field.TryGetProperty("selector", out var s) && s.ValueKind == JsonValueKind.String
            ? s.GetString()
            : null;
        if (string.IsNullOrWhiteSpace(selector))
            return context;
        return context.QuerySelector(selector!) ?? context;
    }

    /// <summary>
    /// Finds the element a primitive reads from. Unlike ResolveContext, an
    /// unmatched selector yields null here (so the field is written as JSON null).
    /// </summary>
    private static IElement? ResolveNode(IElement context, JsonElement field)
    {
        var selector = field.TryGetProperty("selector", out var s) && s.ValueKind == JsonValueKind.String
            ? s.GetString()
            : null;
        if (string.IsNullOrWhiteSpace(selector))
            return context;
        return context.QuerySelector(selector!);
    }

    /// <summary>
    /// Reads the raw string for a node according to 'source.kind':
    /// text (default), html, attr (with 'name'), or value (value attribute, falling back to text).
    /// </summary>
    private static string? ReadRawValue(IElement? node, JsonElement field)
    {
        if (node is null) return null;

        // Default source: text content.
        if (!field.TryGetProperty("source", out var source) || source.ValueKind != JsonValueKind.Object)
            return node.TextContent;

        var kind = source.TryGetProperty("kind", out var k) && k.ValueKind == JsonValueKind.String
            ? k.GetString()?.ToLowerInvariant()
            : "text";

        return kind switch
        {
            "text" => node.TextContent,
            "html" => node.InnerHtml,
            "attr" => source.TryGetProperty("name", out var n) && n.ValueKind == JsonValueKind.String
                ? node.GetAttribute(n.GetString()!)
                : null,
            "value" => node.GetAttribute("value") ?? node.TextContent,
            _ => node.TextContent
        };
    }

    /// <summary>Applies the 'transforms' list in order: trim, lower, upper, removeNonDigits.</summary>
    private static string? ApplyTransforms(string? raw, JsonElement field)
    {
        if (raw is null) return null;
        if (!field.TryGetProperty("transforms", out var transforms) || transforms.ValueKind != JsonValueKind.Array)
            return raw;

        var current = raw;
        foreach (var t in transforms.EnumerateArray())
        {
            if (t.ValueKind != JsonValueKind.String) continue;
            var tr = t.GetString() ?? "";
            if (string.Equals(tr, "trim", StringComparison.OrdinalIgnoreCase))
                current = current.Trim();
            else if (string.Equals(tr, "lower", StringComparison.OrdinalIgnoreCase))
                current = current.ToLowerInvariant();
            else if (string.Equals(tr, "upper", StringComparison.OrdinalIgnoreCase))
                current = current.ToUpperInvariant();
            else if (string.Equals(tr, "removeNonDigits", StringComparison.OrdinalIgnoreCase))
                current = new string(current.Where(char.IsDigit).ToArray());
            // More advanced transforms (regex/replace/etc.) can be added later.
        }
        return current;
    }

    /// <summary>
    /// Parses a decimal. The optional transform "number:pt-BR" or
    /// "number:invariant" selects the culture (invariant by default).
    /// </summary>
    private static bool TryParseNumber(string raw, JsonElement field, out decimal value)
    {
        var culture = CultureInfo.InvariantCulture;
        if (field.TryGetProperty("transforms", out var transforms) && transforms.ValueKind == JsonValueKind.Array)
        {
            foreach (var t in transforms.EnumerateArray())
            {
                if (t.ValueKind != JsonValueKind.String) continue;
                var s = t.GetString() ?? "";
                if (s.StartsWith("number:", StringComparison.OrdinalIgnoreCase))
                {
                    var arg = s.Substring("number:".Length);
                    if (string.Equals(arg, "pt-BR", StringComparison.OrdinalIgnoreCase))
                        culture = new CultureInfo("pt-BR");
                    else if (string.Equals(arg, "invariant", StringComparison.OrdinalIgnoreCase))
                        culture = CultureInfo.InvariantCulture;
                }
            }
        }
        return decimal.TryParse(raw, NumberStyles.Any, culture, out value);
    }

    /// <summary>
    /// Parses a date and normalizes it to ISO "yyyy-MM-dd". An optional
    /// "date:&lt;format&gt;" transform forces an exact format; otherwise invariant
    /// parsing is tried first, then pt-BR.
    /// </summary>
    private static bool TryParseDate(string raw, JsonElement field, out string iso)
    {
        // Optional transform: "date:dd/MM/yyyy" etc.
        string? format = null;
        if (field.TryGetProperty("transforms", out var transforms) && transforms.ValueKind == JsonValueKind.Array)
        {
            foreach (var t in transforms.EnumerateArray())
            {
                if (t.ValueKind != JsonValueKind.String) continue;
                var s = t.GetString() ?? "";
                if (s.StartsWith("date:", StringComparison.OrdinalIgnoreCase))
                    format = s.Substring("date:".Length);
            }
        }

        DateTime dt;
        if (!string.IsNullOrWhiteSpace(format))
        {
            if (!DateTime.TryParseExact(raw, format, CultureInfo.InvariantCulture, DateTimeStyles.None, out dt))
            {
                iso = "";
                return false;
            }
        }
        else
        {
            if (!DateTime.TryParse(raw, CultureInfo.InvariantCulture, DateTimeStyles.AssumeLocal, out dt))
            {
                // Fall back to pt-BR formats (e.g. dd/MM/yyyy).
                if (!DateTime.TryParse(raw, new CultureInfo("pt-BR"), DateTimeStyles.AssumeLocal, out dt))
                {
                    iso = "";
                    return false;
                }
            }
        }

        iso = dt.ToString("yyyy-MM-dd", CultureInfo.InvariantCulture);
        return true;
    }

    /// <summary>Parses common truthy/falsy tokens, including Portuguese "sim"/"não".</summary>
    private static bool TryParseBool(string raw, out bool value)
    {
        var s = raw.Trim().ToLowerInvariant();
        if (s is "true" or "1" or "yes" or "y" or "sim" or "s") { value = true; return true; }
        if (s is "false" or "0" or "no" or "n" or "nao" or "não") { value = false; return true; }
        value = false;
        return false;
    }
}

View File

@ -0,0 +1,219 @@
using System.Collections.Concurrent;
using System.Threading.Channels;
using System.Text.Json;
using Microsoft.Extensions.Logging;
using ScrapperAPI.Dtos;
using ScrapperAPI.Interfaces;
using ScrapperAPI.Services;
using ScrapperAPI.Utils;
namespace ScrapperAPI.Workers;
/// <summary>
/// Processes extraction runs (models) over HTML already captured
/// (content.content_bytes, gzip). Exposes IExtractionCoordinator to start runs
/// and query live progress, and runs as a hosted BackgroundService that drains
/// a channel of queued run ids.
/// </summary>
public sealed class ExtractionCoordinator : BackgroundService, IExtractionCoordinator
{
    private readonly IServiceScopeFactory _scopeFactory;
    private readonly ILogger<ExtractionCoordinator> _logger;
    private readonly ExtractionEngine _engine;

    // Unbounded queue of run ids awaiting processing. SingleReader: only the
    // ExecuteAsync loop reads; SingleWriter=false: any request thread may enqueue.
    private readonly Channel<long> _startRequests = Channel.CreateUnbounded<long>(
        new UnboundedChannelOptions { SingleReader = true, SingleWriter = false });

    // Runtime state per run id. NOTE(review): entries are never evicted, so this
    // dictionary grows for the lifetime of the process (it doubles as the
    // post-completion status cache) — confirm that is intended.
    private readonly ConcurrentDictionary<long, Runtime> _running = new();

    public ExtractionCoordinator(
        IServiceScopeFactory scopeFactory,
        ILogger<ExtractionCoordinator> logger,
        ExtractionEngine engine)
    {
        _scopeFactory = scopeFactory;
        _logger = logger;
        _engine = engine;
    }

    /// <summary>
    /// Persists a run row (status=queued), registers its in-memory runtime state
    /// and queues it for the background loop. Returns the new run id immediately.
    /// </summary>
    public async Task<long> StartRunAsync(StartExtractionRequest request, CancellationToken ct)
    {
        // Repositories are scoped services, so a scope is created for this call.
        using var scope = _scopeFactory.CreateScope();
        var runs = scope.ServiceProvider.GetRequiredService<IExtractionRunRepository>();
        var runId = await runs.CreateAsync(new CreateExtractionRunDto(request.ModelId, request.SessionId), ct);
        _running[runId] = new Runtime(runId, request.SessionId, request.ModelId, request.OnlyDone);
        await _startRequests.Writer.WriteAsync(runId, ct);
        return runId;
    }

    /// <summary>
    /// Returns the current in-memory progress for a run; a default "not running"
    /// snapshot when the run id is unknown (e.g. after a process restart).
    /// NOTE(review): the counters are plain int fields written by the worker,
    /// so values read here may be momentarily stale.
    /// </summary>
    public ExtractionRuntimeStatus GetRuntimeStatus(long runId)
    {
        if (!_running.TryGetValue(runId, out var r))
            return new ExtractionRuntimeStatus(runId, false, 0, 0, 0, 0, null);
        return new ExtractionRuntimeStatus(
            RunId: r.RunId,
            IsRunning: r.IsRunning,
            Processed: r.Processed,
            Total: r.Total,
            Succeeded: r.Succeeded,
            Failed: r.Failed,
            CurrentQueueId: r.CurrentQueueId
        );
    }

    /// <summary>
    /// Background loop: reads queued run ids and dispatches each run without
    /// awaiting it (fire-and-forget; RunOnceAsync handles its own failures).
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation("ExtractionCoordinator started.");
        while (!stoppingToken.IsCancellationRequested)
        {
            long runId;
            try
            {
                runId = await _startRequests.Reader.ReadAsync(stoppingToken);
            }
            catch (OperationCanceledException) { break; }
            if (!_running.TryGetValue(runId, out var runtime))
                continue;
            // Fire-and-forget so one long run does not block newly queued runs.
            _ = RunOnceAsync(runtime, stoppingToken);
        }
    }

    /// <summary>
    /// Executes one run end-to-end: loads the model, lists the session's queue
    /// items, extracts each item's stored HTML and upserts the result, then marks
    /// the run done (or failed) in the database.
    /// </summary>
    private async Task RunOnceAsync(Runtime runtime, CancellationToken hostToken)
    {
        // Guard against the same run being processed twice concurrently.
        if (!runtime.TryEnter())
            return;
        runtime.IsRunning = true;
        try
        {
            using var scope = _scopeFactory.CreateScope();
            var models = scope.ServiceProvider.GetRequiredService<IExtractionModelRepository>();
            var runs = scope.ServiceProvider.GetRequiredService<IExtractionRunRepository>();
            var queue = scope.ServiceProvider.GetRequiredService<IQueueRepository>();
            var content = scope.ServiceProvider.GetRequiredService<IContentRepository>();
            var extracted = scope.ServiceProvider.GetRequiredService<IExtractedDataRepository>();
            var modelRow = await models.GetByIdAsync(runtime.ModelId, hostToken);
            if (modelRow is null)
            {
                await runs.MarkFailedAsync(runtime.RunId, $"Model not found: {runtime.ModelId}", hostToken);
                return;
            }
            await runs.MarkRunningAsync(runtime.RunId, hostToken);
            // OnlyDone limits extraction to queue items with status 2 (DONE).
            var statuses = runtime.OnlyDone ? new short[] { 2 } : null;
            var queueIds = await queue.ListQueueIdsAsync(runtime.SessionId, statuses, hostToken);
            runtime.Total = queueIds.Count;
            foreach (var qid in queueIds)
            {
                if (hostToken.IsCancellationRequested) break;
                runtime.CurrentQueueId = qid;
                runtime.Processed++;
                try
                {
                    // Stored content must be present and gzip-encoded.
                    var row = await content.GetCompressedByQueueIdAsync(qid, hostToken);
                    if (row is null || row.ContentBytes is null || row.ContentBytes.Length == 0)
                        throw new InvalidOperationException("Content not found");
                    if (!string.Equals(row.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
                        throw new InvalidOperationException($"Unsupported encoding: {row.ContentEncoding}");
                    var html = CompressionUtils.GzipDecompressUtf8(row.ContentBytes);
                    using var json = _engine.Extract(html, modelRow.Definition.RootElement);
                    await extracted.UpsertAsync(new UpsertExtractedDataDto(
                        RunId: runtime.RunId,
                        ModelId: runtime.ModelId,
                        SessionId: runtime.SessionId,
                        QueueId: qid,
                        ExtractedJson: json,
                        Success: true,
                        Error: null
                    ), hostToken);
                    runtime.Succeeded++;
                }
                catch (Exception ex)
                {
                    // Per-item failures are recorded (empty JSON + truncated error
                    // text) and do not abort the rest of the run.
                    using var errJson = JsonDocument.Parse("{}");
                    await extracted.UpsertAsync(new UpsertExtractedDataDto(
                        RunId: runtime.RunId,
                        ModelId: runtime.ModelId,
                        SessionId: runtime.SessionId,
                        QueueId: qid,
                        ExtractedJson: errJson,
                        Success: false,
                        Error: Truncate(ex.Message, 2000)
                    ), hostToken);
                    runtime.Failed++;
                }
                finally
                {
                    runtime.CurrentQueueId = null;
                }
            }
            await runs.MarkDoneAsync(runtime.RunId, runtime.Total, runtime.Succeeded, runtime.Failed, hostToken);
        }
        catch (Exception ex)
        {
            try
            {
                // A fresh scope is needed: the one above was disposed while the
                // exception unwound out of the try block.
                using var scope = _scopeFactory.CreateScope();
                var runs = scope.ServiceProvider.GetRequiredService<IExtractionRunRepository>();
                await runs.MarkFailedAsync(runtime.RunId, Truncate(ex.ToString(), 8000), hostToken);
            }
            catch
            {
                // ignore double-fault
            }
        }
        finally
        {
            runtime.IsRunning = false;
            runtime.Exit();
        }
    }

    /// <summary>Caps a string at <paramref name="max"/> characters (for db error columns).</summary>
    private static string Truncate(string s, int max) => s.Length <= max ? s : s[..max];

    /// <summary>Mutable in-memory progress for one run; ownership guarded by TryEnter/Exit.</summary>
    private sealed class Runtime
    {
        // 0 = idle, 1 = a worker owns this run (set via Interlocked).
        private int _entered;
        public long RunId { get; }
        public int SessionId { get; }
        public long ModelId { get; }
        // When true, only queue items with status 2 (DONE) are extracted.
        public bool OnlyDone { get; }
        public bool IsRunning { get; set; }
        public int Total { get; set; }
        public int Processed { get; set; }
        public int Succeeded { get; set; }
        public int Failed { get; set; }
        public int? CurrentQueueId { get; set; }

        public Runtime(long runId, int sessionId, long modelId, bool onlyDone)
        {
            RunId = runId;
            SessionId = sessionId;
            ModelId = modelId;
            OnlyDone = onlyDone;
        }

        /// <summary>Atomically claims this run; false when already claimed.</summary>
        public bool TryEnter() => Interlocked.CompareExchange(ref _entered, 1, 0) == 0;

        /// <summary>Releases the claim taken by TryEnter.</summary>
        public void Exit() => Interlocked.Exchange(ref _entered, 0);
    }
}