1
0
voyager-api/ScrapperAPI/Services/ExtractionEngine.cs

342 lines
12 KiB
C#

using System.Globalization;
using System.Text.Json;
using AngleSharp.Dom;
using AngleSharp.Html.Parser;
namespace ScrapperAPI.Services;
/// <summary>
/// Generic extraction engine based on CSS selectors (AngleSharp).
/// The model definition is supplied as JSON (extraction_model.definition).
/// </summary>
public sealed class ExtractionEngine
{
private readonly HtmlParser _parser = new();
/// <summary>
/// Parses an HTML string and extracts the values described by a model definition
/// into a new JSON document.
/// </summary>
/// <param name="html">HTML markup to parse with AngleSharp.</param>
/// <param name="modelDefinition">
/// JSON object with a required <c>fields</c> array and an optional string
/// <c>rootSelector</c>. When the root selector is absent, blank, or matches
/// nothing, extraction starts at the document element.
/// </param>
/// <returns>
/// A JSON object containing one property per entry of <c>fields</c>.
/// The returned <see cref="JsonDocument"/> is disposable and owned by the caller.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The model definition is not a JSON object or lacks a <c>fields</c> array.
/// </exception>
public JsonDocument Extract(string html, JsonElement modelDefinition)
{
    // Reject a malformed model before paying for HTML parsing.
    if (modelDefinition.ValueKind != JsonValueKind.Object)
        throw new InvalidOperationException("Model definition must be a JSON object.");
    if (!modelDefinition.TryGetProperty("fields", out var fields) || fields.ValueKind != JsonValueKind.Array)
        throw new InvalidOperationException("Model definition must contain an array property 'fields'.");

    // The DOM is only needed while writing: every extracted value is copied into
    // the output stream, so the document can be released when this method returns.
    using var doc = _parser.ParseDocument(html);

    IElement root = doc.DocumentElement;
    if (modelDefinition.TryGetProperty("rootSelector", out var rs)
        && rs.ValueKind == JsonValueKind.String
        && rs.GetString() is { } rootSelector
        && !string.IsNullOrWhiteSpace(rootSelector))
    {
        // A root selector that matches nothing falls back to the document element.
        root = doc.QuerySelector(rootSelector) ?? root;
    }

    using var stream = new MemoryStream();
    using (var writer = new Utf8JsonWriter(stream, new JsonWriterOptions { Indented = false }))
    {
        writer.WriteStartObject();
        foreach (var field in fields.EnumerateArray())
        {
            WriteField(writer, root, field);
        }
        writer.WriteEndObject();
    } // disposing the writer flushes the buffered JSON into the stream

    stream.Position = 0;
    return JsonDocument.Parse(stream);
}
/// <summary>
/// Writes one model field as a named JSON property, dispatching on its declared type.
/// </summary>
/// <param name="writer">Writer positioned inside a JSON object.</param>
/// <param name="context">DOM element that the field's selector is evaluated against.</param>
/// <param name="field">
/// Field definition. Requires non-blank string properties <c>key</c> and <c>type</c>;
/// <c>type</c> is matched case-insensitively. <c>object</c> and <c>array</c> are
/// structural; any other value is handled as a primitive.
/// </param>
/// <exception cref="InvalidOperationException">
/// <c>key</c> or <c>type</c> is missing, not a string, or blank.
/// </exception>
private static void WriteField(Utf8JsonWriter writer, IElement context, JsonElement field)
{
    var key = ReadString("key");
    if (string.IsNullOrWhiteSpace(key))
        throw new InvalidOperationException("Field 'key' is required.");

    var type = ReadString("type")?.ToLowerInvariant();
    if (string.IsNullOrWhiteSpace(type))
        throw new InvalidOperationException($"Field '{key}' missing 'type'.");

    writer.WritePropertyName(key);
    switch (type)
    {
        case "object":
            WriteObject(writer, context, field);
            break;
        case "array":
            WriteArray(writer, context, field);
            break;
        default:
            WritePrimitive(writer, context, field, type);
            break;
    }

    // Returns the named property when it is a JSON string; null when it is absent
    // or of another kind, so both cases reach the validation messages above
    // instead of surfacing as KeyNotFoundException.
    string? ReadString(string name) =>
        field.TryGetProperty(name, out var value) && value.ValueKind == JsonValueKind.String
            ? value.GetString()
            : null;
}
/// <summary>
/// Writes a nested JSON object for a field of type <c>object</c>.
/// </summary>
/// <param name="writer">Writer positioned where a JSON value is expected.</param>
/// <param name="context">DOM element the object's optional selector is resolved against.</param>
/// <param name="field">Field definition; must carry a <c>fields</c> array describing the members.</param>
/// <exception cref="InvalidOperationException">The definition has no <c>fields</c> array.</exception>
private static void WriteObject(Utf8JsonWriter writer, IElement context, JsonElement field)
{
    // Child fields are evaluated relative to the element chosen by the object's selector.
    var scope = ResolveContext(context, field);

    var hasMembers = field.TryGetProperty("fields", out var members)
        && members.ValueKind == JsonValueKind.Array;
    if (!hasMembers)
    {
        throw new InvalidOperationException("Object field must contain an array property 'fields'.");
    }

    writer.WriteStartObject();
    foreach (var member in members.EnumerateArray())
        WriteField(writer, scope, member);
    writer.WriteEndObject();
}
/// <summary>
/// Writes a JSON array with one entry per element matched by the field's selector.
/// </summary>
/// <param name="writer">Writer positioned where a JSON value is expected.</param>
/// <param name="context">DOM element the selector is evaluated against.</param>
/// <param name="field">
/// Array definition. A missing, non-string or blank <c>selector</c> produces an empty
/// array; otherwise <c>items</c> is required and describes each entry.
/// </param>
/// <exception cref="InvalidOperationException">A selector is present but <c>items</c> is missing.</exception>
private static void WriteArray(Utf8JsonWriter writer, IElement context, JsonElement field)
{
    string? css = null;
    if (field.TryGetProperty("selector", out var selectorProp) && selectorProp.ValueKind == JsonValueKind.String)
    {
        css = selectorProp.GetString();
    }

    // Without a usable selector there is nothing to iterate: emit [] and stop.
    if (string.IsNullOrWhiteSpace(css))
    {
        writer.WriteStartArray();
        writer.WriteEndArray();
        return;
    }

    if (!field.TryGetProperty("items", out var itemModel))
    {
        throw new InvalidOperationException("Array field must contain 'items'.");
    }

    var matches = context.QuerySelectorAll(css);

    writer.WriteStartArray();
    foreach (var match in matches)
        WriteArrayItem(writer, match, itemModel);
    writer.WriteEndArray();
}
/// <summary>
/// Writes a single array entry for one matched element.
/// </summary>
/// <param name="writer">Writer positioned inside a JSON array.</param>
/// <param name="itemContext">The matched DOM element this entry is extracted from.</param>
/// <param name="items">
/// Item definition. Requires a non-blank string <c>type</c> (case-insensitive):
/// <c>object</c> needs a <c>fields</c> array, <c>array</c> is read with the same
/// <c>selector</c>/<c>items</c> shape as a top-level array field, and any other
/// value is handled as a primitive.
/// </param>
/// <exception cref="InvalidOperationException">
/// <c>type</c> is missing, not a string or blank, or an <c>object</c> item has no <c>fields</c> array.
/// </exception>
private static void WriteArrayItem(Utf8JsonWriter writer, IElement itemContext, JsonElement items)
{
    var type = items.TryGetProperty("type", out var typeProp) && typeProp.ValueKind == JsonValueKind.String
        ? typeProp.GetString()?.ToLowerInvariant()
        : null;
    if (string.IsNullOrWhiteSpace(type))
        throw new InvalidOperationException("Array 'items.type' is required.");

    switch (type)
    {
        case "object":
            if (!items.TryGetProperty("fields", out var fields) || fields.ValueKind != JsonValueKind.Array)
                throw new InvalidOperationException("Array items of type 'object' must contain 'fields'.");
            writer.WriteStartObject();
            foreach (var sub in fields.EnumerateArray())
            {
                WriteField(writer, itemContext, sub);
            }
            writer.WriteEndObject();
            break;
        case "array":
            // Array of arrays: items.selector locates the sub-items inside each itemContext.
            // The item definition already carries the 'selector' and 'items' properties that
            // WriteArray reads, so it is passed through directly instead of being re-serialized
            // into a temporary (and previously undisposed) JsonDocument for every node.
            WriteArray(writer, itemContext, items);
            break;
        default:
            WritePrimitive(writer, itemContext, items, type);
            break;
    }
}
/// <summary>
/// Resolves, reads, transforms and writes a single scalar value.
/// </summary>
/// <param name="writer">Writer positioned where a JSON value is expected.</param>
/// <param name="context">DOM element the field's selector is evaluated against.</param>
/// <param name="field">Field definition (selector, source and transforms are read from it).</param>
/// <param name="type">
/// Declared type; callers in this class pass it lower-cased. <c>number</c>, <c>date</c>,
/// <c>boolean</c>/<c>bool</c> are parsed and written as JSON null when parsing fails;
/// <c>string</c> and any unrecognised type are written as a string.
/// </param>
private static void WritePrimitive(Utf8JsonWriter writer, IElement context, JsonElement field, string type)
{
    var text = ApplyTransforms(ReadRawValue(ResolveNode(context, field), field), field);

    // No matching node (or no value to read) becomes an explicit JSON null.
    if (text is null)
    {
        writer.WriteNullValue();
        return;
    }

    if (type == "number")
    {
        if (TryParseNumber(text, field, out var number))
            writer.WriteNumberValue(number);
        else
            writer.WriteNullValue();
    }
    else if (type == "date")
    {
        if (TryParseDate(text, field, out var isoDate))
            writer.WriteStringValue(isoDate);
        else
            writer.WriteNullValue();
    }
    else if (type is "boolean" or "bool")
    {
        if (TryParseBool(text, out var flag))
            writer.WriteBooleanValue(flag);
        else
            writer.WriteNullValue();
    }
    else
    {
        // "string" and every unknown type fall back to the plain text.
        writer.WriteStringValue(text);
    }
}
/// <summary>
/// Picks the element that nested fields are evaluated against.
/// </summary>
/// <param name="context">Current DOM element.</param>
/// <param name="field">Field definition whose optional string <c>selector</c> is applied to <paramref name="context"/>.</param>
/// <returns>
/// The first match of the selector, or <paramref name="context"/> itself when the
/// selector is absent, not a string, blank, or matches nothing.
/// </returns>
private static IElement ResolveContext(IElement context, JsonElement field)
{
    if (!field.TryGetProperty("selector", out var selectorProp) || selectorProp.ValueKind != JsonValueKind.String)
    {
        return context;
    }

    var css = selectorProp.GetString();
    if (string.IsNullOrWhiteSpace(css))
    {
        return context;
    }

    var match = context.QuerySelector(css);
    return match is null ? context : match;
}
/// <summary>
/// Finds the element a primitive value is read from.
/// </summary>
/// <param name="context">Current DOM element.</param>
/// <param name="field">Field definition whose optional string <c>selector</c> is applied to <paramref name="context"/>.</param>
/// <returns>
/// <paramref name="context"/> when the selector is absent, not a string or blank;
/// otherwise the first match of the selector, or <c>null</c> when nothing matches.
/// </returns>
private static IElement? ResolveNode(IElement context, JsonElement field)
{
    if (!field.TryGetProperty("selector", out var selectorProp) || selectorProp.ValueKind != JsonValueKind.String)
    {
        return context;
    }

    var css = selectorProp.GetString();

    // Unlike ResolveContext there is no fallback here: an unmatched selector yields null.
    return string.IsNullOrWhiteSpace(css)
        ? context
        : context.QuerySelector(css);
}
/// <summary>
/// Reads the raw string for a field from a DOM element.
/// </summary>
/// <param name="node">Element to read from; <c>null</c> yields <c>null</c>.</param>
/// <param name="field">
/// Field definition. Its optional <c>source</c> object selects what is read via
/// <c>kind</c> (case-insensitive): <c>text</c>, <c>html</c>, <c>attr</c> (with a string
/// <c>name</c>) or <c>value</c>. Without a <c>source</c> object the text content is used.
/// </param>
/// <returns>The raw value, or <c>null</c> when the node or the requested attribute is missing.</returns>
private static string? ReadRawValue(IElement? node, JsonElement field)
{
    if (node is null)
    {
        return null;
    }

    // Default source: text content.
    var hasSource = field.TryGetProperty("source", out var source) && source.ValueKind == JsonValueKind.Object;
    if (!hasSource)
    {
        return node.TextContent;
    }

    string? kind = "text";
    if (source.TryGetProperty("kind", out var kindProp) && kindProp.ValueKind == JsonValueKind.String)
    {
        kind = kindProp.GetString()?.ToLowerInvariant();
    }

    switch (kind)
    {
        case "html":
            return node.InnerHtml;

        case "attr":
            if (source.TryGetProperty("name", out var attrName) && attrName.ValueKind == JsonValueKind.String)
            {
                return node.GetAttribute(attrName.GetString()!);
            }
            return null;

        case "value":
            // Prefer the value attribute, falling back to the text content.
            return node.GetAttribute("value") ?? node.TextContent;

        default:
            // "text" and any unrecognised kind.
            return node.TextContent;
    }
}
/// <summary>
/// Applies the field's string transforms, in declaration order, to a raw value.
/// </summary>
/// <param name="raw">Value to transform; <c>null</c> is returned unchanged.</param>
/// <param name="field">
/// Field definition. Its optional <c>transforms</c> array may contain the
/// case-insensitive names <c>trim</c>, <c>lower</c>, <c>upper</c> and
/// <c>removeNonDigits</c>; non-string entries and unknown names are skipped.
/// </param>
/// <returns>The transformed value, or <paramref name="raw"/> when there is no <c>transforms</c> array.</returns>
private static string? ApplyTransforms(string? raw, JsonElement field)
{
    if (raw is null)
    {
        return null;
    }

    var hasTransforms = field.TryGetProperty("transforms", out var transforms)
        && transforms.ValueKind == JsonValueKind.Array;
    if (!hasTransforms)
    {
        return raw;
    }

    var result = raw;
    foreach (var entry in transforms.EnumerateArray())
    {
        if (entry.ValueKind != JsonValueKind.String)
        {
            continue;
        }

        var name = entry.GetString() ?? "";

        // More advanced transforms (regex/replace/etc.) can be added here later.
        result = name switch
        {
            _ when Is(name, "trim") => result.Trim(),
            _ when Is(name, "lower") => result.ToLowerInvariant(),
            _ when Is(name, "upper") => result.ToUpperInvariant(),
            _ when Is(name, "removeNonDigits") => string.Concat(result.Where(char.IsDigit)),
            _ => result,
        };
    }

    return result;

    static bool Is(string candidate, string expected) =>
        string.Equals(candidate, expected, StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Parses a decimal number using the culture selected by the field's transforms.
/// </summary>
/// <param name="raw">Text to parse.</param>
/// <param name="field">
/// Field definition. An optional transform of the form <c>number:&lt;culture&gt;</c>
/// (for example <c>number:pt-BR</c>, <c>number:de-DE</c> or <c>number:invariant</c>)
/// chooses the culture; the last recognised entry wins and unknown culture names are
/// ignored. Without such a transform the invariant culture is used.
/// </param>
/// <param name="value">The parsed number, or zero when parsing fails.</param>
/// <returns><c>true</c> when <paramref name="raw"/> was parsed with <see cref="NumberStyles.Any"/>.</returns>
private static bool TryParseNumber(string raw, JsonElement field, out decimal value)
{
    const string prefix = "number:";

    var culture = CultureInfo.InvariantCulture;
    if (field.TryGetProperty("transforms", out var transforms) && transforms.ValueKind == JsonValueKind.Array)
    {
        foreach (var t in transforms.EnumerateArray())
        {
            if (t.ValueKind != JsonValueKind.String) continue;

            var s = t.GetString() ?? "";
            if (!s.StartsWith(prefix, StringComparison.OrdinalIgnoreCase)) continue;

            var arg = s[prefix.Length..];
            if (arg.Length == 0) continue;

            if (string.Equals(arg, "invariant", StringComparison.OrdinalIgnoreCase))
            {
                culture = CultureInfo.InvariantCulture;
                continue;
            }

            try
            {
                // GetCultureInfo returns a cached, read-only culture without user
                // overrides, so parsing does not depend on the host's regional settings
                // and no CultureInfo is allocated per value. predefinedOnly rejects
                // made-up names instead of silently synthesising a culture for them.
                culture = CultureInfo.GetCultureInfo(arg, predefinedOnly: true);
            }
            catch (CultureNotFoundException)
            {
                // Unknown culture name: keep whatever culture was selected so far.
            }
        }
    }

    return decimal.TryParse(raw, NumberStyles.Any, culture, out value);
}
/// <summary>
/// Parses a date and formats it as <c>yyyy-MM-dd</c>.
/// </summary>
/// <param name="raw">Text to parse.</param>
/// <param name="field">
/// Field definition. An optional transform of the form <c>date:&lt;format&gt;</c>
/// (for example <c>date:dd/MM/yyyy</c>) requests exact parsing with that format under
/// the invariant culture; the last such entry wins. Without a format the text is
/// parsed with the invariant culture first and with pt-BR as a fallback.
/// </param>
/// <param name="iso">The formatted date, or an empty string when parsing fails.</param>
/// <returns><c>true</c> when <paramref name="raw"/> was recognised as a date.</returns>
/// <remarks>
/// Text that carries an explicit UTC offset is converted to the machine's local time by
/// <see cref="DateTime"/> parsing, so for such input the resulting calendar date can
/// depend on the host time zone.
/// </remarks>
private static bool TryParseDate(string raw, JsonElement field, out string iso)
{
    const string prefix = "date:";

    string? format = null;
    if (field.TryGetProperty("transforms", out var transforms) && transforms.ValueKind == JsonValueKind.Array)
    {
        foreach (var t in transforms.EnumerateArray())
        {
            if (t.ValueKind != JsonValueKind.String) continue;

            var s = t.GetString() ?? "";
            if (s.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
                format = s[prefix.Length..];
        }
    }

    bool parsed;
    DateTime dt;
    if (!string.IsNullOrWhiteSpace(format))
    {
        parsed = DateTime.TryParseExact(raw, format, CultureInfo.InvariantCulture, DateTimeStyles.None, out dt);
    }
    else
    {
        // Invariant first, then pt-BR. GetCultureInfo returns a cached, read-only
        // culture without user overrides, so the fallback neither allocates a
        // CultureInfo per value nor depends on the host's regional settings.
        parsed = DateTime.TryParse(raw, CultureInfo.InvariantCulture, DateTimeStyles.AssumeLocal, out dt)
            || DateTime.TryParse(raw, CultureInfo.GetCultureInfo("pt-BR"), DateTimeStyles.AssumeLocal, out dt);
    }

    iso = parsed ? dt.ToString("yyyy-MM-dd", CultureInfo.InvariantCulture) : "";
    return parsed;
}
/// <summary>
/// Interprets common English and Portuguese yes/no spellings as a boolean.
/// </summary>
/// <param name="raw">Text to interpret; surrounding whitespace and letter case are ignored.</param>
/// <param name="value">The recognised boolean, or <c>false</c> when the text is not recognised.</param>
/// <returns><c>true</c> when <paramref name="raw"/> matched one of the known spellings.</returns>
private static bool TryParseBool(string raw, out bool value)
{
    switch (raw.Trim().ToLowerInvariant())
    {
        case "true":
        case "1":
        case "yes":
        case "y":
        case "sim":
        case "s":
            value = true;
            return true;

        case "false":
        case "0":
        case "no":
        case "n":
        case "nao":
        case "não":
            value = false;
            return true;

        default:
            value = false;
            return false;
    }
}
}