342 lines
12 KiB
C#
342 lines
12 KiB
C#
using System.Globalization;
|
|
using System.Text.Json;
|
|
using AngleSharp.Dom;
|
|
using AngleSharp.Html.Parser;
|
|
|
|
namespace ScrapperAPI.Services;
|
|
|
|
/// <summary>
/// Generic extraction engine based on CSS selectors (AngleSharp).
/// The model definition is supplied as JSON (extraction_model.definition).
/// </summary>
|
|
public sealed class ExtractionEngine
|
|
{
|
|
private readonly HtmlParser _parser = new();
|
|
|
|
/// <summary>
/// Parses <paramref name="html"/> and extracts the values described by
/// <paramref name="modelDefinition"/> into a new JSON document.
/// </summary>
/// <param name="html">HTML markup to parse.</param>
/// <param name="modelDefinition">
/// JSON object with a required <c>fields</c> array and an optional <c>rootSelector</c>
/// string. When the root selector is absent, blank, or matches nothing, the document
/// element is used as the extraction root.
/// </param>
/// <returns>
/// A <see cref="JsonDocument"/> whose root object has one property per top-level field.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
/// <exception cref="InvalidOperationException">
/// The definition does not contain an array property <c>fields</c>.
/// </exception>
public JsonDocument Extract(string html, JsonElement modelDefinition)
{
    ArgumentNullException.ThrowIfNull(html);

    // The DOM is only needed while the output is being written, so release it
    // deterministically instead of leaving it to the garbage collector.
    using var doc = _parser.ParseDocument(html);

    var rootSelector = modelDefinition.TryGetProperty("rootSelector", out var rs) && rs.ValueKind == JsonValueKind.String
        ? rs.GetString()
        : null;

    IElement root = doc.DocumentElement;
    if (!string.IsNullOrWhiteSpace(rootSelector))
    {
        // An unmatched root selector falls back to the whole document.
        root = doc.QuerySelector(rootSelector) ?? root;
    }

    if (!modelDefinition.TryGetProperty("fields", out var fields) || fields.ValueKind != JsonValueKind.Array)
    {
        throw new InvalidOperationException("Model definition must contain an array property 'fields'.");
    }

    using var stream = new MemoryStream();

    // The writer is scoped to its own block so it is flushed (on dispose)
    // before the stream is rewound and parsed.
    using (var writer = new Utf8JsonWriter(stream, new JsonWriterOptions { Indented = false }))
    {
        writer.WriteStartObject();

        foreach (var field in fields.EnumerateArray())
        {
            WriteField(writer, root, field);
        }

        writer.WriteEndObject();
    }

    stream.Position = 0;
    return JsonDocument.Parse(stream);
}
|
|
|
|
/// <summary>
/// Writes one field (property name plus value) of the model to <paramref name="writer"/>.
/// </summary>
/// <param name="writer">Destination JSON writer, positioned inside an object.</param>
/// <param name="context">Element that the field's selector is evaluated against.</param>
/// <param name="field">
/// Field definition: a JSON object with string properties <c>key</c> and <c>type</c>.
/// The type is matched case-insensitively; <c>object</c> and <c>array</c> are handled
/// structurally and every other type is written as a primitive.
/// </param>
/// <exception cref="InvalidOperationException">
/// The definition is not a JSON object, or <c>key</c> / <c>type</c> is missing,
/// not a string, or blank.
/// </exception>
private static void WriteField(Utf8JsonWriter writer, IElement context, JsonElement field)
{
    if (field.ValueKind != JsonValueKind.Object)
    {
        throw new InvalidOperationException("Field definition must be a JSON object.");
    }

    // TryGetProperty (rather than GetProperty) so a missing property surfaces as the
    // descriptive InvalidOperationException below instead of a KeyNotFoundException.
    var key = field.TryGetProperty("key", out var keyProp) && keyProp.ValueKind == JsonValueKind.String
        ? keyProp.GetString()
        : null;
    if (string.IsNullOrWhiteSpace(key))
    {
        throw new InvalidOperationException("Field 'key' is required.");
    }

    var type = field.TryGetProperty("type", out var typeProp) && typeProp.ValueKind == JsonValueKind.String
        ? typeProp.GetString()?.ToLowerInvariant()
        : null;
    if (string.IsNullOrWhiteSpace(type))
    {
        throw new InvalidOperationException($"Field '{key}' missing 'type'.");
    }

    writer.WritePropertyName(key);

    switch (type)
    {
        case "object":
            WriteObject(writer, context, field);
            break;
        case "array":
            WriteArray(writer, context, field);
            break;
        default:
            WritePrimitive(writer, context, field, type);
            break;
    }
}
|
|
|
|
/// <summary>
/// Writes a nested JSON object whose properties come from the <c>fields</c> array of
/// <paramref name="field"/>.
/// </summary>
/// <param name="writer">Destination JSON writer, positioned where a value is expected.</param>
/// <param name="context">Parent element; the object's own scope is resolved from it via <c>ResolveContext</c>.</param>
/// <param name="field">Object field definition; must contain an array property <c>fields</c>.</param>
/// <exception cref="InvalidOperationException"><c>fields</c> is missing or is not an array.</exception>
private static void WriteObject(Utf8JsonWriter writer, IElement context, JsonElement field)
{
    // The scope is resolved before the definition is validated.
    var scope = ResolveContext(context, field);

    var hasChildren = field.TryGetProperty("fields", out var children);
    if (!hasChildren || children.ValueKind != JsonValueKind.Array)
    {
        throw new InvalidOperationException("Object field must contain an array property 'fields'.");
    }

    writer.WriteStartObject();

    foreach (var child in children.EnumerateArray())
    {
        WriteField(writer, scope, child);
    }

    writer.WriteEndObject();
}
|
|
|
|
/// <summary>
/// Writes a JSON array with one entry per element matched by the field's
/// <c>selector</c> inside <paramref name="context"/>.
/// </summary>
/// <param name="writer">Destination JSON writer, positioned where a value is expected.</param>
/// <param name="context">Element the selector is evaluated against.</param>
/// <param name="field">
/// Array definition. A missing, non-string or blank <c>selector</c> produces an empty
/// array; otherwise <c>items</c> describes how each matched element is written.
/// </param>
/// <exception cref="InvalidOperationException">A selector is present but <c>items</c> is missing.</exception>
private static void WriteArray(Utf8JsonWriter writer, IElement context, JsonElement field)
{
    string? css = null;
    if (field.TryGetProperty("selector", out var selectorProp) && selectorProp.ValueKind == JsonValueKind.String)
    {
        css = selectorProp.GetString();
    }

    if (string.IsNullOrWhiteSpace(css))
    {
        // Nothing to select: emit [] and stop.
        writer.WriteStartArray();
        writer.WriteEndArray();
        return;
    }

    var hasItemModel = field.TryGetProperty("items", out var itemModel);
    if (!hasItemModel)
    {
        throw new InvalidOperationException("Array field must contain 'items'.");
    }

    var matches = context.QuerySelectorAll(css);

    writer.WriteStartArray();

    foreach (var match in matches)
    {
        WriteArrayItem(writer, match, itemModel);
    }

    writer.WriteEndArray();
}
|
|
|
|
/// <summary>
/// Writes a single array entry for <paramref name="itemContext"/> according to the
/// <paramref name="items"/> definition.
/// </summary>
/// <param name="writer">Destination JSON writer, positioned inside an array.</param>
/// <param name="itemContext">The matched element this entry is extracted from.</param>
/// <param name="items">
/// Item definition: a JSON object with a string <c>type</c> (matched case-insensitively).
/// <c>object</c> requires a <c>fields</c> array; <c>array</c> describes a nested array
/// through its own <c>selector</c> and <c>items</c>; any other type is written as a
/// primitive.
/// </param>
/// <exception cref="InvalidOperationException">
/// The definition is not a JSON object, <c>type</c> is missing, not a string or blank,
/// or an <c>object</c> item has no <c>fields</c> array.
/// </exception>
private static void WriteArrayItem(Utf8JsonWriter writer, IElement itemContext, JsonElement items)
{
    if (items.ValueKind != JsonValueKind.Object)
    {
        throw new InvalidOperationException("Array 'items' must be a JSON object.");
    }

    // TryGetProperty so a missing 'type' yields the descriptive error below rather
    // than a KeyNotFoundException from GetProperty.
    var type = items.TryGetProperty("type", out var typeProp) && typeProp.ValueKind == JsonValueKind.String
        ? typeProp.GetString()?.ToLowerInvariant()
        : null;
    if (string.IsNullOrWhiteSpace(type))
    {
        throw new InvalidOperationException("Array 'items.type' is required.");
    }

    switch (type)
    {
        case "object":
            if (!items.TryGetProperty("fields", out var fields) || fields.ValueKind != JsonValueKind.Array)
            {
                throw new InvalidOperationException("Array items of type 'object' must contain 'fields'.");
            }

            writer.WriteStartObject();
            foreach (var sub in fields.EnumerateArray())
            {
                WriteField(writer, itemContext, sub);
            }
            writer.WriteEndObject();
            break;

        case "array":
            // Array of arrays: items.selector says where to find the sub-items inside
            // each itemContext. WriteArray only reads the "selector" and "items"
            // properties of the element it is given, so the item definition can be
            // passed as-is. This avoids rebuilding and re-parsing a temporary
            // JsonDocument for every item (which was also never disposed), and a
            // missing or blank selector now yields an empty array, the same as for
            // a top-level array field.
            WriteArray(writer, itemContext, items);
            break;

        default:
            WritePrimitive(writer, itemContext, items, type);
            break;
    }
}
|
|
|
|
/// <summary>
/// Resolves, reads, transforms and writes a single scalar value.
/// </summary>
/// <param name="writer">Destination JSON writer, positioned where a value is expected.</param>
/// <param name="context">Element the field's selector is evaluated against.</param>
/// <param name="field">Field definition (selector, source and transforms are read from it).</param>
/// <param name="type">
/// Lower-cased type name. <c>number</c>, <c>date</c> and <c>boolean</c>/<c>bool</c> are
/// parsed and written as JSON null when parsing fails; <c>string</c> and any
/// unrecognised type are written as a string.
/// </param>
private static void WritePrimitive(Utf8JsonWriter writer, IElement context, JsonElement field, string type)
{
    var text = ApplyTransforms(ReadRawValue(ResolveNode(context, field), field), field);

    // No node, no attribute, or nothing to read: the value is null whatever the type.
    if (text is null)
    {
        writer.WriteNullValue();
        return;
    }

    if (type == "number")
    {
        if (TryParseNumber(text, field, out var number))
        {
            writer.WriteNumberValue(number);
        }
        else
        {
            writer.WriteNullValue();
        }
    }
    else if (type == "date")
    {
        if (TryParseDate(text, field, out var isoDate))
        {
            writer.WriteStringValue(isoDate);
        }
        else
        {
            writer.WriteNullValue();
        }
    }
    else if (type is "boolean" or "bool")
    {
        if (TryParseBool(text, out var flag))
        {
            writer.WriteBooleanValue(flag);
        }
        else
        {
            writer.WriteNullValue();
        }
    }
    else
    {
        // "string" and every unknown type fall back to the raw text.
        writer.WriteStringValue(text);
    }
}
|
|
|
|
/// <summary>
/// Picks the element that the sub-fields of an object are evaluated against.
/// </summary>
/// <param name="context">Current element.</param>
/// <param name="field">Field definition; an optional string <c>selector</c> is read from it.</param>
/// <returns>
/// The first descendant of <paramref name="context"/> matching the selector, or
/// <paramref name="context"/> itself when the selector is missing, not a string,
/// blank, or matches nothing.
/// </returns>
private static IElement ResolveContext(IElement context, JsonElement field)
{
    if (!field.TryGetProperty("selector", out var selectorProp) || selectorProp.ValueKind != JsonValueKind.String)
    {
        return context;
    }

    var css = selectorProp.GetString();
    if (string.IsNullOrWhiteSpace(css))
    {
        return context;
    }

    var match = context.QuerySelector(css);
    return match is null ? context : match;
}
|
|
|
|
/// <summary>
/// Finds the element a primitive value is read from.
/// </summary>
/// <param name="context">Current element.</param>
/// <param name="field">Field definition; an optional string <c>selector</c> is read from it.</param>
/// <returns>
/// <paramref name="context"/> when the selector is missing, not a string, or blank;
/// otherwise the first matching descendant, or <c>null</c> when nothing matches.
/// </returns>
private static IElement? ResolveNode(IElement context, JsonElement field)
{
    if (!field.TryGetProperty("selector", out var selectorProp) || selectorProp.ValueKind != JsonValueKind.String)
    {
        return context;
    }

    var css = selectorProp.GetString();
    if (string.IsNullOrWhiteSpace(css))
    {
        return context;
    }

    // Unlike ResolveContext, there is no fallback here: a miss stays a miss.
    return context.QuerySelector(css);
}
|
|
|
|
/// <summary>
/// Reads the raw string for a field from <paramref name="node"/>.
/// </summary>
/// <param name="node">Element to read from; <c>null</c> yields <c>null</c>.</param>
/// <param name="field">
/// Field definition. An optional <c>source</c> object selects what is read through its
/// <c>kind</c>: <c>text</c> (default), <c>html</c>, <c>attr</c> (with <c>name</c>) or
/// <c>value</c>. Unknown kinds behave like <c>text</c>.
/// </param>
/// <returns>
/// The text content, inner HTML or attribute value; <c>null</c> when the node is null,
/// when an <c>attr</c> source has no string <c>name</c>, or when the attribute is absent.
/// </returns>
private static string? ReadRawValue(IElement? node, JsonElement field)
{
    if (node is null)
    {
        return null;
    }

    // Without a 'source' object the element's text is used.
    var hasSource = field.TryGetProperty("source", out var source) && source.ValueKind == JsonValueKind.Object;
    if (!hasSource)
    {
        return node.TextContent;
    }

    string? kind = "text";
    if (source.TryGetProperty("kind", out var kindProp) && kindProp.ValueKind == JsonValueKind.String)
    {
        kind = kindProp.GetString()?.ToLowerInvariant();
    }

    switch (kind)
    {
        case "html":
            return node.InnerHtml;

        case "attr":
            if (source.TryGetProperty("name", out var attrName) && attrName.ValueKind == JsonValueKind.String)
            {
                return node.GetAttribute(attrName.GetString()!);
            }
            return null;

        case "value":
            // Falls back to the text when there is no 'value' attribute.
            return node.GetAttribute("value") ?? node.TextContent;

        default:
            // "text" and anything unrecognised.
            return node.TextContent;
    }
}
|
|
|
|
/// <summary>
/// Applies the string transforms listed in the field's <c>transforms</c> array, in order.
/// </summary>
/// <param name="raw">Value to transform; <c>null</c> is returned unchanged.</param>
/// <param name="field">
/// Field definition. Recognised transform names (case-insensitive) are <c>trim</c>,
/// <c>lower</c>, <c>upper</c> and <c>removeNonDigits</c>; non-string entries and
/// unrecognised names are skipped.
/// </param>
/// <returns>The transformed string, or <paramref name="raw"/> when there is no <c>transforms</c> array.</returns>
private static string? ApplyTransforms(string? raw, JsonElement field)
{
    if (raw is null)
    {
        return null;
    }

    var hasTransforms = field.TryGetProperty("transforms", out var transforms);
    if (!hasTransforms || transforms.ValueKind != JsonValueKind.Array)
    {
        return raw;
    }

    var result = raw;

    foreach (var entry in transforms.EnumerateArray())
    {
        if (entry.ValueKind != JsonValueKind.String)
        {
            continue;
        }

        var name = entry.GetString() ?? "";

        if (name.Equals("trim", StringComparison.OrdinalIgnoreCase))
        {
            result = result.Trim();
        }
        else if (name.Equals("lower", StringComparison.OrdinalIgnoreCase))
        {
            result = result.ToLowerInvariant();
        }
        else if (name.Equals("upper", StringComparison.OrdinalIgnoreCase))
        {
            result = result.ToUpperInvariant();
        }
        else if (name.Equals("removeNonDigits", StringComparison.OrdinalIgnoreCase))
        {
            result = string.Concat(result.Where(char.IsDigit));
        }

        // More advanced transforms (regex, replace, etc.) can be added here later.
    }

    return result;
}
|
|
|
|
/// <summary>
/// Parses <paramref name="raw"/> as a decimal using the culture selected by an optional
/// <c>number:&lt;culture&gt;</c> entry in the field's <c>transforms</c> array.
/// </summary>
/// <param name="raw">Text to parse.</param>
/// <param name="field">
/// Field definition. A transform such as <c>number:pt-BR</c> or <c>number:invariant</c>
/// selects the culture; any culture name known to the runtime is accepted. When several
/// are present the last valid one wins. Without one, the invariant culture is used.
/// </param>
/// <param name="value">The parsed number, or zero on failure.</param>
/// <returns><c>true</c> when <paramref name="raw"/> was parsed successfully.</returns>
private static bool TryParseNumber(string raw, JsonElement field, out decimal value)
{
    const string Prefix = "number:";
    var culture = CultureInfo.InvariantCulture;

    if (field.TryGetProperty("transforms", out var transforms) && transforms.ValueKind == JsonValueKind.Array)
    {
        foreach (var t in transforms.EnumerateArray())
        {
            if (t.ValueKind != JsonValueKind.String)
            {
                continue;
            }

            var s = t.GetString() ?? "";
            if (!s.StartsWith(Prefix, StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            var arg = s.Substring(Prefix.Length);
            if (arg.Length == 0)
            {
                // "number:" with no argument selects nothing.
                continue;
            }

            if (string.Equals(arg, "invariant", StringComparison.OrdinalIgnoreCase))
            {
                culture = CultureInfo.InvariantCulture;
                continue;
            }

            try
            {
                // GetCultureInfo returns a cached, read-only culture without user
                // overrides, so parsing does not depend on the host's regional settings.
                culture = CultureInfo.GetCultureInfo(arg);
            }
            catch (CultureNotFoundException)
            {
                // Unknown or unavailable culture: keep the current one. A Try* method
                // must not throw because of a bad culture name.
            }
        }
    }

    return decimal.TryParse(raw, NumberStyles.Any, culture, out value);
}
|
|
|
|
/// <summary>
/// Parses <paramref name="raw"/> as a date and formats it as <c>yyyy-MM-dd</c>.
/// </summary>
/// <param name="raw">Text to parse.</param>
/// <param name="field">
/// Field definition. An optional <c>date:&lt;format&gt;</c> entry in <c>transforms</c>
/// (e.g. <c>date:dd/MM/yyyy</c>) requests exact parsing with that format; when several
/// are present the last one wins. Without a format, the invariant culture is tried
/// first and pt-BR second.
/// </param>
/// <param name="iso">The date as <c>yyyy-MM-dd</c>, or an empty string on failure.</param>
/// <returns><c>true</c> when <paramref name="raw"/> was parsed successfully.</returns>
private static bool TryParseDate(string raw, JsonElement field, out string iso)
{
    const string Prefix = "date:";
    string? format = null;

    if (field.TryGetProperty("transforms", out var transforms) && transforms.ValueKind == JsonValueKind.Array)
    {
        foreach (var t in transforms.EnumerateArray())
        {
            if (t.ValueKind != JsonValueKind.String)
            {
                continue;
            }

            var s = t.GetString() ?? "";
            if (s.StartsWith(Prefix, StringComparison.OrdinalIgnoreCase))
            {
                format = s.Substring(Prefix.Length);
            }
        }
    }

    // Parse as DateTimeOffset: DateTime parsing converts inputs that carry an offset
    // (or "Z") to the machine's local time, which can move the calendar date depending
    // on the host time zone. DateTimeOffset keeps the clock time as written, so the
    // emitted date is the one in the source text on every machine. AssumeUniversal
    // only supplies an offset for inputs without one; it does not shift them.
    const DateTimeStyles Styles = DateTimeStyles.AssumeUniversal;
    DateTimeOffset parsed;
    bool ok;

    if (!string.IsNullOrWhiteSpace(format))
    {
        ok = DateTimeOffset.TryParseExact(raw, format, CultureInfo.InvariantCulture, Styles, out parsed);
    }
    else
    {
        ok = DateTimeOffset.TryParse(raw, CultureInfo.InvariantCulture, Styles, out parsed)
             // Second attempt: pt-BR (day/month/year order).
             || (TryGetCulture("pt-BR") is { } ptBr && DateTimeOffset.TryParse(raw, ptBr, Styles, out parsed));
    }

    if (!ok)
    {
        iso = "";
        return false;
    }

    iso = parsed.DateTime.ToString("yyyy-MM-dd", CultureInfo.InvariantCulture);
    return true;

    // Cached, read-only culture lookup that reports an unavailable culture as null
    // instead of throwing.
    static CultureInfo? TryGetCulture(string name)
    {
        try
        {
            return CultureInfo.GetCultureInfo(name);
        }
        catch (CultureNotFoundException)
        {
            return null;
        }
    }
}
|
|
|
|
/// <summary>
/// Interprets common English and Portuguese yes/no spellings as a boolean.
/// </summary>
/// <param name="raw">Text to interpret; surrounding whitespace and letter case are ignored.</param>
/// <param name="value">The recognised boolean, or <c>false</c> when the text is not recognised.</param>
/// <returns><c>true</c> when <paramref name="raw"/> is one of the recognised spellings.</returns>
private static bool TryParseBool(string raw, out bool value)
{
    bool? recognised = raw.Trim().ToLowerInvariant() switch
    {
        "true" or "1" or "yes" or "y" or "sim" or "s" => true,
        "false" or "0" or "no" or "n" or "nao" or "não" => false,
        _ => null,
    };

    value = recognised ?? false;
    return recognised.HasValue;
}
|
|
}
|