using System.Globalization;
using System.Text.Json;
using AngleSharp.Dom;
using AngleSharp.Html.Parser;

namespace ScrapperAPI.Services;

/// <summary>
/// Generic extraction engine based on CSS selectors (AngleSharp).
/// The model definition is supplied as JSON (extraction_model.definition).
/// </summary>
/// <remarks>
/// Model shape as read by this class: an optional string <c>rootSelector</c> and a required
/// array <c>fields</c>. Each field has a <c>key</c>, a <c>type</c> ("object", "array", or a
/// primitive such as "string", "number", "date", "boolean"/"bool"), and optionally
/// <c>selector</c>, <c>source</c>, <c>transforms</c>, <c>fields</c> (objects) and
/// <c>items</c> (arrays).
/// </remarks>
public sealed class ExtractionEngine
{
    private readonly HtmlParser _parser = new();

    /// <summary>
    /// Parses <paramref name="html"/> and extracts a JSON object whose properties are
    /// described by the <c>fields</c> array of <paramref name="modelDefinition"/>.
    /// </summary>
    /// <param name="html">HTML markup to parse.</param>
    /// <param name="modelDefinition">JSON object describing what to extract.</param>
    /// <returns>A <see cref="JsonDocument"/> holding the extracted object; the caller owns and should dispose it.</returns>
    /// <exception cref="InvalidOperationException">
    /// The model is not a JSON object, has no <c>fields</c> array, or contains an invalid field definition.
    /// </exception>
    public JsonDocument Extract(string html, JsonElement modelDefinition)
    {
        if (modelDefinition.ValueKind != JsonValueKind.Object)
        {
            throw new InvalidOperationException("Model definition must be a JSON object.");
        }

        var doc = _parser.ParseDocument(html);

        // The extraction root defaults to the document element; a rootSelector that
        // matches nothing silently falls back to it as well.
        IElement root = doc.DocumentElement;
        var rootSelector = GetOptionalString(modelDefinition, "rootSelector");
        if (!string.IsNullOrWhiteSpace(rootSelector))
        {
            root = doc.QuerySelector(rootSelector) ?? root;
        }

        if (!modelDefinition.TryGetProperty("fields", out var fields) || fields.ValueKind != JsonValueKind.Array)
        {
            throw new InvalidOperationException("Model definition must contain an array property 'fields'.");
        }

        using var stream = new MemoryStream();

        // The writer must be disposed (flushed) before the stream is read back.
        using (var writer = new Utf8JsonWriter(stream, new JsonWriterOptions { Indented = false }))
        {
            WriteObjectBody(writer, root, fields);
        }

        stream.Position = 0;
        return JsonDocument.Parse(stream);
    }

    /// <summary>
    /// Returns the value of a string property, or <c>null</c> when the property is
    /// missing or its value is not a JSON string.
    /// </summary>
    private static string? GetOptionalString(JsonElement element, string propertyName)
    {
        return element.TryGetProperty(propertyName, out var value) && value.ValueKind == JsonValueKind.String
            ? value.GetString()
            : null;
    }

    /// <summary>
    /// Writes a JSON object containing one property per entry of <paramref name="fields"/>,
    /// each resolved relative to <paramref name="context"/>.
    /// </summary>
    private static void WriteObjectBody(Utf8JsonWriter writer, IElement context, JsonElement fields)
    {
        writer.WriteStartObject();
        foreach (var field in fields.EnumerateArray())
        {
            WriteField(writer, context, field);
        }

        writer.WriteEndObject();
    }

    /// <summary>
    /// Writes one named property, dispatching on the field's <c>type</c>.
    /// </summary>
    /// <exception cref="InvalidOperationException">The field has no usable <c>key</c> or <c>type</c>.</exception>
    private static void WriteField(Utf8JsonWriter writer, IElement context, JsonElement field)
    {
        var key = GetOptionalString(field, "key");
        if (string.IsNullOrWhiteSpace(key))
        {
            throw new InvalidOperationException("Field 'key' is required.");
        }

        var type = GetOptionalString(field, "type")?.ToLowerInvariant();
        if (string.IsNullOrWhiteSpace(type))
        {
            throw new InvalidOperationException($"Field '{key}' missing 'type'.");
        }

        writer.WritePropertyName(key);

        switch (type)
        {
            case "object":
                WriteObject(writer, context, field);
                break;
            case "array":
                WriteArray(writer, context, field);
                break;
            default:
                WritePrimitive(writer, context, field, type);
                break;
        }
    }

    /// <summary>
    /// Writes a nested object. The sub-fields are resolved against the element matched by
    /// the field's <c>selector</c>, or against <paramref name="context"/> when there is no
    /// selector or it matches nothing.
    /// </summary>
    /// <exception cref="InvalidOperationException">The field has no <c>fields</c> array.</exception>
    private static void WriteObject(Utf8JsonWriter writer, IElement context, JsonElement field)
    {
        var objContext = ResolveContext(context, field);

        if (!field.TryGetProperty("fields", out var fields) || fields.ValueKind != JsonValueKind.Array)
        {
            throw new InvalidOperationException("Object field must contain an array property 'fields'.");
        }

        WriteObjectBody(writer, objContext, fields);
    }

    /// <summary>
    /// Writes an array with one entry per element matched by the field's <c>selector</c>
    /// inside <paramref name="context"/>. A missing, non-string or blank selector yields an
    /// empty array.
    /// </summary>
    /// <exception cref="InvalidOperationException">A selector is present but <c>items</c> is missing.</exception>
    private static void WriteArray(Utf8JsonWriter writer, IElement context, JsonElement field)
    {
        var selector = GetOptionalString(field, "selector");
        if (string.IsNullOrWhiteSpace(selector))
        {
            writer.WriteStartArray();
            writer.WriteEndArray();
            return;
        }

        if (!field.TryGetProperty("items", out var items))
        {
            throw new InvalidOperationException("Array field must contain 'items'.");
        }

        var nodes = context.QuerySelectorAll(selector);

        writer.WriteStartArray();
        foreach (var node in nodes)
        {
            WriteArrayItem(writer, node, items);
        }

        writer.WriteEndArray();
    }

    /// <summary>
    /// Writes a single array entry for <paramref name="itemContext"/> according to the
    /// <c>items</c> definition (object, nested array, or primitive).
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// <c>items.type</c> is missing, or object items have no <c>fields</c> array.
    /// </exception>
    private static void WriteArrayItem(Utf8JsonWriter writer, IElement itemContext, JsonElement items)
    {
        var type = GetOptionalString(items, "type")?.ToLowerInvariant();
        if (string.IsNullOrWhiteSpace(type))
        {
            throw new InvalidOperationException("Array 'items.type' is required.");
        }

        switch (type)
        {
            case "object":
                if (!items.TryGetProperty("fields", out var fields) || fields.ValueKind != JsonValueKind.Array)
                {
                    throw new InvalidOperationException("Array items of type 'object' must contain 'fields'.");
                }

                WriteObjectBody(writer, itemContext, fields);
                break;

            case "array":
                // Array of arrays: items.selector says where to find the sub-items inside each
                // itemContext. The items definition already carries the 'selector' and 'items'
                // properties WriteArray reads, so it is passed through directly instead of
                // building (and never disposing) a temporary JsonDocument.
                WriteArray(writer, itemContext, items);
                break;

            default:
                WritePrimitive(writer, itemContext, items, type);
                break;
        }
    }

    /// <summary>
    /// Resolves the node, reads its raw value, applies the transforms and writes the result
    /// converted to <paramref name="type"/>. Writes JSON <c>null</c> when there is no node or
    /// value, or when conversion fails; unknown types are written as strings.
    /// </summary>
    private static void WritePrimitive(Utf8JsonWriter writer, IElement context, JsonElement field, string type)
    {
        var node = ResolveNode(context, field);
        var raw = ApplyTransforms(ReadRawValue(node, field), field);

        if (raw is null)
        {
            writer.WriteNullValue();
            return;
        }

        switch (type)
        {
            case "string":
                writer.WriteStringValue(raw);
                break;

            case "number":
                if (TryParseNumber(raw, field, out var dec))
                {
                    writer.WriteNumberValue(dec);
                }
                else
                {
                    writer.WriteNullValue();
                }

                break;

            case "date":
                if (TryParseDate(raw, field, out var date))
                {
                    writer.WriteStringValue(date);
                }
                else
                {
                    writer.WriteNullValue();
                }

                break;

            case "boolean":
            case "bool":
                if (TryParseBool(raw, out var b))
                {
                    writer.WriteBooleanValue(b);
                }
                else
                {
                    writer.WriteNullValue();
                }

                break;

            default:
                // Fallback: treat unknown types as strings.
                writer.WriteStringValue(raw);
                break;
        }
    }

    /// <summary>
    /// Returns the element matched by the field's <c>selector</c>, falling back to
    /// <paramref name="context"/> when there is no selector or nothing matches.
    /// </summary>
    private static IElement ResolveContext(IElement context, JsonElement field)
    {
        var selector = GetOptionalString(field, "selector");
        if (string.IsNullOrWhiteSpace(selector))
        {
            return context;
        }

        return context.QuerySelector(selector) ?? context;
    }

    /// <summary>
    /// Returns the element matched by the field's <c>selector</c>. Without a selector the
    /// context itself is returned; unlike <see cref="ResolveContext"/>, a selector that
    /// matches nothing yields <c>null</c>.
    /// </summary>
    private static IElement? ResolveNode(IElement context, JsonElement field)
    {
        var selector = GetOptionalString(field, "selector");
        if (string.IsNullOrWhiteSpace(selector))
        {
            return context;
        }

        return context.QuerySelector(selector);
    }

    /// <summary>
    /// Reads the raw string value of <paramref name="node"/> as directed by the field's
    /// optional <c>source</c> object (<c>kind</c>: "text", "html", "attr" with <c>name</c>,
    /// or "value"). Text content is the default for a missing source or unknown kind.
    /// </summary>
    private static string? ReadRawValue(IElement? node, JsonElement field)
    {
        if (node is null)
        {
            return null;
        }

        // Default source: text.
        if (!field.TryGetProperty("source", out var source) || source.ValueKind != JsonValueKind.Object)
        {
            return node.TextContent;
        }

        var kind = GetOptionalString(source, "kind")?.ToLowerInvariant() ?? "text";

        return kind switch
        {
            "text" => node.TextContent,
            "html" => node.InnerHtml,
            "attr" => GetOptionalString(source, "name") is { } attributeName
                ? node.GetAttribute(attributeName)
                : null,
            "value" => node.GetAttribute("value") ?? node.TextContent,
            _ => node.TextContent,
        };
    }

    /// <summary>
    /// Applies the string transforms listed in the field's <c>transforms</c> array, in order.
    /// Recognised names (case-insensitive): "trim", "lower", "upper", "removeNonDigits".
    /// Non-string entries and unrecognised names are ignored.
    /// </summary>
    private static string? ApplyTransforms(string? raw, JsonElement field)
    {
        if (raw is null)
        {
            return null;
        }

        if (!field.TryGetProperty("transforms", out var transforms) || transforms.ValueKind != JsonValueKind.Array)
        {
            return raw;
        }

        var current = raw;
        foreach (var t in transforms.EnumerateArray())
        {
            if (t.ValueKind != JsonValueKind.String)
            {
                continue;
            }

            var tr = t.GetString() ?? "";

            if (string.Equals(tr, "trim", StringComparison.OrdinalIgnoreCase))
            {
                current = current.Trim();
            }
            else if (string.Equals(tr, "lower", StringComparison.OrdinalIgnoreCase))
            {
                current = current.ToLowerInvariant();
            }
            else if (string.Equals(tr, "upper", StringComparison.OrdinalIgnoreCase))
            {
                current = current.ToUpperInvariant();
            }
            else if (string.Equals(tr, "removeNonDigits", StringComparison.OrdinalIgnoreCase))
            {
                current = new string(current.Where(char.IsDigit).ToArray());
            }

            // More advanced transforms (regex/replace/etc.) can be added here later.
        }

        return current;
    }

    /// <summary>
    /// Parses <paramref name="raw"/> as a decimal. The culture is invariant unless the
    /// field's transforms contain "number:pt-BR" (or "number:invariant"); when several
    /// recognised entries are present, the last one wins.
    /// </summary>
    private static bool TryParseNumber(string raw, JsonElement field, out decimal value)
    {
        // Optional transform: "number:pt-BR" or "number:invariant".
        var culture = CultureInfo.InvariantCulture;

        if (field.TryGetProperty("transforms", out var transforms) && transforms.ValueKind == JsonValueKind.Array)
        {
            foreach (var t in transforms.EnumerateArray())
            {
                if (t.ValueKind != JsonValueKind.String)
                {
                    continue;
                }

                var s = t.GetString() ?? "";
                if (!s.StartsWith("number:", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                var arg = s.Substring("number:".Length);
                if (string.Equals(arg, "pt-BR", StringComparison.OrdinalIgnoreCase))
                {
                    // GetCultureInfo returns a cached read-only instance, avoiding a new
                    // CultureInfo allocation for every parsed value.
                    culture = CultureInfo.GetCultureInfo("pt-BR");
                }
                else if (string.Equals(arg, "invariant", StringComparison.OrdinalIgnoreCase))
                {
                    culture = CultureInfo.InvariantCulture;
                }
            }
        }

        return decimal.TryParse(raw, NumberStyles.Any, culture, out value);
    }

    /// <summary>
    /// Parses <paramref name="raw"/> as a date and returns it formatted as <c>yyyy-MM-dd</c>.
    /// When the transforms contain "date:&lt;format&gt;" (last one wins) the value must match
    /// that exact format; otherwise invariant parsing is tried first, then pt-BR.
    /// </summary>
    private static bool TryParseDate(string raw, JsonElement field, out string iso)
    {
        // Optional transform: "date:dd/MM/yyyy" etc.
        string? format = null;

        if (field.TryGetProperty("transforms", out var transforms) && transforms.ValueKind == JsonValueKind.Array)
        {
            foreach (var t in transforms.EnumerateArray())
            {
                if (t.ValueKind != JsonValueKind.String)
                {
                    continue;
                }

                var s = t.GetString() ?? "";
                if (s.StartsWith("date:", StringComparison.OrdinalIgnoreCase))
                {
                    format = s.Substring("date:".Length);
                }
            }
        }

        DateTime dt;
        bool parsed;

        if (!string.IsNullOrWhiteSpace(format))
        {
            parsed = DateTime.TryParseExact(raw, format, CultureInfo.InvariantCulture, DateTimeStyles.None, out dt);
        }
        else
        {
            // Try invariant first, then fall back to pt-BR.
            parsed = DateTime.TryParse(raw, CultureInfo.InvariantCulture, DateTimeStyles.AssumeLocal, out dt)
                || DateTime.TryParse(raw, CultureInfo.GetCultureInfo("pt-BR"), DateTimeStyles.AssumeLocal, out dt);
        }

        if (!parsed)
        {
            iso = "";
            return false;
        }

        iso = dt.ToString("yyyy-MM-dd", CultureInfo.InvariantCulture);
        return true;
    }

    /// <summary>
    /// Parses common English and Portuguese boolean spellings (after trimming and
    /// lower-casing): true/1/yes/y/sim/s and false/0/no/n/nao/não.
    /// </summary>
    private static bool TryParseBool(string raw, out bool value)
    {
        var s = raw.Trim().ToLowerInvariant();

        if (s is "true" or "1" or "yes" or "y" or "sim" or "s")
        {
            value = true;
            return true;
        }

        if (s is "false" or "0" or "no" or "n" or "nao" or "não")
        {
            value = false;
            return true;
        }

        value = false;
        return false;
    }
}