create database webscrapper_dev; drop table content; drop table queue; drop table session; create table session( id serial primary key, name varchar(255) ); create table queue( id serial primary key, session_id int references session(id), url varchar(255), status smallint not null default 0, started_date timestamp null, finished_date timestamp null, leased_by text null, lease_expires_at timestamptz null, attempts int not null default 0, last_error text null, created_date timestamp default now() ); create index idx_queue_session_status on queue(session_id, status); create index idx_queue_lease on queue(session_id, status, lease_expires_at); -- ------------------------------------------------------------ -- Agents (optional distributed workers) -- ------------------------------------------------------------ drop table if exists agent; create table agent( id text primary key, display_name text null, cert_thumbprint text not null, created_at timestamptz not null default now(), last_seen_at timestamptz not null default now(), is_enabled boolean not null default true ); create table content( id serial primary key, queue_id int references queue(id), content text, created_date timestamp default now() ); create unique index if not exists ux_queue_session_url on queue(session_id, url); alter table content add column content_encoding varchar(20) not null default 'gzip', add column content_bytes bytea null, add column original_length int null, add column compressed_length int null; -- ------------------------------------------------------------ -- Extraction models + runs + extracted json -- ------------------------------------------------------------ drop table if exists extracted_data; drop table if exists extraction_run; drop table if exists extraction_model; create table extraction_model ( id bigserial primary key, name varchar(200) not null, version int not null default 1, description text null, definition jsonb not null, created_at timestamptz not null default now(), updated_at timestamptz not null default now(), unique(name, version) ); create table extraction_run ( id bigserial primary key, model_id bigint not null references extraction_model(id), session_id int not null references session(id), status smallint not null default 0, -- 0=queued 1=running 2=done 3=failed started_at timestamptz null, finished_at timestamptz null, total int not null default 0, succeeded int not null default 0, failed int not null default 0, error text null, created_at timestamptz not null default now() ); create index idx_extraction_run_session on extraction_run(session_id); create table extracted_data ( id bigserial primary key, run_id bigint not null references extraction_run(id), model_id bigint not null references extraction_model(id), session_id int not null references session(id), queue_id int not null references queue(id), extracted_json jsonb not null, success boolean not null default true, error text null, extracted_at timestamptz not null default now(), unique(model_id, queue_id) ); create index idx_extracted_data_session on extracted_data(session_id); create index idx_extracted_data_queue on extracted_data(queue_id); create index idx_extracted_data_json on extracted_data using gin (extracted_json);