110 lines
3.7 KiB
SQL
110 lines
3.7 KiB
SQL
create database webscrapper_dev;
|
|
|
|
drop table content;
|
|
drop table queue;
|
|
drop table session;
|
|
|
|
create table session(
|
|
id serial primary key,
|
|
name varchar(255)
|
|
);
|
|
|
|
create table queue(
|
|
id serial primary key,
|
|
session_id int references session(id),
|
|
url varchar(255),
|
|
status smallint not null default 0,
|
|
started_date timestamp null,
|
|
finished_date timestamp null,
|
|
leased_by text null,
|
|
lease_expires_at timestamptz null,
|
|
attempts int not null default 0,
|
|
last_error text null,
|
|
created_date timestamp default now()
|
|
);
|
|
|
|
create index idx_queue_session_status on queue(session_id, status);
|
|
create index idx_queue_lease on queue(session_id, status, lease_expires_at);
|
|
|
|
-- ------------------------------------------------------------
|
|
-- Agents (optional distributed workers)
|
|
-- ------------------------------------------------------------
|
|
|
|
drop table if exists agent;
|
|
|
|
create table agent(
|
|
id text primary key,
|
|
display_name text null,
|
|
cert_thumbprint text not null,
|
|
created_at timestamptz not null default now(),
|
|
last_seen_at timestamptz not null default now(),
|
|
is_enabled boolean not null default true
|
|
);
|
|
|
|
create table content(
|
|
id serial primary key,
|
|
queue_id int references queue(id),
|
|
content text,
|
|
created_date timestamp default now()
|
|
);
|
|
|
|
create unique index if not exists ux_queue_session_url
|
|
on queue(session_id, url);
|
|
|
|
alter table content
|
|
add column content_encoding varchar(20) not null default 'gzip',
|
|
add column content_bytes bytea null,
|
|
add column original_length int null,
|
|
add column compressed_length int null;
|
|
|
|
-- ------------------------------------------------------------
|
|
-- Extraction models + runs + extracted json
|
|
-- ------------------------------------------------------------
|
|
|
|
drop table if exists extracted_data;
|
|
drop table if exists extraction_run;
|
|
drop table if exists extraction_model;
|
|
|
|
create table extraction_model (
|
|
id bigserial primary key,
|
|
name varchar(200) not null,
|
|
version int not null default 1,
|
|
description text null,
|
|
definition jsonb not null,
|
|
created_at timestamptz not null default now(),
|
|
updated_at timestamptz not null default now(),
|
|
unique(name, version)
|
|
);
|
|
|
|
create table extraction_run (
|
|
id bigserial primary key,
|
|
model_id bigint not null references extraction_model(id),
|
|
session_id int not null references session(id),
|
|
status smallint not null default 0, -- 0=queued 1=running 2=done 3=failed
|
|
started_at timestamptz null,
|
|
finished_at timestamptz null,
|
|
total int not null default 0,
|
|
succeeded int not null default 0,
|
|
failed int not null default 0,
|
|
error text null,
|
|
created_at timestamptz not null default now()
|
|
);
|
|
|
|
create index idx_extraction_run_session on extraction_run(session_id);
|
|
|
|
create table extracted_data (
|
|
id bigserial primary key,
|
|
run_id bigint not null references extraction_run(id),
|
|
model_id bigint not null references extraction_model(id),
|
|
session_id int not null references session(id),
|
|
queue_id int not null references queue(id),
|
|
extracted_json jsonb not null,
|
|
success boolean not null default true,
|
|
error text null,
|
|
extracted_at timestamptz not null default now(),
|
|
unique(model_id, queue_id)
|
|
);
|
|
|
|
create index idx_extracted_data_session on extracted_data(session_id);
|
|
create index idx_extracted_data_queue on extracted_data(queue_id);
|
|
create index idx_extracted_data_json on extracted_data using gin (extracted_json); |