-- Source: voyager-api/ScrapperAPI/Scripts/database.sql
-- (repository listing metadata: 92 lines, 3.1 KiB, SQL)

-- Bootstrap: create the development database, then remove any previous copy
-- of the core tables so the script can rebuild them from scratch.
-- NOTE(review): in PostgreSQL `create database` fails if the database already
-- exists, cannot run inside a transaction block, and does not switch the
-- current connection to the new database. Presumably the statements below are
-- meant to run while connected to webscrapper_dev -- confirm how this script
-- is invoked.
create database webscrapper_dev;

-- Drop children before parents. extracted_data and extraction_run hold foreign
-- keys to queue/session, so they must go first or a re-run fails with a
-- dependency error. `if exists` keeps a first run on an empty database from
-- erroring out on tables that were never created.
drop table if exists extracted_data;
drop table if exists extraction_run;
drop table if exists content;
drop table if exists queue;
drop table if exists session;
-- session: a named grouping that queue rows (and the extraction tables)
-- reference through session_id.
create table session (
    id   serial primary key,   -- auto-incrementing surrogate key
    name varchar(255)          -- optional label; nullable
);
-- queue: one row per URL queued within a session, together with its
-- processing state (status, attempt counter, timing, last error).
create table queue (
    id            serial primary key,
    session_id    int references session(id),    -- nullable in this schema
    -- text rather than varchar(255): URLs with long paths or query strings
    -- routinely exceed 255 characters and would otherwise be rejected on
    -- insert. Every value that fit the old column still fits.
    url           text,
    -- NOTE(review): the meaning of each status code is not defined in this
    -- script -- confirm against the application.
    status        smallint not null default 0,
    started_date  timestamp,                     -- null until set by the application
    finished_date timestamp,
    attempts      int not null default 0,
    last_error    text,
    created_date  timestamp default now()
);

-- Supports lookups of a session's queue rows filtered by status.
create index idx_queue_session_status on queue (session_id, status);
-- content: stored payload associated with a queue row.
create table content (
    id           serial primary key,
    queue_id     integer references queue (id),   -- nullable link to the originating queue row
    content      text,                            -- textual payload; nullable
    created_date timestamp default now()
);
-- Enforces at most one queue row per (session_id, url) pair.
-- Rows whose session_id or url is NULL are not deduplicated, because NULLs
-- compare as distinct in a unique index.
create unique index if not exists ux_queue_session_url
    on queue using btree (session_id, url);
-- Extends content with a binary payload column plus encoding/size metadata.
-- NOTE(review): presumably content_bytes holds the encoded (e.g. gzip) payload
-- and the two *_length columns its size before/after compression -- confirm
-- against the writer in the application.
alter table content
    add column content_encoding  varchar(20) not null default 'gzip',
    add column content_bytes     bytea,
    add column original_length   integer,
    add column compressed_length integer;
-- ------------------------------------------------------------
-- Extraction models + runs + extracted json
-- ------------------------------------------------------------
-- Remove any previous copy of the extraction tables in one statement.
-- Missing tables are skipped thanks to `if exists`.
drop table if exists
    extracted_data,
    extraction_run,
    extraction_model;
-- extraction_model: a versioned, named extraction definition stored as JSONB.
create table extraction_model (
    id          bigserial    primary key,
    name        varchar(200) not null,
    version     integer      not null default 1,
    description text,                               -- optional free-form notes
    definition  jsonb        not null,              -- the model definition document
    created_at  timestamptz  not null default now(),
    updated_at  timestamptz  not null default now(),
    -- One row per (name, version). The constraint name is spelled out and
    -- matches the name PostgreSQL generates for an unnamed unique(name, version).
    constraint extraction_model_name_version_key unique (name, version)
);
-- extraction_run: one execution of an extraction model against a session,
-- with progress counters and timing.
create table extraction_run (
    id          bigserial   primary key,
    model_id    bigint      not null references extraction_model (id),
    session_id  integer     not null references session (id),
    -- 0=queued 1=running 2=done 3=failed
    status      smallint    not null default 0,
    started_at  timestamptz,                        -- null until set
    finished_at timestamptz,
    total       integer     not null default 0,
    succeeded   integer     not null default 0,
    failed      integer     not null default 0,
    error       text,                               -- optional error text
    created_at  timestamptz not null default now()
);

-- Supports looking up runs by session.
create index idx_extraction_run_session on extraction_run (session_id);
-- extracted_data: the JSON result produced for one queue row by one model,
-- recorded together with the run and session it belongs to.
create table extracted_data (
    id             bigserial   primary key,
    run_id         bigint      not null references extraction_run (id),
    model_id       bigint      not null references extraction_model (id),
    session_id     integer     not null references session (id),
    queue_id       integer     not null references queue (id),
    extracted_json jsonb       not null,
    success        boolean     not null default true,
    error          text,                              -- optional error text
    extracted_at   timestamptz not null default now(),
    -- At most one row per (model, queue row). The constraint name is spelled
    -- out and matches the name PostgreSQL generates for an unnamed
    -- unique(model_id, queue_id).
    constraint extracted_data_model_id_queue_id_key unique (model_id, queue_id)
);

-- Lookup paths by session and by queue row, plus a GIN index for querying
-- inside the extracted JSON document.
create index idx_extracted_data_session on extracted_data (session_id);
create index idx_extracted_data_queue   on extracted_data (queue_id);
create index idx_extracted_data_json    on extracted_data using gin (extracted_json);