init: scripts diversos (crawlers, conversores, scrapers)

This commit is contained in:
2026-03-05 20:38:36 +00:00
commit 6ac6f4be2a
925 changed files with 850330 additions and 0 deletions

375
kb-processor/base-de-dados.txt Executable file
View File

@@ -0,0 +1,375 @@
Estrutura Detalhada da Base de Dados KB
1. Sistema Base (Schema)
sqlCopyCREATE SCHEMA IF NOT EXISTS kb;
Este é o namespace dedicado que mantém todas as tabelas organizadas e separadas de outros sistemas.
2. Sistema de Armazenamento Principal
2.1 Documentos (kb.documents)
sqlCopyCREATE TABLE kb.documents (
doc_id SERIAL PRIMARY KEY,
title TEXT NOT NULL,
author TEXT,
original_date DATE,
processing_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
source_type TEXT CHECK (source_type IN ('pdf', 'ebook', 'video', 'documentation')),
language TEXT DEFAULT 'pt-PT',
original_file_path TEXT,
md_file_path TEXT,
content TEXT,
content_embedding vector(1536), -- Para busca semântica
metadata JSONB,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
version INTEGER DEFAULT 1
);
Armazena informações base dos documentos
Inclui embeddings para busca semântica
Mantém metadados flexíveis em JSONB
Controla versões e datas
2.2 Capítulos (kb.chapters)
sqlCopyCREATE TABLE kb.chapters (
chapter_id SERIAL PRIMARY KEY,
doc_id INTEGER REFERENCES kb.documents(doc_id),
title TEXT NOT NULL,
content TEXT,
content_embedding vector(1536), -- Para busca semântica em nível de capítulo
sequence_number INTEGER,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
version INTEGER DEFAULT 1
);
Estrutura hierárquica do conteúdo
Embeddings específicos por capítulo
Sequenciamento ordenado
3. Sistema de Classificação e Relações
3.1 Tags e Relações
sqlCopyCREATE TABLE kb.tags (
tag_id SERIAL PRIMARY KEY,
tag_type TEXT NOT NULL,
tag_value TEXT NOT NULL,
description TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(tag_type, tag_value)
);
CREATE TABLE kb.document_tags (
doc_id INTEGER REFERENCES kb.documents(doc_id),
tag_id INTEGER REFERENCES kb.tags(tag_id),
PRIMARY KEY (doc_id, tag_id)
);
Sistema flexível de categorização
Relacionamento muitos-para-muitos
Tipos de tags para melhor organização
3.2 Relacionamentos entre Documentos
sqlCopyCREATE TABLE kb.document_relationships (
relationship_id SERIAL PRIMARY KEY,
source_doc_id INTEGER REFERENCES kb.documents(doc_id),
target_doc_id INTEGER REFERENCES kb.documents(doc_id),
relationship_type TEXT,
strength FLOAT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(source_doc_id, target_doc_id, relationship_type)
);
Mapeia conexões entre documentos
Indica força da relação
Permite múltiplos tipos de relacionamento
4. Sistema de Conhecimento
4.1 FAQs Contextuais
sqlCopyCREATE TABLE kb.faqs (
faq_id SERIAL PRIMARY KEY,
chapter_id INTEGER REFERENCES kb.chapters(chapter_id),
question TEXT NOT NULL,
answer TEXT NOT NULL,
question_embedding vector(1536), -- Para busca semântica de perguntas
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
FAQs geradas automaticamente
Embeddings para busca semântica
Vinculadas a capítulos específicos
4.2 Referências
sqlCopyCREATE TABLE kb.references (
ref_id SERIAL PRIMARY KEY,
doc_id INTEGER REFERENCES kb.documents(doc_id),
reference_type TEXT,
citation TEXT,
url TEXT,
doi TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
Mantém referências bibliográficas
Suporta múltiplos tipos de citação
Inclui identificadores digitais
5. Sistema de Controle
5.1 Histórico de Versões
sqlCopyCREATE TABLE kb.version_history (
version_id SERIAL PRIMARY KEY,
entity_type TEXT NOT NULL,
entity_id INTEGER NOT NULL,
change_type TEXT NOT NULL,
changes JSONB,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
created_by TEXT
);
Rastreia todas as alterações
Armazena detalhes das mudanças
Mantém auditoria completa
5.2 Fila de Processamento
sqlCopyCREATE TABLE kb.processing_queue (
queue_id SERIAL PRIMARY KEY,
file_path TEXT NOT NULL,
status TEXT DEFAULT 'pending',
priority INTEGER DEFAULT 1,
attempts INTEGER DEFAULT 0,
last_attempt TIMESTAMP,
error_message TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
processed_at TIMESTAMP
);
Gerencia processamento assíncrono
Controla tentativas e erros
Prioriza processamentos
6. Otimizações
6.1 Índices
sqlCopyCREATE INDEX idx_documents_title ON kb.documents USING gin (to_tsvector('portuguese', title));
CREATE INDEX idx_documents_content ON kb.documents USING gin (to_tsvector('portuguese', content));
CREATE INDEX idx_documents_embedding ON kb.documents USING ivfflat (content_embedding vector_cosine_ops);
Otimiza busca textual
Acelera busca semântica
Melhora performance geral
6.2 Funções e Triggers
sqlCopy-- Função de atualização de timestamp
CREATE OR REPLACE FUNCTION kb.update_timestamp()
RETURNS TRIGGER AS $$
BEGIN
NEW.updated_at = CURRENT_TIMESTAMP;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- Trigger exemplo
CREATE TRIGGER update_documents_timestamp
BEFORE UPDATE ON kb.documents
FOR EACH ROW
EXECUTE FUNCTION kb.update_timestamp();
Automatiza atualizações de timestamp
Mantém consistência dos dados
Facilita manutenção
Esta estrutura proporciona:
Armazenamento eficiente
Busca semântica avançada
Organização hierárquica
Rastreamento de mudanças
Processamento assíncrono
Performance otimizada
**exemplo
-- Extensões necessárias
CREATE EXTENSION IF NOT EXISTS pg_trgm; -- Para busca textual
CREATE EXTENSION IF NOT EXISTS unaccent; -- Para lidar com acentos
-- Esquema principal
CREATE SCHEMA IF NOT EXISTS kb;
-- Tabela de Documentos
CREATE TABLE kb.documents (
doc_id SERIAL PRIMARY KEY,
title TEXT NOT NULL,
author TEXT,
original_date DATE,
processing_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
source_type TEXT CHECK (source_type IN ('pdf', 'ebook', 'video', 'documentation')),
language TEXT DEFAULT 'pt-PT',
original_file_path TEXT,
md_file_path TEXT,
content TEXT,
metadata JSONB,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
version INTEGER DEFAULT 1
);
-- Tabela de Capítulos
CREATE TABLE kb.chapters (
chapter_id SERIAL PRIMARY KEY,
doc_id INTEGER REFERENCES kb.documents(doc_id),
title TEXT NOT NULL,
content TEXT,
sequence_number INTEGER,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
version INTEGER DEFAULT 1
);
-- Tabela de Tags
CREATE TABLE kb.tags (
tag_id SERIAL PRIMARY KEY,
tag_type TEXT NOT NULL,
tag_value TEXT NOT NULL,
description TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(tag_type, tag_value)
);
-- Tabela de relação Documentos-Tags
CREATE TABLE kb.document_tags (
doc_id INTEGER REFERENCES kb.documents(doc_id),
tag_id INTEGER REFERENCES kb.tags(tag_id),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (doc_id, tag_id)
);
-- Tabela de FAQs
CREATE TABLE kb.faqs (
faq_id SERIAL PRIMARY KEY,
chapter_id INTEGER REFERENCES kb.chapters(chapter_id),
question TEXT NOT NULL,
answer TEXT NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Tabela de Referências Bibliográficas
CREATE TABLE kb.references (
ref_id SERIAL PRIMARY KEY,
doc_id INTEGER REFERENCES kb.documents(doc_id),
reference_type TEXT,
citation TEXT,
url TEXT,
doi TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Tabela de Relacionamentos entre Documentos
CREATE TABLE kb.document_relationships (
relationship_id SERIAL PRIMARY KEY,
source_doc_id INTEGER REFERENCES kb.documents(doc_id),
target_doc_id INTEGER REFERENCES kb.documents(doc_id),
relationship_type TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(source_doc_id, target_doc_id, relationship_type)
);
-- Tabela de Histórico de Versões
CREATE TABLE kb.version_history (
version_id SERIAL PRIMARY KEY,
entity_type TEXT NOT NULL,
entity_id INTEGER NOT NULL,
change_type TEXT NOT NULL,
changes JSONB,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
created_by TEXT
);
-- Tabela de Fila de Processamento
CREATE TABLE kb.processing_queue (
queue_id SERIAL PRIMARY KEY,
file_path TEXT NOT NULL,
status TEXT DEFAULT 'pending',
priority INTEGER DEFAULT 1,
attempts INTEGER DEFAULT 0,
last_attempt TIMESTAMP,
error_message TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
processed_at TIMESTAMP
);
-- Índices para otimização
CREATE INDEX idx_documents_title ON kb.documents USING gin (to_tsvector('portuguese', title));
CREATE INDEX idx_documents_content ON kb.documents USING gin (to_tsvector('portuguese', content));
CREATE INDEX idx_chapters_content ON kb.chapters USING gin (to_tsvector('portuguese', content));
CREATE INDEX idx_documents_source_type ON kb.documents(source_type);
CREATE INDEX idx_processing_queue_status ON kb.processing_queue(status);
-- Funções
CREATE OR REPLACE FUNCTION kb.update_timestamp()
RETURNS TRIGGER AS $$
BEGIN
NEW.updated_at = CURRENT_TIMESTAMP;
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
-- Triggers para atualização automática de timestamps
CREATE TRIGGER update_documents_timestamp
BEFORE UPDATE ON kb.documents
FOR EACH ROW
EXECUTE FUNCTION kb.update_timestamp();
CREATE TRIGGER update_chapters_timestamp
BEFORE UPDATE ON kb.chapters
FOR EACH ROW
EXECUTE FUNCTION kb.update_timestamp();
-- Views úteis
CREATE VIEW kb.document_summary AS
SELECT
d.doc_id,
d.title,
d.author,
d.source_type,
COUNT(DISTINCT c.chapter_id) as num_chapters,
COUNT(DISTINCT f.faq_id) as num_faqs,
COUNT(DISTINCT dt.tag_id) as num_tags,
d.created_at,
d.updated_at
FROM kb.documents d
LEFT JOIN kb.chapters c ON d.doc_id = c.doc_id
LEFT JOIN kb.faqs f ON c.chapter_id = f.chapter_id
LEFT JOIN kb.document_tags dt ON d.doc_id = dt.doc_id
GROUP BY d.doc_id;
-- Função para adicionar novo documento
CREATE OR REPLACE FUNCTION kb.add_document(
p_title TEXT,
p_author TEXT,
p_source_type TEXT,
p_original_file_path TEXT,
p_md_file_path TEXT
) RETURNS INTEGER AS $$
DECLARE
v_doc_id INTEGER;
BEGIN
INSERT INTO kb.documents (
title,
author,
source_type,
original_file_path,
md_file_path,
created_at
) VALUES (
p_title,
p_author,
p_source_type,
p_original_file_path,
p_md_file_path,
CURRENT_TIMESTAMP
) RETURNING doc_id INTO v_doc_id;
RETURN v_doc_id;
END;
$$ LANGUAGE plpgsql;