feat: refactor 30+ skills to Anthropic progressive disclosure pattern
- All SKILL.md files now <500 lines (avg reduction 69%) - Detailed content extracted to references/ subdirectories - Frontmatter standardised: only name + description (Anthropic standard) - New skills: brand-guidelines, spec-coauthor, report-templates, skill-creator - Design skills: anti-slop guidelines, premium-proposals reference - Removed non-standard frontmatter fields (triggers, version, author, category) Plugins affected: infraestrutura, marketing, dev-tools, crm-ops, gestao, core-tools, negocio, perfex-dev, wordpress, design-media Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
796
dev-tools/skills/pdf/SKILL.md
Normal file
796
dev-tools/skills/pdf/SKILL.md
Normal file
@@ -0,0 +1,796 @@
|
||||
---
|
||||
name: pdf
|
||||
description: Processamento completo de ficheiros PDF — leitura, extraccao de texto/tabelas, merge, split, watermarks, encriptacao, OCR, criacao e preenchimento de formularios.
|
||||
---
|
||||
|
||||
# PDF Processing Guide
|
||||
|
||||
## Resumo
|
||||
|
||||
Guia completo para processamento de PDFs com bibliotecas Python e ferramentas de linha de comandos. Para formularios PDF, seguir as instrucoes na seccao "Preenchimento de formularios". Para funcionalidades avancadas e bibliotecas JavaScript, consultar a seccao "Referencia avancada".
|
||||
|
||||
## Quick Start
|
||||
|
||||
```python
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
|
||||
# Read a PDF
|
||||
reader = PdfReader("/media/ealmeida/Dados/GDrive/Cloud/Descomplicar/documento.pdf")
|
||||
print(f"Pages: {len(reader.pages)}")
|
||||
|
||||
# Extract text
|
||||
text = ""
|
||||
for page in reader.pages:
|
||||
text += page.extract_text()
|
||||
```
|
||||
|
||||
## Bibliotecas Python
|
||||
|
||||
### pypdf — operacoes basicas
|
||||
|
||||
#### Merge PDFs
|
||||
```python
|
||||
from pypdf import PdfWriter, PdfReader
|
||||
|
||||
writer = PdfWriter()
|
||||
for pdf_file in ["doc1.pdf", "doc2.pdf", "doc3.pdf"]:
|
||||
reader = PdfReader(pdf_file)
|
||||
for page in reader.pages:
|
||||
writer.add_page(page)
|
||||
|
||||
with open("merged.pdf", "wb") as output:
|
||||
writer.write(output)
|
||||
```
|
||||
|
||||
#### Split PDF
|
||||
```python
|
||||
reader = PdfReader("input.pdf")
|
||||
for i, page in enumerate(reader.pages):
|
||||
writer = PdfWriter()
|
||||
writer.add_page(page)
|
||||
with open(f"page_{i+1}.pdf", "wb") as output:
|
||||
writer.write(output)
|
||||
```
|
||||
|
||||
#### Extract Metadata
|
||||
```python
|
||||
reader = PdfReader("document.pdf")
|
||||
meta = reader.metadata
|
||||
print(f"Title: {meta.title}")
|
||||
print(f"Author: {meta.author}")
|
||||
print(f"Subject: {meta.subject}")
|
||||
print(f"Creator: {meta.creator}")
|
||||
```
|
||||
|
||||
#### Rotate Pages
|
||||
```python
|
||||
reader = PdfReader("input.pdf")
|
||||
writer = PdfWriter()
|
||||
|
||||
page = reader.pages[0]
|
||||
page.rotate(90) # Rotate 90 degrees clockwise
|
||||
writer.add_page(page)
|
||||
|
||||
with open("rotated.pdf", "wb") as output:
|
||||
writer.write(output)
|
||||
```
|
||||
|
||||
### pdfplumber — extraccao de texto e tabelas
|
||||
|
||||
#### Extract Text with Layout
|
||||
```python
|
||||
import pdfplumber
|
||||
|
||||
with pdfplumber.open("document.pdf") as pdf:
|
||||
for page in pdf.pages:
|
||||
text = page.extract_text()
|
||||
print(text)
|
||||
```
|
||||
|
||||
#### Extract Tables
|
||||
```python
|
||||
with pdfplumber.open("document.pdf") as pdf:
|
||||
for i, page in enumerate(pdf.pages):
|
||||
tables = page.extract_tables()
|
||||
for j, table in enumerate(tables):
|
||||
print(f"Table {j+1} on page {i+1}:")
|
||||
for row in table:
|
||||
print(row)
|
||||
```
|
||||
|
||||
#### Advanced Table Extraction
|
||||
```python
|
||||
import pandas as pd
|
||||
|
||||
with pdfplumber.open("document.pdf") as pdf:
|
||||
all_tables = []
|
||||
for page in pdf.pages:
|
||||
tables = page.extract_tables()
|
||||
for table in tables:
|
||||
if table: # Check if table is not empty
|
||||
df = pd.DataFrame(table[1:], columns=table[0])
|
||||
all_tables.append(df)
|
||||
|
||||
# Combine all tables
|
||||
if all_tables:
|
||||
combined_df = pd.concat(all_tables, ignore_index=True)
|
||||
combined_df.to_excel("extracted_tables.xlsx", index=False)
|
||||
```
|
||||
|
||||
### reportlab — criacao de PDFs
|
||||
|
||||
#### Basic PDF Creation
|
||||
```python
|
||||
from reportlab.lib.pagesizes import letter
|
||||
from reportlab.pdfgen import canvas
|
||||
|
||||
c = canvas.Canvas("hello.pdf", pagesize=letter)
|
||||
width, height = letter
|
||||
|
||||
# Add text
|
||||
c.drawString(100, height - 100, "Hello World!")
|
||||
c.drawString(100, height - 120, "This is a PDF created with reportlab")
|
||||
|
||||
# Add a line
|
||||
c.line(100, height - 140, 400, height - 140)
|
||||
|
||||
# Save
|
||||
c.save()
|
||||
```
|
||||
|
||||
#### Create PDF with Multiple Pages
|
||||
```python
|
||||
from reportlab.lib.pagesizes import letter
|
||||
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
|
||||
from reportlab.lib.styles import getSampleStyleSheet
|
||||
|
||||
doc = SimpleDocTemplate("report.pdf", pagesize=letter)
|
||||
styles = getSampleStyleSheet()
|
||||
story = []
|
||||
|
||||
# Add content
|
||||
title = Paragraph("Report Title", styles['Title'])
|
||||
story.append(title)
|
||||
story.append(Spacer(1, 12))
|
||||
|
||||
body = Paragraph("This is the body of the report. " * 20, styles['Normal'])
|
||||
story.append(body)
|
||||
story.append(PageBreak())
|
||||
|
||||
# Page 2
|
||||
story.append(Paragraph("Page 2", styles['Heading1']))
|
||||
story.append(Paragraph("Content for page 2", styles['Normal']))
|
||||
|
||||
# Build PDF
|
||||
doc.build(story)
|
||||
```
|
||||
|
||||
#### Subscripts and Superscripts
|
||||
|
||||
**Importante**: nunca usar caracteres Unicode subscript/superscript (subscript: ₀₁₂₃₄₅₆₇₈₉, superscript: ⁰¹²³⁴⁵⁶⁷⁸⁹) em PDFs ReportLab. As fontes built-in nao incluem estes glifos, resultando em caixas pretas.
|
||||
|
||||
Usar as tags XML do ReportLab em objectos Paragraph:
|
||||
```python
|
||||
from reportlab.platypus import Paragraph
|
||||
from reportlab.lib.styles import getSampleStyleSheet
|
||||
|
||||
styles = getSampleStyleSheet()
|
||||
|
||||
# Subscripts: use <sub> tag
|
||||
chemical = Paragraph("H<sub>2</sub>O", styles['Normal'])
|
||||
|
||||
# Superscripts: use <super> tag
|
||||
squared = Paragraph("x<super>2</super> + y<super>2</super>", styles['Normal'])
|
||||
```
|
||||
|
||||
Para texto desenhado com canvas (nao Paragraph), ajustar manualmente o tamanho da fonte e posicao.
|
||||
|
||||
## Ferramentas de linha de comandos
|
||||
|
||||
### pdftotext (poppler-utils)
|
||||
```bash
|
||||
# Extract text
|
||||
pdftotext input.pdf output.txt
|
||||
|
||||
# Extract text preserving layout
|
||||
pdftotext -layout input.pdf output.txt
|
||||
|
||||
# Extract specific pages
|
||||
pdftotext -f 1 -l 5 input.pdf output.txt # Pages 1-5
|
||||
```
|
||||
|
||||
### qpdf
|
||||
```bash
|
||||
# Merge PDFs
|
||||
qpdf --empty --pages file1.pdf file2.pdf -- merged.pdf
|
||||
|
||||
# Split pages
|
||||
qpdf input.pdf --pages . 1-5 -- pages1-5.pdf
|
||||
qpdf input.pdf --pages . 6-10 -- pages6-10.pdf
|
||||
|
||||
# Rotate pages
|
||||
qpdf input.pdf output.pdf --rotate=+90:1 # Rotate page 1 by 90 degrees
|
||||
|
||||
# Remove password
|
||||
qpdf --password=mypassword --decrypt encrypted.pdf decrypted.pdf
|
||||
```
|
||||
|
||||
### pdftk (if available)
|
||||
```bash
|
||||
# Merge
|
||||
pdftk file1.pdf file2.pdf cat output merged.pdf
|
||||
|
||||
# Split
|
||||
pdftk input.pdf burst
|
||||
|
||||
# Rotate
|
||||
pdftk input.pdf rotate 1east output rotated.pdf
|
||||
```
|
||||
|
||||
## Tarefas comuns
|
||||
|
||||
### Extrair texto de PDFs digitalizados (OCR)
|
||||
```python
|
||||
# Requires: pip install pytesseract pdf2image
|
||||
import pytesseract
|
||||
from pdf2image import convert_from_path
|
||||
|
||||
# Convert PDF to images
|
||||
images = convert_from_path('scanned.pdf')
|
||||
|
||||
# OCR each page
|
||||
text = ""
|
||||
for i, image in enumerate(images):
|
||||
text += f"Page {i+1}:\n"
|
||||
text += pytesseract.image_to_string(image)
|
||||
text += "\n\n"
|
||||
|
||||
print(text)
|
||||
```
|
||||
|
||||
### Adicionar watermark
|
||||
```python
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
|
||||
# Create watermark (or load existing)
|
||||
watermark = PdfReader("watermark.pdf").pages[0]
|
||||
|
||||
# Apply to all pages
|
||||
reader = PdfReader("document.pdf")
|
||||
writer = PdfWriter()
|
||||
|
||||
for page in reader.pages:
|
||||
page.merge_page(watermark)
|
||||
writer.add_page(page)
|
||||
|
||||
with open("watermarked.pdf", "wb") as output:
|
||||
writer.write(output)
|
||||
```
|
||||
|
||||
### Extrair imagens
|
||||
```bash
|
||||
# Using pdfimages (poppler-utils)
|
||||
pdfimages -j input.pdf output_prefix
|
||||
|
||||
# This extracts all images as output_prefix-000.jpg, output_prefix-001.jpg, etc.
|
||||
```
|
||||
|
||||
### Proteccao por password
|
||||
```python
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
|
||||
reader = PdfReader("input.pdf")
|
||||
writer = PdfWriter()
|
||||
|
||||
for page in reader.pages:
|
||||
writer.add_page(page)
|
||||
|
||||
# Add password
|
||||
writer.encrypt("userpassword", "ownerpassword")
|
||||
|
||||
with open("encrypted.pdf", "wb") as output:
|
||||
writer.write(output)
|
||||
```
|
||||
|
||||
## Referencia rapida
|
||||
|
||||
| Tarefa | Melhor ferramenta | Comando/codigo |
|
||||
|--------|-------------------|----------------|
|
||||
| Merge PDFs | pypdf | `writer.add_page(page)` |
|
||||
| Split PDFs | pypdf | One page per file |
|
||||
| Extrair texto | pdfplumber | `page.extract_text()` |
|
||||
| Extrair tabelas | pdfplumber | `page.extract_tables()` |
|
||||
| Criar PDFs | reportlab | Canvas or Platypus |
|
||||
| Merge CLI | qpdf | `qpdf --empty --pages ...` |
|
||||
| OCR scanned PDFs | pytesseract | Convert to image first |
|
||||
| Preencher formularios | pypdf ou annotations | Ver seccao abaixo |
|
||||
|
||||
---
|
||||
|
||||
## Preenchimento de formularios
|
||||
|
||||
**Obrigatorio: seguir estes passos por ordem. Nao saltar para codigo directamente.**
|
||||
|
||||
Primeiro verificar se o PDF tem campos preenchíveis. Executar a partir da pasta de scripts desta skill:
|
||||
`python scripts/check_fillable_fields.py <file.pdf>`
|
||||
|
||||
Consoante o resultado, seguir a seccao "Campos preenchíveis" ou "Campos nao preenchíveis".
|
||||
|
||||
### Campos preenchíveis
|
||||
|
||||
Se o PDF tiver campos de formulario nativos:
|
||||
|
||||
1. Extrair informacao dos campos:
|
||||
`python scripts/extract_form_field_info.py <input.pdf> <field_info.json>`
|
||||
|
||||
O JSON resultante contem campos com esta estrutura:
|
||||
```json
|
||||
[
|
||||
{
|
||||
"field_id": "(ID unico do campo)",
|
||||
"page": "(numero da pagina, 1-based)",
|
||||
"rect": "[left, bottom, right, top]",
|
||||
"type": "text | checkbox | radio_group | choice"
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
Para **checkboxes**: propriedades `checked_value` e `unchecked_value`.
|
||||
Para **radio groups**: lista `radio_options` com `value` e `rect`.
|
||||
Para **choice fields**: lista `choice_options` com `value` e `text`.
|
||||
|
||||
2. Converter PDF para imagens para analise visual:
|
||||
`python scripts/convert_pdf_to_images.py <file.pdf> <output_directory>`
|
||||
Analisar as imagens para determinar o proposito de cada campo.
|
||||
|
||||
3. Criar `field_values.json`:
|
||||
```json
|
||||
[
|
||||
{
|
||||
"field_id": "last_name",
|
||||
"description": "Apelido do utilizador",
|
||||
"page": 1,
|
||||
"value": "Silva"
|
||||
},
|
||||
{
|
||||
"field_id": "Checkbox12",
|
||||
"description": "Checkbox para maiores de 18",
|
||||
"page": 1,
|
||||
"value": "/On"
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
4. Preencher:
|
||||
`python scripts/fill_fillable_fields.py <input.pdf> <field_values.json> <output.pdf>`
|
||||
|
||||
### Campos nao preenchíveis
|
||||
|
||||
Se o PDF nao tiver campos nativos, usar anotacoes de texto. Tentar primeiro extraccao por estrutura (mais preciso), depois estimativa visual como fallback.
|
||||
|
||||
#### Passo 1: extraccao por estrutura
|
||||
|
||||
`python scripts/extract_form_structure.py <input.pdf> form_structure.json`
|
||||
|
||||
Extrai labels de texto, linhas horizontais e checkboxes com coordenadas exactas.
|
||||
|
||||
**Se form_structure.json tiver labels significativos** -> usar abordagem A (estrutura).
|
||||
**Se o PDF for digitalizado/imagem** -> usar abordagem B (visual).
|
||||
|
||||
#### Abordagem A: coordenadas por estrutura (preferida)
|
||||
|
||||
Analisar form_structure.json e identificar:
|
||||
- **Label groups**: elementos de texto adjacentes que formam um label
|
||||
- **Row structure**: labels com `top` similar estao na mesma linha
|
||||
- **Field columns**: areas de entrada comecam apos o label (x0 = label.x1 + gap)
|
||||
- **Checkboxes**: usar coordenadas directamente do JSON
|
||||
|
||||
Criar fields.json com `pdf_width`/`pdf_height`:
|
||||
```json
|
||||
{
|
||||
"pages": [
|
||||
{"page_number": 1, "pdf_width": 612, "pdf_height": 792}
|
||||
],
|
||||
"form_fields": [
|
||||
{
|
||||
"page_number": 1,
|
||||
"description": "Campo apelido",
|
||||
"field_label": "Apelido",
|
||||
"label_bounding_box": [43, 63, 87, 73],
|
||||
"entry_bounding_box": [92, 63, 260, 79],
|
||||
"entry_text": {"text": "Silva", "font_size": 10}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
#### Abordagem B: estimativa visual (fallback)
|
||||
|
||||
1. Converter PDF para imagens:
|
||||
`python scripts/convert_pdf_to_images.py <input.pdf> <images_dir/>`
|
||||
|
||||
2. Identificar campos e posicoes aproximadas nas imagens.
|
||||
|
||||
3. Refinar com zoom (ImageMagick):
|
||||
```bash
|
||||
magick <page_image> -crop <width>x<height>+<x>+<y> +repage <crop_output.png>
|
||||
```
|
||||
|
||||
Converter coordenadas do crop de volta para coordenadas da imagem completa:
|
||||
- full_x = crop_x + crop_offset_x
|
||||
- full_y = crop_y + crop_offset_y
|
||||
|
||||
4. Criar fields.json com `image_width`/`image_height`.
|
||||
|
||||
#### Abordagem hibrida
|
||||
|
||||
Quando a extraccao por estrutura funciona para a maioria dos campos mas faltam alguns:
|
||||
1. Usar abordagem A para campos detectados
|
||||
2. Usar abordagem B para campos em falta
|
||||
3. Converter todas as coordenadas para PDF:
|
||||
- pdf_x = image_x * (pdf_width / image_width)
|
||||
- pdf_y = image_y * (pdf_height / image_height)
|
||||
4. Usar sistema de coordenadas unico com `pdf_width`/`pdf_height`
|
||||
|
||||
#### Validacao e preenchimento
|
||||
|
||||
Validar bounding boxes antes de preencher:
|
||||
`python scripts/check_bounding_boxes.py fields.json`
|
||||
|
||||
Preencher o formulario:
|
||||
`python scripts/fill_pdf_form_with_annotations.py <input.pdf> fields.json <output.pdf>`
|
||||
|
||||
Verificar resultado:
|
||||
`python scripts/convert_pdf_to_images.py <output.pdf> <verify_images/>`
|
||||
|
||||
Criar imagem de validacao com bounding boxes sobrepostas:
|
||||
`python scripts/create_validation_image.py <page_number> <fields.json> <input_image> <output_image>`
|
||||
|
||||
---
|
||||
|
||||
## Referencia avancada
|
||||
|
||||
### pypdfium2 — rendering rapido
|
||||
|
||||
```python
|
||||
import pypdfium2 as pdfium
|
||||
from PIL import Image
|
||||
|
||||
# Load PDF
|
||||
pdf = pdfium.PdfDocument("document.pdf")
|
||||
|
||||
# Render page to image
|
||||
page = pdf[0]
|
||||
bitmap = page.render(scale=2.0, rotation=0)
|
||||
img = bitmap.to_pil()
|
||||
img.save("page_1.png", "PNG")
|
||||
|
||||
# Process multiple pages
|
||||
for i, page in enumerate(pdf):
|
||||
bitmap = page.render(scale=1.5)
|
||||
img = bitmap.to_pil()
|
||||
img.save(f"page_{i+1}.jpg", "JPEG", quality=90)
|
||||
```
|
||||
|
||||
### pdfplumber — funcionalidades avancadas
|
||||
|
||||
#### Texto com coordenadas precisas
|
||||
```python
|
||||
import pdfplumber
|
||||
|
||||
with pdfplumber.open("document.pdf") as pdf:
|
||||
page = pdf.pages[0]
|
||||
|
||||
# Extract all text with coordinates
|
||||
chars = page.chars
|
||||
for char in chars[:10]:
|
||||
print(f"Char: '{char['text']}' at x:{char['x0']:.1f} y:{char['y0']:.1f}")
|
||||
|
||||
# Extract text by bounding box (left, top, right, bottom)
|
||||
bbox_text = page.within_bbox((100, 100, 400, 200)).extract_text()
|
||||
```
|
||||
|
||||
#### Tabelas complexas com settings customizados
|
||||
```python
|
||||
import pdfplumber
|
||||
import pandas as pd
|
||||
|
||||
with pdfplumber.open("complex_table.pdf") as pdf:
|
||||
page = pdf.pages[0]
|
||||
|
||||
table_settings = {
|
||||
"vertical_strategy": "lines",
|
||||
"horizontal_strategy": "lines",
|
||||
"snap_tolerance": 3,
|
||||
"intersection_tolerance": 15
|
||||
}
|
||||
tables = page.extract_tables(table_settings)
|
||||
|
||||
# Visual debugging for table extraction
|
||||
img = page.to_image(resolution=150)
|
||||
img.save("debug_layout.png")
|
||||
```
|
||||
|
||||
### reportlab — relatorios profissionais com tabelas
|
||||
|
||||
```python
|
||||
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph
|
||||
from reportlab.lib.styles import getSampleStyleSheet
|
||||
from reportlab.lib import colors
|
||||
|
||||
data = [
|
||||
['Produto', 'Q1', 'Q2', 'Q3', 'Q4'],
|
||||
['Widgets', '120', '135', '142', '158'],
|
||||
['Gadgets', '85', '92', '98', '105']
|
||||
]
|
||||
|
||||
doc = SimpleDocTemplate("report.pdf")
|
||||
elements = []
|
||||
|
||||
styles = getSampleStyleSheet()
|
||||
title = Paragraph("Relatorio Trimestral de Vendas", styles['Title'])
|
||||
elements.append(title)
|
||||
|
||||
table = Table(data)
|
||||
table.setStyle(TableStyle([
|
||||
('BACKGROUND', (0, 0), (-1, 0), colors.grey),
|
||||
('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
|
||||
('ALIGN', (0, 0), (-1, -1), 'CENTER'),
|
||||
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
||||
('FONTSIZE', (0, 0), (-1, 0), 14),
|
||||
('BOTTOMPADDING', (0, 0), (-1, 0), 12),
|
||||
('BACKGROUND', (0, 1), (-1, -1), colors.beige),
|
||||
('GRID', (0, 0), (-1, -1), 1, colors.black)
|
||||
]))
|
||||
elements.append(table)
|
||||
|
||||
doc.build(elements)
|
||||
```
|
||||
|
||||
### JavaScript — pdf-lib (criacao e modificacao)
|
||||
|
||||
#### Load and Manipulate Existing PDF
|
||||
```javascript
|
||||
import { PDFDocument } from 'pdf-lib';
|
||||
import fs from 'fs';
|
||||
|
||||
async function manipulatePDF() {
|
||||
const existingPdfBytes = fs.readFileSync('input.pdf');
|
||||
const pdfDoc = await PDFDocument.load(existingPdfBytes);
|
||||
|
||||
const pageCount = pdfDoc.getPageCount();
|
||||
const newPage = pdfDoc.addPage([600, 400]);
|
||||
newPage.drawText('Added by pdf-lib', { x: 100, y: 300, size: 16 });
|
||||
|
||||
const pdfBytes = await pdfDoc.save();
|
||||
fs.writeFileSync('modified.pdf', pdfBytes);
|
||||
}
|
||||
```
|
||||
|
||||
#### Advanced Merge and Split
|
||||
```javascript
|
||||
import { PDFDocument } from 'pdf-lib';
|
||||
import fs from 'fs';
|
||||
|
||||
async function mergePDFs() {
|
||||
const mergedPdf = await PDFDocument.create();
|
||||
|
||||
const pdf1 = await PDFDocument.load(fs.readFileSync('doc1.pdf'));
|
||||
const pdf2 = await PDFDocument.load(fs.readFileSync('doc2.pdf'));
|
||||
|
||||
const pdf1Pages = await mergedPdf.copyPages(pdf1, pdf1.getPageIndices());
|
||||
pdf1Pages.forEach(page => mergedPdf.addPage(page));
|
||||
|
||||
const pdf2Pages = await mergedPdf.copyPages(pdf2, [0, 2, 4]);
|
||||
pdf2Pages.forEach(page => mergedPdf.addPage(page));
|
||||
|
||||
fs.writeFileSync('merged.pdf', await mergedPdf.save());
|
||||
}
|
||||
```
|
||||
|
||||
### Operacoes avancadas CLI
|
||||
|
||||
#### poppler-utils
|
||||
```bash
|
||||
# Text with bounding box coordinates
|
||||
pdftotext -bbox-layout document.pdf output.xml
|
||||
|
||||
# High-resolution PNG conversion
|
||||
pdftoppm -png -r 300 document.pdf output_prefix
|
||||
|
||||
# Specific page range with high resolution
|
||||
pdftoppm -png -r 600 -f 1 -l 3 document.pdf high_res_pages
|
||||
|
||||
# Extract all embedded images with metadata
|
||||
pdfimages -j -p document.pdf page_images
|
||||
|
||||
# List image info without extracting
|
||||
pdfimages -list document.pdf
|
||||
```
|
||||
|
||||
#### qpdf avancado
|
||||
```bash
|
||||
# Split PDF into groups of pages
|
||||
qpdf --split-pages=3 input.pdf output_group_%02d.pdf
|
||||
|
||||
# Complex page ranges from multiple PDFs
|
||||
qpdf --empty --pages doc1.pdf 1-3 doc2.pdf 5-7 doc3.pdf 2,4 -- combined.pdf
|
||||
|
||||
# Optimize for web (linearize)
|
||||
qpdf --linearize input.pdf optimized.pdf
|
||||
|
||||
# Repair corrupted PDF
|
||||
qpdf --check input.pdf
|
||||
qpdf damaged.pdf repaired.pdf
|
||||
|
||||
# Advanced encryption with permissions
|
||||
qpdf --encrypt user_pass owner_pass 256 --print=none --modify=none -- input.pdf encrypted.pdf
|
||||
|
||||
# Check encryption status
|
||||
qpdf --show-encryption encrypted.pdf
|
||||
```
|
||||
|
||||
### Processamento em lote com error handling
|
||||
|
||||
```python
|
||||
import os
|
||||
import glob
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
import logging
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def batch_process_pdfs(input_dir, operation='merge'):
|
||||
pdf_files = glob.glob(os.path.join(input_dir, "*.pdf"))
|
||||
|
||||
if operation == 'merge':
|
||||
writer = PdfWriter()
|
||||
for pdf_file in pdf_files:
|
||||
try:
|
||||
reader = PdfReader(pdf_file)
|
||||
for page in reader.pages:
|
||||
writer.add_page(page)
|
||||
logger.info(f"Processed: {pdf_file}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to process {pdf_file}: {e}")
|
||||
continue
|
||||
|
||||
with open("batch_merged.pdf", "wb") as output:
|
||||
writer.write(output)
|
||||
|
||||
elif operation == 'extract_text':
|
||||
for pdf_file in pdf_files:
|
||||
try:
|
||||
reader = PdfReader(pdf_file)
|
||||
text = ""
|
||||
for page in reader.pages:
|
||||
text += page.extract_text()
|
||||
|
||||
output_file = pdf_file.replace('.pdf', '.txt')
|
||||
with open(output_file, 'w', encoding='utf-8') as f:
|
||||
f.write(text)
|
||||
logger.info(f"Extracted text from: {pdf_file}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to extract text from {pdf_file}: {e}")
|
||||
continue
|
||||
```
|
||||
|
||||
### Cropping avancado
|
||||
|
||||
```python
|
||||
from pypdf import PdfWriter, PdfReader
|
||||
|
||||
reader = PdfReader("input.pdf")
|
||||
writer = PdfWriter()
|
||||
|
||||
page = reader.pages[0]
|
||||
page.mediabox.left = 50
|
||||
page.mediabox.bottom = 50
|
||||
page.mediabox.right = 550
|
||||
page.mediabox.top = 750
|
||||
|
||||
writer.add_page(page)
|
||||
with open("cropped.pdf", "wb") as output:
|
||||
writer.write(output)
|
||||
```
|
||||
|
||||
### Gestao de memoria para PDFs grandes
|
||||
|
||||
```python
|
||||
def process_large_pdf(pdf_path, chunk_size=10):
|
||||
reader = PdfReader(pdf_path)
|
||||
total_pages = len(reader.pages)
|
||||
|
||||
for start_idx in range(0, total_pages, chunk_size):
|
||||
end_idx = min(start_idx + chunk_size, total_pages)
|
||||
writer = PdfWriter()
|
||||
|
||||
for i in range(start_idx, end_idx):
|
||||
writer.add_page(reader.pages[i])
|
||||
|
||||
with open(f"chunk_{start_idx//chunk_size}.pdf", "wb") as output:
|
||||
writer.write(output)
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### PDFs encriptados
|
||||
```python
|
||||
from pypdf import PdfReader
|
||||
|
||||
try:
|
||||
reader = PdfReader("encrypted.pdf")
|
||||
if reader.is_encrypted:
|
||||
reader.decrypt("password")
|
||||
except Exception as e:
|
||||
print(f"Failed to decrypt: {e}")
|
||||
```
|
||||
|
||||
### PDFs corrompidos
|
||||
```bash
|
||||
qpdf --check corrupted.pdf
|
||||
qpdf --replace-input corrupted.pdf
|
||||
```
|
||||
|
||||
### Falha na extraccao de texto (fallback para OCR)
|
||||
```python
|
||||
import pytesseract
|
||||
from pdf2image import convert_from_path
|
||||
|
||||
def extract_text_with_ocr(pdf_path):
|
||||
images = convert_from_path(pdf_path)
|
||||
text = ""
|
||||
for i, image in enumerate(images):
|
||||
text += pytesseract.image_to_string(image)
|
||||
return text
|
||||
```
|
||||
|
||||
## Dicas de performance
|
||||
|
||||
1. **PDFs grandes**: usar streaming em vez de carregar tudo em memoria; `qpdf --split-pages` para dividir
|
||||
2. **Extraccao de texto**: `pdftotext -bbox-layout` e o mais rapido; pdfplumber para tabelas
|
||||
3. **Extraccao de imagens**: `pdfimages` e muito mais rapido que rendering de paginas
|
||||
4. **Preenchimento de formularios**: pdf-lib mantem melhor a estrutura do formulario
|
||||
5. **Memoria**: processar paginas individualmente com pypdfium2 para documentos grandes
|
||||
|
||||
---
|
||||
|
||||
## Integracao Descomplicar
|
||||
|
||||
### Caminhos frequentes para PDFs
|
||||
|
||||
| Localizacao | Caminho |
|
||||
|-------------|---------|
|
||||
| Documentos empresa | `/media/ealmeida/Dados/GDrive/Cloud/Descomplicar/` |
|
||||
| Propostas | `/media/ealmeida/Dados/Hub/03-Propostas/` |
|
||||
| Arquivo clientes | `/media/ealmeida/Dados/GDrive/Arquivo_de_Clientes/` |
|
||||
| Knowledge Base | `/media/ealmeida/Dados/Hub/06-Operacoes/Knowledge-Base/PDFs/` |
|
||||
| Backups | `/media/ealmeida/Dados/GDrive/Backups/` |
|
||||
| Temporarios | `~/.claude-work/` (limpar ao concluir) |
|
||||
|
||||
### MCPs relevantes
|
||||
|
||||
- **mcp__filesystem__read_file** / **write_file**: ler e escrever PDFs locais
|
||||
- **mcp__filesystem__search_files**: encontrar PDFs no sistema
|
||||
- **mcp__google-workspace__drive_search_files**: encontrar PDFs no Google Drive
|
||||
- **mcp__google-workspace__drive_read_file_content**: ler conteudo de ficheiros no Drive
|
||||
- **mcp__google-workspace__drive_upload_file**: enviar PDFs processados para o Drive
|
||||
|
||||
### Workflow tipico Descomplicar
|
||||
|
||||
1. Localizar PDF (filesystem ou Google Drive)
|
||||
2. Descarregar para `~/.claude-work/` se necessario
|
||||
3. Processar (extrair, merge, split, OCR, etc.)
|
||||
4. Guardar resultado no destino final
|
||||
5. Limpar temporarios de `~/.claude-work/`
|
||||
|
||||
---
|
||||
|
||||
## Licencas das bibliotecas
|
||||
|
||||
- **pypdf**: BSD | **pdfplumber**: MIT | **pypdfium2**: Apache/BSD | **reportlab**: BSD
|
||||
- **poppler-utils**: GPL-2 | **qpdf**: Apache | **pdf-lib**: MIT | **pdfjs-dist**: Apache
|
||||
|
||||
---
|
||||
**Versao**: 1.0.0 | **Autor**: Descomplicar®
|
||||
65
dev-tools/skills/pdf/scripts/check_bounding_boxes.py
Normal file
65
dev-tools/skills/pdf/scripts/check_bounding_boxes.py
Normal file
@@ -0,0 +1,65 @@
|
||||
from dataclasses import dataclass
|
||||
import json
|
||||
import sys
|
||||
|
||||
|
||||
|
||||
|
||||
@dataclass
class RectAndField:
    """One bounding box together with the form field it was read from."""

    # Box coordinates; the checks below index it as [0]..[3].
    rect: list[float]
    # Either "label" or "entry", i.e. which box of the field this is.
    rect_type: str
    # The originating item of the "form_fields" list.
    field: dict


def get_bounding_box_messages(fields_json_stream) -> list[str]:
    """Check the label/entry bounding boxes of a fields.json document.

    Args:
        fields_json_stream: Readable text stream containing a JSON object
            with a "form_fields" list. Every item must provide
            "page_number", "description", "label_bounding_box" and
            "entry_bounding_box"; "entry_text" is optional.

    Returns:
        Human-readable report lines. The first line states how many fields
        were read. Each problem adds a "FAILURE: ..." line, and a final
        "SUCCESS: ..." line is added only when no problem was found. Once
        the report holds 20 or more lines after a failure is recorded, an
        abort notice is appended and the report is returned early.

    Raises:
        KeyError: If a required key is missing from the document.
    """
    abort_notice = "Aborting further checks; fix bounding boxes and try again"

    spec = json.load(fields_json_stream)
    form_fields = spec["form_fields"]
    report = [f"Read {len(form_fields)} fields"]

    def overlaps(a, b):
        # Indices 0/2 bound the horizontal extent and 1/3 the vertical one.
        # Boxes that merely share an edge are treated as not overlapping.
        separated_x = a[0] >= b[2] or a[2] <= b[0]
        separated_y = a[1] >= b[3] or a[3] <= b[1]
        return not separated_x and not separated_y

    # Flatten every field into its label box followed by its entry box.
    boxes = []
    for form_field in form_fields:
        boxes.append(RectAndField(form_field["label_bounding_box"], "label", form_field))
        boxes.append(RectAndField(form_field["entry_bounding_box"], "entry", form_field))

    all_valid = True
    for index, current in enumerate(boxes):
        # Compare with every later box so each pair is visited exactly once.
        for other in boxes[index + 1:]:
            same_page = current.field["page_number"] == other.field["page_number"]
            if not (same_page and overlaps(current.rect, other.rect)):
                continue
            all_valid = False
            if current.field is other.field:
                report.append(f"FAILURE: intersection between label and entry bounding boxes for `{current.field['description']}` ({current.rect}, {other.rect})")
            else:
                report.append(f"FAILURE: intersection between {current.rect_type} bounding box for `{current.field['description']}` ({current.rect}) and {other.rect_type} bounding box for `{other.field['description']}` ({other.rect})")
            if len(report) >= 20:
                report.append(abort_notice)
                return report

        # Only entry boxes that carry text need the height check.
        if current.rect_type != "entry" or "entry_text" not in current.field:
            continue
        font_size = current.field["entry_text"].get("font_size", 14)
        entry_height = current.rect[3] - current.rect[1]
        if entry_height < font_size:
            all_valid = False
            report.append(f"FAILURE: entry bounding box height ({entry_height}) for `{current.field['description']}` is too short for the text content (font size: {font_size}). Increase the box height or decrease the font size.")
            if len(report) >= 20:
                report.append(abort_notice)
                return report

    if all_valid:
        report.append("SUCCESS: All bounding boxes are valid")
    return report
|
||||
|
||||
if __name__ == "__main__":
    # Exactly one argument is expected: the path to fields.json.
    if len(sys.argv) != 2:
        print("Usage: check_bounding_boxes.py [fields.json]")
        sys.exit(1)
    # Decode explicitly as UTF-8 (the JSON interchange encoding) so accented
    # field descriptions are read the same way regardless of the locale.
    with open(sys.argv[1], encoding="utf-8") as f:
        messages = get_bounding_box_messages(f)
    # Print the report one line per message.
    for msg in messages:
        print(msg)
|
||||
11
dev-tools/skills/pdf/scripts/check_fillable_fields.py
Normal file
11
dev-tools/skills/pdf/scripts/check_fillable_fields.py
Normal file
@@ -0,0 +1,11 @@
|
||||
import sys
|
||||
from pypdf import PdfReader
|
||||
|
||||
|
||||
|
||||
|
||||
# Report whether the PDF given on the command line has fillable form fields.
# Without this guard a missing argument surfaces as an IndexError traceback;
# extra arguments are still tolerated, as before.
if len(sys.argv) < 2:
    print("Usage: check_fillable_fields.py [input pdf]")
    sys.exit(1)

reader = PdfReader(sys.argv[1])
# A truthy get_fields() result means the document exposes form fields.
if reader.get_fields():
    print("This PDF has fillable form fields")
else:
    print("This PDF does not have fillable form fields; you will need to visually determine where to enter data")
|
||||
33
dev-tools/skills/pdf/scripts/convert_pdf_to_images.py
Normal file
33
dev-tools/skills/pdf/scripts/convert_pdf_to_images.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pdf2image import convert_from_path
|
||||
|
||||
|
||||
|
||||
|
||||
def convert(pdf_path, output_dir, max_dim=1000):
    """Render each page of a PDF to ``page_<n>.png`` inside ``output_dir``.

    Pages are rasterised at 200 DPI. Any page whose width or height exceeds
    ``max_dim`` pixels is scaled down, keeping its aspect ratio, so that the
    larger side fits within ``max_dim``.

    Args:
        pdf_path: Path of the PDF to convert.
        output_dir: Directory that receives the PNG files. It is created
            (including parents) when it does not exist yet.
        max_dim: Maximum allowed width/height of a saved image, in pixels.
    """
    images = convert_from_path(pdf_path, dpi=200)

    # Create the target directory only after the PDF rendered successfully,
    # so a failed conversion leaves nothing behind. An empty string means
    # "current directory" for os.path.join and must not reach makedirs.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for i, image in enumerate(images):
        width, height = image.size
        if width > max_dim or height > max_dim:
            scale_factor = min(max_dim / width, max_dim / height)
            # Clamp to 1 px: on extreme aspect ratios int() would truncate
            # the short side to 0, which is not a valid image size.
            new_width = max(1, int(width * scale_factor))
            new_height = max(1, int(height * scale_factor))
            image = image.resize((new_width, new_height))

        image_path = os.path.join(output_dir, f"page_{i+1}.png")
        image.save(image_path)
        print(f"Saved page {i+1} as {image_path} (size: {image.size})")

    print(f"Converted {len(images)} pages to PNG images")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Two positional arguments are required: the PDF and the output folder.
    if len(sys.argv) == 3:
        pdf_path, output_directory = sys.argv[1:]
        convert(pdf_path, output_directory)
    else:
        print("Usage: convert_pdf_to_images.py [input pdf] [output directory]")
        sys.exit(1)
|
||||
37
dev-tools/skills/pdf/scripts/create_validation_image.py
Normal file
37
dev-tools/skills/pdf/scripts/create_validation_image.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import json
|
||||
import sys
|
||||
|
||||
from PIL import Image, ImageDraw
|
||||
|
||||
|
||||
|
||||
|
||||
def create_validation_image(page_number, fields_json_path, input_path, output_path):
    """Outline the bounding boxes of one page's form fields on a page image.

    Reads ``form_fields`` from the JSON file, and for every field whose
    ``page_number`` equals ``page_number`` draws its entry box in red and its
    label box in blue on the image at ``input_path``. The annotated image is
    saved to ``output_path`` and a one-line summary is printed.
    """
    with open(fields_json_path, 'r') as json_file:
        spec = json.load(json_file)

    canvas = Image.open(input_path)
    pen = ImageDraw.Draw(canvas)
    num_boxes = 0

    for entry in spec["form_fields"]:
        if entry["page_number"] != page_number:
            continue
        # Look up both boxes before drawing anything for this field.
        outlines = (
            (entry['entry_bounding_box'], 'red'),
            (entry['label_bounding_box'], 'blue'),
        )
        for box, colour in outlines:
            pen.rectangle(box, outline=colour, width=2)
        num_boxes += len(outlines)

    canvas.save(output_path)
    print(f"Created validation image at {output_path} with {num_boxes} bounding boxes")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: page number, fields JSON, input image, output image.
    if len(sys.argv) != 5:
        print("Usage: create_validation_image.py [page number] [fields.json file] [input image path] [output image path]")
        sys.exit(1)
    page_arg, fields_json_path, input_image_path, output_image_path = sys.argv[1:]
    page_number = int(page_arg)
    create_validation_image(page_number, fields_json_path, input_image_path, output_image_path)
|
||||
122
dev-tools/skills/pdf/scripts/extract_form_field_info.py
Normal file
122
dev-tools/skills/pdf/scripts/extract_form_field_info.py
Normal file
@@ -0,0 +1,122 @@
|
||||
import json
|
||||
import sys
|
||||
|
||||
from pypdf import PdfReader
|
||||
|
||||
|
||||
|
||||
|
||||
def get_full_annotation_field_id(annotation):
    """Return the dotted, fully qualified field name of an annotation.

    Follows the ``/Parent`` chain starting at ``annotation``, collects every
    truthy ``/T`` value along the way, and joins them root-first with ``.``.
    Returns ``None`` when no name was collected.
    """
    names_leaf_first = []
    node = annotation
    while node:
        part = node.get('/T')
        if part:
            names_leaf_first.append(part)
        node = node.get('/Parent')

    if not names_leaf_first:
        return None
    return ".".join(names_leaf_first[::-1])
|
||||
|
||||
|
||||
def make_field_dict(field, field_id):
    """Summarize a PDF form field as a plain dict.

    Args:
        field: Mapping-like field object; ``/FT`` and ``/_States_`` are read
            through ``get``.
        field_id: Field name stored under ``"field_id"``.

    Returns:
        A dict with ``field_id`` and ``type``. Two-state ``/Btn`` fields also
        get ``checked_value`` and ``unchecked_value``; ``/Ch`` fields get
        ``choice_options`` (a list of ``{"value", "text"}`` dicts). Any other
        ``/FT`` is reported as ``"unknown (<ft>)"``.
    """
    field_dict = {"field_id": field_id}
    ft = field.get('/FT')
    if ft == "/Tx":
        field_dict["type"] = "text"
    elif ft == "/Btn":
        field_dict["type"] = "checkbox"
        states = field.get("/_States_", [])
        if len(states) == 2:
            if "/Off" in states:
                # Whichever state is not "/Off" is the checked one.
                field_dict["checked_value"] = states[0] if states[0] != "/Off" else states[1]
                field_dict["unchecked_value"] = "/Off"
            else:
                # No "/Off" state: fall back to positional order and warn.
                # (The message used JS-style `${...}`, which printed a stray "$".)
                print(f"Unexpected state values for checkbox `{field_id}`. Its checked and unchecked values may not be correct; if you're trying to check it, visually verify the results.")
                field_dict["checked_value"] = states[0]
                field_dict["unchecked_value"] = states[1]
    elif ft == "/Ch":
        field_dict["type"] = "choice"
        states = field.get("/_States_", [])
        # Each state is indexed as a (value, text) pair.
        field_dict["choice_options"] = [{
            "value": state[0],
            "text": state[1],
        } for state in states]
    else:
        field_dict["type"] = f"unknown ({ft})"
    return field_dict
|
||||
|
||||
|
||||
def get_field_info(reader: "PdfReader"):
    """Collect per-field metadata, including page and rectangle, for a PDF form.

    Args:
        reader: Object exposing ``get_fields()`` and ``pages`` (a pypdf
            ``PdfReader``).

    Returns:
        A list of field dicts. Ordinary fields come from ``make_field_dict``
        extended with ``page`` (1-based) and ``rect``; ``/Btn`` fields that
        have ``/Kids`` are reported as ``radio_group`` entries with one
        ``radio_options`` item per widget. The list is ordered by page, then
        by descending ``rect[1]``, then by ascending ``rect[0]``. Fields whose
        widget is not found on any page are reported on stdout and dropped.
    """
    # get_fields() may return None when the document has no form fields;
    # treat that as an empty mapping instead of failing on .items().
    fields = reader.get_fields() or {}

    field_info_by_id = {}
    possible_radio_names = set()

    for field_id, field in fields.items():
        # A field with kids is a container; only /Btn containers are kept,
        # as radio-group candidates resolved from page annotations below.
        if field.get("/Kids"):
            if field.get("/FT") == "/Btn":
                possible_radio_names.add(field_id)
            continue
        field_info_by_id[field_id] = make_field_dict(field, field_id)

    radio_fields_by_id = {}

    for page_index, page in enumerate(reader.pages):
        annotations = page.get('/Annots', [])
        for ann in annotations:
            field_id = get_full_annotation_field_id(ann)
            if field_id in field_info_by_id:
                field_info_by_id[field_id]["page"] = page_index + 1
                field_info_by_id[field_id]["rect"] = ann.get('/Rect')
            elif field_id in possible_radio_names:
                try:
                    # Appearance states other than "/Off" name the value this
                    # widget selects.
                    on_values = [v for v in ann["/AP"]["/N"] if v != "/Off"]
                except KeyError:
                    continue
                if len(on_values) == 1:
                    rect = ann.get("/Rect")
                    if field_id not in radio_fields_by_id:
                        # The group's page is that of its first widget seen.
                        radio_fields_by_id[field_id] = {
                            "field_id": field_id,
                            "type": "radio_group",
                            "page": page_index + 1,
                            "radio_options": [],
                        }
                    radio_fields_by_id[field_id]["radio_options"].append({
                        "value": on_values[0],
                        "rect": rect,
                    })

    fields_with_location = []
    for field_info in field_info_by_id.values():
        if "page" in field_info:
            fields_with_location.append(field_info)
        else:
            print(f"Unable to determine location for field id: {field_info.get('field_id')}, ignoring")

    def sort_key(f):
        # Radio groups sort by their first option; a missing rect sorts as the origin.
        if "radio_options" in f:
            rect = f["radio_options"][0]["rect"] or [0, 0, 0, 0]
        else:
            rect = f.get("rect") or [0, 0, 0, 0]
        adjusted_position = [-rect[1], rect[0]]
        return [f.get("page"), adjusted_position]

    sorted_fields = fields_with_location + list(radio_fields_by_id.values())
    sorted_fields.sort(key=sort_key)

    return sorted_fields
|
||||
|
||||
|
||||
def write_field_info(pdf_path: str, json_output_path: str):
    """Extract form-field metadata from a PDF and write it as indented JSON.

    Prints how many fields were written and where.
    """
    extracted = get_field_info(PdfReader(pdf_path))
    with open(json_output_path, "w") as out_file:
        json.dump(extracted, out_file, indent=2)
    print(f"Wrote {len(extracted)} fields to {json_output_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: input PDF path and output JSON path.
    if len(sys.argv) != 3:
        print("Usage: extract_form_field_info.py [input pdf] [output json]")
        sys.exit(1)
    write_field_info(*sys.argv[1:])
|
||||
115
dev-tools/skills/pdf/scripts/extract_form_structure.py
Normal file
115
dev-tools/skills/pdf/scripts/extract_form_structure.py
Normal file
@@ -0,0 +1,115 @@
|
||||
"""
|
||||
Extract form structure from a non-fillable PDF.
|
||||
|
||||
This script analyzes the PDF to find:
|
||||
- Text labels with their exact coordinates
|
||||
- Horizontal lines (row boundaries)
|
||||
- Checkboxes (small rectangles)
|
||||
|
||||
Output: A JSON file with the form structure that can be used to generate
|
||||
accurate field coordinates for filling.
|
||||
|
||||
Usage: python extract_form_structure.py <input.pdf> <output.json>
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
import pdfplumber
|
||||
|
||||
|
||||
def extract_form_structure(pdf_path):
    """Collect page sizes, words, wide lines, small squares and row bands from a PDF.

    Returns a dict with the keys ``pages``, ``labels``, ``lines``,
    ``checkboxes`` and ``row_boundaries``. Coordinates are rounded to one
    decimal place and page numbers are 1-based.
    """
    pages = []
    labels = []
    lines = []
    checkboxes = []
    row_boundaries = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            pages.append({
                "page_number": page_num,
                "width": float(page.width),
                "height": float(page.height)
            })

            for word in page.extract_words():
                labels.append({
                    "page": page_num,
                    "text": word["text"],
                    "x0": round(float(word["x0"]), 1),
                    "top": round(float(word["top"]), 1),
                    "x1": round(float(word["x1"]), 1),
                    "bottom": round(float(word["bottom"]), 1)
                })

            # Keep only lines whose horizontal extent exceeds half the page width.
            for segment in page.lines:
                seg_x0 = float(segment["x0"])
                seg_x1 = float(segment["x1"])
                if abs(seg_x1 - seg_x0) > page.width * 0.5:
                    lines.append({
                        "page": page_num,
                        "y": round(float(segment["top"]), 1),
                        "x0": round(seg_x0, 1),
                        "x1": round(seg_x1, 1)
                    })

            # A rectangle counts as a checkbox when both sides are within
            # 5..15 units and differ by less than 2.
            for box in page.rects:
                left = float(box["x0"])
                right = float(box["x1"])
                upper = float(box["top"])
                lower = float(box["bottom"])
                width = right - left
                height = lower - upper
                if 5 <= width <= 15 and 5 <= height <= 15 and abs(width - height) < 2:
                    checkboxes.append({
                        "page": page_num,
                        "x0": round(left, 1),
                        "top": round(upper, 1),
                        "x1": round(right, 1),
                        "bottom": round(lower, 1),
                        "center_x": round((left + right) / 2, 1),
                        "center_y": round((upper + lower) / 2, 1)
                    })

    # Group the y position of every kept line by page.
    ys_by_page = {}
    for entry in lines:
        ys_by_page.setdefault(entry["page"], []).append(entry["y"])

    # Each pair of consecutive distinct y values bounds one row.
    for page_no, ys in ys_by_page.items():
        ordered = sorted(set(ys))
        for row_top, row_bottom in zip(ordered, ordered[1:]):
            row_boundaries.append({
                "page": page_no,
                "row_top": row_top,
                "row_bottom": row_bottom,
                "row_height": round(row_bottom - row_top, 1)
            })

    return {
        "pages": pages,
        "labels": labels,
        "lines": lines,
        "checkboxes": checkboxes,
        "row_boundaries": row_boundaries
    }
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: write a PDF's form structure to JSON and print a summary."""
    if len(sys.argv) != 3:
        print("Usage: extract_form_structure.py <input.pdf> <output.json>")
        sys.exit(1)

    pdf_path, output_path = sys.argv[1:]

    print(f"Extracting structure from {pdf_path}...")
    structure = extract_form_structure(pdf_path)

    with open(output_path, "w") as out_file:
        json.dump(structure, out_file, indent=2)

    # (structure key, noun shown in the summary), in output order.
    summary_rows = (
        ("pages", "pages"),
        ("labels", "text labels"),
        ("lines", "horizontal lines"),
        ("checkboxes", "checkboxes"),
        ("row_boundaries", "row boundaries"),
    )
    print("Found:")
    for key, noun in summary_rows:
        print(f" - {len(structure[key])} {noun}")
    print(f"Saved to {output_path}")
|
||||
|
||||
|
||||
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
98
dev-tools/skills/pdf/scripts/fill_fillable_fields.py
Normal file
98
dev-tools/skills/pdf/scripts/fill_fillable_fields.py
Normal file
@@ -0,0 +1,98 @@
|
||||
import json
|
||||
import sys
|
||||
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
|
||||
from extract_form_field_info import get_field_info
|
||||
|
||||
|
||||
|
||||
|
||||
def fill_pdf_fields(input_pdf_path: str, fields_json_path: str, output_pdf_path: str):
    """Fill a PDF's form fields with values from a JSON file and save the result.

    Every JSON entry is checked against the field metadata extracted from the
    PDF (known field id, matching page, permitted value). All problems are
    printed; if any was found the process exits with status 1 and nothing is
    written.
    """
    with open(fields_json_path) as json_file:
        requested = json.load(json_file)

    # 1-based page number -> {field_id: value}, for entries that carry a value.
    values_by_page = {}
    for entry in requested:
        if "value" not in entry:
            continue
        field_id = entry["field_id"]
        page = entry["page"]
        values_by_page.setdefault(page, {})[field_id] = entry["value"]

    reader = PdfReader(input_pdf_path)

    known_fields = {info["field_id"]: info for info in get_field_info(reader)}
    problems_found = False
    for entry in requested:
        existing = known_fields.get(entry["field_id"])
        if not existing:
            problems_found = True
            print(f"ERROR: `{entry['field_id']}` is not a valid field ID")
            continue
        if entry["page"] != existing["page"]:
            problems_found = True
            print(f"ERROR: Incorrect page number for `{entry['field_id']}` (got {entry['page']}, expected {existing['page']})")
            continue
        if "value" in entry:
            message = validation_error_for_field_value(existing, entry["value"])
            if message:
                print(message)
                problems_found = True
    if problems_found:
        sys.exit(1)

    writer = PdfWriter(clone_from=reader)
    for page, page_values in values_by_page.items():
        target_page = writer.pages[page - 1]
        writer.update_page_form_field_values(target_page, page_values, auto_regenerate=False)

    writer.set_need_appearances_writer(True)

    with open(output_pdf_path, "wb") as out_file:
        writer.write(out_file)
|
||||
|
||||
|
||||
def validation_error_for_field_value(field_info, field_value):
    """Return an error message when ``field_value`` is not allowed for a field.

    Checkboxes accept only their checked or unchecked value; radio groups and
    choice fields accept only one of their listed option values. Any other
    field type is not validated. Returns ``None`` when the value is acceptable.
    """
    kind = field_info["type"]
    name = field_info["field_id"]

    if kind == "checkbox":
        on_value = field_info["checked_value"]
        off_value = field_info["unchecked_value"]
        if not (field_value == on_value or field_value == off_value):
            return f'ERROR: Invalid value "{field_value}" for checkbox field "{name}". The checked value is "{on_value}" and the unchecked value is "{off_value}"'
        return None

    # Option-list field types: type -> (key holding the options, label used in the message).
    option_sources = {
        "radio_group": ("radio_options", "radio group"),
        "choice": ("choice_options", "choice"),
    }
    if kind in option_sources:
        options_key, label = option_sources[kind]
        allowed = [option["value"] for option in field_info[options_key]]
        if field_value not in allowed:
            return f'ERROR: Invalid value "{field_value}" for {label} field "{name}". Valid values are: {allowed}'
    return None
|
||||
|
||||
|
||||
def monkeypatch_pydpf_method():
    """Replace ``DictionaryObject.get_inherited`` with a wrapper that flattens option pairs.

    The wrapper calls the original method. When the requested key equals
    ``FieldDictionaryAttributes.Opt`` and the result is a list made up solely
    of two-item lists, it returns the first item of each pair instead; every
    other result is passed through unchanged.
    """
    from pypdf.generic import DictionaryObject
    from pypdf.constants import FieldDictionaryAttributes

    unpatched = DictionaryObject.get_inherited

    def patched_get_inherited(self, key: str, default = None):
        value = unpatched(self, key, default)
        if key != FieldDictionaryAttributes.Opt:
            return value
        if isinstance(value, list) and all(isinstance(item, list) and len(item) == 2 for item in value):
            return [item[0] for item in value]
        return value

    DictionaryObject.get_inherited = patched_get_inherited
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: input PDF, field-values JSON, output PDF.
    if len(sys.argv) != 4:
        print("Usage: fill_fillable_fields.py [input pdf] [field_values.json] [output pdf]")
        sys.exit(1)
    # Install the get_inherited wrapper before any PDF is read.
    monkeypatch_pydpf_method()
    input_pdf, fields_json, output_pdf = sys.argv[1:]
    fill_pdf_fields(input_pdf, fields_json, output_pdf)
|
||||
107
dev-tools/skills/pdf/scripts/fill_pdf_form_with_annotations.py
Normal file
107
dev-tools/skills/pdf/scripts/fill_pdf_form_with_annotations.py
Normal file
@@ -0,0 +1,107 @@
|
||||
import json
|
||||
import sys
|
||||
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
from pypdf.annotations import FreeText
|
||||
|
||||
|
||||
|
||||
|
||||
def transform_from_image_coords(bbox, image_width, image_height, pdf_width, pdf_height):
    """Map a box from image pixel coordinates onto the PDF page.

    ``bbox`` is indexed as ``[x0, y0, x1, y1]``. X values are scaled by
    ``pdf_width / image_width``; Y values are scaled by
    ``pdf_height / image_height`` and then subtracted from ``pdf_height``,
    which flips the vertical axis. Returns ``(left, bottom, right, top)``.
    """
    sx = pdf_width / image_width
    sy = pdf_height / image_height
    return (
        bbox[0] * sx,
        pdf_height - (bbox[3] * sy),
        bbox[2] * sx,
        pdf_height - (bbox[1] * sy),
    )
|
||||
|
||||
|
||||
def transform_from_pdf_coords(bbox, pdf_height):
    """Flip a box's vertical axis against the page height.

    ``bbox`` is indexed as ``[x0, y0, x1, y1]``. X values are kept and each Y
    value becomes ``pdf_height - y``. Returns ``(left, bottom, right, top)``.
    """
    flipped_top = pdf_height - bbox[1]
    flipped_bottom = pdf_height - bbox[3]
    return bbox[0], flipped_bottom, bbox[2], flipped_top
|
||||
|
||||
|
||||
def fill_pdf_form(input_pdf_path, fields_json_path, output_pdf_path):
    """Write text into a PDF as FreeText annotations described by a JSON file.

    Each entry of ``form_fields`` names a page and an entry bounding box. The
    box is converted with ``transform_from_pdf_coords`` when the matching
    ``pages`` entry has a ``pdf_width`` key, otherwise with
    ``transform_from_image_coords``. Entries without non-empty text are
    skipped. The result is saved to ``output_pdf_path`` and a summary printed.
    """
    with open(fields_json_path, "r") as json_file:
        fields_data = json.load(json_file)

    reader = PdfReader(input_pdf_path)
    writer = PdfWriter()
    writer.append(reader)

    # 1-based page number -> [width, height] of that page's media box.
    pdf_dimensions = {}
    for number, source_page in enumerate(reader.pages, start=1):
        box = source_page.mediabox
        pdf_dimensions[number] = [box.width, box.height]

    annotations = []
    for field in fields_data["form_fields"]:
        page_num = field["page_number"]

        page_info = next(p for p in fields_data["pages"] if p["page_number"] == page_num)
        pdf_width, pdf_height = pdf_dimensions[page_num]

        if "pdf_width" not in page_info:
            # The box is given in image pixels for this page.
            image_width = page_info["image_width"]
            image_height = page_info["image_height"]
            transformed_entry_box = transform_from_image_coords(
                field["entry_bounding_box"],
                image_width, image_height,
                float(pdf_width), float(pdf_height)
            )
        else:
            transformed_entry_box = transform_from_pdf_coords(
                field["entry_bounding_box"],
                float(pdf_height)
            )

        # Skip entries that carry no text to place.
        if not ("entry_text" in field and "text" in field["entry_text"]):
            continue
        entry_text = field["entry_text"]
        text = entry_text["text"]
        if not text:
            continue

        annotation = FreeText(
            text=text,
            rect=transformed_entry_box,
            font=entry_text.get("font", "Arial"),
            font_size=str(entry_text.get("font_size", 14)) + "pt",
            font_color=entry_text.get("font_color", "000000"),
            border_color=None,
            background_color=None,
        )
        annotations.append(annotation)
        # add_annotation takes a 0-based page index.
        writer.add_annotation(page_number=page_num - 1, annotation=annotation)

    with open(output_pdf_path, "wb") as output:
        writer.write(output)

    print(f"Successfully filled PDF form and saved to {output_pdf_path}")
    print(f"Added {len(annotations)} text annotations")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: input PDF, fields JSON, output PDF.
    if len(sys.argv) != 4:
        print("Usage: fill_pdf_form_with_annotations.py [input pdf] [fields.json] [output pdf]")
        sys.exit(1)
    input_pdf, fields_json, output_pdf = sys.argv[1:]

    fill_pdf_form(input_pdf, fields_json, output_pdf)
|
||||
Reference in New Issue
Block a user