Use this skill whenever PDF files are involved in any way for the BC-DATASET pipeline. Triggers include: reading or extracting text from PDFs, splitting PDFs, creating PDF reports, processing Brazilian legal PDFs (MP-GO corpus), implementing the PDF→chunks pipeline (ADR-014), or any mention of a .pdf file. DO NOT USE for Word documents (.docx), spreadsheets, or training scripts.
This skill covers PDF operations for the LLMTrainingPlatform, focused on the BC-DATASET ingestion pipeline (ADR-014): PDF → chunks (overlap=200) → QA generation → human review → export.
| Library | Use case | Install |
|---|---|---|
| pymupdf4llm | PDF→markdown (LLM-optimized) | uv add pymupdf4llm |
| pdfplumber | Table extraction, layout analysis | uv add pdfplumber |
| pypdf | Merge, split, rotate, encrypt | uv add pypdf |
| langchain-text-splitters | Chunk splitting (overlap=200) | uv add langchain-text-splitters |
import pymupdf4llm
def pdf_para_markdown(caminho: str) -> str:
    """Convert a PDF file to LLM-optimized markdown via pymupdf4llm."""
    resultado = pymupdf4llm.to_markdown(caminho)
    return resultado
from langchain_text_splitters import RecursiveCharacterTextSplitter
def dividir_em_chunks(texto: str, chunk_size: int = 1000, overlap: int = 200) -> list[str]:
    """Split text into overlapping chunks (overlap=200 per ADR-014)."""
    configuracao = {
        "chunk_size": chunk_size,
        "chunk_overlap": overlap,
        # Try paragraph, then line, then sentence, then word boundaries.
        "separators": ["\n\n", "\n", ".", " ", ""],
    }
    divisor = RecursiveCharacterTextSplitter(**configuracao)
    return divisor.split_text(texto)
import pymupdf4llm
from langchain_text_splitters import RecursiveCharacterTextSplitter
def processar_pdf(caminho_pdf: str) -> list[str]:
    """Full pipeline: PDF → markdown → chunks (ADR-014).

    Delegates to pdf_para_markdown and dividir_em_chunks so the splitter
    configuration stays in one place. The previous inline splitter omitted
    the custom separators list used by dividir_em_chunks, so the "full
    pipeline" produced different chunk boundaries than the documented
    splitter for the same text.

    Args:
        caminho_pdf: Path to the PDF file to process.

    Returns:
        List of markdown text chunks (chunk_size=1000, overlap=200).
    """
    markdown = pdf_para_markdown(caminho_pdf)
    return dividir_em_chunks(markdown, chunk_size=1000, overlap=200)
import pdfplumber
def extrair_texto(caminho: str) -> str:
    """Extract text from every page, preserving layout (pdfplumber)."""
    paginas: list[str] = []
    with pdfplumber.open(caminho) as pdf:
        for pagina in pdf.pages:
            # extract_text() returns None for image-only pages; keep "".
            paginas.append(pagina.extract_text() or "")
    return "\n".join(paginas)
def extrair_tabelas(caminho: str) -> list[list]:
    """Collect every table found on every page of the PDF."""
    with pdfplumber.open(caminho) as pdf:
        return [
            tabela
            for pagina in pdf.pages
            for tabela in pagina.extract_tables()
        ]
from pypdf import PdfReader, PdfWriter
def mesclar_pdfs(caminhos: list[str], saida: str) -> None:
    """Concatenate several PDFs, in input order, into one output file."""
    writer = PdfWriter()
    todas_as_paginas = (
        pagina
        for caminho in caminhos
        for pagina in PdfReader(caminho).pages
    )
    for pagina in todas_as_paginas:
        writer.add_page(pagina)
    with open(saida, "wb") as destino:
        writer.write(destino)
def dividir_pdf(caminho: str, paginas_por_parte: int, prefixo: str) -> list[str]:
    """Split a PDF into parts of at most `paginas_por_parte` pages.

    Output files are named "{prefixo}_{k}.pdf" with k starting at 1.
    Returns the list of file paths written, in order.
    """
    reader = PdfReader(caminho)
    arquivos_gerados: list[str] = []
    total_paginas = len(reader.pages)
    partes = enumerate(range(0, total_paginas, paginas_por_parte), start=1)
    for numero_parte, inicio in partes:
        writer = PdfWriter()
        for pagina in reader.pages[inicio:inicio + paginas_por_parte]:
            writer.add_page(pagina)
        destino = f"{prefixo}_{numero_parte}.pdf"
        with open(destino, "wb") as f:
            writer.write(f)
        arquivos_gerados.append(destino)
    return arquivos_gerados
# Install: sudo apt-get install tesseract-ocr tesseract-ocr-por
uv add pytesseract pdf2image
from pdf2image import convert_from_path
import pytesseract
def extrair_texto_ocr(caminho: str, idioma: str = "por") -> str:
    """OCR fallback for scanned PDFs (e.g. scanned legal documents).

    Renders each page at 300 DPI and runs Tesseract with the given
    language code (default "por", Portuguese).
    """
    textos_por_pagina = []
    for imagem in convert_from_path(caminho, dpi=300):
        textos_por_pagina.append(pytesseract.image_to_string(imagem, lang=idioma))
    return "\n".join(textos_por_pagina)
# Extract text (fast)
pdftotext -layout document.pdf output.txt
# Split into pages
pdfseparate document.pdf page_%d.pdf
# Merge
pdfunite doc1.pdf doc2.pdf merged.pdf
# Check/repair
qpdf --check document.pdf
qpdf --fix-qdf damaged.pdf repaired.pdf
# Decrypt
qpdf --password=senha --decrypt encrypted.pdf decrypted.pdf
# Troubleshooting examples.

from pypdf import PdfReader

# Encrypted PDF: decrypt before reading pages ("senha" is an example password).
reader = PdfReader("encrypted.pdf")
if reader.is_encrypted:
    reader.decrypt("senha")

# Scanned/image PDF (no extractable text): probe the first page; fewer than
# ~50 extracted characters is treated here as an image-only scan.
with pdfplumber.open("scanned.pdf") as pdf:
    text = pdf.pages[0].extract_text()
    if not text or len(text.strip()) < 50:
        # Use OCR instead
        text = extrair_texto_ocr("scanned.pdf")
See reference.md for:
- pypdfium2 (high-performance rendering)
- pdf-lib (JavaScript)
- pdfplumber table extraction settings