Extract text, tables, and structured information from PDF documents using pdfplumber, PyPDF2, or pdftotext command-line tools.
This skill helps agents extract information from PDF documents.
The environment has these PDF tools pre-installed:
- pdfplumber - Python library for precise text/table extraction
- PyPDF2 - Python library for PDF manipulation
- pdftotext - Command-line tool from poppler-utils

# Extract all text from a PDF
pdftotext /root/artifacts/paper.pdf -
# Extract specific pages
pdftotext -f 1 -l 3 /root/artifacts/paper.pdf -
# Extract to a file
pdftotext /root/artifacts/paper.pdf /tmp/paper.txt
import pdfplumber
from pathlib import Path
def extract_pdf_text(pdf_path: str) -> str:
    """Extract all text from a PDF file.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        The text of every page that yielded any text, in page order,
        with consecutive pages separated by a blank line. Pages whose
        extraction result is empty are left out entirely.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    # Drop empty/None results so they do not leave stray separators
    # in the joined output.
    return "\n\n".join(text for text in page_texts if text)
# Usage: read the whole document into a single string and print it
text = extract_pdf_text("/root/artifacts/paper.pdf")
print(text)
import re
def find_commands(text: str) -> list:
    """Extract shell commands from text.

    Every pattern is run over the whole text in turn, so the result is
    grouped by pattern rather than by position in the text, and a line
    that matches several patterns is reported once per pattern.

    Args:
        text: The text to search, e.g. the output of a PDF extraction.

    Returns:
        A list of the matched substrings (empty when nothing matches).
    """
    # Look for common command patterns
    patterns = [
        # "docker run" and the rest of that line
        r'docker run[^\n]+',
        # Anything from a "$" to the end of the line; the "$" is kept
        r'\$[^\n]+',
        # A "--package=<value>" followed by a "--version=<value>"
        r'--package=[^\s]+\s+--version=[^\s]+',
    ]
    commands = []
    for pattern in patterns:
        commands.extend(re.findall(pattern, text))
    return commands
# Usage: extract the PDF text, then collect command-like strings from it
text = extract_pdf_text("/root/artifacts/paper.pdf")
commands = find_commands(text)
import pdfplumber
def extract_tables(pdf_path: str) -> list:
    """Extract all tables from a PDF.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A list with one dict per table found, in page order. Each dict
        has two keys: "page", the 1-based position of the page within
        the document, and "data", the table exactly as returned by
        ``page.extract_tables()``.
    """
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        # start=1 so the reported page numbers are 1-based.
        for page_number, page in enumerate(pdf.pages, start=1):
            tables.extend(
                {"page": page_number, "data": table}
                for table in page.extract_tables()
            )
    return tables
import re
def find_package_info(text: str) -> list:
    """Find npm package references (name@version).

    The text is lower-cased before matching, so returned names are
    always lower case. A name consists of letters, digits and hyphens
    only, and a version is exactly three dot-separated numbers; for a
    scoped reference such as "@scope/pkg@1.2.3" only "pkg" is captured,
    and any pre-release suffix after the third number is ignored.

    Args:
        text: The text to search.

    Returns:
        A list of ``{"name": ..., "version": ...}`` dicts, one per match,
        in order of appearance (empty when nothing matches).
    """
    # Match patterns like lodash@4.17.21
    pattern = r'([a-z0-9-]+)@(\d+\.\d+\.\d+)'
    return [
        {"name": name, "version": version}
        for name, version in re.findall(pattern, text.lower())
    ]
Check available PDFs first:
ls -la /root/artifacts/
Preview before full extraction:
pdftotext /root/artifacts/paper.pdf - | head -100
Handle multi-column layouts: pdfplumber handles them better than pdftotext
For structured data: Look for JSON blocks in the text:
import json
import re
# Candidate blocks are brace-delimited spans that contain no other
# braces, so only flat (non-nested) JSON objects are found this way.
json_blocks = re.findall(r'\{[^{}]*\}', text)
for block in json_blocks:
    try:
        data = json.loads(block)
    except json.JSONDecodeError:
        # Brace-delimited text that is not valid JSON: skip it on purpose.
        continue
    print(data)