Convert PDF pages to editable Word documents (.docx) while preserving layout. Use when users want to (1) convert PDF to Word, (2) make PDF content editable, (3) extract PDF pages to docx format, (4) preserve two-column academic paper layout, (5) OCR PDF images to text. Handles PDFs with embedded images by first extracting each page as an image, then running OCR on it.
Convert PDF pages to editable Word documents while preserving layout structure.
python scripts/extract_pdf_page.py /path/to/document.pdf 1 -o /output/dir
python scripts/create_two_column_docx.py /output/dir/page1_text.txt output.docx \
--title "Document Title" \
--author "Author Name" \
--page-number 1 \
--total-pages 8
When scripts don't match the exact layout needed, follow this manual process:
import pdfplumber
# Render the first page of the PDF to a PNG file so it can be fed to OCR.
with pdfplumber.open("document.pdf") as pdf:
    first_page = pdf.pages[0]  # pages are 0-indexed
    rendered = first_page.to_image(resolution=200)
    # .original is the underlying PIL image of the rendered page
    rendered.original.save("page1.png", "PNG")
tesseract page1.png page1_text -l eng
Read the extracted image to understand:
from docx import Document
from docx.shared import Pt, Cm
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
doc = Document()

# Apply the same 1.5 cm margin to every side of each section
page_margin = Cm(1.5)
for section in doc.sections:
    for side in ("top", "bottom", "left", "right"):
        setattr(section, f"{side}_margin", page_margin)

# Two-column layout using borderless table
table = doc.add_table(rows=1, cols=2)
# Remove borders from cells
def remove_borders(cell):
    """Hide the four outer borders of a table cell.

    Edits the cell's underlying XML directly: a ``w:tcBorders`` element
    with every edge set to ``nil`` is placed in the cell properties
    (``w:tcPr``). Calling this more than once on the same cell is safe;
    the previous ``w:tcBorders`` element is replaced, not duplicated.

    Args:
        cell: The table cell to modify in place
            (e.g. an item of ``table.rows[0].cells``).
    """
    tcPr = cell._tc.get_or_add_tcPr()

    tcBorders = OxmlElement('w:tcBorders')
    # Emit the edges in the order the WordprocessingML schema lists them
    # for tcBorders (top, left, bottom, right) so the XML stays valid.
    for edge in ('top', 'left', 'bottom', 'right'):
        el = OxmlElement(f'w:{edge}')
        el.set(qn('w:val'), 'nil')
        tcBorders.append(el)

    # Swap out an existing border definition instead of appending a second
    # w:tcBorders element next to it.
    existing = tcPr.find(qn('w:tcBorders'))
    if existing is None:
        tcPr.append(tcBorders)
    else:
        tcPr.replace(existing, tcBorders)
# Make both column cells borderless and give them a fixed width
columns = table.rows[0].cells
for cell in columns:
    remove_borders(cell)
    cell.width = Cm(8.5)

# Add content to left column.
# A freshly created table cell already holds one empty paragraph, so reuse
# it; calling add_paragraph() here would leave a blank line at the top of
# the column.
left_cell = columns[0]
p = left_cell.paragraphs[0]
p.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
run = p.add_run("Content here...")
run.font.size = Pt(9)

doc.save("output.docx")
[Figure X - See original PDF]
Required:
Tesseract OCR (e.g. via Homebrew: brew install tesseract)
Install Python packages:
pip install pdfplumber pillow python-docx
# Or use uvx:
uvx --with pdfplumber --with pillow --with python-docx python script.py