Use when processing documents (PDF, DOCX, TXT), extracting text content, handling file uploads, or implementing document parsing - focuses on file handling and text extraction
☐ Create file input component ☐ Handle file selection events ☐ Validate file type and size ☐ Show upload progress indicator
// components/FileUpload.tsx
import { useState } from 'react';
/** Props for the {@link FileUpload} component. */
interface FileUploadProps {
// Comma-separated extension list (e.g. ".pdf,.docx") forwarded to the <input accept> attribute.
accept?: string;
maxSize?: number; // maximum allowed file size, in bytes
// Invoked with the selected file once it passes size and type validation.
onFileSelect: (file: File) => void;
}
/**
 * File picker that validates size and extension client-side before
 * handing the file to `onFileSelect`. Validation errors are rendered
 * below the input; client-side checks are a UX nicety only — the
 * backend must re-validate.
 */
export function FileUpload({ accept = '.pdf,.docx,.txt', maxSize = 10 * 1024 * 1024, onFileSelect }: FileUploadProps) {
  const [error, setError] = useState('');

  const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
    const file = e.target.files?.[0];
    // Reset the input so selecting the same file again re-fires onChange
    // (browsers suppress the event when the value is unchanged).
    e.target.value = '';
    if (!file) return;
    setError('');

    // Validate file size.
    if (file.size > maxSize) {
      setError(`File too large. Max size: ${maxSize / 1024 / 1024}MB`);
      return;
    }

    // Validate file type by extension. Require an actual dot (index > 0):
    // split('.').pop() would return the whole name for extensionless files,
    // letting a file literally named "pdf" slip through.
    const dotIndex = file.name.lastIndexOf('.');
    const fileType = dotIndex > 0 ? file.name.slice(dotIndex + 1).toLowerCase() : '';
    const acceptedTypes = accept
      .split(',')
      .map(t => t.trim().replace(/^\./, '').toLowerCase());
    if (!fileType || !acceptedTypes.includes(fileType)) {
      setError(`Invalid file type. Accepted: ${accept}`);
      return;
    }

    onFileSelect(file);
  };

  return (
    <div>
      <input
        type="file"
        accept={accept}
        onChange={handleFileChange}
      />
      {error && <p className="error">{error}</p>}
    </div>
  );
}
☐ Install PDF.js library (pdfjs-dist)
☐ Configure worker for PDF parsing
☐ Extract text from PDF pages
☐ Handle PDF rendering (if showing preview)
// lib/pdfProcessor.ts
import * as pdfjsLib from 'pdfjs-dist';
// Configure the PDF.js worker — must run before any getDocument() call.
// NOTE(review): protocol-relative CDN URL; for pdfjs-dist v4+ the worker
// file is `pdf.worker.min.mjs`, not `.js` — confirm the path matches the
// installed version, and consider bundling the worker instead of a CDN.
pdfjsLib.GlobalWorkerOptions.workerSrc = `//cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjsLib.version}/pdf.worker.min.js`;
/**
 * Extract plain text from every page of a PDF, pages separated by blank lines.
 *
 * @param file - The PDF file to read.
 * @returns Concatenated page text, trimmed.
 * @throws If PDF.js fails to parse the document.
 */
export async function extractTextFromPDF(file: File): Promise<string> {
  const arrayBuffer = await file.arrayBuffer();
  const pdf = await pdfjsLib.getDocument({ data: arrayBuffer }).promise;
  try {
    const pages: string[] = [];
    for (let i = 1; i <= pdf.numPages; i++) {
      const page = await pdf.getPage(i);
      const textContent = await page.getTextContent();
      // items can contain TextMarkedContent entries that have no `str`
      // field; filter to real text items so "undefined" is never joined
      // into the output (the original `item: any` cast hid this).
      const pageText = textContent.items
        .filter((item: unknown): item is { str: string } =>
          typeof (item as { str?: unknown }).str === 'string')
        .map(item => item.str)
        .join(' ');
      pages.push(pageText);
    }
    return pages.join('\n\n').trim();
  } finally {
    // Release worker and page resources even if a page fails to parse.
    await pdf.destroy();
  }
}
☐ Install mammoth library (mammoth)
☐ Extract text or HTML from DOCX
☐ Handle formatting preservation (if needed)
☐ Convert to plain text or structured format
// lib/docxProcessor.ts
import mammoth from 'mammoth';
/**
 * Pull the raw (unformatted) text out of a .docx file.
 * Mammoth conversion warnings are logged but never fail the call.
 *
 * @param file - The DOCX file to read.
 * @returns The document's plain-text content.
 */
export async function extractTextFromDOCX(file: File): Promise<string> {
  const buffer = await file.arrayBuffer();
  const { value, messages } = await mammoth.extractRawText({ arrayBuffer: buffer });
  if (messages.length > 0) {
    console.warn('DOCX conversion warnings:', messages);
  }
  return value;
}
/**
 * Convert a .docx file to an HTML string (preserves basic formatting).
 *
 * @param file - The DOCX file to read.
 * @returns The document rendered as HTML.
 */
export async function extractHTMLFromDOCX(file: File): Promise<string> {
  const arrayBuffer = await file.arrayBuffer();
  const result = await mammoth.convertToHtml({ arrayBuffer });
  // Surface conversion warnings, consistent with extractTextFromDOCX
  // (the original silently dropped them here).
  if (result.messages.length > 0) {
    console.warn('DOCX conversion warnings:', result.messages);
  }
  return result.value;
}
☐ Create hook to handle multiple file types ☐ Implement processing logic for each type ☐ Handle loading and error states ☐ Return extracted content
// hooks/useDocumentProcessor.ts
import { useState } from 'react';
import { extractTextFromPDF } from '../lib/pdfProcessor';
import { extractTextFromDOCX } from '../lib/docxProcessor';
/**
 * React hook that extracts text from an uploaded document.
 * Supports .pdf, .docx and .txt; exposes loading and error state.
 * Resolves to the extracted text, or null on failure (the failure
 * message is available via `error`).
 */
export function useDocumentProcessor() {
  const [processing, setProcessing] = useState(false);
  const [error, setError] = useState<string | null>(null);

  const processDocument = async (file: File): Promise<string | null> => {
    setProcessing(true);
    setError(null);
    try {
      const fileType = file.name.split('.').pop()?.toLowerCase();
      // One extractor per supported extension.
      const extractors: Record<string, (f: File) => Promise<string>> = {
        pdf: extractTextFromPDF,
        docx: extractTextFromDOCX,
        txt: (f) => f.text(),
      };
      const extract = fileType ? extractors[fileType] : undefined;
      if (!extract) {
        throw new Error(`Unsupported file type: ${fileType}`);
      }
      return await extract(file);
    } catch (err) {
      const message = err instanceof Error ? err.message : 'Processing failed';
      setError(message);
      return null;
    } finally {
      setProcessing(false);
    }
  };

  return { processDocument, processing, error };
}
☐ Create FormData for file upload ☐ Send file to backend API ☐ Track upload progress ☐ Handle backend response
// lib/uploadDocument.ts
/**
 * Upload a document to the backend. Uses XMLHttpRequest rather than
 * fetch so upload progress can be reported.
 *
 * @param file - The file to upload.
 * @param onProgress - Optional callback receiving percent complete (0-100).
 * @returns The backend's parsed JSON response ({ id, url }).
 * @throws On network error, abort, non-2xx status, or a non-JSON response body.
 */
export async function uploadDocument(
  file: File,
  onProgress?: (progress: number) => void
): Promise<{ id: string; url: string }> {
  const formData = new FormData();
  formData.append('file', file);

  return new Promise((resolve, reject) => {
    const xhr = new XMLHttpRequest();

    // Track upload progress.
    xhr.upload.addEventListener('progress', (e) => {
      if (e.lengthComputable && onProgress) {
        onProgress((e.loaded / e.total) * 100);
      }
    });

    xhr.addEventListener('load', () => {
      // Accept any 2xx status — upload endpoints commonly return 201 Created,
      // which the original status === 200 check would have rejected.
      if (xhr.status >= 200 && xhr.status < 300) {
        try {
          resolve(JSON.parse(xhr.responseText));
        } catch {
          // A throwing JSON.parse inside the listener would otherwise be an
          // unhandled error, leaving the promise pending forever.
          reject(new Error('Upload succeeded but response was not valid JSON'));
        }
      } else {
        reject(new Error(`Upload failed: ${xhr.statusText}`));
      }
    });

    xhr.addEventListener('error', () => {
      reject(new Error('Network error during upload'));
    });

    // Without this, an aborted request never settles the promise.
    xhr.addEventListener('abort', () => {
      reject(new Error('Upload aborted'));
    });

    xhr.open('POST', '/api/documents/upload');
    xhr.send(formData);
  });
}
☐ Test with various file types (PDF, DOCX, TXT) ☐ Test with large files (check performance) ☐ Test error cases (invalid files, corrupted documents) ☐ Verify text extraction accuracy ☐ Test upload to backend (if applicable)
Not configuring PDF.js worker:
// ❌ Wrong - worker not configured
const pdf = await pdfjsLib.getDocument({ data }).promise;
// ✅ Right - configure worker first
pdfjsLib.GlobalWorkerOptions.workerSrc = '...';
const pdf = await pdfjsLib.getDocument({ data }).promise;
Reading file incorrectly:
// ❌ Wrong - file content not read
const text = file.name;
// ✅ Right - read file content
const text = await file.text(); // For text files
const buffer = await file.arrayBuffer(); // For binary files
Never:
Always: