Medical document OCR processing for extracting structured clinical data from medical images (prescriptions, lab results, clinical notes). Uses Google Cloud Vision for text extraction and medical NLP for entity recognition. Deploy when processing healthcare documents, extracting patient data, or converting medical images to structured formats.
This skill enables Claude to extract and structure clinical data from medical documents using OCR (Optical Character Recognition) combined with medical natural language processing. It's designed specifically for healthcare workflows and FHIR R4 compliance.
Use this skill when you need to:
from google.cloud import vision
import io
def extract_medical_text(image_path):
"""Extract text from medical image using Google Cloud Vision"""
client = vision.ImageAnnotatorClient()
with io.open(image_path, 'rb') as image_file:
content = image_file.read()
image = vision.Image(content=content)
response = client.text_detection(image=image)
texts = response.text_annotations
if texts:
return texts[0].description
return ""
Extract clinical entities using pattern matching and medical terminology:
import re
def extract_medications(text):
"""Extract medication names and dosages"""
# Pattern for common medication formats
med_pattern = r'([A-Z][a-z]+(?:in|ol|ide|ine))\s+(\d+\s*(?:mg|mcg|g|mL))'
medications = re.findall(med_pattern, text)
return [
{"name": med[0], "dosage": med[1]}
for med in medications
]
def extract_vital_signs(text):
"""Extract vital signs from text"""
vitals = {}
# Blood pressure pattern
bp = re.search(r'BP[:\s]+(\d{2,3}/\d{2,3})', text, re.I)
if bp:
vitals['blood_pressure'] = bp.group(1)
# Heart rate pattern
hr = re.search(r'HR[:\s]+(\d{2,3})', text, re.I)
if hr:
vitals['heart_rate'] = hr.group(1)
# Temperature pattern
temp = re.search(r'Temp[:\s]+(\d{2,3}\.?\d*)', text, re.I)
if temp:
vitals['temperature'] = temp.group(1)
return vitals
def classify_medical_document(text):
"""Determine the type of medical document"""
text_lower = text.lower()
if any(word in text_lower for word in ['prescription', 'rx', 'sig:']):
return 'prescription'
elif any(word in text_lower for word in ['lab results', 'test results', 'specimen']):
return 'lab_results'
elif any(word in text_lower for word in ['progress note', 'soap', 'assessment']):
return 'clinical_notes'
elif any(word in text_lower for word in ['discharge', 'summary']):
return 'discharge_summary'
else:
return 'unknown'
def generate_fhir_medication_request(medication_data):
"""Generate FHIR R4 MedicationRequest from extracted data"""
return {
"resourceType": "MedicationRequest",
"status": "active",
"intent": "order",
"medication": {
"CodeableConcept": {
"text": medication_data['name']
}
},
"dosageInstruction": [{
"text": medication_data['dosage'],
"timing": {
"repeat": {
"frequency": medication_data.get('frequency', 1),
"period": 1,
"periodUnit": "d"
}
},
"doseAndRate": [{
"doseQuantity": {
"value": medication_data.get('dose_value'),
"unit": medication_data.get('dose_unit', 'mg')
}
}]
}]
}
Common medical abbreviations that should be expanded:
MEDICAL_ABBREVIATIONS = {
# Frequency
"BID": "Twice daily",
"TID": "Three times daily",
"QID": "Four times daily",
"QD": "Once daily",
"PRN": "As needed",
"STAT": "Immediately",
"AC": "Before meals",
"PC": "After meals",
"HS": "At bedtime",
# Route
"PO": "By mouth / Oral",
"IV": "Intravenous",
"IM": "Intramuscular",
"SC/SQ": "Subcutaneous",
"SL": "Sublingual",
"TOP": "Topical",
# Clinical
"NPO": "Nothing by mouth",
"SOB": "Shortness of breath",
"N/V": "Nausea and vomiting",
"CBC": "Complete Blood Count",
"CMP": "Comprehensive Metabolic Panel",
"CXR": "Chest X-Ray",
"EKG/ECG": "Electrocardiogram",
"BP": "Blood Pressure",
"HR": "Heart Rate",
"RR": "Respiratory Rate",
"Temp": "Temperature"
}
def process_medical_document(image_path):
"""Complete pipeline for medical document processing"""
# Step 1: OCR extraction
raw_text = extract_medical_text(image_path)
# Step 2: Document classification
doc_type = classify_medical_document(raw_text)
# Step 3: Entity extraction
entities = {
'medications': extract_medications(raw_text),
'vitals': extract_vital_signs(raw_text),
'type': doc_type
}
# Step 4: FHIR resource generation
fhir_resources = []
for med in entities['medications']:
fhir_resources.append(
generate_fhir_medication_request(med)
)
return {
'raw_text': raw_text,
'document_type': doc_type,
'extracted_entities': entities,
'fhir_resources': fhir_resources,
'confidence': 0.85 # Calculate based on OCR confidence
}
def calculate_confidence(ocr_result, extracted_entities):
"""Calculate overall confidence score"""
scores = []
# OCR confidence (from Google Vision)
if hasattr(ocr_result, 'confidence'):
scores.append(ocr_result.confidence)
# Entity extraction confidence
if extracted_entities['medications']:
scores.append(0.9) # High confidence for structured meds
if extracted_entities['vitals']:
scores.append(0.85)
return sum(scores) / len(scores) if scores else 0.0
def validate_extracted_data(data):
"""Validate extracted medical data"""
warnings = []
# Check for missing critical fields
if not data.get('medications'):
warnings.append("No medications detected")
# Validate vital signs ranges
vitals = data.get('vitals', {})
if 'blood_pressure' in vitals:
bp = vitals['blood_pressure']
systolic, diastolic = map(int, bp.split('/'))
if systolic > 180 or diastolic > 120:
warnings.append(f"Critical BP value: {bp}")
return warnings
When processing medical documents:
def audit_log_ocr_operation(user_id, document_id, operation):
"""Log OCR operation for HIPAA compliance"""
log_entry = {
"timestamp": datetime.utcnow().isoformat(),
"user_id": user_id,
"document_id": document_id,
"operation": operation,
"phi_accessed": True,
"justification": "Clinical data extraction for patient care"
}
# Store in secure audit log
return log_entry
This skill integrates with the following agents:
def handle_ocr_errors(image_path):
"""Robust error handling for OCR operations"""
try:
result = extract_medical_text(image_path)
if not result:
return {
"error": "No text detected",
"suggestion": "Improve image quality or check orientation"
}
return {"success": True, "text": result}
except Exception as e:
return {
"error": str(e),
"image_path": image_path,
"suggestion": "Check image format and API credentials"
}
# Input: prescription.jpg
result = process_medical_document("prescription.jpg")
# Output:
{
"document_type": "prescription",
"medications": [
{"name": "Metformin", "dosage": "500mg", "frequency": "BID"},
{"name": "Lisinopril", "dosage": "10mg", "frequency": "QD"}
],
"fhir_resources": [...],
"confidence": 0.92
}
# Input: lab_results.jpg
result = process_medical_document("lab_results.jpg")
# Output:
{
"document_type": "lab_results",
"tests": [
{"name": "Hemoglobin A1C", "value": "7.2", "unit": "%"},
{"name": "Fasting Glucose", "value": "126", "unit": "mg/dL"}
],
"confidence": 0.88
}
This skill should be updated when:
Version: 1.0.0
Last Updated: 2024-11-22
Maintainer: Doctors-Linc Development Team