Execute Python scripts for spreadsheet operations with mandatory source data validation and fallback protocols
CRITICAL: Before attempting any spreadsheet operations, validate that your source data is accessible and complete.
import os
import sys
from pathlib import Path
def validate_source_data(source_paths):
"""Validate all required data sources before processing."""
missing = []
inaccessible = []
for path in source_paths:
p = Path(path)
if not p.exists():
missing.append(str(path))
elif not os.access(path, os.R_OK):
inaccessible.append(str(path))
if missing:
print(f"ERROR: Missing data sources: {', '.join(missing)}", file=sys.stderr)
return False
if inaccessible:
print(f"ERROR: Inaccessible data sources: {', '.join(inaccessible)}", file=sys.stderr)
return False
print(f"VALIDATED: {len(source_paths)} source(s) available")
return True
# Usage
sources = ['input.xlsx', 'reference_data.csv']
if not validate_source_data(sources):
sys.exit(1)
import pandas as pd
from openpyxl import load_workbook
def validate_spreadsheet_integrity(file_path, required_sheets=None, required_columns=None):
"""Check spreadsheet structure before processing."""
try:
# Check file is readable
wb = load_workbook(file_path, read_only=True)
# Verify required sheets exist
if required_sheets:
missing_sheets = [s for s in required_sheets if s not in wb.sheetnames]
if missing_sheets:
print(f"ERROR: Missing sheets: {missing_sheets}", file=sys.stderr)
wb.close()
return False
# Verify required columns (sample first sheet)
if required_columns:
ws = wb.active
headers = [cell.value for cell in ws[1]]
missing_cols = [c for c in required_columns if c not in headers]
if missing_cols:
print(f"ERROR: Missing columns: {missing_cols}", file=sys.stderr)
wb.close()
return False
wb.close()
print(f"VALIDATED: Spreadsheet structure OK")
return True
except Exception as e:
print(f"ERROR: Cannot validate spreadsheet: {str(e)}", file=sys.stderr)
return False
For data retrieved from APIs, websites, or external services:
import requests
from urllib3.exceptions import SSLError, MaxRetryError
def validate_external_source(url, timeout=30, max_retries=2):
"""Verify external data source is accessible."""
for attempt in range(max_retries + 1):
try:
response = requests.head(url, timeout=timeout, allow_redirects=True)
if response.status_code == 200:
print(f"VALIDATED: External source accessible ({url})")
return True
else:
print(f"WARNING: External source returned {response.status_code}", file=sys.stderr)
except (SSLError, MaxRetryError, requests.ConnectionError) as e:
if attempt == max_retries:
print(f"ERROR: External source inaccessible after {max_retries + 1} attempts: {url}", file=sys.stderr)
return False
print(f"RETRY {attempt + 1}/{max_retries}: {str(e)}", file=sys.stderr)
return False
Use validated Python execution for spreadsheet operations when:
openpyxl, pandas, or similar librariesWhen primary data sources are inaccessible:
ALTERNATIVE_SOURCES = {
'epa_water_data': [
'https://dataservices.epa.illinois.gov/swap', # Primary
'https://www.epa.gov/safewater/data-and-reports', # Federal fallback
'https://waterdata.usgs.gov/nwis', # USGS fallback
],
'financial_data': [
'internal_database.xlsx', # Primary
'backup_financial_data.csv', # Local backup
'request_from_stakeholder', # Manual acquisition
]
}
def try_alternative_sources(source_key):
"""Iterate through alternative sources until one succeeds."""
alternatives = ALTERNATIVE_SOURCES.get(source_key, [])
for i, source in enumerate(alternatives):
print(f"Attempting alternative {i + 1}/{len(alternatives)}: {source}")
if source.startswith('http'):
if validate_external_source(source):
return source
else:
if Path(source).exists():
print(f"SUCCESS: Alternative source found: {source}")
return source
print(f"ERROR: All alternatives exhausted for {source_key}", file=sys.stderr)
return None
def report_data_access_failure(source, error_type, alternatives_tried=0):
"""Standardized error reporting for data access failures."""
error_report = {
'timestamp': datetime.now().isoformat(),
'source': source,
'error_type': error_type,
'alternatives_tried': alternatives_tried,
'action_required': 'Manual data acquisition or source configuration update'
}
print("=" * 60, file=sys.stderr)
print("DATA ACCESS FAILURE REPORT", file=sys.stderr)
print("=" * 60, file=sys.stderr)
for key, value in error_report.items():
print(f"{key}: {value}", file=sys.stderr)
print("=" * 60, file=sys.stderr)
return error_report
The shell_agent tool can:
Direct run_shell with Python is more reliable because it:
.py files first avoids shell_agent parsing issues with heredocs# Step 1: Write validation script
cat > validate_sources.py << 'EOF'
import sys
from pathlib import Path
sources = ['input.xlsx', 'config.json']
missing = [s for s in sources if not Path(s).exists()]
if missing:
print(f"BLOCKED: Missing sources: {missing}", file=sys.stderr)
sys.exit(1)
print("VALIDATED: All sources available")
EOF
# Step 2: Run validation
python3 validate_sources.py || exit 1
# Step 3: Write processing script
cat > process_spreadsheet.py << 'EOF'
import openpyxl
# Your spreadsheet code here
EOF
# Step 4: Execute processing
python3 process_spreadsheet.py
# Step 5: Clean up (optional)
rm validate_sources.py
#!/usr/bin/env python3
"""
Complete workflow: Validate -> Process -> Report
"""
import sys
from pathlib import Path
from datetime import datetime
from openpyxl import load_workbook
def main():
# PHASE 1: Validation
print(f"[{datetime.now().isoformat()}] Starting validation...")
source_file = 'input_data.xlsx'
if not Path(source_file).exists():
print(f"ERROR: Source file not found: {source_file}", file=sys.stderr)
# Check for alternatives
for alt in ['backup_input.xlsx', 'data_backup.csv']:
if Path(alt).exists():
print(f"FALLBACK: Using alternative: {alt}")
source_file = alt
break
else:
print("ERROR: No alternative sources available", file=sys.stderr)
sys.exit(1)
# PHASE 2: Processing
print(f"[{datetime.now().isoformat()}] Processing {source_file}...")
try:
wb = load_workbook(source_file)
ws = wb.active
# Your operations here
for row in ws.iter_rows(min_row=2):
pass # Process data
wb.save('output.xlsx')
print(f"SUCCESS: Processed {ws.max_row - 1} rows")
except Exception as e:
print(f"ERROR: Processing failed: {str(e)}", file=sys.stderr)
sys.exit(1)
# PHASE 3: Cleanup
print(f"[{datetime.now().isoformat()}] Complete")
if __name__ == '__main__':
main()
.py file first, then execute via run_shell| Library | Best For |
|---|---|
openpyxl | Reading/writing .xlsx files, formatting, formulas |
pandas | Data manipulation, analysis, merging datasets |
xlrd | Reading older .xls files (read-only) |
xlsxwriter | Creating new .xlsx files with advanced formatting |
requests | Validating external API/data sources |
pathlib | Cross-platform file path validation |
Issue: Heredoc syntax fails with 'unknown error' when using shell_agent
.py file first, then execute it with python3 script.py. This pattern is significantly more reliable than inline heredoc execution when shell_agent is the executor.Issue: Source data validation fails
ls -laIssue: External data source inaccessible (SSL/proxy errors)
requests.get(url, verify=False)Issue: FileNotFoundError
Issue: PermissionError
Issue: MemoryError on large files
chunksize parameterIssue: Formatting not applying
.copy() for style objectsIssue: Data validation passes but processing fails