Weekly autonomous self-evaluation. Reviews performance, compares instances, validates theses, and makes strategic corrections. The system's mechanism for improving itself.
Weekly self-improvement session (Sunday 4:30 PM ET, Opus, 30 min). Evaluates the trading system's performance, compares parallel instances, and makes corrections to improve future operation.
Read all system outputs from the past week:
PYTHONPATH=. python3 -c "
from src.core.paths import paths
from datetime import datetime, timedelta
import json

# Meta-observer reports (cross-instance comparison): last 5 reports
meta_dir = paths.parallel
meta_reports = sorted(meta_dir.glob('meta_report_*.json'))[-5:]
for f in meta_reports:
    with open(f) as fh:
        d = json.load(fh)
    print(f'Meta {f.name}: {d.get(\"num_instances\",0)} instances, overlap={d.get(\"thesis_overlap_jaccard\",0):.0%}')

# Stress test reports: last 3
risk_dir = paths.risk_reports
risk_reports = sorted(risk_dir.glob('stress_test_*.json'))[-3:]
for f in risk_reports:
    with open(f) as fh:
        d = json.load(fh)
    print(f'Stress {f.name}: VaR95={d.get(\"var_95_pct\",0):.1f}%, concentration={d.get(\"concentration_risk_score\",0):.2f}')

# Cross-reference alerts (single file, may be absent)
alerts_file = paths.live / 'cross_reference_alerts.json'
if alerts_file.exists():
    with open(alerts_file) as fh:
        d = json.load(fh)
    print(f'Cross-ref alerts: {d.get(\"total_alerts\",0)} total, {d.get(\"red_flags\",0)} red flags')

# Auto-correction log (last 7 days), JSONL with one entry per line
corrections_log = paths.logs / 'auto_corrections.jsonl'
if corrections_log.exists():
    cutoff = datetime.now() - timedelta(days=7)
    week_corrections = []
    for line in open(corrections_log):
        try:
            entry = json.loads(line.strip())
            ts = datetime.fromisoformat(entry.get('timestamp',''))
            if ts >= cutoff:
                week_corrections.append(entry)
        except (json.JSONDecodeError, ValueError):
            pass  # skip malformed lines / missing timestamps
    applied = [c for c in week_corrections if c.get('applied')]
    print(f'Auto-corrections this week: {len(week_corrections)} total, {len(applied)} applied')
    for c in applied[-5:]:
        print(f' [{c[\"action_type\"]}] {c[\"thesis_name\"]}: {c[\"reason\"][:60]}')

# Prediction accuracy: last 5 daily belief updates
intel_dir = paths.intelligence
daily_updates = sorted(intel_dir.glob('daily_update_*.json'))[-5:]
for f in daily_updates:
    with open(f) as fh:
        d = json.load(fh)
    suggestions = d.get('thesis_suggestions', [])
    cal = d.get('calibration', {})
    print(f'Belief {f.name}: {len(suggestions)} thesis suggestions, calibration={cal.get(\"overall_accuracy\",\"?\")}'[:80])

# Learning log: one JSON file per month
from pathlib import Path
learnings_month = datetime.now().strftime('%Y-%m')
learnings_file = paths.learnings / f'{learnings_month}.json'
if learnings_file.exists():
    with open(learnings_file) as fh:
        learnings = json.load(fh)
    print(f'Learnings this month: {len(learnings) if isinstance(learnings, list) else \"dict\"}')
"
Compare all instances' weekly returns vs SPY:
PYTHONPATH=. python3 -c "
from src.core.paths import paths
import json
from pathlib import Path

# Load instance state files for comparison; 'auto' is this instance,
# beta/gamma live under sibling quant_results_<inst> trees.
instances = ['auto', 'beta', 'gamma']
instance_data = {}
for inst in instances:
    if inst == 'auto':
        state_path = paths.live_state
    else:
        state_path = Path.home() / f'quant_results_{inst}' / 'live' / 'state.json'  # Multi-instance glob - intentional
    if state_path.exists():
        with open(state_path) as f:
            data = json.load(f)
        portfolio = data.get('portfolio', {})
        print(f'Instance {inst}: equity=\${portfolio.get(\"equity\",0):,.0f}, '
              f'cash=\${portfolio.get(\"cash\",0):,.0f}, '
              f'positions={portfolio.get(\"total_positions\",0)}, '
              f'day_pnl=\${portfolio.get(\"daily_pnl\",0):,.0f}')
        instance_data[inst] = data
    else:
        print(f'Instance {inst}: no state file found')

# Meta-observer latest report for cross-instance comparison
meta_reports = sorted(paths.parallel.glob('meta_report_*.json'))
if meta_reports:
    with open(meta_reports[-1]) as f:
        meta = json.load(f)
    print(f'\\nMeta-observer: {meta.get(\"num_instances\",0)} instances')
    consensus = meta.get('consensus_positions', [])
    divergent = meta.get('divergent_positions', [])
    print(f' Consensus positions (all agree): {consensus[:10]}')
    print(f' Divergent positions (disagreement): {divergent[:10]}')
    print(f' Thesis overlap (Jaccard): {meta.get(\"thesis_overlap_jaccard\",0):.0%}')
"
Weekly deep reasoning about how the 3 instances see the market differently. Run the commands below, then answer the analysis questions that follow them.
PYTHONPATH=. python3 -c "
import json, sqlite3
from pathlib import Path
from datetime import datetime, timedelta
from collections import defaultdict

# 1. Read opinion divergence from meta-observer
meta_file = Path.home() / 'quant_results/parallel/meta_report_latest.json'
if meta_file.exists():
    meta = json.loads(meta_file.read_text())
    div = meta.get('opinion_divergence', {})
    if div:
        print('=== OPINION DIVERGENCE ===')
        print(f'Symbols compared: {div.get(\"symbol_count\",0)}')
        print(f'Avg direction agreement: {div.get(\"avg_direction_agreement\",0):.0%}')
        print(f'High divergence: {div.get(\"high_divergence\",[])}')
        print(f'Full consensus: {div.get(\"high_consensus\",[])}')
    else:
        print('No opinion divergence data yet')

# 2. Decision quality comparison across instances
for inst in ('auto', 'beta', 'gamma'):
    if inst == 'auto':
        qf = Path.home() / 'quant_results/intelligence/decision_quality.json'
    else:
        qf = Path.home() / f'quant_results_{inst}/intelligence/decision_quality.json'
    if qf.exists():
        q = json.loads(qf.read_text())
        d10 = q.get('10d', {})
        if d10:
            print(f'{inst}: 10d quality={d10[\"avg_quality\"]:.0%}, return={d10[\"avg_return\"]:+.1f}%, alpha={d10[\"avg_alpha\"]:+.1f}% (n={d10[\"count\"]})')

# 3. Per-symbol opinion accuracy comparison (from each instance's DB)
print()
print('=== PER-INSTANCE OPINION ACCURACY ===')
for inst in ('auto', 'beta', 'gamma'):
    if inst == 'auto':
        db_path = Path.home() / 'quant_results/athena.db'
    else:
        db_path = Path.home() / f'quant_results_{inst}/athena.db'
    if not db_path.exists():
        continue
    try:
        conn = sqlite3.connect(str(db_path))
        # Total opinions and scored count
        total = conn.execute('SELECT COUNT(*) FROM market_opinions').fetchone()[0]
        scored = conn.execute('SELECT COUNT(*) FROM market_opinions WHERE score_10d_direction IS NOT NULL').fetchone()[0]
        if scored > 0:
            avg_dir = conn.execute('SELECT AVG(score_10d_direction) FROM market_opinions WHERE score_10d_direction IS NOT NULL').fetchone()[0]
            avg_range = conn.execute('SELECT AVG(score_10d_range) FROM market_opinions WHERE score_10d_range IS NOT NULL').fetchone()[0]
            print(f'{inst}: {total} opinions, {scored} scored. 10d direction={avg_dir:.0%}, range={avg_range:.0%}')
            # Per-sector accuracy (group by first letter of symbol as rough proxy)
            # Better: get thesis-linked opinions
            rows = conn.execute('''
                SELECT thesis_id, COUNT(*) as n, AVG(score_10d_direction) as dir_acc
                FROM market_opinions
                WHERE score_10d_direction IS NOT NULL AND thesis_id IS NOT NULL AND thesis_id != ''
                GROUP BY thesis_id
                HAVING n >= 5
                ORDER BY dir_acc DESC
            ''').fetchall()
            for thesis_id, n, dir_acc in rows[:5]:
                print(f' thesis {thesis_id[:8]}: {dir_acc:.0%} accuracy (n={n})')
        else:
            print(f'{inst}: {total} opinions, none scored yet')
        conn.close()
    except Exception as e:
        # Best-effort: a broken per-instance DB should not abort the comparison.
        print(f'{inst}: error reading DB: {e}')
"
Analyze and answer these questions:
Which instance is most accurate? Compare 10d direction accuracy across auto/beta/gamma. Is the most accurate also the most profitable?
Where do they disagree most? For high-divergence symbols, which instance has been RIGHT historically on that sector/thesis?
Instance specialization: Does any instance show consistently better accuracy on specific thesis types (energy vs tech vs defense)?
Consensus as signal: When all 3 agree on direction, what's the actual hit rate? (This is the "error bar" — unanimous agreement should have higher accuracy than split opinions.)
Contrarian signal: When one instance disagrees with the other two, is the contrarian or the majority right more often?
Write findings to strategic context as developing_patterns for the operator/trade-decision sessions to consume.
Read the bug monitor report:
PYTHONPATH=. python3 -c "
from src.intelligence.bug_monitor import BugMonitor

# Scan logs for bugs, then print the aggregate summary.
monitor = BugMonitor()
bugs = monitor.scan_all()
summary = monitor.get_summary()
print(f'Total bugs: {summary[\"total\"]}')
print(f'Crashes: {summary[\"crashes\"]}, Errors: {summary[\"errors\"]}')
print(f'Auto-fixable: {summary[\"auto_fixable\"]}')
print(f'Recurring (3+): {summary[\"recurring\"]}')
for error_type, msg, count in summary.get('top_errors', []):
    print(f' [{count}x] {error_type}: {msg}')
"
For each auto-fixable bug with 3+ occurrences:
For non-auto-fixable bugs:
For EACH active thesis, evaluate:
PYTHONPATH=. python3 -c "
from src.knowledge.thesis import ThesisTracker
from src.core.paths import paths
from datetime import datetime, timedelta
import json

tracker = ThesisTracker(paths.theses)
theses = tracker.get_all_theses()

# Load this week's corrections for cross-reference against each thesis
corrections_log = paths.logs / 'auto_corrections.jsonl'
week_corrections = []
if corrections_log.exists():
    cutoff = datetime.now() - timedelta(days=7)
    for line in open(corrections_log):
        try:
            entry = json.loads(line.strip())
            ts = datetime.fromisoformat(entry.get('timestamp',''))
            if ts >= cutoff:
                week_corrections.append(entry)
        except (json.JSONDecodeError, ValueError):
            pass  # skip malformed lines / missing timestamps

# Load prediction accuracy from latest belief update (most recent first)
intel_dir = paths.intelligence
daily_updates = sorted(intel_dir.glob('daily_update_*.json'), reverse=True)
thesis_accuracy = {}
if daily_updates:
    with open(daily_updates[0]) as f:
        report = json.load(f)
    for s in report.get('thesis_suggestions', []):
        thesis_accuracy[s['thesis_id']] = {
            'accuracy': s.get('prediction_accuracy', None),
            'count': s.get('prediction_count', 0),
            'suggested_change': s.get('suggested_change', 0),
        }

print('=== THESIS HEALTH REVIEW ===')
for t in sorted(theses, key=lambda x: -x.conviction):
    if t.status != 'active':
        continue
    # Conviction velocity (net change over the last 5 updates)
    velocity = 0
    if len(t.conviction_history) >= 2:
        recent = t.conviction_history[-5:]
        velocity = recent[-1].new_value - recent[0].old_value
    # Corrections this week
    thesis_corrections = [c for c in week_corrections if c.get('thesis_id') == t.id]
    # Prediction accuracy
    acc = thesis_accuracy.get(t.id, {})
    print(f'\\n{t.name} (id={t.id})')
    print(f' Conviction: {t.conviction:.0f}% (velocity: {velocity:+.0f})')
    print(f' Positions: {t.positions[:6]}')
    print(f' Predictions: {acc.get(\"count\",0)} total, accuracy={acc.get(\"accuracy\",\"?\")}'[:60])
    print(f' Auto-corrections this week: {len(thesis_corrections)}')
    for c in thesis_corrections:
        print(f' [{c[\"action_type\"]}] {c[\"reason\"][:60]}')
    if velocity < -10:
        print(f' WARNING: Conviction declining rapidly ({velocity:+.0f})')
    if acc.get('accuracy') is not None and acc['accuracy'] < 0.3 and acc.get('count', 0) >= 5:
        print(f' WARNING: Low prediction accuracy ({acc[\"accuracy\"]:.0%})')
"
Decision for each thesis:
Apply conviction changes via ThesisTracker:
# Template: substitute <id>, <new_value>, and <reason> before running.
from src.knowledge.thesis import ThesisTracker
from src.core.paths import paths
tracker = ThesisTracker(paths.theses)
# Look up the thesis by its id string.
thesis = tracker.get_thesis("<id>")
# Record the new conviction with an audit-trail reason prefixed [SYSTEM REVIEW].
thesis.update_conviction(<new_value>, "[SYSTEM REVIEW] <reason>")
# Persist the change (uses the tracker's private save method by convention here).
tracker._save_thesis(thesis)
Check if any thresholds need adjustment:
PYTHONPATH=. python3 -c "
from src.core.paths import paths
import json

# Stress test scenario accuracy (did any scenario prediction miss badly?)
risk_dir = paths.risk_reports
reports = sorted(risk_dir.glob('stress_test_*.json'), reverse=True)
if reports:
    with open(reports[0]) as f:
        st = json.load(f)
    print('Latest stress test:')
    print(f' VaR95: {st.get(\"var_95_pct\",0):.1f}%')
    print(f' Concentration: {st.get(\"concentration_risk_score\",0):.2f}')
    print(f' Max position: {st.get(\"max_single_position_pct\",0):.1f}%')
    print(f' Max sector: {st.get(\"max_single_sector_exposure_pct\",0):.1f}%')
    print(f' Recommendations: {st.get(\"recommendations\",[])}')

# Auto-correction cooldown effectiveness: a correction logged but not
# applied counts as blocked by the cooldown.
corrections_log = paths.logs / 'auto_corrections.jsonl'
if corrections_log.exists():
    cooldown_hits = 0
    total = 0
    for line in open(corrections_log):
        try:
            entry = json.loads(line.strip())
            total += 1
            if not entry.get('applied'):
                cooldown_hits += 1
        except (json.JSONDecodeError, ValueError):
            pass  # skip malformed lines
    print(f'\\nCorrection cooldown: {cooldown_hits}/{total} blocked ({cooldown_hits/max(total,1):.0%})')

# Check trim queue for pending position trims
trim_queue = paths.scheduler / 'trim_queue.json'
if trim_queue.exists():
    with open(trim_queue) as f:
        trims = json.load(f)
    print(f'\\nPending trims: {len(trims)}')
    for t in trims:
        print(f' {t[\"symbol\"]}: {t[\"reason\"][:60]}')
"
Evaluate these parameters:
Log parameter change recommendations (do NOT change code -- log to strategic context).
Based on findings from Steps 1-4, determine if any code changes would improve the system.
What you CAN change (Tier 1 -- direct modification):
What you CAN create (Tier 2 -- new files in standard locations):
- src/data/sources/alternative/
- src/signals/
- src/intelligence/
- .claude/skills/

What you CANNOT change (Forbidden):
- src/upgrades/ (the upgrade system itself)
- config/credentials*.yaml (security)
- scripts/auto_corrections.py (core safety loop)
- src/execution/order_manager.py (trade execution safety)
- src/core/instance.py and src/core/paths.py (infrastructure)

Workflow:
from src.upgrades.auto_upgrader import AutoUpgrader, UpgradeProposal
from datetime import datetime

upgrader = AutoUpgrader()

# 1. Create proposal (example values; fill in from this week's evidence)
proposal = UpgradeProposal(
    id=f"upgrade_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
    timestamp=datetime.now().isoformat(),
    tier=1,  # or 2
    category="threshold",  # or "data_source", "signal", "rule", "skill", "evaluation"
    description="Recalibrate gold shock in ceasefire scenario from -12% to -15%",
    files_to_modify=["src/risk/stress_tester.py"],
    files_to_create=[],
    rationale="Actual gold drop this week was -10.3%, scenario predicted -12%. Widening to -15% for safety margin.",
    evidence="GLD weekly return: -10.3%. Stress test predicted: -12%. Error: 1.7pp.",
    estimated_impact="More accurate risk estimates, earlier alerts on gold concentration",
    risk_level="low",
)

# 2. Validate against the upgrader's tier/forbidden-path rules
is_valid, reason = upgrader.validate_proposal(proposal)
if not is_valid:
    print(f"Proposal rejected: {reason}")
    # Log as Tier 3 for human review