Aggregate results from multiple Isabl analyses into combined datasets. Use when merging VCFs, TSVs, or other outputs across samples or cohorts.
You are helping the user aggregate results from multiple Isabl analyses.
Work through these steps systematically:
# Step 1: find the succeeded analyses to aggregate.
import isabl_cli as ii

# Pull every SUCCEEDED analysis of the application within the project.
analyses = ii.get_analyses(
    projects=PROJECT_PK,
    application__name="MUTECT",
    status="SUCCEEDED",
)
print(f"Found {len(analyses)} analyses to merge")

# Preview the first few analyses with their target sample ids.
for analysis in analyses[:5]:
    target = analysis.targets[0].system_id if analysis.targets else "N/A"
    print(f" [{analysis.pk}] {target}")
# Step 2: inspect one analysis to learn which result keys exist.
first_analysis = analyses[0]
print(f"Available results: {list(first_analysis.results.keys())}")

# The application object documents the expected result schema.
app = ii.get_instance("applications", first_analysis.application.pk)
print(f"Result schema: {app.results}")
Common result keys:

- `vcf` — Variant calls
- `bam` — Aligned reads
- `tsv` / `csv` — Tabular data
- `summary` — Summary statistics
- `qc_metrics` — Quality metrics

from pathlib import Path
# Step 3: collect one result path per analysis.
result_key = "tsv"  # The result key to merge

paths = []
for analysis in analyses:
    if result_key in analysis.results:
        # Option 1: the result is stored in the results dict, either as a
        # plain path string or as a mapping with "path"/"url" entries.
        path = analysis.results[result_key]
        if isinstance(path, dict):
            path = path.get("path") or path.get("url")
        # Skip malformed entries (a mapping with neither "path" nor "url"
        # would otherwise append None and crash the existence check later).
        if path:
            paths.append({"analysis_pk": analysis.pk, "path": path})
    else:
        # Option 2: fall back to the conventional location inside the
        # analysis storage directory.
        # NOTE(review): the ".tsv" suffix is an assumption — adjust it for
        # other result types.
        expected_path = Path(analysis.storage_url) / f"{result_key}.tsv"
        paths.append({"analysis_pk": analysis.pk, "path": str(expected_path)})
print(f"Collected {len(paths)} paths")
# Step 4: verify the collected paths actually exist on disk.
import os

valid_paths = []
missing = []
for item in paths:
    # Treat a falsy path (None/"") as missing instead of letting
    # os.path.exists(None) raise TypeError on a malformed result entry.
    if item["path"] and os.path.exists(item["path"]):
        valid_paths.append(item)
    else:
        missing.append(item)

print(f"Valid: {len(valid_paths)}, Missing: {len(missing)}")
if missing:
    print("Missing files:")
    for m in missing[:5]:
        print(f" Analysis {m['analysis_pk']}: {m['path']}")
# Step 5: stack the tabular results into one long DataFrame.
import pandas as pd

frames = []
for entry in valid_paths:
    frame = pd.read_csv(entry["path"], sep="\t")
    # Tag every row with its source analysis for traceability.
    frame["analysis_pk"] = entry["analysis_pk"]
    frames.append(frame)

merged = pd.concat(frames, ignore_index=True)
print(f"Merged shape: {merged.shape}")
# Alternative: build a wide matrix with one value column per sample,
# joined on a shared key column.
dfs = []
for a in analyses:
    target_id = a.targets[0].system_id if a.targets else f"analysis_{a.pk}"
    # Result entries may be plain path strings or {"path"/"url": ...}
    # mappings (see the collection pattern earlier in this file); normalize
    # before reading.
    path = a.results["tsv"]
    if isinstance(path, dict):
        path = path.get("path") or path.get("url")
    df = pd.read_csv(path, sep="\t")
    df = df.rename(columns={"value": target_id})
    dfs.append(df)

# Merge on common key; outer join keeps rows present in only some samples.
# Guard against an empty analyses list (dfs[0] would raise IndexError).
if dfs:
    merged = dfs[0]
    for df in dfs[1:]:
        merged = merged.merge(df, on="key_column", how="outer")
# Build summary table from individual results
rows = []
for analysis in analyses:
    target = analysis.targets[0] if analysis.targets else None
    record = {
        "analysis_pk": analysis.pk,
        "sample_id": target.system_id if target else None,
        "category": target.sample.category if target else None,
    }
    # Extract metrics from results when a dict-shaped summary is present.
    summary = analysis.results.get("summary")
    if isinstance(summary, dict):
        record.update(summary.get("data", {}))
    rows.append(record)

merged = pd.DataFrame(rows)
# Full example: merge mutation counts from MUTECT analyses
import pandas as pd
from pathlib import Path

analyses = ii.get_analyses(
    projects=PROJECT_PK,
    application__name="MUTECT",
    status="SUCCEEDED",
)

data = []
for a in analyses:
    # Skip analyses without targets instead of raising IndexError
    # (consistent with the `if a.targets` guards used elsewhere here).
    if not a.targets:
        continue
    target = a.targets[0]
    reference = a.references[0] if a.references else None

    # Read the mutations TSV produced by the analysis.
    muts_path = Path(a.storage_url) / "mutations.tsv"
    if muts_path.exists():
        muts = pd.read_csv(muts_path, sep="\t")
        # One pass over the "type" column instead of two boolean-mask scans.
        type_counts = muts["type"].value_counts()
        data.append({
            "sample_id": target.system_id,
            "tumor_category": target.sample.category,
            "normal_id": reference.system_id if reference else None,
            "total_mutations": len(muts),
            "snvs": int(type_counts.get("SNV", 0)),
            "indels": int(type_counts.get("INDEL", 0)),
            "analysis_pk": a.pk,
        })

merged = pd.DataFrame(data)
merged.to_csv("mutation_summary.csv", index=False)
print(f"Saved {len(merged)} samples to mutation_summary.csv")

# Check for issues
print(f"Shape: {merged.shape}")
print(f"Columns: {list(merged.columns)}")
print(f"Missing values:\n{merged.isnull().sum()}")
print(f"Duplicates: {merged.duplicated().sum()}")

# Sample preview
print(merged.head())
# First, generate VCF list file from Python
vcf_paths = []
for analysis in analyses:
    candidate = Path(analysis.storage_url) / "output.vcf.gz"
    if candidate.exists():
        vcf_paths.append(str(candidate))

# Write to file for bcftools
with open("vcf_files.txt", "w") as handle:
    handle.write("\n".join(vcf_paths))
print(f"Listed {len(vcf_paths)} VCF files")
# Then merge with bcftools
# -l: file listing one input VCF per line; -o: output file; -O z: write
# compressed VCF. Inputs listed in vcf_files.txt must be indexed (.tbi/.csi).
bcftools merge -l vcf_files.txt -o merged.vcf.gz -O z
# Collect coverage metrics from multiple analyses
coverage_data = []
for a in analyses:
    metrics_file = Path(a.storage_url) / "coverage_metrics.txt"
    if not metrics_file.exists():
        continue
    # Guard: skip analyses without targets instead of raising IndexError.
    if not a.targets:
        continue
    # Parse coverage metrics (format varies by tool)
    with open(metrics_file) as f:
        for line in f:
            if line.startswith("MEAN_COVERAGE"):
                fields = line.rstrip("\n").split("\t")
                # NOTE(review): assumes the value sits in the second
                # tab-separated column — confirm against the tool's output.
                if len(fields) > 1:
                    coverage_data.append({
                        "sample": a.targets[0].system_id,
                        "mean_coverage": float(fields[1]),
                    })
# Prefer a project-level analysis when one exists: its results are already
# aggregated across the cohort, so no manual merging is required.
cohort_analyses = ii.get_analyses(
    project_level_analysis=PROJECT_PK,
    application__name="COHORT_SUMMARY",
    status="SUCCEEDED",
)
if cohort_analyses:
    # Results already aggregated
    merged = cohort_analyses[0].results