Query COSMIC (Catalogue Of Somatic Mutations In Cancer) for cancer somatic mutations, gene census data, mutational signatures, drug resistance variants, and cancer gene annotations. REST API v3.1 supports gene/sample/variant queries. Free registration required. For germline clinical variants use clinvar-database; for drug-target data use opentargets-database or chembl-database-bioactivity.
COSMIC (Catalogue Of Somatic Mutations In Cancer) is the world's largest expert-curated database of somatic mutations in cancer, covering 6.7M+ coding mutations, 40,000+ cancer samples, 19,000+ genes across all cancer types. It includes the Cancer Gene Census (critical cancer genes), mutational signatures (SBS, DBS, ID), drug resistance variants, copy number data, gene expression, and methylation. The REST API v3.1 enables programmatic queries; most features are freely accessible after registration.
For germline clinical variants use clinvar-database; for drug-target associations use opentargets-database. Prerequisites: the Python packages `requests` and `pandas` (install with `pip install requests pandas`).
# API credentials come from a COSMIC account:
# register at https://cancer.sanger.ac.uk/cosmic/register
import base64

import requests

# The Authorization header carries "email:password", base64-encoded.
EMAIL = "[email protected]"
PASSWORD = "your_password"
BASE = "https://cancer.sanger.ac.uk/cosmic/api"
token = base64.b64encode(":".join((EMAIL, PASSWORD)).encode()).decode()
HEADERS = {"Authorization": "Basic " + token}

# Request KRAS mutation records, capped via the `limit` query parameter.
query = {"gene_name": "KRAS", "limit": 5}
response = requests.get(BASE + "/mutations", headers=HEADERS, params=query)
response.raise_for_status()
payload = response.json()

print(f"Total KRAS mutations: {payload['meta']['total']}")
# Print at most three of the returned records.
for record in payload["data"][:3]:
    mutation_id = record["mutation_id"]
    amino_acid = record.get("mutation_aa")
    site = record.get("primary_site")
    print(f" {mutation_id:15s} AA: {amino_acid} | Cancer: {site}")
Retrieve all COSMIC somatic mutations for a gene, with cancer type and amino acid change.
import base64

import pandas as pd
import requests

# Placeholder COSMIC account credentials; replace with your own.
EMAIL = "[email protected]"
PASSWORD = "your_password"
BASE = "https://cancer.sanger.ac.uk/cosmic/api"

# Basic-scheme Authorization header: base64 of "email:password".
token = base64.b64encode(":".join((EMAIL, PASSWORD)).encode()).decode()
HEADERS = {"Authorization": "Basic " + token}
def get_gene_mutations(gene, limit=100, cancer_site=None, timeout=30):
    """Fetch mutation records for one gene from the ``/mutations`` endpoint.

    Args:
        gene: Gene symbol, sent as the ``gene_name`` query parameter.
        limit: Value sent as the ``limit`` query parameter.
        cancer_site: Optional value for the ``primary_site`` query
            parameter; the filter is omitted when this is falsy.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    params = {"gene_name": gene, "limit": limit}
    if cancer_site:
        params["primary_site"] = cancer_site
    # requests applies no timeout by default, so without one an unresponsive
    # server would block this call indefinitely.
    r = requests.get(
        f"{BASE}/mutations",
        headers=HEADERS,
        params=params,
        timeout=timeout,
    )
    r.raise_for_status()
    return r.json()
# Fetch TP53 records and report the total given in the response metadata.
data = get_gene_mutations("TP53", limit=20)
print(f"Total TP53 mutations in COSMIC: {data['meta']['total']}")

# Output column -> key read from each mutation record.
column_sources = {
    "mutation_id": "mutation_id",
    "mutation_aa": "mutation_aa",
    "mutation_cds": "mutation_cds",
    "primary_site": "primary_site",
    "histology": "primary_histology",
    "count": "count",
}
# Tabulate the first ten records; absent keys become None.
rows = [
    {column: m.get(source) for column, source in column_sources.items()}
    for m in data["data"][:10]
]
df = pd.DataFrame(rows)
print(df.head())

# Same query again, restricted to one primary site.
data_lung = get_gene_mutations("TP53", cancer_site="lung", limit=20)
print(f"\nTP53 mutations in lung cancer: {data_lung['meta']['total']}")
Retrieve the COSMIC Cancer Gene Census — classified cancer driver genes.
import base64

import pandas as pd
import requests

# Placeholder credentials; replace with your COSMIC account details.
EMAIL = "[email protected]"
PASSWORD = "your_password"
# Basic-scheme Authorization header: base64 of "email:password".
token = base64.b64encode(f"{EMAIL}:{PASSWORD}".encode()).decode()
BASE = "https://cancer.sanger.ac.uk/cosmic/api"
HEADERS = {"Authorization": f"Basic {token}"}
# requests applies no timeout by default; bound each call so an
# unresponsive server cannot hang the script.
TIMEOUT = 30

# Unfiltered gene listing, used here only for the overall total.
r = requests.get(
    f"{BASE}/genes",
    headers=HEADERS,
    params={"limit": 100},
    timeout=TIMEOUT,
)
r.raise_for_status()
data = r.json()
print(f"Total genes in COSMIC: {data['meta']['total']}")

# Get Cancer Gene Census genes
r_cgc = requests.get(
    f"{BASE}/genes",
    headers=HEADERS,
    params={"cgc_tier": "1", "limit": 50},
    timeout=TIMEOUT,
)
# Fail on an HTTP error here too, as the first request does, instead of
# trying to read 'meta' out of an error body.
r_cgc.raise_for_status()
cgc_data = r_cgc.json()
print(f"\nCGC Tier 1 genes: {cgc_data['meta']['total']}")

# Tabulate the first fifteen genes; the tumour-type value is stringified
# and cut to 80 characters to keep the printed table narrow.
rows = [
    {
        "gene": g.get("gene_name"),
        "tier": g.get("cgc_tier"),
        "role": g.get("role_in_cancer"),
        "mutation_types": g.get("mutation_types"),
        "tumour_types": str(g.get("tumour_types_somatic", []))[:80],
    }
    for g in cgc_data["data"][:15]
]
df = pd.DataFrame(rows)
print(df.to_string(index=False))
Retrieve details for a known COSMIC mutation ID (COSM…).
import base64

import requests

# Placeholder credentials; substitute your own COSMIC login.
EMAIL = "[email protected]"
PASSWORD = "your_password"
BASE = "https://cancer.sanger.ac.uk/cosmic/api"
# Basic-scheme Authorization header: base64 of "email:password".
token = base64.b64encode(":".join((EMAIL, PASSWORD)).encode()).decode()
HEADERS = {"Authorization": "Basic " + token}

# KRAS G12D mutation
mutation_id = "COSM521"
url = "/".join((BASE, "mutations", mutation_id))
response = requests.get(url, headers=HEADERS)
response.raise_for_status()
record = response.json()

# One line per field; keys absent from the record print as None.
print(f"Mutation ID : {record.get('mutation_id')}")
print(f"Gene : {record.get('gene_name')}")
print(f"AA change : {record.get('mutation_aa')}")
print(f"CDS change : {record.get('mutation_cds')}")
print(f"Substitution: {record.get('mutation_description')}")
print(f"Count : {record.get('count')} samples")
# Stringify the cancer-type value and keep its first 100 characters.
cancer_types = str(record.get("cancer_types", []))
print(f"Cancer types: {cancer_types[:100]}")
Retrieve all somatic mutations for a specific cancer sample.
import base64

import pandas as pd
import requests

# Placeholder credentials; replace with your COSMIC account details.
EMAIL = "[email protected]"
PASSWORD = "your_password"
# Basic-scheme Authorization header: base64 of "email:password".
token = base64.b64encode(f"{EMAIL}:{PASSWORD}".encode()).decode()
BASE = "https://cancer.sanger.ac.uk/cosmic/api"
HEADERS = {"Authorization": f"Basic {token}"}
# requests applies no timeout by default; bound each call so an
# unresponsive server cannot hang the script.
TIMEOUT = 30

# List a few samples whose primary site is breast.
r = requests.get(
    f"{BASE}/samples",
    headers=HEADERS,
    params={"primary_site": "breast", "limit": 5},
    timeout=TIMEOUT,
)
r.raise_for_status()
samples = r.json()["data"]
print("Example breast cancer samples:")
for s in samples[:3]:
    print(f" {s.get('sample_id')}: {s.get('sample_name')} | {s.get('primary_histology')}")

# Get mutations for a specific sample (the first one returned above).
if samples:
    sample_id = samples[0]["sample_id"]
    r2 = requests.get(
        f"{BASE}/samples/{sample_id}/mutations",
        headers=HEADERS,
        timeout=TIMEOUT,
    )
    if r2.ok:
        muts = r2.json()["data"]
        print(f"\nMutations in sample {sample_id}: {len(muts)}")
        for m in muts[:5]:
            # The ':10s' format spec raises TypeError on None, so fall back
            # to a placeholder when the record carries no gene name.
            gene = str(m.get("gene_name") or "n/a")
            print(f" {gene:10s} {m.get('mutation_aa')}")
    else:
        # Report the failure instead of silently printing nothing.
        print(f"\nCould not fetch mutations for sample {sample_id}: HTTP {r2.status_code}")
Retrieve COSMIC mutational signature data for cancer types.
import base64

import pandas as pd
import requests

# Placeholder credentials; replace with your COSMIC account details.
EMAIL = "[email protected]"
PASSWORD = "your_password"
# Basic-scheme Authorization header: base64 of "email:password".
token = base64.b64encode(f"{EMAIL}:{PASSWORD}".encode()).decode()
BASE = "https://cancer.sanger.ac.uk/cosmic/api"
HEADERS = {"Authorization": f"Basic {token}"}
# requests applies no timeout by default; bound each call so an
# unresponsive server cannot hang the script.
TIMEOUT = 30

# List available mutational signatures
r = requests.get(f"{BASE}/signatures", headers=HEADERS, timeout=TIMEOUT)
r.raise_for_status()
sigs = r.json()["data"]
print(f"COSMIC mutational signatures: {len(sigs)}")
for s in sigs[:5]:
    # `or ""` also covers an explicit null, which a .get() default would
    # let through and which cannot be sliced.
    aetiology = s.get("aetiology") or ""
    print(f" {s.get('signature_name')}: {aetiology[:80]}")

# Get signature attributions by cancer type
r2 = requests.get(
    f"{BASE}/signatures/attributions",
    headers=HEADERS,
    params={"cancer_type": "Breast", "limit": 10},
    timeout=TIMEOUT,
)
if r2.ok:
    attributions = r2.json()["data"]
    for a in attributions[:5]:
        proportion = a.get("attribution_proportion")
        # The '.2%' format spec raises TypeError on None, so only format
        # real numbers and show a placeholder otherwise.
        if isinstance(proportion, (int, float)):
            share = f"{proportion:.2%}"
        else:
            share = "n/a"
        print(f" {a.get('signature_name')}: {share} in breast cancer")
else:
    # Report the failure instead of silently printing nothing.
    print(f"Could not fetch signature attributions: HTTP {r2.status_code}")
Query the COSMIC drug resistance database for variants conferring drug resistance.
import base64

import pandas as pd
import requests

# Placeholder credentials; replace with your COSMIC account details.
EMAIL = "[email protected]"
PASSWORD = "your_password"
# Basic-scheme Authorization header: base64 of "email:password".
token = base64.b64encode(f"{EMAIL}:{PASSWORD}".encode()).decode()
BASE = "https://cancer.sanger.ac.uk/cosmic/api"
HEADERS = {"Authorization": f"Basic {token}"}

# Get drug resistance variants
r = requests.get(
    f"{BASE}/resistance_mutations",
    headers=HEADERS,
    params={"gene": "EGFR", "limit": 20},
    # requests applies no timeout by default; bound the call so an
    # unresponsive server cannot hang the script.
    timeout=30,
)
if r.ok:
    data = r.json()
    # Read 'meta' as defensively as 'data' below, so a body without it
    # prints 'n/a' instead of raising KeyError.
    total = data.get("meta", {}).get("total", "n/a")
    print(f"EGFR drug resistance variants: {total}")
    for v in data.get("data", [])[:5]:
        # The ':20s' format spec raises TypeError on None, so fall back to
        # a placeholder when the amino-acid change is absent.
        aa_change = str(v.get("mutation_aa") or "n/a")
        print(f" {aa_change:20s} Drug: {v.get('drug')} | Resistance: {v.get('resistance_type')}")
else:
    # Report the failure instead of silently printing nothing.
    print(f"Could not fetch EGFR drug resistance variants: HTTP {r.status_code}")