Name: Molecule Resolver
Author: conradry

Skills suchen.../

Molecule Resolver | Skills Pool

import pandas as pd
from pathlib import Path

# Step 1 - load the standardized obs table and list the unique compound labels.
# NOTE: `key` (and `obs_df`, used only as a fallback below) are not defined in
# this snippet; they must already exist in the session that runs it.
data_dir = Path("/tmp/geo_agent/<accession>")
obs_csv_path = data_dir / f"{key}_standardized_obs.csv"
standardized_obs = pd.read_csv(obs_csv_path, index_col=0)

# Identify the precursor compound column
compound_col = "<compound_column>"  # e.g., "compound", "drug", "treatment"
# Prefer the standardized table; `obs_df` is only touched when the column is
# missing there, so the extraction logic is written once instead of per branch.
compound_source = standardized_obs if compound_col in standardized_obs.columns else obs_df
compound_names = compound_source[compound_col].dropna().unique().tolist()
print(f"Unique compounds: {len(compound_names)}")
print(compound_names[:20])

# Step 2 - split the labels into vehicle/control labels and real compounds.
# Matching is done on the stripped, lower-cased label.
CONTROL_LABELS = {"dmso", "vehicle", "control", "untreated", "mock", "pbs", "media", "medium", "none", "empty"}

# str() mirrors derive_is_control and keeps non-string labels (e.g. a column
# that pandas parsed as numeric) from raising AttributeError on .strip().
actual_compounds = [c for c in compound_names if str(c).strip().lower() not in CONTROL_LABELS]
control_compounds = [c for c in compound_names if str(c).strip().lower() in CONTROL_LABELS]
print(f"Actual compounds: {len(actual_compounds)}, Controls: {len(control_compounds)}")

def derive_is_control(value, control_labels=None) -> bool:
    """Return True when *value* is a recognised control/vehicle label.

    Args:
        value: A single cell from the compound column (any scalar).
        control_labels: Optional collection of lower-case control labels to
            match against. When omitted, the module-level ``CONTROL_LABELS``
            is used, which is the original behaviour.

    Returns:
        True if the stripped, lower-cased string form of *value* is one of
        the control labels; False otherwise, including for missing values.
    """
    if pd.isna(value):
        # NaN compound does NOT imply control — cell may have been treated
        # but compound identity is unknown
        return False
    labels = CONTROL_LABELS if control_labels is None else control_labels
    return str(value).strip().lower() in labels

# Step 3 - derive the boolean `validated_is_control` column and checkpoint it.
# `source_col` is the compound column name when the standardized table has it,
# otherwise None (the values are then read from `obs_df`, which is not defined
# in this snippet).
source_col = compound_col if compound_col in standardized_obs.columns else None
# Compare against None explicitly: a falsy-but-valid column label must not be
# mistaken for "column missing" and silently redirected to obs_df.
source_series = standardized_obs[compound_col] if source_col is not None else obs_df[compound_col]
standardized_obs["validated_is_control"] = source_series.apply(derive_is_control)
standardized_obs.to_csv(obs_csv_path)

# Step 4 - first resolution pass over the non-control compound names.
# Per the original notes: `resolved` is dict[str, int] (name -> PubChem CID)
# and `unresolved` is set[str] (names that did not resolve).
resolved, unresolved = resolve_pubchem_cids(names=actual_compounds)
print(f"Resolved: {len(resolved)}, Unresolved: {len(unresolved)}")

# Step 5 - write the first-pass columns and checkpoint the table.
# Real compounds map to themselves; control labels map to None so they do not
# appear as perturbations.
name_map = dict(zip(actual_compounds, actual_compounds))
name_map.update(dict.fromkeys(control_compounds))

perturbation_values = source_series.map(name_map)
cid_values = source_series.map(resolved)
standardized_obs["validated_chemical_perturbation"] = perturbation_values
standardized_obs["validated_chemical_perturbation_pubchem_cid"] = cid_values
standardized_obs.to_csv(obs_csv_path)

# Step 6 - manual fixes for names that failed the first pass
# (original label -> cleaned-up name).
corrections = {
    "Glesatinib?(MGCD265)": "Glesatinib",
    "Tucidinostat (Chidamide)": "Tucidinostat",
    # ... agent builds this by inspecting unresolved names
}

standardized_obs["validated_chemical_perturbation"] = (
    standardized_obs["validated_chemical_perturbation"].replace(corrections)
)

# Step 7 - resolve the corrected names and rebuild the CID column.
corrected_names = list(corrections.values())
resolved_corrections, still_unresolved = resolve_pubchem_cids(names=corrected_names)

# Names that failed the first pass and received no correction are still
# unresolved. Carry them forward so they are reported below (and are visible
# to any later fallback) instead of being silently dropped.
still_unresolved = set(still_unresolved) | {
    name for name in unresolved if name not in corrections
}

# Corrected names take precedence over first-pass results; the original
# (uncorrected) labels are also registered so either spelling finds the CID.
all_resolved = {**resolved, **resolved_corrections}
for orig, fixed in corrections.items():
    if fixed in resolved_corrections:
        all_resolved[orig] = resolved_corrections[fixed]

standardized_obs["validated_chemical_perturbation_pubchem_cid"] = (
    standardized_obs["validated_chemical_perturbation"].map(all_resolved)
)
standardized_obs.to_csv(obs_csv_path)

if still_unresolved:
    print(f"Still unresolved after correction: {still_unresolved}")
    print("Flag these for user review.")

# Step 8 - SMILES fallback for names that are still unresolved.
# NOTE: `smiles_map` (name -> SMILES string) is not defined in this snippet;
# it must be supplied by the session when the dataset provides SMILES.
smiles_by_name = {name: smiles_map[name] for name in still_unresolved if name in smiles_map}
smiles_for_unresolved = list(smiles_by_name.values())
if smiles_for_unresolved:
    resolved_smiles, _ = resolve_pubchem_cids(smiles=smiles_for_unresolved)
    # Merge results
    # NOTE(review): assumes `resolved_smiles` is a dict keyed by the input
    # SMILES string (SMILES -> CID) - confirm against resolve_pubchem_cids.
    cid_by_name = {
        name: resolved_smiles[smi]
        for name, smi in smiles_by_name.items()
        if smi in resolved_smiles
    }
    if cid_by_name:
        cid_col = "validated_chemical_perturbation_pubchem_cid"
        fallback_cids = standardized_obs["validated_chemical_perturbation"].map(cid_by_name)
        # Only fill rows that have no CID yet; CIDs resolved by name are kept.
        standardized_obs[cid_col] = standardized_obs[cid_col].fillna(fallback_cids)
        standardized_obs.to_csv(obs_csv_path)

Molecule Resolver

Molecule Resolver

Interface

Imports

Workflow

Molecule Resolver

Molecule Resolver

Interface

Imports

Workflow

1. Load the obs CSV and extract unique compounds

2. Detect control labels

3. Derive `validated_is_control`

4. Initial resolution

5. Save initial results

6. Fix unresolved compound names

7. Re-resolve corrected names

8. SMILES fallback (if applicable)

Resolution Strategy

Rules

Deep Research

Data Analyst

Academic Researcher

Data Scientist

Biopython

Binary Analysis Patterns

Molecule Resolver

Molecule Resolver

Interface

Imports

Workflow

Molecule Resolver

Molecule Resolver

Interface

Imports

Workflow

1. Load the obs CSV and extract unique compounds

2. Detect control labels

3. Derive validated_is_control

4. Initial resolution

5. Save initial results

6. Fix unresolved compound names

7. Re-resolve corrected names

8. SMILES fallback (if applicable)

Resolution Strategy

Rules

Deep Research

Data Analyst

Academic Researcher

Data Scientist

Biopython

Binary Analysis Patterns

3. Derive `validated_is_control`