Use Bio.PDB to parse and analyze protein structures (PDB/mmCIF) for structural bioinformatics tasks; use when you need structure parsing, geometry calculations, or structural comparison/superposition.
Main components: the Bio.PDB parsers, NeighborSearch, and Superimposer.
Dependencies:
- biopython (>= 1.79)
- numpy (>= 1.21)
- DSSP executable (e.g., mkdssp; the version depends on your system installation)
Create config/task_config.json:
{
"input_path": "data/1ubq.pdb",
"format": "pdb",
"chain_id": "A",
"atom_name": "CA",
"distance_cutoff": 8.0,
"output_path": "outputs/chainA_ca_neighbors.json"
}
Create scripts/neighbor_search.py:
import json
from pathlib import Path
import numpy as np
from Bio.PDB import PDBParser, MMCIFParser, NeighborSearch
def load_structure(input_path: str, fmt: str):
    """Parse a structure file with the Bio.PDB parser that matches ``fmt``.

    Args:
        input_path: Path of the structure file handed to the parser.
        fmt: Format label, matched case-insensitively. ``"pdb"``/``".pdb"``
            select ``PDBParser``; ``"cif"``, ``"mmcif"``, ``".cif"`` and
            ``".mmcif"`` select ``MMCIFParser``.

    Returns:
        The object returned by ``parser.get_structure("structure", input_path)``.

    Raises:
        ValueError: If ``fmt`` is not one of the recognised labels.
    """
    pdb_labels = ("pdb", ".pdb")
    mmcif_labels = ("cif", "mmcif", ".cif", ".mmcif")

    label = fmt.lower()
    # Reject unknown labels up front so the happy path below stays flat.
    if label not in pdb_labels and label not in mmcif_labels:
        raise ValueError(f"Unsupported format: {fmt}")

    # QUIET=True silences parser warnings, as in the rest of this document.
    if label in pdb_labels:
        parser = PDBParser(QUIET=True)
    else:
        parser = MMCIFParser(QUIET=True)
    return parser.get_structure("structure", input_path)
def main():
    """Report the residues surrounding a reference atom as a JSON file.

    Reads ``config/task_config.json`` (keys ``input_path``, ``format``,
    ``chain_id``, ``atom_name``, ``distance_cutoff``, ``output_path``),
    parses the structure, and takes as reference the requested atom of the
    first residue in the requested chain that contains it. Every residue
    owning at least one atom within ``distance_cutoff`` of that reference
    atom is written to ``output_path``; the reference residue itself is
    included.

    Only the first model is considered, both for picking the reference atom
    and for the neighbour search, so multi-model files do not contribute the
    same residue once per model.

    Raises:
        KeyError: If a required config key is absent or the requested chain
            does not exist in the first model.
        RuntimeError: If the structure has no models, or no residue in the
            chain contains the requested atom.
        ValueError: If the configured format is unsupported (raised by
            ``load_structure``) or the cutoff is not numeric.
    """
    config_path = Path("config/task_config.json")
    with config_path.open("r", encoding="utf-8") as f:
        cfg = json.load(f)

    # Read and validate the cheap settings before the (slower) parse.
    chain_id = cfg["chain_id"]
    atom_name = cfg["atom_name"]
    cutoff = float(cfg["distance_cutoff"])

    structure = load_structure(cfg["input_path"], cfg["format"])

    # Use the first model by default.
    model = next(structure.get_models(), None)
    if model is None:
        raise RuntimeError(f"No models found in {cfg['input_path']}")
    if chain_id not in model:
        raise KeyError(f"Chain '{chain_id}' not found in the first model")
    chain = model[chain_id]

    # Index only the selected model's atoms. Indexing structure.get_atoms()
    # would mix in every other model and duplicate residues in the output.
    ns = NeighborSearch(list(model.get_atoms()))

    # Reference atom: first residue in the chain that has the requested atom.
    ref_atom = next(
        (
            residue[atom_name]
            for residue in chain.get_residues()
            if atom_name in residue
        ),
        None,
    )
    if ref_atom is None:
        raise RuntimeError(f"No atom '{atom_name}' found in chain {chain_id}")

    # level="R" lifts the atom hits to their parent residues.
    neighbors = ns.search(ref_atom.coord, cutoff, level="R")

    # Hetero residues and waters are kept; only identifiers are reported.
    results = []
    for res in neighbors:
        _hetflag, resseq, icode = res.get_id()
        results.append(
            {
                "chain_id": res.get_parent().id,
                "resname": res.get_resname(),
                "resseq": int(resseq),
                "icode": (icode or "").strip(),
            }
        )
    # The search result has no guaranteed order; sort so that repeated runs
    # produce byte-identical reports.
    results.sort(
        key=lambda r: (r["chain_id"], r["resseq"], r["icode"], r["resname"])
    )

    out_path = Path(cfg["output_path"])
    out_path.parent.mkdir(parents=True, exist_ok=True)
    report = {
        "input_path": cfg["input_path"],
        "reference": {
            "chain_id": chain_id,
            "atom_name": atom_name,
            "cutoff": cutoff,
        },
        "neighbor_residues": results,
    }
    with out_path.open("w", encoding="utf-8") as f:
        json.dump(report, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    main()
Run the script:
python scripts/neighbor_search.py
- Use config/task_config.json as an intermediate file and invoke scripts via python scripts/<task_name>.py. Avoid stacking many CLI -- arguments; prefer config files.
- Read and write text files with encoding="utf-8". When writing JSON, use ensure_ascii=False to preserve non-ASCII characters.
- Use PDBParser(QUIET=True) for .pdb.
- Use MMCIFParser(QUIET=True) for .cif/.mmcif.
- Traverse the structure hierarchy with the iterator methods (get_models(), get_chains(), get_residues(), get_atoms()).
- Compute the distance between two atoms with np.linalg.norm(a.coord - b.coord).
- Use the vector helpers (Bio.PDB.vectors.calc_angle, calc_dihedral) when needed.
- NeighborSearch(list(structure.get_atoms())) builds a spatial index over atoms.
- search(center, radius, level="A"|"R"|"C"...) returns neighbors at the requested hierarchy level (atoms, residues, etc.).
- Use Superimposer for structural comparison/superposition.
- Write results to biopython_structure_result.md unless the skill documentation defines a better convention.
Run this minimal verification path before full execution when possible:
No local script validation step is required for this skill.
Expected output format:
Result file: biopython_structure_result.md
Validation summary: PASS/FAIL with brief notes
Assumptions: explicit list if any