Bioinformatics with Biopython for sequence manipulation, file parsing, BLAST, and phylogenetics. Use when working with DNA/RNA/protein sequences or biological databases.
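- Create a sequence: `from Bio.Seq import Seq; seq = Seq("ATGCGA")`
- Complement: `seq.complement()`; Reverse complement: `seq.reverse_complement()`
- Transcribe: `seq.transcribe()` (DNA to RNA)
- Translate: `seq.translate()` (DNA/RNA to protein)
- GC content: `from Bio.SeqUtils import gc_fraction; gc_fraction(seq)`
- Molecular weight: `from Bio.SeqUtils import molecular_weight`
- Parse FASTA (after `from Bio import SeqIO`): `for rec in SeqIO.parse("file.fasta", "fasta"): ...`
- Parse GenBank: `for rec in SeqIO.parse("file.gb", "genbank"): ...`
- Read a file containing exactly one record: `rec = SeqIO.read("file.fasta", "fasta")`
- Write records: `SeqIO.write(records, "output.fasta", "fasta")`
- Convert between formats: `SeqIO.convert("input.gb", "genbank", "output.fasta", "fasta")`
- Use `idx = SeqIO.index("large.fasta", "fasta")` for random access
- Run BLAST over the web: `from Bio.Blast import NCBIWWW; result = NCBIWWW.qblast("blastn", "nt", seq)`
- Parse BLAST XML output: `from Bio.Blast import NCBIXML; records = NCBIXML.parse(result)`
- Always set `Entrez.email` (after `from Bio import Entrez`) before any NCBI access
- `Entrez.email = "you@example.com"`
- Search: `handle = Entrez.esearch(db="pubmed", term="query")`
- Fetch: `handle = Entrez.efetch(db="nucleotide", id="ID", rettype="fasta")`
- Read a tree: `from Bio import Phylo; tree = Phylo.read("tree.nwk", "newick")`
- Draw a tree: `Phylo.draw(tree)` or `Phylo.draw_ascii(tree)`
- Traverse clades: `for clade in tree.find_clades(): ...`
- Distance between clades: `tree.distance(clade1, clade2)`
- Parse a PDB structure (after `from Bio.PDB import PDBParser`): `parser = PDBParser(); structure = parser.get_structure("id", "file.pdb")`
- Iterate over atoms: `structure.get_atoms()`
- For mmCIF files, use `MMCIFParser()` instead of `PDBParser()`
- Treat `SeqIO.parse` as an iterator — it is exhausted after one pass
- For large files, use `SeqIO.index()`, not `SeqIO.to_dict()`, to avoid memory issues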