import re
# Stop codons: matches any one of the three triplets TAA, TAG or TGA wherever
# it occurs in the text; the pattern itself does not check the reading frame.
stop = re.compile(
    r"TAA|TAG|TGA"
)
# Start-codon scan using the "overlapping match" idiom.
# A zero-width lookahead consumes no characters, so the scan tests every
# position; the capture group inside it still exposes the matched text.
# NOTE: 'ATG' cannot overlap itself, so for this motif a plain r'ATG' scan
# finds the same offsets. The lookahead matters for motifs that can overlap
# themselves (e.g. 'ATA' or 'AA').
for m in re.finditer(r'(?=(ATG))', dna):
    # start() is the 0-based offset of the codon; group(1) is the text 'ATG'.
    print(m.start(), m.group(1))
# ORF-like spans: greedy (longest) vs lazy (shortest) quantifier.
# Neither pattern checks the reading frame, and '.' does not match a newline,
# so every span stays within one line of `dna`.
# Greedy '.*': from the first ATG on a line to the LAST TAG on that line.
greedy = re.compile(r'ATG.*TAG').findall(dna)
# Lazy '.*?': from an ATG to the nearest following TAG; several
# non-overlapping spans can be returned.
lazy = re.compile(r'ATG.*?TAG').findall(dna)
# Homopolymer runs (4+ identical characters in a row).
# (.) captures one character and \1{3,} demands at least three more copies of
# it. '.' accepts any character except a newline, so runs of 'N' or of
# lowercase letters are reported as well.
for m in re.finditer(r'(.)\1{3,}', dna):
    # Printed fields: the run itself, its 0-based start offset, its length.
    print(m.group(), m.start(), len(m.group()))
# Tandem repeats: a 3-base unit (uppercase A/T/G/C only) immediately followed
# by one or more exact copies of itself.
for m in re.finditer(r'([ATGC]{3})\1+', dna):
    # The repeated 3-base unit captured by group 1.
    unit = m.group(1)
    # Whole match length divided by unit length = number of copies,
    # counting the first occurrence.
    copies = len(m.group()) // len(unit)
# IUPAC ambiguity: translate before matching.
# Maps each ambiguity code to a regex character class of the bases it stands
# for. Codes absent from this table (A, C, G, T, ...) are left as they are.
IUPAC = {
    'N': '[ATGC]',
    'R': '[AG]',
    'Y': '[CT]',
    'W': '[AT]',
    'S': '[GC]',
    'M': '[AC]',
    'K': '[GT]',
    'B': '[CGT]',
    'D': '[AGT]',
    'H': '[ACT]',
    'V': '[ACG]',
}


def iupac_to_regex(seq):
    """Return a regex pattern string equivalent to the IUPAC sequence *seq*.

    The input is upper-cased first, then each character found in ``IUPAC`` is
    replaced by its character class. Any other character is copied through
    unchanged (it is NOT regex-escaped), so plain bases stay literal.

    Args:
        seq: Sequence text, any case, possibly containing ambiguity codes.

    Returns:
        The pattern as a string; an empty input yields an empty string.
    """
    return ''.join(IUPAC.get(b, b) for b in seq.upper())
# Restriction digest (overlapping sites)
def find_cut_positions(dna, site, cut_offset):
    """Return the cut coordinate for every occurrence of *site* in *dna*.

    The site is wrapped in a zero-width lookahead so occurrences that overlap
    each other are all reported. Matching is case-insensitive on both the
    sequence and the site; previously only the sequence was normalised, so a
    lowercase site never matched.

    Args:
        dna: Sequence text to scan.
        site: Recognition site, interpreted as a regular-expression fragment
            (it is not escaped, so character classes such as ``GA[AT]TC``
            work).
        cut_offset: Value added to each 0-based match start to give the cut
            position.

    Returns:
        A list of ``match_start + cut_offset`` values in left-to-right order;
        empty when the site does not occur.
    """
    # Compile once; IGNORECASE replaces the former dna.upper() copy.
    site_ahead = re.compile(f'(?={site})', re.IGNORECASE)
    return [hit.start() + cut_offset for hit in site_ahead.finditer(dna)]
# FASTA header parsing (named groups) for a Swiss-Prot style ">sp|...|..." line.
header = ">sp|P04637|P53_HUMAN Cellular tumor antigen OS=Homo sapiens"
# Groups: acc = text between the pipes, entry = entry name up to the first
# whitespace, desc = free text before " OS=", org = organism name.
# The organism is matched lazily and must be followed either by another
# two-capital-letter "XX=" field or by the end of the line; a greedy '.+'
# would also swallow trailing fields such as "OX=9606 GN=TP53 PE=1 SV=4".
pat = (
    r'>sp\|(?P<acc>[^|]+)\|(?P<entry>\S+)\s+(?P<desc>.+?)\s+OS=(?P<org>.+?)'
    r'(?=\s+[A-Z]{2}=|\s*$)'
)
m = re.search(pat, header)
# m.group('acc'), m.group('org') etc.; m is None when the header does not match.
# Transcription (T→U): every uppercase 'T' becomes 'U'. The pattern is
# case-sensitive, so lowercase 't' and all other characters pass through.
rna = re.compile('T').sub('U', dna)
# Split at restriction sites (GAATTC, GGATCC or AAGCTT).
# re.split discards the matched recognition sequence, so the returned pieces
# do not contain the site bases; adjacent sites produce empty strings.
fragments = re.compile(r'GAATTC|GGATCC|AAGCTT').split(dna)
# Header lines of a multi-record FASTA text. With re.MULTILINE, '^' anchors at
# the start of every line, so each line beginning with '>' is returned whole
# (without its trailing newline).
headers = re.compile(r'^>.*', re.MULTILINE).findall(fasta)
# Non-standard nucleotides: every single character that is not an uppercase
# A, T, G, C or N. Lowercase bases, whitespace and newlines are listed too.
non_std = re.compile(r'[^ATGCN]').findall(seq)
# Mask homopolymers with N
def mask_repeat(m):
    """Replacement callback for ``re.sub``: mask long runs with 'N'.

    Args:
        m: Match object whose whole match (``m.group()``) is the run.

    Returns:
        A string of 'N' of the same length when the run is 5 or more
        characters long; otherwise the matched text unchanged, so the
        output always has the same length as the input.
    """
    run = m.group()
    if len(run) >= 5:
        return 'N' * len(run)
    return run
# Hand every run of 3+ identical characters to mask_repeat; the callback
# decides per run whether it is replaced or kept.
masked = re.compile(r'(.)\1{2,}').sub(mask_repeat, dna)