Detect positive selection using dN/dS (omega) tests with PAML codeml and HyPhy. Identify sites and branches under adaptive evolution through codon models and branch-site tests. Use when testing for adaptive evolution in gene families or identifying positively selected sites.
Reference examples tested with: BioPython 1.83+, HyPhy 2.5+, PAML 4.10+, scipy 1.12+
Before using code patterns, verify installed versions match. If versions differ:
pip show <package> then help(module.function) to check signaturesIf code throws ImportError, AttributeError, or TypeError, introspect the installed package and adapt the example to match the actual API rather than retrying.
"Test my gene for positive selection" → Detect adaptive evolution using dN/dS (omega) codon models including site models, branch models, and branch-site tests to identify positively selected sites.
codeml for site and branch-site dN/dS testshyphy busted, hyphy meme for HyPhy selection tests'''
dN/dS (omega, ω) interpretation:
- ω < 1: Purifying (negative) selection - deleterious mutations removed
- ω = 1: Neutral evolution - no selective pressure
- ω > 1: Positive (diversifying) selection - advantageous mutations favored
Most genes: ω << 1 (strong purifying selection)
Immune genes, reproduction: Often show ω > 1 at specific sites
'''
Goal: Test for positive selection across a gene using codon-based site models.
Approach: Prepare a codon alignment in PHYLIP format, create codeml control files for nested models (M7 vs M8 or M1a vs M2a), run codeml, extract log-likelihoods, perform a likelihood ratio test, and identify positively selected sites from the BEB analysis.
'''Run PAML codeml for selection analysis'''
import subprocess
import os
from Bio import SeqIO
from Bio.Seq import Seq
def prepare_codon_alignment(cds_fasta, output_phy):
'''Prepare codon alignment in PHYLIP format
Requirements:
- CDS sequences (in-frame, no stop codons except terminal)
- Multiple sequence alignment already performed
- Sequence length divisible by 3
'''
records = list(SeqIO.parse(cds_fasta, 'fasta'))
# Validate codon alignment
for rec in records:
if len(rec.seq) % 3 != 0:
print(f'Warning: {rec.id} length not divisible by 3')
# Write PHYLIP format
n_seq = len(records)
seq_len = len(records[0].seq)
with open(output_phy, 'w') as f:
f.write(f' {n_seq} {seq_len}\n')
for rec in records:
# PHYLIP names: 10 characters, padded
name = rec.id[:10].ljust(10)
f.write(f'{name}{str(rec.seq)}\n')
return output_phy
def create_codeml_control(alignment_file, tree_file, output_dir, model='M8'):
'''Create codeml control file
Site models for detecting positive selection:
- M0: One ratio (single ω for all sites)
- M1a: Nearly neutral (ω0 < 1, ω1 = 1)
- M2a: Positive selection (ω0 < 1, ω1 = 1, ω2 > 1)
- M7: Beta (ω from beta distribution, 0 < ω < 1)
- M8: Beta + ω > 1 (allows positive selection)
- M8a: Beta + ω = 1 (null for M8 comparison)
Standard comparison: M8 vs M7 or M8 vs M8a
'''
model_params = {
'M0': {'NSsites': 0, 'model': 0},
'M1a': {'NSsites': 1, 'model': 0},
'M2a': {'NSsites': 2, 'model': 0},
'M7': {'NSsites': 7, 'model': 0},
'M8': {'NSsites': 8, 'model': 0},
'M8a': {'NSsites': 8, 'model': 0, 'fix_omega': 1, 'omega': 1},
}
params = model_params.get(model, model_params['M8'])
ctl_content = f'''
seqfile = {alignment_file}
treefile = {tree_file}
outfile = {output_dir}/mlc
noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
model = {params.get('model', 0)}
NSsites = {params.get('NSsites', 8)}
icode = 0
fix_kappa = 0
kappa = 2
fix_omega = {params.get('fix_omega', 0)}
omega = {params.get('omega', 1)}
'''
ctl_file = f'{output_dir}/codeml_{model}.ctl'
with open(ctl_file, 'w') as f:
f.write(ctl_content)
return ctl_file
def run_codeml(ctl_file):
'''Run PAML codeml'''
cmd = f'codeml {ctl_file}'
result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
if result.returncode != 0:
print(f'Codeml error: {result.stderr}')
return result
def parse_codeml_output(mlc_file):
'''Parse codeml output for likelihood and parameters'''
results = {'lnL': None, 'omega': None, 'kappa': None, 'sites': []}
with open(mlc_file) as f:
content = f.read()
# Extract log-likelihood
for line in content.split('\n'):
if 'lnL' in line and 'np' in line:
parts = line.split()
for i, p in enumerate(parts):
if p == 'lnL':
results['lnL'] = float(parts[i + 2])
break
# Extract omega values
if 'omega' in line.lower() and '=' in line:
parts = line.split('=')
if len(parts) >= 2:
try:
results['omega'] = float(parts[-1].strip().split()[0])
except ValueError:
pass
# Extract positively selected sites (BEB analysis)
if 'Bayes Empirical Bayes' in content:
beb_section = content.split('Bayes Empirical Bayes')[1]
for line in beb_section.split('\n'):
parts = line.split()
if len(parts) >= 5:
try:
site = int(parts[0])
aa = parts[1]
prob = float(parts[2])
# Sites with P > 0.95 considered significant
# Sites with P > 0.99 highly significant
if prob > 0.95:
results['sites'].append({
'position': site,
'amino_acid': aa,
'probability': prob,
'significance': '**' if prob > 0.99 else '*'
})
except (ValueError, IndexError):
continue
return results
def likelihood_ratio_test(lnL_null, lnL_alt, df=2):
'''Perform likelihood ratio test
For M8 vs M7: df = 2
For M2a vs M1a: df = 2
For branch-site test: df = 1
Significance thresholds (chi-square):
- df=1: 3.84 (p<0.05), 6.63 (p<0.01)
- df=2: 5.99 (p<0.05), 9.21 (p<0.01)
'''
from scipy import stats
lrt = 2 * (lnL_alt - lnL_null)
p_value = 1 - stats.chi2.cdf(lrt, df)
return {
'LRT_statistic': lrt,
'degrees_freedom': df,
'p_value': p_value,
'significant': p_value < 0.05
}
Goal: Test for positive selection on a specific lineage (foreground branch) while allowing variable rates across the tree.
Approach: Mark the foreground lineage with #1 in the tree file, create branch-site model A control files for both alternative and null hypotheses, run codeml, and perform an LRT with df=1.
def create_branch_site_control(alignment, tree, foreground_branch, output_dir):
'''Create control file for branch-site test
Branch-site model A tests for positive selection on
specific branches (foreground) while allowing variation
across the tree.
Foreground branch: Mark with #1 in tree file
Example: ((A,B),(C #1,D));
Comparison: Model A vs null (fix_omega = 1)
'''
ctl_content = f'''
seqfile = {alignment}
treefile = {tree}
outfile = {output_dir}/branch_site.mlc
runmode = 0
seqtype = 1
CodonFreq = 2
model = 2
NSsites = 2
fix_kappa = 0
kappa = 2
fix_omega = 0
omega = 1
'''
ctl_file = f'{output_dir}/branch_site.ctl'
with open(ctl_file, 'w') as f:
f.write(ctl_content)
return ctl_file
def mark_foreground_branch(tree_file, foreground_taxa, output_file):
'''Mark foreground lineage in Newick tree
For testing selection on specific lineage:
- Add #1 after the clade of interest
- Can mark multiple branches with different tags
'''
with open(tree_file) as f:
tree = f.read().strip()
# Simple marking - for complex trees use ete3
for taxon in foreground_taxa:
tree = tree.replace(taxon, f'{taxon} #1')
with open(output_file, 'w') as f:
f.write(tree)
return output_file
Goal: Detect positive selection using HyPhy's suite of methods for gene-wide and site-specific tests.
Approach: Run BUSTED for gene-wide episodic selection, MEME for site-specific episodic selection, or FEL for pervasive site selection, and parse the JSON output for p-values and selected sites.
def run_hyphy_busted(alignment_file, tree_file, output_json):
'''Run HyPhy BUSTED for gene-wide selection
BUSTED (Branch-site Unrestricted Statistical Test for
Episodic Diversification) tests whether a gene has
experienced positive selection at any site on any branch.
More sensitive than PAML for episodic selection
'''
cmd = f'hyphy busted --alignment {alignment_file} --tree {tree_file} --output {output_json}'
subprocess.run(cmd, shell=True)
return output_json
def run_hyphy_meme(alignment_file, tree_file, output_json):
'''Run HyPhy MEME for site-specific selection
MEME (Mixed Effects Model of Evolution) detects
episodic diversifying selection at individual sites.
Better than PAML M8 when selection is episodic
(not constant across all branches)
'''
cmd = f'hyphy meme --alignment {alignment_file} --tree {tree_file} --output {output_json}'
subprocess.run(cmd, shell=True)
return output_json
def run_hyphy_fel(alignment_file, tree_file, output_json):
'''Run HyPhy FEL for pervasive selection
FEL (Fixed Effects Likelihood) tests for pervasive
selection at each site across the entire phylogeny.
Use when selection is expected to be constant
'''
cmd = f'hyphy fel --alignment {alignment_file} --tree {tree_file} --output {output_json}'
subprocess.run(cmd, shell=True)
return output_json
def parse_hyphy_json(json_file):
'''Parse HyPhy JSON output'''
import json
with open(json_file) as f:
results = json.load(f)
# Extract test results
test_results = results.get('test results', {})
# Extract sites under selection (MEME/FEL)
sites = []
mle = results.get('MLE', {}).get('content', {})
for site_data in mle.get('0', {}).values():
if isinstance(site_data, list) and len(site_data) > 5:
# Structure varies by method
pass
return {
'p_value': test_results.get('p-value'),
'LRT': test_results.get('LRT'),
'sites': sites
}