Best practices for designing reproducible experiments
Load this skill when designing experiments that need to be reproducible and statistically valid.
import random

import numpy as np

# Single source of truth for every seeded component in the experiment.
SEED = 42

# Seed the stdlib and the legacy NumPy global generators so that code calling
# random.* or np.random.* directly is repeatable across runs.
random.seed(SEED)
np.random.seed(SEED)

# NumPy recommends a dedicated Generator over the legacy global state; pass
# `rng` explicitly to any code that needs random numbers.
rng = np.random.default_rng(SEED)

print(f"[DECISION] Using random seed: {SEED}")
import sys

import pandas as pd  # required for the version report below

# Record the interpreter and library versions alongside the results so the
# environment that produced them can be identified later.
print(f"[INFO] Python: {sys.version}")
print(f"[INFO] NumPy: {np.__version__}")
print(f"[INFO] Pandas: {pd.__version__}")
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. random_state=SEED makes the split
# repeatable; stratify=y asks for the split to be stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
    stratify=y,
)

# Log the resulting partition sizes.
print(f"[EXPERIMENT] Train: {len(X_train)}, Test: {len(X_test)}")
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation scored by accuracy; `scores` holds one value per fold.
scores = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=5)

# Report the mean fold accuracy with a band of two standard deviations.
print(f"[METRIC] CV Accuracy: {scores.mean():.3f} (+/- {scores.std()*2:.3f})")
print("[EXPERIMENT] A/B Test Design")
print(f"[INFO] Control group: {len(control)}")
print(f"[INFO] Treatment group: {len(treatment)}")

# Power analysis: solve for the per-group sample size of an independent
# two-sample t-test at Cohen's d = 0.5, alpha = 0.05 and 80% power.
from statsmodels.stats.power import TTestIndPower

power = TTestIndPower()
sample_size = power.solve_power(effect_size=0.5, alpha=0.05, power=0.8)

# solve_power returns a fractional n. Always round UP: plain rounding can drop
# a participant (e.g. 393.4 -> 393) and leave the study below the target power.
print(f"[CALC] Required sample size per group: {np.ceil(sample_size):.0f}")
Power analysis ensures your experiment has a large enough sample to detect meaningful effects. Without adequate power, you risk false negatives (Type II errors): failing to detect effects that are real.
from statsmodels.stats.power import TTestIndPower, FTestAnovaPower, NormalIndPower
import numpy as np
print("[DECISION] Conducting a priori power analysis before data collection")

# Design inputs for a two-group comparison (independent-samples t-test):
#   effect_size   -> expected Cohen's d (0.2 = small, 0.5 = medium, 0.8 = large)
#   alpha         -> significance level (typically 0.05)
#   desired_power -> target statistical power (typically 0.80 or 0.90)
# The group-size ratio is passed as 1.0 below, i.e. equally sized groups.
effect_size, alpha, desired_power = 0.5, 0.05, 0.80

# nobs1 is the one quantity left unspecified, so solve_power solves for the
# number of observations in each group.
power_analysis = TTestIndPower()
sample_size = power_analysis.solve_power(
    effect_size=effect_size, alpha=alpha, power=desired_power,
    ratio=1.0, alternative='two-sided',
)

# Round up: a fractional participant cannot be recruited.
print(f"[STAT:estimate] Required n per group: {np.ceil(sample_size):.0f}")
print(f"[STAT:estimate] Total sample needed: {np.ceil(sample_size)*2:.0f}")
print(f"[DECISION] Targeting {effect_size} effect size (Cohen's d = medium)")
# After data collection, calculate achieved power
actual_n = 50  # Actual sample size per group

# `power` is the one quantity left unspecified here, so solve_power solves for
# the power reached with actual_n observations per group at the planned
# effect size and alpha.
achieved_power = power_analysis.solve_power(
    effect_size=effect_size,
    alpha=alpha,
    nobs1=actual_n,
    ratio=1.0,
    alternative='two-sided',
)
print(f"[STAT:estimate] Achieved power: {achieved_power:.3f}")

# Compare against the planned target (desired_power) rather than a hard-coded
# 0.80, so the check stays correct if the target is changed.
if achieved_power < desired_power:
    print(f"[LIMITATION] Study is underpowered ({achieved_power:.0%} < {desired_power:.0%})")
    print("[LIMITATION] Negative results may be due to insufficient sample size")