Name: Ab Testing Experimentation
Author: moshesham

スキルを検索.../

Ab Testing Experimentation | Skills Pool

import numpy as np
from scipy import stats

def calculate_sample_size(
    baseline_rate: float,
    minimum_detectable_effect: float,
    alpha: float = 0.05,
    power: float = 0.80,
    two_sided: bool = True
) -> dict:
    """
    Calculate required sample size per group for a two-proportion z-test.

    The treatment rate is derived as ``baseline_rate * (1 + minimum_detectable_effect)``
    and both groups are assumed to be the same size.

    Args:
        baseline_rate: Current conversion rate (e.g., 0.10 for 10%); must be in (0, 1)
        minimum_detectable_effect: Relative change to detect (e.g., 0.05 for 5% relative
            lift); must be non-zero and keep the treatment rate inside (0, 1)
        alpha: Significance level (Type I error rate); must be in (0, 1)
        power: Statistical power (1 - Type II error rate); must be in (0, 1)
        two_sided: Whether the test is two-sided

    Returns:
        Dict with sample size per group, total, and the parameters used

    Raises:
        ValueError: If any argument is outside the ranges listed above.
    """
    if not 0 < baseline_rate < 1:
        raise ValueError("baseline_rate must be strictly between 0 and 1")
    if minimum_detectable_effect == 0:
        # A zero effect makes the denominator zero: no finite sample detects it.
        raise ValueError("minimum_detectable_effect must be non-zero")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    if not 0 < power < 1:
        raise ValueError("power must be strictly between 0 and 1")

    p1 = baseline_rate
    p2 = baseline_rate * (1 + minimum_detectable_effect)
    if not 0 < p2 < 1:
        # Outside (0, 1) the variance term p2 * (1 - p2) goes negative.
        raise ValueError(
            "minimum_detectable_effect pushes the treatment rate outside (0, 1)"
        )

    # Average of the two rates (the pooled proportion for equal group sizes)
    p_bar = (p1 + p2) / 2

    # Z-scores: alpha is split across both tails only for a two-sided test
    tails = 2 if two_sided else 1
    z_alpha = stats.norm.ppf(1 - alpha / tails)
    z_beta = stats.norm.ppf(power)

    # Sample size formula (per group)
    numerator = (z_alpha * np.sqrt(2 * p_bar * (1 - p_bar)) +
                 z_beta * np.sqrt(p1 * (1 - p1) + p2 * (1 - p2))) ** 2
    denominator = (p2 - p1) ** 2

    n_per_group = int(np.ceil(numerator / denominator))

    return {
        'n_per_group': n_per_group,
        'n_total': n_per_group * 2,
        'baseline_rate': p1,
        'expected_treatment_rate': p2,
        'absolute_difference': p2 - p1,
        'relative_mde': minimum_detectable_effect,
        'alpha': alpha,
        'power': power,
        'two_sided': two_sided,
    }

# Worked example: with a 10% baseline conversion rate, size a test for a 5% relative lift.
result = calculate_sample_size(
    baseline_rate=0.10,
    minimum_detectable_effect=0.05,
)
print(f"Need {result['n_per_group']:,} users per group ({result['n_total']:,} total)")
print(f"Detecting {result['baseline_rate']:.1%} → {result['expected_treatment_rate']:.1%}")

def sample_size_continuous(
    baseline_mean: float,
    baseline_std: float,
    minimum_detectable_effect: float,
    alpha: float = 0.05,
    power: float = 0.80
) -> int:
    """
    Sample size for detecting a change in a continuous metric (e.g., revenue per user).

    Uses the two-sided normal approximation with equal group sizes and the same
    standard deviation in both groups.

    Args:
        baseline_mean: Current metric mean; must be non-zero because the effect is relative
        baseline_std: Current metric standard deviation; must be non-negative
        minimum_detectable_effect: Relative change to detect; must be non-zero
        alpha: Significance level (two-sided); must be in (0, 1)
        power: Statistical power; must be in (0, 1)

    Returns:
        Sample size per group

    Raises:
        ValueError: If any argument is outside the ranges listed above.
    """
    if baseline_std < 0:
        raise ValueError("baseline_std must be non-negative")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    if not 0 < power < 1:
        raise ValueError("power must be strictly between 0 and 1")

    # Absolute effect size implied by the relative MDE
    delta = baseline_mean * minimum_detectable_effect
    if delta == 0:
        # A zero absolute difference cannot be detected with any finite sample.
        raise ValueError(
            "baseline_mean and minimum_detectable_effect must both be non-zero"
        )

    z_alpha = stats.norm.ppf(1 - alpha / 2)
    z_beta = stats.norm.ppf(power)

    n = int(np.ceil(2 * ((z_alpha + z_beta) * baseline_std / delta) ** 2))
    return n

# Worked example: revenue per user with mean $5.00 and std $12.00, sized for a 3% relative lift.
n = sample_size_continuous(
    baseline_mean=5.0,
    baseline_std=12.0,
    minimum_detectable_effect=0.03,
)
print(f"Need {n:,} users per group")

def analyze_ab_test(
    control_conversions: int,
    control_total: int,
    treatment_conversions: int,
    treatment_total: int,
    alpha: float = 0.05
) -> dict:
    """
    Analyze an A/B test result for a binary outcome.

    Runs a two-sided z-test on the difference in conversion rates using the
    unpooled standard error, and builds a confidence interval for the
    absolute difference.

    Args:
        control_conversions: Number of conversions in the control group
        control_total: Number of users in the control group; must be positive
        treatment_conversions: Number of conversions in the treatment group
        treatment_total: Number of users in the treatment group; must be positive
        alpha: Significance level; must be in (0, 1)

    Returns:
        Dict with rates, lift, confidence interval, p-value, and decision.
        All numeric values are plain Python floats and 'significant' is a
        plain bool. The 'ci_95' entry is computed at the (1 - alpha) level,
        so it is a 95% interval only for the default alpha. 'relative_lift'
        is inf when the control rate is 0 and the treatment rate is positive,
        and 0.0 when both rates are 0.

    Raises:
        ValueError: If a group is empty, a conversion count is outside
            [0, total], or alpha is outside (0, 1).
    """
    if control_total <= 0 or treatment_total <= 0:
        raise ValueError("control_total and treatment_total must be positive")
    if not 0 <= control_conversions <= control_total:
        raise ValueError("control_conversions must be between 0 and control_total")
    if not 0 <= treatment_conversions <= treatment_total:
        raise ValueError("treatment_conversions must be between 0 and treatment_total")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")

    p_c = control_conversions / control_total
    p_t = treatment_conversions / treatment_total
    diff = p_t - p_c

    # Relative lift is undefined for a zero control rate; report inf only
    # when there is an actual increase to report.
    if p_c > 0:
        lift = diff / p_c
    elif diff > 0:
        lift = float('inf')
    else:
        lift = 0.0

    # Standard error of the difference
    se = float(np.sqrt(p_c * (1 - p_c) / control_total +
                       p_t * (1 - p_t) / treatment_total))

    # Z-statistic; se is zero only when both rates are exactly 0 or 1
    if se > 0:
        z_stat = diff / se
    elif diff == 0:
        z_stat = 0.0
    else:
        z_stat = float('inf') if diff > 0 else float('-inf')

    # Survival function keeps precision for large |z| where 1 - cdf rounds to 0
    p_value = float(2 * stats.norm.sf(abs(z_stat)))

    # Confidence interval for the difference
    z_crit = float(stats.norm.ppf(1 - alpha / 2))
    ci_lower = diff - z_crit * se
    ci_upper = diff + z_crit * se

    significant = p_value < alpha
    if significant and lift > 0:
        decision = 'SHIP IT'
    elif significant and lift < 0:
        decision = 'DO NOT SHIP'
    else:
        decision = 'NO SIGNIFICANT DIFFERENCE'

    return {
        'control_rate': p_c,
        'treatment_rate': p_t,
        'absolute_difference': diff,
        'relative_lift': lift,
        'standard_error': se,
        'z_statistic': z_stat,
        'p_value': p_value,
        'ci_95': (ci_lower, ci_upper),
        'significant': significant,
        'decision': decision,
    }

# Worked example: 50,000 users per arm, then dump every field of the analysis.
result = analyze_ab_test(
    control_conversions=4900,
    control_total=50000,
    treatment_conversions=5150,
    treatment_total=50000,
)
for k, v in result.items():
    print(f"  {k}: {v}")

def bonferroni_correction(p_values: list, alpha: float = 0.05) -> list:
    """
    Apply Bonferroni correction: adjust alpha by number of comparisons.

    Args:
        p_values: P-values, one per comparison
        alpha: Family-wise significance level before correction

    Returns:
        List of dicts, in input order, each with the original 'p_value' and a
        'significant' flag that is True when the p-value is below
        alpha / number of comparisons. An empty input yields an empty list.
    """
    values = list(p_values)
    if not values:
        # No comparisons to correct; avoids dividing alpha by zero.
        return []

    adjusted_alpha = alpha / len(values)
    # bool() keeps the flag a plain Python bool even for numpy inputs
    return [{'p_value': p, 'significant': bool(p < adjusted_alpha)} for p in values]

def cuped_adjustment(
    y_post: np.ndarray,
    y_pre: np.ndarray
) -> np.ndarray:
    """
    Apply CUPED variance reduction.

    Uses pre-experiment metric values as a control variate to reduce
    variance of the post-experiment metric estimate. The adjustment is
    ``y_post - theta * (y_pre - mean(y_pre))`` with
    ``theta = cov(y_post, y_pre) / var(y_pre)``, both estimated from the
    arrays passed in, so the mean of the result equals the mean of y_post.

    Args:
        y_post: Post-experiment metric values per user (1-D)
        y_pre: Pre-experiment metric values per user (same period length),
            aligned element-by-element with y_post

    Returns:
        Adjusted metric values with reduced variance. When fewer than two
        observations are given, or y_pre has zero variance, theta is 0 and
        the post-experiment values are returned unchanged.

    Raises:
        ValueError: If the inputs are not 1-D arrays of the same length.
    """
    post = np.asarray(y_post, dtype=float)
    pre = np.asarray(y_pre, dtype=float)
    if post.ndim != 1 or post.shape != pre.shape:
        raise ValueError("y_post and y_pre must be 1-D arrays of the same length")

    if post.size < 2:
        # Covariance is undefined for fewer than two observations.
        return post.copy()

    # Compute theta (optimal coefficient). Both moments come from the same
    # covariance matrix so they share one degrees-of-freedom convention;
    # mixing np.cov (ddof=1) with np.var (ddof=0) inflates theta by n/(n-1).
    cov_matrix = np.cov(post, pre)
    var_pre = cov_matrix[1, 1]
    theta = cov_matrix[0, 1] / var_pre if var_pre > 0 else 0.0

    # Adjusted metric
    return post - theta * (pre - pre.mean())

Period	Region A	Region B
Hour 1	Treatment	Control
Hour 2	Control	Treatment
Hour 3	Treatment	Control
...	...	...

Ab Testing Experimentation

Overview

When to Use This Skill

Ab Testing Experimentation

Overview

When to Use This Skill

Experiment Design Checklist

Sample Size Calculation

Two-Proportion Z-Test

Continuous Metrics (T-Test)

Analyzing Experiment Results

Frequentist Analysis

Common Pitfalls

1. Peeking (Repeated Significance Testing)

2. Multiple Comparisons

3. Network Effects / Interference (SUTVA Violation)

4. Novelty & Primacy Effects

5. Simpson's Paradox

6. Survivorship Bias

Advanced Techniques

CUPED (Controlled-experiment Using Pre-Experiment Data)

Switchback Experiments

Multi-Armed Bandits

Best Practices

Additional Resources

Automation Audit Ops

Github Qa Labels

Jupyter Notebook

Tidb Integrationtest Recorder

Quality Nonconformance

Hugging Face Trackio