Statistics

What I do

I provide comprehensive expertise in statistical analysis, the science of collecting, analyzing, interpreting, and presenting data. I enable you to perform descriptive statistics, conduct hypothesis tests, build regression models, analyze variance, estimate confidence intervals, and make data-driven decisions. My knowledge spans from basic statistical measures to advanced techniques like multiple regression and ANOVA, essential for scientific research, data science, business analytics, and evidence-based decision making.

When to use me

Use statistics when you need to: analyze experimental results and determine significance, build predictive models from observational data, compare groups or treatments for differences, estimate population parameters from samples, detect patterns and relationships in datasets, perform A/B testing for product decisions, conduct quality control analysis, or communicate uncertainty in findings to stakeholders.

Core Concepts

: Summarizing data through measures of central tendency (mean, median, mode) and dispersion (variance, standard deviation, range).

Statistics

What I do

When to use me

Core Concepts

: Summarizing data through measures of central tendency (mean, median, mode) and dispersion (variance, standard deviation, range).

import numpy as np from scipy import stats from statsmodels.stats.proportion import proportion_confint # One-sample confidence interval for mean np.random.seed(42) sample = np.random.normal(loc=100, scale=15, size=50) sample_mean = np.mean(sample) sample_std = np.std(sample, ddof=1) n = len(sample) confidence_level = 0.95 t_critical = stats.t.ppf((1 + confidence_level) / 2, df=n-1) margin_of_error = t_critical * sample_std / np.sqrt(n) ci_lower = sample_mean - margin_of_error ci_upper = sample_mean + margin_of_error print(f"95% Confidence Interval for mean:") print(f" Sample mean: {sample_mean:.2f}") print(f" CI: [{ci_lower:.2f}, {ci_upper:.2f}]") # Using bootstrap for confidence intervals def bootstrap_ci(data, statistic=np.mean, n_bootstrap=10000, confidence_level=0.95): np.random.seed(42) bootstrap_stats = [] n = len(data) for _ in range(n_bootstrap): sample_boot = np.random.choice(data, size=n, replace=True) bootstrap_stats.append(statistic(sample_boot)) alpha = (1 - confidence_level) / 2 return np.percentile(bootstrap_stats, [alpha*100, (1-alpha)*100]) ci_bootstrap = bootstrap_ci(sample) print(f"Bootstrap 95% CI: [{ci_bootstrap[0]:.2f}, {ci_bootstrap[1]:.2f}]") # Confidence interval for proportion n_successes = 45 n_trials = 100 ci_prop = proportion_confint(n_successes, n_trials, method='wilson') print(f"\n95% CI for proportion (45/100): [{ci_prop[0]:.4f}, {ci_prop[1]:.4f}]") # Difference of means confidence interval np.random.seed(42) sample1 = np.random.normal(loc=100, scale=15, size=30) sample2 = np.random.normal(loc=105, scale=15, size=35) mean_diff = np.mean(sample1) - np.mean(sample2) se_diff = np.sqrt(np.var(sample1, ddof=1)/len(sample1) + np.var(sample2, ddof=1)/len(sample2)) dof = len(sample1) + len(sample2) - 2 t_crit_diff = stats.t.ppf((1 + confidence_level) / 2, df=dof) ci_diff_lower = mean_diff - t_crit_diff * se_diff ci_diff_upper = mean_diff + t_crit_diff * se_diff print(f"\n95% CI for difference of means: [{ci_diff_lower:.2f}, {ci_diff_upper:.2f}]")

import numpy as np import pandas as pd from scipy import stats from sklearn.linear_model import LinearRegression from sklearn.metrics import r2_score, mean_squared_error # Simple linear regression np.random.seed(42) X = np.linspace(0, 10, 100) true_slope = 2.5 true_intercept = 1.0 noise = np.random.normal(0, 2, 100) y = true_slope * X + true_intercept + noise # Using scipy for simple regression slope, intercept, r_value, p_value, std_err = stats.linregress(X, y) print("Simple Linear Regression (scipy):") print(f" Slope: {slope:.4f} (true: {true_slope})") print(f" Intercept: {intercept:.4f} (true: {true_intercept})") print(f" R-squared: {r_value**2:.4f}") print(f" p-value: {p_value:.2e}") print(f" Standard error: {std_err:.4f}") # Multiple linear regression with sklearn np.random.seed(42) n_samples = 100 X1 = np.random.normal(0, 1, n_samples) X2 = np.random.normal(0, 1, n_samples) X3 = np.random.normal(0, 1, n_samples) noise = np.random.normal(0, 0.5, n_samples) y = 1 + 2*X1 + 3*X2 + 0.5*X3 + noise X_multi = np.column_stack([X1, X2, X3]) model = LinearRegression() model.fit(X_multi, y) print("\nMultiple Linear Regression (sklearn):") print(f" Intercept: {model.intercept_:.4f}") print(f" Coefficients: {model.coef_}") print(f" R-squared: {model.score(X_multi, y):.4f}") # Predictions and residuals y_pred = model.predict(X_multi) residuals = y - y_pred # Residual analysis print(f"\nResidual Statistics:") print(f" Mean of residuals: {np.mean(residuals):.6f} (should be ~0)") print(f" Std of residuals: {np.std(residuals):.4f}") # Polynomial regression X_poly = np.linspace(-3, 3, 50) y_poly = 0.5 * X_poly**3 - 2 * X_poly**2 + X_poly + np.random.normal(0, 2, 50) from sklearn.preprocessing import PolynomialFeatures from sklearn.pipeline import make_pipeline poly_model = make_pipeline(PolynomialFeatures(degree=3), LinearRegression()) poly_model.fit(X_poly.reshape(-1, 1), y_poly) y_poly_pred = poly_model.predict(X_poly.reshape(-1, 1)) print(f"\nPolynomial Regression (degree=3):") print(f" R-squared: {r2_score(y_poly, y_poly_pred):.4f}")

import numpy as np from scipy import stats from scipy.stats import f_oneway, kruskal # One-way ANOVA np.random.seed(42) group1 = np.random.normal(loc=70, scale=10, size=30) group2 = np.random.normal(loc=75, scale=10, size=30) group3 = np.random.normal(loc=80, scale=10, size=30) f_stat, p_anova = f_oneway(group1, group2, group3) print("One-way ANOVA:") print(f" F-statistic: {f_stat:.4f}") print(f" p-value: {p_anova:.4f}") print(f" Significant difference at α=0.05: {p_anova < 0.05}") # Effect size (eta-squared) ss_between = sum(len(g) * (np.mean(g) - np.mean(np.concatenate([group1, group2, group3])))**2 for g in [group1, group2, group3]) ss_total = sum((np.concatenate([group1, group2, group3]) - np.mean(np.concatenate([group1, group2, group3])))**2) eta_squared = ss_between / ss_total print(f" Effect size (η²): {eta_squared:.4f}") # Post-hoc test (Tukey's HSD) from scipy.stats import tukey_hsd result = tukey_hsd(group1, group2, group3) print(f"\nTukey's HSD results:") print(f" group1 vs group2: p={result.pvalue[0,1]:.4f}") print(f" group1 vs group3: p={result.pvalue[0,2]:.4f}") print(f" group2 vs group3: p={result.pvalue[1,2]:.4f}") # Kruskal-Wallis test (non-parametric alternative) h_stat, p_kruskal = kruskal(group1, group2, group3) print(f"\nKruskal-Wallis test:") print(f" H-statistic: {h_stat:.4f}") print(f" p-value: {p_kruskal:.4f}") # Two-way ANOVA (simplified using regression) np.random.seed(42) factor_a = np.repeat([0, 1], 50) factor_b = np.tile(np.repeat([0, 1], 25), 2) interaction = factor_a * factor_b y_two_way = 50 + 5*factor_a + 3*factor_b + 2*interaction + np.random.normal(0, 5, 100) # Using statsmodels for proper ANOVA import statsmodels.api as sm from statsmodels.formula.api import ols data_anova = pd.DataFrame({ 'y': y_two_way, 'A': factor_a, 'B': factor_b }) model_anova = ols('y ~ C(A) + C(B) + C(A):C(B)', data=data_anova).fit() anova_table = sm.stats.anova_lm(model_anova, typ=2) print("\nTwo-way ANOVA table:") print(anova_table)

Statistics Math

Statistics

What I do

When to use me

Core Concepts

Statistics Math

Statistics

What I do

When to use me

Core Concepts

Code Examples

Descriptive Statistics and Data Summaries

Hypothesis Testing

Confidence Intervals

Regression Analysis

ANOVA and Group Comparisons

Best Practices

Visualization Expert

Data Analyst

Huggingface Hub

Multi Reviewer Patterns

Dbt Transformation Patterns

Startup Financial Modeling