---
name: distribution-fitter
description: Statistical distribution fitting skill for input modeling in simulation and analysis.
allowed-tools: Bash(*), Read, Write, Edit, Glob, Grep, WebFetch
metadata: {"author": "babysitter-sdk", "version": "1.0.0", "category": "simulation", "backlog-id": "SK-IE-006"}
---

# distribution-fitter

You are distribution-fitter.
# --- 1. Automated Distribution Fitting ---
import numpy as np
from scipy import stats
from scipy.optimize import minimize, approx_fprime


def fit_distribution(data, distributions=None):
    """Fit several candidate distributions to *data* and select the best.

    Args:
        data: 1-D array-like sample to fit.
        distributions: Optional list of scipy distribution names to try.
            Defaults to a standard set of simulation-input candidates.

    Returns:
        dict with the best distribution name, its parameters, the fitter
        summary table, and the fitted parameters of every candidate.
    """
    # Deferred import: `fitter` is an optional third-party dependency and
    # should not break module import when absent.
    from fitter import Fitter

    if distributions is None:
        distributions = ['norm', 'expon', 'gamma', 'lognorm',
                         'weibull_min', 'beta', 'uniform', 'triang']

    f = Fitter(data, distributions=distributions)
    f.fit()
    summary = f.summary()
    # Lowest sum-of-squared-error between fitted PDF and data histogram wins.
    best = f.get_best(method='sumsquare_error')
    return {
        "best_distribution": list(best.keys())[0],
        "parameters": best,
        "summary": summary.to_dict(),
        "all_fits": f.fitted_param,
    }


# --- 2. Goodness-of-Fit Testing ---
def goodness_of_fit_tests(data, distribution, params):
    """Run Kolmogorov-Smirnov, chi-square and (where supported)
    Anderson-Darling goodness-of-fit tests against a fitted distribution.

    Args:
        data: 1-D array-like sample.
        distribution: scipy.stats distribution name (e.g. 'norm').
        params: fitted parameter tuple as returned by ``dist.fit(data)``.

    Returns:
        dict keyed by test name. 'chi_square' is omitted when no histogram
        bin reaches the expected count of 5 required for validity.
    """
    results = {}

    # Kolmogorov-Smirnov test.
    ks_stat, ks_pvalue = stats.kstest(data, distribution, args=params)
    results['kolmogorov_smirnov'] = {
        'statistic': ks_stat,
        'p_value': ks_pvalue,
        'conclusion': 'accept' if ks_pvalue > 0.05 else 'reject',
    }

    # Chi-square test on histogram bins.
    observed, bins = np.histogram(data, bins='auto')
    dist = getattr(stats, distribution)
    expected = len(data) * np.diff(dist.cdf(bins, *params))

    # Keep only bins with expected count >= 5 (chi-square validity rule).
    mask = expected >= 5
    if mask.any():
        obs = observed[mask].astype(float)
        exp = expected[mask]
        # scipy.stats.chisquare requires the observed and expected sums to
        # agree; after masking they generally do not, so rescale expected.
        exp = exp * (obs.sum() / exp.sum())
        # ddof accounts for the parameters estimated from the data, so the
        # p-value is computed with k - 1 - len(params) degrees of freedom
        # (the original computed this dof but never passed it to the test).
        chi2_stat, chi2_pvalue = stats.chisquare(obs, exp, ddof=len(params))
        results['chi_square'] = {
            'statistic': chi2_stat,
            'p_value': chi2_pvalue,
            'degrees_of_freedom': int(mask.sum()) - len(params) - 1,
        }

    # Anderson-Darling is only defined for a few distributions in scipy.
    if distribution in ['norm', 'expon', 'gumbel', 'logistic']:
        ad_result = stats.anderson(data, dist=distribution)
        results['anderson_darling'] = {
            'statistic': ad_result.statistic,
            'critical_values': dict(zip(
                ['15%', '10%', '5%', '2.5%', '1%'],
                ad_result.critical_values)),
        }

    return results


# --- 3. Maximum Likelihood Estimation ---
def mle_fit(data, distribution):
    """Fit a distribution by numerically maximising the log-likelihood.

    Relies on ``get_parameter_bounds`` and ``get_initial_params``, which are
    expected to be defined elsewhere in this skill.  # TODO confirm

    Returns:
        dict with the MLE parameters, their asymptotic standard errors
        (from the inverse observed information), the log-likelihood,
        AIC and BIC.
    """
    dist = getattr(stats, distribution)

    bounds = get_parameter_bounds(distribution)

    def neg_log_likelihood(params):
        return -np.sum(dist.logpdf(data, *params))

    x0 = get_initial_params(data, distribution)
    result = minimize(neg_log_likelihood, x0, bounds=bounds, method='L-BFGS-B')

    # Numerical Hessian via nested finite differences of the gradient.
    k = len(result.x)
    hessian = np.zeros((k, k))
    epsilon = 1e-5
    for i in range(k):
        hessian[i] = approx_fprime(
            result.x,
            lambda p: approx_fprime(p, neg_log_likelihood, epsilon)[i],
            epsilon)

    # Standard errors from the inverse observed information matrix.
    # NOTE(review): np.linalg.inv raises for a singular Hessian — consider
    # falling back to np.linalg.pinv if that occurs in practice.
    se = np.sqrt(np.diag(np.linalg.inv(hessian)))

    n = len(data)
    return {
        "distribution": distribution,
        "parameters": result.x.tolist(),
        "standard_errors": se.tolist(),
        "log_likelihood": -result.fun,
        # result.fun is the minimised *negative* log-likelihood, hence
        # AIC = 2k + 2*NLL and BIC = k*ln(n) + 2*NLL. The original return
        # was truncated in extraction and is restored here.
        "aic": 2 * k + 2 * result.fun,
        "bic": k * np.log(n) + 2 * result.fun,
    }
# --- 4. Arrival Process Analysis ---
import numpy as np
from scipy import stats


def analyze_arrival_process(timestamps):
    """Analyse an event-time series: summarise inter-arrival times, test
    for a Poisson process, and fit a best distribution.

    NOTE(review): the original definition line was lost in extraction; the
    function name here is reconstructed — confirm against callers.

    Args:
        timestamps: 1-D array-like of event times, monotone increasing.

    Returns:
        dict with summary statistics, a Poisson-process test (KS against an
        exponential with loc fixed at 0, plus a coefficient-of-variation
        check), the best distribution fit, and the mean arrival rate.
    """
    timestamps = np.array(timestamps)
    interarrivals = np.diff(timestamps)

    stats_summary = {
        "count": len(interarrivals),
        "mean": np.mean(interarrivals),
        "std": np.std(interarrivals),
        "cv": np.std(interarrivals) / np.mean(interarrivals),
        "min": np.min(interarrivals),
        "max": np.max(interarrivals),
        "median": np.median(interarrivals),
    }

    # A Poisson process has exponential inter-arrivals with cv == 1;
    # floc=0 pins the exponential's location at zero.
    exp_params = stats.expon.fit(interarrivals, floc=0)
    ks_stat, ks_pvalue = stats.kstest(interarrivals, 'expon', args=exp_params)
    is_poisson = ks_pvalue > 0.05 and 0.8 < stats_summary['cv'] < 1.2

    # fit_distribution is defined in section 1 of this skill.
    fit_result = fit_distribution(interarrivals)

    return {
        "statistics": stats_summary,
        "poisson_process_test": {
            "ks_statistic": ks_stat,
            "p_value": ks_pvalue,
            "cv_test": stats_summary['cv'],
            "is_poisson": is_poisson,
        },
        "best_fit": fit_result,
        "arrival_rate": 1 / stats_summary['mean'],
    }


# --- 5. Empirical Distribution ---
class EmpiricalDistribution:
    """Empirical (data-driven) distribution with cdf/ppf/sampling support."""

    def __init__(self, data):
        # NOTE(review): the extracted text showed `def init`; restored to
        # `__init__` (double underscores were lost in extraction).
        self.data = np.sort(data)                       # sorted sample values
        self.n = len(data)                              # sample size
        self.ecdf = np.arange(1, self.n + 1) / self.n   # step-ECDF heights

    def cdf(self, x):
        """Cumulative distribution function (right-continuous step ECDF)."""
        return np.searchsorted(self.data, x, side='right') / self.n

    def ppf(self, q):
        """Percent point function (inverse CDF); q is clamped to the data."""
        idx = int(q * self.n)
        return self.data[min(idx, self.n - 1)]

    def sample(self, size=1):
        """Generate random samples via inverse-transform sampling."""
        u = np.random.uniform(0, 1, size)
        return np.array([self.ppf(ui) for ui in u])

    def to_dict(self):
        """Export for storage."""
        return {
            "type": "empirical",
            "values": self.data.tolist(),
            "probabilities": self.ecdf.tolist(),
        }


# --- 6. Distribution Comparison ---
def compare_distributions(data, candidates):
    """Fit each candidate scipy distribution to *data* and rank the fits.

    Args:
        data: 1-D array-like sample.
        candidates: iterable of scipy.stats distribution names.

    Returns:
        dict with AIC-sorted rankings plus the best name by AIC and by BIC.

    Raises:
        ValueError: if no candidate could be fitted at all.
    """
    results = []
    n = len(data)
    for dist_name in candidates:
        try:
            dist = getattr(stats, dist_name)
            params = dist.fit(data)
            ll = np.sum(dist.logpdf(data, *params))
            k = len(params)
            # Information-criterion formulas were truncated in extraction
            # (and `bic` was used without being computed); restored to the
            # standard AIC = 2k - 2*ln(L), BIC = k*ln(n) - 2*ln(L).
            aic = 2 * k - 2 * ll
            bic = k * np.log(n) - 2 * ll
            ks_stat, ks_pvalue = stats.kstest(data, dist_name, args=params)
            results.append({
                "distribution": dist_name,
                "parameters": params,
                "log_likelihood": ll,
                "aic": aic,
                "bic": bic,
                "ks_statistic": ks_stat,
                "ks_pvalue": ks_pvalue,
            })
        except Exception:
            # Best-effort comparison: a candidate that fails to fit is
            # skipped rather than aborting the whole ranking.
            continue

    if not results:
        raise ValueError("no candidate distribution could be fitted")

    results.sort(key=lambda x: x['aic'])
    return {
        "rankings": results,
        "best_by_aic": results[0]['distribution'],
        "best_by_bic": min(results, key=lambda x: x['bic'])['distribution'],
    }


# ---------------------------------------------------------------------------
# Process Integration
#
# This skill integrates with the following processes:
#   - discrete-event-simulation-modeling.js
#   - queuing-system-analysis.js
#   - demand-forecasting-model-development.js
#
# Output Format (example):
#   {
#     "data_summary": {"n": 500, "mean": 5.2, "std": 2.1, "cv": 0.40},
#     "best_fit": {
#       "distribution": "gamma",
#       "parameters": {"shape": 6.1, "scale": 0.85},
#       "goodness_of_fit": {"ks_statistic": 0.032, "ks_pvalue": 0.67,
#                           "aic": 1523.4}
#     },
#     "alternative_fits": [
#       {"distribution": "lognorm", "aic": 1528.1},
#       {"distribution": "weibull", "aic": 1531.2}
#     ],
#     "recommendation": "Use gamma(6.1, 0.85) for simulation input"
#   }
#
# Tools/Libraries:
#   scipy.stats    | Statistical functions | Core fitting
#   fitter         | Auto fitting          | Quick analysis
#   statsmodels    | Advanced stats        | Detailed tests
#   R fitdistrplus | R package             | Complex fitting
#
# Best Practices: Visualize first