statistics-math
from pluginagentmarketplace/custom-plugin-data-engineer
Data Engineer Plugin - ETL pipelines, data infrastructure, and data processing tools
`npx skills add https://github.com/pluginagentmarketplace/custom-plugin-data-engineer --skill statistics-math`
# Statistics & Mathematics
Mathematical foundations for data science, machine learning, and statistical analysis.
## Quick Start

```python
import numpy as np
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
# Descriptive Statistics
data = np.array([23, 45, 67, 32, 45, 67, 89, 12, 34, 56])
print(f"Mean: {np.mean(data):.2f}")
print(f"Median: {np.median(data):.2f}")
print(f"Std Dev: {np.std(data, ddof=1):.2f}")
print(f"IQR: {np.percentile(data, 75) - np.percentile(data, 25):.2f}")
# Hypothesis Testing
sample_a = [23, 45, 67, 32, 45]
sample_b = [56, 78, 45, 67, 89]
t_stat, p_value = stats.ttest_ind(sample_a, sample_b)
print(f"T-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
if p_value < 0.05:
    print("Reject null hypothesis: significant difference")
else:
    print("Fail to reject null hypothesis")
```
## Core Concepts

### 1. Probability Distributions

```python
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
# Normal Distribution
mu, sigma = 100, 15
normal_dist = stats.norm(loc=mu, scale=sigma)
x = np.linspace(50, 150, 100)
# PDF, CDF calculations
print(f"P(X < 85): {normal_dist.cdf(85):.4f}")
print(f"P(X > 115): {1 - normal_dist.cdf(115):.4f}")
print(f"95th percentile: {normal_dist.ppf(0.95):.2f}")
# Binomial Distribution (discrete)
n, p = 100, 0.3
binom_dist = stats.binom(n=n, p=p)
print(f"P(X = 30): {binom_dist.pmf(30):.4f}")
print(f"P(X <= 30): {binom_dist.cdf(30):.4f}")
# Poisson Distribution (events per time)
lambda_param = 5
poisson_dist = stats.poisson(mu=lambda_param)
print(f"P(X = 3): {poisson_dist.pmf(3):.4f}")
# Central Limit Theorem demonstration
population = np.random.exponential(scale=10, size=100000)
sample_means = [np.mean(np.random.choice(population, 30)) for _ in range(1000)]
print(f"Sample means are approximately normal: mean={np.mean(sample_means):.2f}")
### 2. Hypothesis Testing Framework

```python
from scipy import stats
import numpy as np
class HypothesisTest:
    """Framework for statistical hypothesis testing."""

    @staticmethod
    def two_sample_ttest(group_a, group_b, alpha=0.05):
        """Independent-samples t-test with Cohen's d effect size."""
        t_stat, p_value = stats.ttest_ind(group_a, group_b)
        # Cohen's d using sample variances (simple average-variance pooling)
        effect_size = (np.mean(group_a) - np.mean(group_b)) / np.sqrt(
            (np.var(group_a, ddof=1) + np.var(group_b, ddof=1)) / 2
        )
        return {
            "t_statistic": t_stat,
            "p_value": p_value,
            "significant": p_value < alpha,
            "effect_size_cohens_d": effect_size
        }

    @staticmethod
    def chi_square_test(observed, expected=None, alpha=0.05):
        """Chi-square test for categorical data."""
        if expected is None:
            # observed is treated as a contingency table -> test of independence
            chi2, p_value, dof, expected = stats.chi2_contingency(observed)
        else:
            # observed vs. expected frequencies -> goodness-of-fit test
            chi2, p_value = stats.chisquare(observed, expected)
            dof = len(observed) - 1
        return {
            "chi2_statistic": chi2,
            "p_value": p_value,
            "degrees_of_freedom": dof,
            "significant": p_value < alpha
        }

    @staticmethod
    def ab_test_proportion(conversions_a, total_a, conversions_b, total_b, alpha=0.05):
        """Two-proportion z-test for A/B testing."""
        p_a = conversions_a / total_a
        p_b = conversions_b / total_b
        p_pooled = (conversions_a + conversions_b) / (total_a + total_b)
        se = np.sqrt(p_pooled * (1 - p_pooled) * (1 / total_a + 1 / total_b))
        z_stat = (p_a - p_b) / se
        p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
        return {
            "conversion_a": p_a,
            "conversion_b": p_b,
            "lift": (p_b - p_a) / p_a * 100,
            "z_statistic": z_stat,
            "p_value": p_value,
            "significant": p_value < alpha
        }
# Usage
result = HypothesisTest.ab_test_proportion(
    conversions_a=120, total_a=1000,
    conversions_b=150, total_b=1000
)
print(f"Lift: {result['lift']:.1f}%, p-value: {result['p_value']:.4f}")
```
### 3. Linear Algebra Essentials

```python
import numpy as np
# Matrix operations
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
# Basic operations
print("Matrix multiplication:", A @ B)
print("Element-wise:", A * B)
print("Transpose:", A.T)
print("Inverse:", np.linalg.inv(A))
print("Determinant:", np.linalg.det(A))
# Eigenvalues and eigenvectors (PCA foundation)
eigenvalues, eigenvectors = np.linalg.eig(A)
print(f"Eigenvalues: {eigenvalues}")
# Singular Value Decomposition (dimensionality reduction)
U, S, Vt = np.linalg.svd(A)
print(f"Singular values: {S}")
# Solving linear systems: Ax = b
b = np.array([5, 11])
x = np.linalg.solve(A, b)
print(f"Solution: {x}")
# Cosine similarity (NLP, recommendations)
def cosine_similarity(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
vec1 = np.array([1, 2, 3])
vec2 = np.array([4, 5, 6])
print(f"Cosine similarity: {cosine_similarity(vec1, vec2):.4f}")
### 4. Regression Analysis
...