Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added statistics/__init__.py
Empty file.
48 changes: 48 additions & 0 deletions statistics/central_tendency.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# central_tendency.py
from typing import Iterable, List, Union

# Scalar type used in this module's annotations: either an int or a float.
Number = Union[int, float]


def mean(values: Iterable[Number]) -> float:
    """Return the arithmetic mean of ``values``.

    The iterable is materialized once, so one-shot iterators are accepted.
    The result is returned at full floating-point precision; rounding for
    display is left to the caller so that small-magnitude data is not
    silently collapsed to 0.0.

    Raises:
        ValueError: If ``values`` is empty.
    """
    data = list(values)
    if not data:
        raise ValueError("mean() arg is an empty sequence")
    return sum(data) / len(data)


def median(values: Iterable[Number]) -> float:
    """Return the middle value of ``values`` once sorted.

    With an odd number of items the central item is returned as a float;
    with an even number the two central items are averaged.

    Raises:
        ValueError: If ``values`` is empty.
    """
    ordered = list(values)
    ordered.sort()
    count = len(ordered)
    if count == 0:
        raise ValueError("median() arg is an empty sequence")
    half, is_odd = divmod(count, 2)
    if is_odd:
        return float(ordered[half])
    below, above = ordered[half - 1], ordered[half]
    return (below + above) / 2.0


def mode(values: Iterable[Number]) -> Union[Number, List[Number]]:
    """Return the most frequent value, or every tied value as a list.

    A single winner is returned bare; when several values share the highest
    count they are returned as a list in first-seen order.
    Example: [1,2,2,3] -> 2 ; [1,1,2,2] -> [1,2]

    Raises:
        ValueError: If ``values`` is empty.
    """
    data = list(values)
    if len(data) == 0:
        raise ValueError("mode() arg is an empty sequence")
    counts = {}
    for item in data:
        if item in counts:
            counts[item] += 1
        else:
            counts[item] = 1
    highest = max(counts.values())
    winners = [item for item in counts if counts[item] == highest]
    if len(winners) > 1:
        return winners
    return winners[0]


# ---------- Quick usage ----------
if __name__ == "__main__":
    # Small demo: run each statistic over the same sample and print it.
    sample = [2, 5, 1, 2, 3, 5, 2]
    for label, statistic in (("mean:", mean), ("median:", median), ("mode:", mode)):
        print(label, statistic(sample))
77 changes: 77 additions & 0 deletions statistics/dispersion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# dispersion.py
from typing import Iterable, List, Union
import math

# Scalar type used in this module's annotations: either an int or a float.
Number = Union[int, float]


def _to_list(values: Iterable[Number]) -> List[float]:
    """Materialize ``values`` as a new list of floats.

    Raises:
        ValueError: If ``values`` yields no items.
    """
    items = [*values]
    if len(items) == 0:
        raise ValueError("sequence is empty")
    return list(map(float, items))


def data_range(values: Iterable[Number]) -> float:
    """Return the spread of ``values``: the largest value minus the smallest."""
    numbers = _to_list(values)
    lowest, highest = min(numbers), max(numbers)
    return highest - lowest


def variance(values: Iterable[Number], sample: bool = True) -> float:
    """Return the variance of ``values``.

    Args:
        values: The data points.
        sample: When True (the default) divide the sum of squared deviations
            by ``n - 1`` (sample variance); when False divide by ``n``
            (population variance).

    Returns:
        The variance at full floating-point precision. No rounding is
        applied, so small-scale data (e.g. values around 1e-3) is not
        collapsed to 0.0 and downstream computations such as a square root
        do not inherit a rounding error.

    Raises:
        ValueError: If ``values`` is empty, or if ``sample`` is True and
            fewer than two data points are supplied.
    """
    data = _to_list(values)
    count = len(data)
    if sample and count < 2:
        raise ValueError("sample variance requires at least two data points")
    centre = sum(data) / count
    squared_deviations = sum((x - centre) ** 2 for x in data)
    divisor = count - 1 if sample else count
    return squared_deviations / divisor


def stdev(values: Iterable[Number], sample: bool = True) -> float:
    """Return the standard deviation of ``values``.

    This is ``math.sqrt(variance(values, sample=sample))``. The square root
    is returned as-is: rounding here would stack a second rounding step on
    top of the variance computation and is better left to the caller at
    display time.

    Args:
        values: The data points.
        sample: Forwarded to ``variance``; True selects the sample form,
            False the population form.

    Raises:
        ValueError: Propagated from ``variance`` for invalid input.
    """
    return math.sqrt(variance(values, sample=sample))


def iqr(values: Iterable[Number]) -> float:
    """Return the interquartile range (Q3 - Q1) of ``values``.

    Quartiles are found with the median-of-halves method: the sorted data is
    split into a lower and an upper half, and the median of each half is
    taken. For an odd number of items the overall median belongs to neither
    half. A single-item input leaves both halves empty and yields 0.0.

    Raises:
        ValueError: If ``values`` is empty.
    """
    ordered = sorted(_to_list(values))
    size = len(ordered)
    half = size // 2

    lower = ordered[:half]
    # With an odd count, skip the central element so it is in neither half.
    upper = ordered[half:] if size % 2 == 0 else ordered[half + 1:]

    def _middle(part: List[float]) -> float:
        # Median of an already-sorted list; an empty list counts as 0.0.
        length = len(part)
        if not length:
            return 0.0
        centre, is_odd = divmod(length, 2)
        if is_odd:
            return part[centre]
        return (part[centre - 1] + part[centre]) / 2.0

    first_quartile = _middle(lower)
    third_quartile = _middle(upper)
    return third_quartile - first_quartile


# ---------- Quick usage ----------
if __name__ == "__main__":
    # Small demo: evaluate each dispersion measure lazily, in order, and print it.
    sample = [1, 2, 2, 3, 4, 7, 9]
    demos = (
        ("range:", lambda: data_range(sample)),
        ("sample variance:", lambda: variance(sample, sample=True)),
        ("population variance:", lambda: variance(sample, sample=False)),
        ("sample stdev:", lambda: stdev(sample, sample=True)),
        ("IQR:", lambda: iqr(sample)),
    )
    for label, compute in demos:
        print(label, compute())
68 changes: 68 additions & 0 deletions statistics/frequency_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# frequency_stats.py
from typing import Iterable, Dict, List, Tuple, Union
from collections import Counter

# Scalar type used in this module's annotations: either an int or a float.
Number = Union[int, float]


def frequency_table(values: Iterable[Number]) -> Dict[Number, int]:
    """Return a plain dict mapping each distinct value to its number of occurrences.

    Keys appear in first-seen order; an empty input gives an empty dict.
    """
    tally = Counter()
    tally.update(values)
    return {value: count for value, count in tally.items()}


def relative_frequency(values: Iterable[Number]) -> Dict[Number, float]:
    """Return a dict mapping each distinct value to its share of all items.

    Each share is ``count / total``; an empty input gives an empty dict.
    """
    items = list(values)
    total = len(items)
    if not total:
        return {}
    shares = {}
    for value, count in Counter(items).items():
        shares[value] = count / total
    return shares


def cumulative_frequency(values: Iterable[Number]) -> List[Tuple[Number, int]]:
    """Return ``(value, cumulative_count)`` pairs in ascending value order.

    The cumulative count of a value is the number of items less than or
    equal to it.
    Example: [1,1,2,3] -> [(1,2),(2,3),(3,4)]

    An empty input gives an empty list.
    """
    # Count first, then sort only the distinct values; sorting the raw data
    # beforehand would be redundant work.
    counts = Counter(values)
    cumulative = []
    running = 0
    for value in sorted(counts):
        running += counts[value]
        cumulative.append((value, running))
    return cumulative


def top_k_modes(values: Iterable[Number], k: int = 1) -> List[Number]:
    """Return the values occupying the top ``k`` frequency positions.

    Values are ranked by descending count. Every value whose count is at
    least the count of the k-th ranked value is returned, so ties at the
    cutoff can make the result longer than ``k``. With ``k=1`` this is the
    list of mode(s). If ``k`` is at least the number of distinct values, all
    distinct values are returned. An empty input gives an empty list.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    if k < 1:
        raise ValueError("k must be >= 1")
    ranked = Counter(values).most_common()
    if not ranked:
        return []
    if len(ranked) <= k:
        return [value for value, _ in ranked]
    # Count held by the k-th ranked value; anything tying it is kept too.
    threshold = ranked[k - 1][1]
    return [value for value, count in ranked if count >= threshold]


# ---------- Quick usage ----------
if __name__ == "__main__":
    # Small demo: evaluate each frequency helper lazily, in order, and print it.
    data = [1, 2, 2, 3, 3, 3, 4]
    demos = (
        ("freq table:", lambda: frequency_table(data)),
        ("relative freq:", lambda: relative_frequency(data)),
        ("cumulative freq:", lambda: cumulative_frequency(data)),
        ("top 2 modes:", lambda: top_k_modes(data, k=2)),
    )
    for label, compute in demos:
        print(label, compute())
57 changes: 57 additions & 0 deletions statistics/probability_basic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# probability_basic.py
from typing import Iterable
import math
from collections import Counter

def factorial(n: int) -> int:
    """Return n! for a non-negative integer ``n`` (0! is 1).

    Delegates to ``math.factorial`` rather than multiplying a range by hand.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("factorial() not defined for negative values")
    return math.factorial(n)


def permutations(n: int, r: int) -> int:
    """Return P(n, r) = n! / (n-r)!, the number of ordered selections of r items from n.

    Uses ``math.perm``, which avoids building the two full factorials and
    dividing them.

    Raises:
        ValueError: Unless ``0 <= r <= n``.
    """
    if not (0 <= r <= n):
        raise ValueError("require 0 <= r <= n")
    return math.perm(n, r)


def combinations(n: int, r: int) -> int:
    """Return C(n, r) = n! / (r! (n-r)!), the number of unordered selections of r items from n.

    Uses ``math.comb``, which avoids building three full factorials and
    dividing them.

    Raises:
        ValueError: Unless ``0 <= r <= n``.
    """
    if not (0 <= r <= n):
        raise ValueError("require 0 <= r <= n")
    return math.comb(n, r)


def binomial_pmf(k: int, n: int, p: float) -> float:
    """Return P(X = k) for X ~ Binomial(n, p).

    The probability is C(n, k) * p**k * (1 - p)**(n - k). For large ``n`` the
    binomial coefficient can exceed the float range, which makes the direct
    product raise ``OverflowError``; in that case the value is recomputed in
    log space so a finite probability is still returned.

    Args:
        k: Number of successes, ``0 <= k <= n``.
        n: Number of trials.
        p: Success probability of a single trial, ``0 <= p <= 1``.

    Raises:
        ValueError: If ``k`` is outside ``[0, n]`` or ``p`` is outside ``[0, 1]``.
    """
    if not (0 <= k <= n):
        raise ValueError("k must be between 0 and n")
    if not (0.0 <= p <= 1.0):
        raise ValueError("p must be between 0 and 1")
    # k was validated above, so this equals the checked C(n, k).
    coefficient = math.comb(n, k)
    try:
        return coefficient * (p ** k) * ((1 - p) ** (n - k))
    except OverflowError:
        # The coefficient is too large to convert to float. That can only
        # happen for 0 < k < n, where a degenerate p gives probability zero.
        if p == 0.0 or p == 1.0:
            return 0.0
        log_pmf = (
            math.log(coefficient)
            + k * math.log(p)
            + (n - k) * math.log1p(-p)
        )
        return math.exp(log_pmf)


def empirical_probability(event_values: Iterable, sample_space_values: Iterable) -> float:
    """Return the fraction of sample-space outcomes that belong to the event.

    Each outcome in ``sample_space_values`` counts once if it equals any
    value in ``event_values``; duplicates within ``event_values`` have no
    extra effect.
    Example: empirical_probability([1,1],[1,1,2,3]) -> 0.5

    Args:
        event_values: Outcomes that make up the event. May be any iterable,
            including a one-shot iterator; its items must be hashable.
        sample_space_values: The observed outcomes.

    Raises:
        ValueError: If ``sample_space_values`` is empty.
    """
    outcomes = list(sample_space_values)
    if not outcomes:
        raise ValueError("sample space cannot be empty")
    # Build the lookup set once: rebuilding it per outcome is quadratic and
    # would exhaust a one-shot iterator after the first outcome.
    event = set(event_values)
    hits = sum(1 for outcome in outcomes if outcome in event)
    return hits / len(outcomes)


# ---------- Quick usage ----------
if __name__ == "__main__":
    # Small demo: evaluate each helper lazily, in order, and print it.
    demos = (
        ("5! =", lambda: factorial(5)),
        ("P(5,2) =", lambda: permutations(5, 2)),
        ("C(5,2) =", lambda: combinations(5, 2)),
        ("Binomial P(X=2; n=5, p=0.3) =", lambda: binomial_pmf(2, 5, 0.3)),
        ("Empirical prob of [1] in [1,1,2,3] =", lambda: empirical_probability([1], [1, 1, 2, 3])),
    )
    for label, compute in demos:
        print(label, compute())