# Reconstructed from a whitespace-collapsed `git diff` that adds four modules
# (plus an empty statistics/__init__.py) to the `statistics` package.
# Each banner below marks the file the following section belongs to.

# ---------------------------------------------------------------------------
# statistics/central_tendency.py
# ---------------------------------------------------------------------------
# central_tendency.py
from typing import Iterable, List, Union

Number = Union[int, float]


def mean(values: Iterable[Number]) -> float:
    """Return the arithmetic mean of *values* at full float precision.

    Raises:
        ValueError: if *values* is empty.
    """
    vals = list(values)
    if not vals:
        raise ValueError("mean() arg is an empty sequence")
    # No rounding here: presentation-level rounding belongs to the caller.
    return sum(vals) / len(vals)


def median(values: Iterable[Number]) -> float:
    """Return the median of *values*.

    For an even number of items the result is the average of the two
    middle values.

    Raises:
        ValueError: if *values* is empty.
    """
    vals = sorted(values)
    if not vals:
        raise ValueError("median() arg is an empty sequence")
    n = len(vals)
    mid = n // 2
    if n % 2 == 1:
        return float(vals[mid])
    return (vals[mid - 1] + vals[mid]) / 2.0


def mode(values: Iterable[Number]) -> Union[Number, List[Number]]:
    """Return the mode if it is unique, otherwise a list of all modes.

    Tied modes are listed in the order they first appear in *values*.
    Example: [1,2,2,3] -> 2 ; [1,1,2,2] -> [1,2]

    Raises:
        ValueError: if *values* is empty.
    """
    vals = list(values)
    if not vals:
        raise ValueError("mode() arg is an empty sequence")
    freq = {}
    for v in vals:
        freq[v] = freq.get(v, 0) + 1
    max_count = max(freq.values())
    modes = [k for k, count in freq.items() if count == max_count]
    return modes[0] if len(modes) == 1 else modes


# ---------- Quick usage ----------
if __name__ == "__main__":
    sample = [2, 5, 1, 2, 3, 5, 2]
    print("mean:", mean(sample))
    print("median:", median(sample))
    print("mode:", mode(sample))


# ---------------------------------------------------------------------------
# statistics/dispersion.py
# ---------------------------------------------------------------------------
# dispersion.py
from typing import Iterable, List, Union
import math

Number = Union[int, float]


def _to_list(values: Iterable[Number]) -> List[float]:
    """Materialize *values* as a list of floats; raise ValueError if empty."""
    vals = list(values)
    if not vals:
        raise ValueError("sequence is empty")
    return [float(x) for x in vals]


def _median_of_sorted(arr: List[float]) -> float:
    """Return the median of an already-sorted list, or 0.0 if it is empty."""
    m = len(arr)
    if m == 0:
        return 0.0
    mid_i = m // 2
    return arr[mid_i] if m % 2 == 1 else (arr[mid_i - 1] + arr[mid_i]) / 2.0


def data_range(values: Iterable[Number]) -> float:
    """Return range = max - min.

    Raises:
        ValueError: if *values* is empty.
    """
    vals = _to_list(values)
    return max(vals) - min(vals)


def variance(values: Iterable[Number], sample: bool = True) -> float:
    """Return the variance of *values* at full float precision.

    By default sample=True uses sample variance (n-1).
    Use sample=False for population variance (n).

    Raises:
        ValueError: if *values* is empty, or if sample=True and fewer than
            two data points are given.
    """
    vals = _to_list(values)
    n = len(vals)
    if sample and n < 2:
        raise ValueError("sample variance requires at least two data points")
    mean_val = sum(vals) / n
    ssd = sum((x - mean_val) ** 2 for x in vals)
    denom = n - 1 if sample else n
    # No rounding: a rounded variance would also corrupt stdev() below.
    return ssd / denom


def stdev(values: Iterable[Number], sample: bool = True) -> float:
    """Return the standard deviation (square root of the variance).

    Accepts the same *sample* flag, and raises the same errors, as
    variance().
    """
    return math.sqrt(variance(values, sample=sample))


def iqr(values: Iterable[Number]) -> float:
    """Return the interquartile range (Q3 - Q1).

    Uses the simple median-of-halves method (consistent with many
    textbooks): for an odd number of items the overall median is excluded
    from both halves.  A single data point has empty halves, for which the
    quartiles are taken as 0.0, so the result is 0.0.

    Raises:
        ValueError: if *values* is empty.
    """
    vals = sorted(_to_list(values))
    n = len(vals)
    mid = n // 2

    lower = vals[:mid]
    # Skip the middle element when n is odd so neither half contains it.
    upper = vals[mid + (n % 2):]

    return _median_of_sorted(upper) - _median_of_sorted(lower)


# ---------- Quick usage ----------
if __name__ == "__main__":
    sample = [1, 2, 2, 3, 4, 7, 9]
    print("range:", data_range(sample))
    print("sample variance:", variance(sample, sample=True))
    print("population variance:", variance(sample, sample=False))
    print("sample stdev:", stdev(sample, sample=True))
    print("IQR:", iqr(sample))


# ---------------------------------------------------------------------------
# statistics/frequency_stats.py
# ---------------------------------------------------------------------------
# frequency_stats.py
from typing import Iterable, Dict, List, Tuple, Union
from collections import Counter

Number = Union[int, float]


def frequency_table(values: Iterable[Number]) -> Dict[Number, int]:
    """Return a frequency table (value -> count); empty input gives {}."""
    return dict(Counter(values))


def relative_frequency(values: Iterable[Number]) -> Dict[Number, float]:
    """Return relative frequencies (value -> proportion); empty input gives {}."""
    vals = list(values)
    n = len(vals)
    if n == 0:
        return {}
    cnt = Counter(vals)
    return {k: v / n for k, v in cnt.items()}


def cumulative_frequency(values: Iterable[Number]) -> List[Tuple[Number, int]]:
    """Return a list of (value, cumulative_count) sorted by value.

    Example: [1,1,2,3] -> [(1,2),(2,3),(3,4)]
    Empty input gives [].
    """
    cum = []
    running = 0
    # Only the distinct values need sorting, not the raw input.
    for val, count in sorted(Counter(values).items()):
        running += count
        cum.append((val, running))
    return cum


def top_k_modes(values: Iterable[Number], k: int = 1) -> List[Number]:
    """Return the top-k most frequent values, most frequent first.

    Ties are allowed: every value whose count is at least the count of the
    k-th most frequent value is included, so the result may hold more than
    k items.  With k=1 this is the list of top mode(s).  Empty input gives [].

    Raises:
        ValueError: if k < 1.
    """
    if k < 1:
        raise ValueError("k must be >= 1")
    vals = list(values)
    if not vals:
        return []
    cnt = Counter(vals)
    most_common = cnt.most_common()
    # Determine cutoff frequency for top-k positions
    if k >= len(most_common):
        return [val for val, _ in most_common]
    cutoff = most_common[k - 1][1]
    return [val for val, c in most_common if c >= cutoff]


# ---------- Quick usage ----------
if __name__ == "__main__":
    data = [1, 2, 2, 3, 3, 3, 4]
    print("freq table:", frequency_table(data))
    print("relative freq:", relative_frequency(data))
    print("cumulative freq:", cumulative_frequency(data))
    print("top 2 modes:", top_k_modes(data, k=2))


# ---------------------------------------------------------------------------
# statistics/probability_basic.py
# ---------------------------------------------------------------------------
# probability_basic.py
from typing import Iterable
import math
from collections import Counter


def factorial(n: int) -> int:
    """Return n! for non-negative integer n.

    Raises:
        ValueError: if n is negative.
    """
    if n < 0:
        raise ValueError("factorial() not defined for negative values")
    return math.factorial(n)


def permutations(n: int, r: int) -> int:
    """Return P(n, r) = n! / (n-r)!

    Raises:
        ValueError: unless 0 <= r <= n.
    """
    if not (0 <= r <= n):
        raise ValueError("require 0 <= r <= n")
    # math.perm avoids building two full factorials just to divide them.
    return math.perm(n, r)


def combinations(n: int, r: int) -> int:
    """Return C(n, r) = n! / (r! (n-r)!)

    Raises:
        ValueError: unless 0 <= r <= n.
    """
    if not (0 <= r <= n):
        raise ValueError("require 0 <= r <= n")
    # math.comb avoids building three full factorials just to divide them.
    return math.comb(n, r)


def binomial_pmf(k: int, n: int, p: float) -> float:
    """Return binomial probability P(X = k) for X ~ Binomial(n, p).

    Raises:
        ValueError: if k is outside [0, n] or p is outside [0, 1].
    """
    if not (0 <= k <= n):
        raise ValueError("k must be between 0 and n")
    if not (0.0 <= p <= 1.0):
        raise ValueError("p must be between 0 and 1")
    return combinations(n, k) * (p ** k) * ((1 - p) ** (n - k))


def empirical_probability(event_values: Iterable, sample_space_values: Iterable) -> float:
    """Return the share of sample-space outcomes that belong to the event.

    event_values may be a subset of sample_space_values (both iterables of
    hashable outcomes).  Duplicates in event_values are irrelevant; every
    occurrence in the sample space is counted.
    Example: empirical_probability([1,1],[1,1,2,3]) -> 0.5

    Raises:
        ValueError: if the sample space is empty.
    """
    s = list(sample_space_values)
    if not s:
        raise ValueError("sample space cannot be empty")
    # Build the lookup set once: rebuilding it per outcome is quadratic and
    # silently drains a one-shot iterator after the first outcome.
    event = set(event_values)
    event_count = sum(1 for x in s if x in event)
    return event_count / len(s)


# ---------- Quick usage ----------
if __name__ == "__main__":
    print("5! =", factorial(5))
    print("P(5,2) =", permutations(5, 2))
    print("C(5,2) =", combinations(5, 2))
    print("Binomial P(X=2; n=5, p=0.3) =", binomial_pmf(2, 5, 0.3))
    print("Empirical prob of [1] in [1,1,2,3] =", empirical_probability([1], [1,1,2,3]))