From 2dd4f4839fe866d57a63ae4ba18ccaef5d3b54fb Mon Sep 17 00:00:00 2001
From: Taufia Hussain
Date: Tue, 3 Feb 2026 13:30:18 +0100
Subject: [PATCH] feat: sync config/compute_metrics/run + README with
 similarity_scoring

---
 README.md          |  88 +++++++++++----------
 compute_metrics.py | 186 ++++++++++++++++++++++-----------------------
 config.py          | 143 ++++++++++++++++++++--------------
 offense_helpers.py |  69 +++++++++++------
 run.py             |  84 +++++++++++++++-----
 5 files changed, 333 insertions(+), 237 deletions(-)

diff --git a/README.md b/README.md
index 6211d98..c64566c 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,9 @@
-# Introduction
+# population_metrics

 Batch runner to compute **population-level sentencing metrics** and **suitability scores** for all individuals, writing a flat file (CSV/Parquet). The pipeline is strict about missing inputs: when nothing can be evaluated for a person, we emit NaNs instead of 0 so the case can be flagged; metrics are **skipped** when their prerequisites aren’t present (no fabricated values). Metrics are **named and extensible**; new metrics can be added without changing positional order.

-## Contents
+## Repo contents
-- `config.py` — Paths (DEV/PROD), column map (`COLS`), defaults (`DEFAULTS`), offense lists (`OFFENSE_LISTS`), and metric weights (`METRIC_WEIGHTS`).
+- `config.py` — Paths (GitHub raw URLs, or a local `SIMILARITY_DATA_DIR` override), column map (`COLS`), defaults (`DEFAULTS`), offense lists (`OFFENSE_LISTS`), and metric weights (`METRIC_WEIGHTS`).
 - `compute_metrics.py` — Library functions to read raw tables and compute **named features** for a single ID (skip-if-missing).
 - `sentencing_math.py` — Pure math (no I/O): time decomposition, proportions, frequency/trend, rehab, and name-based suitability.
@@ -63,38 +63,38 @@ This allows downstream tools to tell “not evaluated / insufficient data” apa
 - Errors (if any): `*.errors.jsonl` with `{id, error}` records.
 - Console preview prints the first rows/columns for a quick check.

-## Worked Examples
+## Worked examples (from scratch)
 These examples walk through **exactly** what the pipeline computes for a specific ID: counts → denominators → proportions → time pieces → trend/frequency → named vector → suitability. The LaTeX below **matches the paper** notation.

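+Before the examples, a compact sketch of the scoring rule in plain Python (the helper name `suitability_ratio` is illustrative, not the library API): the numerator sums `w·m` over the metrics actually present; the denominator sums `w·x*`, where `x* = 1` for positive-direction metrics and `x* = 0` for negative-direction ones.
+
+```python
+import math
+
+def suitability_ratio(feats: dict, weights: dict, directions: dict) -> float:
+    """Name-based score; NaN when nothing is evaluable (no fabricated zeros)."""
+    num = sum(weights[k] * v for k, v in feats.items())
+    den = sum(weights[k] * (1.0 if directions[k] > 0 else 0.0) for k in feats)
+    return math.nan if den == 0 else num / den
+
+# Example 2 below: {0.000, 0.000, age 0.278, trend 0.000} -> 0.278 / 3.000 ≈ 9.3%
+```
+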
-### Example 1
+### Example 1 (real data)
 **CDCR ID:** `00173d8423`

 **Offense Lists (active for this run)**
 - Violent: `['187', '211', '245']`
 - Nonviolent: `['459', '484', '10851']`

 #### Inputs
 - Current offense rows found: **11**
 - Prior offense rows found: **6**

 #### Counts by Category
 - Current: {'violent': 1, 'nonviolent': 1, 'other': 9, 'clash': 0}
 - Prior: {'violent': 0, 'nonviolent': 4, 'other': 2, 'clash': 0}

 #### Time Pieces
 - `current_sentence_months` = 10000.000
 - `completed_months` = 330.000
 - `past_time_months` = NA
 - `pct_current_completed` = 3.300
 - `time_outside_months` = 0.000

-**Definition:**
+**Paper definition (Eq. B.2–15):**
 $$
 \mathrm{out}^t_i = t_d - \mathrm{in}^{(\mathrm{vio+nonvio}),t}_i - \text{childhood}.
 $$

-#### Calculations
+#### Calculations (refer to the LaTeX formulas section below)
 - `desc_nonvio_curr = 1/2 = 0.500` (see Eq. **DESC-NONVIO-CURR**)
 - `desc_nonvio_past = 4/4 = 1.000` (see Eq. **DESC-NONVIO-PAST**)
@@ -108,13 +108,13 @@

 - Frequency (per month outside):
   - `raw_freq_violent = NA; raw_freq_total = NA`
-  - `normalized: **SKIPPED**` (requires `time_outside > 0`, `freq_min_rate` and `freq_max_rate`)
+  - `normalized`: **SKIPPED** (requires `time_outside > 0` and `freq_min_rate`/`freq_max_rate`; see Eqs. **FREQ-VIO**, **FREQ-TOTAL**)

 - Age (min–max):
   - `age_raw = 38.000`, `min = 18.000`, `max = 90.000` → `age = 0.278` (see Eq. **AGE-NORM**)

-#### Final Metric Vector
+#### Final Metric Vector (named)
 Order: `desc_nonvio_curr, desc_nonvio_past, age, freq_violent, freq_total, severity_trend, edu_general, edu_advanced, rehab_general, rehab_advanced`
 Values: `[0.500, 1.000, 0.278, SKIPPED, SKIPPED, 0.112, SKIPPED, SKIPPED, SKIPPED, SKIPPED]`

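+The trend and age entries in this vector can be re-derived by hand. A minimal check (λ = `SEVERITY_DECAY_RATE` and the 10-year default horizon both come from `config.py`):
+
+```python
+import math
+
+lam, years = 0.15, 10.0    # SEVERITY_DECAY_RATE, DEFAULTS["years_elapsed_for_trend"]
+curr_vio_prop = 1 / 2      # current: violent / (violent + nonviolent)
+past_vio_prop = 0 / 4      # prior:   violent / (violent + nonviolent)
+
+print(round((curr_vio_prop - past_vio_prop) * math.exp(-lam * years), 3))  # 0.112
+print(round((38.0 - 18.0) / (90.0 - 18.0), 3))                             # 0.278 (age)
+```
+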
@@ -123,35 +123,35 @@

 ### Example 2
 **CDCR ID:** `0029029e5b`

 **Offense Lists (active for this run)**
 - Violent: `['187', '211', '245']`
 - Nonviolent: `['459', '484', '10851']`

 #### Inputs
 - Current offense rows found: **1**
 - Prior offense rows found: **2**

 #### Counts by Category
 - Current: {'violent': 1, 'nonviolent': 0, 'other': 0, 'clash': 0}
 - Prior: {'violent': 2, 'nonviolent': 0, 'other': 0, 'clash': 0}

 #### Time Pieces
 - `current_sentence_months` = 84.000
 - `completed_months` = 67.200
 - `past_time_months` = NA
 - `pct_current_completed` = 80.000
 - `time_outside_months` = 0.000

-**Definition:**
+**Paper definition (Eq. B.2–15):**
 $$
 \mathrm{out}^t_i = t_d - \mathrm{in}^{(\mathrm{vio+nonvio}),t}_i - \text{childhood}.
 $$

-#### Calculations
+#### Calculations (refer to the LaTeX formulas section below)
 - `desc_nonvio_curr = 0/1 = 0.000` (see Eq. **DESC-NONVIO-CURR**)
 - `desc_nonvio_past = 0/2 = 0.000` (see Eq. **DESC-NONVIO-PAST**)

 - Frequency (per month outside):
   - `violent_total = 3; total_conv = 3; time_outside = 0.000`
   - `raw_freq_violent = NA; raw_freq_total = NA`
-  - `normalized: **SKIPPED**` (requires `time_outside > 0`, `freq_min_rate` and `freq_max_rate`)
+  - `normalized`: **SKIPPED** (requires `time_outside > 0` and `freq_min_rate`/`freq_max_rate`; see Eqs. **FREQ-VIO**, **FREQ-TOTAL**)

 - Age (min–max):
   - `age_raw = 38.000`, `min = 18.000`, `max = 90.000` → `age = 0.278` (see Eq. **AGE-NORM**)

-#### Final Metric Vector
+#### Final Metric Vector (named)
 Order: `desc_nonvio_curr, desc_nonvio_past, age, freq_violent, freq_total, severity_trend, edu_general, edu_advanced, rehab_general, rehab_advanced`
 Values: `[0.000, 0.000, 0.278, SKIPPED, SKIPPED, 0.000, SKIPPED, SKIPPED, SKIPPED, SKIPPED]`
 **Score:** `0.278` (out of `3.000`) — **9.3% of maximum**
 **Contributing metrics:** `age, desc_nonvio_curr, desc_nonvio_past, severity_trend`

 ### Re‑generate these examples
 **macOS/Linux**
 ```bash
 CFG_PROFILE=DEV python docs_1/make_worked_example.py --uid "0029029e5b" --violent "187,211,245" --nonviolent "459,484,10851" --age-years 38 --exposure-months 480 --freq-bounds "0,0.05" --out docs_1/README_worked_example_0029029e5b.md
 ```

 **Windows (PowerShell/CMD)**
 ```bash
 python docs_1\make_worked_example.py --uid "0029029e5b" --violent "187,211,245" --nonviolent "459,484,10851" --age-years 38 --exposure-months 480 --freq-bounds "0,0.05" --out "docs_1\README_worked_example_0029029e5b.md"
 python docs_1\make_worked_example.py --uid "00173d8423" --violent "187,211,245" --nonviolent "459,484,10851" --age-years 38 --exposure-months 480 --freq-bounds "0,0.05" --out "docs_1\README_worked_example_00173d8423.md"
 ```

-## Formulas Implemented
+## Formulas implemented (LaTeX)
 - **Descriptive proportions:**
 $$
@@ -303,7 +303,7 @@ and `x_k* = 0` for `d_k = −1` (negative-direction metrics).
 > • Frequency requires **both** `time_outside > 0` **and** configured `freq_min_rate`/`freq_max_rate`.
 > • Rehab/education are per‑month‑inside, then min–max normalized **only if** inputs and bounds are provided; otherwise **omitted**.

-## Validation Checklist
+## Validation checklist
 - Proportion metrics are computed **only** when denominators \(> 0\); otherwise the metric is **SKIPPED**.
 - Frequency requires **both** `time_outside > 0` **and** `freq_min_rate`/`freq_max_rate` in `config.py`.
 - Offense classification uses only `OFFENSE_LISTS`; anything unlisted → **other** (and does not contribute to denominators).
@@ -311,8 +311,7 @@ and `x_k* = 0` for `d_k = −1` (negative-direction metrics).
 - When comparing individuals (similarity), compute on the **intersection of present features** and require a minimum shared‑dimension count (e.g., ≥3; see the sketch below). Consider also Euclidean or Tanimoto for sensitivity analysis.
 - If no metrics pass the gating (denominators 0, missing exposure, missing age, etc.), the scorer returns NaN (or None, depending on the runner) and sets `evaluated = 0`. This is intentional: we do not fabricate zeros for unevaluable people.

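+A minimal sketch of that intersection rule (the function name `cosine_on_shared` is illustrative; `MIN_OVERLAP_FOR_SIMILARITY = 3` comes from `config.py`):
+
+```python
+import math
+
+def cosine_on_shared(a: dict, b: dict, min_overlap: int = 3) -> float:
+    """Cosine similarity over features BOTH people have; NaN if overlap is too small."""
+    shared = [k for k in a if k in b and not math.isnan(a[k]) and not math.isnan(b[k])]
+    if len(shared) < min_overlap:
+        return math.nan  # not comparable: too few shared dimensions
+    dot = sum(a[k] * b[k] for k in shared)
+    na = math.sqrt(sum(a[k] ** 2 for k in shared))
+    nb = math.sqrt(sum(b[k] ** 2 for k in shared))
+    return math.nan if na == 0.0 or nb == 0.0 else dot / (na * nb)
+```
+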
-## Programmatic Example
-```python
+## Programmatic example
+```python
 import math
 import config as CFG
 import compute_metrics as cm
 import sentencing_math as sm
 import pandas as pd

 demo = cm.read_table(CFG.PATHS["demographics"])
 cur = cm.read_table(CFG.PATHS["current_commitments"])
 pri = cm.read_table(CFG.PATHS["prior_commitments"])

 ids = demo[CFG.COLS["id"]].astype(str).dropna().unique().tolist()[:3]

 rows = []
 for uid in ids:
-    feats, aux = cm.compute_features(uid, demo, cur, pri, CFG.OFFENSE_LISTS)
+    feats, aux = cm.compute_features(str(uid), demo, cur, pri, CFG.OFFENSE_LISTS)
+    # NOTE: feats is "skip-if-missing" — it may not contain every metric in CFG.METRIC_NAMES

     # name-based suitability; may return NaN/None if no evaluable metrics
     score_ratio, num, den = sm.suitability_score_named(
@@ -349,25 +349,28 @@ for uid in ids:
         score_ratio_safe = math.nan
         num_safe = math.nan
         den_safe = math.nan
+        score_pct_of_out = math.nan
         evaluated = 0
     else:
         score_ratio_safe = float(score_ratio)
         num_safe = float(num)
         den_safe = float(den)
+        score_pct_of_out = (num_safe / den_safe) * 100.0
         evaluated = 1

-    # Optional: expose time_outside if present in aux
+    # Optional: expose aux fields if present
     time_outside_months = aux.get("time_outside")
     pct_completed = aux.get("pct_completed")

     rows.append(
         {
-            CFG.COLS["id"]: uid,
-            **feats,  # all computed named metrics
-            "score": num_safe,  # numerator (Σ w·m)
-            "score_out_of": den_safe,  # denominator (Σ w·x*)
+            CFG.COLS["id"]: str(uid),
+            **feats,                   # computed named metrics (may be a subset)
+            "score": num_safe,         # numerator (Σ w·m)
+            "score_out_of": den_safe,  # denominator (Σ w·x*)
             "score_ratio": score_ratio_safe,
-            "evaluated": evaluated,  # 1 = evaluated, 0 = not evaluable
+            "score_pct_of_out": score_pct_of_out,
+            "evaluated": evaluated,  # 1 = evaluated, 0 = not evaluable
             "time_outside_months": time_outside_months,
             "pct_completed": pct_completed,
         }
@@ -375,6 +378,7 @@ for uid in ids:

 df = pd.DataFrame(rows)
 print(df.head())
+
 ```

 ## Troubleshooting
diff --git a/compute_metrics.py b/compute_metrics.py
index 6ad7bc4..a1ab486 100644
--- a/compute_metrics.py
+++ b/compute_metrics.py
@@ -1,42 +1,42 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-compute_metrics.py — parameterized, raw-data–oriented (library only)
+compute_metrics.py — parameterized, raw-data–oriented feature computation (no CLI)

-Pipeline (library functions):
+Pipeline:
     read → parse → classify(offenses) → time → features
-Scoring/printing should be handled by a separate runner (e.g., run_compute_metrics.py).

 Relies on:
-  - config.py: PATHS, COLS, DEFAULTS, METRIC_WEIGHTS, DEFAULT_TIME_ELAPSED_YEARS
+  - config.py: PATHS, COLS, DEFAULTS, OFFENSE_LISTS, OFFENSE_POLICY
   - sentencing_math.py: pure math helpers (imported as sm)
-  - offense_helpers.py: classify_offense (uses OFFENSE_LISTS/OFFENSE_POLICY from config)
+  - offense_helpers.py: classify_offense (uses OFFENSE_LISTS/OFFENSE_POLICY)

 Design notes:
   • Missing numerics remain NaN (configurable via DEFAULTS["missing_numeric"]).
   • Time fields are unit-aware by column name: "...year..."→*12, "...day..."→/30, else months.
-  • Convictions are handled via sm.Convictions (single class).
- • Weights are NAME-BASED and should be applied outside this module. - • Exposure window: uses DEFAULTS["months_elapsed_total"] if provided; otherwise - computes per-person exposure as months from (DOB+18y) to reference_date. - • Severity trend: years_elapsed is computed from commitment tables - (first prior commitment date → last current commitment date); if - DEFAULT_TIME_ELAPSED_YEARS is not None, that value overrides the - computed years when scoring severity_trend. • STRICT SKIP-IF-MISSING: features are ONLY added when inputs are valid. + • Frequency exposure window (months) uses DEFAULTS["months_elapsed_for_frequency"] if provided, + else computes a per-person exposure window (dob/reference_date if available). + • Severity trend horizon (years) is computed from commitments (first prior → last current), + optionally overridden by DEFAULTS["years_elapsed_for_trend"] when use_default_trend_years=True. """ + from __future__ import annotations from typing import Any, Dict, Optional, Tuple + +import math import numpy as np import pandas as pd from pandas.tseries.offsets import DateOffset + import config as CFG import sentencing_math as sm from offense_helpers import classify_offense # Small config helpers + def _cfg_col(name: str) -> Optional[str]: """Return configured column name (or None) for a logical field.""" return getattr(CFG, "COLS", {}).get(name) @@ -47,7 +47,8 @@ def _cfg_default(key: str, fallback: Any) -> Any: return getattr(CFG, "DEFAULTS", {}).get(key, fallback) -# I/O (CSV/XLSX via pandas) +# I/O helpers + def _to_raw_github_url(path: str) -> str: """Allow GitHub 'blob' URLs in config by converting them to 'raw' URLs.""" if not isinstance(path, str): @@ -81,7 +82,8 @@ def get_row_by_id(df: pd.DataFrame, id_col: str, uid: str) -> Optional[pd.Series return None if sub.empty else sub.iloc[0] -# Parsing (NaN-honest) +# Parsing helpers + def _to_float_or_nan(x: Any) -> float: """Parse numeric strings safely; return NaN if missing/invalid.""" try: @@ -110,7 +112,8 @@ def to_months(val: Any, colname: Optional[str]) -> float: return x -# Offense counting (uses classify_offense from offense_helpers.py) +# Offense counting utilities + def count_offenses_by_category( df: pd.DataFrame, id_col: str, @@ -129,21 +132,26 @@ def count_offenses_by_category( return out -# Time + Age extractors +# Time + Age extract + def extract_time_inputs(demo_row: Optional[pd.Series]) -> Optional[sm.TimeInputs]: """ Build sm.TimeInputs from the demographics row using configured columns. - Requires at least the two fields in DEFAULTS['require_time_fields']. + Requires at least the fields listed in DEFAULTS['require_time_fields']. 
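+
+    Example (values from README worked example 2): a demographics row with
+    "aggregate sentence in months" = 84 and "time served in years" = 5.6
+    yields current_sentence_months = 84.0 and completed_months = 67.2,
+    because to_months multiplies year-named columns by 12.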
""" if demo_row is None: return None - cur = to_months(demo_row.get(_cfg_col("current_sentence")), _cfg_col("current_sentence")) - com = to_months(demo_row.get(_cfg_col("completed_time")), _cfg_col("completed_time")) + cur_col = _cfg_col("current_sentence") + com_col = _cfg_col("completed_time") + pas_col = _cfg_col("past_time") + + cur = to_months(demo_row.get(cur_col), cur_col) + com = to_months(demo_row.get(com_col), com_col) + pas = ( - to_months(demo_row.get(_cfg_col("past_time")), _cfg_col("past_time")) - if _cfg_col("past_time") - else _cfg_default("missing_numeric", np.nan) + to_months(demo_row.get(pas_col), pas_col) + if pas_col else _cfg_default("missing_numeric", np.nan) ) req = tuple(_cfg_default("require_time_fields", ("current_sentence", "completed_time"))) @@ -152,7 +160,6 @@ def extract_time_inputs(demo_row: Optional[pd.Series]) -> Optional[sm.TimeInputs if (need_cur and np.isnan(cur)) or (need_com and np.isnan(com)): return None - # Childhood months come from config (no hard-coding) return sm.TimeInputs( current_sentence_months=cur, completed_months=com, @@ -162,19 +169,18 @@ def extract_time_inputs(demo_row: Optional[pd.Series]) -> Optional[sm.TimeInputs def extract_age_years(demo_row: Optional[pd.Series]) -> Optional[float]: - """ - Return age in years if present; else None. - Caller will SKIP the 'age' feature if this returns None. - """ + """Return age in years if present; else None (caller will skip age feature).""" if demo_row is None: return None col = _cfg_col("age_years") if col and (col in demo_row) and pd.notna(demo_row[col]): - return _to_float_or_nan(demo_row[col]) + v = _to_float_or_nan(demo_row[col]) + return None if np.isnan(v) else float(v) return None -# Exposure helpers +# Exposure / elapsed time + def _months_between(start: pd.Timestamp, end: pd.Timestamp) -> Optional[float]: """Return months between two timestamps (≈ days/30) or None if either is NaT.""" if pd.isna(start) or pd.isna(end): @@ -191,36 +197,28 @@ def _years_between(start: pd.Timestamp, end: pd.Timestamp) -> Optional[float]: return max(0.0, days / 365.25) -def _years_elapsed_from_commitments( +def years_elapsed_prior_curr_commitments( uid: str, current_df: pd.DataFrame, prior_df: pd.DataFrame, ) -> Optional[float]: """ - Compute years_elapsed for severity trend as: - first recorded prior commitment date → last recorded current commitment date. + Calculate elapsed years between: + first recorded PRIOR commitment date -> last recorded CURRENT commitment date. Uses optional config columns: - COLS["prior_commit_date"], COLS["current_commit_date"]. + COLS["prior_commit_date"], COLS["current_commit_date"] - Returns None if: - • any required column is missing, or - • there are no valid dates for this uid. + Returns None if required columns/dates are missing for this uid. 
""" - id_col = CFG.COLS.get("id") + id_col = _cfg_col("id") prior_date_col = _cfg_col("prior_commit_date") current_date_col = _cfg_col("current_commit_date") - if ( - prior_date_col is None - or current_date_col is None - or id_col is None - or prior_df is None - or current_df is None - ): + if id_col is None or prior_date_col is None or current_date_col is None: + return None + if prior_df is None or current_df is None: return None - - # Verify columns exist if ( id_col not in prior_df.columns or id_col not in current_df.columns @@ -231,36 +229,35 @@ def _years_elapsed_from_commitments( prior_sub = prior_df.loc[prior_df[id_col].astype(str) == str(uid)] curr_sub = current_df.loc[current_df[id_col].astype(str) == str(uid)] - if prior_sub.empty or curr_sub.empty: return None prior_dates = pd.to_datetime(prior_sub[prior_date_col], errors="coerce") curr_dates = pd.to_datetime(curr_sub[current_date_col], errors="coerce") - if prior_dates.notna().sum() == 0 or curr_dates.notna().sum() == 0: return None first_prior = prior_dates.min() last_current = curr_dates.max() - return _years_between(first_prior, last_current) -# Feature computation (public API) +# Feature computation API + def compute_features( uid: str, demo: pd.DataFrame, current_df: pd.DataFrame, prior_df: pd.DataFrame, lists: Dict[str, Any], + use_default_trend_years: bool = True, ) -> Tuple[Dict[str, float], Dict[str, Any]]: """ Compute name-keyed metrics for a single ID. Returns: feats: name→value dictionary (features are ONLY added when inputs are valid). - aux: auxiliary info useful for debugging/QA (time pieces, raw counts, etc.). + aux: auxiliary info for QA/debugging. """ cols = CFG.COLS row = get_row_by_id(demo, cols["id"], uid) @@ -268,21 +265,27 @@ def compute_features( feats: Dict[str, float] = {} aux: Dict[str, Any] = {} - # Determine exposure window (months) for frequency metrics - # Prefer global config; else compute per-person as months from (DOB+18y) → reference_date - per_person_exposure = _cfg_default("months_elapsed_total", None) + + # Frequency exposure (MONTHS) + + per_person_exposure = _cfg_default("months_elapsed_for_frequency", None) + if per_person_exposure is None and row is not None: dob_col, ref_col = _cfg_col("dob"), _cfg_col("reference_date") if dob_col and ref_col and (dob_col in row) and (ref_col in row): dob = pd.to_datetime(row.get(dob_col), errors="coerce") ref = pd.to_datetime(row.get(ref_col), errors="coerce") adulthood = (dob + DateOffset(years=18)) if pd.notna(dob) else pd.NaT - start = adulthood if pd.notna(adulthood) else dob # fall back to dob if adulthood missing + start = adulthood if pd.notna(adulthood) else dob per_person_exposure = _months_between(start, ref) - # Time (inside/outside) + aux["months_elapsed_for_frequency"] = per_person_exposure + + + # Time (pct/outside) + t = extract_time_inputs(row) - if t: + if t is not None: aux["time_inputs"] = t _, pct_completed, time_outside = sm.compute_time_vars(t, per_person_exposure) aux["pct_completed"] = pct_completed @@ -291,9 +294,11 @@ def compute_features( aux["pct_completed"] = np.nan aux["time_outside"] = np.nan - # Age (normalized) — SKIP IF MISSING + + # Age (optional) + age_val = extract_age_years(row) - if age_val is not None and not np.isnan(age_val): + if age_val is not None: feats["age"] = sm.score_age_norm( age_val, _cfg_default("age_min", None), @@ -301,9 +306,11 @@ def compute_features( ) aux["age_value"] = age_val else: - aux["age_value"] = np.nan # recorded for QA, but no 'age' feature added + aux["age_value"] = np.nan + # 
Convictions (current & prior)
+
     cur = count_offenses_by_category(current_df, cols["id"], uid, cols["current_offense_text"], lists)
     pri = count_offenses_by_category(prior_df, cols["id"], uid, cols["prior_offense_text"], lists)
     aux["counts_by_category"] = {"current": cur, "prior": pri}

     conv = sm.Convictions(
         curr_nonviolent=cur["nonviolent"],
         curr_violent=cur["violent"],
         past_nonviolent=pri["nonviolent"],
         past_violent=pri["violent"],
     )

-    # Descriptive proportions — only when denominators > 0
+    # Descriptive proportions
     if conv.curr_total > 0:
         feats["desc_nonvio_curr"] = sm.score_desc_nonvio_curr(conv.curr_nonviolent, conv.curr_total)
     if conv.past_total > 0:
         feats["desc_nonvio_past"] = sm.score_desc_nonvio_past(conv.past_nonviolent, conv.past_total)

-    # Frequency (rates) — require time_outside > 0 AND explicit bounds
+
+    # Frequency metrics
+
     minr, maxr = _cfg_default("freq_min_rate", None), _cfg_default("freq_max_rate", None)
     time_outside = aux["time_outside"]
     have_bounds = (minr is not None and maxr is not None and float(maxr) > float(minr))
@@ -336,41 +345,30 @@ def compute_features(
         feats["freq_total"] = sm.score_freq_total(conv.total, time_outside, minr, maxr)
     # else: skip both freq_* features

-    # Severity trend — only when both denominators > 0
+
+    # Severity trend
+
     if conv.curr_total > 0 and conv.past_total > 0:
-        # 1) Compute years elapsed from commitments
-        yrs_from_commits = _years_elapsed_from_commitments(uid, current_df, prior_df)
-        aux["years_elapsed_from_commitments"] = yrs_from_commits
+        yrs_from_commits = years_elapsed_prior_curr_commitments(uid, current_df, prior_df)
+        aux["years_elapsed_prior_curr_commitments"] = yrs_from_commits

-        # 2) Start with computed value, then optionally override via config
-        yrs_elapsed = yrs_from_commits
+        yrs_elapsed_for_trend = yrs_from_commits

-        override_years = getattr(CFG, "DEFAULT_TIME_ELAPSED_YEARS", None)
-        if override_years is not None:
-            # Config override (acts as default horizon when set)
+        # Only ONE override knob, from DEFAULTS
+        override_years = _cfg_default("years_elapsed_for_trend", None)
+        if use_default_trend_years and override_years is not None:
             try:
-                yrs_elapsed = float(override_years)
+                yrs_elapsed_for_trend = float(override_years)
             except Exception:
-                # If override is misconfigured, silently keep computed yrs_elapsed
-                pass
-            elif yrs_elapsed is None:
-                # Fallback if neither computed nor override is available
-                yrs_elapsed = 0.0
-
-            aux["years_elapsed_for_trend"] = yrs_elapsed
-
-            feats["severity_trend"] = sm.score_severity_trend(
-                conv.curr_violent_prop,
-                conv.past_violent_prop,
-                yrs_elapsed,
-            )
-    # else: skip severity_trend
-
-    # Rehabilitation / Education metrics:
-    # Intentionally omitted here because the public tables we load do not contain
-    # reliable program-credit fields. Per policy, we do not fabricate zeros.
-    # When a rehab credits source is provided (via config paths/columns or a join),
-    # callers should construct sm.RehabInputs and include these features; otherwise
-    # they are skipped and NOT added to the vector.
+ pass # keep computed value if override is invalid + + aux["years_elapsed_for_trend"] = yrs_elapsed_for_trend + + if yrs_elapsed_for_trend is not None: + feats["severity_trend"] = sm.score_severity_trend( + conv.curr_violent_prop, + conv.past_violent_prop, + yrs_elapsed_for_trend, + ) return feats, aux diff --git a/config.py b/config.py index 7a216c6..85486c0 100644 --- a/config.py +++ b/config.py @@ -1,75 +1,96 @@ # config.py +from __future__ import annotations + import os import math from typing import Any, Dict -# Profiles & Data Locations -PROFILE = os.getenv("CFG_PROFILE", "PROD") # "DEV" or "PROD" -COMMIT_SHA = os.getenv("DATA_COMMIT", "main") # pin for reproducibility - -PATHS_PROD = { - "demographics": f"https://raw.githubusercontent.com/redoio/offenses_data/{COMMIT_SHA}/data/demographics.csv", - "prior_commitments": f"https://raw.githubusercontent.com/redoio/offenses_data/{COMMIT_SHA}/data/prior_commitments.csv", - "current_commitments": f"https://raw.githubusercontent.com/redoio/offenses_data/{COMMIT_SHA}/data/current_commitments.csv", - # future optional (if available later): - # "rehab": "path-or-url-to-rehab-credits.csv", -} - -PATHS_DEV = { - "demographics": r"D:\Judge_bias_detection\milestone_2\demographics.csv", - "prior_commitments": r"D:\Judge_bias_detection\milestone_2\prior_commitments.csv", - "current_commitments": r"D:\Judge_bias_detection\milestone_2\current_commitments.csv", -} - -PATHS = PATHS_PROD if PROFILE == "PROD" else PATHS_DEV +# Data locations (single source of truth) +# +# Default behavior: +# - Reads from the offenses_data GitHub repo (raw URLs) +# Optional local override: +# - Set SIMILARITY_DATA_DIR to a folder containing: +# demographics.csv, prior_commitments.csv, current_commitments.csv + +COMMIT_SHA = os.getenv("DATA_COMMIT", "main") # pin for reproducibility +DATA_DIR = os.getenv("SIMILARITY_DATA_DIR", "").strip() + +if DATA_DIR: + PATHS: Dict[str, str] = { + "demographics": os.path.join(DATA_DIR, "demographics.csv"), + "prior_commitments": os.path.join(DATA_DIR, "prior_commitments.csv"), + "current_commitments": os.path.join(DATA_DIR, "current_commitments.csv"), + # future optional: + # "rehab": os.path.join(DATA_DIR, "rehab_credits.csv"), + } +else: + PATHS = { + "demographics": f"https://raw.githubusercontent.com/redoio/offenses_data/{COMMIT_SHA}/data/demographics.csv", + "prior_commitments": f"https://raw.githubusercontent.com/redoio/offenses_data/{COMMIT_SHA}/data/prior_commitments.csv", + "current_commitments": f"https://raw.githubusercontent.com/redoio/offenses_data/{COMMIT_SHA}/data/current_commitments.csv", + # future optional: + # "rehab": f"https://raw.githubusercontent.com/redoio/offenses_data/{COMMIT_SHA}/data/rehab_credits.csv", + } # Similarity / severity configuration + # Minimum number of overlapping (valid) features required for similarity. -# If the intersection size is < MIN_OVERLAP_FOR_SIMILARITY, all similarity -# measures (cosine, euclidean_sim, tanimoto, jaccard) should return NaN. +# If the intersection size is < MIN_OVERLAP_FOR_SIMILARITY, similarity measures return NaN. MIN_OVERLAP_FOR_SIMILARITY: int = 3 # Decay rate λ used in the severity_trend formula: -# severity_trend = Δv * exp(-λ * years_elapsed) +# severity_trend = Δv * exp(-λ * years_elapsed_for_trend) SEVERITY_DECAY_RATE: float = 0.15 # can be tuned as needed -# Global override for the years_elapsed used in severity_trend. -# Workflow: -# 1) By default, code computes elapsed years from -# first prior → last current commitment dates. 
-#   2) If DEFAULT_TIME_ELAPSED_YEARS is not None, it replaces
-#      the computed value and acts as the default horizon.
-DEFAULT_TIME_ELAPSED_YEARS: Any = 10.0
-
 # Column Map
+
 COLS: Dict[str, Any] = {
-    "id": "cdcno",  # REQUIRED identifier
-    "age_years": None,  # No age available -> feature skipped
-    "dob": None,  # optional (unused if None)
-    "reference_date": None,  # optional
-    # Time/term fields (optional; used if your compute code supports them)
+    "id": "cdcno",                     # REQUIRED identifier
+
+    # Optional / unavailable in current data -> feature skipped
+    "age_years": None,
+    "dob": None,
+    "reference_date": None,
+
+    # Time/term fields (optional; used if compute code supports them)
     "current_sentence": "aggregate sentence in months",
-    "completed_time": "time served in years",
-    "past_time": None,  # optional
+    "completed_time": "time served in years",
+    "past_time": None,                 # optional
+
     # Offense text fields (used by counting logic)
     "current_offense_text": "offense",
-    "prior_offense_text": "offense",
-    # Category text (ignored by compute if not used)
+    "prior_offense_text": "offense",
+
+    # Category text (ignored unless compute uses it)
     "current_category_text": "offense category",
-    "prior_category_text": "offense category",
+    "prior_category_text": "offense category",
+
+    # Optional commitment date columns (only needed if present in tables)
+    # "prior_commit_date": "commitment_date",
+    # "current_commit_date": "commitment_date",
 }

+
 # Defaults / Behavior Knobs
+
+# Aparna note: keep ALL tunable defaults in this one dict, so there are no
+# confusing "DEFAULT_*" module globals competing with the DEFAULTS entries.
 DEFAULTS: Dict[str, Any] = {
     "missing_numeric": math.nan,
     "require_time_fields": ("current_sentence", "completed_time"),

-    # Optional global exposure window (months) for frequency metrics.
-    # If None, the code computes a per-person window from (DOB+18y) to reference_date.
-    "months_elapsed_total": None,
+    # Frequency exposure window (MONTHS) used ONLY for freq_* metrics.
+    # If None, the code computes a per-person window from (DOB+18y) to reference_date.
+    "months_elapsed_for_frequency": None,
+
+    # Trend horizon override (YEARS) used ONLY for severity_trend.
+    # If None, compute from: (first prior commitment date → last current commitment date).
+    # If set (e.g., 10.0), it overrides the computed years when compute uses defaults.
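+    # Example: with the 10.0-year default below and Δv = 0.5, severity_trend
+    # = 0.5 * exp(-0.15 * 10.0) ≈ 0.112 (the value shown in README Example 1).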
+ "years_elapsed_for_trend": 10.0, # Age normalization (only used if age_years is present and valid) "age_min": 18.0, @@ -92,16 +113,18 @@ "rehab_advanced_credits": 0.0, }, "rehab_norm_bounds": { - "edu_general": (None, None), - "edu_advanced": (None, None), + "edu_general": (None, None), + "edu_advanced": (None, None), "rehab_general": (None, None), - "rehab_advanced":(None, None), + "rehab_advanced": (None, None), }, } -# Offense Policies (constants) + +# Offense Policies + OFFENSE_LISTS = { - "violent": ["187", "211", "245"], + "violent": ["187", "211", "245"], "nonviolent": ["459", "484", "10851"], } @@ -111,15 +134,23 @@ "strip_punctuation": True, } + # Metric Names & Weights + METRIC_NAMES = [ - "desc_nonvio_curr", "desc_nonvio_past", "age", - "freq_violent", "freq_total", "severity_trend", - "edu_general", "edu_advanced", "rehab_general", "rehab_advanced", + "desc_nonvio_curr", + "desc_nonvio_past", + "age", + "freq_violent", + "freq_total", + "severity_trend", + "edu_general", + "edu_advanced", + "rehab_general", + "rehab_advanced", ] METRIC_WEIGHTS: Dict[str, float] = { - # Age is a full metric (normalized and positively aligned with suitability) "age": 1.0, "desc_nonvio_curr": 1.0, "desc_nonvio_past": 1.0, @@ -138,7 +169,6 @@ "age": +1, "freq_violent": -1, "freq_total": -1, - # severity_trend is inversely related to suitability (ideal = 0) "severity_trend": -1, "edu_general": +1, "edu_advanced": +1, @@ -149,13 +179,12 @@ METRIC_RANGES: Dict[str, Any] = { "desc_nonvio_curr": (0.0, 1.0), "desc_nonvio_past": (0.0, 1.0), - # metric is normalized to [0,1] "age": (0.0, 1.0), "freq_violent": (DEFAULTS["freq_min_rate"], DEFAULTS["freq_max_rate"]), - "freq_total": (DEFAULTS["freq_min_rate"], DEFAULTS["freq_max_rate"]), + "freq_total": (DEFAULTS["freq_min_rate"], DEFAULTS["freq_max_rate"]), "severity_trend": (0.0, 1.0), - "edu_general": (0.0, 1.0), - "edu_advanced": (0.0, 1.0), + "edu_general": (0.0, 1.0), + "edu_advanced": (0.0, 1.0), "rehab_general": (0.0, 1.0), "rehab_advanced": (0.0, 1.0), } diff --git a/offense_helpers.py b/offense_helpers.py index b2d367a..7bd91b5 100644 --- a/offense_helpers.py +++ b/offense_helpers.py @@ -1,6 +1,26 @@ -# offense_helpers.py — STRICT, config-driven (Aparna-approved) +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +offense_helpers.py — strict, config-driven offense classification + +Design: +- Prefer numeric penal code extraction when present (e.g., "PC 187(a)" -> "187") +- Classify using config.OFFENSE_LISTS: + violent: explicit list + nonviolent: explicit list (or special "rest" mode) +- Optional "rest" mode: + * Either set OFFENSE_LISTS["nonviolent"] = "rest" + * OR set OFFENSE_POLICY["nonviolent_rest_mode"] = True + In either case: anything not in violent_list is treated as nonviolent. + +Notes: +- We intentionally do NOT implement case-insensitivity or punctuation stripping beyond numeric extraction, + because penal-code extraction is the primary normalization for these datasets. +""" + from __future__ import annotations -from typing import Any, Dict + +from typing import Any, Dict, Optional import re import config as CFG @@ -10,13 +30,12 @@ def _normalize_offense_token(x: Any) -> str: """ - Prefer a numeric penal code if present (e.g., 'PC 187(a)' -> '187'), + Prefer numeric penal code if present (e.g., 'PC 187(a)' -> '187'), else return the original string trimmed. 
IMPORTANT: - - Does NOT use OFFENSE_POLICY - - No lowercase conversion - - No punctuation stripping beyond numeric extraction + - Strict mode: does NOT apply OFFENSE_POLICY case folding / punctuation stripping. + - Numeric extraction already removes most formatting variance. """ if x is None: return "" @@ -27,48 +46,48 @@ def _normalize_offense_token(x: Any) -> str: return m.group(0) if m else s -def classify_offense(code_or_text: Any, lists: Dict[str, Any] | None = None) -> str: +def classify_offense(code_or_text: Any, lists: Optional[Dict[str, Any]] = None) -> str: """ - Strict classification using config.OFFENSE_LISTS. - Does NOT use OFFENSE_POLICY (even though it exists in config.py). + Strict classification using offense lists. Returns: "violent", "nonviolent", "other", or "clash" Logic: - 1. violent list is always explicit - 2. nonviolent list may be: - - explicit list - - "rest" meaning: everything not violent is nonviolent - 3. clash if token appears in both lists (rare, but safe) + 1) If token in both violent and nonviolent lists -> "clash" + 2) If token in violent list -> "violent" + 3) If nonviolent is explicit list: + token in list -> "nonviolent" else "other" + 4) If "rest" mode enabled: + anything not violent -> "nonviolent" + 5) fallback -> "other" """ - li = lists or CFG.OFFENSE_LISTS + li = lists if lists is not None else getattr(CFG, "OFFENSE_LISTS", {}) token = _normalize_offense_token(code_or_text) if token == "": return "other" violent_list = li.get("violent", []) or [] - non_list = li.get("nonviolent", []) + non_list = li.get("nonviolent", []) + + # Determine whether "rest mode" is enabled + policy = getattr(CFG, "OFFENSE_POLICY", {}) or {} + rest_mode = bool(policy.get("nonviolent_rest_mode", False)) or (non_list == "rest") is_v = token in violent_list - is_n = isinstance(non_list, list) and token in non_list + is_n = isinstance(non_list, list) and (token in non_list) - # Case 1: token appears in both lists → clash if is_v and is_n: return "clash" - - # Case 2: explicit violent if is_v: return "violent" - # Case 3: explicit nonviolent list if isinstance(non_list, list): - return "nonviolent" if is_n else "other" + return "nonviolent" if is_n else ("nonviolent" if rest_mode else "other") - # Case 4: nonviolent == "rest" mode - if non_list == "rest": + # non_list is not a list (e.g., "rest") + if rest_mode: return "nonviolent" - # Case 5: fallback return "other" diff --git a/run.py b/run.py index 86d9252..7dabdec 100644 --- a/run.py +++ b/run.py @@ -2,10 +2,16 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import argparse, json, math + +import argparse +import json +import math +import traceback from typing import Dict, Any, List, Optional + import pandas as pd from tqdm import tqdm + import config as CFG import sentencing_math as sm import compute_metrics as cm @@ -24,14 +30,35 @@ def _flatten_counts(prefix: str, d: Dict[str, Any]) -> Dict[str, Any]: def _load_ids(ids_csv: Optional[str], demo: pd.DataFrame) -> List[str]: id_col = CFG.COLS["id"] + if demo is None or id_col not in demo.columns: + raise ValueError( + f"Demographics table is missing required id column '{id_col}'. 
" + f"Available columns: {list(demo.columns) if demo is not None else 'None'}" + ) + if ids_csv: df_ids = pd.read_csv(ids_csv) if id_col not in df_ids.columns: raise ValueError(f"--ids-csv must contain a column named '{id_col}' (from config.COLS['id']).") return df_ids[id_col].astype(str).dropna().unique().tolist() + return demo[id_col].astype(str).dropna().unique().tolist() +def _ensure_dense_metrics(feats: Dict[str, Any]) -> Dict[str, Any]: + """ + If CFG.METRIC_NAMES exists, ensure all metric columns appear (NaN if missing). + This makes population outputs schema-stable. + """ + names = getattr(CFG, "METRIC_NAMES", None) + if not names: + return feats + out = dict(feats) + for k in names: + out.setdefault(k, math.nan) + return out + + def main(): ap = argparse.ArgumentParser(description="Compute population-level sentencing metrics.") ap.add_argument("--out", default="population_metrics.csv") @@ -39,19 +66,21 @@ def main(): ap.add_argument("--ids-csv", default=None) ap.add_argument("--limit", type=int, default=None) ap.add_argument("--include-aux", action="store_true") + ap.add_argument("--dense", action="store_true", help="Include all CFG.METRIC_NAMES columns (NaN if missing).") ap.add_argument("--print-every", type=int, default=0) ap.add_argument("--fail-fast", action="store_true") + ap.add_argument("--tracebacks", action="store_true", help="Include stack traces in the .errors.jsonl file.") args = ap.parse_args() # Load source tables demo = cm.read_table(CFG.PATHS["demographics"]) - cur = cm.read_table(CFG.PATHS["current_commitments"]) - pri = cm.read_table(CFG.PATHS["prior_commitments"]) + cur = cm.read_table(CFG.PATHS["current_commitments"]) + pri = cm.read_table(CFG.PATHS["prior_commitments"]) # Policy knobs - lists = getattr(CFG, "OFFENSE_LISTS", {"violent": [], "nonviolent": []}) - weights = getattr(CFG, "METRIC_WEIGHTS", getattr(CFG, "WEIGHTS", {})) - directions = getattr(CFG, "METRIC_DIRECTIONS", {}) + lists = getattr(CFG, "OFFENSE_LISTS", {"violent": [], "nonviolent": []}) + weights = getattr(CFG, "METRIC_WEIGHTS", getattr(CFG, "WEIGHTS", {})) + directions = getattr(CFG, "METRIC_DIRECTIONS", {}) # Who to run ids = _load_ids(args.ids_csv, demo) @@ -67,6 +96,9 @@ def main(): try: feats, aux = cm.compute_features(str(uid), demo, cur, pri, lists) + # Optional: stable schema (include NaNs for missing metrics) + feats_out = _ensure_dense_metrics(feats) if args.dense else feats + # Final suitability as ratio + parts: # numerator = w · m (dot with actual metrics) # denom = w · x* (dot with best-case vector) @@ -77,7 +109,7 @@ def main(): return_parts=True, ) - # NaN/None/0 safe handling + # NaN/None/0 safe handling no_denom = ( denom is None or denom == 0 @@ -88,23 +120,22 @@ def main(): score_ratio_safe = math.nan numerator_safe = math.nan denom_safe = math.nan - evaluated_flag = 0 # "not evaluated / insufficient data" + evaluated_flag = 0 # not evaluated / insufficient data score_pct_of_out = math.nan else: score_ratio_safe = float(score_ratio) numerator_safe = float(numerator) denom_safe = float(denom) - evaluated_flag = 1 # "evaluated" - # <-- HERE is the NaN-safe percentage + evaluated_flag = 1 # evaluated score_pct_of_out = (numerator_safe / denom_safe) * 100.0 record: Dict[str, Any] = { - CFG.COLS["id"]: uid, - **feats, + CFG.COLS["id"]: str(uid), + **feats_out, "score": numerator_safe, "score_out_of": denom_safe, "score_ratio": score_ratio_safe, - "score_pct_of_out": score_pct_of_out, # NEW + "score_pct_of_out": score_pct_of_out, "evaluated": evaluated_flag, } @@ -122,18 
+153,33 @@ def main(): except Exception as e: if args.fail_fast: raise - errors.append({CFG.COLS["id"]: uid, "error": f"{type(e).__name__}: {e}"}) + err_rec: Dict[str, Any] = { + CFG.COLS["id"]: str(uid), + "error": f"{type(e).__name__}: {e}", + } + if args.tracebacks: + err_rec["traceback"] = traceback.format_exc() + errors.append(err_rec) finally: pbar.update(1) pbar.close() out_df = pd.DataFrame(rows) - # Put ID first + # Column ordering: id, metrics (if configured), score fields, then the rest id_col = CFG.COLS["id"] - cols = out_df.columns.tolist() - if id_col in cols: - out_df = out_df[[id_col] + [c for c in cols if c != id_col]] + metric_order = getattr(CFG, "METRIC_NAMES", []) + score_order = ["score_ratio", "score", "score_out_of", "score_pct_of_out", "evaluated"] + + ordered: List[str] = [] + for c in [id_col] + list(metric_order) + score_order: + if c in out_df.columns and c not in ordered: + ordered.append(c) + # append any remaining columns + for c in out_df.columns: + if c not in ordered: + ordered.append(c) + out_df = out_df[ordered] if not out_df.empty else out_df # Write out_fmt = ( @@ -156,7 +202,7 @@ def main(): f.write(json.dumps(rec) + "\n") print(f"Encountered {len(errors)} errors. Details → {err_path}") - # Preview a few key columns if present + # Preview if not out_df.empty: preferred = [id_col, "score_ratio", "score", "score_out_of", "score_pct_of_out", "evaluated"] extra = [c for c in out_df.columns if c not in preferred][:5]
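
For reference, a typical invocation of the updated runner combines the local-data override from `config.py` with the new flags added in this patch (paths are hypothetical):

```bash
# Optional: read the three CSVs locally instead of the raw GitHub URLs
export SIMILARITY_DATA_DIR=/path/to/local_csvs

# Dense, schema-stable output (every CFG.METRIC_NAMES column, NaN when skipped),
# with stack traces captured in the .errors.jsonl sidecar
python run.py --out population_metrics.csv --dense --tracebacks --limit 100
```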