From 2dd4f4839fe866d57a63ae4ba18ccaef5d3b54fb Mon Sep 17 00:00:00 2001
From: Taufia Hussain
Date: Tue, 3 Feb 2026 13:30:18 +0100
Subject: [PATCH] feat: sync config/compute_metrics/run + README with
 similarity_scoring

---
 README.md          |  88 +++++++++++----------
 compute_metrics.py | 186 ++++++++++++++++++++++-----------------------
 config.py          | 143 ++++++++++++++++++++--------------
 offense_helpers.py |  69 +++++++++++------
 run.py             |  84 +++++++++++++++-----
 5 files changed, 333 insertions(+), 237 deletions(-)

diff --git a/README.md b/README.md
index 6211d98..c64566c 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,9 @@
-# Introduction
+# population_metrics

 Batch runner to compute **population-level sentencing metrics** and **suitability scores** for all individuals, writing a flat file (CSV/Parquet). The pipeline is strict about missing inputs: when nothing can be evaluated for a person, we emit NaNs instead of 0 so the case can be flagged; metrics are **skipped** when their prerequisites aren’t present (no fabricated values). Metrics are **named and extensible**; new metrics can be added without changing positional order.

-## Contents
+## Repo contents
-- `config.py` — Paths (DEV/PROD), column map (`COLS`), defaults (`DEFAULTS`), offense lists (`OFFENSE_LISTS`), and metric weights (`METRIC_WEIGHTS`).
+- `config.py` — Paths (GitHub raw URLs, or a local `SIMILARITY_DATA_DIR` override), column map (`COLS`), defaults (`DEFAULTS`), offense lists (`OFFENSE_LISTS`), and metric weights (`METRIC_WEIGHTS`).
 - `compute_metrics.py` — Library functions to read raw tables and compute **named features** for a single ID (skip-if-missing).
 - `sentencing_math.py` — Pure math (no I/O): time decomposition, proportions, frequency/trend, rehab, and name-based suitability.
@@ -63,38 +63,38 @@ This allows downstream tools to tell “not evaluated / insufficient data” apa
 - Errors (if any): `*.errors.jsonl` with `{id, error}` records.
 - Console preview prints the first rows/columns for a quick check.

-## Worked Examples
+## Worked examples (from scratch)
 These examples walk through **exactly** what the pipeline computes for a specific ID: counts → denominators → proportions → time pieces → trend/frequency → named vector → suitability. The LaTeX below **matches the paper** notation.

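+Before the examples, a compact sketch of the scoring rule in plain Python (the helper name `suitability_ratio` is illustrative, not the library API): the numerator sums `w·m` over the metrics actually present; the denominator sums `w·x*`, where `x* = 1` for positive-direction metrics and `x* = 0` for negative-direction ones.
+
+```python
+import math
+
+def suitability_ratio(feats: dict, weights: dict, directions: dict) -> float:
+    """Name-based score; NaN when nothing is evaluable (no fabricated zeros)."""
+    num = sum(weights[k] * v for k, v in feats.items())
+    den = sum(weights[k] * (1.0 if directions[k] > 0 else 0.0) for k in feats)
+    return math.nan if den == 0 else num / den
+
+# Example 2 below: {0.000, 0.000, age 0.278, trend 0.000} -> 0.278 / 3.000 ≈ 9.3%
+```
+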
-### Example 1
+### Example 1 (real data)
 **CDCR ID:** `00173d8423`

 **Offense Lists (active for this run)**
 - Violent: `['187', '211', '245']`
 - Nonviolent: `['459', '484', '10851']`

 #### Inputs
 - Current offense rows found: **11**
 - Prior offense rows found: **6**

 #### Counts by Category
 - Current: {'violent': 1, 'nonviolent': 1, 'other': 9, 'clash': 0}
 - Prior: {'violent': 0, 'nonviolent': 4, 'other': 2, 'clash': 0}

 #### Time Pieces
 - `current_sentence_months` = 10000.000
 - `completed_months` = 330.000
 - `past_time_months` = NA
 - `pct_current_completed` = 3.300
 - `time_outside_months` = 0.000

-**Definition:**
+**Paper definition (Eq. B.2–15):**
 $$
 \mathrm{out}^t_i = t_d - \mathrm{in}^{(\mathrm{vio+nonvio}),t}_i - \text{childhood}.
 $$

-#### Calculations
+#### Calculations (refer to the LaTeX formulas section below)
 - `desc_nonvio_curr = 1/2 = 0.500` (see Eq. **DESC-NONVIO-CURR**)
 - `desc_nonvio_past = 4/4 = 1.000` (see Eq. **DESC-NONVIO-PAST**)
@@ -108,13 +108,13 @@

 - Frequency (per month outside):
   - `raw_freq_violent = NA; raw_freq_total = NA`
-  - `normalized: **SKIPPED**` (requires `time_outside > 0`, `freq_min_rate` and `freq_max_rate`)
+  - `normalized`: **SKIPPED** (requires `time_outside > 0` and `freq_min_rate`/`freq_max_rate`; see Eqs. **FREQ-VIO**, **FREQ-TOTAL**)

 - Age (min–max):
   - `age_raw = 38.000`, `min = 18.000`, `max = 90.000` → `age = 0.278` (see Eq. **AGE-NORM**)

-#### Final Metric Vector
+#### Final Metric Vector (named)
 Order: `desc_nonvio_curr, desc_nonvio_past, age, freq_violent, freq_total, severity_trend, edu_general, edu_advanced, rehab_general, rehab_advanced`
 Values: `[0.500, 1.000, 0.278, SKIPPED, SKIPPED, 0.112, SKIPPED, SKIPPED, SKIPPED, SKIPPED]`

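+The trend and age entries in this vector can be re-derived by hand. A minimal check (λ = `SEVERITY_DECAY_RATE` and the 10-year default horizon both come from `config.py`):
+
+```python
+import math
+
+lam, years = 0.15, 10.0    # SEVERITY_DECAY_RATE, DEFAULTS["years_elapsed_for_trend"]
+curr_vio_prop = 1 / 2      # current: violent / (violent + nonviolent)
+past_vio_prop = 0 / 4      # prior:   violent / (violent + nonviolent)
+
+print(round((curr_vio_prop - past_vio_prop) * math.exp(-lam * years), 3))  # 0.112
+print(round((38.0 - 18.0) / (90.0 - 18.0), 3))                             # 0.278 (age)
+```
+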
@@ -123,35 +123,35 @@

 ### Example 2
 **CDCR ID:** `0029029e5b`

 **Offense Lists (active for this run)**
 - Violent: `['187', '211', '245']`
 - Nonviolent: `['459', '484', '10851']`

 #### Inputs
 - Current offense rows found: **1**
 - Prior offense rows found: **2**

 #### Counts by Category
 - Current: {'violent': 1, 'nonviolent': 0, 'other': 0, 'clash': 0}
 - Prior: {'violent': 2, 'nonviolent': 0, 'other': 0, 'clash': 0}

 #### Time Pieces
 - `current_sentence_months` = 84.000
 - `completed_months` = 67.200
 - `past_time_months` = NA
 - `pct_current_completed` = 80.000
 - `time_outside_months` = 0.000

-**Definition:**
+**Paper definition (Eq. B.2–15):**
 $$
 \mathrm{out}^t_i = t_d - \mathrm{in}^{(\mathrm{vio+nonvio}),t}_i - \text{childhood}.
 $$

-#### Calculations
+#### Calculations (refer to the LaTeX formulas section below)
 - `desc_nonvio_curr = 0/1 = 0.000` (see Eq. **DESC-NONVIO-CURR**)
 - `desc_nonvio_past = 0/2 = 0.000` (see Eq. **DESC-NONVIO-PAST**)

 - Frequency (per month outside):
   - `violent_total = 3; total_conv = 3; time_outside = 0.000`
   - `raw_freq_violent = NA; raw_freq_total = NA`
-  - `normalized: **SKIPPED**` (requires `time_outside > 0`, `freq_min_rate` and `freq_max_rate`)
+  - `normalized`: **SKIPPED** (requires `time_outside > 0` and `freq_min_rate`/`freq_max_rate`; see Eqs. **FREQ-VIO**, **FREQ-TOTAL**)

 - Age (min–max):
   - `age_raw = 38.000`, `min = 18.000`, `max = 90.000` → `age = 0.278` (see Eq. **AGE-NORM**)

-#### Final Metric Vector
+#### Final Metric Vector (named)
 Order: `desc_nonvio_curr, desc_nonvio_past, age, freq_violent, freq_total, severity_trend, edu_general, edu_advanced, rehab_general, rehab_advanced`
 Values: `[0.000, 0.000, 0.278, SKIPPED, SKIPPED, 0.000, SKIPPED, SKIPPED, SKIPPED, SKIPPED]`
 **Score:** `0.278` (out of `3.000`) — **9.3% of maximum**
 **Contributing metrics:** `age, desc_nonvio_curr, desc_nonvio_past, severity_trend`

 ### Re‑generate these examples
 **macOS/Linux**
 ```bash
 CFG_PROFILE=DEV python docs_1/make_worked_example.py --uid "0029029e5b" --violent "187,211,245" --nonviolent "459,484,10851" --age-years 38 --exposure-months 480 --freq-bounds "0,0.05" --out docs_1/README_worked_example_0029029e5b.md
 ```

 **Windows (PowerShell/CMD)**
 ```bash
 python docs_1\make_worked_example.py --uid "0029029e5b" --violent "187,211,245" --nonviolent "459,484,10851" --age-years 38 --exposure-months 480 --freq-bounds "0,0.05" --out "docs_1\README_worked_example_0029029e5b.md"
 python docs_1\make_worked_example.py --uid "00173d8423" --violent "187,211,245" --nonviolent "459,484,10851" --age-years 38 --exposure-months 480 --freq-bounds "0,0.05" --out "docs_1\README_worked_example_00173d8423.md"
 ```

-## Formulas Implemented
+## Formulas implemented (LaTeX)
 - **Descriptive proportions:**
 $$
@@ -303,7 +303,7 @@ and `x_k* = 0` for `d_k = −1` (negative-direction metrics).
 > • Frequency requires **both** `time_outside > 0` **and** configured `freq_min_rate`/`freq_max_rate`.
 > • Rehab/education are per‑month‑inside, then min–max normalized **only if** inputs and bounds are provided; otherwise **omitted**.

-## Validation Checklist
+## Validation checklist
 - Proportion metrics are computed **only** when denominators \(> 0\); otherwise the metric is **SKIPPED**.
 - Frequency requires **both** `time_outside > 0` **and** `freq_min_rate`/`freq_max_rate` in `config.py`.
 - Offense classification uses only `OFFENSE_LISTS`; anything unlisted → **other** (and does not contribute to denominators).
@@ -311,8 +311,7 @@ and `x_k* = 0` for `d_k = −1` (negative-direction metrics).
 - When comparing individuals (similarity), compute on the **intersection of present features** and require a minimum shared‑dimension count (e.g., ≥3; see the sketch below). Consider also Euclidean or Tanimoto for sensitivity analysis.
 - If no metrics pass the gating (denominators 0, missing exposure, missing age, etc.), the scorer returns NaN (or None, depending on the runner) and sets `evaluated = 0`. This is intentional: we do not fabricate zeros for unevaluable people.

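+A minimal sketch of that intersection rule (the function name `cosine_on_shared` is illustrative; `MIN_OVERLAP_FOR_SIMILARITY = 3` comes from `config.py`):
+
+```python
+import math
+
+def cosine_on_shared(a: dict, b: dict, min_overlap: int = 3) -> float:
+    """Cosine similarity over features BOTH people have; NaN if overlap is too small."""
+    shared = [k for k in a if k in b and not math.isnan(a[k]) and not math.isnan(b[k])]
+    if len(shared) < min_overlap:
+        return math.nan  # not comparable: too few shared dimensions
+    dot = sum(a[k] * b[k] for k in shared)
+    na = math.sqrt(sum(a[k] ** 2 for k in shared))
+    nb = math.sqrt(sum(b[k] ** 2 for k in shared))
+    return math.nan if na == 0.0 or nb == 0.0 else dot / (na * nb)
+```
+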
-## Programmatic Example
-```python
+## Programmatic example
+```python
 import math
 import config as CFG
 import compute_metrics as cm
 import sentencing_math as sm
 import pandas as pd

 demo = cm.read_table(CFG.PATHS["demographics"])
 cur = cm.read_table(CFG.PATHS["current_commitments"])
 pri = cm.read_table(CFG.PATHS["prior_commitments"])

 ids = demo[CFG.COLS["id"]].astype(str).dropna().unique().tolist()[:3]

 rows = []
 for uid in ids:
-    feats, aux = cm.compute_features(uid, demo, cur, pri, CFG.OFFENSE_LISTS)
+    feats, aux = cm.compute_features(str(uid), demo, cur, pri, CFG.OFFENSE_LISTS)
+    # NOTE: feats is "skip-if-missing" — it may not contain every metric in CFG.METRIC_NAMES

     # name-based suitability; may return NaN/None if no evaluable metrics
     score_ratio, num, den = sm.suitability_score_named(
@@ -349,25 +349,28 @@ for uid in ids:
         score_ratio_safe = math.nan
         num_safe = math.nan
         den_safe = math.nan
+        score_pct_of_out = math.nan
         evaluated = 0
     else:
         score_ratio_safe = float(score_ratio)
         num_safe = float(num)
         den_safe = float(den)
+        score_pct_of_out = (num_safe / den_safe) * 100.0
         evaluated = 1

-    # Optional: expose time_outside if present in aux
+    # Optional: expose aux fields if present
     time_outside_months = aux.get("time_outside")
     pct_completed = aux.get("pct_completed")

     rows.append(
         {
-            CFG.COLS["id"]: uid,
-            **feats,  # all computed named metrics
-            "score": num_safe,  # numerator (Σ w·m)
-            "score_out_of": den_safe,  # denominator (Σ w·x*)
+            CFG.COLS["id"]: str(uid),
+            **feats,                   # computed named metrics (may be a subset)
+            "score": num_safe,         # numerator (Σ w·m)
+            "score_out_of": den_safe,  # denominator (Σ w·x*)
             "score_ratio": score_ratio_safe,
-            "evaluated": evaluated,  # 1 = evaluated, 0 = not evaluable
+            "score_pct_of_out": score_pct_of_out,
+            "evaluated": evaluated,  # 1 = evaluated, 0 = not evaluable
             "time_outside_months": time_outside_months,
             "pct_completed": pct_completed,
         }
@@ -375,6 +378,7 @@ for uid in ids:

 df = pd.DataFrame(rows)
 print(df.head())
+
 ```

 ## Troubleshooting
diff --git a/compute_metrics.py b/compute_metrics.py
index 6ad7bc4..a1ab486 100644
--- a/compute_metrics.py
+++ b/compute_metrics.py
@@ -1,42 +1,42 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-compute_metrics.py — parameterized, raw-data–oriented (library only)
+compute_metrics.py — parameterized, raw-data–oriented feature computation (no CLI)

-Pipeline (library functions):
+Pipeline:
     read → parse → classify(offenses) → time → features
-Scoring/printing should be handled by a separate runner (e.g., run_compute_metrics.py).

 Relies on:
-  - config.py: PATHS, COLS, DEFAULTS, METRIC_WEIGHTS, DEFAULT_TIME_ELAPSED_YEARS
+  - config.py: PATHS, COLS, DEFAULTS, OFFENSE_LISTS, OFFENSE_POLICY
   - sentencing_math.py: pure math helpers (imported as sm)
-  - offense_helpers.py: classify_offense (uses OFFENSE_LISTS/OFFENSE_POLICY from config)
+  - offense_helpers.py: classify_offense (uses OFFENSE_LISTS/OFFENSE_POLICY)

 Design notes:
   • Missing numerics remain NaN (configurable via DEFAULTS["missing_numeric"]).
   • Time fields are unit-aware by column name: "...year..."→*12, "...day..."→/30, else months.
-  • Convictions are handled via sm.Convictions (single class).
- • Weights are NAME-BASED and should be applied outside this module. - • Exposure window: uses DEFAULTS["months_elapsed_total"] if provided; otherwise - computes per-person exposure as months from (DOB+18y) to reference_date. - • Severity trend: years_elapsed is computed from commitment tables - (first prior commitment date → last current commitment date); if - DEFAULT_TIME_ELAPSED_YEARS is not None, that value overrides the - computed years when scoring severity_trend. • STRICT SKIP-IF-MISSING: features are ONLY added when inputs are valid. + • Frequency exposure window (months) uses DEFAULTS["months_elapsed_for_frequency"] if provided, + else computes a per-person exposure window (dob/reference_date if available). + • Severity trend horizon (years) is computed from commitments (first prior → last current), + optionally overridden by DEFAULTS["years_elapsed_for_trend"] when use_default_trend_years=True. """ + from __future__ import annotations from typing import Any, Dict, Optional, Tuple + +import math import numpy as np import pandas as pd from pandas.tseries.offsets import DateOffset + import config as CFG import sentencing_math as sm from offense_helpers import classify_offense # Small config helpers + def _cfg_col(name: str) -> Optional[str]: """Return configured column name (or None) for a logical field.""" return getattr(CFG, "COLS", {}).get(name) @@ -47,7 +47,8 @@ def _cfg_default(key: str, fallback: Any) -> Any: return getattr(CFG, "DEFAULTS", {}).get(key, fallback) -# I/O (CSV/XLSX via pandas) +# I/O helpers + def _to_raw_github_url(path: str) -> str: """Allow GitHub 'blob' URLs in config by converting them to 'raw' URLs.""" if not isinstance(path, str): @@ -81,7 +82,8 @@ def get_row_by_id(df: pd.DataFrame, id_col: str, uid: str) -> Optional[pd.Series return None if sub.empty else sub.iloc[0] -# Parsing (NaN-honest) +# Parsing helpers + def _to_float_or_nan(x: Any) -> float: """Parse numeric strings safely; return NaN if missing/invalid.""" try: @@ -110,7 +112,8 @@ def to_months(val: Any, colname: Optional[str]) -> float: return x -# Offense counting (uses classify_offense from offense_helpers.py) +# Offense counting utilities + def count_offenses_by_category( df: pd.DataFrame, id_col: str, @@ -129,21 +132,26 @@ def count_offenses_by_category( return out -# Time + Age extractors +# Time + Age extract + def extract_time_inputs(demo_row: Optional[pd.Series]) -> Optional[sm.TimeInputs]: """ Build sm.TimeInputs from the demographics row using configured columns. - Requires at least the two fields in DEFAULTS['require_time_fields']. + Requires at least the fields listed in DEFAULTS['require_time_fields']. 
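+
+    Example (values from README worked example 2): a demographics row with
+    "aggregate sentence in months" = 84 and "time served in years" = 5.6
+    yields current_sentence_months = 84.0 and completed_months = 67.2,
+    because to_months multiplies year-named columns by 12.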
""" if demo_row is None: return None - cur = to_months(demo_row.get(_cfg_col("current_sentence")), _cfg_col("current_sentence")) - com = to_months(demo_row.get(_cfg_col("completed_time")), _cfg_col("completed_time")) + cur_col = _cfg_col("current_sentence") + com_col = _cfg_col("completed_time") + pas_col = _cfg_col("past_time") + + cur = to_months(demo_row.get(cur_col), cur_col) + com = to_months(demo_row.get(com_col), com_col) + pas = ( - to_months(demo_row.get(_cfg_col("past_time")), _cfg_col("past_time")) - if _cfg_col("past_time") - else _cfg_default("missing_numeric", np.nan) + to_months(demo_row.get(pas_col), pas_col) + if pas_col else _cfg_default("missing_numeric", np.nan) ) req = tuple(_cfg_default("require_time_fields", ("current_sentence", "completed_time"))) @@ -152,7 +160,6 @@ def extract_time_inputs(demo_row: Optional[pd.Series]) -> Optional[sm.TimeInputs if (need_cur and np.isnan(cur)) or (need_com and np.isnan(com)): return None - # Childhood months come from config (no hard-coding) return sm.TimeInputs( current_sentence_months=cur, completed_months=com, @@ -162,19 +169,18 @@ def extract_time_inputs(demo_row: Optional[pd.Series]) -> Optional[sm.TimeInputs def extract_age_years(demo_row: Optional[pd.Series]) -> Optional[float]: - """ - Return age in years if present; else None. - Caller will SKIP the 'age' feature if this returns None. - """ + """Return age in years if present; else None (caller will skip age feature).""" if demo_row is None: return None col = _cfg_col("age_years") if col and (col in demo_row) and pd.notna(demo_row[col]): - return _to_float_or_nan(demo_row[col]) + v = _to_float_or_nan(demo_row[col]) + return None if np.isnan(v) else float(v) return None -# Exposure helpers +# Exposure / elapsed time + def _months_between(start: pd.Timestamp, end: pd.Timestamp) -> Optional[float]: """Return months between two timestamps (≈ days/30) or None if either is NaT.""" if pd.isna(start) or pd.isna(end): @@ -191,36 +197,28 @@ def _years_between(start: pd.Timestamp, end: pd.Timestamp) -> Optional[float]: return max(0.0, days / 365.25) -def _years_elapsed_from_commitments( +def years_elapsed_prior_curr_commitments( uid: str, current_df: pd.DataFrame, prior_df: pd.DataFrame, ) -> Optional[float]: """ - Compute years_elapsed for severity trend as: - first recorded prior commitment date → last recorded current commitment date. + Calculate elapsed years between: + first recorded PRIOR commitment date -> last recorded CURRENT commitment date. Uses optional config columns: - COLS["prior_commit_date"], COLS["current_commit_date"]. + COLS["prior_commit_date"], COLS["current_commit_date"] - Returns None if: - • any required column is missing, or - • there are no valid dates for this uid. + Returns None if required columns/dates are missing for this uid. 
""" - id_col = CFG.COLS.get("id") + id_col = _cfg_col("id") prior_date_col = _cfg_col("prior_commit_date") current_date_col = _cfg_col("current_commit_date") - if ( - prior_date_col is None - or current_date_col is None - or id_col is None - or prior_df is None - or current_df is None - ): + if id_col is None or prior_date_col is None or current_date_col is None: + return None + if prior_df is None or current_df is None: return None - - # Verify columns exist if ( id_col not in prior_df.columns or id_col not in current_df.columns @@ -231,36 +229,35 @@ def _years_elapsed_from_commitments( prior_sub = prior_df.loc[prior_df[id_col].astype(str) == str(uid)] curr_sub = current_df.loc[current_df[id_col].astype(str) == str(uid)] - if prior_sub.empty or curr_sub.empty: return None prior_dates = pd.to_datetime(prior_sub[prior_date_col], errors="coerce") curr_dates = pd.to_datetime(curr_sub[current_date_col], errors="coerce") - if prior_dates.notna().sum() == 0 or curr_dates.notna().sum() == 0: return None first_prior = prior_dates.min() last_current = curr_dates.max() - return _years_between(first_prior, last_current) -# Feature computation (public API) +# Feature computation API + def compute_features( uid: str, demo: pd.DataFrame, current_df: pd.DataFrame, prior_df: pd.DataFrame, lists: Dict[str, Any], + use_default_trend_years: bool = True, ) -> Tuple[Dict[str, float], Dict[str, Any]]: """ Compute name-keyed metrics for a single ID. Returns: feats: name→value dictionary (features are ONLY added when inputs are valid). - aux: auxiliary info useful for debugging/QA (time pieces, raw counts, etc.). + aux: auxiliary info for QA/debugging. """ cols = CFG.COLS row = get_row_by_id(demo, cols["id"], uid) @@ -268,21 +265,27 @@ def compute_features( feats: Dict[str, float] = {} aux: Dict[str, Any] = {} - # Determine exposure window (months) for frequency metrics - # Prefer global config; else compute per-person as months from (DOB+18y) → reference_date - per_person_exposure = _cfg_default("months_elapsed_total", None) + + # Frequency exposure (MONTHS) + + per_person_exposure = _cfg_default("months_elapsed_for_frequency", None) + if per_person_exposure is None and row is not None: dob_col, ref_col = _cfg_col("dob"), _cfg_col("reference_date") if dob_col and ref_col and (dob_col in row) and (ref_col in row): dob = pd.to_datetime(row.get(dob_col), errors="coerce") ref = pd.to_datetime(row.get(ref_col), errors="coerce") adulthood = (dob + DateOffset(years=18)) if pd.notna(dob) else pd.NaT - start = adulthood if pd.notna(adulthood) else dob # fall back to dob if adulthood missing + start = adulthood if pd.notna(adulthood) else dob per_person_exposure = _months_between(start, ref) - # Time (inside/outside) + aux["months_elapsed_for_frequency"] = per_person_exposure + + + # Time (pct/outside) + t = extract_time_inputs(row) - if t: + if t is not None: aux["time_inputs"] = t _, pct_completed, time_outside = sm.compute_time_vars(t, per_person_exposure) aux["pct_completed"] = pct_completed @@ -291,9 +294,11 @@ def compute_features( aux["pct_completed"] = np.nan aux["time_outside"] = np.nan - # Age (normalized) — SKIP IF MISSING + + # Age (optional) + age_val = extract_age_years(row) - if age_val is not None and not np.isnan(age_val): + if age_val is not None: feats["age"] = sm.score_age_norm( age_val, _cfg_default("age_min", None), @@ -301,9 +306,11 @@ def compute_features( ) aux["age_value"] = age_val else: - aux["age_value"] = np.nan # recorded for QA, but no 'age' feature added + aux["age_value"] = np.nan + # 
Convictions (current & prior)
+
     cur = count_offenses_by_category(current_df, cols["id"], uid, cols["current_offense_text"], lists)
     pri = count_offenses_by_category(prior_df, cols["id"], uid, cols["prior_offense_text"], lists)
     aux["counts_by_category"] = {"current": cur, "prior": pri}

     conv = sm.Convictions(
         curr_nonviolent=cur["nonviolent"],
         curr_violent=cur["violent"],
         past_nonviolent=pri["nonviolent"],
         past_violent=pri["violent"],
     )

-    # Descriptive proportions — only when denominators > 0
+    # Descriptive proportions
     if conv.curr_total > 0:
         feats["desc_nonvio_curr"] = sm.score_desc_nonvio_curr(conv.curr_nonviolent, conv.curr_total)
     if conv.past_total > 0:
         feats["desc_nonvio_past"] = sm.score_desc_nonvio_past(conv.past_nonviolent, conv.past_total)

-    # Frequency (rates) — require time_outside > 0 AND explicit bounds
+
+    # Frequency metrics
+
     minr, maxr = _cfg_default("freq_min_rate", None), _cfg_default("freq_max_rate", None)
     time_outside = aux["time_outside"]
     have_bounds = (minr is not None and maxr is not None and float(maxr) > float(minr))
@@ -336,41 +345,30 @@ def compute_features(
         feats["freq_total"] = sm.score_freq_total(conv.total, time_outside, minr, maxr)
     # else: skip both freq_* features

-    # Severity trend — only when both denominators > 0
+
+    # Severity trend
+
     if conv.curr_total > 0 and conv.past_total > 0:
-        # 1) Compute years elapsed from commitments
-        yrs_from_commits = _years_elapsed_from_commitments(uid, current_df, prior_df)
-        aux["years_elapsed_from_commitments"] = yrs_from_commits
+        yrs_from_commits = years_elapsed_prior_curr_commitments(uid, current_df, prior_df)
+        aux["years_elapsed_prior_curr_commitments"] = yrs_from_commits

-        # 2) Start with computed value, then optionally override via config
-        yrs_elapsed = yrs_from_commits
+        yrs_elapsed_for_trend = yrs_from_commits

-        override_years = getattr(CFG, "DEFAULT_TIME_ELAPSED_YEARS", None)
-        if override_years is not None:
-            # Config override (acts as default horizon when set)
+        # Only ONE override knob, from DEFAULTS
+        override_years = _cfg_default("years_elapsed_for_trend", None)
+        if use_default_trend_years and override_years is not None:
             try:
-                yrs_elapsed = float(override_years)
+                yrs_elapsed_for_trend = float(override_years)
             except Exception:
-                # If override is misconfigured, silently keep computed yrs_elapsed
-                pass
-            elif yrs_elapsed is None:
-                # Fallback if neither computed nor override is available
-                yrs_elapsed = 0.0
-
-            aux["years_elapsed_for_trend"] = yrs_elapsed
-
-            feats["severity_trend"] = sm.score_severity_trend(
-                conv.curr_violent_prop,
-                conv.past_violent_prop,
-                yrs_elapsed,
-            )
-    # else: skip severity_trend
-
-    # Rehabilitation / Education metrics:
-    # Intentionally omitted here because the public tables we load do not contain
-    # reliable program-credit fields. Per policy, we do not fabricate zeros.
-    # When a rehab credits source is provided (via config paths/columns or a join),
-    # callers should construct sm.RehabInputs and include these features; otherwise
-    # they are skipped and NOT added to the vector.
+ pass # keep computed value if override is invalid + + aux["years_elapsed_for_trend"] = yrs_elapsed_for_trend + + if yrs_elapsed_for_trend is not None: + feats["severity_trend"] = sm.score_severity_trend( + conv.curr_violent_prop, + conv.past_violent_prop, + yrs_elapsed_for_trend, + ) return feats, aux diff --git a/config.py b/config.py index 7a216c6..85486c0 100644 --- a/config.py +++ b/config.py @@ -1,75 +1,96 @@ # config.py +from __future__ import annotations + import os import math from typing import Any, Dict -# Profiles & Data Locations -PROFILE = os.getenv("CFG_PROFILE", "PROD") # "DEV" or "PROD" -COMMIT_SHA = os.getenv("DATA_COMMIT", "main") # pin for reproducibility - -PATHS_PROD = { - "demographics": f"https://raw.githubusercontent.com/redoio/offenses_data/{COMMIT_SHA}/data/demographics.csv", - "prior_commitments": f"https://raw.githubusercontent.com/redoio/offenses_data/{COMMIT_SHA}/data/prior_commitments.csv", - "current_commitments": f"https://raw.githubusercontent.com/redoio/offenses_data/{COMMIT_SHA}/data/current_commitments.csv", - # future optional (if available later): - # "rehab": "path-or-url-to-rehab-credits.csv", -} - -PATHS_DEV = { - "demographics": r"D:\Judge_bias_detection\milestone_2\demographics.csv", - "prior_commitments": r"D:\Judge_bias_detection\milestone_2\prior_commitments.csv", - "current_commitments": r"D:\Judge_bias_detection\milestone_2\current_commitments.csv", -} - -PATHS = PATHS_PROD if PROFILE == "PROD" else PATHS_DEV +# Data locations (single source of truth) +# +# Default behavior: +# - Reads from the offenses_data GitHub repo (raw URLs) +# Optional local override: +# - Set SIMILARITY_DATA_DIR to a folder containing: +# demographics.csv, prior_commitments.csv, current_commitments.csv + +COMMIT_SHA = os.getenv("DATA_COMMIT", "main") # pin for reproducibility +DATA_DIR = os.getenv("SIMILARITY_DATA_DIR", "").strip() + +if DATA_DIR: + PATHS: Dict[str, str] = { + "demographics": os.path.join(DATA_DIR, "demographics.csv"), + "prior_commitments": os.path.join(DATA_DIR, "prior_commitments.csv"), + "current_commitments": os.path.join(DATA_DIR, "current_commitments.csv"), + # future optional: + # "rehab": os.path.join(DATA_DIR, "rehab_credits.csv"), + } +else: + PATHS = { + "demographics": f"https://raw.githubusercontent.com/redoio/offenses_data/{COMMIT_SHA}/data/demographics.csv", + "prior_commitments": f"https://raw.githubusercontent.com/redoio/offenses_data/{COMMIT_SHA}/data/prior_commitments.csv", + "current_commitments": f"https://raw.githubusercontent.com/redoio/offenses_data/{COMMIT_SHA}/data/current_commitments.csv", + # future optional: + # "rehab": f"https://raw.githubusercontent.com/redoio/offenses_data/{COMMIT_SHA}/data/rehab_credits.csv", + } # Similarity / severity configuration + # Minimum number of overlapping (valid) features required for similarity. -# If the intersection size is < MIN_OVERLAP_FOR_SIMILARITY, all similarity -# measures (cosine, euclidean_sim, tanimoto, jaccard) should return NaN. +# If the intersection size is < MIN_OVERLAP_FOR_SIMILARITY, similarity measures return NaN. MIN_OVERLAP_FOR_SIMILARITY: int = 3 # Decay rate λ used in the severity_trend formula: -# severity_trend = Δv * exp(-λ * years_elapsed) +# severity_trend = Δv * exp(-λ * years_elapsed_for_trend) SEVERITY_DECAY_RATE: float = 0.15 # can be tuned as needed -# Global override for the years_elapsed used in severity_trend. -# Workflow: -# 1) By default, code computes elapsed years from -# first prior → last current commitment dates. 
-#   2) If DEFAULT_TIME_ELAPSED_YEARS is not None, it replaces
-#      the computed value and acts as the default horizon.
-DEFAULT_TIME_ELAPSED_YEARS: Any = 10.0
-
 # Column Map
+
 COLS: Dict[str, Any] = {
-    "id": "cdcno",  # REQUIRED identifier
-    "age_years": None,  # No age available -> feature skipped
-    "dob": None,  # optional (unused if None)
-    "reference_date": None,  # optional
-    # Time/term fields (optional; used if your compute code supports them)
+    "id": "cdcno",                     # REQUIRED identifier
+
+    # Optional / unavailable in current data -> feature skipped
+    "age_years": None,
+    "dob": None,
+    "reference_date": None,
+
+    # Time/term fields (optional; used if compute code supports them)
     "current_sentence": "aggregate sentence in months",
-    "completed_time": "time served in years",
-    "past_time": None,  # optional
+    "completed_time": "time served in years",
+    "past_time": None,                 # optional
+
     # Offense text fields (used by counting logic)
     "current_offense_text": "offense",
-    "prior_offense_text": "offense",
-    # Category text (ignored by compute if not used)
+    "prior_offense_text": "offense",
+
+    # Category text (ignored unless compute uses it)
     "current_category_text": "offense category",
-    "prior_category_text": "offense category",
+    "prior_category_text": "offense category",
+
+    # Optional commitment date columns (only needed if present in tables)
+    # "prior_commit_date": "commitment_date",
+    # "current_commit_date": "commitment_date",
 }

+
 # Defaults / Behavior Knobs
+
+# Aparna note: keep ALL tunable defaults in this one dict, so there are no
+# confusing "DEFAULT_*" module globals competing with the DEFAULTS entries.
 DEFAULTS: Dict[str, Any] = {
     "missing_numeric": math.nan,
     "require_time_fields": ("current_sentence", "completed_time"),

-    # Optional global exposure window (months) for frequency metrics.
-    # If None, the code computes a per-person window from (DOB+18y) to reference_date.
-    "months_elapsed_total": None,
+    # Frequency exposure window (MONTHS) used ONLY for freq_* metrics.
+    # If None, the code computes a per-person window from (DOB+18y) to reference_date.
+    "months_elapsed_for_frequency": None,
+
+    # Trend horizon override (YEARS) used ONLY for severity_trend.
+    # If None, compute from: (first prior commitment date → last current commitment date).
+    # If set (e.g., 10.0), it overrides the computed years when compute uses defaults.
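+    # Example: with the 10.0-year default below and Δv = 0.5, severity_trend
+    # = 0.5 * exp(-0.15 * 10.0) ≈ 0.112 (the value shown in README Example 1).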
+ "years_elapsed_for_trend": 10.0, # Age normalization (only used if age_years is present and valid) "age_min": 18.0, @@ -92,16 +113,18 @@ "rehab_advanced_credits": 0.0, }, "rehab_norm_bounds": { - "edu_general": (None, None), - "edu_advanced": (None, None), + "edu_general": (None, None), + "edu_advanced": (None, None), "rehab_general": (None, None), - "rehab_advanced":(None, None), + "rehab_advanced": (None, None), }, } -# Offense Policies (constants) + +# Offense Policies + OFFENSE_LISTS = { - "violent": ["187", "211", "245"], + "violent": ["187", "211", "245"], "nonviolent": ["459", "484", "10851"], } @@ -111,15 +134,23 @@ "strip_punctuation": True, } + # Metric Names & Weights + METRIC_NAMES = [ - "desc_nonvio_curr", "desc_nonvio_past", "age", - "freq_violent", "freq_total", "severity_trend", - "edu_general", "edu_advanced", "rehab_general", "rehab_advanced", + "desc_nonvio_curr", + "desc_nonvio_past", + "age", + "freq_violent", + "freq_total", + "severity_trend", + "edu_general", + "edu_advanced", + "rehab_general", + "rehab_advanced", ] METRIC_WEIGHTS: Dict[str, float] = { - # Age is a full metric (normalized and positively aligned with suitability) "age": 1.0, "desc_nonvio_curr": 1.0, "desc_nonvio_past": 1.0, @@ -138,7 +169,6 @@ "age": +1, "freq_violent": -1, "freq_total": -1, - # severity_trend is inversely related to suitability (ideal = 0) "severity_trend": -1, "edu_general": +1, "edu_advanced": +1, @@ -149,13 +179,12 @@ METRIC_RANGES: Dict[str, Any] = { "desc_nonvio_curr": (0.0, 1.0), "desc_nonvio_past": (0.0, 1.0), - # metric is normalized to [0,1] "age": (0.0, 1.0), "freq_violent": (DEFAULTS["freq_min_rate"], DEFAULTS["freq_max_rate"]), - "freq_total": (DEFAULTS["freq_min_rate"], DEFAULTS["freq_max_rate"]), + "freq_total": (DEFAULTS["freq_min_rate"], DEFAULTS["freq_max_rate"]), "severity_trend": (0.0, 1.0), - "edu_general": (0.0, 1.0), - "edu_advanced": (0.0, 1.0), + "edu_general": (0.0, 1.0), + "edu_advanced": (0.0, 1.0), "rehab_general": (0.0, 1.0), "rehab_advanced": (0.0, 1.0), } diff --git a/offense_helpers.py b/offense_helpers.py index b2d367a..7bd91b5 100644 --- a/offense_helpers.py +++ b/offense_helpers.py @@ -1,6 +1,26 @@ -# offense_helpers.py — STRICT, config-driven (Aparna-approved) +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +offense_helpers.py — strict, config-driven offense classification + +Design: +- Prefer numeric penal code extraction when present (e.g., "PC 187(a)" -> "187") +- Classify using config.OFFENSE_LISTS: + violent: explicit list + nonviolent: explicit list (or special "rest" mode) +- Optional "rest" mode: + * Either set OFFENSE_LISTS["nonviolent"] = "rest" + * OR set OFFENSE_POLICY["nonviolent_rest_mode"] = True + In either case: anything not in violent_list is treated as nonviolent. + +Notes: +- We intentionally do NOT implement case-insensitivity or punctuation stripping beyond numeric extraction, + because penal-code extraction is the primary normalization for these datasets. +""" + from __future__ import annotations -from typing import Any, Dict + +from typing import Any, Dict, Optional import re import config as CFG @@ -10,13 +30,12 @@ def _normalize_offense_token(x: Any) -> str: """ - Prefer a numeric penal code if present (e.g., 'PC 187(a)' -> '187'), + Prefer numeric penal code if present (e.g., 'PC 187(a)' -> '187'), else return the original string trimmed. 
IMPORTANT: - - Does NOT use OFFENSE_POLICY - - No lowercase conversion - - No punctuation stripping beyond numeric extraction + - Strict mode: does NOT apply OFFENSE_POLICY case folding / punctuation stripping. + - Numeric extraction already removes most formatting variance. """ if x is None: return "" @@ -27,48 +46,48 @@ def _normalize_offense_token(x: Any) -> str: return m.group(0) if m else s -def classify_offense(code_or_text: Any, lists: Dict[str, Any] | None = None) -> str: +def classify_offense(code_or_text: Any, lists: Optional[Dict[str, Any]] = None) -> str: """ - Strict classification using config.OFFENSE_LISTS. - Does NOT use OFFENSE_POLICY (even though it exists in config.py). + Strict classification using offense lists. Returns: "violent", "nonviolent", "other", or "clash" Logic: - 1. violent list is always explicit - 2. nonviolent list may be: - - explicit list - - "rest" meaning: everything not violent is nonviolent - 3. clash if token appears in both lists (rare, but safe) + 1) If token in both violent and nonviolent lists -> "clash" + 2) If token in violent list -> "violent" + 3) If nonviolent is explicit list: + token in list -> "nonviolent" else "other" + 4) If "rest" mode enabled: + anything not violent -> "nonviolent" + 5) fallback -> "other" """ - li = lists or CFG.OFFENSE_LISTS + li = lists if lists is not None else getattr(CFG, "OFFENSE_LISTS", {}) token = _normalize_offense_token(code_or_text) if token == "": return "other" violent_list = li.get("violent", []) or [] - non_list = li.get("nonviolent", []) + non_list = li.get("nonviolent", []) + + # Determine whether "rest mode" is enabled + policy = getattr(CFG, "OFFENSE_POLICY", {}) or {} + rest_mode = bool(policy.get("nonviolent_rest_mode", False)) or (non_list == "rest") is_v = token in violent_list - is_n = isinstance(non_list, list) and token in non_list + is_n = isinstance(non_list, list) and (token in non_list) - # Case 1: token appears in both lists → clash if is_v and is_n: return "clash" - - # Case 2: explicit violent if is_v: return "violent" - # Case 3: explicit nonviolent list if isinstance(non_list, list): - return "nonviolent" if is_n else "other" + return "nonviolent" if is_n else ("nonviolent" if rest_mode else "other") - # Case 4: nonviolent == "rest" mode - if non_list == "rest": + # non_list is not a list (e.g., "rest") + if rest_mode: return "nonviolent" - # Case 5: fallback return "other" diff --git a/run.py b/run.py index 86d9252..7dabdec 100644 --- a/run.py +++ b/run.py @@ -2,10 +2,16 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import argparse, json, math + +import argparse +import json +import math +import traceback from typing import Dict, Any, List, Optional + import pandas as pd from tqdm import tqdm + import config as CFG import sentencing_math as sm import compute_metrics as cm @@ -24,14 +30,35 @@ def _flatten_counts(prefix: str, d: Dict[str, Any]) -> Dict[str, Any]: def _load_ids(ids_csv: Optional[str], demo: pd.DataFrame) -> List[str]: id_col = CFG.COLS["id"] + if demo is None or id_col not in demo.columns: + raise ValueError( + f"Demographics table is missing required id column '{id_col}'. 
" + f"Available columns: {list(demo.columns) if demo is not None else 'None'}" + ) + if ids_csv: df_ids = pd.read_csv(ids_csv) if id_col not in df_ids.columns: raise ValueError(f"--ids-csv must contain a column named '{id_col}' (from config.COLS['id']).") return df_ids[id_col].astype(str).dropna().unique().tolist() + return demo[id_col].astype(str).dropna().unique().tolist() +def _ensure_dense_metrics(feats: Dict[str, Any]) -> Dict[str, Any]: + """ + If CFG.METRIC_NAMES exists, ensure all metric columns appear (NaN if missing). + This makes population outputs schema-stable. + """ + names = getattr(CFG, "METRIC_NAMES", None) + if not names: + return feats + out = dict(feats) + for k in names: + out.setdefault(k, math.nan) + return out + + def main(): ap = argparse.ArgumentParser(description="Compute population-level sentencing metrics.") ap.add_argument("--out", default="population_metrics.csv") @@ -39,19 +66,21 @@ def main(): ap.add_argument("--ids-csv", default=None) ap.add_argument("--limit", type=int, default=None) ap.add_argument("--include-aux", action="store_true") + ap.add_argument("--dense", action="store_true", help="Include all CFG.METRIC_NAMES columns (NaN if missing).") ap.add_argument("--print-every", type=int, default=0) ap.add_argument("--fail-fast", action="store_true") + ap.add_argument("--tracebacks", action="store_true", help="Include stack traces in the .errors.jsonl file.") args = ap.parse_args() # Load source tables demo = cm.read_table(CFG.PATHS["demographics"]) - cur = cm.read_table(CFG.PATHS["current_commitments"]) - pri = cm.read_table(CFG.PATHS["prior_commitments"]) + cur = cm.read_table(CFG.PATHS["current_commitments"]) + pri = cm.read_table(CFG.PATHS["prior_commitments"]) # Policy knobs - lists = getattr(CFG, "OFFENSE_LISTS", {"violent": [], "nonviolent": []}) - weights = getattr(CFG, "METRIC_WEIGHTS", getattr(CFG, "WEIGHTS", {})) - directions = getattr(CFG, "METRIC_DIRECTIONS", {}) + lists = getattr(CFG, "OFFENSE_LISTS", {"violent": [], "nonviolent": []}) + weights = getattr(CFG, "METRIC_WEIGHTS", getattr(CFG, "WEIGHTS", {})) + directions = getattr(CFG, "METRIC_DIRECTIONS", {}) # Who to run ids = _load_ids(args.ids_csv, demo) @@ -67,6 +96,9 @@ def main(): try: feats, aux = cm.compute_features(str(uid), demo, cur, pri, lists) + # Optional: stable schema (include NaNs for missing metrics) + feats_out = _ensure_dense_metrics(feats) if args.dense else feats + # Final suitability as ratio + parts: # numerator = w · m (dot with actual metrics) # denom = w · x* (dot with best-case vector) @@ -77,7 +109,7 @@ def main(): return_parts=True, ) - # NaN/None/0 safe handling + # NaN/None/0 safe handling no_denom = ( denom is None or denom == 0 @@ -88,23 +120,22 @@ def main(): score_ratio_safe = math.nan numerator_safe = math.nan denom_safe = math.nan - evaluated_flag = 0 # "not evaluated / insufficient data" + evaluated_flag = 0 # not evaluated / insufficient data score_pct_of_out = math.nan else: score_ratio_safe = float(score_ratio) numerator_safe = float(numerator) denom_safe = float(denom) - evaluated_flag = 1 # "evaluated" - # <-- HERE is the NaN-safe percentage + evaluated_flag = 1 # evaluated score_pct_of_out = (numerator_safe / denom_safe) * 100.0 record: Dict[str, Any] = { - CFG.COLS["id"]: uid, - **feats, + CFG.COLS["id"]: str(uid), + **feats_out, "score": numerator_safe, "score_out_of": denom_safe, "score_ratio": score_ratio_safe, - "score_pct_of_out": score_pct_of_out, # NEW + "score_pct_of_out": score_pct_of_out, "evaluated": evaluated_flag, } @@ -122,18 
+153,33 @@ def main(): except Exception as e: if args.fail_fast: raise - errors.append({CFG.COLS["id"]: uid, "error": f"{type(e).__name__}: {e}"}) + err_rec: Dict[str, Any] = { + CFG.COLS["id"]: str(uid), + "error": f"{type(e).__name__}: {e}", + } + if args.tracebacks: + err_rec["traceback"] = traceback.format_exc() + errors.append(err_rec) finally: pbar.update(1) pbar.close() out_df = pd.DataFrame(rows) - # Put ID first + # Column ordering: id, metrics (if configured), score fields, then the rest id_col = CFG.COLS["id"] - cols = out_df.columns.tolist() - if id_col in cols: - out_df = out_df[[id_col] + [c for c in cols if c != id_col]] + metric_order = getattr(CFG, "METRIC_NAMES", []) + score_order = ["score_ratio", "score", "score_out_of", "score_pct_of_out", "evaluated"] + + ordered: List[str] = [] + for c in [id_col] + list(metric_order) + score_order: + if c in out_df.columns and c not in ordered: + ordered.append(c) + # append any remaining columns + for c in out_df.columns: + if c not in ordered: + ordered.append(c) + out_df = out_df[ordered] if not out_df.empty else out_df # Write out_fmt = ( @@ -156,7 +202,7 @@ def main(): f.write(json.dumps(rec) + "\n") print(f"Encountered {len(errors)} errors. Details → {err_path}") - # Preview a few key columns if present + # Preview if not out_df.empty: preferred = [id_col, "score_ratio", "score", "score_out_of", "score_pct_of_out", "evaluated"] extra = [c for c in out_df.columns if c not in preferred][:5]
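
For reference, a typical invocation of the updated runner combines the local-data override from `config.py` with the new flags added in this patch (paths are hypothetical):

```bash
# Optional: read the three CSVs locally instead of the raw GitHub URLs
export SIMILARITY_DATA_DIR=/path/to/local_csvs

# Dense, schema-stable output (every CFG.METRIC_NAMES column, NaN when skipped),
# with stack traces captured in the .errors.jsonl sidecar
python run.py --out population_metrics.csv --dense --tracebacks --limit 100
```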