From 74302bc3b417f120c6dd5084a46d126151726172 Mon Sep 17 00:00:00 2001
From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com>
Date: Tue, 22 Jul 2025 11:59:40 +0200
Subject: [PATCH 1/4] feat: work with new 2nd level headers

We decided to change the 2nd level headers of the "patient" columns
from "#" to "info" and those of the "tumor" columns from "1" to
"info". This commit makes the lydata package compatible with both
the old and the new headers.
---
 src/lydata/accessor.py | 69 +++++++++++++++++++++++++++++++++---------
 src/lydata/loader.py   | 10 +++---
 src/lydata/utils.py    | 66 +++++++++++++++++++++++++++++++++++++---
 3 files changed, 121 insertions(+), 24 deletions(-)

diff --git a/src/lydata/accessor.py b/src/lydata/accessor.py
index 96f9dc6..8138d49 100644
--- a/src/lydata/accessor.py
+++ b/src/lydata/accessor.py
@@ -45,7 +45,8 @@
 
 from lydata.utils import (
     ModalityConfig,
-    get_default_column_map,
+    get_default_column_map_new,
+    get_default_column_map_old,
     get_default_modalities,
 )
 from lydata.validator import construct_schema
@@ -161,7 +162,7 @@ def __init__(
         self.colname = column
         self.operator = operator
         self.value = value
-        self._column_map = get_default_column_map()
+        self._column_map = get_default_column_map_old()
 
     def __repr__(self) -> str:
         """Return a string representation of the query."""
@@ -536,7 +537,7 @@ def _expand_mapping(
     >>> _expand_mapping({'age': 'foo', 'hpv': 'bar'})
     {('patient', '#', 'age'): 'foo', ('patient', '#', 'hpv_status'): 'bar'}
     """
-    _colname_map = colname_map or get_default_column_map().from_short
+    _colname_map = colname_map or get_default_column_map_old().from_short
     expanded_map = {}
 
     for colname, func in short_map.items():
@@ -560,7 +561,8 @@ class LyDataAccessor:
     def __init__(self, obj: pd.DataFrame) -> None:
         """Initialize the accessor with a DataFrame."""
         self._obj = obj
-        self._column_map = get_default_column_map()
+        self._column_map_old = get_default_column_map_old()
+        self._column_map_new = get_default_column_map_new()
 
     def __contains__(self, key: str) -> bool:
         """Check if a column is contained in the DataFrame.
@@ -572,14 +574,32 @@ def __contains__(self, key: str) -> bool:
         False
         >>> ("patient", "#", "age") in df.ly
         True
+        >>> df = pd.DataFrame({("patient", "info", "age"): [61, 52, 73]})
+        >>> "age" in df.ly
+        True
+        >>> "foo" in df.ly
+        False
+        >>> ("patient", "info", "age") in df.ly
+        True
         """
-        _key = self._get_safe_long(key)
-        return _key in self._obj
+        _key_old = self._get_safe_long_old(key)
+        _key_new = self._get_safe_long_new(key)
+        return _key_new in self._obj or _key_old in self._obj
 
     def __getitem__(self, key: str) -> pd.Series:
         """Allow column access by short name, too."""
-        _key = self._get_safe_long(key)
-        return self._obj[_key]
+        _key_old = self._get_safe_long_old(key)
+        _key_new = self._get_safe_long_new(key)
+
+        try:
+            return self._obj[_key_new]
+        except KeyError as err_from_new:
+            try:
+                return self._obj[_key_old]
+            except KeyError:
+                raise KeyError(
+                    f"Neither '{_key_new}' nor '{_key_old}' found in DataFrame."
+                ) from err_from_new
 
     def __getattr__(self, name: str) -> Any:
         """Access columns also by short name.
@@ -590,6 +610,12 @@ def __getattr__(self, name: str) -> Any:
         1    52
         2    73
         Name: (patient, #, age), dtype: int64
+        >>> df = pd.DataFrame({("patient", "info", "age"): [61, 52, 73]})
+        >>> df.ly.age
+        0    61
+        1    52
+        2    73
+        Name: (patient, info, age), dtype: int64
         >>> df.ly.foo
         Traceback (most recent call last):
             ...
@@ -600,9 +626,13 @@ def __getattr__(self, name: str) -> Any:
         except KeyError as key_err:
             raise AttributeError(f"Attribute {name!r} not found.") from key_err
 
-    def _get_safe_long(self, key: Any) -> tuple[str, str, str]:
-        """Get the long column name or return the input."""
-        return getattr(self._column_map.from_short.get(key), "long", key)
+    def _get_safe_long_old(self, key: Any) -> tuple[str, str, str]:
+        """Get the old long column name or return the input."""
+        return getattr(self._column_map_old.from_short.get(key), "long", key)
+
+    def _get_safe_long_new(self, key: Any) -> tuple[str, str, str]:
+        """Get the new long column name or return the input."""
+        return getattr(self._column_map_new.from_short.get(key), "long", key)
 
     def validate(self, modalities: list[str] | None = None) -> pd.DataFrame:
         """Validate the DataFrame against the lydata schema.
@@ -697,7 +727,7 @@ def stats(
         The ``agg_funcs`` argument is a mapping of column names to functions that
         receive a :py:class:`pd.Series` and return a :py:class:`pd.Series`. The default
         is a useful selection of statistics for the most common columns. E.g., for the
-        column ``('patient', '#', 'age')`` (or its short column name ``age``), the
+        column ``('patient', 'info', 'age')`` (or its short column name ``age``), the
         default function returns the value counts.
 
         The ``use_shortnames`` argument determines whether the output should use the
@@ -712,11 +742,20 @@ def stats(
         ...     ('tumor', '1', 't_stage'): [2, 3, 1, 2],
         ... })
         >>> df.ly.stats()   # doctest: +NORMALIZE_WHITESPACE
+        {'age': {61: 2, 52: 1, 73: 1},
+         'hpv': {True: 2, False: 1, None: 1},
+         't_stage': {2: 2, 3: 1, 1: 1}}
+        >>> df = pd.DataFrame({
+        ...     ('patient', 'info', 'age'): [61, 52, 73, 61],
+        ...     ('patient', 'info', 'hpv_status'): [True, False, None, True],
+        ...     ('tumor', 'info', 't_stage'): [2, 3, 1, 2],
+        ... })
+        >>> df.ly.stats()   # doctest: +NORMALIZE_WHITESPACE
         {'age': {61: 2, 52: 1, 73: 1},
          'hpv': {True: 2, False: 1, None: 1},
          't_stage': {2: 2, 3: 1, 1: 1}}
         """
-        _agg_funcs = self._column_map.from_short.copy()
+        _agg_funcs = self._column_map_new.from_short.copy()
         _agg_funcs.update(agg_funcs or {})
         stats = {}
 
@@ -725,8 +764,8 @@ def stats(
                 continue
 
             column = self[colname]
-            if use_shortnames and colname in self._column_map.from_long:
-                colname = self._column_map.from_long[colname].short
+            if use_shortnames and colname in self._column_map_old.from_long:
+                colname = self._column_map_old.from_long[colname].short
 
             stats[colname] = getattr(func(column), f"to_{out_format}")()
 
diff --git a/src/lydata/loader.py b/src/lydata/loader.py
index dcaf957..741f2a0 100644
--- a/src/lydata/loader.py
+++ b/src/lydata/loader.py
@@ -123,14 +123,14 @@ def get_content_file(
         This method always tries to fetch the most recent version of the file.
 
         >>> conf = LyDataset(
-        ...     year=2023,
+        ...     year=2025,
         ...     institution="usz",
         ...     subsite="hypopharynx-larynx",
         ...     repo_name="lycosystem/lydata.private",
-        ...     ref="2023-usz-hypopharynx-larynx",
+        ...     ref="2025-usz-hypopharynx-larynx",
         ... )
         >>> conf.get_content_file()
-        ContentFile(path="2023-usz-hypopharynx-larynx/data.csv")
+        ContentFile(path="2025-usz-hypopharynx-larynx/data.csv")
         """
         if self._content_file is not None:
             if self._content_file.update():
@@ -262,7 +262,7 @@ def available_datasets(
      '2025-hvh-oropharynx']
     >>> avail_gen = available_datasets(
     ...     repo_name="lycosystem/lydata.private",
-    ...     ref="2024-umcg-hypopharynx-larynx",
+    ...     ref="2025-umcg-hypopharynx-larynx",
     ...     use_github=True,
     ... )
     >>> sorted([ds.name for ds in avail_gen])   # doctest: +NORMALIZE_WHITESPACE
@@ -270,7 +270,7 @@ def available_datasets(
      '2021-usz-oropharynx',
      '2023-clb-multisite',
      '2023-isb-multisite',
-     '2024-umcg-hypopharynx-larynx']
+     '2025-umcg-hypopharynx-larynx']
     >>> avail_gen = available_datasets(
     ...     institution="hvh",
     ...     ref="6ac98d",
diff --git a/src/lydata/utils.py b/src/lydata/utils.py
index 3925067..12b8daf 100644
--- a/src/lydata/utils.py
+++ b/src/lydata/utils.py
@@ -66,7 +66,15 @@ def update_and_expand(
 
 @dataclass
 class _ColumnSpec:
-    """Class for specifying column names and aggfuncs."""
+    """Class for specifying column names and aggfuncs.
+
+    This serves a dual purpose:
+
+    1. It is a simple container that ties together a short name and a long name. For
+       this we could have used a `namedtuple` as well.
+    2. Every `_ColumnSpec` is also an aggregation function in itself. This is used in
+       the :py:meth:`~lydata.accessor.LyDataAccessor.stats` method.
+    """
 
     short: str
     long: tuple[str, str, str]
@@ -108,14 +116,18 @@ def __iter__(self):
         return iter(self.from_short.values())
 
 
-def get_default_column_map() -> _ColumnMap:
-    """Get the default column map.
+def get_default_column_map_old() -> _ColumnMap:
+    """Get the old default column map.
 
     This map defines which short column names can be used to access columns in the
     DataFrames.
 
     >>> from lydata import accessor, loader
-    >>> df = next(loader.load_datasets(institution="usz"))
+    >>> df = next(loader.load_datasets(
+    ...     institution="usz",
+    ...     repo_name="lycosystem/lydata.private",
+    ...     ref="ab04379a36b6946306041d1d38ad7e97df8ee7ba",
+    ... ))
     >>> df.ly.surgery   # doctest: +ELLIPSIS
     0      False
     ...
@@ -150,6 +162,52 @@ def get_default_column_map() -> _ColumnMap:
     )
 
 
+def get_default_column_map_new() -> _ColumnMap:
+    """Get the old default column map.
+
+    This map defines which short column names can be used to access columns in the
+    DataFrames.
+
+    >>> from lydata import accessor, loader
+    >>> df = next(loader.load_datasets(
+    ...     institution="usz",
+    ...     repo_name="lycosystem/lydata.private",
+    ...     ref="ce2ac255b8aec7443375b610e5254a46bf236a46",
+    ... ))
+    >>> df.ly.surgery   # doctest: +ELLIPSIS
+    0      False
+    ...
+    286    False
+    Name: (patient, info, neck_dissection), Length: 287, dtype: bool
+    >>> df.ly.smoke   # doctest: +ELLIPSIS
+    0       True
+    ...
+    286     True
+    Name: (patient, info, nicotine_abuse), Length: 287, dtype: bool
+    """
+    return _ColumnMap.from_list(
+        [
+            _ColumnSpec("id", ("patient", "info", "id")),
+            _ColumnSpec("institution", ("patient", "info", "institution")),
+            _ColumnSpec("sex", ("patient", "info", "sex")),
+            _ColumnSpec("age", ("patient", "info", "age")),
+            _ColumnSpec("weight", ("patient", "info", "weight")),
+            _ColumnSpec("date", ("patient", "info", "diagnose_date")),
+            _ColumnSpec("surgery", ("patient", "info", "neck_dissection")),
+            _ColumnSpec("hpv", ("patient", "info", "hpv_status")),
+            _ColumnSpec("smoke", ("patient", "info", "nicotine_abuse")),
+            _ColumnSpec("alcohol", ("patient", "info", "alcohol_abuse")),
+            _ColumnSpec("t_stage", ("tumor", "info", "t_stage")),
+            _ColumnSpec("n_stage", ("patient", "info", "n_stage")),
+            _ColumnSpec("m_stage", ("patient", "info", "m_stage")),
+            _ColumnSpec("midext", ("tumor", "info", "extension")),
+            _ColumnSpec("subsite", ("tumor", "info", "subsite")),
+            _ColumnSpec("volume", ("tumor", "info", "volume")),
+            _ColumnSpec("central", ("tumor", "info", "central")),
+        ]
+    )
+
+
 class ModalityConfig(BaseModel):
     """Define a diagnostic or pathological modality."""
 

From 8a3680e84508eced1b7c732eca9ac56273d387ba Mon Sep 17 00:00:00 2001
From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com>
Date: Tue, 22 Jul 2025 12:26:54 +0200
Subject: [PATCH 2/4] feat: add `side` as short column to `ly` accessor

---
 src/lydata/utils.py | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/src/lydata/utils.py b/src/lydata/utils.py
index 12b8daf..5141184 100644
--- a/src/lydata/utils.py
+++ b/src/lydata/utils.py
@@ -158,10 +158,27 @@ def get_default_column_map_old() -> _ColumnMap:
             _ColumnSpec("subsite", ("tumor", "1", "subsite")),
             _ColumnSpec("volume", ("tumor", "1", "volume")),
             _ColumnSpec("central", ("tumor", "1", "central")),
+            _ColumnSpec("side", ("tumor", "1", "side")),
         ]
     )
 
 
+def _new_from_old(long_name: tuple[str, str, str]) -> tuple[str, str, str]:
+    """Convert an old long name to a new long name.
+
+    >>> new_from_old(("patient", "#", "neck_dissection"))
+    ('patient', 'info', 'neck_dissection')
+    >>> new_from_old(("tumor", "1", "t_stage"))
+    ('tumor', 'info', 't_stage')
+    >>> new_from_old(("a", "b", "c"))
+    ('a', 'b', 'c')
+    """
+    start, middle, end = long_name
+    if (start == "patient" and middle == "#") or (start == "tumor" and middle == "1"):
+        middle = "info"
+    return (start, middle, end)
+
+
 def get_default_column_map_new() -> _ColumnMap:
     """Get the old default column map.
 
@@ -187,23 +204,8 @@ def get_default_column_map_new() -> _ColumnMap:
     """
     return _ColumnMap.from_list(
         [
-            _ColumnSpec("id", ("patient", "info", "id")),
-            _ColumnSpec("institution", ("patient", "info", "institution")),
-            _ColumnSpec("sex", ("patient", "info", "sex")),
-            _ColumnSpec("age", ("patient", "info", "age")),
-            _ColumnSpec("weight", ("patient", "info", "weight")),
-            _ColumnSpec("date", ("patient", "info", "diagnose_date")),
-            _ColumnSpec("surgery", ("patient", "info", "neck_dissection")),
-            _ColumnSpec("hpv", ("patient", "info", "hpv_status")),
-            _ColumnSpec("smoke", ("patient", "info", "nicotine_abuse")),
-            _ColumnSpec("alcohol", ("patient", "info", "alcohol_abuse")),
-            _ColumnSpec("t_stage", ("tumor", "info", "t_stage")),
-            _ColumnSpec("n_stage", ("patient", "info", "n_stage")),
-            _ColumnSpec("m_stage", ("patient", "info", "m_stage")),
-            _ColumnSpec("midext", ("tumor", "info", "extension")),
-            _ColumnSpec("subsite", ("tumor", "info", "subsite")),
-            _ColumnSpec("volume", ("tumor", "info", "volume")),
-            _ColumnSpec("central", ("tumor", "info", "central")),
+            _ColumnSpec(cs.short, _new_from_old(cs.long))
+            for cs in get_default_column_map_old()
         ]
     )
 

From 8da24ab70e9a7f0c0035f6d3b40498251b6c97d3 Mon Sep 17 00:00:00 2001
From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com>
Date: Tue, 22 Jul 2025 12:30:13 +0200
Subject: [PATCH 3/4] chore: update changelog

---
 CHANGELOG.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index de0f732..71febfa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,13 @@
 
 All notable changes to this project will be documented in this file.
 
+## [0.3.2] - 2025-07-22
+
+### 🚀 Features
+
+- Work with new 2nd level headers (see [this lyDATA issue](https://github.com/lycosystem/lydata/issues/21))
+- Add `side` as short column to `ly` accessor
+
 ## [0.3.1] - 2025-07-12
 
 ### 🐛 Bug Fixes

From 01c7ca2852c72990b46e011cf6088ff9e3a4ef69 Mon Sep 17 00:00:00 2001
From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com>
Date: Tue, 22 Jul 2025 12:33:00 +0200
Subject: [PATCH 4/4] test: fix misnamed function in doctests

---
 src/lydata/utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/lydata/utils.py b/src/lydata/utils.py
index 5141184..deaf046 100644
--- a/src/lydata/utils.py
+++ b/src/lydata/utils.py
@@ -166,11 +166,11 @@ def get_default_column_map_old() -> _ColumnMap:
 def _new_from_old(long_name: tuple[str, str, str]) -> tuple[str, str, str]:
     """Convert an old long name to a new long name.
 
-    >>> new_from_old(("patient", "#", "neck_dissection"))
+    >>> _new_from_old(("patient", "#", "neck_dissection"))
     ('patient', 'info', 'neck_dissection')
-    >>> new_from_old(("tumor", "1", "t_stage"))
+    >>> _new_from_old(("tumor", "1", "t_stage"))
     ('tumor', 'info', 't_stage')
-    >>> new_from_old(("a", "b", "c"))
+    >>> _new_from_old(("a", "b", "c"))
     ('a', 'b', 'c')
     """
     start, middle, end = long_name