From 74302bc3b417f120c6dd5084a46d126151726172 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Tue, 22 Jul 2025 11:59:40 +0200 Subject: [PATCH 1/4] feat: work with new 2nd level headers We decided to change the 2nd level headers of "patient" from "#" to "info". This commit makes the lydata package compatible with that. --- src/lydata/accessor.py | 69 +++++++++++++++++++++++++++++++++--------- src/lydata/loader.py | 10 +++--- src/lydata/utils.py | 66 +++++++++++++++++++++++++++++++++++++--- 3 files changed, 121 insertions(+), 24 deletions(-) diff --git a/src/lydata/accessor.py b/src/lydata/accessor.py index 96f9dc6..8138d49 100644 --- a/src/lydata/accessor.py +++ b/src/lydata/accessor.py @@ -45,7 +45,8 @@ from lydata.utils import ( ModalityConfig, - get_default_column_map, + get_default_column_map_new, + get_default_column_map_old, get_default_modalities, ) from lydata.validator import construct_schema @@ -161,7 +162,7 @@ def __init__( self.colname = column self.operator = operator self.value = value - self._column_map = get_default_column_map() + self._column_map = get_default_column_map_old() def __repr__(self) -> str: """Return a string representation of the query.""" @@ -536,7 +537,7 @@ def _expand_mapping( >>> _expand_mapping({'age': 'foo', 'hpv': 'bar'}) {('patient', '#', 'age'): 'foo', ('patient', '#', 'hpv_status'): 'bar'} """ - _colname_map = colname_map or get_default_column_map().from_short + _colname_map = colname_map or get_default_column_map_old().from_short expanded_map = {} for colname, func in short_map.items(): @@ -560,7 +561,8 @@ class LyDataAccessor: def __init__(self, obj: pd.DataFrame) -> None: """Initialize the accessor with a DataFrame.""" self._obj = obj - self._column_map = get_default_column_map() + self._column_map_old = get_default_column_map_old() + self._column_map_new = get_default_column_map_new() def __contains__(self, key: str) -> bool: """Check if a column is contained in the DataFrame. @@ -572,14 +574,32 @@ def __contains__(self, key: str) -> bool: False >>> ("patient", "#", "age") in df.ly True + >>> df = pd.DataFrame({("patient", "info", "age"): [61, 52, 73]}) + >>> "age" in df.ly + True + >>> "foo" in df.ly + False + >>> ("patient", "info", "age") in df.ly + True """ - _key = self._get_safe_long(key) - return _key in self._obj + _key_old = self._get_safe_long_old(key) + _key_new = self._get_safe_long_new(key) + return _key_new in self._obj or _key_old in self._obj def __getitem__(self, key: str) -> pd.Series: """Allow column access by short name, too.""" - _key = self._get_safe_long(key) - return self._obj[_key] + _key_old = self._get_safe_long_old(key) + _key_new = self._get_safe_long_new(key) + + try: + return self._obj[_key_new] + except KeyError as err_from_new: + try: + return self._obj[_key_old] + except KeyError: + raise KeyError( + f"Neither '{_key_new}' nor '{_key_old}' found in DataFrame." + ) from err_from_new def __getattr__(self, name: str) -> Any: """Access columns also by short name. @@ -590,6 +610,12 @@ def __getattr__(self, name: str) -> Any: 1 52 2 73 Name: (patient, #, age), dtype: int64 + >>> df = pd.DataFrame({("patient", "info", "age"): [61, 52, 73]}) + >>> df.ly.age + 0 61 + 1 52 + 2 73 + Name: (patient, info, age), dtype: int64 >>> df.ly.foo Traceback (most recent call last): ... @@ -600,9 +626,13 @@ def __getattr__(self, name: str) -> Any: except KeyError as key_err: raise AttributeError(f"Attribute {name!r} not found.") from key_err - def _get_safe_long(self, key: Any) -> tuple[str, str, str]: - """Get the long column name or return the input.""" - return getattr(self._column_map.from_short.get(key), "long", key) + def _get_safe_long_old(self, key: Any) -> tuple[str, str, str]: + """Get the old long column name or return the input.""" + return getattr(self._column_map_old.from_short.get(key), "long", key) + + def _get_safe_long_new(self, key: Any) -> tuple[str, str, str]: + """Get the new long column name or return the input.""" + return getattr(self._column_map_new.from_short.get(key), "long", key) def validate(self, modalities: list[str] | None = None) -> pd.DataFrame: """Validate the DataFrame against the lydata schema. @@ -697,7 +727,7 @@ def stats( The ``agg_funcs`` argument is a mapping of column names to functions that receive a :py:class:`pd.Series` and return a :py:class:`pd.Series`. The default is a useful selection of statistics for the most common columns. E.g., for the - column ``('patient', '#', 'age')`` (or its short column name ``age``), the + column ``('patient', 'info', 'age')`` (or its short column name ``age``), the default function returns the value counts. The ``use_shortnames`` argument determines whether the output should use the @@ -712,11 +742,20 @@ def stats( ... ('tumor', '1', 't_stage'): [2, 3, 1, 2], ... }) >>> df.ly.stats() # doctest: +NORMALIZE_WHITESPACE + {'age': {61: 2, 52: 1, 73: 1}, + 'hpv': {True: 2, False: 1, None: 1}, + 't_stage': {2: 2, 3: 1, 1: 1}} + >>> df = pd.DataFrame({ + ... ('patient', 'info', 'age'): [61, 52, 73, 61], + ... ('patient', 'info', 'hpv_status'): [True, False, None, True], + ... ('tumor', 'info', 't_stage'): [2, 3, 1, 2], + ... }) + >>> df.ly.stats() # doctest: +NORMALIZE_WHITESPACE {'age': {61: 2, 52: 1, 73: 1}, 'hpv': {True: 2, False: 1, None: 1}, 't_stage': {2: 2, 3: 1, 1: 1}} """ - _agg_funcs = self._column_map.from_short.copy() + _agg_funcs = self._column_map_new.from_short.copy() _agg_funcs.update(agg_funcs or {}) stats = {} @@ -725,8 +764,8 @@ def stats( continue column = self[colname] - if use_shortnames and colname in self._column_map.from_long: - colname = self._column_map.from_long[colname].short + if use_shortnames and colname in self._column_map_old.from_long: + colname = self._column_map_old.from_long[colname].short stats[colname] = getattr(func(column), f"to_{out_format}")() diff --git a/src/lydata/loader.py b/src/lydata/loader.py index dcaf957..741f2a0 100644 --- a/src/lydata/loader.py +++ b/src/lydata/loader.py @@ -123,14 +123,14 @@ def get_content_file( This method always tries to fetch the most recent version of the file. >>> conf = LyDataset( - ... year=2023, + ... year=2025, ... institution="usz", ... subsite="hypopharynx-larynx", ... repo_name="lycosystem/lydata.private", - ... ref="2023-usz-hypopharynx-larynx", + ... ref="2025-usz-hypopharynx-larynx", ... ) >>> conf.get_content_file() - ContentFile(path="2023-usz-hypopharynx-larynx/data.csv") + ContentFile(path="2025-usz-hypopharynx-larynx/data.csv") """ if self._content_file is not None: if self._content_file.update(): @@ -262,7 +262,7 @@ def available_datasets( '2025-hvh-oropharynx'] >>> avail_gen = available_datasets( ... repo_name="lycosystem/lydata.private", - ... ref="2024-umcg-hypopharynx-larynx", + ... ref="2025-umcg-hypopharynx-larynx", ... use_github=True, ... ) >>> sorted([ds.name for ds in avail_gen]) # doctest: +NORMALIZE_WHITESPACE @@ -270,7 +270,7 @@ def available_datasets( '2021-usz-oropharynx', '2023-clb-multisite', '2023-isb-multisite', - '2024-umcg-hypopharynx-larynx'] + '2025-umcg-hypopharynx-larynx'] >>> avail_gen = available_datasets( ... institution="hvh", ... ref="6ac98d", diff --git a/src/lydata/utils.py b/src/lydata/utils.py index 3925067..12b8daf 100644 --- a/src/lydata/utils.py +++ b/src/lydata/utils.py @@ -66,7 +66,15 @@ def update_and_expand( @dataclass class _ColumnSpec: - """Class for specifying column names and aggfuncs.""" + """Class for specifying column names and aggfuncs. + + This serves a dual purpose: + + 1. It is a simple container that ties together a short name and a long name. For + this we could have used a `namedtuple` as well. + 2. Every `_ColumnSpec` is also an aggregation function in itself. This is used in + the :py:meth:`~lydata.accessor.LyDataAccessor.stats` method. + """ short: str long: tuple[str, str, str] @@ -108,14 +116,18 @@ def __iter__(self): return iter(self.from_short.values()) -def get_default_column_map() -> _ColumnMap: - """Get the default column map. +def get_default_column_map_old() -> _ColumnMap: + """Get the old default column map. This map defines which short column names can be used to access columns in the DataFrames. >>> from lydata import accessor, loader - >>> df = next(loader.load_datasets(institution="usz")) + >>> df = next(loader.load_datasets( + ... institution="usz", + ... repo_name="lycosystem/lydata.private", + ... ref="ab04379a36b6946306041d1d38ad7e97df8ee7ba", + ... )) >>> df.ly.surgery # doctest: +ELLIPSIS 0 False ... @@ -150,6 +162,52 @@ def get_default_column_map() -> _ColumnMap: ) +def get_default_column_map_new() -> _ColumnMap: + """Get the old default column map. + + This map defines which short column names can be used to access columns in the + DataFrames. + + >>> from lydata import accessor, loader + >>> df = next(loader.load_datasets( + ... institution="usz", + ... repo_name="lycosystem/lydata.private", + ... ref="ce2ac255b8aec7443375b610e5254a46bf236a46", + ... )) + >>> df.ly.surgery # doctest: +ELLIPSIS + 0 False + ... + 286 False + Name: (patient, info, neck_dissection), Length: 287, dtype: bool + >>> df.ly.smoke # doctest: +ELLIPSIS + 0 True + ... + 286 True + Name: (patient, info, nicotine_abuse), Length: 287, dtype: bool + """ + return _ColumnMap.from_list( + [ + _ColumnSpec("id", ("patient", "info", "id")), + _ColumnSpec("institution", ("patient", "info", "institution")), + _ColumnSpec("sex", ("patient", "info", "sex")), + _ColumnSpec("age", ("patient", "info", "age")), + _ColumnSpec("weight", ("patient", "info", "weight")), + _ColumnSpec("date", ("patient", "info", "diagnose_date")), + _ColumnSpec("surgery", ("patient", "info", "neck_dissection")), + _ColumnSpec("hpv", ("patient", "info", "hpv_status")), + _ColumnSpec("smoke", ("patient", "info", "nicotine_abuse")), + _ColumnSpec("alcohol", ("patient", "info", "alcohol_abuse")), + _ColumnSpec("t_stage", ("tumor", "info", "t_stage")), + _ColumnSpec("n_stage", ("patient", "info", "n_stage")), + _ColumnSpec("m_stage", ("patient", "info", "m_stage")), + _ColumnSpec("midext", ("tumor", "info", "extension")), + _ColumnSpec("subsite", ("tumor", "info", "subsite")), + _ColumnSpec("volume", ("tumor", "info", "volume")), + _ColumnSpec("central", ("tumor", "info", "central")), + ] + ) + + class ModalityConfig(BaseModel): """Define a diagnostic or pathological modality.""" From 8a3680e84508eced1b7c732eca9ac56273d387ba Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Tue, 22 Jul 2025 12:26:54 +0200 Subject: [PATCH 2/4] feat: add `side` as short column to `ly` accessor --- src/lydata/utils.py | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/lydata/utils.py b/src/lydata/utils.py index 12b8daf..5141184 100644 --- a/src/lydata/utils.py +++ b/src/lydata/utils.py @@ -158,10 +158,27 @@ def get_default_column_map_old() -> _ColumnMap: _ColumnSpec("subsite", ("tumor", "1", "subsite")), _ColumnSpec("volume", ("tumor", "1", "volume")), _ColumnSpec("central", ("tumor", "1", "central")), + _ColumnSpec("side", ("tumor", "1", "side")), ] ) +def _new_from_old(long_name: tuple[str, str, str]) -> tuple[str, str, str]: + """Convert an old long name to a new long name. + + >>> new_from_old(("patient", "#", "neck_dissection")) + ('patient', 'info', 'neck_dissection') + >>> new_from_old(("tumor", "1", "t_stage")) + ('tumor', 'info', 't_stage') + >>> new_from_old(("a", "b", "c")) + ('a', 'b', 'c') + """ + start, middle, end = long_name + if (start == "patient" and middle == "#") or (start == "tumor" and middle == "1"): + middle = "info" + return (start, middle, end) + + def get_default_column_map_new() -> _ColumnMap: """Get the old default column map. @@ -187,23 +204,8 @@ def get_default_column_map_new() -> _ColumnMap: """ return _ColumnMap.from_list( [ - _ColumnSpec("id", ("patient", "info", "id")), - _ColumnSpec("institution", ("patient", "info", "institution")), - _ColumnSpec("sex", ("patient", "info", "sex")), - _ColumnSpec("age", ("patient", "info", "age")), - _ColumnSpec("weight", ("patient", "info", "weight")), - _ColumnSpec("date", ("patient", "info", "diagnose_date")), - _ColumnSpec("surgery", ("patient", "info", "neck_dissection")), - _ColumnSpec("hpv", ("patient", "info", "hpv_status")), - _ColumnSpec("smoke", ("patient", "info", "nicotine_abuse")), - _ColumnSpec("alcohol", ("patient", "info", "alcohol_abuse")), - _ColumnSpec("t_stage", ("tumor", "info", "t_stage")), - _ColumnSpec("n_stage", ("patient", "info", "n_stage")), - _ColumnSpec("m_stage", ("patient", "info", "m_stage")), - _ColumnSpec("midext", ("tumor", "info", "extension")), - _ColumnSpec("subsite", ("tumor", "info", "subsite")), - _ColumnSpec("volume", ("tumor", "info", "volume")), - _ColumnSpec("central", ("tumor", "info", "central")), + _ColumnSpec(cs.short, _new_from_old(cs.long)) + for cs in get_default_column_map_old() ] ) From 8da24ab70e9a7f0c0035f6d3b40498251b6c97d3 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Tue, 22 Jul 2025 12:30:13 +0200 Subject: [PATCH 3/4] chore: update changelog --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index de0f732..71febfa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,13 @@ All notable changes to this project will be documented in this file. +## [0.3.2] - 2025-07-22 + +### 🚀 Features + +- Work with new 2nd level headers (see [this lyDATA issue](https://github.com/lycosystem/lydata/issues/21)) +- Add `side` as short column to `ly` accessor + ## [0.3.1] - 2025-07-12 ### 🐛 Bug Fixes From 01c7ca2852c72990b46e011cf6088ff9e3a4ef69 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Tue, 22 Jul 2025 12:33:00 +0200 Subject: [PATCH 4/4] test: fix misspelled docstring --- src/lydata/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lydata/utils.py b/src/lydata/utils.py index 5141184..deaf046 100644 --- a/src/lydata/utils.py +++ b/src/lydata/utils.py @@ -166,11 +166,11 @@ def get_default_column_map_old() -> _ColumnMap: def _new_from_old(long_name: tuple[str, str, str]) -> tuple[str, str, str]: """Convert an old long name to a new long name. - >>> new_from_old(("patient", "#", "neck_dissection")) + >>> _new_from_old(("patient", "#", "neck_dissection")) ('patient', 'info', 'neck_dissection') - >>> new_from_old(("tumor", "1", "t_stage")) + >>> _new_from_old(("tumor", "1", "t_stage")) ('tumor', 'info', 't_stage') - >>> new_from_old(("a", "b", "c")) + >>> _new_from_old(("a", "b", "c")) ('a', 'b', 'c') """ start, middle, end = long_name