diff --git a/CHANGELOG.md b/CHANGELOG.md index de0f732..71febfa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,13 @@ All notable changes to this project will be documented in this file. +## [0.3.2] - 2025-07-22 + +### 🚀 Features + +- Work with new 2nd level headers (see [this lyDATA issue](https://github.com/lycosystem/lydata/issues/21)) +- Add `side` as short column to `ly` accessor + ## [0.3.1] - 2025-07-12 ### 🐛 Bug Fixes diff --git a/src/lydata/accessor.py b/src/lydata/accessor.py index 96f9dc6..8138d49 100644 --- a/src/lydata/accessor.py +++ b/src/lydata/accessor.py @@ -45,7 +45,8 @@ from lydata.utils import ( ModalityConfig, - get_default_column_map, + get_default_column_map_new, + get_default_column_map_old, get_default_modalities, ) from lydata.validator import construct_schema @@ -161,7 +162,7 @@ def __init__( self.colname = column self.operator = operator self.value = value - self._column_map = get_default_column_map() + self._column_map = get_default_column_map_old() def __repr__(self) -> str: """Return a string representation of the query.""" @@ -536,7 +537,7 @@ def _expand_mapping( >>> _expand_mapping({'age': 'foo', 'hpv': 'bar'}) {('patient', '#', 'age'): 'foo', ('patient', '#', 'hpv_status'): 'bar'} """ - _colname_map = colname_map or get_default_column_map().from_short + _colname_map = colname_map or get_default_column_map_old().from_short expanded_map = {} for colname, func in short_map.items(): @@ -560,7 +561,8 @@ class LyDataAccessor: def __init__(self, obj: pd.DataFrame) -> None: """Initialize the accessor with a DataFrame.""" self._obj = obj - self._column_map = get_default_column_map() + self._column_map_old = get_default_column_map_old() + self._column_map_new = get_default_column_map_new() def __contains__(self, key: str) -> bool: """Check if a column is contained in the DataFrame. 
@@ -572,14 +574,32 @@ def __contains__(self, key: str) -> bool: False >>> ("patient", "#", "age") in df.ly True + >>> df = pd.DataFrame({("patient", "info", "age"): [61, 52, 73]}) + >>> "age" in df.ly + True + >>> "foo" in df.ly + False + >>> ("patient", "info", "age") in df.ly + True """ - _key = self._get_safe_long(key) - return _key in self._obj + _key_old = self._get_safe_long_old(key) + _key_new = self._get_safe_long_new(key) + return _key_new in self._obj or _key_old in self._obj def __getitem__(self, key: str) -> pd.Series: """Allow column access by short name, too.""" - _key = self._get_safe_long(key) - return self._obj[_key] + _key_old = self._get_safe_long_old(key) + _key_new = self._get_safe_long_new(key) + + try: + return self._obj[_key_new] + except KeyError as err_from_new: + try: + return self._obj[_key_old] + except KeyError: + raise KeyError( + f"Neither '{_key_new}' nor '{_key_old}' found in DataFrame." + ) from err_from_new def __getattr__(self, name: str) -> Any: """Access columns also by short name. @@ -590,6 +610,12 @@ def __getattr__(self, name: str) -> Any: 1 52 2 73 Name: (patient, #, age), dtype: int64 + >>> df = pd.DataFrame({("patient", "info", "age"): [61, 52, 73]}) + >>> df.ly.age + 0 61 + 1 52 + 2 73 + Name: (patient, info, age), dtype: int64 >>> df.ly.foo Traceback (most recent call last): ... 
@@ -600,9 +626,13 @@ def __getattr__(self, name: str) -> Any: except KeyError as key_err: raise AttributeError(f"Attribute {name!r} not found.") from key_err - def _get_safe_long(self, key: Any) -> tuple[str, str, str]: - """Get the long column name or return the input.""" - return getattr(self._column_map.from_short.get(key), "long", key) + def _get_safe_long_old(self, key: Any) -> tuple[str, str, str]: + """Get the old long column name or return the input.""" + return getattr(self._column_map_old.from_short.get(key), "long", key) + + def _get_safe_long_new(self, key: Any) -> tuple[str, str, str]: + """Get the new long column name or return the input.""" + return getattr(self._column_map_new.from_short.get(key), "long", key) def validate(self, modalities: list[str] | None = None) -> pd.DataFrame: """Validate the DataFrame against the lydata schema. @@ -697,7 +727,7 @@ def stats( The ``agg_funcs`` argument is a mapping of column names to functions that receive a :py:class:`pd.Series` and return a :py:class:`pd.Series`. The default is a useful selection of statistics for the most common columns. E.g., for the - column ``('patient', '#', 'age')`` (or its short column name ``age``), the + column ``('patient', 'info', 'age')`` (or its short column name ``age``), the default function returns the value counts. The ``use_shortnames`` argument determines whether the output should use the @@ -712,11 +742,20 @@ def stats( ... ('tumor', '1', 't_stage'): [2, 3, 1, 2], ... }) >>> df.ly.stats() # doctest: +NORMALIZE_WHITESPACE + {'age': {61: 2, 52: 1, 73: 1}, + 'hpv': {True: 2, False: 1, None: 1}, + 't_stage': {2: 2, 3: 1, 1: 1}} + >>> df = pd.DataFrame({ + ... ('patient', 'info', 'age'): [61, 52, 73, 61], + ... ('patient', 'info', 'hpv_status'): [True, False, None, True], + ... ('tumor', 'info', 't_stage'): [2, 3, 1, 2], + ... 
}) + >>> df.ly.stats() # doctest: +NORMALIZE_WHITESPACE {'age': {61: 2, 52: 1, 73: 1}, 'hpv': {True: 2, False: 1, None: 1}, 't_stage': {2: 2, 3: 1, 1: 1}} """ - _agg_funcs = self._column_map.from_short.copy() + _agg_funcs = self._column_map_new.from_short.copy() _agg_funcs.update(agg_funcs or {}) stats = {} @@ -725,8 +764,8 @@ def stats( continue column = self[colname] - if use_shortnames and colname in self._column_map.from_long: - colname = self._column_map.from_long[colname].short + if use_shortnames and colname in self._column_map_old.from_long: + colname = self._column_map_old.from_long[colname].short stats[colname] = getattr(func(column), f"to_{out_format}")() diff --git a/src/lydata/loader.py b/src/lydata/loader.py index dcaf957..741f2a0 100644 --- a/src/lydata/loader.py +++ b/src/lydata/loader.py @@ -123,14 +123,14 @@ def get_content_file( This method always tries to fetch the most recent version of the file. >>> conf = LyDataset( - ... year=2023, + ... year=2025, ... institution="usz", ... subsite="hypopharynx-larynx", ... repo_name="lycosystem/lydata.private", - ... ref="2023-usz-hypopharynx-larynx", + ... ref="2025-usz-hypopharynx-larynx", ... ) >>> conf.get_content_file() - ContentFile(path="2023-usz-hypopharynx-larynx/data.csv") + ContentFile(path="2025-usz-hypopharynx-larynx/data.csv") """ if self._content_file is not None: if self._content_file.update(): @@ -262,7 +262,7 @@ def available_datasets( '2025-hvh-oropharynx'] >>> avail_gen = available_datasets( ... repo_name="lycosystem/lydata.private", - ... ref="2024-umcg-hypopharynx-larynx", + ... ref="2025-umcg-hypopharynx-larynx", ... use_github=True, ... ) >>> sorted([ds.name for ds in avail_gen]) # doctest: +NORMALIZE_WHITESPACE @@ -270,7 +270,7 @@ def available_datasets( '2021-usz-oropharynx', '2023-clb-multisite', '2023-isb-multisite', - '2024-umcg-hypopharynx-larynx'] + '2025-umcg-hypopharynx-larynx'] >>> avail_gen = available_datasets( ... institution="hvh", ... 
ref="6ac98d", diff --git a/src/lydata/utils.py b/src/lydata/utils.py index 3925067..deaf046 100644 --- a/src/lydata/utils.py +++ b/src/lydata/utils.py @@ -66,7 +66,15 @@ def update_and_expand( @dataclass class _ColumnSpec: - """Class for specifying column names and aggfuncs.""" + """Class for specifying column names and aggfuncs. + + This serves a dual purpose: + + 1. It is a simple container that ties together a short name and a long name. For + this we could have used a `namedtuple` as well. + 2. Every `_ColumnSpec` is also an aggregation function in itself. This is used in + the :py:meth:`~lydata.accessor.LyDataAccessor.stats` method. + """ short: str long: tuple[str, str, str] @@ -108,14 +116,18 @@ def __iter__(self): return iter(self.from_short.values()) -def get_default_column_map() -> _ColumnMap: - """Get the default column map. +def get_default_column_map_old() -> _ColumnMap: + """Get the old default column map. This map defines which short column names can be used to access columns in the DataFrames. >>> from lydata import accessor, loader - >>> df = next(loader.load_datasets(institution="usz")) + >>> df = next(loader.load_datasets( + ... institution="usz", + ... repo_name="lycosystem/lydata.private", + ... ref="ab04379a36b6946306041d1d38ad7e97df8ee7ba", + ... )) >>> df.ly.surgery # doctest: +ELLIPSIS 0 False ... @@ -146,6 +158,54 @@ def get_default_column_map() -> _ColumnMap: _ColumnSpec("subsite", ("tumor", "1", "subsite")), _ColumnSpec("volume", ("tumor", "1", "volume")), _ColumnSpec("central", ("tumor", "1", "central")), + _ColumnSpec("side", ("tumor", "1", "side")), + ] + ) + + +def _new_from_old(long_name: tuple[str, str, str]) -> tuple[str, str, str]: + """Convert an old long name to a new long name. 
+ + >>> _new_from_old(("patient", "#", "neck_dissection")) + ('patient', 'info', 'neck_dissection') + >>> _new_from_old(("tumor", "1", "t_stage")) + ('tumor', 'info', 't_stage') + >>> _new_from_old(("a", "b", "c")) + ('a', 'b', 'c') + """ + start, middle, end = long_name + if (start == "patient" and middle == "#") or (start == "tumor" and middle == "1"): + middle = "info" + return (start, middle, end) + + +def get_default_column_map_new() -> _ColumnMap: + """Get the new default column map. + + This map defines which short column names can be used to access columns in the + DataFrames. + + >>> from lydata import accessor, loader + >>> df = next(loader.load_datasets( + ... institution="usz", + ... repo_name="lycosystem/lydata.private", + ... ref="ce2ac255b8aec7443375b610e5254a46bf236a46", + ... )) + >>> df.ly.surgery # doctest: +ELLIPSIS + 0 False + ... + 286 False + Name: (patient, info, neck_dissection), Length: 287, dtype: bool + >>> df.ly.smoke # doctest: +ELLIPSIS + 0 True + ... + 286 True + Name: (patient, info, nicotine_abuse), Length: 287, dtype: bool + """ + return _ColumnMap.from_list( + [ + _ColumnSpec(cs.short, _new_from_old(cs.long)) + for cs in get_default_column_map_old() ] )