Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@

All notable changes to this project will be documented in this file.

## [0.3.2] - 2025-07-22

### 🚀 Features

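- Support the new second-level column headers, i.e. `info` in place of `#` (patient) and `1` (tumor); the old headers keep working (see [this lyDATA issue](https://github.com/lycosystem/lydata/issues/21))
- Add `side` as a short column name to the `ly` accessor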

## [0.3.1] - 2025-07-12

### 🐛 Bug Fixes
Expand Down
69 changes: 54 additions & 15 deletions src/lydata/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@

from lydata.utils import (
ModalityConfig,
get_default_column_map,
get_default_column_map_new,
get_default_column_map_old,
get_default_modalities,
)
from lydata.validator import construct_schema
Expand Down Expand Up @@ -161,7 +162,7 @@ def __init__(
self.colname = column
self.operator = operator
self.value = value
self._column_map = get_default_column_map()
self._column_map = get_default_column_map_old()

def __repr__(self) -> str:
"""Return a string representation of the query."""
Expand Down Expand Up @@ -536,7 +537,7 @@ def _expand_mapping(
>>> _expand_mapping({'age': 'foo', 'hpv': 'bar'})
{('patient', '#', 'age'): 'foo', ('patient', '#', 'hpv_status'): 'bar'}
"""
_colname_map = colname_map or get_default_column_map().from_short
_colname_map = colname_map or get_default_column_map_old().from_short
expanded_map = {}

for colname, func in short_map.items():
Expand All @@ -560,7 +561,8 @@ class LyDataAccessor:
def __init__(self, obj: pd.DataFrame) -> None:
    """Initialize the accessor with a DataFrame.

    Both the old and the new default column map are stored, so that a short
    column name can be resolved against either second-level header layout.
    """
    self._obj = obj
    # The single ``_column_map`` attribute was superseded by the two maps below;
    # ``get_default_column_map`` was renamed to ``get_default_column_map_old``.
    self._column_map_old = get_default_column_map_old()
    self._column_map_new = get_default_column_map_new()

def __contains__(self, key: str) -> bool:
"""Check if a column is contained in the DataFrame.
Expand All @@ -572,14 +574,32 @@ def __contains__(self, key: str) -> bool:
False
>>> ("patient", "#", "age") in df.ly
True
>>> df = pd.DataFrame({("patient", "info", "age"): [61, 52, 73]})
>>> "age" in df.ly
True
>>> "foo" in df.ly
False
>>> ("patient", "info", "age") in df.ly
True
"""
_key = self._get_safe_long(key)
return _key in self._obj
_key_old = self._get_safe_long_old(key)
_key_new = self._get_safe_long_new(key)
return _key_new in self._obj or _key_old in self._obj

def __getitem__(self, key: str) -> pd.Series:
    """Allow column access by short name, too.

    The key is resolved against the new column map first and the old column map
    second. A key that is in neither map is passed to the DataFrame unchanged.

    Raises:
        KeyError: If neither the new nor the old resolved key is a column of the
            DataFrame.
    """
    _key_new = self._get_safe_long_new(key)
    _key_old = self._get_safe_long_old(key)

    try:
        return self._obj[_key_new]
    except KeyError as err_from_new:
        # An unmapped key resolves to itself in both maps (same object), and a
        # mapped key may have identical old and new long names. In both cases a
        # second lookup cannot succeed, so surface the original error as is.
        if _key_old is _key_new or _key_old == _key_new:
            raise
        try:
            return self._obj[_key_old]
        except KeyError:
            raise KeyError(
                f"Neither '{_key_new}' nor '{_key_old}' found in DataFrame."
            ) from err_from_new

def __getattr__(self, name: str) -> Any:
"""Access columns also by short name.
Expand All @@ -590,6 +610,12 @@ def __getattr__(self, name: str) -> Any:
1 52
2 73
Name: (patient, #, age), dtype: int64
>>> df = pd.DataFrame({("patient", "info", "age"): [61, 52, 73]})
>>> df.ly.age
0 61
1 52
2 73
Name: (patient, info, age), dtype: int64
>>> df.ly.foo
Traceback (most recent call last):
...
Expand All @@ -600,9 +626,13 @@ def __getattr__(self, name: str) -> Any:
except KeyError as key_err:
raise AttributeError(f"Attribute {name!r} not found.") from key_err

def _get_safe_long(self, key: Any) -> tuple[str, str, str]:
    """Resolve ``key`` to its long column name, or hand ``key`` back unchanged."""
    spec = self._column_map.from_short.get(key)
    # ``spec`` is ``None`` for unknown keys, in which case the default kicks in.
    return getattr(spec, "long", key)
def _get_safe_long_old(self, key: Any) -> tuple[str, str, str]:
    """Resolve ``key`` via the old column map, or hand ``key`` back unchanged."""
    spec = self._column_map_old.from_short.get(key)
    # ``spec`` is ``None`` for unknown keys, in which case the default kicks in.
    return getattr(spec, "long", key)

def _get_safe_long_new(self, key: Any) -> tuple[str, str, str]:
    """Resolve ``key`` via the new column map, or hand ``key`` back unchanged."""
    spec = self._column_map_new.from_short.get(key)
    # ``spec`` is ``None`` for unknown keys, in which case the default kicks in.
    return getattr(spec, "long", key)

def validate(self, modalities: list[str] | None = None) -> pd.DataFrame:
"""Validate the DataFrame against the lydata schema.
Expand Down Expand Up @@ -697,7 +727,7 @@ def stats(
The ``agg_funcs`` argument is a mapping of column names to functions that
receive a :py:class:`pd.Series` and return a :py:class:`pd.Series`. The default
is a useful selection of statistics for the most common columns. E.g., for the
column ``('patient', '#', 'age')`` (or its short column name ``age``), the
column ``('patient', 'info', 'age')`` (or its short column name ``age``), the
default function returns the value counts.

The ``use_shortnames`` argument determines whether the output should use the
Expand All @@ -712,11 +742,20 @@ def stats(
... ('tumor', '1', 't_stage'): [2, 3, 1, 2],
... })
>>> df.ly.stats() # doctest: +NORMALIZE_WHITESPACE
{'age': {61: 2, 52: 1, 73: 1},
'hpv': {True: 2, False: 1, None: 1},
't_stage': {2: 2, 3: 1, 1: 1}}
>>> df = pd.DataFrame({
... ('patient', 'info', 'age'): [61, 52, 73, 61],
... ('patient', 'info', 'hpv_status'): [True, False, None, True],
... ('tumor', 'info', 't_stage'): [2, 3, 1, 2],
... })
>>> df.ly.stats() # doctest: +NORMALIZE_WHITESPACE
{'age': {61: 2, 52: 1, 73: 1},
'hpv': {True: 2, False: 1, None: 1},
't_stage': {2: 2, 3: 1, 1: 1}}
"""
_agg_funcs = self._column_map.from_short.copy()
_agg_funcs = self._column_map_new.from_short.copy()
_agg_funcs.update(agg_funcs or {})
stats = {}

Expand All @@ -725,8 +764,8 @@ def stats(
continue

column = self[colname]
if use_shortnames and colname in self._column_map.from_long:
colname = self._column_map.from_long[colname].short
if use_shortnames and colname in self._column_map_old.from_long:
colname = self._column_map_old.from_long[colname].short

stats[colname] = getattr(func(column), f"to_{out_format}")()

Expand Down
10 changes: 5 additions & 5 deletions src/lydata/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,14 +123,14 @@ def get_content_file(
This method always tries to fetch the most recent version of the file.

>>> conf = LyDataset(
... year=2023,
... year=2025,
... institution="usz",
... subsite="hypopharynx-larynx",
... repo_name="lycosystem/lydata.private",
... ref="2023-usz-hypopharynx-larynx",
... ref="2025-usz-hypopharynx-larynx",
... )
>>> conf.get_content_file()
ContentFile(path="2023-usz-hypopharynx-larynx/data.csv")
ContentFile(path="2025-usz-hypopharynx-larynx/data.csv")
"""
if self._content_file is not None:
if self._content_file.update():
Expand Down Expand Up @@ -262,15 +262,15 @@ def available_datasets(
'2025-hvh-oropharynx']
>>> avail_gen = available_datasets(
... repo_name="lycosystem/lydata.private",
... ref="2024-umcg-hypopharynx-larynx",
... ref="2025-umcg-hypopharynx-larynx",
... use_github=True,
... )
>>> sorted([ds.name for ds in avail_gen]) # doctest: +NORMALIZE_WHITESPACE
['2021-clb-oropharynx',
'2021-usz-oropharynx',
'2023-clb-multisite',
'2023-isb-multisite',
'2024-umcg-hypopharynx-larynx']
'2025-umcg-hypopharynx-larynx']
>>> avail_gen = available_datasets(
... institution="hvh",
... ref="6ac98d",
Expand Down
68 changes: 64 additions & 4 deletions src/lydata/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,15 @@ def update_and_expand(

@dataclass
class _ColumnSpec:
"""Class for specifying column names and aggfuncs."""
"""Class for specifying column names and aggfuncs.

This serves a dual purpose:

1. It is a simple container that ties together a short name and a long name. For
this we could have used a `namedtuple` as well.
2. Every `_ColumnSpec` is also an aggregation function in itself. This is used in
the :py:meth:`~lydata.accessor.LyDataAccessor.stats` method.
"""

short: str
long: tuple[str, str, str]
Expand Down Expand Up @@ -108,14 +116,18 @@ def __iter__(self):
return iter(self.from_short.values())


def get_default_column_map() -> _ColumnMap:
"""Get the default column map.
def get_default_column_map_old() -> _ColumnMap:
"""Get the old default column map.

This map defines which short column names can be used to access columns in the
DataFrames.

>>> from lydata import accessor, loader
>>> df = next(loader.load_datasets(institution="usz"))
>>> df = next(loader.load_datasets(
... institution="usz",
... repo_name="lycosystem/lydata.private",
... ref="ab04379a36b6946306041d1d38ad7e97df8ee7ba",
... ))
>>> df.ly.surgery # doctest: +ELLIPSIS
0 False
...
Expand Down Expand Up @@ -146,6 +158,54 @@ def get_default_column_map() -> _ColumnMap:
_ColumnSpec("subsite", ("tumor", "1", "subsite")),
_ColumnSpec("volume", ("tumor", "1", "volume")),
_ColumnSpec("central", ("tumor", "1", "central")),
_ColumnSpec("side", ("tumor", "1", "side")),
]
)


def _new_from_old(long_name: tuple[str, str, str]) -> tuple[str, str, str]:
    """Translate an old-style long column name into its new-style counterpart.

    Only the second level changes: ``"#"`` under ``"patient"`` and ``"1"`` under
    ``"tumor"`` both become ``"info"``. Any other name is returned as is.

    >>> _new_from_old(("patient", "#", "neck_dissection"))
    ('patient', 'info', 'neck_dissection')
    >>> _new_from_old(("tumor", "1", "t_stage"))
    ('tumor', 'info', 't_stage')
    >>> _new_from_old(("a", "b", "c"))
    ('a', 'b', 'c')
    """
    top, second, third = long_name
    legacy_headers = (("patient", "#"), ("tumor", "1"))
    if (top, second) in legacy_headers:
        return (top, "info", third)
    return (top, second, third)


def get_default_column_map_new() -> _ColumnMap:
    """Get the new default column map.

    This map defines which short column names can be used to access columns in
    DataFrames that use the new second-level headers. It is built from the old
    default column map: every short name is kept and every long name is passed
    through ``_new_from_old``.

    >>> from lydata import accessor, loader
    >>> df = next(loader.load_datasets(
    ...     institution="usz",
    ...     repo_name="lycosystem/lydata.private",
    ...     ref="ce2ac255b8aec7443375b610e5254a46bf236a46",
    ... ))
    >>> df.ly.surgery   # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
    0 False
    ...
    286 False
    Name: (patient, info, neck_dissection), Length: 287, dtype: bool
    >>> df.ly.smoke     # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
    0 True
    ...
    286 True
    Name: (patient, info, nicotine_abuse), Length: 287, dtype: bool
    """
    return _ColumnMap.from_list(
        [
            _ColumnSpec(cs.short, _new_from_old(cs.long))
            for cs in get_default_column_map_old()
        ]
    )

Expand Down
Loading