From bd029b0ad3c63f4297ae99426ad76ec5b6022b93 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Tue, 3 Dec 2024 13:07:33 +0100 Subject: [PATCH 1/4] fix: order of sub-/superlevel inference --- lydata/utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lydata/utils.py b/lydata/utils.py index 41d51a3..f0c433e 100644 --- a/lydata/utils.py +++ b/lydata/utils.py @@ -123,8 +123,8 @@ def get_default_modalities() -> dict[str, ModalityConfig]: def infer_all_levels( dataset: pd.DataFrame, - infer_sublevels_kwargs: dict[str, Any] | None = None, infer_superlevels_kwargs: dict[str, Any] | None = None, + infer_sublevels_kwargs: dict[str, Any] | None = None, ) -> pd.DataFrame: """Infer all levels of involvement for each diagnostic modality. @@ -138,14 +138,14 @@ def infer_all_levels( result = dataset.copy() - result = result.join(result.ly.infer_sublevels(**infer_sublevels_kwargs)) - return result.join(result.ly.infer_superlevels(**infer_superlevels_kwargs)) + result = result.join(result.ly.infer_superlevels(**infer_superlevels_kwargs)) + return result.join(result.ly.infer_sublevels(**infer_sublevels_kwargs)) def infer_and_combine_levels( dataset: pd.DataFrame, - infer_sublevels_kwargs: dict[str, Any] | None = None, infer_superlevels_kwargs: dict[str, Any] | None = None, + infer_sublevels_kwargs: dict[str, Any] | None = None, combine_kwargs: dict[str, Any] | None = None, ) -> pd.DataFrame: """Enhance the dataset by inferring additional columns from the data. 
@@ -171,8 +171,8 @@ def infer_and_combine_levels( """ result = infer_all_levels( dataset, - infer_sublevels_kwargs=infer_sublevels_kwargs, infer_superlevels_kwargs=infer_superlevels_kwargs, + infer_sublevels_kwargs=infer_sublevels_kwargs, ) combine_kwargs = combine_kwargs or {} max_llh = pd.concat( From 57f926fd5812b7b7b712dd6024539a9fe81aad1a Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Tue, 3 Dec 2024 13:53:48 +0100 Subject: [PATCH 2/4] fix: don't ignore present sub-/superlvl cols Ignoring a column just because it is present is dangerous: It could contain `None` values. Therefore, I stop ignoring them. --- lydata/accessor.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/lydata/accessor.py b/lydata/accessor.py index 80ba032..d41bd0a 100644 --- a/lydata/accessor.py +++ b/lydata/accessor.py @@ -785,9 +785,6 @@ def infer_sublevels( for subid in subids: sublevel = superlevel + subid - if sublevel in self._obj[modality, side]: - continue - result.loc[is_healthy, (modality, side, sublevel)] = False result.loc[~is_healthy, (modality, side, sublevel)] = None @@ -851,9 +848,6 @@ def infer_superlevels( except KeyError: continue - if superlevel in self._obj[modality, side]: - continue - result.loc[are_all_healthy, (modality, side, superlevel)] = False result.loc[is_any_involved, (modality, side, superlevel)] = True result.loc[is_unknown, (modality, side, superlevel)] = None From 7ba9927d320add811921b487b5d3eccce2bcfa16 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Tue, 3 Dec 2024 13:55:52 +0100 Subject: [PATCH 3/4] feat(utils): add better update func for pandas This new `update_and_expand()` function does what pandas' `.update()` method does, but additionally adds new columns from the updating `DataFrame`. This is important for inferring the sub- and superlevel involvement. 
--- lydata/utils.py | 43 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/lydata/utils.py b/lydata/utils.py index f0c433e..5b2cdb0 100644 --- a/lydata/utils.py +++ b/lydata/utils.py @@ -8,6 +8,37 @@ from pydantic import BaseModel, Field +def update_and_expand( + left: pd.DataFrame, + right: pd.DataFrame, + **update_kwargs: Any, +) -> pd.DataFrame: + """Update ``left`` with values from ``right``, also adding new columns. + + The added feature of this function over pandas' :py:meth:`~pandas.DataFrame.update` + is that it also adds columns that are present in ``right`` but not in ``left``. + + Any keyword arguments are also directly passed to the + :py:meth:`~pandas.DataFrame.update`. + + >>> left = pd.DataFrame({"a": [1, 2, None], "b": [3, 4, 5]}) + >>> right = pd.DataFrame({"a": [None, 3, 4], "c": [6, 7, 8]}) + >>> update_and_expand(left, right) + a b c + 0 1.0 3 6 + 1 3.0 4 7 + 2 4.0 5 8 + """ + result = left.copy() + result.update(right, **update_kwargs) + + for column in right.columns: + if column not in result.columns: + result[column] = right[column] + + return result + + @dataclass class _ColumnSpec: """Class for specifying column names and aggfuncs.""" @@ -128,7 +159,7 @@ def infer_all_levels( ) -> pd.DataFrame: """Infer all levels of involvement for each diagnostic modality. - This function first infers sublevel (e.g. 'IIa", and 'IIb') involvement for each + This function first infers sublevel (e.g. 'IIa', and 'IIb') involvement for each modality using :py:meth:`~lydata.accessor.LyDataAccessor.infer_sublevels`. Then, it infers superlevel (e.g. 'II') involvement for each modality using :py:meth:`~lydata.accessor.LyDataAccessor.infer_superlevels`. 
@@ -138,8 +169,14 @@ def infer_all_levels( result = dataset.copy() - result = result.join(result.ly.infer_superlevels(**infer_superlevels_kwargs)) - return result.join(result.ly.infer_sublevels(**infer_sublevels_kwargs)) + result = update_and_expand( + left=result, + right=result.ly.infer_superlevels(**infer_superlevels_kwargs), + ) + return update_and_expand( + left=result, + right=result.ly.infer_sublevels(**infer_sublevels_kwargs), + ) def infer_and_combine_levels( From 0a350b99c5093e671d7b874a6e0027e2023a38dd Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Tue, 3 Dec 2024 14:56:54 +0100 Subject: [PATCH 4/4] chore: update changelog --- CHANGELOG.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ce8ca27..4801cad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,17 @@ All notable changes to this project will be documented in this file. +## [0.2.2] - 2024-12-03 + +### 🚀 Features + +- *(utils)* Add better update func for pandas + +### 🐛 Bug Fixes + +- Order of sub-/superlevel inference +- Don't ignore present sub-/superlvl cols + ## [0.2.1] - 2024-11-29 ### 🐛 Bug Fixes @@ -191,6 +202,7 @@ Initial implementation of the lyDATA library. +[0.2.2]: https://github.com/rmnldwg/lydata/compare/0.2.1..0.2.2 [0.2.1]: https://github.com/rmnldwg/lydata/compare/0.2.0..0.2.1 [0.2.0]: https://github.com/rmnldwg/lydata/compare/0.1.2..0.2.0 [0.1.2]: https://github.com/rmnldwg/lydata/compare/0.1.1..0.1.2