From 8cd2f0397b117961e4fd395ebaef71e89328d63d Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Fri, 29 Nov 2024 11:28:48 +0100 Subject: [PATCH 1/3] test: check whether #13 was fixed --- lydata/accessor.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lydata/accessor.py b/lydata/accessor.py index 7683375..2debaf0 100644 --- a/lydata/accessor.py +++ b/lydata/accessor.py @@ -741,9 +741,9 @@ def combine( :py:meth:`~pandas.DataFrame.update` the original DataFrame with the result. >>> df = pd.DataFrame({ - ... ('MRI' , 'ipsi', 'I'): [False, True , True , None], - ... ('CT' , 'ipsi', 'I'): [False, True , False, True], - ... ('pathology', 'ipsi', 'I'): [True , None , False, None], + ... ('MRI' , 'ipsi', 'I'): [False, True , True , None, None], + ... ('CT' , 'ipsi', 'I'): [False, True , False, True, None], + ... ('pathology', 'ipsi', 'I'): [True , None , False, None, None], ... }) >>> df.ly.combine() # doctest: +NORMALIZE_WHITESPACE ipsi @@ -752,6 +752,7 @@ def combine( 1 True 2 False 3 True + 4 None """ modalities = modalities or get_default_modalities() modalities = { From 831e44c82b1f7b1f036bf61733e0a920575a590a Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Fri, 29 Nov 2024 14:48:29 +0100 Subject: [PATCH 2/3] fix: all `None` diagnoses are preserved Fixes: #13 --- lydata/accessor.py | 176 ++++++++++++++++----------------------------- 1 file changed, 61 insertions(+), 115 deletions(-) diff --git a/lydata/accessor.py b/lydata/accessor.py index 2debaf0..80ba032 100644 --- a/lydata/accessor.py +++ b/lydata/accessor.py @@ -424,105 +424,41 @@ def align_diagnoses( return diagnosis_stack -def _create_raising_func(method: str): - """Raise ValueError for wrong ``method``.""" +def _stack_to_float_matrix(diagnosis_stack: list[pd.DataFrame]) -> np.ndarray: + """Convert diagnosis stack to 3D array of floats with ``Nones`` as ``np.nan``.""" + diagnosis_matrix = 
np.array(diagnosis_stack) + diagnosis_matrix[pd.isna(diagnosis_matrix)] = np.nan + return np.astype(diagnosis_matrix, float) - def raise_value_err(*args, **kwargs): - raise ValueError(f"Unknown method {method}") - - return raise_value_err - - -def _false_estimate( - obs: np.ndarray, - false_pos_probs: np.ndarray, - true_neg_probs: np.ndarray, - method: Literal["prod", "max"], -) -> float: - """Compute estimate of ``False``, given ``obs``. - >>> _false_estimate([True, False], [0.1, 0.6], [0.4, 0.7], method="whatever") - Traceback (most recent call last): - ... - ValueError: Unknown method whatever - """ - false_llhs = np.where(obs, false_pos_probs, true_neg_probs) - nans_masked = np.where( - pd.isna(obs), - 1.0 if method == "prod" else 0.0, - false_llhs, - ) - method = getattr(np, method, _create_raising_func(method)) - return method(nans_masked) - - -def _true_estimate( - obs: np.ndarray, - true_pos_probs: np.ndarray, - false_neg_probs: np.ndarray, - method: Literal["prod", "max"], -) -> float: - """Compute estimate of ``True``, given ``obs``. - - >>> obs = [True, False, np.nan] - >>> true_pos_probs = [0.8, 0.6, 0.9] - >>> false_neg_probs = [0.6, 0.7, 0.9] - >>> _true_estimate(obs, true_pos_probs, false_neg_probs, method="max") - np.float64(0.8) - >>> tmp = _true_estimate(obs, true_pos_probs, false_neg_probs, method="prod") - >>> np.isclose(tmp, 0.56) - np.True_ - """ - true_llhs = np.where(obs, true_pos_probs, false_neg_probs) - nans_masked = np.where( - pd.isna(obs), - 1.0 if method == "prod" else 0.0, - true_llhs, - ) - method = getattr(np, method, _create_raising_func(method)) - return method(nans_masked) - - -def _max_likelihood( - obs: np.ndarray, - specificities: np.ndarray, +def _evaluate_likelihood_ratios( + diagnosis_matrix: np.ndarray, sensitivities: np.ndarray, -) -> bool: - """Compute most likely true state based on all ``obs``. 
- - >>> obs = np.array([True, False, np.nan, None]) - >>> sensitivities = np.array([0.9, 0.7, 0.7, 0.7]) - >>> specificities = np.array([0.9, 0.7, 0.7, 0.7]) - >>> _max_likelihood(obs, sensitivities, specificities) - np.True_ - >>> obs = np.array([True, False, False, False]) - >>> _max_likelihood(obs, sensitivities, specificities) - np.False_ + specificities: np.ndarray, + method: Literal["max_llh", "rank"], +) -> np.ndarray: + """Compare the likelihoods of true/false diagnoses using the given ``method``. + + The ``diagnosis_matrix`` is a 3D array of shape ``(n_modalities, n_patients, + n_levels)``. The ``sensitivities`` and ``specificities`` are 1D arrays of shape + ``(n_modalities,)``. When choosing the ``method="max_llh"``, the likelihood of each + diagnosis is combined into one likelihood for each patient and level. With + ``method="rank"``, the most trustworthy diagnosis is chosen for each patient and + level. """ - healthy_llh = _false_estimate(obs, 1 - specificities, specificities, method="prod") - involved_llhs = _true_estimate(obs, sensitivities, 1 - sensitivities, method="prod") - return healthy_llh < involved_llhs + true_pos = sensitivities[:, None, None] * diagnosis_matrix + false_neg = (1 - sensitivities[:, None, None]) * (1 - diagnosis_matrix) + true_neg = specificities[:, None, None] * (1 - diagnosis_matrix) + false_pos = (1 - specificities[:, None, None]) * diagnosis_matrix + if method not in {"max_llh", "rank"}: + raise ValueError(f"Unknown method {method}") -def _rank_trustworthy( - obs: np.ndarray, - specificities: np.ndarray, - sensitivities: np.ndarray, -) -> bool: - """Estimate true state based on most trustworthy value in ``obs``. 
- - >>> obs = np.array([True, False, np.nan, None]) - >>> sensitivities = np.array([0.9, 0.7, 0.7, 0.7]) - >>> specificities = np.array([0.9, 0.7, 0.7, 0.7]) - >>> _rank_trustworthy(obs, sensitivities, specificities) - np.True_ - >>> obs = np.array([True, False, False, False]) - >>> _rank_trustworthy(obs, sensitivities, specificities) - np.True_ - """ - healthy_llh = _false_estimate(obs, 1 - specificities, specificities, method="max") - involved_llhs = _true_estimate(obs, sensitivities, 1 - sensitivities, method="max") - return healthy_llh < involved_llhs + agg_func = np.nanprod if method == "max_llh" else np.nanmax + true_llh = agg_func(true_pos + false_neg, axis=0) + false_llh = agg_func(true_neg + false_pos, axis=0) + + return true_llh >= false_llh def _expand_mapping( @@ -726,6 +662,18 @@ def stats( return stats + def _filter_and_sort_modalities( + self, + modalities: dict[str, ModalityConfig] | None = None, + ) -> dict[str, ModalityConfig]: + """Return only those ``modalities`` present in data and sorted as in data.""" + modalities = modalities or get_default_modalities() + return { + modality_name: modality_config + for modality_name, modality_config in modalities.items() + if modality_name in self.get_modalities() + } + def combine( self, modalities: dict[str, ModalityConfig] | None = None, @@ -733,17 +681,20 @@ def combine( ) -> pd.DataFrame: """Combine diagnoses of ``modalities`` using ``method``. - The details of what the ``method`` does and how can be found in their - respective documentations: :py:func:`max_likelihood` and - :py:func:`rank_trustworthy`. + The order of the provided ``modalities`` does not matter, as it is aligned + with the order in the DataFrame. With ``method="max_llh"``, the most likely + true state of involvement is inferred based on all available diagnoses for + each patient and level. 
With ``method="rank"``, only the most trustworthy + diagnosis is chosen for each patient and level based on the sensitivity and + specificity of the given list of ``modalities``. The result contains only the combined columns. The intended use is to :py:meth:`~pandas.DataFrame.update` the original DataFrame with the result. >>> df = pd.DataFrame({ - ... ('MRI' , 'ipsi', 'I'): [False, True , True , None, None], - ... ('CT' , 'ipsi', 'I'): [False, True , False, True, None], - ... ('pathology', 'ipsi', 'I'): [True , None , False, None, None], + ... ('CT' , 'ipsi', 'I'): [False, True , False, True, None], + ... ('MRI' , 'ipsi', 'I'): [False, True , True , None, None], + ... ('pathology', 'ipsi', 'I'): [True , None , None, False, None], ... }) >>> df.ly.combine() # doctest: +NORMALIZE_WHITESPACE ipsi @@ -751,29 +702,24 @@ def combine( 0 True 1 True 2 False - 3 True + 3 False 4 None """ - modalities = modalities or get_default_modalities() - modalities = { - modality_name: modality_config - for modality_name, modality_config in modalities.items() - if modality_name in self.get_modalities() - } + modalities = self._filter_and_sort_modalities(modalities) diagnosis_stack = align_diagnoses(self._obj, list(modalities.keys())) - columns = diagnosis_stack[0].columns - diagnosis_stack = np.array(diagnosis_stack) - - funcs1d = {"max_llh": _max_likelihood, "rank": _rank_trustworthy} - result = np.apply_along_axis( - func1d=funcs1d[method], - axis=0, - arr=diagnosis_stack, + diagnosis_matrix = _stack_to_float_matrix(diagnosis_stack) + all_nan_mask = np.all(np.isnan(diagnosis_matrix), axis=0) + + result = _evaluate_likelihood_ratios( + diagnosis_matrix=diagnosis_matrix, sensitivities=np.array([mod.sens for mod in modalities.values()]), specificities=np.array([mod.spec for mod in modalities.values()]), + method=method, ) - return pd.DataFrame(result, columns=columns) + result = np.astype(result, object) + result[all_nan_mask] = None + return pd.DataFrame(result, 
columns=diagnosis_stack[0].columns) def infer_sublevels( self, From 34946bfd1c75500fbe4d02958d2ce421c1a062cd Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Fri, 29 Nov 2024 15:05:36 +0100 Subject: [PATCH 3/3] chore: update changelog --- CHANGELOG.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index abf7eae..ce8ca27 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,16 @@ All notable changes to this project will be documented in this file. +## [0.2.1] - 2024-11-29 + +### 🐛 Bug Fixes + +- If an LNL of a patient was unobserved (i.e., all diagnoses `None`), then the method `ly.combine()` returns `None` for that patient's LNL. Fixes [#13] + +### 🧪 Testing + +- Change the doctest of `ly.combine()` to check whether [#13] was fixed. + ## [0.2.0] - 2024-11-14 ### 🚀 Features @@ -181,6 +191,7 @@ Initial implementation of the lyDATA library. +[0.2.1]: https://github.com/rmnldwg/lydata/compare/0.2.0..0.2.1 [0.2.0]: https://github.com/rmnldwg/lydata/compare/0.1.2..0.2.0 [0.1.2]: https://github.com/rmnldwg/lydata/compare/0.1.1..0.1.2 [0.1.1]: https://github.com/rmnldwg/lydata/compare/0.1.0..0.1.1 @@ -192,3 +203,4 @@ Initial implementation of the lyDATA library. [#2]: https://github.com/rmnldwg/lydata/issues/2 [#4]: https://github.com/rmnldwg/lydata/issues/4 +[#13]: https://github.com/rmnldwg/lydata/issues/13