From 7bc5c85003b00c23c09d6c7a66a8ae9a11dbfeec Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Thu, 17 Jul 2025 09:17:39 +0200 Subject: [PATCH 1/5] docs: fix `join` cmd example call --- src/lyscripts/data/join.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/lyscripts/data/join.py b/src/lyscripts/data/join.py index 68f8d46..6ecd8df 100644 --- a/src/lyscripts/data/join.py +++ b/src/lyscripts/data/join.py @@ -29,9 +29,10 @@ def cli_cmd(self) -> None: .. code-block:: bash - lydata join \ - --inputs='["data.source": "file1.csv", "data.source": "file2.csv"]' \ - --output="joined.csv" + lyscripts data join \ + --inputs '{"source": "file1.csv"}' \ + --inputs '{"source": "file2.csv"}' \ + --output-file "joined.csv" But it also allows for concatenating datasets fetched directly from the `lydata Github repo`_. Due to the rather complex command signature, we @@ -51,7 +52,7 @@ def cli_cmd(self) -> None: .. code-block:: bash - lydata join --configs=datasets.ly.yaml --output=joined.csv + lyscripts data join --configs datasets.ly.yaml --output-file joined.csv .. _pydantic: https://docs.pydantic.dev/latest/ .. _lydata Github repo: https://github.com/rmnldwg/lydata From a3f015dc313447ba9e7b3d028bdd304cdd15b017 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Tue, 22 Jul 2025 14:28:26 +0200 Subject: [PATCH 2/5] change: access only via lydata for compatibility We have changed the lydata 2nd level headers slightly for the patient and tumor info (see https://github.com/lycosystem/lydata/issues/21 for more info). Since the lydata package was already updated to be compatible with that change, we simply need to route every access the lyscripts make to the data through lydata package and hence be compatible too. 
--- pyproject.toml | 2 +- src/lyscripts/compute/prevalences.py | 2 +- src/lyscripts/data/lyproxify.py | 8 +++++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 22eb330..9fa9e78 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,7 @@ dependencies = [ "pydantic", "pydantic-settings >= 2.7.0, != 2.9.1, != 2.9.0", "numpydantic", - "lydata >= 0.2.5", + "lydata >= 0.3.2", "loguru", ] dynamic = ["version"] diff --git a/src/lyscripts/compute/prevalences.py b/src/lyscripts/compute/prevalences.py index 21c2548..8097a42 100644 --- a/src/lyscripts/compute/prevalences.py +++ b/src/lyscripts/compute/prevalences.py @@ -139,7 +139,7 @@ def observe_prevalence( QueryPortion(match=np.int64(7), total=np.int64(79)) """ mapping = mapping or DataConfig.model_fields["mapping"].default_factory() - data["tumor", "1", "t_stage"] = data.ly.t_stage.map(mapping) + data.ly.t_stage = data.ly.t_stage.map(mapping) has_t_stage = C("t_stage").isin(scenario_config.t_stages) if scenario_config.midext is None: diff --git a/src/lyscripts/data/lyproxify.py b/src/lyscripts/data/lyproxify.py index 02154f8..a3bed39 100644 --- a/src/lyscripts/data/lyproxify.py +++ b/src/lyscripts/data/lyproxify.py @@ -12,8 +12,10 @@ from pathlib import Path from typing import Annotated, Any +import lydata # noqa: F401 import pandas as pd from loguru import logger +from lydata import C from pydantic import AfterValidator, Field, FilePath from lyscripts.cli import assemble_main @@ -111,7 +113,7 @@ def cli_cmd(self) -> None: reduced = exclude_patients(trimmed, mapping.EXCLUDE) processed = transform_to_lyprox(reduced, mapping.COLUMN_MAP) - if ("tumor", "1", "side") in processed.columns: + if "side" in processed.ly: processed = leftright_to_ipsicontra(processed) save_table_to_csv(file_path=self.output_file, table=processed) @@ -289,8 +291,8 @@ def leftright_to_ipsicontra(data: pd.DataFrame): involvement. 
""" len_before = len(data) - left_data = data.loc[data["tumor", "1", "side"] != "right"] - right_data = data.loc[data["tumor", "1", "side"] == "right"] + left_data = data.ly.query(C("side") != "right") + right_data = data.ly.query(C("side") == "right") left_data = left_data.rename(columns={"left": "ipsi"}, level=1) left_data = left_data.rename(columns={"right": "contra"}, level=1) From 9f4947340138081acc3c8a2754ab00081175f86a Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Tue, 22 Jul 2025 14:50:38 +0200 Subject: [PATCH 3/5] feat: add `data fetch` command Fixes: #75 --- src/lyscripts/data/__init__.py | 2 ++ src/lyscripts/data/fetch.py | 57 ++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 src/lyscripts/data/fetch.py diff --git a/src/lyscripts/data/__init__.py b/src/lyscripts/data/__init__.py index 7e4f54f..cd1fb8d 100644 --- a/src/lyscripts/data/__init__.py +++ b/src/lyscripts/data/__init__.py @@ -18,6 +18,7 @@ from lyscripts.data import ( # noqa: F401 enhance, + fetch, generate, join, lyproxify, @@ -34,6 +35,7 @@ class DataCLI(BaseSettings): lyproxify: CliSubCommand[lyproxify.LyproxifyCLI] join: CliSubCommand[join.JoinCLI] split: CliSubCommand[split.SplitCLI] + fetch: CliSubCommand[fetch.FetchCLI] filter: CliSubCommand[filter_.FilterCLI] enhance: CliSubCommand[enhance.EnhanceCLI] generate: CliSubCommand[generate.GenerateCLI] diff --git a/src/lyscripts/data/fetch.py b/src/lyscripts/data/fetch.py new file mode 100644 index 0000000..dccbf87 --- /dev/null +++ b/src/lyscripts/data/fetch.py @@ -0,0 +1,57 @@ +"""Small command to fetch the data from a remote using the lydata package.""" + +from pathlib import Path + +import lydata # noqa: F401 +from loguru import logger +from lydata.loader import LyDataset +from pydantic import Field + +from lyscripts.cli import assemble_main +from lyscripts.configs import BaseCLI + + +class FetchCLI(LyDataset, BaseCLI): + """Fetch a specific dataset from 
the lyDATA repository.""" + + github_token: str | None = Field( + default=None, + description=( + "GitHub token to access private datasets. Can also be provided as " + "`GITHUB_TOKEN` environment variable." + ), + ) + github_user: str | None = Field( + default=None, + description=( + "GitHub user for non-token login. Can also be provided as " + "`GITHUB_USER` environment variable." + ), + ) + github_password: str | None = Field( + default=None, + description=( + "GitHub password for non-token login. Can also be provided as " + "`GITHUB_PASSWORD` environment variable." + ), + ) + output_file: Path = Field(description="The path to save the dataset to.") + + def cli_cmd(self): + """Execute the ``fetch`` command.""" + logger.enable("lydata") + logger.debug(self.model_dump_json(indent=2)) + + dataset = self.get_dataframe( + use_github=True, + token=self.github_token, + user=self.github_user, + password=self.github_password, + ) + dataset.to_csv(self.output_file, index=False) + logger.success(f"Fetched dataset and saved to {self.output_file}") + + +if __name__ == "__main__": + main = assemble_main(settings_cls=FetchCLI, prog_name="fetch") + main() From 1061d839c6ecca077b4b5dca6e30a94ba708df6d Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Tue, 22 Jul 2025 16:39:13 +0200 Subject: [PATCH 4/5] change: make cli work with new lydata format This is related to https://github.com/lycosystem/lydata/issues/21 --- pyproject.toml | 5 ++++- src/lyscripts/compute/prevalences.py | 4 +++- src/lyscripts/data/lyproxify.py | 4 ++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9fa9e78..f3f409b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,7 @@ dependencies = [ "pydantic", "pydantic-settings >= 2.7.0, != 2.9.1, != 2.9.0", "numpydantic", - "lydata >= 0.3.2", + "lydata >= 0.3.3", "loguru", ] dynamic = ["version"] @@ -193,5 +193,8 @@ ignore_tags = "" topo_order = false # sort 
the commits inside sections by oldest/newest order sort_commits = "oldest" + +[tool.uv.sources] +lydata = { path = "../lydata-package", editable = true } # limit the number of commits included in the changelog. # limit_commits = 42 diff --git a/src/lyscripts/compute/prevalences.py b/src/lyscripts/compute/prevalences.py index 8097a42..8771eb9 100644 --- a/src/lyscripts/compute/prevalences.py +++ b/src/lyscripts/compute/prevalences.py @@ -14,6 +14,7 @@ from loguru import logger from lydata import C, Q from lydata.accessor import NoneQ, QueryPortion +from lydata.utils import is_old from lymph import models from pydantic import Field from rich import progress @@ -139,7 +140,8 @@ def observe_prevalence( QueryPortion(match=np.int64(7), total=np.int64(79)) """ mapping = mapping or DataConfig.model_fields["mapping"].default_factory() - data.ly.t_stage = data.ly.t_stage.map(mapping) + key = ("tumor", "1", "t_stage") if is_old(data) else ("tumor", "info", "t_stage") + data[key] = data.ly.t_stage.map(mapping) has_t_stage = C("t_stage").isin(scenario_config.t_stages) if scenario_config.midext is None: diff --git a/src/lyscripts/data/lyproxify.py b/src/lyscripts/data/lyproxify.py index a3bed39..3b14a63 100644 --- a/src/lyscripts/data/lyproxify.py +++ b/src/lyscripts/data/lyproxify.py @@ -226,7 +226,7 @@ def transform_to_lyprox( .. code-block:: python column_map = { - ("patient", "#", "age"): { + ("patient", "info", "age"): { "func": compute_age_from_raw, "kwargs": {"randomize": False}, "columns": ["birthday", "date of diagnosis"] @@ -237,7 +237,7 @@ def transform_to_lyprox( values of the columns ``"birthday"`` and ``"date of diagnosis"`` as positional arguments, and the keyword argument ``"randomize"`` is set to ``False``. The function then returns the patient's age, which is subsequently stored in the column - ``("patient", "#", "age")``. + ``("patient", "info", "age")``. 
Note that the ``column_map`` dictionary must have either a ``"default"`` key or ``"func"`` along with ``"columns"`` and ``"kwargs"``, depending on the function From b5b621f2735eaaf56bf67f3d0b420db6383dfa25 Mon Sep 17 00:00:00 2001 From: Roman Ludwig <48687784+rmnldwg@users.noreply.github.com> Date: Tue, 22 Jul 2025 16:45:21 +0200 Subject: [PATCH 5/5] chore: update changelog --- CHANGELOG.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index abcb7d5..2c78d30 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,29 @@ All notable changes to this project will be documented in this file. +## [1.0.0rc3] - 2025-07-22 + +### Documentation + +- Fix `join` command's example call. + +### Features + +- Add `data fetch` command. Fixes [#75]. + +### Change + +- Access data only via lydata for compatibility.\ + We have slightly changed the second-level headers of lydata for the + patient and tumor info (see + https://github.com/lycosystem/lydata/issues/21 for more info).\ + Since the lydata package was already updated to be compatible with + that change, we simply need to route every data access that + lyscripts makes through the lydata package to be compatible + too. +- Make CLI work with the new lydata format.\ + This is also related to https://github.com/lycosystem/lydata/issues/21 + ## [1.0.0rc2] - 2025-06-26 ### Bug Fixes