diff --git a/CHANGELOG.md b/CHANGELOG.md index abcb7d5..2c78d30 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,29 @@ All notable changes to this project will be documented in this file. +## [1.0.0rc3] - 2025-07-22 + +### Documentation + +- Fix `join` command's example call. + +### Features + +- Add `data fetch` command. Fixes [#75]. + +### Change + +- Access data only via lydata for compatibility.\ + We have changed the lydata 2nd level headers slightly for the + patient and tumor info (see + https://github.com/lycosystem/lydata/issues/21 for more info).\ + Since the lydata package was already updated to be compatible with + that change, we simply need to route every access that lyscripts + makes to the data through the lydata package to be compatible + too. +- Make CLI work with new lydata format.\ + This is also related to https://github.com/lycosystem/lydata/issues/21 + ## [1.0.0rc2] - 2025-06-26 ### Bug Fixes diff --git a/pyproject.toml b/pyproject.toml index 22eb330..f3f409b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,7 @@ dependencies = [ "pydantic", "pydantic-settings >= 2.7.0, != 2.9.1, != 2.9.0", "numpydantic", - "lydata >= 0.2.5", + "lydata >= 0.3.3", "loguru", ] dynamic = ["version"] @@ -193,5 +193,8 @@ ignore_tags = "" topo_order = false # sort the commits inside sections by oldest/newest order sort_commits = "oldest" + +[tool.uv.sources] +lydata = { path = "../lydata-package", editable = true } # limit the number of commits included in the changelog. 
class FetchCLI(LyDataset, BaseCLI):
    """Fetch a specific dataset from the lyDATA repository.

    The dataset is always requested from GitHub (``use_github=True``), optionally
    authenticating with the credentials defined below, and the resulting table is
    written as CSV to ``output_file``.

    NOTE(review): the fields that select *which* dataset is fetched are not
    declared here; presumably they are inherited from ``LyDataset`` -- confirm.
    """

    # Credentials are optional; all three default to ``None``.
    github_token: str | None = Field(
        default=None,
        description=(
            "GitHub token to access private datasets. Can also be provided as "
            "`GITHUB_TOKEN` environment variable."
        ),
    )
    github_user: str | None = Field(
        default=None,
        description=(
            "GitHub user for non-token login. Can also be provided as "
            "`GITHUB_USER` environment variable."
        ),
    )
    github_password: str | None = Field(
        default=None,
        description=(
            "GitHub password for non-token login. Can also be provided as "
            "`GITHUB_PASSWORD` environment variable."
        ),
    )
    # Required: destination of the fetched table.
    output_file: Path = Field(description="The path to save the dataset to.")

    def cli_cmd(self) -> None:
        """Execute the ``fetch`` command.

        Enables the ``lydata`` logger, loads the dataset via ``get_dataframe``
        with GitHub access and the configured credentials, and stores it at
        ``output_file`` as CSV without the index column.
        """
        logger.enable("lydata")

        # Never write credentials to the log: the debug dump of the settings
        # deliberately leaves out the token and the password.
        settings_dump = self.model_dump_json(
            indent=2,
            exclude={"github_token", "github_password"},
        )
        logger.debug(settings_dump)

        dataset = self.get_dataframe(
            use_github=True,
            token=self.github_token,
            user=self.github_user,
            password=self.github_password,
        )
        dataset.to_csv(self.output_file, index=False)
        logger.success(f"Fetched dataset and saved to {self.output_file}")
code-block:: bash - lydata join --configs=datasets.ly.yaml --output=joined.csv + lyscripts data join --configs datasets.ly.yaml --output-file joined.csv .. _pydantic: https://docs.pydantic.dev/latest/ .. _lydata Github repo: https://github.com/rmnldwg/lydata diff --git a/src/lyscripts/data/lyproxify.py b/src/lyscripts/data/lyproxify.py index 02154f8..3b14a63 100644 --- a/src/lyscripts/data/lyproxify.py +++ b/src/lyscripts/data/lyproxify.py @@ -12,8 +12,10 @@ from pathlib import Path from typing import Annotated, Any +import lydata # noqa: F401 import pandas as pd from loguru import logger +from lydata import C from pydantic import AfterValidator, Field, FilePath from lyscripts.cli import assemble_main @@ -111,7 +113,7 @@ def cli_cmd(self) -> None: reduced = exclude_patients(trimmed, mapping.EXCLUDE) processed = transform_to_lyprox(reduced, mapping.COLUMN_MAP) - if ("tumor", "1", "side") in processed.columns: + if "side" in processed.ly: processed = leftright_to_ipsicontra(processed) save_table_to_csv(file_path=self.output_file, table=processed) @@ -224,7 +226,7 @@ def transform_to_lyprox( .. code-block:: python column_map = { - ("patient", "#", "age"): { + ("patient", "info", "age"): { "func": compute_age_from_raw, "kwargs": {"randomize": False}, "columns": ["birthday", "date of diagnosis"] @@ -235,7 +237,7 @@ def transform_to_lyprox( values of the columns ``"birthday"`` and ``"date of diagnosis"`` as positional arguments, and the keyword argument ``"randomize"`` is set to ``False``. The function then returns the patient's age, which is subsequently stored in the column - ``("patient", "#", "age")``. + ``("patient", "info", "age")``. Note that the ``column_map`` dictionary must have either a ``"default"`` key or ``"func"`` along with ``"columns"`` and ``"kwargs"``, depending on the function @@ -289,8 +291,8 @@ def leftright_to_ipsicontra(data: pd.DataFrame): involvement. 
""" len_before = len(data) - left_data = data.loc[data["tumor", "1", "side"] != "right"] - right_data = data.loc[data["tumor", "1", "side"] == "right"] + left_data = data.ly.query(C("side") != "right") + right_data = data.ly.query(C("side") == "right") left_data = left_data.rename(columns={"left": "ipsi"}, level=1) left_data = left_data.rename(columns={"right": "contra"}, level=1)