lycosystem · rmnldwg · Jul 23, 2025 · Jun 30, 2025 · Jul 17, 2025 · Jul 22, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,29 @@
 
 All notable changes to this project will be documented in this file.
 
+## [1.0.0rc3] - 2025-07-22
+
+### Documentation
+
+- Fix `join` command's example call.
+
+### Features
+
+- Add `data fetch` command. Fixes [#75].
+
+### Change
+
+- Access data only via lydata for compatibility.\
+  We have changed the lydata 2nd level headers slightly for the
+  patient and tumor info (see
+  https://github.com/lycosystem/lydata/issues/21 for more info).\
+  Since the lydata package was already updated to be compatible with
+  that change, we simply need to route every access that lyscripts
+  makes to the data through the lydata package to be compatible
+  too.
+- Make CLI work with new lydata format.\
+  This is also related to https://github.com/lycosystem/lydata/issues/21
+
 ## [1.0.0rc2] - 2025-06-26
 
 ### Bug Fixes

diff --git a/pyproject.toml b/pyproject.toml
@@ -50,7 +50,7 @@ dependencies = [
     "pydantic",
     "pydantic-settings >= 2.7.0, != 2.9.1, != 2.9.0",
     "numpydantic",
-    "lydata >= 0.2.5",
+    "lydata >= 0.3.3",
     "loguru",
 ]
 dynamic = ["version"]
@@ -193,5 +193,8 @@ ignore_tags = ""
 topo_order = false
 # sort the commits inside sections by oldest/newest order
 sort_commits = "oldest"
+
+[tool.uv.sources]
+lydata = { path = "../lydata-package", editable = true }
 # limit the number of commits included in the changelog.
 # limit_commits = 42
diff --git a/src/lyscripts/compute/prevalences.py b/src/lyscripts/compute/prevalences.py
@@ -14,6 +14,7 @@
 from loguru import logger
 from lydata import C, Q
 from lydata.accessor import NoneQ, QueryPortion
+from lydata.utils import is_old
 from lymph import models
 from pydantic import Field
 from rich import progress
@@ -139,7 +140,8 @@ def observe_prevalence(
     QueryPortion(match=np.int64(7), total=np.int64(79))
     """
     mapping = mapping or DataConfig.model_fields["mapping"].default_factory()
-    data["tumor", "1", "t_stage"] = data.ly.t_stage.map(mapping)
+    key = ("tumor", "1", "t_stage") if is_old(data) else ("tumor", "info", "t_stage")
+    data[key] = data.ly.t_stage.map(mapping)
 
     has_t_stage = C("t_stage").isin(scenario_config.t_stages)
     if scenario_config.midext is None:

diff --git a/src/lyscripts/data/__init__.py b/src/lyscripts/data/__init__.py
@@ -18,6 +18,7 @@
 
 from lyscripts.data import (  # noqa: F401
     enhance,
+    fetch,
     generate,
     join,
     lyproxify,
@@ -34,6 +35,7 @@ class DataCLI(BaseSettings):
     lyproxify: CliSubCommand[lyproxify.LyproxifyCLI]
     join: CliSubCommand[join.JoinCLI]
     split: CliSubCommand[split.SplitCLI]
+    fetch: CliSubCommand[fetch.FetchCLI]
     filter: CliSubCommand[filter_.FilterCLI]
     enhance: CliSubCommand[enhance.EnhanceCLI]
     generate: CliSubCommand[generate.GenerateCLI]

diff --git a/src/lyscripts/data/fetch.py b/src/lyscripts/data/fetch.py
@@ -0,0 +1,57 @@
+"""Small command to fetch the data from a remote using the lydata package."""
+
+from pathlib import Path
+
+import lydata  # noqa: F401
+from loguru import logger
+from lydata.loader import LyDataset
+from pydantic import Field
+
+from lyscripts.cli import assemble_main
+from lyscripts.configs import BaseCLI
+
+
+class FetchCLI(LyDataset, BaseCLI):
+    """Fetch a specific dataset from the lyDATA repository.
+
+    The dataset is loaded through the inherited ``get_dataframe`` method (with
+    ``use_github=True``) and written as CSV to ``output_file``.
+    """
+
+    github_token: str | None = Field(
+        default=None,
+        description=(
+            "GitHub token to access private datasets. Can also be provided as "
+            "`GITHUB_TOKEN` environment variable."
+        ),
+    )
+    github_user: str | None = Field(
+        default=None,
+        description=(
+            "GitHub user for non-token login. Can also be provided as "
+            "`GITHUB_USER` environment variable."
+        ),
+    )
+    github_password: str | None = Field(
+        default=None,
+        description=(
+            "GitHub password for non-token login. Can also be provided as "
+            "`GITHUB_PASSWORD` environment variable."
+        ),
+    )
+    output_file: Path = Field(description="The path to save the dataset to.")
+
+    def cli_cmd(self) -> None:
+        """Execute the ``fetch`` command.
+
+        Loads the dataset via GitHub, creates the parent directory of
+        ``output_file`` if it does not exist yet, and saves the table there as
+        CSV without the index column.
+        """
+        logger.enable("lydata")
+        # Never write credentials to the log, not even at debug level.
+        hidden_fields = {"github_token", "github_password"}
+        logger.debug(self.model_dump_json(indent=2, exclude=hidden_fields))
+
+        dataset = self.get_dataframe(
+            use_github=True,
+            token=self.github_token,
+            user=self.github_user,
+            password=self.github_password,
+        )
+        # Do not fail after the download just because the folder is missing.
+        self.output_file.parent.mkdir(parents=True, exist_ok=True)
+        dataset.to_csv(self.output_file, index=False)
+        logger.success(f"Fetched dataset and saved to {self.output_file}")
+
+
+if __name__ == "__main__":
+    # When run as a script, build the entry point for ``FetchCLI`` and call it.
+    main = assemble_main(settings_cls=FetchCLI, prog_name="fetch")
+    main()
diff --git a/src/lyscripts/data/join.py b/src/lyscripts/data/join.py
@@ -29,9 +29,10 @@ def cli_cmd(self) -> None:
 
         .. code-block:: bash
 
-            lydata join \
-            --inputs='["data.source": "file1.csv", "data.source": "file2.csv"]' \
-            --output="joined.csv"
+            lyscripts data join \
+            --inputs '{"source": "file1.csv"}' \
+            --inputs '{"source": "file2.csv"}' \
+            --output-file "joined.csv"
 
         But it also allows for concatenating datasets fetched directly from the
         `lydata Github repo`_. Due to the rather complex command signature, we
@@ -51,7 +52,7 @@ def cli_cmd(self) -> None:
 
         .. code-block:: bash
 
-            lydata join --configs=datasets.ly.yaml --output=joined.csv
+            lyscripts data join --configs datasets.ly.yaml --output-file joined.csv
 
         .. _pydantic: https://docs.pydantic.dev/latest/
         .. _lydata Github repo: https://github.com/rmnldwg/lydata

diff --git a/src/lyscripts/data/lyproxify.py b/src/lyscripts/data/lyproxify.py
@@ -12,8 +12,10 @@
 from pathlib import Path
 from typing import Annotated, Any
 
+import lydata  # noqa: F401
 import pandas as pd
 from loguru import logger
+from lydata import C
 from pydantic import AfterValidator, Field, FilePath
 
 from lyscripts.cli import assemble_main
@@ -111,7 +113,7 @@ def cli_cmd(self) -> None:
         reduced = exclude_patients(trimmed, mapping.EXCLUDE)
         processed = transform_to_lyprox(reduced, mapping.COLUMN_MAP)
 
-        if ("tumor", "1", "side") in processed.columns:
+        if "side" in processed.ly:
             processed = leftright_to_ipsicontra(processed)
 
         save_table_to_csv(file_path=self.output_file, table=processed)
@@ -224,7 +226,7 @@ def transform_to_lyprox(
     .. code-block:: python
 
         column_map = {
-            ("patient", "#", "age"): {
+            ("patient", "info", "age"): {
                 "func": compute_age_from_raw,
                 "kwargs": {"randomize": False},
                 "columns": ["birthday", "date of diagnosis"]
@@ -235,7 +237,7 @@ def transform_to_lyprox(
     values of the columns ``"birthday"`` and ``"date of diagnosis"`` as positional
     arguments, and the keyword argument ``"randomize"`` is set to ``False``. The
     function then returns the patient's age, which is subsequently stored in the column
-    ``("patient", "#", "age")``.
+    ``("patient", "info", "age")``.
 
     Note that the ``column_map`` dictionary must have either a ``"default"`` key or
     ``"func"`` along with ``"columns"`` and ``"kwargs"``, depending on the function
@@ -289,8 +291,8 @@ def leftright_to_ipsicontra(data: pd.DataFrame):
     involvement.
     """
     len_before = len(data)
-    left_data = data.loc[data["tumor", "1", "side"] != "right"]
-    right_data = data.loc[data["tumor", "1", "side"] == "right"]
+    left_data = data.ly.query(C("side") != "right")
+    right_data = data.ly.query(C("side") == "right")
 
     left_data = left_data.rename(columns={"left": "ipsi"}, level=1)
     left_data = left_data.rename(columns={"right": "contra"}, level=1)