Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,29 @@

All notable changes to this project will be documented in this file.

## [1.0.0rc3] - 2025-07-22

### Documentation

- Fix `join` command's example call.

### Features

- Add `data fetch` command. Fixes [#75].

### Change

- Access data only via lydata for compatibility.\
We have changed the lydata 2nd level headers slightly for the
patient and tumor info (see
https://github.com/lycosystem/lydata/issues/21 for more info).\
Since the lydata package was already updated to be compatible with
that change, we simply need to route every access that lyscripts
makes to the data through the lydata package to be compatible
too.
- Make CLI work with new lydata format.\
This is also related to https://github.com/lycosystem/lydata/issues/21

## [1.0.0rc2] - 2025-06-26

### Bug Fixes
Expand Down
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ dependencies = [
"pydantic",
"pydantic-settings >= 2.7.0, != 2.9.1, != 2.9.0",
"numpydantic",
"lydata >= 0.2.5",
"lydata >= 0.3.3",
"loguru",
]
dynamic = ["version"]
Expand Down Expand Up @@ -193,5 +193,8 @@ ignore_tags = ""
topo_order = false
# sort the commits inside sections by oldest/newest order
sort_commits = "oldest"

[tool.uv.sources]
lydata = { path = "../lydata-package", editable = true }
# limit the number of commits included in the changelog.
# limit_commits = 42
4 changes: 3 additions & 1 deletion src/lyscripts/compute/prevalences.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from loguru import logger
from lydata import C, Q
from lydata.accessor import NoneQ, QueryPortion
from lydata.utils import is_old
from lymph import models
from pydantic import Field
from rich import progress
Expand Down Expand Up @@ -139,7 +140,8 @@ def observe_prevalence(
QueryPortion(match=np.int64(7), total=np.int64(79))
"""
mapping = mapping or DataConfig.model_fields["mapping"].default_factory()
data["tumor", "1", "t_stage"] = data.ly.t_stage.map(mapping)
key = ("tumor", "1", "t_stage") if is_old(data) else ("tumor", "info", "t_stage")
data[key] = data.ly.t_stage.map(mapping)

has_t_stage = C("t_stage").isin(scenario_config.t_stages)
if scenario_config.midext is None:
Expand Down
2 changes: 2 additions & 0 deletions src/lyscripts/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

from lyscripts.data import ( # noqa: F401
enhance,
fetch,
generate,
join,
lyproxify,
Expand All @@ -34,6 +35,7 @@ class DataCLI(BaseSettings):
lyproxify: CliSubCommand[lyproxify.LyproxifyCLI]
join: CliSubCommand[join.JoinCLI]
split: CliSubCommand[split.SplitCLI]
fetch: CliSubCommand[fetch.FetchCLI]
filter: CliSubCommand[filter_.FilterCLI]
enhance: CliSubCommand[enhance.EnhanceCLI]
generate: CliSubCommand[generate.GenerateCLI]
Expand Down
57 changes: 57 additions & 0 deletions src/lyscripts/data/fetch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""Small command to fetch the data from a remote using the lydata package."""

from pathlib import Path

import lydata # noqa: F401
from loguru import logger
from lydata.loader import LyDataset
from pydantic import Field

from lyscripts.cli import assemble_main
from lyscripts.configs import BaseCLI


class FetchCLI(LyDataset, BaseCLI):
    """Fetch a specific dataset from the lyDATA repository.

    The dataset to fetch is specified through the fields inherited from
    ``LyDataset``; this class adds optional GitHub credentials and the path the
    downloaded table is written to.
    """

    github_token: str | None = Field(
        default=None,
        description=(
            "GitHub token to access private datasets. Can also be provided as "
            "`GITHUB_TOKEN` environment variable."
        ),
    )
    github_user: str | None = Field(
        default=None,
        description=(
            "GitHub user for non-token login. Can also be provided as "
            "`GITHUB_USER` environment variable."
        ),
    )
    github_password: str | None = Field(
        default=None,
        description=(
            "GitHub password for non-token login. Can also be provided as "
            "`GITHUB_PASSWORD` environment variable."
        ),
    )
    output_file: Path = Field(description="The path to save the dataset to.")

    def cli_cmd(self) -> None:
        """Execute the ``fetch`` command.

        Loads the configured dataset via GitHub and stores it as a CSV file
        (without the index column) at ``output_file``.
        """
        logger.enable("lydata")
        # Never write credentials to the log: dump the settings without the
        # token and the password.
        logger.debug(
            self.model_dump_json(
                indent=2,
                exclude={"github_token", "github_password"},
            )
        )

        dataset = self.get_dataframe(
            use_github=True,
            token=self.github_token,
            user=self.github_user,
            password=self.github_password,
        )

        # Make sure the target folder exists, so that a successful download is
        # not lost to a missing-directory error when writing the file.
        self.output_file.parent.mkdir(parents=True, exist_ok=True)
        dataset.to_csv(self.output_file, index=False)
        logger.success(f"Fetched dataset and saved to {self.output_file}")


if __name__ == "__main__":
    # Assemble the standalone entry point for the ``fetch`` command and run it.
    assemble_main(settings_cls=FetchCLI, prog_name="fetch")()
9 changes: 5 additions & 4 deletions src/lyscripts/data/join.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,10 @@ def cli_cmd(self) -> None:

.. code-block:: bash

lydata join \
--inputs='["data.source": "file1.csv", "data.source": "file2.csv"]' \
--output="joined.csv"
lyscripts data join \
--inputs '{"source": "file1.csv"}' \
--inputs '{"source": "file2.csv"}' \
--output-file "joined.csv"

But it also allows for concatenating datasets fetched directly from the
`lydata Github repo`_. Due to the rather complex command signature, we
Expand All @@ -51,7 +52,7 @@ def cli_cmd(self) -> None:

.. code-block:: bash

lydata join --configs=datasets.ly.yaml --output=joined.csv
lyscripts data join --configs datasets.ly.yaml --output-file joined.csv

.. _pydantic: https://docs.pydantic.dev/latest/
.. _lydata Github repo: https://github.com/rmnldwg/lydata
Expand Down
12 changes: 7 additions & 5 deletions src/lyscripts/data/lyproxify.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@
from pathlib import Path
from typing import Annotated, Any

import lydata # noqa: F401
import pandas as pd
from loguru import logger
from lydata import C
from pydantic import AfterValidator, Field, FilePath

from lyscripts.cli import assemble_main
Expand Down Expand Up @@ -111,7 +113,7 @@ def cli_cmd(self) -> None:
reduced = exclude_patients(trimmed, mapping.EXCLUDE)
processed = transform_to_lyprox(reduced, mapping.COLUMN_MAP)

if ("tumor", "1", "side") in processed.columns:
if "side" in processed.ly:
processed = leftright_to_ipsicontra(processed)

save_table_to_csv(file_path=self.output_file, table=processed)
Expand Down Expand Up @@ -224,7 +226,7 @@ def transform_to_lyprox(
.. code-block:: python

column_map = {
("patient", "#", "age"): {
("patient", "info", "age"): {
"func": compute_age_from_raw,
"kwargs": {"randomize": False},
"columns": ["birthday", "date of diagnosis"]
Expand All @@ -235,7 +237,7 @@ def transform_to_lyprox(
values of the columns ``"birthday"`` and ``"date of diagnosis"`` as positional
arguments, and the keyword argument ``"randomize"`` is set to ``False``. The
function then returns the patient's age, which is subsequently stored in the column
``("patient", "#", "age")``.
``("patient", "info", "age")``.

Note that the ``column_map`` dictionary must have either a ``"default"`` key or
``"func"`` along with ``"columns"`` and ``"kwargs"``, depending on the function
Expand Down Expand Up @@ -289,8 +291,8 @@ def leftright_to_ipsicontra(data: pd.DataFrame):
involvement.
"""
len_before = len(data)
left_data = data.loc[data["tumor", "1", "side"] != "right"]
right_data = data.loc[data["tumor", "1", "side"] == "right"]
left_data = data.ly.query(C("side") != "right")
right_data = data.ly.query(C("side") == "right")

left_data = left_data.rename(columns={"left": "ipsi"}, level=1)
left_data = left_data.rename(columns={"right": "contra"}, level=1)
Expand Down
Loading