15 changes: 15 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,20 @@

All notable changes to this project will be documented in this file.

## [0.2.4] - 2025-01-15

### 📚 Documentation

- Add `__repr__` & explanation to `C`
- Mention private attribute `_column_map`
- Mention `execute` method of `Q` objects
- Fix unfinished sentence in utils

### Changed

- In `__repr__`, add parentheses around combinations of `AndQ` and `OrQ`.
- Switch to [`loguru`](https://loguru.readthedocs.io/en/stable/index.html) for logging

## [0.2.3] - 2024-12-05

### 🚀 Features
@@ -220,6 +234,7 @@ Initial implementation of the lyDATA library.
<!-- generated by git-cliff -->
<!-- markdownlint-disable-file MD024 -->

[0.2.4]: https://github.com/rmnldwg/lydata/compare/0.2.3..0.2.4
[0.2.3]: https://github.com/rmnldwg/lydata/compare/0.2.2..0.2.3
[0.2.2]: https://github.com/rmnldwg/lydata/compare/0.2.1..0.2.2
[0.2.1]: https://github.com/rmnldwg/lydata/compare/0.2.0..0.2.1
6 changes: 2 additions & 4 deletions lydata/__init__.py
@@ -1,6 +1,6 @@
"""Library for handling lymphatic involvement data."""

import logging
from loguru import logger

import lydata._version as _version
from lydata.accessor import C, Q
@@ -26,6 +26,4 @@
"infer_and_combine_levels",
]

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.setLevel(logging.WARNING)
logger.disable("lydata")
68 changes: 56 additions & 12 deletions lydata/accessor.py
@@ -11,20 +11,24 @@
as easy as typing ``df.ly.age``.

Beyond that, the module implements a convenient way to query the
:py:class:`pd.DataFrame`: The :py:class:`Q` object, that was inspired by Django's
:py:class:`~pandas.DataFrame`: The :py:class:`Q` object, which was inspired by Django's
``Q`` object. It allows for more readable and modular queries, which can be combined
with logical operators and reused across different DataFrames.

The :py:class:`Q` objects can be passed to the :py:meth:`LyDataAccessor.query` and
:py:meth:`LyDataAccessor.portion` methods to filter the DataFrame or compute the
:py:class:`QueryPortion` of rows that satisfy the query.
:py:class:`QueryPortion` of rows that satisfy the query. Alternatively, every
:py:class:`Q` object has a method called :py:meth:`~Q.execute` that can be called with
a :py:class:`~pandas.DataFrame` to get a boolean mask of the rows satisfying the query.

Further, we implement methods like :py:meth:`LyDataAccessor.combine`,
:py:meth:`LyDataAccessor.infer_sublevels`, and
:py:meth:`LyDataAccessor.infer_superlevels` to compute additional columns from the
Further, we implement methods like :py:meth:`~LyDataAccessor.combine`,
:py:meth:`~LyDataAccessor.infer_sublevels`, and
:py:meth:`~LyDataAccessor.infer_superlevels` to compute additional columns from the
lyDATA tables. This is sometimes necessary, because not every dataset contains every
column of interest. E.g., some cohorts do have detailed sublevel
information (i.e., IIa and IIb), while in others only the superlevel (II) is reported.
In such a case, one can now simply call ``df.ly.infer_sublevels()`` to get the
additional columns.
"""

from __future__ import annotations
@@ -107,9 +111,33 @@ class Q(CombineQMixin):
.. caution::

The column names are not checked upon instantiation. This is only done when the
query is executed. In fact, the ``Q`` object does not even know about the
:py:class:`~pandas.DataFrame` it will be applied to in the beginning. On the
query is executed. In fact, the :py:class:`Q` object does not even know about
the :py:class:`~pandas.DataFrame` it will be applied to in the beginning. On the
flip side, this means a query may be reused for different DataFrames.

The ``operator`` argument may be one of the following:

- ``'=='``: Checks if ``column`` values are equal to the ``value``.
- ``'<'``: Checks if ``column`` values are less than the ``value``.
- ``'<='``: Checks if ``column`` values are less than or equal to ``value``.
- ``'>'``: Checks if ``column`` values are greater than the ``value``.
- ``'>='``: Checks if ``column`` values are greater than or equal to ``value``.
- ``'!='``: Checks if ``column`` values are not equal to the ``value``. This is
equivalent to ``~Q(column, '==', value)``.
- ``'in'``: Checks if ``column`` values are contained in the list ``value``. For this,
pandas' :py:meth:`~pandas.Series.isin` method is used.
- ``'contains'``: Checks if ``column`` values contain the string ``value``.
Here, pandas' :py:meth:`~pandas.Series.str.contains` method is used.

.. note::

During initialization, a private attribute ``_column_map`` is set to the
default column map returned by :py:func:`~lydata.utils.get_default_column_map`.
This is used to convert short column names to long ones. If one feels
adventurous, they may set this attribute to a custom column map containing
additional or other column short names. This could also be achieved by
subclassing the :py:class:`Q`. However, the attribute may change in the future,
and without notice.
"""

_OPERATOR_MAP: dict[str, Callable[[pd.Series, Any], pd.Series]] = {
@@ -175,7 +203,7 @@ class AndQ(CombineQMixin):
>>> q2 = Q('col2', 'contains', 'ba')
>>> and_q = q1 & q2
>>> print(and_q)
Q('col1', '!=', 3) & Q('col2', 'contains', 'ba')
(Q('col1', '!=', 3) & Q('col2', 'contains', 'ba'))
>>> isinstance(and_q, AndQ)
True
>>> and_q.execute(df)
@@ -194,7 +222,7 @@ def __init__(self, q1: QTypes, q2: QTypes) -> None:

def __repr__(self) -> str:
"""Return a string representation of the query."""
return f"{self.q1!r} & {self.q2!r}"
return f"({self.q1!r} & {self.q2!r})"

def execute(self, df: pd.DataFrame) -> pd.Series:
"""Return a boolean mask where both queries are satisfied."""
@@ -209,7 +237,7 @@ class OrQ(CombineQMixin):
>>> q2 = Q('col1', '==', 3)
>>> or_q = q1 | q2
>>> print(or_q)
Q('col1', '==', 1) | Q('col1', '==', 3)
(Q('col1', '==', 1) | Q('col1', '==', 3))
>>> isinstance(or_q, OrQ)
True
>>> or_q.execute(df)
@@ -228,7 +256,7 @@ def __init__(self, q1: QTypes, q2: QTypes) -> None:

def __repr__(self) -> str:
"""Return a string representation of the query."""
return f"{self.q1!r} | {self.q2!r}"
return f"({self.q1!r} | {self.q2!r})"

def execute(self, df: pd.DataFrame) -> pd.Series:
"""Return a boolean mask where either query is satisfied."""
@@ -250,6 +278,8 @@ class NotQ(CombineQMixin):
1 False
2 True
Name: col1, dtype: bool
>>> print(~(Q('col1', '==', 2) & Q('col1', '!=', 3)))
~(Q('col1', '==', 2) & Q('col1', '!=', 3))
"""

def __init__(self, q: QTypes) -> None:
@@ -284,6 +314,10 @@ def execute(self, df: pd.DataFrame) -> pd.Series:
class C:
"""Wraps a column name and produces a :py:class:`Q` object upon comparison.

This is basically a shorthand for creating a :py:class:`Q` object that avoids
writing the operator and value in quotes. Thus, it may be more readable and allows
IDEs to provide better autocompletion.

.. caution::

Just like for the :py:class:`Q` object, it is not checked upon instantiation
@@ -294,13 +328,23 @@ def __init__(self, *column: str) -> None:
"""Create a column object for comparison.

For querying multi-level columns, both the syntax ``C('col1', 'col2')`` and
``C(('col1', 'col2'))`` are valid.
``C(('col1', 'col2'))`` is valid.

>>> (C('col1', 'col2') == 1) == (C(('col1', 'col2')) == 1)
True
"""
self.column = column[0] if len(column) == 1 else column

def __repr__(self) -> str:
"""Return a string representation of the column object.

>>> repr(C('foo'))
"C('foo')"
>>> repr(C('foo', 'bar'))
"C(('foo', 'bar'))"
"""
return f"C({self.column!r})"

def __eq__(self, value: Any) -> Q:
"""Create a query object for comparing equality.

3 changes: 1 addition & 2 deletions lydata/loader.py
@@ -21,7 +21,6 @@
"""

import fnmatch
import logging
import os
import warnings
from collections.abc import Generator
@@ -32,9 +31,9 @@
import pandas as pd
from github import Auth, Github, Repository
from github.ContentFile import ContentFile
from loguru import logger
from pydantic import BaseModel, Field, PrivateAttr, constr

logger = logging.getLogger(__name__)
_default_repo_name = "rmnldwg/lydata"
low_min1_str = constr(to_lower=True, min_length=1)

2 changes: 1 addition & 1 deletion lydata/utils.py
@@ -13,7 +13,7 @@ def update_and_expand(
right: pd.DataFrame,
**update_kwargs: Any,
) -> pd.DataFrame:
"""Update ``left`` with values from ``right``, also adding .
"""Update ``left`` with values from ``right``, also adding columns from ``right``.

The added feature of this function over pandas' :py:meth:`~pandas.DataFrame.update`
is that it also adds columns that are present in ``right`` but not in ``left``.
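The behavior the corrected docstring describes (update overlapping values, then append right-only columns) can be sketched with plain pandas. This is an illustrative re-implementation, not the function's actual body:

```python
import pandas as pd


def update_and_expand(left: pd.DataFrame, right: pd.DataFrame, **update_kwargs) -> pd.DataFrame:
    """Overwrite overlapping values with `right`, then append right-only columns."""
    result = left.copy()
    # DataFrame.update only touches columns the two frames share ...
    result.update(right, **update_kwargs)
    # ... so columns unique to `right` have to be appended separately.
    new_columns = right.columns.difference(result.columns)
    return pd.concat([result, right[new_columns]], axis=1)


left = pd.DataFrame({"a": [1, 2]})
right = pd.DataFrame({"a": [3, 4], "b": [5, 6]})
merged = update_and_expand(left, right)
```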
4 changes: 1 addition & 3 deletions lydata/validator.py
@@ -16,17 +16,15 @@
.. _LyProX: https://lyprox.org
"""

import logging
from typing import Any

import pandas as pd
from loguru import logger
from pandera import Check, Column, DataFrameSchema
from pandera.errors import SchemaError

from lydata.loader import available_datasets

logger = logging.getLogger(__name__)

_NULLABLE_OPTIONAL = {"required": False, "nullable": True}
_NULLABLE_OPTIONAL_BOOLEAN_COLUMN = Column(
dtype="boolean",
1 change: 1 addition & 0 deletions pyproject.toml
@@ -27,6 +27,7 @@ dependencies = [
"mistletoe",
"pandera",
"pydantic",
"loguru",
]

[project.urls]