Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ Please cite with:
title = {sequifier - causal transformer models for multivariate sequence modelling},
year = {2025},
publisher = {GitHub},
version = {v1.0.0.6},
version = {v1.1.0.0},
url = {https://github.com/0xideas/sequifier}
}
```
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
project = 'sequifier'
copyright = '2025, Leon Luithlen'
author = 'Leon Luithlen'
release = 'v1.0.0.6'
release = 'v1.1.0.0'
html_baseurl = 'https://www.sequifier.com/'

# -- General configuration ---------------------------------------------------
Expand Down
23 changes: 23 additions & 0 deletions documentation/configs/preprocess.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,26 @@ After running `preprocess`, the following are generated:
2. **Metadata Config:** Located in `configs/metadata_configs/[NAME].json`.
* **Crucial:** This file contains the integer mappings for categorical variables (`id_maps`) and normalization stats for real variables (`selected_columns_statistics`).
* **Next Step:** You must link this file path in your `train.yaml` and `infer.yaml` under `metadata_config_path`.


## 5\. Advanced: Custom ID Mapping

By default, Sequifier automatically generates integer IDs for categorical columns starting from index 2 (indices 0 and 1 are reserved for system use, such as "unknown" values).

If you need to enforce specific integer mappings (e.g., to maintain consistency across different training runs or datasets), you can provide **precomputed ID maps**.

1. Create a folder named `id_maps` inside your configs directory: `configs/id_maps/`.
2. Create a JSON file named exactly after the column you want to map (e.g., `my_column_name.json`).
3. The JSON file must contain a key-value dictionary where keys are the raw values and values are the integer IDs.

**Constraints:**
* The smallest integer ID in each map must be exactly **2**.
* IDs **0** and **1** are reserved for system use ("unknown"/"other" values).

**Example `configs/id_maps/category_col.json`:**
```json
{
    "cat": 2,
    "dog": 3,
    "mouse": 4
}
```
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "sequifier"
version = "v1.0.0.6"
version = "v1.1.0.0"
authors = [
{ name = "Leon Luithlen", email = "leontimnaluithlen@gmail.com" },
]
Expand Down
6 changes: 6 additions & 0 deletions src/sequifier/config/train_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,12 @@ class DotDict(dict):
def __deepcopy__(self, memo=None):
return DotDict(copy.deepcopy(dict(self), memo=memo))

def __getstate__(self):
    """Pickle support: expose the instance state as a plain dict."""
    return dict(self)

def __setstate__(self, state):
    """Pickle support: restore state by merging the plain dict back in."""
    self.update(state)


class TrainingSpecModel(BaseModel):
"""Pydantic model for training specifications.
Expand Down
9 changes: 7 additions & 2 deletions src/sequifier/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,9 @@ def construct_index_maps(

Special mappings for the reserved indices 0 and 1 are added:
- If original IDs are strings, 0 maps to "unknown" and 1 maps to "other".
- If original IDs are integers, 0 maps to (minimum original ID) - 2 and
  1 maps to (minimum original ID) - 1.

Args:
id_maps: A nested dictionary mapping column names to their
Expand Down Expand Up @@ -105,10 +107,13 @@ def construct_index_maps(
val = next(iter(map_.values()))
if isinstance(val, str):
map_[0] = "unknown"
map_[1] = "other"
else:
if not isinstance(val, int):
raise TypeError(f"Expected integer ID in map, got {type(val)}")
map_[0] = min(map_.values()) - 1 # type: ignore
min_id = int(min(map_.values()))
map_[0] = min_id - 2 # type: ignore
map_[1] = min_id - 1
index_map[target_column] = map_
return index_map

Expand Down
94 changes: 86 additions & 8 deletions src/sequifier/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,22 @@ def __init__(
col for col in data.columns if col not in ["sequenceId", "itemPosition"]
]
id_maps, selected_columns_statistics = {}, {}

precomputed_id_maps = load_precomputed_id_maps(
self.project_root, data_columns
)

id_maps, selected_columns_statistics = _get_column_statistics(
data, data_columns, id_maps, selected_columns_statistics, 0
data,
data_columns,
id_maps,
selected_columns_statistics,
0,
precomputed_id_maps,
)

id_maps = id_maps | precomputed_id_maps

data, n_classes, col_types = _apply_column_statistics(
data, data_columns, id_maps, selected_columns_statistics
)
Expand Down Expand Up @@ -319,9 +331,14 @@ def _get_column_metadata_across_files(
- data_columns (list[str]): List of all processed data
column names.
"""

n_rows_running_count = 0
id_maps, selected_columns_statistics = {}, {}

col_types, data_columns = None, None

precomputed_id_maps = load_precomputed_id_maps(self.project_root, data_columns)

files_to_process = []
logger.info(f"Data path: {data_path}")
for root, dirs, files in os.walk(data_path):
Expand Down Expand Up @@ -354,6 +371,12 @@ def _get_column_metadata_across_files(
if col_types is None:
data_columns = current_file_cols
col_types = {col: str(data.schema[col]) for col in data_columns}

for col in precomputed_id_maps.keys():
if col not in data_columns:
raise ValueError(
f"Precomputed column {col} not found in {file}"
)
else:
if set(current_file_cols) != set(col_types.keys()):
missing = set(col_types.keys()) - set(current_file_cols)
Expand Down Expand Up @@ -382,12 +405,15 @@ def _get_column_metadata_across_files(
id_maps,
selected_columns_statistics,
n_rows_running_count,
precomputed_id_maps,
)
n_rows_running_count += data.shape[0]

id_maps = id_maps | precomputed_id_maps

if data_columns is None:
raise RuntimeError("data_columns was not initialized correctly.")
n_classes = {col: len(id_maps[col]) + 1 for col in id_maps}
n_classes = {col: max(id_maps[col].values()) + 1 for col in id_maps}

if col_types is None:
raise RuntimeError("col_types was not initialized correctly.")
Expand Down Expand Up @@ -785,14 +811,14 @@ def _apply_column_statistics(
- `col_types`: The (potentially computed) column type dictionary.
"""
if n_classes is None:
n_classes = {col: len(id_maps[col]) + 1 for col in id_maps}
n_classes = {col: max(id_maps[col].values()) + 1 for col in id_maps}

if col_types is None:
col_types = {col: str(data.schema[col]) for col in data_columns}

for col in data_columns:
if col in id_maps:
data = data.with_columns(pl.col(col).replace(id_maps[col]))
data = data.with_columns(pl.col(col).replace(id_maps[col], default=1))
col_types[col] = "Int64"
elif col in selected_columns_statistics:
data = data.with_columns(
Expand All @@ -805,13 +831,55 @@ def _apply_column_statistics(
return (data, n_classes, col_types)


@beartype
def load_precomputed_id_maps(
    project_root: str, data_columns: Optional[list[str]]
) -> dict[str, dict[Union[str, int], int]]:
    """Loads custom ID maps from configs/id_maps if the folder exists.

    Each `*.json` file in `[project_root]/configs/id_maps/` must be named
    after a data column and contain a mapping from raw values to integer IDs.
    IDs 0 and 1 are reserved by the preprocessing pipeline, so the smallest
    ID in every map must be exactly 2.

    Args:
        project_root: The path to the project root directory.
        data_columns: Optional list of columns present in the data to validate
            against the found map files.

    Returns:
        A dictionary mapping column names to their ID maps.

    Raises:
        ValueError: If a map file does not correspond to a data column,
            contains a non-integer ID, is empty, or its minimum ID is not 2.
    """
    custom_maps: dict[str, dict[Union[str, int], int]] = {}
    path = os.path.join(project_root, "configs", "id_maps")

    if not os.path.exists(path):
        return custom_maps

    for file in os.listdir(path):
        if not file.endswith(".json"):
            continue
        col_name = os.path.splitext(file)[0]
        if data_columns is not None and col_name not in data_columns:
            raise ValueError(
                f"{file} does not correspond to any column in the data"
            )

        with open(os.path.join(path, file), "r") as f:
            raw_map = json.load(f)

        # Coerce values to int with a clear, file-specific error instead of
        # letting a bare int() traceback surface to the user.
        m: dict[Union[str, int], int] = {}
        for key, value in raw_map.items():
            try:
                m[key] = int(value)
            except (TypeError, ValueError) as e:
                raise ValueError(
                    f"value {value!r} for key {key!r} in {file} is not an integer"
                ) from e

        if not m:
            raise ValueError(f"map in {file} does not contain any values")
        min_val = min(m.values())
        # Indices 0 and 1 are reserved system IDs; enforce that custom maps
        # begin exactly at 2 so they align with auto-generated maps.
        if min_val != 2:
            raise ValueError(
                f"minimum value in map {file} is {min_val}, must be 2."
            )
        custom_maps[col_name] = m
    return custom_maps


@beartype
def _get_column_statistics(
data: pl.DataFrame,
data_columns: list[str],
id_maps: dict[str, dict[Union[str, int], int]],
selected_columns_statistics: dict[str, dict[str, float]],
n_rows_running_count: int,
precomputed_id_maps: dict[str, dict[Union[str, int], int]],
) -> tuple[
dict[str, dict[Union[str, int], int]],
dict[str, dict[str, float]],
Expand All @@ -837,6 +905,8 @@ def _get_column_statistics(
statistics to be updated.
n_rows_running_count: The total number of rows processed *before*
this chunk, used for weighting statistics.
precomputed_id_maps: A dictionary of pre-loaded ID maps that should
be applied and not re-computed.

Returns:
A tuple `(id_maps, selected_columns_statistics)` containing the
Expand All @@ -863,9 +933,17 @@ def _get_column_statistics(
pl.UInt64,
),
):
new_id_map = create_id_map(data, column=data_col)
id_maps[data_col] = combine_maps(new_id_map, id_maps.get(data_col, {}))
if data_col not in precomputed_id_maps:
new_id_map = create_id_map(data, column=data_col)
id_maps[data_col] = combine_maps(new_id_map, id_maps.get(data_col, {}))
else:
logger.info(f"Applying precomputed map for {data_col}")
elif isinstance(dtype, (pl.Float32, pl.Float64)):
if data_col in precomputed_id_maps:
raise ValueError(
f"Column {data_col} is not categorical, precomputed map is invalid."
)

combined_mean, combined_std = get_combined_statistics(
data.shape[0],
data.get_column(data_col).mean(),
Expand Down Expand Up @@ -1262,7 +1340,7 @@ def create_id_map(data: pl.DataFrame, column: str) -> dict[Union[str, int], int]
ids = sorted(
[int(x) if not isinstance(x, str) else x for x in np.unique(data[column])]
) # type: ignore
id_map = {id_: i + 1 for i, id_ in enumerate(ids)}
id_map = {id_: i + 2 for i, id_ in enumerate(ids)}
return dict(id_map)


Expand Down Expand Up @@ -1330,7 +1408,7 @@ def combine_maps(
A new, combined, and re-indexed ID map.
"""
combined_keys = sorted(list(set(list(map1.keys())).union(list(set(map2.keys())))))
id_map = {id_: i + 1 for i, id_ in enumerate(combined_keys)}
id_map = {id_: i + 2 for i, id_ in enumerate(combined_keys)}
return id_map


Expand Down
12 changes: 6 additions & 6 deletions tests/unit/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,13 +170,13 @@ def test_get_column_statistics_state_accumulation():

# Pass 1
id_maps, stats = _get_column_statistics(
chunk1, ["cat_col", "num_col"], id_maps, stats, running_count
chunk1, ["cat_col", "num_col"], id_maps, stats, running_count, {}
)
running_count += len(chunk1)

# Pass 2
id_maps, stats = _get_column_statistics(
chunk2, ["cat_col", "num_col"], id_maps, stats, running_count
chunk2, ["cat_col", "num_col"], id_maps, stats, running_count, {}
)

# Validations
Expand All @@ -197,7 +197,7 @@ def test_create_id_map():
df = pl.DataFrame({"A": ["z", "x", "y", "x"]})
mapping = create_id_map(df, "A")

# Sorted unique values: x, y, z -> 1, 2, 3
assert mapping["x"] == 1
assert mapping["y"] == 2
assert mapping["z"] == 3
# Sorted unique values: x, y, z -> 2, 3, 4
assert mapping["x"] == 2
assert mapping["y"] == 3
assert mapping["z"] == 4
Loading