Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ Please cite with:
title = {sequifier - causal transformer models for multivariate sequence modelling},
year = {2025},
publisher = {GitHub},
version = {v1.0.0.6},
version = {v1.1.0.0},
url = {https://github.com/0xideas/sequifier}
}
```
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
project = 'sequifier'
copyright = '2025, Leon Luithlen'
author = 'Leon Luithlen'
release = 'v1.0.0.6'
release = 'v1.1.0.0'
html_baseurl = 'https://www.sequifier.com/'

# -- General configuration ---------------------------------------------------
Expand Down
23 changes: 23 additions & 0 deletions documentation/configs/preprocess.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,26 @@ After running `preprocess`, the following are generated:
2. **Metadata Config:** Located in `configs/metadata_configs/[NAME].json`.
* **Crucial:** This file contains the integer mappings for categorical variables (`id_maps`) and normalization stats for real variables (`selected_columns_statistics`).
* **Next Step:** You must link this file path in your `train.yaml` and `infer.yaml` under `metadata_config_path`.


## 5\. Advanced: Custom ID Mapping

By default, Sequifier automatically generates integer IDs for categorical columns starting from index 2 (indices 0 and 1 are reserved for system use, such as "unknown" values).

If you need to enforce specific integer mappings (e.g., to maintain consistency across different training runs or datasets), you can provide **precomputed ID maps**.

1. Create a folder named `id_maps` inside your configs directory: `configs/id_maps/`.
2. Create a JSON file named exactly after the column you want to map (e.g., `my_column_name.json`).
3. The JSON file must contain a key-value dictionary where keys are the raw values and values are the integer IDs.

**Constraints:**
* The smallest integer ID in each map must be exactly **2**.
* IDs **0** and **1** are reserved for system use ("unknown"/"other" values).

**Example `configs/id_maps/category_col.json`:**
```json
{
    "cat": 2,
    "dog": 3,
    "mouse": 4
}
```
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "sequifier"
version = "v1.0.0.6"
version = "v1.1.0.0"
authors = [
{ name = "Leon Luithlen", email = "leontimnaluithlen@gmail.com" },
]
Expand Down
6 changes: 6 additions & 0 deletions src/sequifier/config/train_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,12 @@ class DotDict(dict):
def __deepcopy__(self, memo=None):
return DotDict(copy.deepcopy(dict(self), memo=memo))

def __getstate__(self):
    """Pickle support: expose the instance state as a plain dict."""
    return dict(self)

def __setstate__(self, state):
    """Pickle support: restore state by merging the plain dict back in."""
    self.update(state)


class TrainingSpecModel(BaseModel):
"""Pydantic model for training specifications.
Expand Down
9 changes: 7 additions & 2 deletions src/sequifier/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,9 @@ def construct_index_maps(

Special mappings for the reserved indices 0 and 1 are added:
- If original IDs are strings, 0 maps to "unknown" and 1 maps to "other".
- If original IDs are integers, 0 maps to (minimum original ID) - 2 and
  1 maps to (minimum original ID) - 1.

Args:
id_maps: A nested dictionary mapping column names to their
Expand Down Expand Up @@ -105,10 +107,13 @@ def construct_index_maps(
val = next(iter(map_.values()))
if isinstance(val, str):
map_[0] = "unknown"
map_[1] = "other"
else:
if not isinstance(val, int):
raise TypeError(f"Expected integer ID in map, got {type(val)}")
map_[0] = min(map_.values()) - 1 # type: ignore
min_id = int(min(map_.values()))
map_[0] = min_id - 2 # type: ignore
map_[1] = min_id - 1
index_map[target_column] = map_
return index_map

Expand Down
94 changes: 86 additions & 8 deletions src/sequifier/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,22 @@ def __init__(
col for col in data.columns if col not in ["sequenceId", "itemPosition"]
]
id_maps, selected_columns_statistics = {}, {}

precomputed_id_maps = load_precomputed_id_maps(
self.project_root, data_columns
)

id_maps, selected_columns_statistics = _get_column_statistics(
data, data_columns, id_maps, selected_columns_statistics, 0
data,
data_columns,
id_maps,
selected_columns_statistics,
0,
precomputed_id_maps,
)

id_maps = id_maps | precomputed_id_maps

data, n_classes, col_types = _apply_column_statistics(
data, data_columns, id_maps, selected_columns_statistics
)
Expand Down Expand Up @@ -319,9 +331,14 @@ def _get_column_metadata_across_files(
- data_columns (list[str]): List of all processed data
column names.
"""

n_rows_running_count = 0
id_maps, selected_columns_statistics = {}, {}

col_types, data_columns = None, None

precomputed_id_maps = load_precomputed_id_maps(self.project_root, data_columns)

files_to_process = []
logger.info(f"Data path: {data_path}")
for root, dirs, files in os.walk(data_path):
Expand Down Expand Up @@ -354,6 +371,12 @@ def _get_column_metadata_across_files(
if col_types is None:
data_columns = current_file_cols
col_types = {col: str(data.schema[col]) for col in data_columns}

for col in precomputed_id_maps.keys():
if col not in data_columns:
raise ValueError(
f"Precomputed column {col} not found in {file}"
)
else:
if set(current_file_cols) != set(col_types.keys()):
missing = set(col_types.keys()) - set(current_file_cols)
Expand Down Expand Up @@ -382,12 +405,15 @@ def _get_column_metadata_across_files(
id_maps,
selected_columns_statistics,
n_rows_running_count,
precomputed_id_maps,
)
n_rows_running_count += data.shape[0]

id_maps = id_maps | precomputed_id_maps

if data_columns is None:
raise RuntimeError("data_columns was not initialized correctly.")
n_classes = {col: len(id_maps[col]) + 1 for col in id_maps}
n_classes = {col: max(id_maps[col].values()) + 1 for col in id_maps}

if col_types is None:
raise RuntimeError("col_types was not initialized correctly.")
Expand Down Expand Up @@ -785,14 +811,14 @@ def _apply_column_statistics(
- `col_types`: The (potentially computed) column type dictionary.
"""
if n_classes is None:
n_classes = {col: len(id_maps[col]) + 1 for col in id_maps}
n_classes = {col: max(id_maps[col].values()) + 1 for col in id_maps}

if col_types is None:
col_types = {col: str(data.schema[col]) for col in data_columns}

for col in data_columns:
if col in id_maps:
data = data.with_columns(pl.col(col).replace(id_maps[col]))
data = data.with_columns(pl.col(col).replace(id_maps[col], default=1))
col_types[col] = "Int64"
elif col in selected_columns_statistics:
data = data.with_columns(
Expand All @@ -805,13 +831,55 @@ def _apply_column_statistics(
return (data, n_classes, col_types)


@beartype
def load_precomputed_id_maps(
    project_root: str, data_columns: Optional[list[str]]
) -> dict[str, dict[Union[str, int], int]]:
    """Loads custom ID maps from configs/id_maps if the folder exists.

    Each `*.json` file in `[project_root]/configs/id_maps/` must be named
    after a data column and contain a mapping from raw values to integer IDs.
    IDs 0 and 1 are reserved by the preprocessing pipeline, so the smallest
    ID in every map must be exactly 2.

    Args:
        project_root: The path to the project root directory.
        data_columns: Optional list of columns present in the data to validate
            against the found map files.

    Returns:
        A dictionary mapping column names to their ID maps.

    Raises:
        ValueError: If a map file does not correspond to a data column,
            contains a non-integer ID, is empty, or its minimum ID is not 2.
    """
    custom_maps: dict[str, dict[Union[str, int], int]] = {}
    path = os.path.join(project_root, "configs", "id_maps")

    if not os.path.exists(path):
        return custom_maps

    for file in os.listdir(path):
        if not file.endswith(".json"):
            continue
        col_name = os.path.splitext(file)[0]
        if data_columns is not None and col_name not in data_columns:
            raise ValueError(
                f"{file} does not correspond to any column in the data"
            )

        with open(os.path.join(path, file), "r") as f:
            raw_map = json.load(f)

        # Coerce values to int with a clear, file-specific error instead of
        # letting a bare int() traceback surface to the user.
        m: dict[Union[str, int], int] = {}
        for key, value in raw_map.items():
            try:
                m[key] = int(value)
            except (TypeError, ValueError) as e:
                raise ValueError(
                    f"value {value!r} for key {key!r} in {file} is not an integer"
                ) from e

        if not m:
            raise ValueError(f"map in {file} does not contain any values")
        min_val = min(m.values())
        # Indices 0 and 1 are reserved system IDs; enforce that custom maps
        # begin exactly at 2 so they align with auto-generated maps.
        if min_val != 2:
            raise ValueError(
                f"minimum value in map {file} is {min_val}, must be 2."
            )
        custom_maps[col_name] = m
    return custom_maps


@beartype
def _get_column_statistics(
data: pl.DataFrame,
data_columns: list[str],
id_maps: dict[str, dict[Union[str, int], int]],
selected_columns_statistics: dict[str, dict[str, float]],
n_rows_running_count: int,
precomputed_id_maps: dict[str, dict[Union[str, int], int]],
) -> tuple[
dict[str, dict[Union[str, int], int]],
dict[str, dict[str, float]],
Expand All @@ -837,6 +905,8 @@ def _get_column_statistics(
statistics to be updated.
n_rows_running_count: The total number of rows processed *before*
this chunk, used for weighting statistics.
precomputed_id_maps: A dictionary of pre-loaded ID maps that should
be applied and not re-computed.

Returns:
A tuple `(id_maps, selected_columns_statistics)` containing the
Expand All @@ -863,9 +933,17 @@ def _get_column_statistics(
pl.UInt64,
),
):
new_id_map = create_id_map(data, column=data_col)
id_maps[data_col] = combine_maps(new_id_map, id_maps.get(data_col, {}))
if data_col not in precomputed_id_maps:
new_id_map = create_id_map(data, column=data_col)
id_maps[data_col] = combine_maps(new_id_map, id_maps.get(data_col, {}))
else:
logger.info(f"Applying precomputed map for {data_col}")
elif isinstance(dtype, (pl.Float32, pl.Float64)):
if data_col in precomputed_id_maps:
raise ValueError(
f"Column {data_col} is not categorical, precomputed map is invalid."
)

combined_mean, combined_std = get_combined_statistics(
data.shape[0],
data.get_column(data_col).mean(),
Expand Down Expand Up @@ -1262,7 +1340,7 @@ def create_id_map(data: pl.DataFrame, column: str) -> dict[Union[str, int], int]
ids = sorted(
[int(x) if not isinstance(x, str) else x for x in np.unique(data[column])]
) # type: ignore
id_map = {id_: i + 1 for i, id_ in enumerate(ids)}
id_map = {id_: i + 2 for i, id_ in enumerate(ids)}
return dict(id_map)


Expand Down Expand Up @@ -1330,7 +1408,7 @@ def combine_maps(
A new, combined, and re-indexed ID map.
"""
combined_keys = sorted(list(set(list(map1.keys())).union(list(set(map2.keys())))))
id_map = {id_: i + 1 for i, id_ in enumerate(combined_keys)}
id_map = {id_: i + 2 for i, id_ in enumerate(combined_keys)}
return id_map


Expand Down
12 changes: 6 additions & 6 deletions tests/unit/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,13 +170,13 @@ def test_get_column_statistics_state_accumulation():

# Pass 1
id_maps, stats = _get_column_statistics(
chunk1, ["cat_col", "num_col"], id_maps, stats, running_count
chunk1, ["cat_col", "num_col"], id_maps, stats, running_count, {}
)
running_count += len(chunk1)

# Pass 2
id_maps, stats = _get_column_statistics(
chunk2, ["cat_col", "num_col"], id_maps, stats, running_count
chunk2, ["cat_col", "num_col"], id_maps, stats, running_count, {}
)

# Validations
Expand All @@ -197,7 +197,7 @@ def test_create_id_map():
df = pl.DataFrame({"A": ["z", "x", "y", "x"]})
mapping = create_id_map(df, "A")

# Sorted unique values: x, y, z -> 1, 2, 3
assert mapping["x"] == 1
assert mapping["y"] == 2
assert mapping["z"] == 3
# Sorted unique values: x, y, z -> 2, 3, 4
assert mapping["x"] == 2
assert mapping["y"] == 3
assert mapping["z"] == 4
Loading