ML4GLand · d-laub · Oct 20, 2025 · Sep 23, 2025 · Oct 20, 2025 · Oct 20, 2025
diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,7 @@ archive/
 .vscode/
 .ruff_cache/
 .benchmarks/
+scripts/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -10,14 +10,14 @@ name = "seqpro"
 crate-type = ["cdylib", "rlib"]
 
 [dependencies]
-anyhow = "1.0.79"
-derive_builder = "0.13.0"
+anyhow = "1.0.99"
+derive_builder = "0.13.1"
 ndarray = { version = "0.15.6", features = ["rayon"] }
 numpy = "0.20.0"
 rand = { version = "0.8.5", features = ["small_rng"] }
-rayon = "1.8.0"
-thiserror = "1.0.53"
-xxhash-rust = { version = "0.8.8", features = ["xxh3"] }
+rayon = "1.11.0"
+thiserror = "1.0.69"
+xxhash-rust = { version = "0.8.15", features = ["xxh3"] }
 
 [dependencies.pyo3]
 version = "0.20"

diff --git a/python/seqpro/_modifiers.py b/python/seqpro/_modifiers.py
@@ -39,11 +39,11 @@ def reverse_complement(
 def k_shuffle(
     seqs: SeqType,
     k: int,
+    alphabet: NucleotideAlphabet,
     *,
     length_axis: int | None = None,
     ohe_axis: int | None = None,
     seed: int | np.random.Generator | None = None,
-    alphabet: NucleotideAlphabet | None = None,
 ) -> NDArray[Union[np.bytes_, np.uint8]]:
     """Shuffle sequences while preserving k-let frequencies.
 
@@ -52,21 +52,21 @@ def k_shuffle(
     seqs : SeqType
     k : int
         Size of k-lets to preserve frequencies of.
+    alphabet : NucleotideAlphabet
+        Alphabet of the sequences; always required, and used to decode OHE input.
     length_axis : Optional[int], optional
         Needed for array input. Axis that corresponds to the length of sequences.
     ohe_axes : Optional[int], optional
         Needed for OHE input. Axis that corresponds to the one hot encoding, should be
         the same size as the length of the alphabet.
     seed : int, np.random.Generator, optional
         Seed or generator for shuffling.
-    alphabet : Optional[NucleotideAlphabet], optional
-        Alphabet, needed for OHE sequence input.
     """
 
     check_axes(seqs, length_axis, ohe_axis)
 
     if isinstance(seed, np.random.Generator):
-        seed = seed.integers(0, np.iinfo(np.int32).max)
+        seed = seed.integers(0, np.iinfo(np.int32).max)  # type: ignore
 
     seqs = cast_seqs(seqs)
 
@@ -78,16 +78,13 @@ def k_shuffle(
         assert ohe_axis is not None
         seqs = cast(NDArray[np.uint8], seqs)
         ohe = True
-        if alphabet is None:
-            raise ValueError("Need an alphabet to process OHE sequences.")
         seqs = alphabet.decode_ohe(seqs, ohe_axis=ohe_axis)
     else:
         ohe = False
 
     seqs = np.moveaxis(seqs, length_axis, -1)  # length must be final
-    seqs = np.ascontiguousarray(seqs)  # must be contiguous
 
-    shuffled = _k_shuffle(seqs.view("u1"), k, seed).view("S1")
+    shuffled = _k_shuffle(seqs.view("u1"), k, len(alphabet), seed).view("S1")
 
     shuffled = np.moveaxis(shuffled, -1, length_axis)  # put length back where it was
 

diff --git a/python/seqpro/_utils.py b/python/seqpro/_utils.py
@@ -1,9 +1,11 @@
-from typing import List, Optional, TypeVar, Union, cast, overload
+from __future__ import annotations
+
+from typing import Optional, TypeVar, Union, cast, overload
 
 import numpy as np
 from numpy.typing import NDArray
 
-NestedStr = Union[bytes, str, List["NestedStr"]]
+NestedStr = Union[bytes, str, list["NestedStr"]]
 """String or nested list of strings"""
 
 StrSeqType = Union[NestedStr, NDArray[Union[np.str_, np.object_, np.bytes_]]]

diff --git a/python/seqpro/bed.py b/python/seqpro/bed.py
@@ -33,15 +33,13 @@
 def sort(bed: pl.DataFrame):
     """Sort a BED-like DataFrame by chromosome, start, and end position, using the natural
     order of chromosome names e.g. 1, 2, ..., 10, ..."""
-    contigs = bed["chrom"].unique()
-    with pl.StringCache():
-        pl.Series(natsorted(contigs), dtype=pl.Categorical)
-        bed = bed.sort(
-            pl.col("chrom").cast(pl.Categorical),
-            "chromStart",
-            "chromEnd",
-            maintain_order=True,
-        )
+    order = natsorted(bed["chrom"].unique())
+    bed = bed.sort(
+        pl.col("chrom").cast(pl.Enum(order)),
+        "chromStart",
+        "chromEnd",
+        maintain_order=True,
+    )
     return bed
 
 
@@ -100,7 +98,7 @@ def to_pyr(bedlike: pl.DataFrame) -> pr.PyRanges:
                 "strand": "Strand",
             },
             strict=False,
-        ).to_pandas(use_pyarrow_extension_array=True)
+        ).to_pandas()
     )