From feb82f0da6c1daadd06e2cefa3ba113a2f486ad3 Mon Sep 17 00:00:00 2001 From: Carter Green Date: Thu, 11 Dec 2025 11:30:51 -0600 Subject: [PATCH 1/3] DOC: Fix code of conduct email --- CODE_OF_CONDUCT.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 8ddae1db..483134e4 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -60,7 +60,7 @@ representative at an online or offline event. Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at -info@nautechsystems.io. +support@databento.com. All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the From 191f5dc20c34bae73d35a59125d6d2f8d7a4f231 Mon Sep 17 00:00:00 2001 From: Nick Macholl Date: Tue, 9 Dec 2025 15:35:28 -0800 Subject: [PATCH 2/3] OPT: Vectorize symbol mapping --- CHANGELOG.md | 5 + databento/common/dbnstore.py | 21 +-- databento/common/symbology.py | 234 ++++++++++++++++++++++++++------- pyproject.toml | 1 - tests/test_common_symbology.py | 150 ++++++++++++++------- 5 files changed, 304 insertions(+), 107 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 27416e74..8c0ee46d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## 0.69.0 + +#### Enhancements +- Improved the performance of `DBNStore.to_df()` symbol mapping + ## 0.68.0 - 2025-12-09 This release adds support for Python 3.14. diff --git a/databento/common/dbnstore.py b/databento/common/dbnstore.py index 722f2aa7..dc1a09f9 100644 --- a/databento/common/dbnstore.py +++ b/databento/common/dbnstore.py @@ -1287,7 +1287,7 @@ def _transcode( ) -> None: if map_symbols: self._instrument_map.insert_metadata(self.metadata) - symbol_map = self._instrument_map._data + symbol_map = self._instrument_map.build_symbol_map() else: symbol_map = None @@ -1299,7 +1299,7 @@ def _transcode( pretty_ts=pretty_ts, has_metadata=True, map_symbols=map_symbols, - symbol_interval_map=symbol_map, # type: ignore [arg-type] + symbol_interval_map=symbol_map, schema=schema, ) @@ -1508,19 +1508,12 @@ def _format_hidden_fields(self, df: pd.DataFrame) -> None: def _format_map_symbols(self, df: pd.DataFrame) -> None: # the first ordered field will be ts_recv or ts_event when appropriate ts_name = self._struct_type._ordered_fields[0] + dates = df[ts_name] if self._pretty_ts else pd.to_datetime(df[ts_name], utc=True).dt.date - if df.empty: - df["symbol"] = [] - else: - df["symbol"] = df.apply( - lambda r: self._instrument_map.resolve( - r["instrument_id"], - ( - r[ts_name] if self._pretty_ts else pd.to_datetime(r[ts_name], utc=True) - ).date(), - ), - axis=1, - ) + df["symbol"] = self._instrument_map.resolve_many( + df["instrument_id"].to_numpy(), + np.asarray(dates, dtype="datetime64[D]"), + ) def _format_timezone(self, df: pd.DataFrame) -> None: for field in self._struct_type._timestamp_fields: diff --git a/databento/common/symbology.py b/databento/common/symbology.py index 731ead9b..4b357fa4 100644 --- a/databento/common/symbology.py +++ b/databento/common/symbology.py @@ -1,11 +1,9 @@ from __future__ import annotations -import bisect import csv import datetime as dt import functools import json -from collections import defaultdict from collections.abc import Mapping from io import TextIOWrapper from os import PathLike @@ -15,12 +13,14 @@ from typing import NamedTuple from typing import TextIO +import numpy as np import pandas as pd from databento_dbn import UNDEF_TIMESTAMP from databento_dbn import Metadata from databento_dbn import SType from databento_dbn import SymbolMappingMsg from databento_dbn import SymbolMappingMsgV1 +from numpy.typing import NDArray from databento.common.parsing import datetime_to_unix_nanoseconds from databento.common.validation import validate_path @@ -187,20 +187,24 @@ class InstrumentMap: ) def __init__(self) -> None: - self._data: dict[int, list[MappingInterval]] = defaultdict(list) + self._instrument_ids: NDArray[np.uint64] + self._starts: NDArray[np.datetime64[dt.date]] + self._ends: NDArray[np.datetime64[dt.date]] + self._symbols: NDArray[np.datetime64[dt.date]] + + self.clear() # initialize def clear(self) -> None: """ Clear all mapping data. """ - self._data.clear() + self._instrument_ids = np.empty(shape=[0], dtype=np.uint64) + self._starts = np.empty(shape=[0], dtype="datetime64[D]") + self._ends = np.empty(shape=[0], dtype="datetime64[D]") + self._symbols = np.empty(shape=[0], dtype="object") @functools.lru_cache - def resolve( - self, - instrument_id: int, - date: dt.date, - ) -> str | None: + def resolve(self, instrument_id: int, date: dt.date) -> str | None: """ Resolve an instrument ID on a particular date to the mapped symbol, or `None` if there is not mapping on that date. @@ -220,13 +224,106 @@ def resolve( If the InstrumentMap is empty. If the InstrumentMap does not contain a mapping for the `instrument_id`. + See Also + -------- + InstrumentMap.resolve_many() + """ - mappings = self._data[int(instrument_id)] - for entry in mappings: - if entry.start_date <= date < entry.end_date: - return entry.symbol + if self._instrument_ids.size == 0: + return None + + np_date = np.datetime64(date) + + key_ids = self._instrument_ids + key_starts = self._starts + + left = np.searchsorted(key_ids, instrument_id, side="left") + right = np.searchsorted(key_ids, instrument_id, side="right") + + if left == right: + return None + + pos = np.searchsorted(key_starts[left:right], np_date, side="right") - 1 + + if pos < 0: + return None + + pos += left + + if np_date < self._ends[pos]: + return self._symbols[pos] + return None + def resolve_many( + self, + instrument_ids: NDArray[np.uint64], + dates: NDArray[np.datetime64[dt.date]], + ) -> NDArray[np.object_]: + """ + Resolve several instrument ID and date pairs to their mapped symbols. + This method is optimal for vectorizing the symbology resolution + operation. + + Parameters + ---------- + instrument_ids : NDArray[np.uint64] + The collection of instrument IDs. + dates : NDArray[np.datetime64[dt.date]] + The collection of dates for each instrument ID. + + Returns + ------- + NDArray[np.object_] + The collection of resolved symbols. + + See Also + -------- + InstrumentMap.resolve() + + """ + if instrument_ids.shape != dates.shape: + raise ValueError("instrument_ids and dates must have the same shape") + + result = np.full(instrument_ids.shape, None, dtype=object) + if instrument_ids.size == 0 or self._instrument_ids.size == 0: + return result + + # Get unique instrument, date combinations and just resolve those + query_array = np.stack([instrument_ids, dates.view("uint64")], axis=1) + unique_queries, inverse = np.unique(query_array, axis=0, return_inverse=True) + unique_inst = unique_queries[:, 0] + unique_dates = unique_queries[:, 1].view("datetime64[D]") + + resolved_unique = np.full(unique_inst.shape, None, dtype=object) + for inst_id in np.unique(unique_inst): + # In this loop we resolve all dates for each instrument + mask = unique_inst == inst_id + idx = np.nonzero(mask)[0] + + # Slice the mapping arrays + left = np.searchsorted(self._instrument_ids, inst_id, side="left") + right = np.searchsorted(self._instrument_ids, inst_id, side="right") + + if left == right: + continue # no intervals for this instrument + + starts = self._starts[left:right] + ends = self._ends[left:right] + symbols = self._symbols[left:right] + + # Get all the dates to resolve + dates_to_check = unique_dates[idx] + + resolved = np.searchsorted(starts, dates_to_check, side="right") - 1 + valid = (resolved >= 0) & (dates_to_check < ends[resolved]) + resolved_unique[idx[valid]] = symbols[resolved[valid]] + + # Map the resolved symbols back to the result + result[:] = resolved_unique[inverse] + + return result + def insert_metadata(self, metadata: Metadata) -> None: """ Insert mappings from DBN Metadata. @@ -246,6 +343,8 @@ def insert_metadata(self, metadata: Metadata) -> None: # Nothing to do return + instrument_ids: list[int] = [] + intervals: list[MappingInterval] = [] for symbol_in, entries in metadata.mappings.items(): for entry in entries: if not entry["symbol"]: @@ -266,8 +365,8 @@ def insert_metadata(self, metadata: Metadata) -> None: stype_out=metadata.stype_out, ) - self._insert_interval( - instrument_id, + instrument_ids.append(instrument_id) + intervals.append( MappingInterval( start_date=start_date, end_date=end_date, @@ -275,6 +374,8 @@ def insert_metadata(self, metadata: Metadata) -> None: ), ) + self._insert_intervals(instrument_ids=instrument_ids, intervals=intervals) + def insert_symbol_mapping_msg( self, msg: SymbolMappingMsg | SymbolMappingMsgV1, @@ -310,13 +411,15 @@ def insert_symbol_mapping_msg( else: symbol = msg.stype_out_symbol - self._insert_interval( - msg.instrument_id, - MappingInterval( - start_date=start_ts.date(), - end_date=end_ts.date(), - symbol=symbol, - ), + self._insert_intervals( + instrument_ids=[msg.instrument_id], + intervals=[ + MappingInterval( + start_date=start_ts.date(), + end_date=end_ts.date(), + symbol=symbol, + ), + ], ) def insert_json( @@ -360,6 +463,8 @@ def insert_json( stype_in = SType(mapping["stype_in"]) stype_out = SType(mapping["stype_out"]) + instrument_ids: list[int] = [] + intervals: list[MappingInterval] = [] for symbol_in, entries in mapping["result"].items(): for entry in entries: if not all(k in entry for k in self.SYMBOLOGY_RESULT_KEYS): @@ -385,14 +490,10 @@ def insert_json( stype_out=stype_out, ) - self._insert_interval( - instrument_id, - MappingInterval( - start_date=start_date, - end_date=end_date, - symbol=symbol, - ), - ) + instrument_ids.append(instrument_id) + intervals.append(MappingInterval(start_date, end_date, symbol)) + + self._insert_intervals(instrument_ids=instrument_ids, intervals=intervals) def map_symbols_csv( self, @@ -467,7 +568,7 @@ def map_symbols_csv( if instrument_id is None: row["symbol"] = "" else: - row["symbol"] = self.resolve(instrument_id, date) + row["symbol"] = self.resolve(int(instrument_id), date) writer.writerow(row) @@ -531,7 +632,7 @@ def map_symbols_json( ts = datetime_to_unix_nanoseconds(ts_field) date = pd.Timestamp(ts, unit="ns").date() - record["symbol"] = self.resolve(instrument_id, date) + record["symbol"] = self.resolve(int(instrument_id), date) json.dump( record, @@ -542,24 +643,69 @@ def map_symbols_json( return out_file_valid - def _insert_interval(self, instrument_id: int, interval: MappingInterval) -> None: + def build_symbol_map(self) -> dict[int, list[tuple[dt.date, dt.date, str]]]: """ - Insert a SymbolInterval into the map. + Build a simple symbol map. This is to interface with methods of + databento-dbn. - This ensures elements are inserted in order and prevents - duplicate entries. + Returns + ------- + dict[int, list[tuple[dt.date, dt.date, str]]] """ - mappings = self._data[instrument_id] - insert_position = bisect.bisect_left( - self._data[instrument_id], - interval, - ) + symbol_map: dict[int, list[tuple[dt.date, dt.date, str]]] = {} + + if self._instrument_ids.size == 0: + return symbol_map + + unique_ids, start_indices = np.unique(self._instrument_ids, return_index=True) + end_indices = np.append(start_indices[1:], self._instrument_ids.size) + + for inst_id, start, end in zip(unique_ids, start_indices, end_indices): + starts = self._starts[start:end] + ends = self._ends[start:end] + symbols = self._symbols[start:end] - if insert_position < len(mappings) and mappings[insert_position] == interval: - return # this mapping is already present + symbol_map[inst_id] = [ + (_to_date(s), _to_date(e), str(sym)) for s, e, sym in zip(starts, ends, symbols) + ] - mappings.insert(insert_position, interval) + return symbol_map + + def _insert_intervals( + self, + instrument_ids: list[int], + intervals: list[MappingInterval], + ) -> None: + new_ids = np.array(instrument_ids, dtype=np.uint64) + new_starts = np.array([i.start_date for i in intervals], dtype="datetime64[D]") + new_ends = np.array([i.end_date for i in intervals], dtype="datetime64[D]") + new_symbols = np.array([i.symbol for i in intervals], dtype=object) + + n = new_ids.size + if not (n == new_starts.size == new_ends.size == new_symbols.size): + raise ValueError("All input arrays must have the same length") + + if n == 0: + return # nothing to insert + + self._instrument_ids = np.concatenate([self._instrument_ids, new_ids]) + self._starts = np.concatenate([self._starts, new_starts]) + self._ends = np.concatenate([self._ends, new_ends]) + self._symbols = np.concatenate([self._symbols, new_symbols]) + + order = np.lexsort((self._starts, self._instrument_ids)) + self._instrument_ids = self._instrument_ids[order] + self._starts = self._starts[order] + self._ends = self._ends[order] + self._symbols = self._symbols[order] + + +def _to_date(value: np.datetime64[dt.date]) -> dt.date: + py_dt = value.astype("O") # yields datetime.datetime or datetime.date + if isinstance(py_dt, dt.datetime): + return py_dt.date() + return py_dt def _resolve_mapping_tuple( diff --git a/pyproject.toml b/pyproject.toml index 160b313a..2245710d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,7 +74,6 @@ no_strict_optional = true warn_no_return = true warn_unused_configs = true warn_unused_ignores = true -plugins = ["numpy.typing.mypy_plugin"] [tool.pytest.ini_options] testpaths = ["tests"] diff --git a/tests/test_common_symbology.py b/tests/test_common_symbology.py index 49467c54..a14c1743 100644 --- a/tests/test_common_symbology.py +++ b/tests/test_common_symbology.py @@ -8,6 +8,7 @@ from typing import NamedTuple import databento_dbn +import numpy as np import pandas as pd import pytest from databento_dbn import UNDEF_TIMESTAMP @@ -57,7 +58,7 @@ def fixture_instrument_map() -> InstrumentMap: def fixture_start_date() -> pd.Timestamp: """ Fixture for a start date. This is one day behind the date provided by the - `start_date` fixture. + `end_date` fixture. Returns ------- @@ -199,7 +200,10 @@ def test_instrument_map( Test the creation of an InstrumentMap. """ # Arrange, Act, Assert - assert instrument_map._data == {} + assert len(instrument_map._starts) == 0 + assert len(instrument_map._ends) == 0 + assert len(instrument_map._instrument_ids) == 0 + assert len(instrument_map._symbols) == 0 @pytest.mark.parametrize( @@ -323,7 +327,10 @@ def test_instrument_map_insert_metadata_empty_mappings( instrument_map.insert_metadata(metadata) # Assert - assert instrument_map._data == {} + assert len(instrument_map._starts) == 0 + assert len(instrument_map._ends) == 0 + assert len(instrument_map._instrument_ids) == 0 + assert len(instrument_map._symbols) == 0 @pytest.mark.parametrize( @@ -570,7 +577,10 @@ def test_instrument_map_insert_symbology_response_empty_mapping( instrument_map.insert_json(sym_resp) # Assert - assert instrument_map._data == {} + assert len(instrument_map._starts) == 0 + assert len(instrument_map._ends) == 0 + assert len(instrument_map._instrument_ids) == 0 + assert len(instrument_map._symbols) == 0 @pytest.mark.parametrize( @@ -709,7 +719,10 @@ def test_instrument_map_insert_json_str_empty_mapping( instrument_map.insert_json(json.dumps(sym_resp)) # Assert - assert instrument_map._data == {} + assert len(instrument_map._starts) == 0 + assert len(instrument_map._ends) == 0 + assert len(instrument_map._instrument_ids) == 0 + assert len(instrument_map._symbols) == 0 @pytest.mark.parametrize( @@ -793,13 +806,16 @@ def test_instrument_map_resolve_with_date( symbol = "test_1" instrument_id = 1234 - instrument_map._data[instrument_id] = [ - MappingInterval( - start_date=start_date.date(), - end_date=end_date.date(), - symbol=symbol, - ), - ] + instrument_map._insert_intervals( + [instrument_id], + [ + MappingInterval( + start_date=start_date.date(), + end_date=end_date.date(), + symbol=symbol, + ), + ], + ) # Assert assert ( @@ -813,42 +829,6 @@ def test_instrument_map_resolve_with_date( assert instrument_map.resolve(instrument_id, end_date.date()) is None -def test_instrument_map_ignore_duplicate( - instrument_map: InstrumentMap, - start_date: pd.Timestamp, - end_date: pd.Timestamp, -) -> None: - """ - Test that a duplicate entry is not inserted into an InstrumentMap. - """ - # Arrange, Act - symbol = "test_1" - instrument_id = 1234 - - instrument_map._data[instrument_id] = [ - MappingInterval( - start_date=start_date.date(), - end_date=end_date.date(), - symbol=symbol, - ), - ] - - # Act, Assert - assert len(instrument_map._data[instrument_id]) == 1 - - msg = create_symbol_mapping_message( - instrument_id=instrument_id, - stype_in_symbol=symbol, - stype_out_symbol=instrument_id, - start_ts=start_date, - end_ts=end_date, - ) - - instrument_map.insert_symbol_mapping_msg(msg) - - assert len(instrument_map._data[instrument_id]) == 1 - - @pytest.mark.parametrize( "dataset", [ @@ -1000,3 +980,77 @@ def test_insert_symbology_json_mismatched_stypes( # Assert assert store.to_df().iloc[0]["symbol"] == "NVDA" assert store.to_df().iloc[0]["instrument_id"] == 6155 + + +def test_instrument_map_resolve() -> None: + """ + Test a synthetic symbology of symbols, resolving an instrument ID across + many dates where the symbol it points to rotates every day. + """ + # Arrange + instrument_map = InstrumentMap() + start_date = pd.Timestamp("2020-01-01") + end_date = pd.Timestamp("2021-01-01") + + instrument_ids = np.arange(100).astype("uint64") + symbols = instrument_ids.astype(str) + dates = pd.date_range(start_date, end_date, freq="D", inclusive="left").to_numpy( + "datetime64[D]", + ) + + # Act + for offset, date in enumerate(dates): + instrument_map._insert_intervals( + np.roll(instrument_ids, offset).tolist(), + [ + MappingInterval( + start_date=date, + end_date=date + pd.Timedelta(days=1), + symbol=symbol, + ) + for symbol in symbols + ], + ) + + # Resolve instrument ID 1 on every date + resolve_many_result = instrument_map.resolve_many( + np.ones(dtype="uint64", shape=dates.shape), + dates, + ) + + # Assert + resolve_results = [] + for date in dates: + resolve_results.append(instrument_map.resolve(1, date)) + + assert resolve_results == resolve_many_result.tolist() + + +@pytest.mark.parametrize( + "dataset", + Dataset, +) +def test_instrument_map_resolve_definition( + test_data_path: Callable[[Dataset, Schema], pathlib.Path], + dataset: Dataset, +) -> None: + """ + Test that symbology resolved with `InstrumentMap.resolve()` and + `InstrumentMap.resolve_many()` agree and are both correct using stub + Definition data. + """ + # Arrange + store = DBNStore.from_file(test_data_path(dataset, Schema.DEFINITION)) + df = store.to_df(map_symbols=False) + + # Act + instrument_ids = df["instrument_id"] + dates = df.index.date + expected = df["raw_symbol"] + resolve_many_result = store._instrument_map.resolve_many(instrument_ids, dates) + + # Assert + assert (resolve_many_result == expected).all + for i, (instrument_id, date) in enumerate(zip(instrument_ids, dates)): + resolve_result = store._instrument_map.resolve(instrument_id, date) + assert resolve_many_result[i] == resolve_result From 71aaec214a503e284999e55af97c3e14b043373d Mon Sep 17 00:00:00 2001 From: Nick Macholl Date: Tue, 16 Dec 2025 14:27:10 -0800 Subject: [PATCH 3/3] VER: Release 0.68.1 --- CHANGELOG.md | 2 +- databento/version.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c0ee46d..fde69e4e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## 0.69.0 +## 0.68.1 - 2025-12-16 #### Enhancements - Improved the performance of `DBNStore.to_df()` symbol mapping diff --git a/databento/version.py b/databento/version.py index 251b74b8..993101b4 100644 --- a/databento/version.py +++ b/databento/version.py @@ -1 +1 @@ -__version__ = "0.68.0" +__version__ = "0.68.1" diff --git a/pyproject.toml b/pyproject.toml index 2245710d..e06ebe16 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "databento" -version = "0.68.0" +version = "0.68.1" description = "Official Python client library for Databento" readme = "README.md" requires-python = ">=3.10"