From e5a52c0c0ebfe98cdd261d95bf2a9b2a94954e59 Mon Sep 17 00:00:00 2001
From: Saibo-creator
Date: Wed, 9 Apr 2025 22:21:34 +0200
Subject: [PATCH] Fix gemma3 token vocab mismatch by using tokenizer.vocab_size
 when available

---
 transformers_cfg/tokenization/mapping/token2byte.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/transformers_cfg/tokenization/mapping/token2byte.py b/transformers_cfg/tokenization/mapping/token2byte.py
index b7ce48d..ee82e5f 100644
--- a/transformers_cfg/tokenization/mapping/token2byte.py
+++ b/transformers_cfg/tokenization/mapping/token2byte.py
@@ -27,10 +27,14 @@ def __init__(self, tokenizer):
         self.bos_token_id = tokenizer.bos_token_id
         self.tokenizer = tokenizer
         self.special = tokenizer.all_special_ids
-        self._length = len(self.tokenizer.get_vocab())
+        # vocab_size and len(get_vocab()) should be identical in most cases;
+        # the known exception is gemma3, where vocab_size is 262144 but len(get_vocab()) is 262145 because get_vocab() also counts one extra added special token
+        self._vocab_size = getattr(
+            tokenizer, "vocab_size", len(self.tokenizer.get_vocab())
+        )
 
     def __len__(self):
-        return self._length
+        return self._vocab_size
 
     @abstractmethod
     def map(self, token_id: int, verbose=False) -> bytes:
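
A minimal sketch of the mismatch this patch works around. It assumes the transformers library is installed; the "google/gemma-3-4b-it" checkpoint name is only an illustrative assumption (any Gemma 3 tokenizer should do), and the counts are the ones quoted in the patch comment:

# Sketch: why tokenizer.vocab_size and len(tokenizer.get_vocab()) can disagree
# for Gemma 3, and what the patched fallback returns. Checkpoint name is an
# example, not prescribed by the patch.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-4b-it")

# Base vocabulary size reported by the tokenizer (262144 per the patch comment).
print(tokenizer.vocab_size)

# get_vocab() also counts added special tokens, so it can be one larger
# (262145 per the patch comment).
print(len(tokenizer.get_vocab()))

# The patched mapping prefers vocab_size and only falls back to get_vocab()
# when the tokenizer has no vocab_size attribute.
print(getattr(tokenizer, "vocab_size", len(tokenizer.get_vocab())))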