From 704aec5b05c2cd59dc917413fdf2982b072657c9 Mon Sep 17 00:00:00 2001 From: Hokuto Munakata Date: Mon, 11 Aug 2025 12:43:21 +0900 Subject: [PATCH 1/2] Remove torch text --- .github/workflows/mypy_ruff.yml | 2 +- .github/workflows/pytest.yml | 2 +- README.md | 4 +- lighthouse/common/vocab.py | 586 ++++++++++++++++++ lighthouse/common/vocab/__init__.py | 13 + lighthouse/common/vocab/vectors.py | 301 +++++++++ lighthouse/common/vocab/vocab.py | 171 +++++ .../feature_extractor/text_encoders/glove.py | 4 +- mypy.ini | 3 - training/cg_detr_dataset.py | 2 +- training/dataset.py | 2 +- 11 files changed, 1079 insertions(+), 11 deletions(-) create mode 100644 lighthouse/common/vocab.py create mode 100644 lighthouse/common/vocab/__init__.py create mode 100644 lighthouse/common/vocab/vectors.py create mode 100644 lighthouse/common/vocab/vocab.py diff --git a/.github/workflows/mypy_ruff.yml b/.github/workflows/mypy_ruff.yml index 629908b..f7e423f 100644 --- a/.github/workflows/mypy_ruff.yml +++ b/.github/workflows/mypy_ruff.yml @@ -22,7 +22,7 @@ jobs: - name: Run dependency libraries run: | pip install -e . - pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 torchtext==0.16.0 + pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 pip install mypy ruff - name: Run mypy diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 61c107f..476084b 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -28,7 +28,7 @@ jobs: - name: Run dependency libraries run: | pip install -e . - pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 torchtext==0.16.0 + pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 pip install pytest - name: Run pytest run: pytest tests/test_models.py diff --git a/README.md b/README.md index 949ff7a..19d568a 100755 --- a/README.md +++ b/README.md @@ -24,10 +24,10 @@ Install ffmpeg first. If you are an Ubuntu user, run: ``` apt install ffmpeg ``` -Then, install pytorch, torchvision, torchaudio, and torchtext based on your GPU environments. +Then, install pytorch, torchvision, and torchaudio based on your GPU environments. Note that the inference API is available for CPU environments. We tested the codes on Python 3.9 and CUDA 11.8: ``` -pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 torchtext==0.16.0 --index-url https://download.pytorch.org/whl/cu118 +pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118 ``` Finally, run to install dependency libraries: ``` diff --git a/lighthouse/common/vocab.py b/lighthouse/common/vocab.py new file mode 100644 index 0000000..bc3157b --- /dev/null +++ b/lighthouse/common/vocab.py @@ -0,0 +1,586 @@ +import gzip +import logging +import os +import tarfile +import zipfile +from collections import Counter, defaultdict +from functools import partial +from urllib.request import urlretrieve + +import torch +from tqdm import tqdm + +from .utils import reporthook + +logger = logging.getLogger(__name__) + + +class Vocab(object): + """Defines a vocabulary object that will be used to numericalize a field. + + Attributes: + freqs: A collections.Counter object holding the frequencies of tokens + in the data used to build the Vocab. + stoi: A collections.defaultdict instance mapping token strings to + numerical identifiers. + itos: A list of token strings indexed by their numerical identifiers. 
+ """ + + # TODO (@mttk): Populate classs with default values of special symbols + UNK = "" + + def __init__( + self, + counter, + max_size=None, + min_freq=1, + specials=("", ""), + vectors=None, + unk_init=None, + vectors_cache=None, + specials_first=True, + ): + """Create a Vocab object from a collections.Counter. + + Arguments: + counter: collections.Counter object holding the frequencies of + each value found in the data. + max_size: The maximum size of the vocabulary, or None for no + maximum. Default: None. + min_freq: The minimum frequency needed to include a token in the + vocabulary. Values less than 1 will be set to 1. Default: 1. + specials: The list of special tokens (e.g., padding or eos) that + will be prepended to the vocabulary. Default: [', ''] + vectors: One of either the available pretrained vectors + or custom pretrained vectors (see Vocab.load_vectors); + or a list of aforementioned vectors + unk_init (callback): by default, initialize out-of-vocabulary word vectors + to zero vectors; can be any function that takes in a Tensor and + returns a Tensor of the same size. Default: 'torch.zeros' + vectors_cache: directory for cached vectors. Default: '.vector_cache' + specials_first: Whether to add special tokens into the vocabulary at first. + If it is False, they are added into the vocabulary at last. + Default: True. + """ + self.freqs = counter + counter = counter.copy() + min_freq = max(min_freq, 1) + + self.itos = list() + self.unk_index = None + if specials_first: + self.itos = list(specials) + # only extend max size if specials are prepended + max_size = None if max_size is None else max_size + len(specials) + + # frequencies of special tokens are not counted when building vocabulary + # in frequency order + for tok in specials: + del counter[tok] + + # sort by frequency, then alphabetically + words_and_frequencies = sorted(counter.items(), key=lambda tup: tup[0]) + words_and_frequencies.sort(key=lambda tup: tup[1], reverse=True) + + for word, freq in words_and_frequencies: + if freq < min_freq or len(self.itos) == max_size: + break + self.itos.append(word) + + if Vocab.UNK in specials: # hard-coded for now + unk_index = specials.index(Vocab.UNK) # position in list + # account for ordering of specials, set variable + self.unk_index = unk_index if specials_first else len(self.itos) + unk_index + self.stoi = defaultdict(self._default_unk_index) + else: + self.stoi = defaultdict() + + if not specials_first: + self.itos.extend(list(specials)) + + # stoi is simply a reverse dict for itos + self.stoi.update({tok: i for i, tok in enumerate(self.itos)}) + + self.vectors = None + if vectors is not None: + self.load_vectors(vectors, unk_init=unk_init, cache=vectors_cache) + else: + assert unk_init is None and vectors_cache is None + + def _default_unk_index(self): + return self.unk_index + + def __getitem__(self, token): + return self.stoi.get(token, self.stoi.get(Vocab.UNK)) + + def __getstate__(self): + # avoid picking defaultdict + attrs = dict(self.__dict__) + # cast to regular dict + attrs["stoi"] = dict(self.stoi) + return attrs + + def __setstate__(self, state): + if state.get("unk_index", None) is None: + stoi = defaultdict() + else: + stoi = defaultdict(self._default_unk_index) + stoi.update(state["stoi"]) + state["stoi"] = stoi + self.__dict__.update(state) + + def __eq__(self, other): + if self.freqs != other.freqs: + return False + if self.stoi != other.stoi: + return False + if self.itos != other.itos: + return False + if self.vectors != other.vectors: + return 
False + return True + + def __len__(self): + return len(self.itos) + + def lookup_indices(self, tokens): + indices = [self.__getitem__(token) for token in tokens] + return indices + + def extend(self, v, sort=False): + words = sorted(v.itos) if sort else v.itos + for w in words: + if w not in self.stoi: + self.itos.append(w) + self.stoi[w] = len(self.itos) - 1 + + def load_vectors(self, vectors, **kwargs): + """ + Arguments: + vectors: one of or a list containing instantiations of the + GloVe, CharNGram, or Vectors classes. Alternatively, one + of or a list of available pretrained vectors: + + charngram.100d + fasttext.en.300d + fasttext.simple.300d + glove.42B.300d + glove.840B.300d + glove.twitter.27B.25d + glove.twitter.27B.50d + glove.twitter.27B.100d + glove.twitter.27B.200d + glove.6B.50d + glove.6B.100d + glove.6B.200d + glove.6B.300d + + Remaining keyword arguments: Passed to the constructor of Vectors classes. + """ + if not isinstance(vectors, list): + vectors = [vectors] + for idx, vector in enumerate(vectors): + if isinstance(vector, str): + # Convert the string pretrained vector identifier + # to a Vectors object + if vector not in pretrained_aliases: + raise ValueError( + "Got string input vector {}, but allowed pretrained " + "vectors are {}".format(vector, list(pretrained_aliases.keys())) + ) + vectors[idx] = pretrained_aliases[vector](**kwargs) + elif not isinstance(vector, Vectors): + raise ValueError( + "Got input vectors of type {}, expected str or " + "Vectors object".format(type(vector)) + ) + + tot_dim = sum(v.dim for v in vectors) + self.vectors = torch.Tensor(len(self), tot_dim) + for i, token in enumerate(self.itos): + start_dim = 0 + for v in vectors: + end_dim = start_dim + v.dim + self.vectors[i][start_dim:end_dim] = v[token.strip()] + start_dim = end_dim + assert start_dim == tot_dim + + def set_vectors(self, stoi, vectors, dim, unk_init=torch.Tensor.zero_): + """ + Set the vectors for the Vocab instance from a collection of Tensors. + + Arguments: + stoi: A dictionary of string to the index of the associated vector + in the `vectors` input argument. + vectors: An indexed iterable (or other structure supporting __getitem__) that + given an input index, returns a FloatTensor representing the vector + for the token associated with the index. For example, + vector[stoi["string"]] should return the vector for "string". + dim: The dimensionality of the vectors. + unk_init (callback): by default, initialize out-of-vocabulary word vectors + to zero vectors; can be any function that takes in a Tensor and + returns a Tensor of the same size. Default: 'torch.zeros' + """ + self.vectors = torch.Tensor(len(self), dim) + for i, token in enumerate(self.itos): + wv_index = stoi.get(token, None) + if wv_index is not None: + self.vectors[i] = vectors[wv_index] + else: + self.vectors[i] = unk_init(self.vectors[i]) + + +class SubwordVocab(Vocab): + def __init__( + self, + counter, + max_size=None, + specials=(""), + vectors=None, + unk_init=torch.Tensor.zero_, + ): + """Create a revtok subword vocabulary from a collections.Counter. + + Arguments: + counter: collections.Counter object holding the frequencies of + each word found in the data. + max_size: The maximum size of the subword vocabulary, or None for no + maximum. Default: None. + specials: The list of special tokens (e.g., padding or eos) that + will be prepended to the vocabulary in addition to an + token. 
+ vectors: One of either the available pretrained vectors + or custom pretrained vectors (see Vocab.load_vectors); + or a list of aforementioned vectors + unk_init (callback): by default, initialize out-of-vocabulary word vectors + to zero vectors; can be any function that takes in a Tensor and + returns a Tensor of the same size. Default: 'torch.zeros + """ + try: + import revtok + except ImportError: + print("Please install revtok.") + raise + + # Hardcode unk_index as subword_vocab has no specials_first argument + self.unk_index = ( + specials.index(SubwordVocab.UNK) if SubwordVocab.UNK in specials else None + ) + + if self.unk_index is None: + self.stoi = defaultdict() + else: + self.stoi = defaultdict(self._default_unk_index) + + self.stoi.update({tok: i for i, tok in enumerate(specials)}) + self.itos = specials.copy() + + self.segment = revtok.SubwordSegmenter(counter, max_size) + + max_size = None if max_size is None else max_size + len(self.itos) + + # sort by frequency/entropy, then alphabetically + toks = sorted( + self.segment.vocab.items(), + key=lambda tup: (len(tup[0]) != 1, -tup[1], tup[0]), + ) + + for tok, _ in toks: + if len(self.itos) == max_size: + break + self.itos.append(tok) + self.stoi[tok] = len(self.itos) - 1 + + if vectors is not None: + self.load_vectors(vectors, unk_init=unk_init) + + +def _infer_shape(f): + num_lines, vector_dim = 0, None + for line in f: + if vector_dim is None: + row = line.rstrip().split(b" ") + vector = row[1:] + # Assuming word, [vector] format + if len(vector) > 2: + # The header present in some (w2v) formats contains two elements. + vector_dim = len(vector) + num_lines += 1 # First element read + else: + num_lines += 1 + f.seek(0) + return num_lines, vector_dim + + +class Vectors(object): + def __init__(self, name, cache=None, url=None, unk_init=None, max_vectors=None): + """ + Arguments: + + name: name of the file that contains the vectors + cache: directory for cached vectors + url: url for download if vectors not found in cache + unk_init (callback): by default, initialize out-of-vocabulary word vectors + to zero vectors; can be any function that takes in a Tensor and returns a Tensor of the same size + max_vectors (int): this can be used to limit the number of + pre-trained vectors loaded. + Most pre-trained vector sets are sorted + in the descending order of word frequency. + Thus, in situations where the entire set doesn't fit in memory, + or is not needed for another reason, passing `max_vectors` + can limit the size of the loaded set. 
+ """ + + cache = ".vector_cache" if cache is None else cache + self.itos = None + self.stoi = None + self.vectors = None + self.dim = None + self.unk_init = torch.Tensor.zero_ if unk_init is None else unk_init + self.cache(name, cache, url=url, max_vectors=max_vectors) + + def __getitem__(self, token): + if token in self.stoi: + return self.vectors[self.stoi[token]] + else: + return self.unk_init(torch.Tensor(self.dim)) + + def cache(self, name, cache, url=None, max_vectors=None): + import ssl + + ssl._create_default_https_context = ssl._create_unverified_context + if os.path.isfile(name): + path = name + if max_vectors: + file_suffix = "_{}.pt".format(max_vectors) + else: + file_suffix = ".pt" + path_pt = os.path.join(cache, os.path.basename(name)) + file_suffix + else: + path = os.path.join(cache, name) + if max_vectors: + file_suffix = "_{}.pt".format(max_vectors) + else: + file_suffix = ".pt" + path_pt = path + file_suffix + + if not os.path.isfile(path_pt): + if not os.path.isfile(path) and url: + logger.info("Downloading vectors from {}".format(url)) + if not os.path.exists(cache): + os.makedirs(cache) + dest = os.path.join(cache, os.path.basename(url)) + if not os.path.isfile(dest): + with tqdm(unit="B", unit_scale=True, miniters=1, desc=dest) as t: + try: + urlretrieve(url, dest, reporthook=reporthook(t)) + except KeyboardInterrupt as e: # remove the partial zip file + os.remove(dest) + raise e + logger.info("Extracting vectors into {}".format(cache)) + ext = os.path.splitext(dest)[1][1:] + if ext == "zip": + with zipfile.ZipFile(dest, "r") as zf: + zf.extractall(cache) + elif ext == "gz": + if dest.endswith(".tar.gz"): + with tarfile.open(dest, "r:gz") as tar: + tar.extractall(path=cache) + if not os.path.isfile(path): + raise RuntimeError("no vectors found at {}".format(path)) + + logger.info("Loading vectors from {}".format(path)) + ext = os.path.splitext(path)[1][1:] + if ext == "gz": + open_file = gzip.open + else: + open_file = open + + vectors_loaded = 0 + with open_file(path, "rb") as f: + num_lines, dim = _infer_shape(f) + if not max_vectors or max_vectors > num_lines: + max_vectors = num_lines + + itos, vectors, dim = [], torch.zeros((max_vectors, dim)), None + + for line in tqdm(f, total=max_vectors): + # Explicitly splitting on " " is important, so we don't + # get rid of Unicode non-breaking spaces in the vectors. + entries = line.rstrip().split(b" ") + + word, entries = entries[0], entries[1:] + if dim is None and len(entries) > 1: + dim = len(entries) + elif len(entries) == 1: + logger.warning( + "Skipping token {} with 1-dimensional " + "vector {}; likely a header".format(word, entries) + ) + continue + elif dim != len(entries): + raise RuntimeError( + "Vector for token {} has {} dimensions, but previously " + "read vectors have {} dimensions. 
All vectors must have " + "the same number of dimensions.".format( + word, len(entries), dim + ) + ) + + try: + if isinstance(word, bytes): + word = word.decode("utf-8") + except UnicodeDecodeError: + logger.info("Skipping non-UTF8 token {}".format(repr(word))) + continue + + vectors[vectors_loaded] = torch.tensor([float(x) for x in entries]) + vectors_loaded += 1 + itos.append(word) + + if vectors_loaded == max_vectors: + break + + self.itos = itos + self.stoi = {word: i for i, word in enumerate(itos)} + self.vectors = torch.Tensor(vectors).view(-1, dim) + self.dim = dim + logger.info("Saving vectors to {}".format(path_pt)) + if not os.path.exists(cache): + os.makedirs(cache) + torch.save((self.itos, self.stoi, self.vectors, self.dim), path_pt) + else: + logger.info("Loading vectors from {}".format(path_pt)) + self.itos, self.stoi, self.vectors, self.dim = torch.load(path_pt) + + def __len__(self): + return len(self.vectors) + + def get_vecs_by_tokens(self, tokens, lower_case_backup=False): + """Look up embedding vectors of tokens. + + Arguments: + tokens: a token or a list of tokens. if `tokens` is a string, + returns a 1-D tensor of shape `self.dim`; if `tokens` is a + list of strings, returns a 2-D tensor of shape=(len(tokens), + self.dim). + lower_case_backup : Whether to look up the token in the lower case. + If False, each token in the original case will be looked up; + if True, each token in the original case will be looked up first, + if not found in the keys of the property `stoi`, the token in the + lower case will be looked up. Default: False. + + Examples: + >>> examples = ['chip', 'baby', 'Beautiful'] + >>> vec = text.vocab.GloVe(name='6B', dim=50) + >>> ret = vec.get_vecs_by_tokens(tokens, lower_case_backup=True) + """ + to_reduce = False + + if not isinstance(tokens, list): + tokens = [tokens] + to_reduce = True + + if not lower_case_backup: + indices = [self[token] for token in tokens] + else: + indices = [ + self[token] if token in self.stoi else self[token.lower()] + for token in tokens + ] + + vecs = torch.stack(indices) + return vecs[0] if to_reduce else vecs + + +class GloVe(Vectors): + url = { + "42B": "http://nlp.stanford.edu/data/glove.42B.300d.zip", + "840B": "http://nlp.stanford.edu/data/glove.840B.300d.zip", + "twitter.27B": "http://nlp.stanford.edu/data/glove.twitter.27B.zip", + "6B": "http://nlp.stanford.edu/data/glove.6B.zip", + } + + def __init__(self, name="840B", dim=300, **kwargs): + url = self.url[name] + name = "glove.{}.{}d.txt".format(name, str(dim)) + super(GloVe, self).__init__(name, url=url, **kwargs) + + +class FastText(Vectors): + url_base = "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.{}.vec" + + def __init__(self, language="en", **kwargs): + url = self.url_base.format(language) + name = os.path.basename(url) + super(FastText, self).__init__(name, url=url, **kwargs) + + +class CharNGram(Vectors): + name = "charNgram.txt" + url = ( + "http://www.logos.t.u-tokyo.ac.jp/~hassy/publications/arxiv2016jmt/" + "jmt_pre-trained_embeddings.tar.gz" + ) + + def __init__(self, **kwargs): + super(CharNGram, self).__init__(self.name, url=self.url, **kwargs) + + def __getitem__(self, token): + vector = torch.Tensor(1, self.dim).zero_() + if token == "": + return self.unk_init(vector) + chars = ["#BEGIN#"] + list(token) + ["#END#"] + num_vectors = 0 + for n in [2, 3, 4]: + end = len(chars) - n + 1 + grams = [chars[i : (i + n)] for i in range(end)] + for gram in grams: + gram_key = "{}gram-{}".format(n, "".join(gram)) + if gram_key in self.stoi: + 
vector += self.vectors[self.stoi[gram_key]] + num_vectors += 1 + if num_vectors > 0: + vector /= num_vectors + else: + vector = self.unk_init(vector) + return vector + + +pretrained_aliases = { + "charngram.100d": partial(CharNGram), + "fasttext.en.300d": partial(FastText, language="en"), + "fasttext.simple.300d": partial(FastText, language="simple"), + "glove.42B.300d": partial(GloVe, name="42B", dim="300"), + "glove.840B.300d": partial(GloVe, name="840B", dim="300"), + "glove.twitter.27B.25d": partial(GloVe, name="twitter.27B", dim="25"), + "glove.twitter.27B.50d": partial(GloVe, name="twitter.27B", dim="50"), + "glove.twitter.27B.100d": partial(GloVe, name="twitter.27B", dim="100"), + "glove.twitter.27B.200d": partial(GloVe, name="twitter.27B", dim="200"), + "glove.6B.50d": partial(GloVe, name="6B", dim="50"), + "glove.6B.100d": partial(GloVe, name="6B", dim="100"), + "glove.6B.200d": partial(GloVe, name="6B", dim="200"), + "glove.6B.300d": partial(GloVe, name="6B", dim="300"), +} +"""Mapping from string name to factory function""" + + +def build_vocab_from_iterator(iterator, num_lines=None): + """ + Build a Vocab from an iterator. + + Arguments: + iterator: Iterator used to build Vocab. Must yield list or iterator of tokens. + num_lines: The expected number of elements returned by the iterator. + (Default: None) + Optionally, if known, the expected number of elements can be passed to + this factory function for improved progress reporting. + """ + + counter = Counter() + with tqdm(unit_scale=0, unit="lines", total=num_lines) as t: + for tokens in iterator: + counter.update(tokens) + t.update(1) + word_vocab = Vocab(counter) + return word_vocab diff --git a/lighthouse/common/vocab/__init__.py b/lighthouse/common/vocab/__init__.py new file mode 100644 index 0000000..c20fd45 --- /dev/null +++ b/lighthouse/common/vocab/__init__.py @@ -0,0 +1,13 @@ +import warnings + +from .vectors import CharNGram, FastText, GloVe, pretrained_aliases, Vectors +from .vocab import Vocab + +__all__ = [ + "Vocab", + "GloVe", + "FastText", + "CharNGram", + "pretrained_aliases", + "Vectors", +] diff --git a/lighthouse/common/vocab/vectors.py b/lighthouse/common/vocab/vectors.py new file mode 100644 index 0000000..f1eacca --- /dev/null +++ b/lighthouse/common/vocab/vectors.py @@ -0,0 +1,301 @@ +import gzip +import logging +import os +import tarfile +import zipfile +from functools import partial +from urllib.request import urlretrieve + +import torch +from tqdm import tqdm + +logger = logging.getLogger(__name__) + + +def _infer_shape(f): + num_lines, vector_dim = 0, None + for line in f: + if vector_dim is None: + row = line.rstrip().split(b" ") + vector = row[1:] + # Assuming word, [vector] format + if len(vector) > 2: + # The header present in some (w2v) formats contains two elements. + vector_dim = len(vector) + num_lines += 1 # First element read + else: + num_lines += 1 + f.seek(0) + return num_lines, vector_dim + + +class Vectors: + def __init__(self, name, cache=None, url=None, unk_init=None, max_vectors=None) -> None: + """ + Args: + + name: name of the file that contains the vectors + cache: directory for cached vectors + url: url for download if vectors not found in cache + unk_init (callback): by default, initialize out-of-vocabulary word vectors + to zero vectors; can be any function that takes in a Tensor and returns a Tensor of the same size + max_vectors (int): this can be used to limit the number of + pre-trained vectors loaded. 
+ Most pre-trained vector sets are sorted + in the descending order of word frequency. + Thus, in situations where the entire set doesn't fit in memory, + or is not needed for another reason, passing `max_vectors` + can limit the size of the loaded set. + """ + + cache = ".vector_cache" if cache is None else cache + self.itos = None + self.stoi = None + self.vectors = None + self.dim = None + self.unk_init = torch.Tensor.zero_ if unk_init is None else unk_init + self.cache(name, cache, url=url, max_vectors=max_vectors) + + def __getitem__(self, token): + if token in self.stoi: + return self.vectors[self.stoi[token]] + else: + return self.unk_init(torch.Tensor(self.dim)) + + def __contains__(self, token): + return token in self.stoi + + def cache(self, name, cache, url=None, max_vectors=None): + import ssl + + ssl._create_default_https_context = ssl._create_unverified_context + if os.path.isfile(name): + path = name + if max_vectors: + file_suffix = "_{}.pt".format(max_vectors) + else: + file_suffix = ".pt" + path_pt = os.path.join(cache, os.path.basename(name)) + file_suffix + else: + path = os.path.join(cache, name) + if max_vectors: + file_suffix = "_{}.pt".format(max_vectors) + else: + file_suffix = ".pt" + path_pt = path + file_suffix + + if not os.path.isfile(path_pt): + if not os.path.isfile(path) and url: + logger.info("Downloading vectors from {}".format(url)) + if not os.path.exists(cache): + os.makedirs(cache) + dest = os.path.join(cache, os.path.basename(url)) + if not os.path.isfile(dest): + with tqdm(unit="B", unit_scale=True, miniters=1, desc=dest) as t: + try: + urlretrieve(url, dest, reporthook=reporthook(t)) + except KeyboardInterrupt as e: # remove the partial zip file + os.remove(dest) + raise e + logger.info("Extracting vectors into {}".format(cache)) + ext = os.path.splitext(dest)[1][1:] + if ext == "zip": + with zipfile.ZipFile(dest, "r") as zf: + zf.extractall(cache) + elif ext == "gz": + if dest.endswith(".tar.gz"): + with tarfile.open(dest, "r:gz") as tar: + tar.extractall(path=cache) + if not os.path.isfile(path): + raise RuntimeError("no vectors found at {}".format(path)) + + logger.info("Loading vectors from {}".format(path)) + ext = os.path.splitext(path)[1][1:] + if ext == "gz": + open_file = gzip.open + else: + open_file = open + + vectors_loaded = 0 + with open_file(path, "rb") as f: + num_lines, dim = _infer_shape(f) + if not max_vectors or max_vectors > num_lines: + max_vectors = num_lines + + itos, vectors, dim = [], torch.zeros((max_vectors, dim)), None + + for line in tqdm(f, total=max_vectors): + # Explicitly splitting on " " is important, so we don't + # get rid of Unicode non-breaking spaces in the vectors. + entries = line.rstrip().split(b" ") + + word, entries = entries[0], entries[1:] + if dim is None and len(entries) > 1: + dim = len(entries) + elif len(entries) == 1: + logger.warning( + "Skipping token {} with 1-dimensional " "vector {}; likely a header".format(word, entries) + ) + continue + elif dim != len(entries): + raise RuntimeError( + "Vector for token {} has {} dimensions, but previously " + "read vectors have {} dimensions. 
All vectors must have " + "the same number of dimensions.".format(word, len(entries), dim) + ) + + try: + if isinstance(word, bytes): + word = word.decode("utf-8") + except UnicodeDecodeError: + logger.info("Skipping non-UTF8 token {}".format(repr(word))) + continue + + vectors[vectors_loaded] = torch.tensor([float(x) for x in entries]) + vectors_loaded += 1 + itos.append(word) + + if vectors_loaded == max_vectors: + break + + self.itos = itos + self.stoi = {word: i for i, word in enumerate(itos)} + self.vectors = torch.Tensor(vectors).view(-1, dim) + self.dim = dim + logger.info("Saving vectors to {}".format(path_pt)) + if not os.path.exists(cache): + os.makedirs(cache) + torch.save((self.itos, self.stoi, self.vectors, self.dim), path_pt) + else: + logger.info("Loading vectors from {}".format(path_pt)) + self.itos, self.stoi, self.vectors, self.dim = torch.load(path_pt) + + def __len__(self): + return len(self.vectors) + + def get_vecs_by_tokens(self, tokens, lower_case_backup=False): + """Look up embedding vectors of tokens. + + Args: + tokens: a token or a list of tokens. if `tokens` is a string, + returns a 1-D tensor of shape `self.dim`; if `tokens` is a + list of strings, returns a 2-D tensor of shape=(len(tokens), + self.dim). + lower_case_backup : Whether to look up the token in the lower case. + If False, each token in the original case will be looked up; + if True, each token in the original case will be looked up first, + if not found in the keys of the property `stoi`, the token in the + lower case will be looked up. Default: False. + + Examples: + >>> examples = ['chip', 'baby', 'Beautiful'] + >>> vec = text.vocab.GloVe(name='6B', dim=50) + >>> ret = vec.get_vecs_by_tokens(examples, lower_case_backup=True) + """ + to_reduce = False + + if not isinstance(tokens, list): + tokens = [tokens] + to_reduce = True + + if not lower_case_backup: + indices = [self[token] for token in tokens] + else: + indices = [self[token] if token in self.stoi else self[token.lower()] for token in tokens] + + vecs = torch.stack(indices) + return vecs[0] if to_reduce else vecs + + +class GloVe(Vectors): + url = { + "42B": "http://nlp.stanford.edu/data/glove.42B.300d.zip", + "840B": "http://nlp.stanford.edu/data/glove.840B.300d.zip", + "twitter.27B": "http://nlp.stanford.edu/data/glove.twitter.27B.zip", + "6B": "http://nlp.stanford.edu/data/glove.6B.zip", + } + + def __init__(self, name="840B", dim=300, **kwargs) -> None: + url = self.url[name] + name = "glove.{}.{}d.txt".format(name, str(dim)) + super(GloVe, self).__init__(name, url=url, **kwargs) + + +class FastText(Vectors): + + url_base = "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.{}.vec" + + def __init__(self, language="en", **kwargs) -> None: + url = self.url_base.format(language) + name = os.path.basename(url) + super(FastText, self).__init__(name, url=url, **kwargs) + + +class CharNGram(Vectors): + + name = "charNgram.txt" + url = "http://www.logos.t.u-tokyo.ac.jp/~hassy/publications/arxiv2016jmt/" "jmt_pre-trained_embeddings.tar.gz" + + def __init__(self, **kwargs) -> None: + super(CharNGram, self).__init__(self.name, url=self.url, **kwargs) + + def __getitem__(self, token): + vector = torch.Tensor(1, self.dim).zero_() + if token == "": + return self.unk_init(vector) + chars = ["#BEGIN#"] + list(token) + ["#END#"] + num_vectors = 0 + for n in [2, 3, 4]: + end = len(chars) - n + 1 + grams = [chars[i : (i + n)] for i in range(end)] + for gram in grams: + gram_key = "{}gram-{}".format(n, "".join(gram)) + if gram_key in self.stoi: + 
vector += self.vectors[self.stoi[gram_key]] + num_vectors += 1 + if num_vectors > 0: + vector /= num_vectors + else: + vector = self.unk_init(vector) + return vector + + +def reporthook(t): + """ + https://github.com/tqdm/tqdm. + """ + last_b = [0] + + def inner(b=1, bsize=1, tsize=None): + """ + b: int, optional + Number of blocks just transferred [default: 1]. + bsize: int, optional + Size of each block (in tqdm units) [default: 1]. + tsize: int, optional + Total size (in tqdm units). If [default: None] remains unchanged. + """ + if tsize is not None: + t.total = tsize + t.update((b - last_b[0]) * bsize) + last_b[0] = b + + return inner + + +pretrained_aliases = { + "charngram.100d": partial(CharNGram), + "fasttext.en.300d": partial(FastText, language="en"), + "fasttext.simple.300d": partial(FastText, language="simple"), + "glove.42B.300d": partial(GloVe, name="42B", dim="300"), + "glove.840B.300d": partial(GloVe, name="840B", dim="300"), + "glove.twitter.27B.25d": partial(GloVe, name="twitter.27B", dim="25"), + "glove.twitter.27B.50d": partial(GloVe, name="twitter.27B", dim="50"), + "glove.twitter.27B.100d": partial(GloVe, name="twitter.27B", dim="100"), + "glove.twitter.27B.200d": partial(GloVe, name="twitter.27B", dim="200"), + "glove.6B.50d": partial(GloVe, name="6B", dim="50"), + "glove.6B.100d": partial(GloVe, name="6B", dim="100"), + "glove.6B.200d": partial(GloVe, name="6B", dim="200"), + "glove.6B.300d": partial(GloVe, name="6B", dim="300"), +} +"""Mapping from string name to factory function""" diff --git a/lighthouse/common/vocab/vocab.py b/lighthouse/common/vocab/vocab.py new file mode 100644 index 0000000..68e1635 --- /dev/null +++ b/lighthouse/common/vocab/vocab.py @@ -0,0 +1,171 @@ +from typing import Dict, List, Optional + +import torch +import torch.nn as nn + + +def _log_class_usage(klass): + identifier = "torchtext" + if klass and hasattr(klass, "__name__"): + identifier += f".{klass.__name__}" + torch._C._log_api_usage_once(identifier) + + +class Vocab(nn.Module): + __jit_unused_properties__ = ["is_jitable"] + r"""Creates a vocab object which maps tokens to indices. + + Args: + vocab (torch.classes.torchtext.Vocab or torchtext._torchtext.Vocab): a cpp vocab object. + """ + + def __init__(self, vocab) -> None: + super(Vocab, self).__init__() + self.vocab = vocab + _log_class_usage(__class__) + + @property + def is_jitable(self): + return isinstance(self.vocab, torch._C.ScriptObject) + + @torch.jit.export + def forward(self, tokens: List[str]) -> List[int]: + r"""Calls the `lookup_indices` method + + Args: + tokens: a list of tokens used to lookup their corresponding `indices`. + + Returns: + The indices associated with a list of `tokens`. + """ + return self.vocab.lookup_indices(tokens) + + @torch.jit.export + def __len__(self) -> int: + r""" + Returns: + The length of the vocab. + """ + return len(self.vocab) + + @torch.jit.export + def __contains__(self, token: str) -> bool: + r""" + Args: + token: The token for which to check the membership. + + Returns: + Whether the token is member of vocab or not. + """ + return self.vocab.__contains__(token) + + @torch.jit.export + def __getitem__(self, token: str) -> int: + r""" + Args: + token: The token used to lookup the corresponding index. + + Returns: + The index corresponding to the associated token. + """ + return self.vocab[token] + + @torch.jit.export + def set_default_index(self, index: Optional[int]) -> None: + r""" + Args: + index: Value of default index. This index will be returned when OOV token is queried. 
+ """ + self.vocab.set_default_index(index) + + @torch.jit.export + def get_default_index(self) -> Optional[int]: + r""" + Returns: + Value of default index if it is set. + """ + return self.vocab.get_default_index() + + @torch.jit.export + def insert_token(self, token: str, index: int) -> None: + r""" + Args: + token: The token used to lookup the corresponding index. + index: The index corresponding to the associated token. + Raises: + RuntimeError: If `index` is not in range [0, Vocab.size()] or if `token` already exists in the vocab. + """ + self.vocab.insert_token(token, index) + + @torch.jit.export + def append_token(self, token: str) -> None: + r""" + Args: + token: The token used to lookup the corresponding index. + + Raises: + RuntimeError: If `token` already exists in the vocab + """ + self.vocab.append_token(token) + + @torch.jit.export + def lookup_token(self, index: int) -> str: + r""" + Args: + index: The index corresponding to the associated token. + + Returns: + token: The token used to lookup the corresponding index. + + Raises: + RuntimeError: If `index` not in range [0, itos.size()). + """ + return self.vocab.lookup_token(index) + + @torch.jit.export + def lookup_tokens(self, indices: List[int]) -> List[str]: + r""" + Args: + indices: The `indices` used to lookup their corresponding`tokens`. + + Returns: + The `tokens` associated with `indices`. + + Raises: + RuntimeError: If an index within `indices` is not int range [0, itos.size()). + """ + return self.vocab.lookup_tokens(indices) + + @torch.jit.export + def lookup_indices(self, tokens: List[str]) -> List[int]: + r""" + Args: + tokens: the tokens used to lookup their corresponding `indices`. + + Returns: + The 'indices` associated with `tokens`. + """ + return self.vocab.lookup_indices(tokens) + + @torch.jit.export + def get_stoi(self) -> Dict[str, int]: + r""" + Returns: + Dictionary mapping tokens to indices. + """ + return self.vocab.get_stoi() + + @torch.jit.export + def get_itos(self) -> List[str]: + r""" + Returns: + List mapping indices to tokens. 
+ """ + return self.vocab.get_itos() + + def __prepare_scriptable__(self): + r"""Return a JITable Vocab.""" + if not self.is_jitable: + cpp_vocab = torch.classes.torchtext.Vocab(self.vocab.itos_, self.vocab.default_index_) + return Vocab(cpp_vocab) + return self diff --git a/lighthouse/feature_extractor/text_encoders/glove.py b/lighthouse/feature_extractor/text_encoders/glove.py index ef65df8..05bdf06 100644 --- a/lighthouse/feature_extractor/text_encoders/glove.py +++ b/lighthouse/feature_extractor/text_encoders/glove.py @@ -1,6 +1,6 @@ import torch from typing import Tuple -from torchtext import vocab +from lighthouse.common import vocab class GloVe: def __init__( @@ -35,4 +35,4 @@ def __call__( word_inds = torch.LongTensor( [self._vocab.stoi.get(w.lower(), 400000) for w in query.split()]) mask = torch.ones((1, word_inds.shape[0])).to(self._device) - return self._embedding(word_inds).unsqueeze(0).to(self._device), mask \ No newline at end of file + return self._embedding(word_inds).unsqueeze(0).to(self._device), mask diff --git a/mypy.ini b/mypy.ini index 54eecc0..41e0b02 100644 --- a/mypy.ini +++ b/mypy.ini @@ -13,9 +13,6 @@ ignore_missing_imports = True [mypy-torchlibrosa.*] ignore_missing_imports = True -[mypy-torchtext.*] -ignore_missing_imports = True - [mypy-clip.*] ignore_missing_imports = True diff --git a/training/cg_detr_dataset.py b/training/cg_detr_dataset.py index 64c932e..25725c1 100644 --- a/training/cg_detr_dataset.py +++ b/training/cg_detr_dataset.py @@ -41,10 +41,10 @@ import random import logging from os.path import join, exists +from lighthouse.common import vocab from lighthouse.common.utils.basic_utils import load_jsonl, l2_normalize_np_array from lighthouse.common.utils.tensor_utils import pad_sequences_1d from lighthouse.common.utils.span_utils import span_xx_to_cxw -from torchtext import vocab import torch.nn as nn logger = logging.getLogger(__name__) diff --git a/training/dataset.py b/training/dataset.py index d8053d7..0adacdb 100755 --- a/training/dataset.py +++ b/training/dataset.py @@ -62,10 +62,10 @@ import random import logging from os.path import join, exists +from lighthouse.common import vocab from lighthouse.common.utils.basic_utils import load_jsonl, l2_normalize_np_array from lighthouse.common.utils.tensor_utils import pad_sequences_1d from lighthouse.common.utils.span_utils import span_xx_to_cxw -from torchtext import vocab import torch.nn as nn logger = logging.getLogger(__name__) From 47df93e673a51797a621b78f6bb0200d2a06cb8d Mon Sep 17 00:00:00 2001 From: Hokuto Munakata Date: Mon, 11 Aug 2025 13:17:47 +0900 Subject: [PATCH 2/2] Remove unnecessary file --- lighthouse/common/vocab.py | 586 ------------------------------------- 1 file changed, 586 deletions(-) delete mode 100644 lighthouse/common/vocab.py diff --git a/lighthouse/common/vocab.py b/lighthouse/common/vocab.py deleted file mode 100644 index bc3157b..0000000 --- a/lighthouse/common/vocab.py +++ /dev/null @@ -1,586 +0,0 @@ -import gzip -import logging -import os -import tarfile -import zipfile -from collections import Counter, defaultdict -from functools import partial -from urllib.request import urlretrieve - -import torch -from tqdm import tqdm - -from .utils import reporthook - -logger = logging.getLogger(__name__) - - -class Vocab(object): - """Defines a vocabulary object that will be used to numericalize a field. - - Attributes: - freqs: A collections.Counter object holding the frequencies of tokens - in the data used to build the Vocab. 
- stoi: A collections.defaultdict instance mapping token strings to - numerical identifiers. - itos: A list of token strings indexed by their numerical identifiers. - """ - - # TODO (@mttk): Populate classs with default values of special symbols - UNK = "" - - def __init__( - self, - counter, - max_size=None, - min_freq=1, - specials=("", ""), - vectors=None, - unk_init=None, - vectors_cache=None, - specials_first=True, - ): - """Create a Vocab object from a collections.Counter. - - Arguments: - counter: collections.Counter object holding the frequencies of - each value found in the data. - max_size: The maximum size of the vocabulary, or None for no - maximum. Default: None. - min_freq: The minimum frequency needed to include a token in the - vocabulary. Values less than 1 will be set to 1. Default: 1. - specials: The list of special tokens (e.g., padding or eos) that - will be prepended to the vocabulary. Default: [', ''] - vectors: One of either the available pretrained vectors - or custom pretrained vectors (see Vocab.load_vectors); - or a list of aforementioned vectors - unk_init (callback): by default, initialize out-of-vocabulary word vectors - to zero vectors; can be any function that takes in a Tensor and - returns a Tensor of the same size. Default: 'torch.zeros' - vectors_cache: directory for cached vectors. Default: '.vector_cache' - specials_first: Whether to add special tokens into the vocabulary at first. - If it is False, they are added into the vocabulary at last. - Default: True. - """ - self.freqs = counter - counter = counter.copy() - min_freq = max(min_freq, 1) - - self.itos = list() - self.unk_index = None - if specials_first: - self.itos = list(specials) - # only extend max size if specials are prepended - max_size = None if max_size is None else max_size + len(specials) - - # frequencies of special tokens are not counted when building vocabulary - # in frequency order - for tok in specials: - del counter[tok] - - # sort by frequency, then alphabetically - words_and_frequencies = sorted(counter.items(), key=lambda tup: tup[0]) - words_and_frequencies.sort(key=lambda tup: tup[1], reverse=True) - - for word, freq in words_and_frequencies: - if freq < min_freq or len(self.itos) == max_size: - break - self.itos.append(word) - - if Vocab.UNK in specials: # hard-coded for now - unk_index = specials.index(Vocab.UNK) # position in list - # account for ordering of specials, set variable - self.unk_index = unk_index if specials_first else len(self.itos) + unk_index - self.stoi = defaultdict(self._default_unk_index) - else: - self.stoi = defaultdict() - - if not specials_first: - self.itos.extend(list(specials)) - - # stoi is simply a reverse dict for itos - self.stoi.update({tok: i for i, tok in enumerate(self.itos)}) - - self.vectors = None - if vectors is not None: - self.load_vectors(vectors, unk_init=unk_init, cache=vectors_cache) - else: - assert unk_init is None and vectors_cache is None - - def _default_unk_index(self): - return self.unk_index - - def __getitem__(self, token): - return self.stoi.get(token, self.stoi.get(Vocab.UNK)) - - def __getstate__(self): - # avoid picking defaultdict - attrs = dict(self.__dict__) - # cast to regular dict - attrs["stoi"] = dict(self.stoi) - return attrs - - def __setstate__(self, state): - if state.get("unk_index", None) is None: - stoi = defaultdict() - else: - stoi = defaultdict(self._default_unk_index) - stoi.update(state["stoi"]) - state["stoi"] = stoi - self.__dict__.update(state) - - def __eq__(self, other): - if self.freqs != 
other.freqs: - return False - if self.stoi != other.stoi: - return False - if self.itos != other.itos: - return False - if self.vectors != other.vectors: - return False - return True - - def __len__(self): - return len(self.itos) - - def lookup_indices(self, tokens): - indices = [self.__getitem__(token) for token in tokens] - return indices - - def extend(self, v, sort=False): - words = sorted(v.itos) if sort else v.itos - for w in words: - if w not in self.stoi: - self.itos.append(w) - self.stoi[w] = len(self.itos) - 1 - - def load_vectors(self, vectors, **kwargs): - """ - Arguments: - vectors: one of or a list containing instantiations of the - GloVe, CharNGram, or Vectors classes. Alternatively, one - of or a list of available pretrained vectors: - - charngram.100d - fasttext.en.300d - fasttext.simple.300d - glove.42B.300d - glove.840B.300d - glove.twitter.27B.25d - glove.twitter.27B.50d - glove.twitter.27B.100d - glove.twitter.27B.200d - glove.6B.50d - glove.6B.100d - glove.6B.200d - glove.6B.300d - - Remaining keyword arguments: Passed to the constructor of Vectors classes. - """ - if not isinstance(vectors, list): - vectors = [vectors] - for idx, vector in enumerate(vectors): - if isinstance(vector, str): - # Convert the string pretrained vector identifier - # to a Vectors object - if vector not in pretrained_aliases: - raise ValueError( - "Got string input vector {}, but allowed pretrained " - "vectors are {}".format(vector, list(pretrained_aliases.keys())) - ) - vectors[idx] = pretrained_aliases[vector](**kwargs) - elif not isinstance(vector, Vectors): - raise ValueError( - "Got input vectors of type {}, expected str or " - "Vectors object".format(type(vector)) - ) - - tot_dim = sum(v.dim for v in vectors) - self.vectors = torch.Tensor(len(self), tot_dim) - for i, token in enumerate(self.itos): - start_dim = 0 - for v in vectors: - end_dim = start_dim + v.dim - self.vectors[i][start_dim:end_dim] = v[token.strip()] - start_dim = end_dim - assert start_dim == tot_dim - - def set_vectors(self, stoi, vectors, dim, unk_init=torch.Tensor.zero_): - """ - Set the vectors for the Vocab instance from a collection of Tensors. - - Arguments: - stoi: A dictionary of string to the index of the associated vector - in the `vectors` input argument. - vectors: An indexed iterable (or other structure supporting __getitem__) that - given an input index, returns a FloatTensor representing the vector - for the token associated with the index. For example, - vector[stoi["string"]] should return the vector for "string". - dim: The dimensionality of the vectors. - unk_init (callback): by default, initialize out-of-vocabulary word vectors - to zero vectors; can be any function that takes in a Tensor and - returns a Tensor of the same size. Default: 'torch.zeros' - """ - self.vectors = torch.Tensor(len(self), dim) - for i, token in enumerate(self.itos): - wv_index = stoi.get(token, None) - if wv_index is not None: - self.vectors[i] = vectors[wv_index] - else: - self.vectors[i] = unk_init(self.vectors[i]) - - -class SubwordVocab(Vocab): - def __init__( - self, - counter, - max_size=None, - specials=(""), - vectors=None, - unk_init=torch.Tensor.zero_, - ): - """Create a revtok subword vocabulary from a collections.Counter. - - Arguments: - counter: collections.Counter object holding the frequencies of - each word found in the data. - max_size: The maximum size of the subword vocabulary, or None for no - maximum. Default: None. 
- specials: The list of special tokens (e.g., padding or eos) that - will be prepended to the vocabulary in addition to an - token. - vectors: One of either the available pretrained vectors - or custom pretrained vectors (see Vocab.load_vectors); - or a list of aforementioned vectors - unk_init (callback): by default, initialize out-of-vocabulary word vectors - to zero vectors; can be any function that takes in a Tensor and - returns a Tensor of the same size. Default: 'torch.zeros - """ - try: - import revtok - except ImportError: - print("Please install revtok.") - raise - - # Hardcode unk_index as subword_vocab has no specials_first argument - self.unk_index = ( - specials.index(SubwordVocab.UNK) if SubwordVocab.UNK in specials else None - ) - - if self.unk_index is None: - self.stoi = defaultdict() - else: - self.stoi = defaultdict(self._default_unk_index) - - self.stoi.update({tok: i for i, tok in enumerate(specials)}) - self.itos = specials.copy() - - self.segment = revtok.SubwordSegmenter(counter, max_size) - - max_size = None if max_size is None else max_size + len(self.itos) - - # sort by frequency/entropy, then alphabetically - toks = sorted( - self.segment.vocab.items(), - key=lambda tup: (len(tup[0]) != 1, -tup[1], tup[0]), - ) - - for tok, _ in toks: - if len(self.itos) == max_size: - break - self.itos.append(tok) - self.stoi[tok] = len(self.itos) - 1 - - if vectors is not None: - self.load_vectors(vectors, unk_init=unk_init) - - -def _infer_shape(f): - num_lines, vector_dim = 0, None - for line in f: - if vector_dim is None: - row = line.rstrip().split(b" ") - vector = row[1:] - # Assuming word, [vector] format - if len(vector) > 2: - # The header present in some (w2v) formats contains two elements. - vector_dim = len(vector) - num_lines += 1 # First element read - else: - num_lines += 1 - f.seek(0) - return num_lines, vector_dim - - -class Vectors(object): - def __init__(self, name, cache=None, url=None, unk_init=None, max_vectors=None): - """ - Arguments: - - name: name of the file that contains the vectors - cache: directory for cached vectors - url: url for download if vectors not found in cache - unk_init (callback): by default, initialize out-of-vocabulary word vectors - to zero vectors; can be any function that takes in a Tensor and returns a Tensor of the same size - max_vectors (int): this can be used to limit the number of - pre-trained vectors loaded. - Most pre-trained vector sets are sorted - in the descending order of word frequency. - Thus, in situations where the entire set doesn't fit in memory, - or is not needed for another reason, passing `max_vectors` - can limit the size of the loaded set. 
- """ - - cache = ".vector_cache" if cache is None else cache - self.itos = None - self.stoi = None - self.vectors = None - self.dim = None - self.unk_init = torch.Tensor.zero_ if unk_init is None else unk_init - self.cache(name, cache, url=url, max_vectors=max_vectors) - - def __getitem__(self, token): - if token in self.stoi: - return self.vectors[self.stoi[token]] - else: - return self.unk_init(torch.Tensor(self.dim)) - - def cache(self, name, cache, url=None, max_vectors=None): - import ssl - - ssl._create_default_https_context = ssl._create_unverified_context - if os.path.isfile(name): - path = name - if max_vectors: - file_suffix = "_{}.pt".format(max_vectors) - else: - file_suffix = ".pt" - path_pt = os.path.join(cache, os.path.basename(name)) + file_suffix - else: - path = os.path.join(cache, name) - if max_vectors: - file_suffix = "_{}.pt".format(max_vectors) - else: - file_suffix = ".pt" - path_pt = path + file_suffix - - if not os.path.isfile(path_pt): - if not os.path.isfile(path) and url: - logger.info("Downloading vectors from {}".format(url)) - if not os.path.exists(cache): - os.makedirs(cache) - dest = os.path.join(cache, os.path.basename(url)) - if not os.path.isfile(dest): - with tqdm(unit="B", unit_scale=True, miniters=1, desc=dest) as t: - try: - urlretrieve(url, dest, reporthook=reporthook(t)) - except KeyboardInterrupt as e: # remove the partial zip file - os.remove(dest) - raise e - logger.info("Extracting vectors into {}".format(cache)) - ext = os.path.splitext(dest)[1][1:] - if ext == "zip": - with zipfile.ZipFile(dest, "r") as zf: - zf.extractall(cache) - elif ext == "gz": - if dest.endswith(".tar.gz"): - with tarfile.open(dest, "r:gz") as tar: - tar.extractall(path=cache) - if not os.path.isfile(path): - raise RuntimeError("no vectors found at {}".format(path)) - - logger.info("Loading vectors from {}".format(path)) - ext = os.path.splitext(path)[1][1:] - if ext == "gz": - open_file = gzip.open - else: - open_file = open - - vectors_loaded = 0 - with open_file(path, "rb") as f: - num_lines, dim = _infer_shape(f) - if not max_vectors or max_vectors > num_lines: - max_vectors = num_lines - - itos, vectors, dim = [], torch.zeros((max_vectors, dim)), None - - for line in tqdm(f, total=max_vectors): - # Explicitly splitting on " " is important, so we don't - # get rid of Unicode non-breaking spaces in the vectors. - entries = line.rstrip().split(b" ") - - word, entries = entries[0], entries[1:] - if dim is None and len(entries) > 1: - dim = len(entries) - elif len(entries) == 1: - logger.warning( - "Skipping token {} with 1-dimensional " - "vector {}; likely a header".format(word, entries) - ) - continue - elif dim != len(entries): - raise RuntimeError( - "Vector for token {} has {} dimensions, but previously " - "read vectors have {} dimensions. 
All vectors must have " - "the same number of dimensions.".format( - word, len(entries), dim - ) - ) - - try: - if isinstance(word, bytes): - word = word.decode("utf-8") - except UnicodeDecodeError: - logger.info("Skipping non-UTF8 token {}".format(repr(word))) - continue - - vectors[vectors_loaded] = torch.tensor([float(x) for x in entries]) - vectors_loaded += 1 - itos.append(word) - - if vectors_loaded == max_vectors: - break - - self.itos = itos - self.stoi = {word: i for i, word in enumerate(itos)} - self.vectors = torch.Tensor(vectors).view(-1, dim) - self.dim = dim - logger.info("Saving vectors to {}".format(path_pt)) - if not os.path.exists(cache): - os.makedirs(cache) - torch.save((self.itos, self.stoi, self.vectors, self.dim), path_pt) - else: - logger.info("Loading vectors from {}".format(path_pt)) - self.itos, self.stoi, self.vectors, self.dim = torch.load(path_pt) - - def __len__(self): - return len(self.vectors) - - def get_vecs_by_tokens(self, tokens, lower_case_backup=False): - """Look up embedding vectors of tokens. - - Arguments: - tokens: a token or a list of tokens. if `tokens` is a string, - returns a 1-D tensor of shape `self.dim`; if `tokens` is a - list of strings, returns a 2-D tensor of shape=(len(tokens), - self.dim). - lower_case_backup : Whether to look up the token in the lower case. - If False, each token in the original case will be looked up; - if True, each token in the original case will be looked up first, - if not found in the keys of the property `stoi`, the token in the - lower case will be looked up. Default: False. - - Examples: - >>> examples = ['chip', 'baby', 'Beautiful'] - >>> vec = text.vocab.GloVe(name='6B', dim=50) - >>> ret = vec.get_vecs_by_tokens(tokens, lower_case_backup=True) - """ - to_reduce = False - - if not isinstance(tokens, list): - tokens = [tokens] - to_reduce = True - - if not lower_case_backup: - indices = [self[token] for token in tokens] - else: - indices = [ - self[token] if token in self.stoi else self[token.lower()] - for token in tokens - ] - - vecs = torch.stack(indices) - return vecs[0] if to_reduce else vecs - - -class GloVe(Vectors): - url = { - "42B": "http://nlp.stanford.edu/data/glove.42B.300d.zip", - "840B": "http://nlp.stanford.edu/data/glove.840B.300d.zip", - "twitter.27B": "http://nlp.stanford.edu/data/glove.twitter.27B.zip", - "6B": "http://nlp.stanford.edu/data/glove.6B.zip", - } - - def __init__(self, name="840B", dim=300, **kwargs): - url = self.url[name] - name = "glove.{}.{}d.txt".format(name, str(dim)) - super(GloVe, self).__init__(name, url=url, **kwargs) - - -class FastText(Vectors): - url_base = "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.{}.vec" - - def __init__(self, language="en", **kwargs): - url = self.url_base.format(language) - name = os.path.basename(url) - super(FastText, self).__init__(name, url=url, **kwargs) - - -class CharNGram(Vectors): - name = "charNgram.txt" - url = ( - "http://www.logos.t.u-tokyo.ac.jp/~hassy/publications/arxiv2016jmt/" - "jmt_pre-trained_embeddings.tar.gz" - ) - - def __init__(self, **kwargs): - super(CharNGram, self).__init__(self.name, url=self.url, **kwargs) - - def __getitem__(self, token): - vector = torch.Tensor(1, self.dim).zero_() - if token == "": - return self.unk_init(vector) - chars = ["#BEGIN#"] + list(token) + ["#END#"] - num_vectors = 0 - for n in [2, 3, 4]: - end = len(chars) - n + 1 - grams = [chars[i : (i + n)] for i in range(end)] - for gram in grams: - gram_key = "{}gram-{}".format(n, "".join(gram)) - if gram_key in self.stoi: - 
vector += self.vectors[self.stoi[gram_key]] - num_vectors += 1 - if num_vectors > 0: - vector /= num_vectors - else: - vector = self.unk_init(vector) - return vector - - -pretrained_aliases = { - "charngram.100d": partial(CharNGram), - "fasttext.en.300d": partial(FastText, language="en"), - "fasttext.simple.300d": partial(FastText, language="simple"), - "glove.42B.300d": partial(GloVe, name="42B", dim="300"), - "glove.840B.300d": partial(GloVe, name="840B", dim="300"), - "glove.twitter.27B.25d": partial(GloVe, name="twitter.27B", dim="25"), - "glove.twitter.27B.50d": partial(GloVe, name="twitter.27B", dim="50"), - "glove.twitter.27B.100d": partial(GloVe, name="twitter.27B", dim="100"), - "glove.twitter.27B.200d": partial(GloVe, name="twitter.27B", dim="200"), - "glove.6B.50d": partial(GloVe, name="6B", dim="50"), - "glove.6B.100d": partial(GloVe, name="6B", dim="100"), - "glove.6B.200d": partial(GloVe, name="6B", dim="200"), - "glove.6B.300d": partial(GloVe, name="6B", dim="300"), -} -"""Mapping from string name to factory function""" - - -def build_vocab_from_iterator(iterator, num_lines=None): - """ - Build a Vocab from an iterator. - - Arguments: - iterator: Iterator used to build Vocab. Must yield list or iterator of tokens. - num_lines: The expected number of elements returned by the iterator. - (Default: None) - Optionally, if known, the expected number of elements can be passed to - this factory function for improved progress reporting. - """ - - counter = Counter() - with tqdm(unit_scale=0, unit="lines", total=num_lines) as t: - for tokens in iterator: - counter.update(tokens) - t.update(1) - word_vocab = Vocab(counter) - return word_vocab
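
A minimal usage sketch of the vendored `lighthouse.common.vocab` module introduced above: callers now import `vocab` from `lighthouse.common` instead of `torchtext`. The `6B`/300d configuration and the query string are illustrative assumptions, and the first call needs network access (or a pre-populated `.vector_cache` directory) to fetch the GloVe archive.
```
import torch
from lighthouse.common import vocab  # was: from torchtext import vocab

# Downloads and caches glove.6B.300d.txt on first use, then loads the
# saved .pt tensors on subsequent runs.
glove = vocab.GloVe(name="6B", dim=300)
print(len(glove), glove.dim)  # vocabulary size, embedding dimensionality

# Token lookup as used by the feature extractor and training datasets;
# out-of-vocabulary tokens fall back to the zero-initialized unk vector.
query = "a man plays guitar"
feats = glove.get_vecs_by_tokens(query.split(), lower_case_backup=True)
print(feats.shape)  # (number of tokens, 300)
```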
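The vendored `Vectors` loader can also be exercised offline with a hand-written vector file, which is handy for quick sanity checks without downloading GloVe. The file name, directory, and values below are made up for illustration; the first construction parses the text file, infers the dimensionality, and writes a `toy.vec.pt` tensor into the cache directory, and later constructions with the same arguments load that cached tensor directly.
```
import os
import tempfile
import torch
from lighthouse.common.vocab import Vectors

tmp = tempfile.mkdtemp()
vec_path = os.path.join(tmp, "toy.vec")
with open(vec_path, "w", encoding="utf-8") as f:
    f.write("hello 0.1 0.2 0.3\n")
    f.write("world 0.4 0.5 0.6\n")

# First call: parses toy.vec, infers dim=3, saves toy.vec.pt under `cache`.
vecs = Vectors(vec_path, cache=os.path.join(tmp, ".vector_cache"))
print(vecs.dim)               # 3
print(vecs["hello"])          # tensor([0.1000, 0.2000, 0.3000])
print(vecs["missing-token"])  # zero vector from the default unk_init
```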