From 704aec5b05c2cd59dc917413fdf2982b072657c9 Mon Sep 17 00:00:00 2001 From: Hokuto Munakata Date: Mon, 11 Aug 2025 12:43:21 +0900 Subject: [PATCH 1/2] Remove torch text --- .github/workflows/mypy_ruff.yml | 2 +- .github/workflows/pytest.yml | 2 +- README.md | 4 +- lighthouse/common/vocab.py | 586 ++++++++++++++++++ lighthouse/common/vocab/__init__.py | 13 + lighthouse/common/vocab/vectors.py | 301 +++++++++ lighthouse/common/vocab/vocab.py | 171 +++++ .../feature_extractor/text_encoders/glove.py | 4 +- mypy.ini | 3 - training/cg_detr_dataset.py | 2 +- training/dataset.py | 2 +- 11 files changed, 1079 insertions(+), 11 deletions(-) create mode 100644 lighthouse/common/vocab.py create mode 100644 lighthouse/common/vocab/__init__.py create mode 100644 lighthouse/common/vocab/vectors.py create mode 100644 lighthouse/common/vocab/vocab.py diff --git a/.github/workflows/mypy_ruff.yml b/.github/workflows/mypy_ruff.yml index 629908b..f7e423f 100644 --- a/.github/workflows/mypy_ruff.yml +++ b/.github/workflows/mypy_ruff.yml @@ -22,7 +22,7 @@ jobs: - name: Run dependency libraries run: | pip install -e . - pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 torchtext==0.16.0 + pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 pip install mypy ruff - name: Run mypy diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 61c107f..476084b 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -28,7 +28,7 @@ jobs: - name: Run dependency libraries run: | pip install -e . - pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 torchtext==0.16.0 + pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 pip install pytest - name: Run pytest run: pytest tests/test_models.py diff --git a/README.md b/README.md index 949ff7a..19d568a 100755 --- a/README.md +++ b/README.md @@ -24,10 +24,10 @@ Install ffmpeg first. If you are an Ubuntu user, run: ``` apt install ffmpeg ``` -Then, install pytorch, torchvision, torchaudio, and torchtext based on your GPU environments. +Then, install pytorch, torchvision, and torchaudio based on your GPU environments. Note that the inference API is available for CPU environments. We tested the codes on Python 3.9 and CUDA 11.8: ``` -pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 torchtext==0.16.0 --index-url https://download.pytorch.org/whl/cu118 +pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118 ``` Finally, run to install dependency libraries: ``` diff --git a/lighthouse/common/vocab.py b/lighthouse/common/vocab.py new file mode 100644 index 0000000..bc3157b --- /dev/null +++ b/lighthouse/common/vocab.py @@ -0,0 +1,586 @@ +import gzip +import logging +import os +import tarfile +import zipfile +from collections import Counter, defaultdict +from functools import partial +from urllib.request import urlretrieve + +import torch +from tqdm import tqdm + +from .utils import reporthook + +logger = logging.getLogger(__name__) + + +class Vocab(object): + """Defines a vocabulary object that will be used to numericalize a field. + + Attributes: + freqs: A collections.Counter object holding the frequencies of tokens + in the data used to build the Vocab. + stoi: A collections.defaultdict instance mapping token strings to + numerical identifiers. + itos: A list of token strings indexed by their numerical identifiers. 
+ """ + + # TODO (@mttk): Populate classs with default values of special symbols + UNK = "" + + def __init__( + self, + counter, + max_size=None, + min_freq=1, + specials=("", ""), + vectors=None, + unk_init=None, + vectors_cache=None, + specials_first=True, + ): + """Create a Vocab object from a collections.Counter. + + Arguments: + counter: collections.Counter object holding the frequencies of + each value found in the data. + max_size: The maximum size of the vocabulary, or None for no + maximum. Default: None. + min_freq: The minimum frequency needed to include a token in the + vocabulary. Values less than 1 will be set to 1. Default: 1. + specials: The list of special tokens (e.g., padding or eos) that + will be prepended to the vocabulary. Default: [', ''] + vectors: One of either the available pretrained vectors + or custom pretrained vectors (see Vocab.load_vectors); + or a list of aforementioned vectors + unk_init (callback): by default, initialize out-of-vocabulary word vectors + to zero vectors; can be any function that takes in a Tensor and + returns a Tensor of the same size. Default: 'torch.zeros' + vectors_cache: directory for cached vectors. Default: '.vector_cache' + specials_first: Whether to add special tokens into the vocabulary at first. + If it is False, they are added into the vocabulary at last. + Default: True. + """ + self.freqs = counter + counter = counter.copy() + min_freq = max(min_freq, 1) + + self.itos = list() + self.unk_index = None + if specials_first: + self.itos = list(specials) + # only extend max size if specials are prepended + max_size = None if max_size is None else max_size + len(specials) + + # frequencies of special tokens are not counted when building vocabulary + # in frequency order + for tok in specials: + del counter[tok] + + # sort by frequency, then alphabetically + words_and_frequencies = sorted(counter.items(), key=lambda tup: tup[0]) + words_and_frequencies.sort(key=lambda tup: tup[1], reverse=True) + + for word, freq in words_and_frequencies: + if freq < min_freq or len(self.itos) == max_size: + break + self.itos.append(word) + + if Vocab.UNK in specials: # hard-coded for now + unk_index = specials.index(Vocab.UNK) # position in list + # account for ordering of specials, set variable + self.unk_index = unk_index if specials_first else len(self.itos) + unk_index + self.stoi = defaultdict(self._default_unk_index) + else: + self.stoi = defaultdict() + + if not specials_first: + self.itos.extend(list(specials)) + + # stoi is simply a reverse dict for itos + self.stoi.update({tok: i for i, tok in enumerate(self.itos)}) + + self.vectors = None + if vectors is not None: + self.load_vectors(vectors, unk_init=unk_init, cache=vectors_cache) + else: + assert unk_init is None and vectors_cache is None + + def _default_unk_index(self): + return self.unk_index + + def __getitem__(self, token): + return self.stoi.get(token, self.stoi.get(Vocab.UNK)) + + def __getstate__(self): + # avoid picking defaultdict + attrs = dict(self.__dict__) + # cast to regular dict + attrs["stoi"] = dict(self.stoi) + return attrs + + def __setstate__(self, state): + if state.get("unk_index", None) is None: + stoi = defaultdict() + else: + stoi = defaultdict(self._default_unk_index) + stoi.update(state["stoi"]) + state["stoi"] = stoi + self.__dict__.update(state) + + def __eq__(self, other): + if self.freqs != other.freqs: + return False + if self.stoi != other.stoi: + return False + if self.itos != other.itos: + return False + if self.vectors != other.vectors: + return 
False + return True + + def __len__(self): + return len(self.itos) + + def lookup_indices(self, tokens): + indices = [self.__getitem__(token) for token in tokens] + return indices + + def extend(self, v, sort=False): + words = sorted(v.itos) if sort else v.itos + for w in words: + if w not in self.stoi: + self.itos.append(w) + self.stoi[w] = len(self.itos) - 1 + + def load_vectors(self, vectors, **kwargs): + """ + Arguments: + vectors: one of or a list containing instantiations of the + GloVe, CharNGram, or Vectors classes. Alternatively, one + of or a list of available pretrained vectors: + + charngram.100d + fasttext.en.300d + fasttext.simple.300d + glove.42B.300d + glove.840B.300d + glove.twitter.27B.25d + glove.twitter.27B.50d + glove.twitter.27B.100d + glove.twitter.27B.200d + glove.6B.50d + glove.6B.100d + glove.6B.200d + glove.6B.300d + + Remaining keyword arguments: Passed to the constructor of Vectors classes. + """ + if not isinstance(vectors, list): + vectors = [vectors] + for idx, vector in enumerate(vectors): + if isinstance(vector, str): + # Convert the string pretrained vector identifier + # to a Vectors object + if vector not in pretrained_aliases: + raise ValueError( + "Got string input vector {}, but allowed pretrained " + "vectors are {}".format(vector, list(pretrained_aliases.keys())) + ) + vectors[idx] = pretrained_aliases[vector](**kwargs) + elif not isinstance(vector, Vectors): + raise ValueError( + "Got input vectors of type {}, expected str or " + "Vectors object".format(type(vector)) + ) + + tot_dim = sum(v.dim for v in vectors) + self.vectors = torch.Tensor(len(self), tot_dim) + for i, token in enumerate(self.itos): + start_dim = 0 + for v in vectors: + end_dim = start_dim + v.dim + self.vectors[i][start_dim:end_dim] = v[token.strip()] + start_dim = end_dim + assert start_dim == tot_dim + + def set_vectors(self, stoi, vectors, dim, unk_init=torch.Tensor.zero_): + """ + Set the vectors for the Vocab instance from a collection of Tensors. + + Arguments: + stoi: A dictionary of string to the index of the associated vector + in the `vectors` input argument. + vectors: An indexed iterable (or other structure supporting __getitem__) that + given an input index, returns a FloatTensor representing the vector + for the token associated with the index. For example, + vector[stoi["string"]] should return the vector for "string". + dim: The dimensionality of the vectors. + unk_init (callback): by default, initialize out-of-vocabulary word vectors + to zero vectors; can be any function that takes in a Tensor and + returns a Tensor of the same size. Default: 'torch.zeros' + """ + self.vectors = torch.Tensor(len(self), dim) + for i, token in enumerate(self.itos): + wv_index = stoi.get(token, None) + if wv_index is not None: + self.vectors[i] = vectors[wv_index] + else: + self.vectors[i] = unk_init(self.vectors[i]) + + +class SubwordVocab(Vocab): + def __init__( + self, + counter, + max_size=None, + specials=(""), + vectors=None, + unk_init=torch.Tensor.zero_, + ): + """Create a revtok subword vocabulary from a collections.Counter. + + Arguments: + counter: collections.Counter object holding the frequencies of + each word found in the data. + max_size: The maximum size of the subword vocabulary, or None for no + maximum. Default: None. + specials: The list of special tokens (e.g., padding or eos) that + will be prepended to the vocabulary in addition to an + token. 
+ vectors: One of either the available pretrained vectors + or custom pretrained vectors (see Vocab.load_vectors); + or a list of aforementioned vectors + unk_init (callback): by default, initialize out-of-vocabulary word vectors + to zero vectors; can be any function that takes in a Tensor and + returns a Tensor of the same size. Default: 'torch.zeros + """ + try: + import revtok + except ImportError: + print("Please install revtok.") + raise + + # Hardcode unk_index as subword_vocab has no specials_first argument + self.unk_index = ( + specials.index(SubwordVocab.UNK) if SubwordVocab.UNK in specials else None + ) + + if self.unk_index is None: + self.stoi = defaultdict() + else: + self.stoi = defaultdict(self._default_unk_index) + + self.stoi.update({tok: i for i, tok in enumerate(specials)}) + self.itos = specials.copy() + + self.segment = revtok.SubwordSegmenter(counter, max_size) + + max_size = None if max_size is None else max_size + len(self.itos) + + # sort by frequency/entropy, then alphabetically + toks = sorted( + self.segment.vocab.items(), + key=lambda tup: (len(tup[0]) != 1, -tup[1], tup[0]), + ) + + for tok, _ in toks: + if len(self.itos) == max_size: + break + self.itos.append(tok) + self.stoi[tok] = len(self.itos) - 1 + + if vectors is not None: + self.load_vectors(vectors, unk_init=unk_init) + + +def _infer_shape(f): + num_lines, vector_dim = 0, None + for line in f: + if vector_dim is None: + row = line.rstrip().split(b" ") + vector = row[1:] + # Assuming word, [vector] format + if len(vector) > 2: + # The header present in some (w2v) formats contains two elements. + vector_dim = len(vector) + num_lines += 1 # First element read + else: + num_lines += 1 + f.seek(0) + return num_lines, vector_dim + + +class Vectors(object): + def __init__(self, name, cache=None, url=None, unk_init=None, max_vectors=None): + """ + Arguments: + + name: name of the file that contains the vectors + cache: directory for cached vectors + url: url for download if vectors not found in cache + unk_init (callback): by default, initialize out-of-vocabulary word vectors + to zero vectors; can be any function that takes in a Tensor and returns a Tensor of the same size + max_vectors (int): this can be used to limit the number of + pre-trained vectors loaded. + Most pre-trained vector sets are sorted + in the descending order of word frequency. + Thus, in situations where the entire set doesn't fit in memory, + or is not needed for another reason, passing `max_vectors` + can limit the size of the loaded set. 
+ """ + + cache = ".vector_cache" if cache is None else cache + self.itos = None + self.stoi = None + self.vectors = None + self.dim = None + self.unk_init = torch.Tensor.zero_ if unk_init is None else unk_init + self.cache(name, cache, url=url, max_vectors=max_vectors) + + def __getitem__(self, token): + if token in self.stoi: + return self.vectors[self.stoi[token]] + else: + return self.unk_init(torch.Tensor(self.dim)) + + def cache(self, name, cache, url=None, max_vectors=None): + import ssl + + ssl._create_default_https_context = ssl._create_unverified_context + if os.path.isfile(name): + path = name + if max_vectors: + file_suffix = "_{}.pt".format(max_vectors) + else: + file_suffix = ".pt" + path_pt = os.path.join(cache, os.path.basename(name)) + file_suffix + else: + path = os.path.join(cache, name) + if max_vectors: + file_suffix = "_{}.pt".format(max_vectors) + else: + file_suffix = ".pt" + path_pt = path + file_suffix + + if not os.path.isfile(path_pt): + if not os.path.isfile(path) and url: + logger.info("Downloading vectors from {}".format(url)) + if not os.path.exists(cache): + os.makedirs(cache) + dest = os.path.join(cache, os.path.basename(url)) + if not os.path.isfile(dest): + with tqdm(unit="B", unit_scale=True, miniters=1, desc=dest) as t: + try: + urlretrieve(url, dest, reporthook=reporthook(t)) + except KeyboardInterrupt as e: # remove the partial zip file + os.remove(dest) + raise e + logger.info("Extracting vectors into {}".format(cache)) + ext = os.path.splitext(dest)[1][1:] + if ext == "zip": + with zipfile.ZipFile(dest, "r") as zf: + zf.extractall(cache) + elif ext == "gz": + if dest.endswith(".tar.gz"): + with tarfile.open(dest, "r:gz") as tar: + tar.extractall(path=cache) + if not os.path.isfile(path): + raise RuntimeError("no vectors found at {}".format(path)) + + logger.info("Loading vectors from {}".format(path)) + ext = os.path.splitext(path)[1][1:] + if ext == "gz": + open_file = gzip.open + else: + open_file = open + + vectors_loaded = 0 + with open_file(path, "rb") as f: + num_lines, dim = _infer_shape(f) + if not max_vectors or max_vectors > num_lines: + max_vectors = num_lines + + itos, vectors, dim = [], torch.zeros((max_vectors, dim)), None + + for line in tqdm(f, total=max_vectors): + # Explicitly splitting on " " is important, so we don't + # get rid of Unicode non-breaking spaces in the vectors. + entries = line.rstrip().split(b" ") + + word, entries = entries[0], entries[1:] + if dim is None and len(entries) > 1: + dim = len(entries) + elif len(entries) == 1: + logger.warning( + "Skipping token {} with 1-dimensional " + "vector {}; likely a header".format(word, entries) + ) + continue + elif dim != len(entries): + raise RuntimeError( + "Vector for token {} has {} dimensions, but previously " + "read vectors have {} dimensions. 
All vectors must have " + "the same number of dimensions.".format( + word, len(entries), dim + ) + ) + + try: + if isinstance(word, bytes): + word = word.decode("utf-8") + except UnicodeDecodeError: + logger.info("Skipping non-UTF8 token {}".format(repr(word))) + continue + + vectors[vectors_loaded] = torch.tensor([float(x) for x in entries]) + vectors_loaded += 1 + itos.append(word) + + if vectors_loaded == max_vectors: + break + + self.itos = itos + self.stoi = {word: i for i, word in enumerate(itos)} + self.vectors = torch.Tensor(vectors).view(-1, dim) + self.dim = dim + logger.info("Saving vectors to {}".format(path_pt)) + if not os.path.exists(cache): + os.makedirs(cache) + torch.save((self.itos, self.stoi, self.vectors, self.dim), path_pt) + else: + logger.info("Loading vectors from {}".format(path_pt)) + self.itos, self.stoi, self.vectors, self.dim = torch.load(path_pt) + + def __len__(self): + return len(self.vectors) + + def get_vecs_by_tokens(self, tokens, lower_case_backup=False): + """Look up embedding vectors of tokens. + + Arguments: + tokens: a token or a list of tokens. if `tokens` is a string, + returns a 1-D tensor of shape `self.dim`; if `tokens` is a + list of strings, returns a 2-D tensor of shape=(len(tokens), + self.dim). + lower_case_backup : Whether to look up the token in the lower case. + If False, each token in the original case will be looked up; + if True, each token in the original case will be looked up first, + if not found in the keys of the property `stoi`, the token in the + lower case will be looked up. Default: False. + + Examples: + >>> examples = ['chip', 'baby', 'Beautiful'] + >>> vec = text.vocab.GloVe(name='6B', dim=50) + >>> ret = vec.get_vecs_by_tokens(tokens, lower_case_backup=True) + """ + to_reduce = False + + if not isinstance(tokens, list): + tokens = [tokens] + to_reduce = True + + if not lower_case_backup: + indices = [self[token] for token in tokens] + else: + indices = [ + self[token] if token in self.stoi else self[token.lower()] + for token in tokens + ] + + vecs = torch.stack(indices) + return vecs[0] if to_reduce else vecs + + +class GloVe(Vectors): + url = { + "42B": "http://nlp.stanford.edu/data/glove.42B.300d.zip", + "840B": "http://nlp.stanford.edu/data/glove.840B.300d.zip", + "twitter.27B": "http://nlp.stanford.edu/data/glove.twitter.27B.zip", + "6B": "http://nlp.stanford.edu/data/glove.6B.zip", + } + + def __init__(self, name="840B", dim=300, **kwargs): + url = self.url[name] + name = "glove.{}.{}d.txt".format(name, str(dim)) + super(GloVe, self).__init__(name, url=url, **kwargs) + + +class FastText(Vectors): + url_base = "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.{}.vec" + + def __init__(self, language="en", **kwargs): + url = self.url_base.format(language) + name = os.path.basename(url) + super(FastText, self).__init__(name, url=url, **kwargs) + + +class CharNGram(Vectors): + name = "charNgram.txt" + url = ( + "http://www.logos.t.u-tokyo.ac.jp/~hassy/publications/arxiv2016jmt/" + "jmt_pre-trained_embeddings.tar.gz" + ) + + def __init__(self, **kwargs): + super(CharNGram, self).__init__(self.name, url=self.url, **kwargs) + + def __getitem__(self, token): + vector = torch.Tensor(1, self.dim).zero_() + if token == "": + return self.unk_init(vector) + chars = ["#BEGIN#"] + list(token) + ["#END#"] + num_vectors = 0 + for n in [2, 3, 4]: + end = len(chars) - n + 1 + grams = [chars[i : (i + n)] for i in range(end)] + for gram in grams: + gram_key = "{}gram-{}".format(n, "".join(gram)) + if gram_key in self.stoi: + 
vector += self.vectors[self.stoi[gram_key]] + num_vectors += 1 + if num_vectors > 0: + vector /= num_vectors + else: + vector = self.unk_init(vector) + return vector + + +pretrained_aliases = { + "charngram.100d": partial(CharNGram), + "fasttext.en.300d": partial(FastText, language="en"), + "fasttext.simple.300d": partial(FastText, language="simple"), + "glove.42B.300d": partial(GloVe, name="42B", dim="300"), + "glove.840B.300d": partial(GloVe, name="840B", dim="300"), + "glove.twitter.27B.25d": partial(GloVe, name="twitter.27B", dim="25"), + "glove.twitter.27B.50d": partial(GloVe, name="twitter.27B", dim="50"), + "glove.twitter.27B.100d": partial(GloVe, name="twitter.27B", dim="100"), + "glove.twitter.27B.200d": partial(GloVe, name="twitter.27B", dim="200"), + "glove.6B.50d": partial(GloVe, name="6B", dim="50"), + "glove.6B.100d": partial(GloVe, name="6B", dim="100"), + "glove.6B.200d": partial(GloVe, name="6B", dim="200"), + "glove.6B.300d": partial(GloVe, name="6B", dim="300"), +} +"""Mapping from string name to factory function""" + + +def build_vocab_from_iterator(iterator, num_lines=None): + """ + Build a Vocab from an iterator. + + Arguments: + iterator: Iterator used to build Vocab. Must yield list or iterator of tokens. + num_lines: The expected number of elements returned by the iterator. + (Default: None) + Optionally, if known, the expected number of elements can be passed to + this factory function for improved progress reporting. + """ + + counter = Counter() + with tqdm(unit_scale=0, unit="lines", total=num_lines) as t: + for tokens in iterator: + counter.update(tokens) + t.update(1) + word_vocab = Vocab(counter) + return word_vocab diff --git a/lighthouse/common/vocab/__init__.py b/lighthouse/common/vocab/__init__.py new file mode 100644 index 0000000..c20fd45 --- /dev/null +++ b/lighthouse/common/vocab/__init__.py @@ -0,0 +1,13 @@ +import warnings + +from .vectors import CharNGram, FastText, GloVe, pretrained_aliases, Vectors +from .vocab import Vocab + +__all__ = [ + "Vocab", + "GloVe", + "FastText", + "CharNGram", + "pretrained_aliases", + "Vectors", +] diff --git a/lighthouse/common/vocab/vectors.py b/lighthouse/common/vocab/vectors.py new file mode 100644 index 0000000..f1eacca --- /dev/null +++ b/lighthouse/common/vocab/vectors.py @@ -0,0 +1,301 @@ +import gzip +import logging +import os +import tarfile +import zipfile +from functools import partial +from urllib.request import urlretrieve + +import torch +from tqdm import tqdm + +logger = logging.getLogger(__name__) + + +def _infer_shape(f): + num_lines, vector_dim = 0, None + for line in f: + if vector_dim is None: + row = line.rstrip().split(b" ") + vector = row[1:] + # Assuming word, [vector] format + if len(vector) > 2: + # The header present in some (w2v) formats contains two elements. + vector_dim = len(vector) + num_lines += 1 # First element read + else: + num_lines += 1 + f.seek(0) + return num_lines, vector_dim + + +class Vectors: + def __init__(self, name, cache=None, url=None, unk_init=None, max_vectors=None) -> None: + """ + Args: + + name: name of the file that contains the vectors + cache: directory for cached vectors + url: url for download if vectors not found in cache + unk_init (callback): by default, initialize out-of-vocabulary word vectors + to zero vectors; can be any function that takes in a Tensor and returns a Tensor of the same size + max_vectors (int): this can be used to limit the number of + pre-trained vectors loaded. 
+ Most pre-trained vector sets are sorted + in the descending order of word frequency. + Thus, in situations where the entire set doesn't fit in memory, + or is not needed for another reason, passing `max_vectors` + can limit the size of the loaded set. + """ + + cache = ".vector_cache" if cache is None else cache + self.itos = None + self.stoi = None + self.vectors = None + self.dim = None + self.unk_init = torch.Tensor.zero_ if unk_init is None else unk_init + self.cache(name, cache, url=url, max_vectors=max_vectors) + + def __getitem__(self, token): + if token in self.stoi: + return self.vectors[self.stoi[token]] + else: + return self.unk_init(torch.Tensor(self.dim)) + + def __contains__(self, token): + return token in self.stoi + + def cache(self, name, cache, url=None, max_vectors=None): + import ssl + + ssl._create_default_https_context = ssl._create_unverified_context + if os.path.isfile(name): + path = name + if max_vectors: + file_suffix = "_{}.pt".format(max_vectors) + else: + file_suffix = ".pt" + path_pt = os.path.join(cache, os.path.basename(name)) + file_suffix + else: + path = os.path.join(cache, name) + if max_vectors: + file_suffix = "_{}.pt".format(max_vectors) + else: + file_suffix = ".pt" + path_pt = path + file_suffix + + if not os.path.isfile(path_pt): + if not os.path.isfile(path) and url: + logger.info("Downloading vectors from {}".format(url)) + if not os.path.exists(cache): + os.makedirs(cache) + dest = os.path.join(cache, os.path.basename(url)) + if not os.path.isfile(dest): + with tqdm(unit="B", unit_scale=True, miniters=1, desc=dest) as t: + try: + urlretrieve(url, dest, reporthook=reporthook(t)) + except KeyboardInterrupt as e: # remove the partial zip file + os.remove(dest) + raise e + logger.info("Extracting vectors into {}".format(cache)) + ext = os.path.splitext(dest)[1][1:] + if ext == "zip": + with zipfile.ZipFile(dest, "r") as zf: + zf.extractall(cache) + elif ext == "gz": + if dest.endswith(".tar.gz"): + with tarfile.open(dest, "r:gz") as tar: + tar.extractall(path=cache) + if not os.path.isfile(path): + raise RuntimeError("no vectors found at {}".format(path)) + + logger.info("Loading vectors from {}".format(path)) + ext = os.path.splitext(path)[1][1:] + if ext == "gz": + open_file = gzip.open + else: + open_file = open + + vectors_loaded = 0 + with open_file(path, "rb") as f: + num_lines, dim = _infer_shape(f) + if not max_vectors or max_vectors > num_lines: + max_vectors = num_lines + + itos, vectors, dim = [], torch.zeros((max_vectors, dim)), None + + for line in tqdm(f, total=max_vectors): + # Explicitly splitting on " " is important, so we don't + # get rid of Unicode non-breaking spaces in the vectors. + entries = line.rstrip().split(b" ") + + word, entries = entries[0], entries[1:] + if dim is None and len(entries) > 1: + dim = len(entries) + elif len(entries) == 1: + logger.warning( + "Skipping token {} with 1-dimensional " "vector {}; likely a header".format(word, entries) + ) + continue + elif dim != len(entries): + raise RuntimeError( + "Vector for token {} has {} dimensions, but previously " + "read vectors have {} dimensions. 
All vectors must have " + "the same number of dimensions.".format(word, len(entries), dim) + ) + + try: + if isinstance(word, bytes): + word = word.decode("utf-8") + except UnicodeDecodeError: + logger.info("Skipping non-UTF8 token {}".format(repr(word))) + continue + + vectors[vectors_loaded] = torch.tensor([float(x) for x in entries]) + vectors_loaded += 1 + itos.append(word) + + if vectors_loaded == max_vectors: + break + + self.itos = itos + self.stoi = {word: i for i, word in enumerate(itos)} + self.vectors = torch.Tensor(vectors).view(-1, dim) + self.dim = dim + logger.info("Saving vectors to {}".format(path_pt)) + if not os.path.exists(cache): + os.makedirs(cache) + torch.save((self.itos, self.stoi, self.vectors, self.dim), path_pt) + else: + logger.info("Loading vectors from {}".format(path_pt)) + self.itos, self.stoi, self.vectors, self.dim = torch.load(path_pt) + + def __len__(self): + return len(self.vectors) + + def get_vecs_by_tokens(self, tokens, lower_case_backup=False): + """Look up embedding vectors of tokens. + + Args: + tokens: a token or a list of tokens. if `tokens` is a string, + returns a 1-D tensor of shape `self.dim`; if `tokens` is a + list of strings, returns a 2-D tensor of shape=(len(tokens), + self.dim). + lower_case_backup : Whether to look up the token in the lower case. + If False, each token in the original case will be looked up; + if True, each token in the original case will be looked up first, + if not found in the keys of the property `stoi`, the token in the + lower case will be looked up. Default: False. + + Examples: + >>> examples = ['chip', 'baby', 'Beautiful'] + >>> vec = text.vocab.GloVe(name='6B', dim=50) + >>> ret = vec.get_vecs_by_tokens(examples, lower_case_backup=True) + """ + to_reduce = False + + if not isinstance(tokens, list): + tokens = [tokens] + to_reduce = True + + if not lower_case_backup: + indices = [self[token] for token in tokens] + else: + indices = [self[token] if token in self.stoi else self[token.lower()] for token in tokens] + + vecs = torch.stack(indices) + return vecs[0] if to_reduce else vecs + + +class GloVe(Vectors): + url = { + "42B": "http://nlp.stanford.edu/data/glove.42B.300d.zip", + "840B": "http://nlp.stanford.edu/data/glove.840B.300d.zip", + "twitter.27B": "http://nlp.stanford.edu/data/glove.twitter.27B.zip", + "6B": "http://nlp.stanford.edu/data/glove.6B.zip", + } + + def __init__(self, name="840B", dim=300, **kwargs) -> None: + url = self.url[name] + name = "glove.{}.{}d.txt".format(name, str(dim)) + super(GloVe, self).__init__(name, url=url, **kwargs) + + +class FastText(Vectors): + + url_base = "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.{}.vec" + + def __init__(self, language="en", **kwargs) -> None: + url = self.url_base.format(language) + name = os.path.basename(url) + super(FastText, self).__init__(name, url=url, **kwargs) + + +class CharNGram(Vectors): + + name = "charNgram.txt" + url = "http://www.logos.t.u-tokyo.ac.jp/~hassy/publications/arxiv2016jmt/" "jmt_pre-trained_embeddings.tar.gz" + + def __init__(self, **kwargs) -> None: + super(CharNGram, self).__init__(self.name, url=self.url, **kwargs) + + def __getitem__(self, token): + vector = torch.Tensor(1, self.dim).zero_() + if token == "": + return self.unk_init(vector) + chars = ["#BEGIN#"] + list(token) + ["#END#"] + num_vectors = 0 + for n in [2, 3, 4]: + end = len(chars) - n + 1 + grams = [chars[i : (i + n)] for i in range(end)] + for gram in grams: + gram_key = "{}gram-{}".format(n, "".join(gram)) + if gram_key in self.stoi: + 
vector += self.vectors[self.stoi[gram_key]] + num_vectors += 1 + if num_vectors > 0: + vector /= num_vectors + else: + vector = self.unk_init(vector) + return vector + + +def reporthook(t): + """ + https://github.com/tqdm/tqdm. + """ + last_b = [0] + + def inner(b=1, bsize=1, tsize=None): + """ + b: int, optional + Number of blocks just transferred [default: 1]. + bsize: int, optional + Size of each block (in tqdm units) [default: 1]. + tsize: int, optional + Total size (in tqdm units). If [default: None] remains unchanged. + """ + if tsize is not None: + t.total = tsize + t.update((b - last_b[0]) * bsize) + last_b[0] = b + + return inner + + +pretrained_aliases = { + "charngram.100d": partial(CharNGram), + "fasttext.en.300d": partial(FastText, language="en"), + "fasttext.simple.300d": partial(FastText, language="simple"), + "glove.42B.300d": partial(GloVe, name="42B", dim="300"), + "glove.840B.300d": partial(GloVe, name="840B", dim="300"), + "glove.twitter.27B.25d": partial(GloVe, name="twitter.27B", dim="25"), + "glove.twitter.27B.50d": partial(GloVe, name="twitter.27B", dim="50"), + "glove.twitter.27B.100d": partial(GloVe, name="twitter.27B", dim="100"), + "glove.twitter.27B.200d": partial(GloVe, name="twitter.27B", dim="200"), + "glove.6B.50d": partial(GloVe, name="6B", dim="50"), + "glove.6B.100d": partial(GloVe, name="6B", dim="100"), + "glove.6B.200d": partial(GloVe, name="6B", dim="200"), + "glove.6B.300d": partial(GloVe, name="6B", dim="300"), +} +"""Mapping from string name to factory function""" diff --git a/lighthouse/common/vocab/vocab.py b/lighthouse/common/vocab/vocab.py new file mode 100644 index 0000000..68e1635 --- /dev/null +++ b/lighthouse/common/vocab/vocab.py @@ -0,0 +1,171 @@ +from typing import Dict, List, Optional + +import torch +import torch.nn as nn + + +def _log_class_usage(klass): + identifier = "torchtext" + if klass and hasattr(klass, "__name__"): + identifier += f".{klass.__name__}" + torch._C._log_api_usage_once(identifier) + + +class Vocab(nn.Module): + __jit_unused_properties__ = ["is_jitable"] + r"""Creates a vocab object which maps tokens to indices. + + Args: + vocab (torch.classes.torchtext.Vocab or torchtext._torchtext.Vocab): a cpp vocab object. + """ + + def __init__(self, vocab) -> None: + super(Vocab, self).__init__() + self.vocab = vocab + _log_class_usage(__class__) + + @property + def is_jitable(self): + return isinstance(self.vocab, torch._C.ScriptObject) + + @torch.jit.export + def forward(self, tokens: List[str]) -> List[int]: + r"""Calls the `lookup_indices` method + + Args: + tokens: a list of tokens used to lookup their corresponding `indices`. + + Returns: + The indices associated with a list of `tokens`. + """ + return self.vocab.lookup_indices(tokens) + + @torch.jit.export + def __len__(self) -> int: + r""" + Returns: + The length of the vocab. + """ + return len(self.vocab) + + @torch.jit.export + def __contains__(self, token: str) -> bool: + r""" + Args: + token: The token for which to check the membership. + + Returns: + Whether the token is member of vocab or not. + """ + return self.vocab.__contains__(token) + + @torch.jit.export + def __getitem__(self, token: str) -> int: + r""" + Args: + token: The token used to lookup the corresponding index. + + Returns: + The index corresponding to the associated token. + """ + return self.vocab[token] + + @torch.jit.export + def set_default_index(self, index: Optional[int]) -> None: + r""" + Args: + index: Value of default index. This index will be returned when OOV token is queried. 
+ """ + self.vocab.set_default_index(index) + + @torch.jit.export + def get_default_index(self) -> Optional[int]: + r""" + Returns: + Value of default index if it is set. + """ + return self.vocab.get_default_index() + + @torch.jit.export + def insert_token(self, token: str, index: int) -> None: + r""" + Args: + token: The token used to lookup the corresponding index. + index: The index corresponding to the associated token. + Raises: + RuntimeError: If `index` is not in range [0, Vocab.size()] or if `token` already exists in the vocab. + """ + self.vocab.insert_token(token, index) + + @torch.jit.export + def append_token(self, token: str) -> None: + r""" + Args: + token: The token used to lookup the corresponding index. + + Raises: + RuntimeError: If `token` already exists in the vocab + """ + self.vocab.append_token(token) + + @torch.jit.export + def lookup_token(self, index: int) -> str: + r""" + Args: + index: The index corresponding to the associated token. + + Returns: + token: The token used to lookup the corresponding index. + + Raises: + RuntimeError: If `index` not in range [0, itos.size()). + """ + return self.vocab.lookup_token(index) + + @torch.jit.export + def lookup_tokens(self, indices: List[int]) -> List[str]: + r""" + Args: + indices: The `indices` used to lookup their corresponding`tokens`. + + Returns: + The `tokens` associated with `indices`. + + Raises: + RuntimeError: If an index within `indices` is not int range [0, itos.size()). + """ + return self.vocab.lookup_tokens(indices) + + @torch.jit.export + def lookup_indices(self, tokens: List[str]) -> List[int]: + r""" + Args: + tokens: the tokens used to lookup their corresponding `indices`. + + Returns: + The 'indices` associated with `tokens`. + """ + return self.vocab.lookup_indices(tokens) + + @torch.jit.export + def get_stoi(self) -> Dict[str, int]: + r""" + Returns: + Dictionary mapping tokens to indices. + """ + return self.vocab.get_stoi() + + @torch.jit.export + def get_itos(self) -> List[str]: + r""" + Returns: + List mapping indices to tokens. 
+ """ + return self.vocab.get_itos() + + def __prepare_scriptable__(self): + r"""Return a JITable Vocab.""" + if not self.is_jitable: + cpp_vocab = torch.classes.torchtext.Vocab(self.vocab.itos_, self.vocab.default_index_) + return Vocab(cpp_vocab) + return self diff --git a/lighthouse/feature_extractor/text_encoders/glove.py b/lighthouse/feature_extractor/text_encoders/glove.py index ef65df8..05bdf06 100644 --- a/lighthouse/feature_extractor/text_encoders/glove.py +++ b/lighthouse/feature_extractor/text_encoders/glove.py @@ -1,6 +1,6 @@ import torch from typing import Tuple -from torchtext import vocab +from lighthouse.common import vocab class GloVe: def __init__( @@ -35,4 +35,4 @@ def __call__( word_inds = torch.LongTensor( [self._vocab.stoi.get(w.lower(), 400000) for w in query.split()]) mask = torch.ones((1, word_inds.shape[0])).to(self._device) - return self._embedding(word_inds).unsqueeze(0).to(self._device), mask \ No newline at end of file + return self._embedding(word_inds).unsqueeze(0).to(self._device), mask diff --git a/mypy.ini b/mypy.ini index 54eecc0..41e0b02 100644 --- a/mypy.ini +++ b/mypy.ini @@ -13,9 +13,6 @@ ignore_missing_imports = True [mypy-torchlibrosa.*] ignore_missing_imports = True -[mypy-torchtext.*] -ignore_missing_imports = True - [mypy-clip.*] ignore_missing_imports = True diff --git a/training/cg_detr_dataset.py b/training/cg_detr_dataset.py index 64c932e..25725c1 100644 --- a/training/cg_detr_dataset.py +++ b/training/cg_detr_dataset.py @@ -41,10 +41,10 @@ import random import logging from os.path import join, exists +from lighthouse.common import vocab from lighthouse.common.utils.basic_utils import load_jsonl, l2_normalize_np_array from lighthouse.common.utils.tensor_utils import pad_sequences_1d from lighthouse.common.utils.span_utils import span_xx_to_cxw -from torchtext import vocab import torch.nn as nn logger = logging.getLogger(__name__) diff --git a/training/dataset.py b/training/dataset.py index d8053d7..0adacdb 100755 --- a/training/dataset.py +++ b/training/dataset.py @@ -62,10 +62,10 @@ import random import logging from os.path import join, exists +from lighthouse.common import vocab from lighthouse.common.utils.basic_utils import load_jsonl, l2_normalize_np_array from lighthouse.common.utils.tensor_utils import pad_sequences_1d from lighthouse.common.utils.span_utils import span_xx_to_cxw -from torchtext import vocab import torch.nn as nn logger = logging.getLogger(__name__) From 47df93e673a51797a621b78f6bb0200d2a06cb8d Mon Sep 17 00:00:00 2001 From: Hokuto Munakata Date: Mon, 11 Aug 2025 13:17:47 +0900 Subject: [PATCH 2/2] Remove unnecessary file --- lighthouse/common/vocab.py | 586 ------------------------------------- 1 file changed, 586 deletions(-) delete mode 100644 lighthouse/common/vocab.py diff --git a/lighthouse/common/vocab.py b/lighthouse/common/vocab.py deleted file mode 100644 index bc3157b..0000000 --- a/lighthouse/common/vocab.py +++ /dev/null @@ -1,586 +0,0 @@ -import gzip -import logging -import os -import tarfile -import zipfile -from collections import Counter, defaultdict -from functools import partial -from urllib.request import urlretrieve - -import torch -from tqdm import tqdm - -from .utils import reporthook - -logger = logging.getLogger(__name__) - - -class Vocab(object): - """Defines a vocabulary object that will be used to numericalize a field. - - Attributes: - freqs: A collections.Counter object holding the frequencies of tokens - in the data used to build the Vocab. 
- stoi: A collections.defaultdict instance mapping token strings to - numerical identifiers. - itos: A list of token strings indexed by their numerical identifiers. - """ - - # TODO (@mttk): Populate classs with default values of special symbols - UNK = "" - - def __init__( - self, - counter, - max_size=None, - min_freq=1, - specials=("", ""), - vectors=None, - unk_init=None, - vectors_cache=None, - specials_first=True, - ): - """Create a Vocab object from a collections.Counter. - - Arguments: - counter: collections.Counter object holding the frequencies of - each value found in the data. - max_size: The maximum size of the vocabulary, or None for no - maximum. Default: None. - min_freq: The minimum frequency needed to include a token in the - vocabulary. Values less than 1 will be set to 1. Default: 1. - specials: The list of special tokens (e.g., padding or eos) that - will be prepended to the vocabulary. Default: [', ''] - vectors: One of either the available pretrained vectors - or custom pretrained vectors (see Vocab.load_vectors); - or a list of aforementioned vectors - unk_init (callback): by default, initialize out-of-vocabulary word vectors - to zero vectors; can be any function that takes in a Tensor and - returns a Tensor of the same size. Default: 'torch.zeros' - vectors_cache: directory for cached vectors. Default: '.vector_cache' - specials_first: Whether to add special tokens into the vocabulary at first. - If it is False, they are added into the vocabulary at last. - Default: True. - """ - self.freqs = counter - counter = counter.copy() - min_freq = max(min_freq, 1) - - self.itos = list() - self.unk_index = None - if specials_first: - self.itos = list(specials) - # only extend max size if specials are prepended - max_size = None if max_size is None else max_size + len(specials) - - # frequencies of special tokens are not counted when building vocabulary - # in frequency order - for tok in specials: - del counter[tok] - - # sort by frequency, then alphabetically - words_and_frequencies = sorted(counter.items(), key=lambda tup: tup[0]) - words_and_frequencies.sort(key=lambda tup: tup[1], reverse=True) - - for word, freq in words_and_frequencies: - if freq < min_freq or len(self.itos) == max_size: - break - self.itos.append(word) - - if Vocab.UNK in specials: # hard-coded for now - unk_index = specials.index(Vocab.UNK) # position in list - # account for ordering of specials, set variable - self.unk_index = unk_index if specials_first else len(self.itos) + unk_index - self.stoi = defaultdict(self._default_unk_index) - else: - self.stoi = defaultdict() - - if not specials_first: - self.itos.extend(list(specials)) - - # stoi is simply a reverse dict for itos - self.stoi.update({tok: i for i, tok in enumerate(self.itos)}) - - self.vectors = None - if vectors is not None: - self.load_vectors(vectors, unk_init=unk_init, cache=vectors_cache) - else: - assert unk_init is None and vectors_cache is None - - def _default_unk_index(self): - return self.unk_index - - def __getitem__(self, token): - return self.stoi.get(token, self.stoi.get(Vocab.UNK)) - - def __getstate__(self): - # avoid picking defaultdict - attrs = dict(self.__dict__) - # cast to regular dict - attrs["stoi"] = dict(self.stoi) - return attrs - - def __setstate__(self, state): - if state.get("unk_index", None) is None: - stoi = defaultdict() - else: - stoi = defaultdict(self._default_unk_index) - stoi.update(state["stoi"]) - state["stoi"] = stoi - self.__dict__.update(state) - - def __eq__(self, other): - if self.freqs != 
other.freqs: - return False - if self.stoi != other.stoi: - return False - if self.itos != other.itos: - return False - if self.vectors != other.vectors: - return False - return True - - def __len__(self): - return len(self.itos) - - def lookup_indices(self, tokens): - indices = [self.__getitem__(token) for token in tokens] - return indices - - def extend(self, v, sort=False): - words = sorted(v.itos) if sort else v.itos - for w in words: - if w not in self.stoi: - self.itos.append(w) - self.stoi[w] = len(self.itos) - 1 - - def load_vectors(self, vectors, **kwargs): - """ - Arguments: - vectors: one of or a list containing instantiations of the - GloVe, CharNGram, or Vectors classes. Alternatively, one - of or a list of available pretrained vectors: - - charngram.100d - fasttext.en.300d - fasttext.simple.300d - glove.42B.300d - glove.840B.300d - glove.twitter.27B.25d - glove.twitter.27B.50d - glove.twitter.27B.100d - glove.twitter.27B.200d - glove.6B.50d - glove.6B.100d - glove.6B.200d - glove.6B.300d - - Remaining keyword arguments: Passed to the constructor of Vectors classes. - """ - if not isinstance(vectors, list): - vectors = [vectors] - for idx, vector in enumerate(vectors): - if isinstance(vector, str): - # Convert the string pretrained vector identifier - # to a Vectors object - if vector not in pretrained_aliases: - raise ValueError( - "Got string input vector {}, but allowed pretrained " - "vectors are {}".format(vector, list(pretrained_aliases.keys())) - ) - vectors[idx] = pretrained_aliases[vector](**kwargs) - elif not isinstance(vector, Vectors): - raise ValueError( - "Got input vectors of type {}, expected str or " - "Vectors object".format(type(vector)) - ) - - tot_dim = sum(v.dim for v in vectors) - self.vectors = torch.Tensor(len(self), tot_dim) - for i, token in enumerate(self.itos): - start_dim = 0 - for v in vectors: - end_dim = start_dim + v.dim - self.vectors[i][start_dim:end_dim] = v[token.strip()] - start_dim = end_dim - assert start_dim == tot_dim - - def set_vectors(self, stoi, vectors, dim, unk_init=torch.Tensor.zero_): - """ - Set the vectors for the Vocab instance from a collection of Tensors. - - Arguments: - stoi: A dictionary of string to the index of the associated vector - in the `vectors` input argument. - vectors: An indexed iterable (or other structure supporting __getitem__) that - given an input index, returns a FloatTensor representing the vector - for the token associated with the index. For example, - vector[stoi["string"]] should return the vector for "string". - dim: The dimensionality of the vectors. - unk_init (callback): by default, initialize out-of-vocabulary word vectors - to zero vectors; can be any function that takes in a Tensor and - returns a Tensor of the same size. Default: 'torch.zeros' - """ - self.vectors = torch.Tensor(len(self), dim) - for i, token in enumerate(self.itos): - wv_index = stoi.get(token, None) - if wv_index is not None: - self.vectors[i] = vectors[wv_index] - else: - self.vectors[i] = unk_init(self.vectors[i]) - - -class SubwordVocab(Vocab): - def __init__( - self, - counter, - max_size=None, - specials=(""), - vectors=None, - unk_init=torch.Tensor.zero_, - ): - """Create a revtok subword vocabulary from a collections.Counter. - - Arguments: - counter: collections.Counter object holding the frequencies of - each word found in the data. - max_size: The maximum size of the subword vocabulary, or None for no - maximum. Default: None. 
- specials: The list of special tokens (e.g., padding or eos) that - will be prepended to the vocabulary in addition to an - token. - vectors: One of either the available pretrained vectors - or custom pretrained vectors (see Vocab.load_vectors); - or a list of aforementioned vectors - unk_init (callback): by default, initialize out-of-vocabulary word vectors - to zero vectors; can be any function that takes in a Tensor and - returns a Tensor of the same size. Default: 'torch.zeros - """ - try: - import revtok - except ImportError: - print("Please install revtok.") - raise - - # Hardcode unk_index as subword_vocab has no specials_first argument - self.unk_index = ( - specials.index(SubwordVocab.UNK) if SubwordVocab.UNK in specials else None - ) - - if self.unk_index is None: - self.stoi = defaultdict() - else: - self.stoi = defaultdict(self._default_unk_index) - - self.stoi.update({tok: i for i, tok in enumerate(specials)}) - self.itos = specials.copy() - - self.segment = revtok.SubwordSegmenter(counter, max_size) - - max_size = None if max_size is None else max_size + len(self.itos) - - # sort by frequency/entropy, then alphabetically - toks = sorted( - self.segment.vocab.items(), - key=lambda tup: (len(tup[0]) != 1, -tup[1], tup[0]), - ) - - for tok, _ in toks: - if len(self.itos) == max_size: - break - self.itos.append(tok) - self.stoi[tok] = len(self.itos) - 1 - - if vectors is not None: - self.load_vectors(vectors, unk_init=unk_init) - - -def _infer_shape(f): - num_lines, vector_dim = 0, None - for line in f: - if vector_dim is None: - row = line.rstrip().split(b" ") - vector = row[1:] - # Assuming word, [vector] format - if len(vector) > 2: - # The header present in some (w2v) formats contains two elements. - vector_dim = len(vector) - num_lines += 1 # First element read - else: - num_lines += 1 - f.seek(0) - return num_lines, vector_dim - - -class Vectors(object): - def __init__(self, name, cache=None, url=None, unk_init=None, max_vectors=None): - """ - Arguments: - - name: name of the file that contains the vectors - cache: directory for cached vectors - url: url for download if vectors not found in cache - unk_init (callback): by default, initialize out-of-vocabulary word vectors - to zero vectors; can be any function that takes in a Tensor and returns a Tensor of the same size - max_vectors (int): this can be used to limit the number of - pre-trained vectors loaded. - Most pre-trained vector sets are sorted - in the descending order of word frequency. - Thus, in situations where the entire set doesn't fit in memory, - or is not needed for another reason, passing `max_vectors` - can limit the size of the loaded set. 
- """ - - cache = ".vector_cache" if cache is None else cache - self.itos = None - self.stoi = None - self.vectors = None - self.dim = None - self.unk_init = torch.Tensor.zero_ if unk_init is None else unk_init - self.cache(name, cache, url=url, max_vectors=max_vectors) - - def __getitem__(self, token): - if token in self.stoi: - return self.vectors[self.stoi[token]] - else: - return self.unk_init(torch.Tensor(self.dim)) - - def cache(self, name, cache, url=None, max_vectors=None): - import ssl - - ssl._create_default_https_context = ssl._create_unverified_context - if os.path.isfile(name): - path = name - if max_vectors: - file_suffix = "_{}.pt".format(max_vectors) - else: - file_suffix = ".pt" - path_pt = os.path.join(cache, os.path.basename(name)) + file_suffix - else: - path = os.path.join(cache, name) - if max_vectors: - file_suffix = "_{}.pt".format(max_vectors) - else: - file_suffix = ".pt" - path_pt = path + file_suffix - - if not os.path.isfile(path_pt): - if not os.path.isfile(path) and url: - logger.info("Downloading vectors from {}".format(url)) - if not os.path.exists(cache): - os.makedirs(cache) - dest = os.path.join(cache, os.path.basename(url)) - if not os.path.isfile(dest): - with tqdm(unit="B", unit_scale=True, miniters=1, desc=dest) as t: - try: - urlretrieve(url, dest, reporthook=reporthook(t)) - except KeyboardInterrupt as e: # remove the partial zip file - os.remove(dest) - raise e - logger.info("Extracting vectors into {}".format(cache)) - ext = os.path.splitext(dest)[1][1:] - if ext == "zip": - with zipfile.ZipFile(dest, "r") as zf: - zf.extractall(cache) - elif ext == "gz": - if dest.endswith(".tar.gz"): - with tarfile.open(dest, "r:gz") as tar: - tar.extractall(path=cache) - if not os.path.isfile(path): - raise RuntimeError("no vectors found at {}".format(path)) - - logger.info("Loading vectors from {}".format(path)) - ext = os.path.splitext(path)[1][1:] - if ext == "gz": - open_file = gzip.open - else: - open_file = open - - vectors_loaded = 0 - with open_file(path, "rb") as f: - num_lines, dim = _infer_shape(f) - if not max_vectors or max_vectors > num_lines: - max_vectors = num_lines - - itos, vectors, dim = [], torch.zeros((max_vectors, dim)), None - - for line in tqdm(f, total=max_vectors): - # Explicitly splitting on " " is important, so we don't - # get rid of Unicode non-breaking spaces in the vectors. - entries = line.rstrip().split(b" ") - - word, entries = entries[0], entries[1:] - if dim is None and len(entries) > 1: - dim = len(entries) - elif len(entries) == 1: - logger.warning( - "Skipping token {} with 1-dimensional " - "vector {}; likely a header".format(word, entries) - ) - continue - elif dim != len(entries): - raise RuntimeError( - "Vector for token {} has {} dimensions, but previously " - "read vectors have {} dimensions. 
All vectors must have " - "the same number of dimensions.".format( - word, len(entries), dim - ) - ) - - try: - if isinstance(word, bytes): - word = word.decode("utf-8") - except UnicodeDecodeError: - logger.info("Skipping non-UTF8 token {}".format(repr(word))) - continue - - vectors[vectors_loaded] = torch.tensor([float(x) for x in entries]) - vectors_loaded += 1 - itos.append(word) - - if vectors_loaded == max_vectors: - break - - self.itos = itos - self.stoi = {word: i for i, word in enumerate(itos)} - self.vectors = torch.Tensor(vectors).view(-1, dim) - self.dim = dim - logger.info("Saving vectors to {}".format(path_pt)) - if not os.path.exists(cache): - os.makedirs(cache) - torch.save((self.itos, self.stoi, self.vectors, self.dim), path_pt) - else: - logger.info("Loading vectors from {}".format(path_pt)) - self.itos, self.stoi, self.vectors, self.dim = torch.load(path_pt) - - def __len__(self): - return len(self.vectors) - - def get_vecs_by_tokens(self, tokens, lower_case_backup=False): - """Look up embedding vectors of tokens. - - Arguments: - tokens: a token or a list of tokens. if `tokens` is a string, - returns a 1-D tensor of shape `self.dim`; if `tokens` is a - list of strings, returns a 2-D tensor of shape=(len(tokens), - self.dim). - lower_case_backup : Whether to look up the token in the lower case. - If False, each token in the original case will be looked up; - if True, each token in the original case will be looked up first, - if not found in the keys of the property `stoi`, the token in the - lower case will be looked up. Default: False. - - Examples: - >>> examples = ['chip', 'baby', 'Beautiful'] - >>> vec = text.vocab.GloVe(name='6B', dim=50) - >>> ret = vec.get_vecs_by_tokens(tokens, lower_case_backup=True) - """ - to_reduce = False - - if not isinstance(tokens, list): - tokens = [tokens] - to_reduce = True - - if not lower_case_backup: - indices = [self[token] for token in tokens] - else: - indices = [ - self[token] if token in self.stoi else self[token.lower()] - for token in tokens - ] - - vecs = torch.stack(indices) - return vecs[0] if to_reduce else vecs - - -class GloVe(Vectors): - url = { - "42B": "http://nlp.stanford.edu/data/glove.42B.300d.zip", - "840B": "http://nlp.stanford.edu/data/glove.840B.300d.zip", - "twitter.27B": "http://nlp.stanford.edu/data/glove.twitter.27B.zip", - "6B": "http://nlp.stanford.edu/data/glove.6B.zip", - } - - def __init__(self, name="840B", dim=300, **kwargs): - url = self.url[name] - name = "glove.{}.{}d.txt".format(name, str(dim)) - super(GloVe, self).__init__(name, url=url, **kwargs) - - -class FastText(Vectors): - url_base = "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.{}.vec" - - def __init__(self, language="en", **kwargs): - url = self.url_base.format(language) - name = os.path.basename(url) - super(FastText, self).__init__(name, url=url, **kwargs) - - -class CharNGram(Vectors): - name = "charNgram.txt" - url = ( - "http://www.logos.t.u-tokyo.ac.jp/~hassy/publications/arxiv2016jmt/" - "jmt_pre-trained_embeddings.tar.gz" - ) - - def __init__(self, **kwargs): - super(CharNGram, self).__init__(self.name, url=self.url, **kwargs) - - def __getitem__(self, token): - vector = torch.Tensor(1, self.dim).zero_() - if token == "": - return self.unk_init(vector) - chars = ["#BEGIN#"] + list(token) + ["#END#"] - num_vectors = 0 - for n in [2, 3, 4]: - end = len(chars) - n + 1 - grams = [chars[i : (i + n)] for i in range(end)] - for gram in grams: - gram_key = "{}gram-{}".format(n, "".join(gram)) - if gram_key in self.stoi: - 
vector += self.vectors[self.stoi[gram_key]] - num_vectors += 1 - if num_vectors > 0: - vector /= num_vectors - else: - vector = self.unk_init(vector) - return vector - - -pretrained_aliases = { - "charngram.100d": partial(CharNGram), - "fasttext.en.300d": partial(FastText, language="en"), - "fasttext.simple.300d": partial(FastText, language="simple"), - "glove.42B.300d": partial(GloVe, name="42B", dim="300"), - "glove.840B.300d": partial(GloVe, name="840B", dim="300"), - "glove.twitter.27B.25d": partial(GloVe, name="twitter.27B", dim="25"), - "glove.twitter.27B.50d": partial(GloVe, name="twitter.27B", dim="50"), - "glove.twitter.27B.100d": partial(GloVe, name="twitter.27B", dim="100"), - "glove.twitter.27B.200d": partial(GloVe, name="twitter.27B", dim="200"), - "glove.6B.50d": partial(GloVe, name="6B", dim="50"), - "glove.6B.100d": partial(GloVe, name="6B", dim="100"), - "glove.6B.200d": partial(GloVe, name="6B", dim="200"), - "glove.6B.300d": partial(GloVe, name="6B", dim="300"), -} -"""Mapping from string name to factory function""" - - -def build_vocab_from_iterator(iterator, num_lines=None): - """ - Build a Vocab from an iterator. - - Arguments: - iterator: Iterator used to build Vocab. Must yield list or iterator of tokens. - num_lines: The expected number of elements returned by the iterator. - (Default: None) - Optionally, if known, the expected number of elements can be passed to - this factory function for improved progress reporting. - """ - - counter = Counter() - with tqdm(unit_scale=0, unit="lines", total=num_lines) as t: - for tokens in iterator: - counter.update(tokens) - t.update(1) - word_vocab = Vocab(counter) - return word_vocab
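
A minimal usage sketch of the vendored `lighthouse.common.vocab` module introduced above: callers now import `vocab` from `lighthouse.common` instead of `torchtext`. The `6B`/300d configuration and the query string are illustrative assumptions, and the first call needs network access (or a pre-populated `.vector_cache` directory) to fetch the GloVe archive.
```
import torch
from lighthouse.common import vocab  # was: from torchtext import vocab

# Downloads and caches glove.6B.300d.txt on first use, then loads the
# saved .pt tensors on subsequent runs.
glove = vocab.GloVe(name="6B", dim=300)
print(len(glove), glove.dim)  # vocabulary size, embedding dimensionality

# Token lookup as used by the feature extractor and training datasets;
# out-of-vocabulary tokens fall back to the zero-initialized unk vector.
query = "a man plays guitar"
feats = glove.get_vecs_by_tokens(query.split(), lower_case_backup=True)
print(feats.shape)  # (number of tokens, 300)
```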
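The vendored `Vectors` loader can also be exercised offline with a hand-written vector file, which is handy for quick sanity checks without downloading GloVe. The file name, directory, and values below are made up for illustration; the first construction parses the text file, infers the dimensionality, and writes a `toy.vec.pt` tensor into the cache directory, and later constructions with the same arguments load that cached tensor directly.
```
import os
import tempfile
import torch
from lighthouse.common.vocab import Vectors

tmp = tempfile.mkdtemp()
vec_path = os.path.join(tmp, "toy.vec")
with open(vec_path, "w", encoding="utf-8") as f:
    f.write("hello 0.1 0.2 0.3\n")
    f.write("world 0.4 0.5 0.6\n")

# First call: parses toy.vec, infers dim=3, saves toy.vec.pt under `cache`.
vecs = Vectors(vec_path, cache=os.path.join(tmp, ".vector_cache"))
print(vecs.dim)               # 3
print(vecs["hello"])          # tensor([0.1000, 0.2000, 0.3000])
print(vecs["missing-token"])  # zero vector from the default unk_init
```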