diff --git a/.github/workflows/mypy_ruff.yml b/.github/workflows/mypy_ruff.yml
index 629908b..f7e423f 100644
--- a/.github/workflows/mypy_ruff.yml
+++ b/.github/workflows/mypy_ruff.yml
@@ -22,7 +22,7 @@ jobs:
     - name: Run dependency libraries
       run: |
        pip install -e .
-       pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 torchtext==0.16.0
+       pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0
        pip install mypy ruff
     - name: Run mypy
diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index 61c107f..476084b 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -28,7 +28,7 @@ jobs:
     - name: Run dependency libraries
       run: |
        pip install -e .
-       pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 torchtext==0.16.0
+       pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0
        pip install pytest
     - name: Run pytest
       run: pytest tests/test_models.py
diff --git a/README.md b/README.md
index 949ff7a..19d568a 100755
--- a/README.md
+++ b/README.md
@@ -24,10 +24,10 @@ Install ffmpeg first. If you are an Ubuntu user, run:
 ```
 apt install ffmpeg
 ```
-Then, install pytorch, torchvision, torchaudio, and torchtext based on your GPU environments.
+Then, install pytorch, torchvision, and torchaudio based on your GPU environments.
 Note that the inference API is available for CPU environments. We tested the codes on Python 3.9 and CUDA 11.8:
 ```
-pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 torchtext==0.16.0 --index-url https://download.pytorch.org/whl/cu118
+pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118
 ```
 Finally, run to install dependency libraries:
 ```
diff --git a/lighthouse/common/vocab/__init__.py b/lighthouse/common/vocab/__init__.py
new file mode 100644
index 0000000..c20fd45
--- /dev/null
+++ b/lighthouse/common/vocab/__init__.py
@@ -0,0 +1,13 @@
+import warnings
+
+from .vectors import CharNGram, FastText, GloVe, pretrained_aliases, Vectors
+from .vocab import Vocab
+
+__all__ = [
+    "Vocab",
+    "GloVe",
+    "FastText",
+    "CharNGram",
+    "pretrained_aliases",
+    "Vectors",
+]
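
The vendored package above re-exports the same names as `torchtext.vocab` (`Vocab`, `GloVe`, `FastText`, `CharNGram`, `Vectors`, `pretrained_aliases`), so call sites only need the import swapped. A minimal smoke-test sketch, not part of this diff, assuming network access or a pre-populated `.vector_cache/` directory:

```
# Sketch: confirms the vendored package is a drop-in for `torchtext.vocab`.
# Assumes the glove.6B archive can be downloaded or is already cached.
from lighthouse.common import vocab

glove = vocab.GloVe(name="6B", dim=50)  # downloads and caches glove.6B.50d.txt
print(len(glove))                       # 400000 vectors for the 6B set
print(glove["hello"].shape)             # torch.Size([50])
```
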
diff --git a/lighthouse/common/vocab/vectors.py b/lighthouse/common/vocab/vectors.py
new file mode 100644
index 0000000..f1eacca
--- /dev/null
+++ b/lighthouse/common/vocab/vectors.py
@@ -0,0 +1,301 @@
+import gzip
+import logging
+import os
+import tarfile
+import zipfile
+from functools import partial
+from urllib.request import urlretrieve
+
+import torch
+from tqdm import tqdm
+
+logger = logging.getLogger(__name__)
+
+
+def _infer_shape(f):
+    num_lines, vector_dim = 0, None
+    for line in f:
+        if vector_dim is None:
+            row = line.rstrip().split(b" ")
+            vector = row[1:]
+            # Assuming word, [vector] format
+            if len(vector) > 2:
+                # The header present in some (w2v) formats contains two elements.
+                vector_dim = len(vector)
+                num_lines += 1  # First element read
+        else:
+            num_lines += 1
+    f.seek(0)
+    return num_lines, vector_dim
+
+
+class Vectors:
+    def __init__(self, name, cache=None, url=None, unk_init=None, max_vectors=None) -> None:
+        """
+        Args:
+
+            name: name of the file that contains the vectors
+            cache: directory for cached vectors
+            url: url for download if vectors not found in cache
+            unk_init (callback): by default, initialize out-of-vocabulary word vectors
+                to zero vectors; can be any function that takes in a Tensor and returns a Tensor of the same size
+            max_vectors (int): this can be used to limit the number of
+                pre-trained vectors loaded.
+                Most pre-trained vector sets are sorted
+                in the descending order of word frequency.
+                Thus, in situations where the entire set doesn't fit in memory,
+                or is not needed for another reason, passing `max_vectors`
+                can limit the size of the loaded set.
+        """
+
+        cache = ".vector_cache" if cache is None else cache
+        self.itos = None
+        self.stoi = None
+        self.vectors = None
+        self.dim = None
+        self.unk_init = torch.Tensor.zero_ if unk_init is None else unk_init
+        self.cache(name, cache, url=url, max_vectors=max_vectors)
+
+    def __getitem__(self, token):
+        if token in self.stoi:
+            return self.vectors[self.stoi[token]]
+        else:
+            return self.unk_init(torch.Tensor(self.dim))
+
+    def __contains__(self, token):
+        return token in self.stoi
+
+    def cache(self, name, cache, url=None, max_vectors=None):
+        import ssl
+
+        ssl._create_default_https_context = ssl._create_unverified_context
+        if os.path.isfile(name):
+            path = name
+            if max_vectors:
+                file_suffix = "_{}.pt".format(max_vectors)
+            else:
+                file_suffix = ".pt"
+            path_pt = os.path.join(cache, os.path.basename(name)) + file_suffix
+        else:
+            path = os.path.join(cache, name)
+            if max_vectors:
+                file_suffix = "_{}.pt".format(max_vectors)
+            else:
+                file_suffix = ".pt"
+            path_pt = path + file_suffix
+
+        if not os.path.isfile(path_pt):
+            if not os.path.isfile(path) and url:
+                logger.info("Downloading vectors from {}".format(url))
+                if not os.path.exists(cache):
+                    os.makedirs(cache)
+                dest = os.path.join(cache, os.path.basename(url))
+                if not os.path.isfile(dest):
+                    with tqdm(unit="B", unit_scale=True, miniters=1, desc=dest) as t:
+                        try:
+                            urlretrieve(url, dest, reporthook=reporthook(t))
+                        except KeyboardInterrupt as e:  # remove the partial zip file
+                            os.remove(dest)
+                            raise e
+                logger.info("Extracting vectors into {}".format(cache))
+                ext = os.path.splitext(dest)[1][1:]
+                if ext == "zip":
+                    with zipfile.ZipFile(dest, "r") as zf:
+                        zf.extractall(cache)
+                elif ext == "gz":
+                    if dest.endswith(".tar.gz"):
+                        with tarfile.open(dest, "r:gz") as tar:
+                            tar.extractall(path=cache)
+            if not os.path.isfile(path):
+                raise RuntimeError("no vectors found at {}".format(path))
+
+            logger.info("Loading vectors from {}".format(path))
+            ext = os.path.splitext(path)[1][1:]
+            if ext == "gz":
+                open_file = gzip.open
+            else:
+                open_file = open
+
+            vectors_loaded = 0
+            with open_file(path, "rb") as f:
+                num_lines, dim = _infer_shape(f)
+                if not max_vectors or max_vectors > num_lines:
+                    max_vectors = num_lines
+
+                itos, vectors, dim = [], torch.zeros((max_vectors, dim)), None
+
+                for line in tqdm(f, total=max_vectors):
+                    # Explicitly splitting on " " is important, so we don't
+                    # get rid of Unicode non-breaking spaces in the vectors.
+                    entries = line.rstrip().split(b" ")
+
+                    word, entries = entries[0], entries[1:]
+                    if dim is None and len(entries) > 1:
+                        dim = len(entries)
+                    elif len(entries) == 1:
+                        logger.warning(
+                            "Skipping token {} with 1-dimensional " "vector {}; likely a header".format(word, entries)
+                        )
+                        continue
+                    elif dim != len(entries):
+                        raise RuntimeError(
+                            "Vector for token {} has {} dimensions, but previously "
+                            "read vectors have {} dimensions. All vectors must have "
+                            "the same number of dimensions.".format(word, len(entries), dim)
+                        )
+
+                    try:
+                        if isinstance(word, bytes):
+                            word = word.decode("utf-8")
+                    except UnicodeDecodeError:
+                        logger.info("Skipping non-UTF8 token {}".format(repr(word)))
+                        continue
+
+                    vectors[vectors_loaded] = torch.tensor([float(x) for x in entries])
+                    vectors_loaded += 1
+                    itos.append(word)
+
+                    if vectors_loaded == max_vectors:
+                        break
+
+            self.itos = itos
+            self.stoi = {word: i for i, word in enumerate(itos)}
+            self.vectors = torch.Tensor(vectors).view(-1, dim)
+            self.dim = dim
+            logger.info("Saving vectors to {}".format(path_pt))
+            if not os.path.exists(cache):
+                os.makedirs(cache)
+            torch.save((self.itos, self.stoi, self.vectors, self.dim), path_pt)
+        else:
+            logger.info("Loading vectors from {}".format(path_pt))
+            self.itos, self.stoi, self.vectors, self.dim = torch.load(path_pt)
+
+    def __len__(self):
+        return len(self.vectors)
+
+    def get_vecs_by_tokens(self, tokens, lower_case_backup=False):
+        """Look up embedding vectors of tokens.
+
+        Args:
+            tokens: a token or a list of tokens. If `tokens` is a string,
+                returns a 1-D tensor of shape `self.dim`; if `tokens` is a
+                list of strings, returns a 2-D tensor of shape=(len(tokens),
+                self.dim).
+            lower_case_backup: Whether to look up the token in the lower case.
+                If False, each token in the original case will be looked up;
+                if True, each token in the original case will be looked up first,
+                if not found in the keys of the property `stoi`, the token in the
+                lower case will be looked up. Default: False.
+
+        Examples:
+            >>> examples = ['chip', 'baby', 'Beautiful']
+            >>> vec = GloVe(name='6B', dim=50)
+            >>> ret = vec.get_vecs_by_tokens(examples, lower_case_backup=True)
+        """
+        to_reduce = False
+
+        if not isinstance(tokens, list):
+            tokens = [tokens]
+            to_reduce = True
+
+        if not lower_case_backup:
+            indices = [self[token] for token in tokens]
+        else:
+            indices = [self[token] if token in self.stoi else self[token.lower()] for token in tokens]
+
+        vecs = torch.stack(indices)
+        return vecs[0] if to_reduce else vecs
+
+
+class GloVe(Vectors):
+    url = {
+        "42B": "http://nlp.stanford.edu/data/glove.42B.300d.zip",
+        "840B": "http://nlp.stanford.edu/data/glove.840B.300d.zip",
+        "twitter.27B": "http://nlp.stanford.edu/data/glove.twitter.27B.zip",
+        "6B": "http://nlp.stanford.edu/data/glove.6B.zip",
+    }
+
+    def __init__(self, name="840B", dim=300, **kwargs) -> None:
+        url = self.url[name]
+        name = "glove.{}.{}d.txt".format(name, str(dim))
+        super(GloVe, self).__init__(name, url=url, **kwargs)
+
+
+class FastText(Vectors):
+
+    url_base = "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.{}.vec"
+
+    def __init__(self, language="en", **kwargs) -> None:
+        url = self.url_base.format(language)
+        name = os.path.basename(url)
+        super(FastText, self).__init__(name, url=url, **kwargs)
+
+
+class CharNGram(Vectors):
+
+    name = "charNgram.txt"
+    url = "http://www.logos.t.u-tokyo.ac.jp/~hassy/publications/arxiv2016jmt/" "jmt_pre-trained_embeddings.tar.gz"
+
+    def __init__(self, **kwargs) -> None:
+        super(CharNGram, self).__init__(self.name, url=self.url, **kwargs)
+
+    def __getitem__(self, token):
+        vector = torch.Tensor(1, self.dim).zero_()
+        if token == "<unk>":
+            return self.unk_init(vector)
+        chars = ["#BEGIN#"] + list(token) + ["#END#"]
+        num_vectors = 0
+        for n in [2, 3, 4]:
+            end = len(chars) - n + 1
+            grams = [chars[i : (i + n)] for i in range(end)]
+            for gram in grams:
+                gram_key = "{}gram-{}".format(n, "".join(gram))
+                if gram_key in self.stoi:
+                    vector += self.vectors[self.stoi[gram_key]]
+                    num_vectors += 1
+        if num_vectors > 0:
+            vector /= num_vectors
+        else:
+            vector = self.unk_init(vector)
+        return vector
+
+
+def reporthook(t):
+    """
+    https://github.com/tqdm/tqdm.
+    """
+    last_b = [0]
+
+    def inner(b=1, bsize=1, tsize=None):
+        """
+        b: int, optional
+            Number of blocks just transferred [default: 1].
+        bsize: int, optional
+            Size of each block (in tqdm units) [default: 1].
+        tsize: int, optional
+            Total size (in tqdm units). If [default: None] remains unchanged.
+        """
+        if tsize is not None:
+            t.total = tsize
+        t.update((b - last_b[0]) * bsize)
+        last_b[0] = b
+
+    return inner
+
+
+pretrained_aliases = {
+    "charngram.100d": partial(CharNGram),
+    "fasttext.en.300d": partial(FastText, language="en"),
+    "fasttext.simple.300d": partial(FastText, language="simple"),
+    "glove.42B.300d": partial(GloVe, name="42B", dim="300"),
+    "glove.840B.300d": partial(GloVe, name="840B", dim="300"),
+    "glove.twitter.27B.25d": partial(GloVe, name="twitter.27B", dim="25"),
+    "glove.twitter.27B.50d": partial(GloVe, name="twitter.27B", dim="50"),
+    "glove.twitter.27B.100d": partial(GloVe, name="twitter.27B", dim="100"),
+    "glove.twitter.27B.200d": partial(GloVe, name="twitter.27B", dim="200"),
+    "glove.6B.50d": partial(GloVe, name="6B", dim="50"),
+    "glove.6B.100d": partial(GloVe, name="6B", dim="100"),
+    "glove.6B.200d": partial(GloVe, name="6B", dim="200"),
+    "glove.6B.300d": partial(GloVe, name="6B", dim="300"),
+}
+"""Mapping from string name to factory function"""
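
`Vectors.cache` above does the heavy lifting: it downloads and extracts the archive on first use, parses the text file once, and persists a `(itos, stoi, vectors, dim)` snapshot as a `.pt` file so later loads skip the parse. The two loader knobs are `max_vectors` and `unk_init`; a hedged sketch of how they combine (the token list is illustrative):

```
# Sketch only. max_vectors truncates the frequency-sorted vector file;
# unk_init controls how out-of-vocabulary lookups are initialized. Both are
# forwarded by the GloVe constructor to Vectors.__init__ above.
import torch
from lighthouse.common.vocab import GloVe

vec = GloVe(name="6B", dim=50, max_vectors=100000, unk_init=torch.Tensor.normal_)
ret = vec.get_vecs_by_tokens(["chip", "baby", "Beautiful"], lower_case_backup=True)
print(ret.shape)  # torch.Size([3, 50])
```
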
diff --git a/lighthouse/common/vocab/vocab.py b/lighthouse/common/vocab/vocab.py
new file mode 100644
index 0000000..68e1635
--- /dev/null
+++ b/lighthouse/common/vocab/vocab.py
@@ -0,0 +1,171 @@
+from typing import Dict, List, Optional
+
+import torch
+import torch.nn as nn
+
+
+def _log_class_usage(klass):
+    identifier = "torchtext"
+    if klass and hasattr(klass, "__name__"):
+        identifier += f".{klass.__name__}"
+    torch._C._log_api_usage_once(identifier)
+
+
+class Vocab(nn.Module):
+    __jit_unused_properties__ = ["is_jitable"]
+    r"""Creates a vocab object which maps tokens to indices.
+
+    Args:
+        vocab (torch.classes.torchtext.Vocab or torchtext._torchtext.Vocab): a cpp vocab object.
+    """
+
+    def __init__(self, vocab) -> None:
+        super(Vocab, self).__init__()
+        self.vocab = vocab
+        _log_class_usage(__class__)
+
+    @property
+    def is_jitable(self):
+        return isinstance(self.vocab, torch._C.ScriptObject)
+
+    @torch.jit.export
+    def forward(self, tokens: List[str]) -> List[int]:
+        r"""Calls the `lookup_indices` method
+
+        Args:
+            tokens: a list of tokens used to lookup their corresponding `indices`.
+
+        Returns:
+            The indices associated with a list of `tokens`.
+        """
+        return self.vocab.lookup_indices(tokens)
+
+    @torch.jit.export
+    def __len__(self) -> int:
+        r"""
+        Returns:
+            The length of the vocab.
+        """
+        return len(self.vocab)
+
+    @torch.jit.export
+    def __contains__(self, token: str) -> bool:
+        r"""
+        Args:
+            token: The token for which to check the membership.
+
+        Returns:
+            Whether the token is member of vocab or not.
+        """
+        return self.vocab.__contains__(token)
+
+    @torch.jit.export
+    def __getitem__(self, token: str) -> int:
+        r"""
+        Args:
+            token: The token used to lookup the corresponding index.
+
+        Returns:
+            The index corresponding to the associated token.
+        """
+        return self.vocab[token]
+
+    @torch.jit.export
+    def set_default_index(self, index: Optional[int]) -> None:
+        r"""
+        Args:
+            index: Value of default index. This index will be returned when OOV token is queried.
+        """
+        self.vocab.set_default_index(index)
+
+    @torch.jit.export
+    def get_default_index(self) -> Optional[int]:
+        r"""
+        Returns:
+            Value of default index if it is set.
+        """
+        return self.vocab.get_default_index()
+
+    @torch.jit.export
+    def insert_token(self, token: str, index: int) -> None:
+        r"""
+        Args:
+            token: The token used to lookup the corresponding index.
+            index: The index corresponding to the associated token.
+
+        Raises:
+            RuntimeError: If `index` is not in range [0, Vocab.size()] or if `token` already exists in the vocab.
+        """
+        self.vocab.insert_token(token, index)
+
+    @torch.jit.export
+    def append_token(self, token: str) -> None:
+        r"""
+        Args:
+            token: The token used to lookup the corresponding index.
+
+        Raises:
+            RuntimeError: If `token` already exists in the vocab
+        """
+        self.vocab.append_token(token)
+
+    @torch.jit.export
+    def lookup_token(self, index: int) -> str:
+        r"""
+        Args:
+            index: The index corresponding to the associated token.
+
+        Returns:
+            token: The token used to lookup the corresponding index.
+
+        Raises:
+            RuntimeError: If `index` is not in range [0, itos.size()).
+        """
+        return self.vocab.lookup_token(index)
+
+    @torch.jit.export
+    def lookup_tokens(self, indices: List[int]) -> List[str]:
+        r"""
+        Args:
+            indices: The `indices` used to lookup their corresponding `tokens`.
+
+        Returns:
+            The `tokens` associated with `indices`.
+
+        Raises:
+            RuntimeError: If an index within `indices` is not in range [0, itos.size()).
+        """
+        return self.vocab.lookup_tokens(indices)
+
+    @torch.jit.export
+    def lookup_indices(self, tokens: List[str]) -> List[int]:
+        r"""
+        Args:
+            tokens: the tokens used to lookup their corresponding `indices`.
+
+        Returns:
+            The `indices` associated with `tokens`.
+        """
+        return self.vocab.lookup_indices(tokens)
+
+    @torch.jit.export
+    def get_stoi(self) -> Dict[str, int]:
+        r"""
+        Returns:
+            Dictionary mapping tokens to indices.
+        """
+        return self.vocab.get_stoi()
+
+    @torch.jit.export
+    def get_itos(self) -> List[str]:
+        r"""
+        Returns:
+            List mapping indices to tokens.
+        """
+        return self.vocab.get_itos()
+
+    def __prepare_scriptable__(self):
+        r"""Return a JITable Vocab."""
+        if not self.is_jitable:
+            cpp_vocab = torch.classes.torchtext.Vocab(self.vocab.itos_, self.vocab.default_index_)
+            return Vocab(cpp_vocab)
+        return self
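
Outside TorchScript, `Vocab` is a thin `nn.Module` wrapper that forwards every call to the wrapped object, so it only requires that object to implement the methods actually used (`lookup_indices`, `__getitem__`, `__len__`, ...). A hypothetical pure-Python backing object makes the contract concrete; this is an illustration, not the C++ vocab the docstring refers to:

```
# Illustration only: a hypothetical in-Python backing object implementing the
# subset of the contract that Vocab.forward and __getitem__ rely on.
from typing import Dict, List

from lighthouse.common.vocab import Vocab


class _PyVocab:
    def __init__(self, tokens: List[str]) -> None:
        self._stoi: Dict[str, int] = {t: i for i, t in enumerate(tokens)}

    def lookup_indices(self, tokens: List[str]) -> List[int]:
        return [self._stoi[t] for t in tokens]

    def __getitem__(self, token: str) -> int:
        return self._stoi[token]

    def __len__(self) -> int:
        return len(self._stoi)


v = Vocab(_PyVocab(["<unk>", "hello", "world"]))
print(v(["hello", "world"]))  # [1, 2]
```
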
diff --git a/lighthouse/feature_extractor/text_encoders/glove.py b/lighthouse/feature_extractor/text_encoders/glove.py
index ef65df8..05bdf06 100644
--- a/lighthouse/feature_extractor/text_encoders/glove.py
+++ b/lighthouse/feature_extractor/text_encoders/glove.py
@@ -1,6 +1,6 @@
 import torch
 from typing import Tuple
-from torchtext import vocab
+from lighthouse.common import vocab
 
 class GloVe:
     def __init__(
@@ -35,4 +35,4 @@ def __call__(
         word_inds = torch.LongTensor(
             [self._vocab.stoi.get(w.lower(), 400000) for w in query.split()])
         mask = torch.ones((1, word_inds.shape[0])).to(self._device)
-        return self._embedding(word_inds).unsqueeze(0).to(self._device), mask
\ No newline at end of file
+        return self._embedding(word_inds).unsqueeze(0).to(self._device), mask
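
For context, the `__call__` in the second hunk maps a raw query to per-word GloVe embeddings plus an all-ones mask, falling back to index 400000 for out-of-vocabulary words. A hedged usage sketch; the constructor arguments are assumptions, since the constructor body is outside this hunk:

```
# Hypothetical usage of the GloVe text encoder; the constructor arguments are
# assumptions for illustration (its real signature is not shown in this hunk).
from lighthouse.feature_extractor.text_encoders.glove import GloVe

encoder = GloVe(device="cpu")
feats, mask = encoder("people are dancing in a park")
# feats: (1, num_words, dim) embeddings; mask: (1, num_words) of ones
```
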
+ """ + self.vocab.set_default_index(index) + + @torch.jit.export + def get_default_index(self) -> Optional[int]: + r""" + Returns: + Value of default index if it is set. + """ + return self.vocab.get_default_index() + + @torch.jit.export + def insert_token(self, token: str, index: int) -> None: + r""" + Args: + token: The token used to lookup the corresponding index. + index: The index corresponding to the associated token. + Raises: + RuntimeError: If `index` is not in range [0, Vocab.size()] or if `token` already exists in the vocab. + """ + self.vocab.insert_token(token, index) + + @torch.jit.export + def append_token(self, token: str) -> None: + r""" + Args: + token: The token used to lookup the corresponding index. + + Raises: + RuntimeError: If `token` already exists in the vocab + """ + self.vocab.append_token(token) + + @torch.jit.export + def lookup_token(self, index: int) -> str: + r""" + Args: + index: The index corresponding to the associated token. + + Returns: + token: The token used to lookup the corresponding index. + + Raises: + RuntimeError: If `index` not in range [0, itos.size()). + """ + return self.vocab.lookup_token(index) + + @torch.jit.export + def lookup_tokens(self, indices: List[int]) -> List[str]: + r""" + Args: + indices: The `indices` used to lookup their corresponding`tokens`. + + Returns: + The `tokens` associated with `indices`. + + Raises: + RuntimeError: If an index within `indices` is not int range [0, itos.size()). + """ + return self.vocab.lookup_tokens(indices) + + @torch.jit.export + def lookup_indices(self, tokens: List[str]) -> List[int]: + r""" + Args: + tokens: the tokens used to lookup their corresponding `indices`. + + Returns: + The 'indices` associated with `tokens`. + """ + return self.vocab.lookup_indices(tokens) + + @torch.jit.export + def get_stoi(self) -> Dict[str, int]: + r""" + Returns: + Dictionary mapping tokens to indices. + """ + return self.vocab.get_stoi() + + @torch.jit.export + def get_itos(self) -> List[str]: + r""" + Returns: + List mapping indices to tokens. 
+ """ + return self.vocab.get_itos() + + def __prepare_scriptable__(self): + r"""Return a JITable Vocab.""" + if not self.is_jitable: + cpp_vocab = torch.classes.torchtext.Vocab(self.vocab.itos_, self.vocab.default_index_) + return Vocab(cpp_vocab) + return self diff --git a/lighthouse/feature_extractor/text_encoders/glove.py b/lighthouse/feature_extractor/text_encoders/glove.py index ef65df8..05bdf06 100644 --- a/lighthouse/feature_extractor/text_encoders/glove.py +++ b/lighthouse/feature_extractor/text_encoders/glove.py @@ -1,6 +1,6 @@ import torch from typing import Tuple -from torchtext import vocab +from lighthouse.common import vocab class GloVe: def __init__( @@ -35,4 +35,4 @@ def __call__( word_inds = torch.LongTensor( [self._vocab.stoi.get(w.lower(), 400000) for w in query.split()]) mask = torch.ones((1, word_inds.shape[0])).to(self._device) - return self._embedding(word_inds).unsqueeze(0).to(self._device), mask \ No newline at end of file + return self._embedding(word_inds).unsqueeze(0).to(self._device), mask diff --git a/mypy.ini b/mypy.ini index 54eecc0..41e0b02 100644 --- a/mypy.ini +++ b/mypy.ini @@ -13,9 +13,6 @@ ignore_missing_imports = True [mypy-torchlibrosa.*] ignore_missing_imports = True -[mypy-torchtext.*] -ignore_missing_imports = True - [mypy-clip.*] ignore_missing_imports = True diff --git a/training/cg_detr_dataset.py b/training/cg_detr_dataset.py index 64c932e..25725c1 100644 --- a/training/cg_detr_dataset.py +++ b/training/cg_detr_dataset.py @@ -41,10 +41,10 @@ import random import logging from os.path import join, exists +from lighthouse.common import vocab from lighthouse.common.utils.basic_utils import load_jsonl, l2_normalize_np_array from lighthouse.common.utils.tensor_utils import pad_sequences_1d from lighthouse.common.utils.span_utils import span_xx_to_cxw -from torchtext import vocab import torch.nn as nn logger = logging.getLogger(__name__) diff --git a/training/dataset.py b/training/dataset.py index d8053d7..0adacdb 100755 --- a/training/dataset.py +++ b/training/dataset.py @@ -62,10 +62,10 @@ import random import logging from os.path import join, exists +from lighthouse.common import vocab from lighthouse.common.utils.basic_utils import load_jsonl, l2_normalize_np_array from lighthouse.common.utils.tensor_utils import pad_sequences_1d from lighthouse.common.utils.span_utils import span_xx_to_cxw -from torchtext import vocab import torch.nn as nn logger = logging.getLogger(__name__)