diff --git a/README.md b/README.md
index 57f8038..9598c5b 100644
--- a/README.md
+++ b/README.md
@@ -90,7 +90,6 @@ Below is a simple use case of MiniCheck. MiniCheck models will be automatically
 ```python
 from minicheck.minicheck import MiniCheck
 import os
-os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 
 doc = "A group of students gather in the school library to study for their upcoming final exams."
 claim_1 = "The students are preparing for an examination."
@@ -101,7 +100,7 @@ claim_2 = "The students are on vacation."
 
 # MiniCheck-Flan-T5-Large (770M) is the best fack-checking model
 # with size < 1B and reaches GPT-4 performance.
-scorer = MiniCheck(model_name='flan-t5-large', cache_dir='./ckpts')
+scorer = MiniCheck(model_name='flan-t5-large', device='cuda', cache_dir='./ckpts')
 pred_label, raw_prob, _, _ = scorer.score(docs=[doc, doc], claims=[claim_1, claim_2])
 
 print(pred_label) # [1, 0]
diff --git a/minicheck/inference.py b/minicheck/inference.py
index 6bfea13..03f4dbd 100644
--- a/minicheck/inference.py
+++ b/minicheck/inference.py
@@ -24,7 +24,7 @@ def sent_tokenize_with_newlines(text):
 
 class Inferencer():
 
-    def __init__(self, model_name, max_model_len, batch_size, cache_dir) -> None:
+    def __init__(self, model_name, max_model_len, batch_size, cache_dir, device) -> None:
 
         self.model_name = model_name
 
@@ -32,9 +32,12 @@ def __init__(self, model_name, max_model_len, batch_size, cache_dir) -> None:
         if not os.path.exists(cache_dir):
             os.makedirs(cache_dir)
 
+        if device == 'cuda' and not torch.cuda.is_available():
+            device = 'cpu'
+
         if model_name == 'flan-t5-large':
             ckpt = 'lytang/MiniCheck-Flan-T5-Large'
-            self.model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, cache_dir=cache_dir, device_map="auto")
+            self.model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, cache_dir=cache_dir, device_map=device)
             self.tokenizer = AutoTokenizer.from_pretrained(ckpt, cache_dir=cache_dir)
             self.max_model_len=2048 if max_model_len is None else max_model_len
 
@@ -57,7 +60,7 @@ def __init__(self, model_name, max_model_len, batch_size, cache_dir) -> None:
             self.tokenizer = AutoTokenizer.from_pretrained(ckpt, use_fast=True, revision='main', token=None, cache_dir=cache_dir)
             self.model = AutoModelForSequenceClassification.from_pretrained(
-                ckpt, config=config, revision='main', token=None, ignore_mismatched_sizes=False, cache_dir=cache_dir, device_map="auto")
+                ckpt, config=config, revision='main', token=None, ignore_mismatched_sizes=False, cache_dir=cache_dir, device_map=device)
 
         self.model.eval()
         self.batch_size = batch_size
 
@@ -268,7 +271,7 @@ def fact_check(self, doc, claim):
 
 class LLMCheck:
-    def __init__(self, model_id, tensor_parallel_size=1, max_tokens=1, cache_dir=None, enable_prefix_caching=False, max_model_len=None):
+    def __init__(self, model_id, device, tensor_parallel_size=1, max_tokens=1, cache_dir=None, enable_prefix_caching=False, max_model_len=None):
 
         from vllm import LLM, SamplingParams
         import logging
 
@@ -298,7 +301,7 @@ def __init__(self, model_id, tensor_parallel_size=1, max_tokens=1, cache_dir=Non
         self.enable_prefix_caching = enable_prefix_caching
 
         # Check if CUDA is available and get compute capability
-        if torch.cuda.is_available():
+        if device == 'cuda' and torch.cuda.is_available():
             compute_capability = torch.cuda.get_device_capability()
             if compute_capability[0] >= 8:
                 self.dtype = torch.bfloat16
diff --git a/minicheck/minicheck.py b/minicheck/minicheck.py
index 4a27f5e..5525092 100644
--- a/minicheck/minicheck.py
+++ b/minicheck/minicheck.py
@@ -6,7 +6,7 @@
 
 class MiniCheck:
 
-    def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_size=16, cache_dir=None, tensor_parallel_size=1, max_tokens=1, enable_prefix_caching=False) -> None:
+    def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_size=16, device='cpu', cache_dir=None, tensor_parallel_size=1, max_tokens=1, enable_prefix_caching=False) -> None:
 
         '''
         Parameters:
@@ -53,6 +53,11 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_
             Whether to enable prefix caching for 'Bespoke-MiniCheck-7B'. This can improve performance
             when using the same document chunk to fact-check different claims.
 
+        device : str, optional (default='cpu', available options: 'cpu', 'cuda')
+            The device to use for inference. Options are:
+            - 'cpu': Use the CPU for inference.
+            - 'cuda': Use the GPU for inference. Make sure CUDA is properly installed.
+
         Note:
             (1) MiniCheck-Flan-T5-Large (770M) is the best fack-checking model with size < 1B and reaches GPT-4 performance.
             (2) Bespoke-MiniCheck-7B is the most performant fact-checking model in the MiniCheck series AND
@@ -71,13 +76,15 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_
 
         assert model_name in ['roberta-large', 'deberta-v3-large', 'flan-t5-large', 'Bespoke-MiniCheck-7B'], \
             "model_name must be one of ['roberta-large', 'deberta-v3-large', 'flan-t5-large', 'Bespoke-MiniCheck-7B']"
+        assert device in ['cpu', 'cuda'], "device must be one of ['cpu', 'cuda']"
 
         if model_name in ['roberta-large', 'deberta-v3-large', 'flan-t5-large']:
             self.model = Inferencer(
                 model_name=model_name,
                 batch_size=batch_size,
                 max_model_len=max_model_len,
-                cache_dir=cache_dir
+                cache_dir=cache_dir,
+                device=device
             )
         elif model_name == 'Bespoke-MiniCheck-7B':
             self.model = LLMCheck(
@@ -86,7 +93,8 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_
                 max_tokens=max_tokens,
                 cache_dir=cache_dir,
                 enable_prefix_caching=enable_prefix_caching,
-                max_model_len=max_model_len
+                max_model_len=max_model_len,
+                device=device
             )
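
For reference, a minimal usage sketch of the new `device` argument, mirroring the updated README example in this patch; the model name and cache path are the ones shown there, and per the `Inferencer` change above, `device='cuda'` falls back to CPU when CUDA is not available:

```python
from minicheck.minicheck import MiniCheck

doc = "A group of students gather in the school library to study for their upcoming final exams."
claim = "The students are preparing for an examination."

# device='cuda' places the model on GPU via device_map; Inferencer switches to 'cpu'
# automatically if torch.cuda.is_available() returns False.
scorer = MiniCheck(model_name='flan-t5-large', device='cuda', cache_dir='./ckpts')
pred_label, raw_prob, _, _ = scorer.score(docs=[doc], claims=[claim])
print(pred_label, raw_prob)  # predicted label(s) and raw support probability
```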