diff --git a/README.md b/README.md
index 57f8038..9598c5b 100644
--- a/README.md
+++ b/README.md
@@ -90,7 +90,6 @@ Below is a simple use case of MiniCheck. MiniCheck models will be automatically
 ```python
 from minicheck.minicheck import MiniCheck
 import os
-os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 
 doc = "A group of students gather in the school library to study for their upcoming final exams."
 claim_1 = "The students are preparing for an examination."
@@ -101,7 +100,7 @@ claim_2 = "The students are on vacation."
 
 # MiniCheck-Flan-T5-Large (770M) is the best fack-checking model
 # with size < 1B and reaches GPT-4 performance.
-scorer = MiniCheck(model_name='flan-t5-large', cache_dir='./ckpts')
+scorer = MiniCheck(model_name='flan-t5-large', device='cuda', cache_dir='./ckpts')
 pred_label, raw_prob, _, _ = scorer.score(docs=[doc, doc], claims=[claim_1, claim_2])
 
 print(pred_label) # [1, 0]
diff --git a/minicheck/inference.py b/minicheck/inference.py
index 6bfea13..03f4dbd 100644
--- a/minicheck/inference.py
+++ b/minicheck/inference.py
@@ -24,7 +24,7 @@ def sent_tokenize_with_newlines(text):
 
 class Inferencer():
 
-    def __init__(self, model_name, max_model_len, batch_size, cache_dir) -> None:
+    def __init__(self, model_name, max_model_len, batch_size, cache_dir, device) -> None:
 
         self.model_name = model_name
 
@@ -32,9 +32,12 @@ def __init__(self, model_name, max_model_len, batch_size, cache_dir) -> None:
         if not os.path.exists(cache_dir):
             os.makedirs(cache_dir)
 
+        if device == 'cuda' and not torch.cuda.is_available():
+            device = 'cpu'
+
         if model_name == 'flan-t5-large':
             ckpt = 'lytang/MiniCheck-Flan-T5-Large'
-            self.model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, cache_dir=cache_dir, device_map="auto")
+            self.model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, cache_dir=cache_dir, device_map=device)
             self.tokenizer = AutoTokenizer.from_pretrained(ckpt, cache_dir=cache_dir)
             self.max_model_len=2048 if max_model_len is None else max_model_len
 
@@ -57,7 +60,7 @@ def __init__(self, model_name, max_model_len, batch_size, cache_dir) -> None:
             self.tokenizer = AutoTokenizer.from_pretrained(ckpt, use_fast=True, revision='main', token=None, cache_dir=cache_dir)
             self.model = AutoModelForSequenceClassification.from_pretrained(
-                ckpt, config=config, revision='main', token=None, ignore_mismatched_sizes=False, cache_dir=cache_dir, device_map="auto")
+                ckpt, config=config, revision='main', token=None, ignore_mismatched_sizes=False, cache_dir=cache_dir, device_map=device)
 
         self.model.eval()
         self.batch_size = batch_size
 
@@ -268,7 +271,7 @@ def fact_check(self, doc, claim):
 
 class LLMCheck:
-    def __init__(self, model_id, tensor_parallel_size=1, max_tokens=1, cache_dir=None, enable_prefix_caching=False, max_model_len=None):
+    def __init__(self, model_id, device, tensor_parallel_size=1, max_tokens=1, cache_dir=None, enable_prefix_caching=False, max_model_len=None):
 
         from vllm import LLM, SamplingParams
         import logging
 
@@ -298,7 +301,7 @@ def __init__(self, model_id, tensor_parallel_size=1, max_tokens=1, cache_dir=Non
         self.enable_prefix_caching = enable_prefix_caching
 
         # Check if CUDA is available and get compute capability
-        if torch.cuda.is_available():
+        if device == 'cuda' and torch.cuda.is_available():
             compute_capability = torch.cuda.get_device_capability()
             if compute_capability[0] >= 8:
                 self.dtype = torch.bfloat16
diff --git a/minicheck/minicheck.py b/minicheck/minicheck.py
index 4a27f5e..5525092 100644
--- a/minicheck/minicheck.py
+++ b/minicheck/minicheck.py
@@ -6,7 +6,7 @@
 
 class MiniCheck:
 
-    def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_size=16, cache_dir=None, tensor_parallel_size=1, max_tokens=1, enable_prefix_caching=False) -> None:
+    def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_size=16, device='cpu', cache_dir=None, tensor_parallel_size=1, max_tokens=1, enable_prefix_caching=False) -> None:
 
         '''
         Parameters:
@@ -53,6 +53,11 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_
             Whether to enable prefix caching for 'Bespoke-MiniCheck-7B'. This can improve performance
             when using the same document chunk to fact-check different claims.
 
+        device : str, optional (default='cpu', available options: 'cpu', 'cuda')
+            The device to use for inference. Options are:
+            - 'cpu': Use the CPU for inference.
+            - 'cuda': Use the GPU for inference. Make sure CUDA is properly installed.
+
         Note:
             (1) MiniCheck-Flan-T5-Large (770M) is the best fack-checking model with size < 1B and reaches GPT-4 performance.
             (2) Bespoke-MiniCheck-7B is the most performant fact-checking model in the MiniCheck series AND
@@ -71,13 +76,15 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_
 
         assert model_name in ['roberta-large', 'deberta-v3-large', 'flan-t5-large', 'Bespoke-MiniCheck-7B'], \
             "model_name must be one of ['roberta-large', 'deberta-v3-large', 'flan-t5-large', 'Bespoke-MiniCheck-7B']"
+        assert device in ['cpu', 'cuda'], "device must be one of ['cpu', 'cuda']"
 
         if model_name in ['roberta-large', 'deberta-v3-large', 'flan-t5-large']:
             self.model = Inferencer(
                 model_name=model_name,
                 batch_size=batch_size,
                 max_model_len=max_model_len,
-                cache_dir=cache_dir
+                cache_dir=cache_dir,
+                device=device
             )
         elif model_name == 'Bespoke-MiniCheck-7B':
             self.model = LLMCheck(
@@ -86,7 +93,8 @@ def __init__(self, model_name='Bespoke-MiniCheck-7B', max_model_len=None, batch_
                 max_tokens=max_tokens,
                 cache_dir=cache_dir,
                 enable_prefix_caching=enable_prefix_caching,
-                max_model_len=max_model_len
+                max_model_len=max_model_len,
+                device=device
             )
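
For reference, a minimal usage sketch of the new `device` argument, mirroring the updated README example in this patch; the model name and cache path are the ones shown there, and per the `Inferencer` change above, `device='cuda'` falls back to CPU when CUDA is not available:

```python
from minicheck.minicheck import MiniCheck

doc = "A group of students gather in the school library to study for their upcoming final exams."
claim = "The students are preparing for an examination."

# device='cuda' places the model on GPU via device_map; Inferencer switches to 'cpu'
# automatically if torch.cuda.is_available() returns False.
scorer = MiniCheck(model_name='flan-t5-large', device='cuda', cache_dir='./ckpts')
pred_label, raw_prob, _, _ = scorer.score(docs=[doc], claims=[claim])
print(pred_label, raw_prob)  # predicted label(s) and raw support probability
```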