diff --git a/nemo_deploy/service/fastapi_interface_to_pytriton.py b/nemo_deploy/service/fastapi_interface_to_pytriton.py
index eeba902c5c..5881b09011 100644
--- a/nemo_deploy/service/fastapi_interface_to_pytriton.py
+++ b/nemo_deploy/service/fastapi_interface_to_pytriton.py
@@ -9,6 +9,7 @@
 # limitations under the License.
 
 import json
+import logging
 import os
 
 import numpy as np
@@ -19,12 +20,7 @@
 
 from nemo_deploy.llm import NemoQueryLLMPyTorch
 
-try:
-    from nemo.utils import logging
-except (ImportError, ModuleNotFoundError):
-    import logging
-
-    logging = logging.getLogger(__name__)
+logger = logging.getLogger(__name__)
 
 
 class TritonSettings(BaseSettings):
@@ -39,10 +35,7 @@ def __init__(self):
             self._triton_service_port = int(os.environ.get("TRITON_PORT", 8000))
             self._triton_service_ip = os.environ.get("TRITON_HTTP_ADDRESS", "0.0.0.0")
         except Exception as error:
-            logging.error(
-                "An exception occurred trying to retrieve set args in TritonSettings class. Error:",
-                error,
-            )
+            logger.error(f"An exception occurred trying to retrieve set args in TritonSettings class. Error: {error}")
             return
 
     @property
@@ -81,7 +74,7 @@ class BaseRequest(BaseModel):
     def set_greedy_params(self):
         """Validate parameters for greedy decoding."""
         if self.temperature == 0 and self.top_p == 0:
-            logging.warning("Both temperature and top_p are 0. Setting top_k to 1 to ensure greedy sampling.")
+            logger.warning("Both temperature and top_p are 0. Setting top_k to 1 to ensure greedy sampling.")
             self.top_k = 1
         return self
 
@@ -134,7 +127,7 @@ async def check_triton_health():
     triton_url = (
         f"http://{triton_settings.triton_service_ip}:{str(triton_settings.triton_service_port)}/v2/health/ready"
     )
-    logging.info(f"Attempting to connect to Triton server at: {triton_url}")
+    logger.info(f"Attempting to connect to Triton server at: {triton_url}")
     try:
         response = requests.get(triton_url, timeout=5)
         if response.status_code == 200:
@@ -233,7 +226,7 @@ async def query_llm_async(
 async def completions_v1(request: CompletionRequest):
     """Defines the completions endpoint and queries the model deployed on PyTriton server."""
     url = f"http://{triton_settings.triton_service_ip}:{triton_settings.triton_service_port}"
-    logging.info(f"Request: {request}")
+    logger.info(f"Request: {request}")
     prompts = request.prompt
     if not isinstance(request.prompt, list):
         prompts = [request.prompt]
@@ -266,7 +259,7 @@ async def completions_v1(request: CompletionRequest):
             output_serializable["choices"][0]["logprobs"]["token_logprobs"].insert(0, None)
     else:
         output_serializable["choices"][0]["logprobs"] = None
-    logging.info(f"Output: {output_serializable}")
+    logger.info(f"Output: {output_serializable}")
     return output_serializable
 
 
@@ -279,7 +272,7 @@ def dict_to_str(messages):
 async def chat_completions_v1(request: ChatCompletionRequest):
     """Defines the chat completions endpoint and queries the model deployed on PyTriton server."""
     url = f"http://{triton_settings.triton_service_ip}:{triton_settings.triton_service_port}"
-    logging.info(f"Request: {request}")
+    logger.info(f"Request: {request}")
     prompts = request.messages
     if not isinstance(request.messages, list):
         prompts = [request.messages]
@@ -315,5 +308,5 @@ async def chat_completions_v1(request: ChatCompletionRequest):
         0
     ][0]
 
-    logging.info(f"Output: {output_serializable}")
+    logger.info(f"Output: {output_serializable}")
     return output_serializable
diff --git a/nemo_export/multimodal/build.py b/nemo_export/multimodal/build.py
index ffc46b6b1b..10bed26336 100644
--- a/nemo_export/multimodal/build.py
+++ b/nemo_export/multimodal/build.py
@@ -17,7 +17,6 @@
 import shutil
 import tarfile
 import tempfile
-from pathlib import Path
 from time import time
 from types import SimpleNamespace
 from typing import List
@@ -26,11 +25,8 @@
 import yaml
 from packaging import version
 
-from nemo_export.tensorrt_llm import TensorRTLLM
-from nemo_export.trt_llm.nemo_ckpt_loader.nemo_file import load_nemo_model
 from nemo_export_deploy_common.import_utils import (
     MISSING_NEMO_MSG,
-    MISSING_TENSORRT_LLM_MSG,
     MISSING_TENSORRT_MSG,
     MISSING_TRANSFORMERS_MSG,
     UnavailableError,
@@ -108,24 +104,12 @@ def build_trtllm_engine(
     max_lora_rank: int = 64,
     lora_ckpt_list: List[str] = None,
 ):
-    """Build TRTLLM engine by nemo export."""
-    if not HAVE_TRT_LLM:
-        raise UnavailableError(MISSING_TENSORRT_LLM_MSG)
-
-    trt_llm_exporter = TensorRTLLM(model_dir=model_dir, lora_ckpt_list=lora_ckpt_list, load_model=False)
-    trt_llm_exporter.export(
-        nemo_checkpoint_path=visual_checkpoint_path if llm_checkpoint_path is None else llm_checkpoint_path,
-        model_type=llm_model_type,
-        tensor_parallelism_size=tensor_parallelism_size,
-        max_input_len=max_input_len,
-        max_output_len=max_output_len,
-        max_seq_len=max_input_len + max_output_len,
-        max_batch_size=max_batch_size,
-        dtype=dtype,
-        load_model=False,
-        use_lora_plugin=use_lora_plugin,
-        lora_target_modules=lora_target_modules,
-        max_lora_rank=max_lora_rank,
+    """Build TRTLLM engine by nemo export.
+
+    Note: TensorRT-LLM export support has been removed.
+    """
+    raise NotImplementedError(
+        "TensorRT-LLM export support has been removed from this codebase. This function is no longer available."
     )
 
 
@@ -350,9 +334,10 @@ def build_neva_engine(
             mp0_weights = torch.load(weights_path, map_location=device)
     else:
         # extract NeMo checkpoint
-        with tempfile.TemporaryDirectory() as temp:
-            temp_path = Path(temp)
-            mp0_weights, nemo_config, _ = load_nemo_model(visual_checkpoint_path, temp_path)
+        raise NotImplementedError(
+            "Loading NeMo checkpoints via trt_llm utilities has been removed. "
+            "Please extract the checkpoint manually or use an earlier version."
+        )
 
     vision_config = nemo_config["mm_cfg"]["vision_encoder"]
 
diff --git a/nemo_export/tensorrt_llm_deployable_ray.py b/nemo_export/tensorrt_llm_deployable_ray.py
index 9e361be31f..edc7f1a21d 100644
--- a/nemo_export/tensorrt_llm_deployable_ray.py
+++ b/nemo_export/tensorrt_llm_deployable_ray.py
@@ -11,53 +11,24 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import logging
-import time
-from typing import Any, Dict, List
-
-import numpy as np
-from fastapi import FastAPI, HTTPException
 
-from nemo_export_deploy_common.import_utils import MISSING_RAY_MSG, UnavailableError
+"""TensorRT-LLM Ray deployment functionality has been removed.
 
-try:
-    from ray import serve
+This module now only contains placeholder functions that raise NotImplementedError.
+TensorRT-LLM deployment support has been deprecated and removed from this codebase.
+"""
 
-    HAVE_RAY = True
-except (ImportError, ModuleNotFoundError):
-    HAVE_RAY = False
+import logging
+from typing import List
 
 LOGGER = logging.getLogger("NeMo")
 
-app = FastAPI()
-
 
-@serve.deployment(
-    num_replicas=1,  # One replica per GPU
-    ray_actor_options={
-        "num_gpus": 1,  # Each replica gets 1 GPU
-        "num_cpus": 8,
-    },
-    max_ongoing_requests=10,
-)
-@serve.ingress(app)
 class TensorRTLLMRayDeployable:
-    """A Ray Serve compatible wrapper for deploying TensorRT-LLM models.
+    """Placeholder class for TensorRT-LLM Ray deployment functionality.
 
-    This class provides a standardized interface for deploying TensorRT-LLM models
-    in Ray Serve. It supports various NLP tasks and handles model loading,
-    inference, and deployment configurations.
-
-    Args:
-        model_dir (str): Path to the TensorRT-LLM model directory.
-        model_id (str): Identifier for the model in the API responses. Defaults to "tensorrt-llm-model".
-        max_batch_size (int): Maximum number of requests to batch together. Defaults to 8.
-        batch_wait_timeout_s (float): Maximum time to wait for batching requests. Defaults to 0.3.
-        load_model (bool): Whether to load the model during initialization. Defaults to True.
-        use_python_runtime (bool): Whether to use Python runtime. Defaults to True.
-        enable_chunked_context (bool): Whether to enable chunked context. Defaults to None.
-        max_tokens_in_paged_kv_cache (int): Maximum tokens in paged KV cache. Defaults to None.
-        multi_block_mode (bool): Whether to enable multi-block mode. Defaults to False.
+    Note: TensorRT-LLM deployment support has been removed from this codebase.
+    All methods will raise NotImplementedError.
     """
 
     def __init__(
@@ -72,223 +43,43 @@ def __init__(
     ):
         """Initialize the TensorRT-LLM model deployment.
 
-        Args:
-            model_dir (str): Path to the TensorRT-LLM model directory.
-            model_id (str): Model identifier. Defaults to "tensorrt-llm-model".
-            max_batch_size (int): Maximum number of requests to batch together. Defaults to 8.
-            pipeline_parallelism_size (int): Number of pipeline parallelism. Defaults to 1.
-            tensor_parallelism_size (int): Number of tensor parallelism. Defaults to 1.
-            use_python_runtime (bool): Whether to use Python runtime. Defaults to True.
-            enable_chunked_context (bool): Whether to enable chunked context. Defaults to None.
-            max_tokens_in_paged_kv_cache (int): Maximum tokens in paged KV cache. Defaults to None.
-            multi_block_mode (bool): Whether to enable multi-block mode. Defaults to False.
-            lora_ckpt_list (List[str]): List of LoRA checkpoint paths. Defaults to None.
-
         Raises:
-            ImportError: If Ray is not installed.
-            Exception: If model initialization fails.
+            NotImplementedError: This functionality has been removed.
         """
-        if not HAVE_RAY:
-            raise UnavailableError(MISSING_RAY_MSG)
-
-        try:
-            from nemo_export.tensorrt_llm import TensorRTLLM
-
-            self.model = TensorRTLLM(
-                model_dir=trt_llm_path,
-                lora_ckpt_list=lora_ckpt_list,
-                load_model=True,
-                use_python_runtime=use_python_runtime,
-                enable_chunked_context=enable_chunked_context,
-                max_tokens_in_paged_kv_cache=max_tokens_in_paged_kv_cache,
-                multi_block_mode=multi_block_mode,
-            )
-            self.model_id = model_id
-
-        except Exception as e:
-            LOGGER.error(f"Error initializing TensorRTLLMRayDeployable replica: {str(e)}")
-            raise
-
-    @app.post("/v1/completions/")
-    async def completions(self, request: Dict[Any, Any]):
-        """Handle text completion requests."""
-        try:
-            if "prompt" in request:
-                request["prompts"] = [request["prompt"]]
-            temperature = request.get("temperature", 0.0)
-            top_p = request.get("top_p", 0.0)
-            if temperature == 0.0 and top_p == 0.0:
-                LOGGER.warning("Both temperature and top_p are 0. Setting top_k to 1 to ensure greedy sampling.")
-                request["top_k"] = 1.0
-
-            # Prepare inference inputs with proper parameter mapping
-            inference_inputs = {
-                "prompts": request.get("prompts", []),
-                "max_output_len": request.get("max_tokens", 256),
-                "temperature": request.get("temperature", 1.0),
-                "top_k": request.get("top_k", 0),
-                "top_p": request.get("top_p", 0.0),
-                "compute_logprob": True if request.get("logprobs") == 1 else False,
-                "apply_chat_template": False,
-            }
-
-            results = self.model.ray_infer_fn(inference_inputs)
-            # Extract generated texts from results
-            generated_texts_raw = results.get("sentences", [])
-
-            # Flatten the nested list structure - sentences is a list of lists
-            generated_texts = []
-            for batch in generated_texts_raw:
-                if isinstance(batch, list):
-                    generated_texts.extend(batch)
-                else:
-                    generated_texts.append(batch)
-
-            # Calculate token counts asynchronously
-            prompt_tokens = sum(len(p.split()) for p in request.get("prompts", []))
-            completion_tokens = sum(len(str(r).split()) for r in generated_texts)
-            total_tokens = prompt_tokens + completion_tokens
+        raise NotImplementedError(
+            "TensorRT-LLM Ray deployment support has been removed from this codebase. "
+            "Please use an earlier version if you need this functionality."
+        )
 
-            # Convert numpy arrays to Python lists for JSON serialization
-            log_probs_data = results.get("log_probs", None)
-            if log_probs_data is not None and isinstance(log_probs_data, np.ndarray):
-                log_probs_data = log_probs_data.tolist()
+    def generate(self, *args, **kwargs):
+        """Generate method.
 
-            output = {
-                "id": f"cmpl-{int(time.time())}",
-                "object": "text_completion",
-                "created": int(time.time()),
-                "model": self.model_id,
-                "choices": [
-                    {
-                        "text": " ".join(str(t) for t in generated_texts),
-                        "index": 0,
-                        "logprobs": (
-                            {
-                                "token_logprobs": log_probs_data,
-                                "top_logprobs": log_probs_data,
-                            }
-                            if log_probs_data is not None
-                            else None
-                        ),
-                        "finish_reason": (
-                            "length"
-                            if generated_texts and len(str(generated_texts[0])) >= request.get("max_tokens", 256)
-                            else "stop"
-                        ),
-                    }
-                ],
-                "usage": {
-                    "prompt_tokens": prompt_tokens,
-                    "completion_tokens": completion_tokens,
-                    "total_tokens": total_tokens,
-                },
-            }
-            return output
-        except Exception as e:
-            LOGGER.error(f"Error during inference: {str(e)}")
-            raise HTTPException(status_code=500, detail=f"Error during inference: {str(e)}")
-
-    @app.post("/v1/chat/completions/")
-    async def chat_completions(self, request: Dict[Any, Any]):
-        """Handle chat completion requests."""
-        try:
-            # Extract parameters from the request dictionary
-            messages = request.get("messages", [])
-
-            inference_inputs = {
-                "prompts": [messages],  # Wrap messages in a list so apply_chat_template gets the full conversation
-                "max_output_len": request.get("max_tokens", 256),
-                "temperature": request.get("temperature", 1.0),
-                "top_k": request.get("top_k", 0),
-                "top_p": request.get("top_p", 0.0),
-                "compute_logprob": True if request.get("logprobs") == 1 else False,
-                "apply_chat_template": request.get("apply_chat_template", True),
-            }
-
-            # Run model inference in the thread pool
-            results = self.model.ray_infer_fn(inference_inputs)
-
-            # Extract generated texts from results
-            generated_texts_raw = results["sentences"]
-
-            # Flatten the nested list structure - sentences is a list of lists
-            generated_texts = []
-            for batch in generated_texts_raw:
-                if isinstance(batch, list):
-                    generated_texts.extend(batch)
-                else:
-                    generated_texts.append(batch)
-
-            # Calculate token counts
-            prompt_tokens = sum(len(str(msg).split()) for msg in messages)
-            completion_tokens = sum(len(str(r).split()) for r in generated_texts)
-            total_tokens = prompt_tokens + completion_tokens
-
-            # Convert numpy arrays to Python lists for JSON serialization
-            log_probs_data = results.get("log_probs", None)
-            if log_probs_data is not None and isinstance(log_probs_data, np.ndarray):
-                log_probs_data = log_probs_data.tolist()
+        Raises:
+            NotImplementedError: This functionality has been removed.
+        """
+        raise NotImplementedError("TensorRT-LLM Ray deployment support has been removed from this codebase.")
 
-            output = {
-                "id": f"chatcmpl-{int(time.time())}",
-                "object": "chat.completion",
-                "created": int(time.time()),
-                "model": self.model_id,
-                "choices": [
-                    {
-                        "message": {"role": "assistant", "content": str(generated_texts[0]) if generated_texts else ""},
-                        "index": 0,
-                        "logprobs": (
-                            {
-                                "token_logprobs": log_probs_data,
-                                "top_logprobs": log_probs_data,
-                            }
-                            if log_probs_data is not None
-                            else None
-                        ),
-                        "finish_reason": (
-                            "length"
-                            if generated_texts and len(str(generated_texts[0])) >= inference_inputs["max_output_len"]
-                            else "stop"
-                        ),
-                    }
-                ],
-                "usage": {
-                    "prompt_tokens": prompt_tokens,
-                    "completion_tokens": completion_tokens,
-                    "total_tokens": total_tokens,
-                },
-            }
-            return output
-        except Exception as e:
-            LOGGER.error(f"Error during chat completion: {str(e)}")
-            raise HTTPException(status_code=500, detail=f"Error during chat completion: {str(e)}")
+    def chat_completions(self, *args, **kwargs):
+        """Chat completions method.
 
-    @app.get("/v1/models")
-    async def list_models(self):
-        """List available models.
+        Raises:
+            NotImplementedError: This functionality has been removed.
+        """
+        raise NotImplementedError("TensorRT-LLM Ray deployment support has been removed from this codebase.")
 
-        This endpoint returns information about the deployed model in OpenAI API format.
+    def completions(self, *args, **kwargs):
+        """Completions method.
 
-        Returns:
-            Dict containing:
-                - object: Response type ("list")
-                - data: List of model information
+        Raises:
+            NotImplementedError: This functionality has been removed.
         """
-        return {
-            "object": "list",
-            "data": [{"id": self.model_id, "object": "model", "created": int(time.time())}],
-        }
-
-    @app.get("/v1/health")
-    async def health_check(self):
-        """Check the health status of the service.
+        raise NotImplementedError("TensorRT-LLM Ray deployment support has been removed from this codebase.")
 
-        This endpoint is used to verify that the service is running and healthy.
+    @classmethod
+    def options(cls, *args, **kwargs):
+        """Options method for Ray deployment.
 
-        Returns:
-            Dict containing:
-                - status: Health status ("healthy")
+        Raises:
+            NotImplementedError: This functionality has been removed.
         """
-        return {"status": "healthy"}
+        raise NotImplementedError("TensorRT-LLM Ray deployment support has been removed from this codebase.")
diff --git a/nemo_export/tensorrt_llm_hf.py b/nemo_export/tensorrt_llm_hf.py
index ffbe2c968a..b7f771d791 100644
--- a/nemo_export/tensorrt_llm_hf.py
+++ b/nemo_export/tensorrt_llm_hf.py
@@ -12,97 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import json
+"""TensorRT-LLM HuggingFace export functionality has been removed.
+
+This module now only contains placeholder functions that raise NotImplementedError.
+TensorRT-LLM export support has been deprecated and removed from this codebase.
+"""
+
 import logging
-import os
-import shutil
-from glob import glob
-from pathlib import Path
 from typing import List, Optional
 
-from transformers import AutoConfig
-
 from nemo_export.tensorrt_llm import TensorRTLLM
-from nemo_export.utils import prepare_directory_for_export
-from nemo_export.utils.constants import TRTLLM_ENGINE_DIR
-from nemo_export_deploy_common.import_utils import (
-    MISSING_TENSORRT_LLM_MSG,
-    UnavailableError,
-)
-
-try:
-    from tensorrt_llm._common import check_max_num_tokens
-    from tensorrt_llm.builder import BuildConfig
-    from tensorrt_llm.commands.build import build as build_trtllm
-    from tensorrt_llm.mapping import Mapping
-    from tensorrt_llm.models import (
-        BaichuanForCausalLM,
-        BertForQuestionAnswering,
-        BertForSequenceClassification,
-        BertModel,
-        BloomForCausalLM,
-        ChatGLMForCausalLM,
-        CogVLMForCausalLM,
-        CohereForCausalLM,
-        DbrxForCausalLM,
-        DeciLMForCausalLM,
-        DecoderModel,
-        DeepseekForCausalLM,
-        DeepseekV2ForCausalLM,
-        DiT,
-        EagleForCausalLM,
-        EncoderModel,
-        FalconForCausalLM,
-        GemmaForCausalLM,
-        GPTForCausalLM,
-        GPTJForCausalLM,
-        GPTNeoXForCausalLM,
-        GrokForCausalLM,
-        LLaMAForCausalLM,
-        MambaForCausalLM,
-        MedusaForCausalLm,
-        MLLaMAForCausalLM,
-        MPTForCausalLM,
-        OPTForCausalLM,
-        Phi3ForCausalLM,
-        PhiForCausalLM,
-        QWenForCausalLM,
-        RecurrentGemmaForCausalLM,
-        ReDrafterForLLaMALM,
-        ReDrafterForQWenLM,
-        RobertaForQuestionAnswering,
-        RobertaForSequenceClassification,
-        RobertaModel,
-        WhisperEncoder,
-    )
-    from tensorrt_llm.plugin import PluginConfig
-
-    HAVE_TENSORRT_LLM = True
-except (ImportError, ModuleNotFoundError):
-    HAVE_TENSORRT_LLM = False
 
 LOGGER = logging.getLogger("NeMo")
 
 
 class TensorRTLLMHF(TensorRTLLM):
-    """Exports HuggingFace checkpoints to TensorRT-LLM and run fast inference.
-
-    This class provides functionality to export HuggingFace models to TensorRT-LLM
-    format and run inference using the exported models. It inherits from TensorRTLLM
-    and adds HuggingFace-specific export capabilities.
-
-    Example:
-        from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-        trt_llm_exporter = TensorRTLLMHF(model_dir="/path/for/model/files")
-        trt_llm_exporter.export_hf_model(
-            hf_model_path="/path/to/huggingface/model",
-            max_batch_size=8,
-            tensor_parallelism_size=1,
-        )
+    """Placeholder class for TensorRT-LLM HuggingFace export functionality.
 
-        output = trt_llm_exporter.forward(["Hi, how are you?", "I am good, thanks, how about you?"])
-        print("output: ", output)
+    Note: TensorRT-LLM export support has been removed from this codebase.
+    All methods will raise NotImplementedError.
     """
 
     def __init__(
@@ -117,27 +45,12 @@ def __init__(
     ):
         """Initialize TensorRTLLMHF exporter.
 
-        Args:
-            model_dir (str): Path for storing the TensorRT-LLM model files.
-            lora_ckpt_list (List[str], optional): List of LoRA checkpoint paths. Defaults to None.
-            load_model (bool, optional): Load TensorRT-LLM model if engine files exist. Defaults to True.
-            use_python_runtime (bool, optional): Whether to use python or c++ runtime. Defaults to True.
-            enable_chunked_context (bool, optional): Enable chunked context processing. Defaults to None.
-            max_tokens_in_paged_kv_cache (int, optional): Max tokens in paged KV cache. Defaults to None.
-            multi_block_mode (bool, optional): Enable faster decoding in multihead attention. Defaults to False.
+        Raises:
+            NotImplementedError: This functionality has been removed.
         """
-        if not HAVE_TENSORRT_LLM:
-            raise UnavailableError(MISSING_TENSORRT_LLM_MSG)
-
-        # Call parent class constructor
-        super().__init__(
-            model_dir=model_dir,
-            lora_ckpt_list=lora_ckpt_list,
-            load_model=load_model,
-            use_python_runtime=use_python_runtime,
-            enable_chunked_context=enable_chunked_context,
-            max_tokens_in_paged_kv_cache=max_tokens_in_paged_kv_cache,
-            multi_block_mode=multi_block_mode,
+        raise NotImplementedError(
+            "TensorRT-LLM HuggingFace export support has been removed from this codebase. "
+            "Please use an earlier version if you need this functionality."
         )
 
     def export_hf_model(
@@ -146,7 +59,7 @@ def export_hf_model(
         max_batch_size: int = 8,
         tensor_parallelism_size: int = 1,
         max_input_len: int = 256,
-        max_output_len: int = 256,
+        max_output_len: Optional[int] = None,
         max_num_tokens: Optional[int] = None,
         opt_num_tokens: Optional[int] = None,
         dtype: Optional[str] = None,
@@ -155,277 +68,39 @@ def export_hf_model(
         remove_input_padding: bool = True,
         use_paged_context_fmha: bool = True,
         paged_kv_cache: bool = True,
-        tokens_per_block: int = 128,
         multiple_profiles: bool = False,
         reduce_fusion: bool = False,
-        max_beam_width: int = 1,
-        use_refit: bool = False,
         model_type: Optional[str] = None,
         delete_existing_files: bool = True,
     ):
-        """Export a Hugging Face model to TensorRT-LLM format.
-
-        This method exports a Hugging Face model to TensorRT-LLM format with various configuration
-        options for model parallelism, quantization, and inference parameters.
-
-        Args:
-            hf_model_path (str): Path to the Hugging Face model directory.
-            max_batch_size (int, optional): Maximum batch size. Defaults to 8.
-            tensor_parallelism_size (int, optional): Size of tensor parallelism. Defaults to 1.
-            max_input_len (int, optional): Maximum input sequence length. Defaults to 256.
-            max_output_len (int, optional): Maximum output sequence length. Defaults to 256.
-            max_num_tokens (Optional[int], optional): Maximum number of tokens. Defaults to None.
-            opt_num_tokens (Optional[int], optional): Optimal number of tokens. Defaults to None.
-            dtype (Optional[str], optional): Data type for model weights. Defaults to None.
-            max_seq_len (Optional[int], optional): Maximum sequence length. Defaults to 512.
-            gemm_plugin (str, optional): GEMM plugin type. Defaults to "auto".
-            remove_input_padding (bool, optional): Remove input padding. Defaults to True.
-            use_paged_context_fmha (bool, optional): Use paged context FMHA. Defaults to True.
-            paged_kv_cache (bool, optional): Use paged KV cache. Defaults to True.
-            tokens_per_block (int, optional): Tokens per block. Defaults to 128.
-            multiple_profiles (bool, optional): Use multiple profiles. Defaults to False.
-            reduce_fusion (bool, optional): Enable reduce fusion. Defaults to False.
-            max_beam_width (int, optional): Maximum beam width. Defaults to 1.
-            use_refit (bool, optional): Use refit. Defaults to False.
-            model_type (Optional[str], optional): Type of the model. Defaults to None.
-            delete_existing_files (bool, optional): Delete existing files. Defaults to True.
+        """Export HuggingFace model to TensorRT-LLM.
 
         Raises:
-            ValueError: If model_type is not supported or dtype cannot be determined.
-            FileNotFoundError: If config file is not found.
-            RuntimeError: If there are errors reading the config file.
+            NotImplementedError: This functionality has been removed.
         """
-        LOGGER.info("Starting HF export to TRT-LLM")
-        if model_type is None:
-            model_type = self.get_hf_model_type(hf_model_path)
-
-        if model_type not in self.get_supported_hf_model_mapping:
-            raise ValueError(
-                f"Model {model_type} is not currently a supported model type. "
-                f"Supported model types are: {self.get_supported_hf_model_mapping.keys()}."
-            )
+        raise NotImplementedError("TensorRT-LLM HuggingFace export support has been removed from this codebase.")
 
-        if dtype is None:
-            dtype = self.get_hf_model_dtype(hf_model_path)
-            if dtype is None:
-                raise ValueError("No dtype found in hf model config. Please specify a dtype.")
-
-        prepare_directory_for_export(
-            self.model_dir,
-            delete_existing_files=delete_existing_files,
-            subdir=TRTLLM_ENGINE_DIR,
-        )
-
-        if max_batch_size < 4:
-            print("TensorRT-LLM may hit runtime issue with batch size is smaller than 4. Force set to 4")
-            max_batch_size = 4
-
-        plugin_config = PluginConfig()
-        plugin_config.gemm_plugin = gemm_plugin
-        if paged_kv_cache:
-            plugin_config.enable_paged_kv_cache(tokens_per_block=tokens_per_block)
-        else:
-            plugin_config.paged_kv_cache = False
-        plugin_config.remove_input_padding = remove_input_padding
-        plugin_config.use_paged_context_fmha = use_paged_context_fmha
-        plugin_config.multiple_profiles = multiple_profiles
-        plugin_config.reduce_fusion = reduce_fusion
-        max_seq_len = max_input_len + max_output_len
-        max_num_tokens, opt_num_tokens = check_max_num_tokens(
-            max_num_tokens=max_num_tokens,
-            opt_num_tokens=opt_num_tokens,
-            max_seq_len=max_seq_len,
-            max_batch_size=max_batch_size,
-            max_input_len=max_input_len,
-            max_beam_width=max_beam_width,
-            remove_input_padding=remove_input_padding,
-            enable_context_fmha=plugin_config.context_fmha,
-            tokens_per_block=tokens_per_block,
-            multiple_profiles=multiple_profiles,
-        )
-        build_dict = {
-            "max_input_len": max_input_len,
-            "max_output_len": max_output_len,
-            "max_batch_size": max_batch_size,
-            "max_beam_width": max_beam_width,
-            "max_seq_len": max_seq_len,
-            "max_num_tokens": max_num_tokens,
-            "opt_num_tokens": opt_num_tokens,
-            "strongly_typed": False,
-            "builder_opt": None,
-            "multiple_profiles": multiple_profiles,
-            "use_refit": use_refit,
-        }
-        build_config = BuildConfig.from_dict(build_dict, plugin_config=plugin_config)
-        for rank in range(tensor_parallelism_size):
-            LOGGER.info(f"Iterating over rank:{rank}")
-            mapping = Mapping(
-                world_size=tensor_parallelism_size,
-                rank=rank,
-                tp_size=tensor_parallelism_size,
-            )
-            trtllm_model_class = self.get_supported_hf_model_mapping[model_type]
-            model = trtllm_model_class.from_hugging_face(
-                hf_model_path,
-                dtype,
-                mapping=mapping,
-            )
-            engine = build_trtllm(model, build_config)
-            engine.save(self.engine_dir)
-        # Copy HF tokenizer files to root model directory
-        for path in glob(os.path.join(hf_model_path, "*.json")):
-            shutil.copy(path, self.model_dir)
-        # Copy sentencepiece model to model directory
-        for path in glob(os.path.join(hf_model_path, "*.model")):
-            shutil.copy(path, self.model_dir)
-        LOGGER.info(f"Generarated TRT-LLM checkpoint at dir:{self.model_dir}")
-        LOGGER.info(f"Loading the TRT-LLM checkpoint:{self.model_dir}")
-        self._load()
-
-    def get_hf_model_type(self, model_dir: str) -> str:
-        """Get the model type from a Hugging Face model directory.
-
-        This method infers the model type from the 'architectures' field in the model's config.json file.
-
-        Args:
-            model_dir (str): Path to the Hugging Face model directory or model ID at Hugging Face Hub.
-
-        Returns:
-            str: The inferred model type (e.g., "LlamaForCausalLM").
+    def get_hf_model_type(self, hf_model_path: str) -> str:
+        """Get HuggingFace model type.
 
         Raises:
-            ValueError: If the architecture choice is ambiguous.
+            NotImplementedError: This functionality has been removed.
         """
-        config = AutoConfig.from_pretrained(model_dir)
-
-        if len(config.architectures) != 1:
-            raise ValueError(
-                f"Ambiguous architecture choice: {config.architectures}, please specify model_type explicitly."
-            )
-
-        return config.architectures[0]
-
-    def get_hf_model_dtype(self, model_dir: str) -> Optional[str]:
-        """Get the data type from a Hugging Face model directory.
-
-        This method reads the config file from a Hugging Face model directory and identifies
-        the model's data type from various possible locations in the config.
+        raise NotImplementedError("TensorRT-LLM HuggingFace export support has been removed from this codebase.")
 
-        Args:
-            model_dir (str): Path to the Hugging Face model directory.
-
-        Returns:
-            Optional[str]: The model's data type if found in config, None otherwise.
+    def get_hf_model_dtype(self, hf_model_path: str) -> str:
+        """Get HuggingFace model dtype.
 
         Raises:
-            FileNotFoundError: If the config file is not found.
-            ValueError: If the config file contains invalid JSON.
-            RuntimeError: If there are errors reading the config file.
+            NotImplementedError: This functionality has been removed.
         """
-        config_path = Path(model_dir) / "config.json"
-
-        if not config_path.exists():
-            raise FileNotFoundError(f"Config file not found at {config_path}")
-
-        try:
-            with open(config_path, "r") as f:
-                config = json.load(f)
-                # Check for dtype in different possible locations in the config
-                if "torch_dtype" in config:
-                    return config["torch_dtype"]
-                elif "dtype" in config:
-                    return config["dtype"]
-                elif "pretrained_config" in config and "dtype" in config["pretrained_config"]:
-                    return config["pretrained_config"]["dtype"]
-
-                # If no explicit dtype found, check for other indicators
-                if "fp16" in config and config["fp16"]:
-                    return "float16"
-                elif "bf16" in config and config["bf16"]:
-                    return "bfloat16"
-
-            return None
-        except json.JSONDecodeError:
-            raise ValueError(f"Invalid JSON in config file at {config_path}")
-        except Exception as e:
-            raise RuntimeError(f"Error reading config file: {str(e)}")
+        raise NotImplementedError("TensorRT-LLM HuggingFace export support has been removed from this codebase.")
 
     @property
     def get_supported_hf_model_mapping(self):
-        """Supported HF Model Mapping."""
-        HF_MODEL_CLASS_MAP = {
-            "GPT2LMHeadModel": GPTForCausalLM,
-            "GPT2LMHeadCustomModel": GPTForCausalLM,
-            "GPTBigCodeForCausalLM": GPTForCausalLM,
-            "Starcoder2ForCausalLM": GPTForCausalLM,
-            "JAISLMHeadModel": GPTForCausalLM,
-            "GPTForCausalLM": GPTForCausalLM,
-            "NemotronForCausalLM": GPTForCausalLM,
-            "OPTForCausalLM": OPTForCausalLM,
-            "BloomForCausalLM": BloomForCausalLM,
-            "RWForCausalLM": FalconForCausalLM,
-            "FalconForCausalLM": FalconForCausalLM,
-            "PhiForCausalLM": PhiForCausalLM,
-            "Phi3ForCausalLM": Phi3ForCausalLM,
-            "Phi3VForCausalLM": Phi3ForCausalLM,
-            "Phi3SmallForCausalLM": Phi3ForCausalLM,
-            "PhiMoEForCausalLM": Phi3ForCausalLM,
-            "MambaForCausalLM": MambaForCausalLM,
-            "GPTNeoXForCausalLM": GPTNeoXForCausalLM,
-            "GPTJForCausalLM": GPTJForCausalLM,
-            "MptForCausalLM": MPTForCausalLM,
-            "MPTForCausalLM": MPTForCausalLM,
-            "GLMModel": ChatGLMForCausalLM,
-            "ChatGLMModel": ChatGLMForCausalLM,
-            "ChatGLMForCausalLM": ChatGLMForCausalLM,
-            "ChatGLMForConditionalGeneration": ChatGLMForCausalLM,
-            "LlamaForCausalLM": LLaMAForCausalLM,
-            "LlavaLlamaModel": LLaMAForCausalLM,
-            "ExaoneForCausalLM": LLaMAForCausalLM,
-            "MistralForCausalLM": LLaMAForCausalLM,
-            "MixtralForCausalLM": LLaMAForCausalLM,
-            "ArcticForCausalLM": LLaMAForCausalLM,
-            "Grok1ModelForCausalLM": GrokForCausalLM,
-            "InternLMForCausalLM": LLaMAForCausalLM,
-            "InternLM2ForCausalLM": LLaMAForCausalLM,
-            "InternLMXComposer2ForCausalLM": LLaMAForCausalLM,
-            "GraniteForCausalLM": LLaMAForCausalLM,
-            "GraniteMoeForCausalLM": LLaMAForCausalLM,
-            "MedusaForCausalLM": MedusaForCausalLm,
-            "MedusaLlamaForCausalLM": MedusaForCausalLm,
-            "ReDrafterForLLaMALM": ReDrafterForLLaMALM,
-            "ReDrafterForQWenLM": ReDrafterForQWenLM,
-            "BaichuanForCausalLM": BaichuanForCausalLM,
-            "BaiChuanForCausalLM": BaichuanForCausalLM,
-            "SkyworkForCausalLM": LLaMAForCausalLM,
-            "GEMMA": GemmaForCausalLM,
-            "GEMMA2": GemmaForCausalLM,
-            "QWenLMHeadModel": QWenForCausalLM,
-            "QWenForCausalLM": QWenForCausalLM,
-            "Qwen2ForCausalLM": QWenForCausalLM,
-            "Qwen2MoeForCausalLM": QWenForCausalLM,
-            "Qwen2ForSequenceClassification": QWenForCausalLM,
-            "Qwen2VLForConditionalGeneration": QWenForCausalLM,
-            "Qwen2VLModel": QWenForCausalLM,
-            "WhisperEncoder": WhisperEncoder,
-            "EncoderModel": EncoderModel,
-            "DecoderModel": DecoderModel,
-            "DbrxForCausalLM": DbrxForCausalLM,
-            "RecurrentGemmaForCausalLM": RecurrentGemmaForCausalLM,
-            "CogVLMForCausalLM": CogVLMForCausalLM,
-            "DiT": DiT,
-            "DeepseekForCausalLM": DeepseekForCausalLM,
-            "DeciLMForCausalLM": DeciLMForCausalLM,
-            "DeepseekV2ForCausalLM": DeepseekV2ForCausalLM,
-            "EagleForCausalLM": EagleForCausalLM,
-            "CohereForCausalLM": CohereForCausalLM,
-            "MLLaMAModel": MLLaMAForCausalLM,
-            "MllamaForConditionalGeneration": MLLaMAForCausalLM,
-            "BertForQuestionAnswering": BertForQuestionAnswering,
-            "BertForSequenceClassification": BertForSequenceClassification,
-            "BertModel": BertModel,
-            "RobertaModel": RobertaModel,
-            "RobertaForQuestionAnswering": RobertaForQuestionAnswering,
-            "RobertaForSequenceClassification": RobertaForSequenceClassification,
-        }
-        return HF_MODEL_CLASS_MAP
+        """Get supported HuggingFace model mapping.
+
+        Raises:
+            NotImplementedError: This functionality has been removed.
+        """
+        raise NotImplementedError("TensorRT-LLM HuggingFace export support has been removed from this codebase.")
diff --git a/nemo_export/tensorrt_mm_exporter.py b/nemo_export/tensorrt_mm_exporter.py
index 6365e12e9c..7cc783e79d 100644
--- a/nemo_export/tensorrt_mm_exporter.py
+++ b/nemo_export/tensorrt_mm_exporter.py
@@ -12,83 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import logging
-import os
-import shutil
-import tempfile
-from pathlib import Path
 from typing import List
 
 import numpy as np
-import wrapt
 
 from nemo_deploy import ITritonDeployable
-from nemo_export.multimodal.build import (
-    build_mllama_engine,
-    build_trtllm_engine,
-    build_visual_engine,
-    extract_lora_ckpt,
-)
-from nemo_export.multimodal.run import MultimodalModelRunner
-from nemo_export_deploy_common.import_utils import MISSING_TENSORRT_LLM_MSG, UnavailableError
 
 try:
-    from tensorrt_llm.runtime import MultimodalModelRunner as TRTLLMRunner
-
-    HAVE_TRT_LLM = True
-except (ImportError, ModuleNotFoundError):
-    HAVE_TRT_LLM = False
-
-use_deploy = True
-try:
-    from nemo_deploy.utils import cast_output, ndarray2img, str_ndarray2list
-except Exception:
-    use_deploy = False
-
-
-@wrapt.decorator
-def noop_decorator(func):
-    """No op decorator."""
-
-    def wrapper(*args, **kwargs):
-        return func(*args, **kwargs)
-
-    return wrapper
-
-
-use_pytriton = True
-batch = noop_decorator
-try:
-    from pytriton.decorators import batch, first_value
     from pytriton.model_config import Tensor
 except Exception:
     from unittest.mock import MagicMock
 
-    batch = MagicMock()
-    first_value = MagicMock()
     Tensor = MagicMock()
-    use_pytriton = False
-
-
-LOGGER = logging.getLogger("NeMo")
 
 
 class TensorRTMMExporter(ITritonDeployable):
-    """Exports nemo checkpoints to TensorRT and run fast inference.
-
-    Example:
-        from nemo_export import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir="/path/for/model/files")
-        exporter.export(
-            visual_checkpoint_path="/path/for/nemo/checkpoint",
-            model_type="neva",
-            tensor_parallel_size=1,
-        )
-
-        output = exporter.forward("Hi! What is in this image?", "/path/for/input_media")
-        print("output: ", output)
+    """TensorRT multimodal exporter functionality has been removed.
 
+    This class is kept for backward compatibility but all methods will raise NotImplementedError.
     """
 
     def __init__(
@@ -97,14 +38,7 @@ def __init__(
         load_model: bool = True,
         modality: str = "vision",
     ):
-        self.model_dir = model_dir
-        self.runner = None
-        # vision modality is for image and video
-        assert modality in ["vision", "audio"]
-        self.modality = modality
-
-        if load_model:
-            self._load()
+        raise NotImplementedError("TensorRTMMExporter has been removed. This functionality is no longer supported.")
 
     def export(
         self,
@@ -128,81 +62,9 @@ def export(
         max_lora_rank: int = 64,
     ):
         """Export multimodal models to TRTLLM."""
-        if Path(self.model_dir).exists():
-            if delete_existing_files and len(os.listdir(self.model_dir)) > 0:
-                for files in os.listdir(self.model_dir):
-                    path = os.path.join(self.model_dir, files)
-                    try:
-                        shutil.rmtree(path)
-                    except OSError:
-                        os.remove(path)
-
-                if len(os.listdir(self.model_dir)) > 0:
-                    raise Exception("Couldn't delete all files.")
-            elif len(os.listdir(self.model_dir)) > 0:
-                raise Exception("There are files in this folder. Try setting delete_existing_files=True.")
-        else:
-            Path(self.model_dir).mkdir(parents=True, exist_ok=True)
-
-        if model_type == "mllama":
-            build_mllama_engine(
-                model_dir=self.model_dir,
-                checkpoint_path=visual_checkpoint_path,
-                processor_name=processor_name or "meta-llama/Llama-3.2-11B-Vision-Instruct",
-                tensor_parallelism_size=tensor_parallel_size,
-                max_input_len=max_input_len,
-                max_output_len=max_output_len,
-                max_batch_size=max_batch_size,
-                vision_max_batch_size=vision_max_batch_size,
-                max_multimodal_len=max_multimodal_len,
-                dtype=dtype,
-            )
-        else:
-            if lora_checkpoint_path is not None:
-                tmp_dir = tempfile.TemporaryDirectory()
-                if os.path.isdir(lora_checkpoint_path):
-                    lora_dir = lora_checkpoint_path
-                else:
-                    raise ValueError("lora_checkpoint_path in nemo1 is not supported. It must be a directory")
-
-                llm_lora_path = [extract_lora_ckpt(lora_dir, tmp_dir.name)]
-            else:
-                tmp_dir = None
-                llm_lora_path = None
-                lora_dir = None
-
-            llm_dir = os.path.join(self.model_dir, "llm_engine")
-            build_trtllm_engine(
-                model_dir=llm_dir,
-                visual_checkpoint_path=visual_checkpoint_path,
-                llm_checkpoint_path=llm_checkpoint_path,
-                model_type=model_type,
-                llm_model_type=llm_model_type,
-                tensor_parallelism_size=tensor_parallel_size,
-                max_input_len=max_input_len,
-                max_output_len=max_output_len,
-                max_batch_size=max_batch_size,
-                max_multimodal_len=max_multimodal_len,
-                dtype=dtype,
-                use_lora_plugin=use_lora_plugin,
-                lora_target_modules=lora_target_modules,
-                max_lora_rank=max_lora_rank,
-                lora_ckpt_list=llm_lora_path,
-            )
-
-            visual_dir = os.path.join(self.model_dir, "visual_engine")
-            build_visual_engine(
-                visual_dir,
-                visual_checkpoint_path if lora_dir is None else lora_dir,
-                model_type,
-                vision_max_batch_size,
-            )
-
-            if tmp_dir is not None:
-                tmp_dir.cleanup()
-
-        if load_model:
-            self._load()
+        raise NotImplementedError(
+            "TensorRTMMExporter.export has been removed. This functionality is no longer supported."
+        )
 
     def forward(
         self,
@@ -218,160 +80,35 @@ def forward(
         lora_uids: List[str] = None,
     ):
         """Run forward with loaded TRTLLM engine."""
-        if self.runner is None:
-            raise Exception("A nemo checkpoint should be exported and then it should be loaded first to run inference.")
-
-        if isinstance(self.runner, TRTLLMRunner):
-            self.runner.args.image_path = input_media
-            self.runner.args.batch_size = batch_size
-            self.runner.args.top_k = top_k
-            self.runner.args.top_p = top_p
-            self.runner.args.temperature = temperature
-            self.runner.args.repetition_penalty = repetition_penalty
-            self.runner.args.num_beams = num_beams
-            raw_image = self.runner.load_test_data(input_media)
-            return self.runner.run(
-                input_text,
-                raw_image,
-                max_output_len,
-            )[1]
-        else:
-            input_media = self.runner.load_test_media(input_media)
-            return self.runner.run(
-                input_text,
-                input_media,
-                max_output_len,
-                batch_size,
-                top_k,
-                top_p,
-                temperature,
-                repetition_penalty,
-                num_beams,
-                lora_uids,
-            )
+        raise NotImplementedError(
+            "TensorRTMMExporter.forward has been removed. This functionality is no longer supported."
+        )
 
     def get_input_media_tensors(self):
         """Get input media tensors."""
-        if self.modality == "vision":
-            return [Tensor(name="input_media", shape=(-1, -1, -1, 3), dtype=np.uint8)]
-        return []
+        raise NotImplementedError(
+            "TensorRTMMExporter.get_input_media_tensors has been removed. This functionality is no longer supported."
+        )
 
     @property
     def get_triton_input(self):
-        inputs = (
-            [Tensor(name="input_text", shape=(-1,), dtype=bytes)]
-            + self.get_input_media_tensors()
-            + [
-                Tensor(name="batch_size", shape=(-1,), dtype=np.int_, optional=True),
-                Tensor(name="max_output_len", shape=(-1,), dtype=np.int_, optional=True),
-                Tensor(name="top_k", shape=(-1,), dtype=np.int_, optional=True),
-                Tensor(name="top_p", shape=(-1,), dtype=np.single, optional=True),
-                Tensor(name="temperature", shape=(-1,), dtype=np.single, optional=True),
-                Tensor(
-                    name="repetition_penalty",
-                    shape=(-1,),
-                    dtype=np.single,
-                    optional=True,
-                ),
-                Tensor(name="num_beams", shape=(-1,), dtype=np.int_, optional=True),
-                Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True),
-            ]
+        raise NotImplementedError(
+            "TensorRTMMExporter.get_triton_input has been removed. This functionality is no longer supported."
         )
-        inputs = tuple(inputs)
-        return inputs
 
     @property
     def get_triton_output(self):
-        outputs = (Tensor(name="outputs", shape=(-1,), dtype=bytes),)
-        return outputs
-
-    @batch
-    @first_value(
-        "batch_size",
-        "max_output_len",
-        "top_k",
-        "top_p",
-        "temperature",
-        "repetition_penalty",
-        "num_beams",
-    )
-    def triton_infer_fn(self, **inputs: np.ndarray):  # pragma: no cover
-        if not HAVE_TRT_LLM:
-            raise UnavailableError(MISSING_TENSORRT_LLM_MSG)
-        try:
-            if self.runner is None:
-                raise Exception(
-                    "A nemo checkpoint should be exported and then it should be loaded first to run inference."
-                )
-
-            infer_input = {"input_text": str_ndarray2list(inputs.pop("input_text")[0])}
-            video_model_list = ["video-neva", "lita", "vita"]
-            if self.runner.model_type in ["neva", "vila", "mllama"]:
-                infer_input["input_image"] = ndarray2img(inputs.pop("input_media")[0])[0]
-            elif self.runner.model_type in video_model_list:
-                infer_input["input_image"] = inputs.pop("input_media")[0]
-            elif self.runner.model_type == "salm":
-                infer_input["input_signal"] = inputs.pop("input_signal")
-                infer_input["input_signal_length"] = inputs.pop("input_signal_length")[:, 0]
-            if "batch_size" in inputs:
-                infer_input["batch_size"] = inputs.pop("batch_size")
-            if "max_output_len" in inputs:
-                infer_input["max_new_tokens"] = inputs.pop("max_output_len")
-            if "top_k" in inputs:
-                infer_input["top_k"] = inputs.pop("top_k")
-            if "top_p" in inputs:
-                infer_input["top_p"] = inputs.pop("top_p")
-            if "temperature" in inputs:
-                infer_input["temperature"] = inputs.pop("temperature")
-            if "repetition_penalty" in inputs:
-                infer_input["repetition_penalty"] = inputs.pop("repetition_penalty")
-            if "num_beams" in inputs:
-                infer_input["num_beams"] = inputs.pop("num_beams")
-            if "lora_uids" in inputs:
-                lora_uids = np.char.decode(inputs.pop("lora_uids").astype("bytes"), encoding="utf-8")
-                infer_input["lora_uids"] = lora_uids[0].tolist()
-
-            if isinstance(self.runner, TRTLLMRunner):
-                self.runner.args.batch_size = infer_input.pop("batch_size")
-                self.runner.args.top_k = infer_input.pop("top_k")
-                self.runner.args.top_p = infer_input.pop("top_p")
-                self.runner.args.temperature = infer_input.pop("temperature")
-                self.runner.args.repetition_penalty = infer_input.pop("repetition_penalty")
-                self.runner.args.num_beams = infer_input.pop("num_beams")
-                output_texts = self.runner.run(**infer_input)[1]
-            else:
-                output_texts = self.runner.run(**infer_input)
-            output = cast_output(output_texts, np.bytes_)
-        except Exception as error:
-            err_msg = "An error occurred: {0}".format(str(error))
-            output = cast_output([err_msg], np.bytes_)
+        raise NotImplementedError(
+            "TensorRTMMExporter.get_triton_output has been removed. This functionality is no longer supported."
+        )
 
-        return {"outputs": output}
+    def triton_infer_fn(self, **inputs: np.ndarray):
+        """Triton inference function."""
+        raise NotImplementedError(
+            "TensorRTMMExporter.triton_infer_fn has been removed. This functionality is no longer supported."
+        )
 
     def _load(self):
-        llm_dir = os.path.join(self.model_dir, "llm_engine")
-        if not os.path.exists(llm_dir):
-            return
-        if self.modality == "vision":
-            import json
-
-            visual_dir = os.path.join(self.model_dir, "visual_engine")
-            with open(os.path.join(visual_dir, "config.json"), "r") as f:
-                config = json.load(f)
-            if config["builder_config"]["model_type"] == "mllama":
-                from types import SimpleNamespace
-
-                args = SimpleNamespace(
-                    engine_dir=self.model_dir,
-                    hf_model_dir="meta-llama/Llama-3.2-11B-Vision-Instruct",
-                    use_py_session=True,
-                    cross_kv_cache_fraction=0.5,
-                    enable_context_fmha_fp32_acc=None,
-                    enable_chunked_context=False,
-                    kv_cache_free_gpu_memory_fraction=0.9,
-                    multi_block_mode=True,
-                    mm_embedding_offloading=None,
-                )
-                self.runner = TRTLLMRunner(args)
-            else:
-                self.runner = MultimodalModelRunner(visual_dir, llm_dir, self.modality)
+        raise NotImplementedError(
+            "TensorRTMMExporter._load has been removed. This functionality is no longer supported."
+        )
diff --git a/nemo_export/trt_llm/nemo_ckpt_loader/__init__.py b/nemo_export/trt_llm/nemo_ckpt_loader/__init__.py
deleted file mode 100644
index d9155f923f..0000000000
--- a/nemo_export/trt_llm/nemo_ckpt_loader/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/nemo_export/trt_llm/nemo_ckpt_loader/nemo_file.py b/nemo_export/trt_llm/nemo_ckpt_loader/nemo_file.py
deleted file mode 100644
index b3c27407da..0000000000
--- a/nemo_export/trt_llm/nemo_ckpt_loader/nemo_file.py
+++ /dev/null
@@ -1,433 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import json
-import logging
-import os
-import pickle
-import shutil
-from io import BytesIO
-from pathlib import Path
-from typing import Any, Dict, Optional, Union
-
-import numpy as np
-import torch
-import yaml
-from transformers import AutoTokenizer, GPT2Tokenizer, PreTrainedTokenizer
-
-from nemo_export.sentencepiece_tokenizer import SentencePieceTokenizer
-from nemo_export.tarutils import TarPath
-from nemo_export.tiktoken_tokenizer import TiktokenTokenizer
-from nemo_export.utils import (
-    load_model_weights,
-    nemo_to_path,
-    torch_dtype_from_precision,
-)
-
-try:
-    from nemo.lightning import io
-
-    HAVE_NEMO2 = True
-except (ImportError, ModuleNotFoundError):
-    HAVE_NEMO2 = False
-
-LOGGER = logging.getLogger("NeMo")
-EXTRA_STATE = "extra_state"
-
-
-def load_extra_state_from_bytes(
-    val: Optional[Union[torch.Tensor, BytesIO]],
-) -> Optional[dict]:
-    """Loads single extra_state from bytes storage.
-
-    Args:
-        val (torch.Tensor | BytesIO): Bytes storage of extra_state
-    Returns:
-        Optional[dict]: Deserialized extra_state, or None if the bytes storage is empty.
-    """
-    if val is None:
-        return None
-
-    # TransformerEngine shifted from storing extra_states bytes storage from _io.BytesIO to torch.Tensor
-    if isinstance(val, torch.Tensor):
-        if val.numel() == 0:
-            return None
-
-        val = val.detach().numpy(force=True).tobytes()
-        return pickle.loads(val)
-
-    val.seek(0)
-    return torch.load(val, weights_only=True)
-
-
-def rename_extra_states(state_dict: Dict[str, Any]) -> Dict[str, Any]:
-    """This function preprocesses extra states for Megatron export.
-
-    Args:
-        state_dict (dict): Model state dictionary
-    Returns:
-        dict: Model state dictionary, with extra states consumable by mcore export
-    """
-    mcore_extra_states = {}
-
-    for key, value in state_dict.items():
-        if EXTRA_STATE not in key:
-            continue
-
-        # Keys with the extra states have the following format:
-        # <prefix>.layers.<layer>._extra_state/shard_<layer_number>_<number_of_layers>
-        key_base, shard_key = key.split("/")
-        if "_" not in shard_key:
-            continue
-
-        shard_layer = shard_key.split("_")[1]
-        if not shard_layer.isnumeric():
-            continue
-
-        # Renames keys to:
-        # <prefix>.layers.<layer_number>.<layer>._extra_state
-        mcore_key = key_base.replace("layers", f"layers.{shard_layer}")
-        if isinstance(value, list):
-            value = value[0]
-        mcore_extra_states[mcore_key] = value
-
-    state_dict = {k: v for k, v in state_dict.items() if EXTRA_STATE not in k}
-    return state_dict | mcore_extra_states
-
-
-def update_tokenizer_paths(tokenizer_config: Dict, unpacked_checkpoints_dir):
-    """Updates tokenizer paths in the tokenizer config."""
-
-    def _update_config_entry(key, file_pattern):
-        old_path = tokenizer_config.get(key, None)
-        if old_path is None:
-            return
-        old_path = Path(old_path)
-        new_path = unpacked_checkpoints_dir.get_tokenizer_file_path("tokenizer", key, file_pattern)
-        if new_path:
-            LOGGER.debug(f"Update tokenizer {key} {old_path} -> {new_path}")
-            tokenizer_config[key] = new_path
-        elif not old_path.exists():
-            LOGGER.warning(f"Tokenizer {key}'s path {old_path} does not exists: set it to None")
-            tokenizer_config[key] = None
-
-    _update_config_entry("model", "*.model")
-    _update_config_entry("vocab_file", "*vocab*")
-    _update_config_entry("merge_file", "*merge*.txt")
-
-    return tokenizer_config
-
-
-def get_tokenizer_from_nemo2_context(model_context_dir: Path):
-    """Retrieve tokenizer configuration from NeMo 2.0 context and instantiate the tokenizer.
-
-    Args:
-        model_context_dir (Path): Path to the model context directory.
-
-    Returns:
-        The instantiated tokenizer (various classes possible).
-    """
-    if HAVE_NEMO2:
-        # Use NeMo tokenizer loaded from the NeMo 2.0 model context
-        tokenizer_spec = io.load_context(model_context_dir, subpath="model.tokenizer")
-        return build_tokenizer(tokenizer_spec)
-    else:
-        # Use local nemo_export SentencePieceTokenizer implementation
-        # or directly a HuggingFace tokenizer based on the model config
-        with (model_context_dir / "model.yaml").open("r") as stream:
-            model_config = yaml.safe_load(stream)
-
-        tokenizer_config = model_config["tokenizer"]
-        target_class = tokenizer_config["_target_"]
-        tokenizer_module = "nemo.collections.common.tokenizers."
-        assert target_class.startswith(tokenizer_module)
-        target_class = target_class.removeprefix(tokenizer_module)
-
-        if target_class == "sentencepiece_tokenizer.SentencePieceTokenizer":
-            tokenizer = SentencePieceTokenizer(
-                model_path=str(model_context_dir / tokenizer_config["model_path"]),
-                special_tokens=tokenizer_config.get("special_tokens", None),
-                legacy=tokenizer_config.get("legacy", False),
-            )
-        elif target_class == "huggingface.auto_tokenizer.AutoTokenizer":
-            tokenizer = AutoTokenizer.from_pretrained(
-                str(model_context_dir / tokenizer_config["pretrained_model_name"])
-            )
-        else:
-            raise ValueError(f"Unsupported tokenizer type: {tokenizer_module}{target_class}.")
-
-    return tokenizer
-
-
-def get_tokenizer(tokenizer_dir_or_path: Union[str, Path]) -> PreTrainedTokenizer:
-    """Loads the tokenizer from the decoded NeMo weights dir."""
-    tokenizer_dir_or_path = Path(tokenizer_dir_or_path)
-    if (tokenizer_dir_or_path / "nemo_context").exists():
-        return get_tokenizer_from_nemo2_context(tokenizer_dir_or_path / "nemo_context")
-    elif (tokenizer_dir_or_path / "tokenizer_config.json").exists():
-        return AutoTokenizer.from_pretrained(tokenizer_dir_or_path)
-    elif os.path.exists(os.path.join(tokenizer_dir_or_path, "vocab.json")):
-        vocab_path = tokenizer_dir_or_path / "vocab.json" if tokenizer_dir_or_path.is_dir() else tokenizer_dir_or_path
-        tokenizer_config = {"library": "tiktoken", "vocab_file": str(vocab_path)}
-        return build_tokenizer(tokenizer_config)
-    else:
-        model_path = (
-            tokenizer_dir_or_path / "tokenizer.model" if tokenizer_dir_or_path.is_dir() else tokenizer_dir_or_path
-        )
-        tokenizer_config = {"library": "sentencepiece", "model": str(model_path)}
-        return build_tokenizer(tokenizer_config)
-
-
-def build_tokenizer(tokenizer):
-    """Builds tokenizer for trt-llm export."""
-    if isinstance(tokenizer, dict):
-        tokenizer_config = tokenizer
-        if tokenizer_config["library"] == "sentencepiece":
-            return SentencePieceTokenizer(model_path=tokenizer_config["model"])
-        elif tokenizer_config["library"] == "tiktoken":
-            return TiktokenTokenizer(vocab_file=tokenizer_config["vocab_file"])
-        elif "GPT2" in tokenizer_config["type"]:
-            tokenizer = GPT2Tokenizer(tokenizer_config["vocab_file"], tokenizer_config["merge_file"])
-        else:
-            raise ValueError(f"Tokenizer type {tokenizer_config['library']} not handled")
-
-        if tokenizer.bos_token_id is None:
-            tokenizer.add_special_tokens({"bos_token": "<s>"})
-        if tokenizer.eos_token_id is None:
-            tokenizer.add_special_tokens({"eos_token": "</s>"})
-    else:
-        # For NeMo tokenizers, monkey patch encode & batch_decode methods for unified interface
-        import nemo.collections.common.tokenizers as nemo_tokenizers
-
-        if isinstance(tokenizer, nemo_tokenizers.TokenizerSpec):
-            if isinstance(tokenizer, nemo_tokenizers.AutoTokenizer):
-                # Unwrap the original methods of HF tokenizer
-                batch_decode = tokenizer.tokenizer.batch_decode
-                encode = tokenizer.tokenizer.encode
-            elif isinstance(tokenizer, nemo_tokenizers.SentencePieceTokenizer):
-                # Define HF equivalents based on available SP methods
-                def batch_decode(self, ids):
-                    if torch.is_tensor(ids):
-                        ids = ids.cpu().numpy()
-                    if isinstance(ids, np.ndarray):
-                        ids = ids.tolist()
-                    return self.tokenizer.decode(ids)
-
-                encode = tokenizer.tokenizer.encode_as_ids
-            else:
-                raise NotImplementedError(f"Patching tokenizer methods for {type(tokenizer)} is not available")
-
-            tokenizer.bos_token_id = tokenizer.bos_id
-            tokenizer.eos_token_id = tokenizer.eos_id
-            nemo_tokenizers.TokenizerSpec.encode = encode
-            nemo_tokenizers.TokenizerSpec.batch_decode = batch_decode
-
-    return tokenizer
-
-
-def load_nemo_config(nemo_ckpt: Union[str, Path]) -> Dict[Any, Any]:
-    """Load the model configuration from a NeMo checkpoint.
-
-    This function handles both NeMo 1.0 and NeMo 2.0 checkpoint structures.
-    For NeMo 2.0, it reads the configuration from the 'context/model.yaml' file.
-
-    Args:
-        nemo_ckpt (Union[str, Path]): Path to the NeMo checkpoint file or directory.
-
-    Returns:
-        Dict[Any, Any]: The configuration dictionary.
-    """
-    if Path(nemo_ckpt).is_dir():
-        nemo_ckpt = Path(nemo_ckpt)
-    else:
-        nemo_ckpt = TarPath(nemo_ckpt)
-
-    if (nemo_ckpt / "weights").exists() and (nemo_ckpt / "context").exists():  # Stucture of NeMo 2.0 checkpoints
-        with (nemo_ckpt / "context" / "model.yaml").open("r") as stream:
-            config = yaml.safe_load(stream)
-    else:  # pragma: no cover
-        raise Exception("Not supported NeMo checkpoint format.")
-
-    return config
-
-
-def get_model_type(nemo_ckpt: Union[str, Path], use_vllm_type: bool = False) -> Optional[str]:
-    """Determine the model type from a NeMo checkpoint for TensorRT-LLM engine build or vLLM model converters.
-
-    Args:
-        nemo_ckpt (Union[str, Path]): Path to the NeMo checkpoint file.
-        use_vllm_type (bool): If True, uses vLLM model type names for known model converters.
-
-    Returns:
-        Optional[str]: The model type if it can be determined, otherwise None.
-    """
-    model_config = load_nemo_config(nemo_ckpt)
-    model_type = None
-
-    if model_class := model_config.get("_target_"):
-        # NeMo 2.0 case
-        NEMO2_TO_MODEL_TYPE = {
-            "nemo.collections.llm.gpt.model.base.GPTModel": "gpt",
-            "nemo.collections.llm.gpt.model.llama.LlamaModel": "llama",
-            "nemo.collections.llm.gpt.model.mistral.MistralModel": "llama",
-            "nemo.collections.llm.gpt.model.mixtral.MixtralModel": "mixtral" if use_vllm_type else "llama",
-            "nemo.collections.llm.gpt.model.starcoder.StarcoderModel": "gpt",
-            "nemo.collections.llm.gpt.model.starcoder2.Starcoder2Model": "starcoder2" if use_vllm_type else "gpt",
-            "nemo.collections.llm.gpt.model.nemotron.NemotronModel": "gpt",
-            "nemo.collections.llm.gpt.model.gemma.GemmaModel": "gemma",
-            "nemo.collections.llm.gpt.model.phi3mini.Phi3Model": "phi3",
-            "nemo.collections.llm.gpt.model.baichuan.Baichuan2Model": "baichuan",
-            "nemo.collections.llm.gpt.model.chatglm.ChatGLMModel": "chatglm",
-            "nemo.collections.llm.gpt.model.qwen2.Qwen2Model": "qwen",
-        }
-        try:
-            model_type = NEMO2_TO_MODEL_TYPE[model_class]
-            LOGGER.info(f"Determined model_type='{model_type}' for {nemo_ckpt} checkpoint.")
-
-        except KeyError:
-            LOGGER.error(
-                f"Model {model_class} not found in the NEMO2_TO_MODEL_TYPE mapping, "
-                "try providing the model_type explicitely for exporting:\n"
-                f"{json.dumps(NEMO2_TO_MODEL_TYPE, indent=2)}"
-            )
-            raise
-    else:
-        LOGGER.warning(f"Parameter model_type cannot be determined for {nemo_ckpt} checkpoint.")
-    return model_type
-
-
-def get_weights_dtype(nemo_ckpt: Union[str, Path]) -> Optional[str]:
-    """Determine the weights data type from a NeMo checkpoint for TensorRT-LLM engine build.
-
-    Args:
-        nemo_ckpt (Union[str, Path]): Path to the NeMo checkpoint file.
-
-    Returns:
-        Optional[str]: The dtype if it can be determined, otherwise None.
-    """
-    model_config = load_nemo_config(nemo_ckpt)
-    torch_dtype = None
-    dtype = None
-
-    is_nemo2 = "_target_" in model_config
-    if is_nemo2:
-        torch_dtype = model_config["config"]["params_dtype"]["_target_"]
-    elif precision := model_config.get("precision", None):
-        torch_dtype = str(torch_dtype_from_precision(precision))
-
-    if torch_dtype is not None:
-        dtype = torch_dtype.removeprefix("torch.")
-        LOGGER.info(f"Determined weights dtype='{dtype}' for {nemo_ckpt} checkpoint.")
-    else:
-        LOGGER.warning(
-            f"Parameter dtype for model weights cannot be determined for {nemo_ckpt} checkpoint. "
-            "There is no 'precision' field specified in the model_config.yaml file."
-        )
-
-    return dtype
-
-
-def load_distributed_model_weights(
-    nemo_checkpoint: Union[str, Path],
-    mcore_scales_format: Optional[bool] = None,
-) -> Dict[str, Any]:
-    """Loads model weights in `torch_dist` format from the model path.
-
-    Args:
-        nemo_checkpoint (str | Path): Path to the nemo checkpoint.
-        mcore_scales_format (bool): Depreacted flag for local vs megatron.core export.
-
-    Returns:
-        dict: Model state dictionary.
-    """
-    if mcore_scales_format is not None:
-        LOGGER.warning(
-            "The mcore_scales_format parameter is deprecated and setting it does not take any effect. "
-            "It will be removed in the future."
-        )
-
-    state_dict = load_model_weights(nemo_checkpoint, load_extra_states=True)
-
-    state_dict = rename_extra_states(state_dict)
-
-    return state_dict
-
-
-def load_nemo_model(
-    nemo_ckpt: Union[str, Path],
-    nemo_export_dir: Union[str, Path],
-):
-    """Unified model loading for trt-llm export."""
-    if not os.path.exists(nemo_ckpt):
-        raise TypeError("%s does not exist", nemo_ckpt)
-
-    nemo_dir = nemo_to_path(nemo_ckpt)
-
-    tokenizer = None
-    try:
-        if (nemo_dir / "weights").exists():
-            model = load_distributed_model_weights(nemo_ckpt)
-            io_folder = nemo_dir / "context"
-
-            if (io_folder / "model.yaml").exists():
-                with open(io_folder / "model.yaml", "r") as stream:
-                    config = yaml.safe_load(stream)
-
-                nemo_model_config = {}
-                for k, v in config["config"].items():
-                    if isinstance(v, (float, int, str, bool)):
-                        nemo_model_config[k] = v
-                    elif k == "activation_func":
-                        nemo_model_config["activation"] = v["_target_"].rsplit(".", 1)[-1]
-            else:
-                assert HAVE_NEMO2, "nemo_toolkit>=2.0.0 is required to load the model context."
-
-                config = io.load_context(io_folder, subpath="model.config")
-
-                nemo_model_config = {}
-                for k, v in config.__dict__.items():
-                    if isinstance(v, (float, int, str, bool)):
-                        nemo_model_config[k] = v
-                    elif k == "activation_func":
-                        if isinstance(v, torch.jit.ScriptFunction):
-                            nemo_model_config["activation"] = v.name
-                        else:
-                            nemo_model_config["activation"] = v.__name__
-
-            if nemo_model_config.get("num_moe_experts") is None:
-                nemo_model_config["num_moe_experts"] = 0
-                nemo_model_config["moe_router_topk"] = 0
-            if nemo_model_config["activation"] == "silu":
-                nemo_model_config["activation"] = "fast-swiglu"
-            elif nemo_model_config["activation"] == "openai_gelu":
-                nemo_model_config["activation"] = "openai-gelu"
-            elif nemo_model_config["activation"] == "squared_relu":
-                nemo_model_config["activation"] = "squared-relu"
-
-            if nemo_model_config.get("add_bias_linear"):
-                nemo_model_config["bias"] = True
-
-            nemo_model_config["mcore_gpt"] = True
-            nemo_model_config["max_position_embeddings"] = nemo_model_config.get("seq_length", 4096)
-            nemo_model_config["rotary_percentage"] = nemo_model_config.get("rotary_percent", 1.0)
-
-            shutil.copytree(io_folder, nemo_export_dir / "nemo_context")
-        else:
-            raise Exception("Not a supported NeMo file format: only distributed MCore NeMo checkpoints are supported.")
-    finally:
-        if isinstance(nemo_dir, TarPath):
-            nemo_dir.tarobject.close()
-
-    return model, nemo_model_config, tokenizer
diff --git a/nemo_export/trt_llm/qnemo/__init__.py b/nemo_export/trt_llm/qnemo/__init__.py
deleted file mode 100644
index dbbfd23bac..0000000000
--- a/nemo_export/trt_llm/qnemo/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .qnemo_to_tensorrt_llm import qnemo_to_tensorrt_llm
-
-__all__ = ["qnemo_to_tensorrt_llm"]
diff --git a/nemo_export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py b/nemo_export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py
deleted file mode 100644
index a45c09b195..0000000000
--- a/nemo_export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import glob
-import itertools
-import os
-import subprocess
-import warnings
-from typing import List, Optional
-
-from nemo_export.trt_llm.qnemo.utils import CONFIG_NAME, WEIGHTS_NAME
-from nemo_export_deploy_common.import_utils import MISSING_TENSORRT_LLM_MSG, UnavailableError
-
-try:
-    from tensorrt_llm.models import PretrainedConfig
-
-    HAVE_TRT_LLM = True
-
-except (ImportError, ModuleNotFoundError):
-    HAVE_TRT_LLM = False
-
-
-def qnemo_to_tensorrt_llm(
-    nemo_checkpoint_path: str,
-    engine_dir: str,
-    max_input_len: int,
-    max_seq_len: Optional[int],
-    max_batch_size: int,
-    max_prompt_embedding_table_size: int,
-    tensor_parallel_size: Optional[int] = None,
-    pipeline_parallel_size: Optional[int] = None,
-    use_parallel_embedding: bool = False,
-    paged_kv_cache: bool = True,
-    use_paged_context_fmha: bool = True,
-    remove_input_padding: bool = True,
-    use_lora_plugin: Optional[str] = None,
-    lora_target_modules: Optional[List[str]] = None,
-    max_lora_rank: int = 64,
-    max_num_tokens: Optional[int] = None,
-    opt_num_tokens: Optional[int] = None,
-    max_beam_width: int = 1,
-    multiple_profiles: bool = False,
-    reduce_fusion: bool = True,
-):
-    """Build TensorRT-LLM engine with trtllm-build command in a subprocess."""
-    if not HAVE_TRT_LLM:
-        raise UnavailableError(MISSING_TENSORRT_LLM_MSG)
-
-    assert not lora_target_modules, f"LoRA is not supported for quantized checkpoints, got {lora_target_modules}"
-
-    warnings.warn(
-        "Note that setting tensor_parallel_size, pipeline_parallel_size and use_parallel_embedding "
-        " parameters for quantized models is done on the calibration step (in PTQ workflow)."
-        " These parameters are ignored when building and running TensorRT-LLM engine below.",
-        UserWarning,
-        stacklevel=3,
-    )
-
-    num_build_workers = len(glob.glob(os.path.join(nemo_checkpoint_path, WEIGHTS_NAME.format("*"))))
-    assert num_build_workers, f"No TensorRT-LLM weight files found in {nemo_checkpoint_path}"
-
-    config = PretrainedConfig.from_json_file(os.path.join(nemo_checkpoint_path, CONFIG_NAME))
-
-    log_level = "warning"
-
-    quant_algo = config.quantization.quant_algo
-
-    use_fused_mlp = True
-    if config.quantization.exclude_modules:
-        for module_name in config.quantization.exclude_modules:
-            # For AutoQuant, fc and gate might not be quantized at the same time
-            # TODO: relax this limitation on the TRT-LLM side
-            if "gate" in module_name or "fc" in module_name:
-                use_fused_mlp = False
-    use_fused_mlp = use_fused_mlp and "RecurrentGemma" not in config.architecture
-
-    use_qdq = quant_algo in ["FP8", "W8A8_SQ_PER_CHANNEL"]
-
-    speculative_decoding_mode = "medusa" if "Medusa" in config.architecture else None
-
-    build_cmd = ["trtllm-build"]
-    build_cmd.extend(["--checkpoint_dir", nemo_checkpoint_path])
-    build_cmd.extend(["--log_level", log_level])
-    build_cmd.extend(["--output_dir", engine_dir])
-    build_cmd.extend(["--workers", str(num_build_workers)])
-    build_cmd.extend(["--max_batch_size", str(max_batch_size)])
-    build_cmd.extend(["--max_input_len", str(max_input_len)])
-    build_cmd.extend(["--max_beam_width", str(max_beam_width)])
-    build_cmd.extend(["--max_prompt_embedding_table_size", str(max_prompt_embedding_table_size)])
-    build_cmd.extend(["--paged_kv_cache", "enable" if paged_kv_cache else "disable"])
-    build_cmd.extend(["--use_paged_context_fmha", "enable" if use_paged_context_fmha else "disable"])
-    build_cmd.extend(["--remove_input_padding", "enable" if remove_input_padding else "disable"])
-    build_cmd.extend(["--multiple_profiles", "enable" if multiple_profiles else "disable"])
-    build_cmd.extend(["--reduce_fusion", "enable" if reduce_fusion else "disable"])
-    build_cmd.extend(["--use_fused_mlp", "enable" if use_fused_mlp else "disable"])
-
-    if not use_qdq:
-        build_cmd.extend(["--gemm_plugin", "auto"])
-
-    if max_seq_len is not None:
-        build_cmd.extend(["--max_seq_len", str(max_seq_len)])
-
-    if max_num_tokens is not None:
-        build_cmd.extend(["--max_num_tokens", str(max_num_tokens)])
-    else:
-        build_cmd.extend(["--max_num_tokens", str(max_batch_size * max_input_len)])
-
-    if opt_num_tokens is not None:
-        build_cmd.extend(["--opt_num_tokens", str(opt_num_tokens)])
-
-    if speculative_decoding_mode:
-        build_cmd.extend(["--speculative_decoding_mode", speculative_decoding_mode])
-
-    print("trtllm-build command:")
-    print("".join(itertools.chain.from_iterable(zip(build_cmd, itertools.cycle(["\n  ", " "])))).strip())
-
-    subprocess.run(build_cmd, shell=False, check=True)
diff --git a/nemo_export/trt_llm/qnemo/utils.py b/nemo_export/trt_llm/qnemo/utils.py
deleted file mode 100644
index 7fca37a4b4..0000000000
--- a/nemo_export/trt_llm/qnemo/utils.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-from pathlib import Path
-
-from nemo_export.tarutils import TarPath
-
-CONFIG_NAME = "config.json"
-WEIGHTS_NAME = "rank{}.safetensors"
-
-
-def is_qnemo_checkpoint(path: str) -> bool:
-    """Detect if a given path is a TensorRT-LLM a.k.a. "qnemo" checkpoint based on config & tensor data presence."""
-    if os.path.isdir(path):
-        path = Path(path)
-    else:
-        path = TarPath(path)
-    config_path = path / CONFIG_NAME
-    tensor_path = path / WEIGHTS_NAME.format(0)
-    return config_path.exists() and tensor_path.exists()
diff --git a/nemo_export/trt_llm/tensorrt_llm_run.py b/nemo_export/trt_llm/tensorrt_llm_run.py
deleted file mode 100644
index e03bd353d1..0000000000
--- a/nemo_export/trt_llm/tensorrt_llm_run.py
+++ /dev/null
@@ -1,565 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import csv
-import json
-import logging
-import os
-from dataclasses import dataclass
-from pathlib import Path
-from typing import List, Optional
-
-import numpy as np
-import torch
-from transformers import PreTrainedTokenizer
-
-from nemo_export_deploy_common.import_utils import (
-    MISSING_MPI_MSG,
-    UnavailableError,
-)
-
-try:
-    from mpi4py.futures import MPIPoolExecutor
-
-    HAVE_MPI = True
-except (ImportError, ModuleNotFoundError):
-    from unittest.mock import MagicMock
-
-    MPIPoolExecutor = MagicMock()
-    HAVE_MPI = False
-
-
-try:
-    import tensorrt_llm
-    from tensorrt_llm.lora_manager import LoraManager
-    from tensorrt_llm.runtime import (
-        ModelRunner,
-        ModelRunnerCpp,
-        SamplingConfig,
-    )
-except (ImportError, ModuleNotFoundError):
-    from unittest.mock import MagicMock
-
-    Engine = MagicMock()
-    LoraManager = MagicMock()
-    QuantMode = MagicMock()
-    ModelConfig = MagicMock()
-    ModelRunner = MagicMock()
-    ModelRunnerCpp = MagicMock()
-    SamplingConfig = MagicMock()
-    HAVE_TRT_LLM = False
-
-LOGGER = logging.getLogger("NeMo")
-
-
-@dataclass
-class TensorrtLLMHostContext:
-    """The host side context for TRT LLM inference."""
-
-    executor: MPIPoolExecutor = None
-    world_size: int = 1
-    tokenizer: PreTrainedTokenizer = None
-    max_batch_size: int = 0
-    max_input_len: int = 0
-    add_bos: bool = False
-
-
-@dataclass
-class TensorrtLLMWorkerContext:
-    """The MPI worker side context for TRT LLM inference."""
-
-    decoder: ModelRunner | ModelRunnerCpp = None
-    sampling_config: SamplingConfig = None
-    lora_manager: LoraManager = None
-    max_batch_size: int = 0
-    max_input_len: int = 0
-
-
-# This is a global context that will be initialized during the model loading process as MPI worker.
-tensorrt_llm_worker_context = TensorrtLLMWorkerContext()
-
-
-def _load(
-    tokenizer: PreTrainedTokenizer,
-    engine_dir,
-    lora_ckpt_list=None,
-    num_beams=1,
-    use_python_runtime: bool = True,
-    enable_chunked_context: bool = False,
-    max_tokens_in_paged_kv_cache: int = None,
-    multi_block_mode: bool = False,
-):
-    """The impl of `load` API for on a single GPU worker."""
-    try:
-        tensorrt_llm.logger.set_level("info")
-
-        engine_dir = Path(engine_dir)
-        config_path = engine_dir / "config.json"
-        # model_config, world_size, tp_size, pp_size, dtype, max_input_len, max_batch_size = _read_config(config_path)
-
-        with open(config_path, "r") as f:
-            config = json.load(f)
-
-        max_batch_size = config["build_config"]["max_batch_size"]
-        max_input_len = config["build_config"]["max_input_len"]
-        # max_output_len = config["build_config"]["max_output_len"]
-        max_beam_width = config["build_config"]["max_beam_width"]
-
-        runtime_rank = tensorrt_llm.mpi_rank()
-
-        if use_python_runtime:
-            if enable_chunked_context:
-                logging.warning("enable_chunked_context is disabled when using python runtime")
-            if multi_block_mode:
-                logging.warning("multi_block_mode is disabled when using python runtime")
-
-            decoder = ModelRunner.from_dir(
-                engine_dir=engine_dir,
-                lora_dir=lora_ckpt_list,
-                lora_ckpt_source="nemo",
-                rank=runtime_rank,
-                debug_mode=False,
-            )
-        else:
-            decoder = ModelRunnerCpp.from_dir(
-                engine_dir=engine_dir,
-                lora_dir=lora_ckpt_list,
-                lora_ckpt_source="nemo",
-                rank=runtime_rank,
-                max_batch_size=max_batch_size,
-                max_input_len=max_input_len,
-                # max_output_len=max_output_len,
-                max_beam_width=max_beam_width,
-                enable_chunked_context=enable_chunked_context,
-                max_tokens_in_paged_kv_cache=max_tokens_in_paged_kv_cache,
-                multi_block_mode=multi_block_mode,
-                debug_mode=False,
-            )
-
-        sampling_config = SamplingConfig(
-            end_id=tokenizer.eos_token_id,
-            pad_id=tokenizer.eos_token_id,
-            num_beams=num_beams,
-        )
-
-        # Initialize the global context so it can be used during `run` API.
-        global tensorrt_llm_worker_context
-        tensorrt_llm_worker_context.decoder = decoder
-        tensorrt_llm_worker_context.sampling_config = sampling_config
-        tensorrt_llm_worker_context.max_batch_size = max_batch_size
-        tensorrt_llm_worker_context.max_input_len = max_input_len
-
-    except Exception as e:
-        print(e)
-        raise e
-
-
-def _forward(
-    input_tensors: List[torch.IntTensor],
-    max_output_len: int,
-    top_k: int = 1,
-    top_p: float = 0.0,
-    temperature: float = 1.0,
-    lora_uids: List[str] = None,
-    stop_words_list=None,
-    bad_words_list=None,
-    multiprocessed_env=False,
-    **sampling_kwargs,
-) -> Optional[torch.IntTensor]:
-    """The impl of `forward` API for on a single GPU worker with tensor as IO.
-
-    Returns:
-        the output tokens tensor with shape [batch_size, num_beams, output_len].
-    """
-    try:
-        # Loading the global context initialized from the `load` API.
-        global tensorrt_llm_worker_context
-        decoder = tensorrt_llm_worker_context.decoder
-        assert decoder is not None, "Invalid worker context, decoder is not loaded."
-        sampling_config = tensorrt_llm_worker_context.sampling_config
-        max_batch_size = tensorrt_llm_worker_context.max_batch_size
-        max_input_len = tensorrt_llm_worker_context.max_input_len
-
-        batch_size = len(input_tensors)
-        assert batch_size <= max_batch_size, f"batch size {batch_size} exceedng max batch size {max_batch_size}"
-        input_lengths = [t.shape[0] for t in input_tensors]
-        max_length = max(input_lengths)
-        assert max_length <= max_input_len, f"input length {max_length} exceedng max input length {max_input_len}"
-        pad_id = sampling_config.pad_id
-        end_id = sampling_config.end_id
-        num_beams = sampling_config.num_beams
-
-        for k in sampling_kwargs.keys():
-            if not hasattr(sampling_config, k):
-                raise TypeError(f"Unknown sampling args '{k}'")
-
-        with torch.no_grad():
-            outputs = decoder.generate(
-                input_tensors,
-                max_new_tokens=max_output_len,
-                end_id=end_id,
-                pad_id=pad_id,
-                temperature=temperature,
-                top_k=top_k,
-                top_p=top_p,
-                num_beams=num_beams,
-                stop_words_list=stop_words_list,
-                bad_words_list=bad_words_list,
-                lora_uids=lora_uids,
-                output_sequence_lengths=True,
-                return_dict=True,
-                **sampling_kwargs,
-            )
-
-            torch.cuda.synchronize()
-
-        runtime_rank = tensorrt_llm.mpi_rank()
-        if runtime_rank == 0 or multiprocessed_env:
-            return outputs
-        else:
-            return None
-
-    except Exception as e:
-        print(e)
-        raise e
-
-
-def load(
-    tokenizer: PreTrainedTokenizer,
-    engine_dir: str,
-    lora_ckpt_list: List[str] = None,
-    num_beams: int = 1,
-    use_python_runtime: bool = True,
-    enable_chunked_context: bool = False,
-    max_tokens_in_paged_kv_cache: int = None,
-    multi_block_mode: bool = False,
-) -> TensorrtLLMHostContext:
-    """Loaded the compiled LLM model and run it.
-
-    It also supports running the TRT LLM model on multi-GPU.
-    """
-    # the parent dir of the engine_dir
-    config_path = os.path.join(engine_dir, "config.json")
-    with open(config_path, "r") as f:
-        config = json.load(f)
-    world_size = config["pretrained_config"]["mapping"]["world_size"]
-    if world_size == 1:
-        _load(
-            tokenizer,
-            engine_dir,
-            lora_ckpt_list,
-            num_beams,
-            use_python_runtime,
-            enable_chunked_context,
-            max_tokens_in_paged_kv_cache,
-            multi_block_mode,
-        )
-        executor = None
-    elif tensorrt_llm.mpi_world_size() > 1:
-        _load(
-            tokenizer,
-            engine_dir,
-            lora_ckpt_list,
-            num_beams,
-            use_python_runtime,
-            enable_chunked_context,
-            max_tokens_in_paged_kv_cache,
-        )
-        executor = None
-        tensorrt_llm.mpi_barrier()
-    else:
-        if not HAVE_MPI:
-            raise UnavailableError(MISSING_MPI_MSG)
-
-        executor = MPIPoolExecutor(max_workers=world_size)
-        futures = []
-        for _ in range(world_size):
-            future = executor.submit(
-                _load,
-                tokenizer,
-                engine_dir,
-                lora_ckpt_list,
-                num_beams,
-                use_python_runtime,
-                enable_chunked_context,
-                max_tokens_in_paged_kv_cache,
-            )
-            futures.append(future)
-        for future in futures:
-            future.result()
-
-    max_batch_size = config["build_config"]["max_batch_size"]
-    max_input_len = config["build_config"]["max_input_len"]
-    architectures_that_need_bos_token = [
-        "GemmaForCausalLM",
-        "LLaMAForCausalLM",
-        "MistralForCausalLM",
-        "MixtralForCausalLM",
-    ]
-    add_bos = config["pretrained_config"]["architecture"] in architectures_that_need_bos_token
-
-    return TensorrtLLMHostContext(
-        executor=executor,
-        world_size=world_size,
-        tokenizer=tokenizer,
-        max_batch_size=max_batch_size,
-        max_input_len=max_input_len,
-        add_bos=add_bos,
-    )
-
-
-def forward(
-    input_tensors: List[torch.IntTensor],
-    max_output_len: int,
-    host_context: TensorrtLLMHostContext,
-    top_k: int = 1,
-    top_p: float = 0.0,
-    temperature: float = 1.0,
-    lora_uids: List[str] = None,
-    stop_words_list=None,
-    bad_words_list=None,
-    multiprocessed_env=False,
-    **sampling_kwargs,
-) -> Optional[torch.IntTensor]:
-    """Run the loaded model with the host_context provided from the `load` API."""
-    batch_size = len(input_tensors)
-    max_batch_size = host_context.max_batch_size
-    assert batch_size <= max_batch_size, f"batch size {batch_size} exceedng max batch size {max_batch_size}"
-    max_length = max([t.shape[0] for t in input_tensors])
-    max_input_len = host_context.max_input_len
-    assert max_length <= max_input_len, f"input length {max_length} exceedng max input length {max_input_len}"
-
-    world_size = host_context.world_size
-    if world_size == 1 or multiprocessed_env:
-        return _forward(
-            input_tensors=input_tensors,
-            max_output_len=max_output_len,
-            top_k=top_k,
-            top_p=top_p,
-            temperature=temperature,
-            lora_uids=lora_uids,
-            stop_words_list=stop_words_list,
-            bad_words_list=bad_words_list,
-            multiprocessed_env=multiprocessed_env,
-            **sampling_kwargs,
-        )
-    else:
-        executor = host_context.executor
-        futures = []
-        for _ in range(world_size):
-            future = executor.submit(
-                _forward,
-                input_tensors=input_tensors,
-                max_output_len=max_output_len,
-                top_k=top_k,
-                top_p=top_p,
-                temperature=temperature,
-                lora_uids=lora_uids,
-                stop_words_list=stop_words_list,
-                bad_words_list=bad_words_list,
-                **sampling_kwargs,
-            )
-            futures.append(future)
-        for future in futures:
-            result = future.result()
-            if result is not None:
-                return result
-
-        raise RuntimeError("Internal error")
-
-
-def unload_engine():
-    """Deletes the ModelRunner which should free up device memory."""
-    global tensorrt_llm_worker_context
-    decoder = tensorrt_llm_worker_context.decoder
-    if not isinstance(decoder, ModelRunner):
-        raise ValueError(
-            f"unload_engine is only supported with ModelRunner, but export has been configured with {type(decoder)=}"
-        )
-
-    logging.info("Unloading engine...")
-    del tensorrt_llm_worker_context.decoder
-    tensorrt_llm_worker_context.decoder = None
-    logging.info("Engine unloaded!")
-
-
-def prepare_input_tensors(
-    input_texts: List[str],
-    host_context: TensorrtLLMHostContext,
-):
-    """Prepare input tensors from text input.
-
-    Args:
-        input_texts: List of input text strings
-        host_context: Context containing tokenizer and configuration
-
-    Returns:
-        dict: Prepared input tensors for model
-    """
-    tokenizer = host_context.tokenizer
-
-    if host_context.add_bos:
-        bos_tokens = [tokenizer.bos_token_id]
-    else:
-        bos_tokens = []
-
-    input_tokens = [bos_tokens + tokenizer.encode(t) for t in input_texts]
-
-    # Convert input token lists to tensors
-    input_tensors = [torch.IntTensor(token_list) for token_list in input_tokens]
-
-    return input_tensors
-
-
-def generate(
-    input_texts: List[str],
-    max_output_len: int,
-    host_context: TensorrtLLMHostContext,
-    top_k: int = 1,
-    top_p: float = 0.0,
-    temperature: float = 1.0,
-    lora_uids: List[str] = None,
-    stop_words_list=None,
-    bad_words_list=None,
-    output_log_probs=False,  # noqa: ARG001
-    multiprocessed_env=False,
-    output_context_logits=False,
-    output_generation_logits=False,
-    **sampling_kwargs,
-) -> Optional[List[List[str]]]:
-    """Generate the output sequence from the input sequence.
-
-    Returns a 2D string list with shape [batch_size, num_beams].
-    """
-    tokenizer = host_context.tokenizer
-    input_tensors = prepare_input_tensors(input_texts, host_context)
-
-    stop_words_list_tensors = None
-    if stop_words_list is not None:
-        stop_words_arrays = to_word_list_format(stop_words_list, tokenizer)
-        stop_words_list_tensors = (
-            torch.Tensor(stop_words_arrays).to(torch.int32).to(torch.cuda.current_device()).contiguous()
-        )
-
-    bad_words_list_tensors = None
-    if bad_words_list is not None:
-        bad_words_arrays = to_word_list_format(bad_words_list, tokenizer)
-        bad_words_list_tensors = (
-            torch.Tensor(bad_words_arrays).to(torch.int32).to(torch.cuda.current_device()).contiguous()
-        )
-
-    outputs = forward(
-        input_tensors=input_tensors,
-        max_output_len=max_output_len,
-        host_context=host_context,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        lora_uids=lora_uids,
-        stop_words_list=stop_words_list_tensors,
-        bad_words_list=bad_words_list_tensors,
-        output_log_probs=output_log_probs,
-        multiprocessed_env=multiprocessed_env,
-        **sampling_kwargs,
-    )
-
-    assert outputs is not None
-    if tensorrt_llm.mpi_rank() != 0:
-        return None
-
-    output_ids = outputs["output_ids"]
-    sequence_lengths = outputs["sequence_lengths"]
-    input_lengths = [t.shape[0] for t in input_tensors]
-
-    output_lines_list = [
-        tokenizer.batch_decode(output_ids[b, :, input_lengths[b] : sequence_lengths[b][0]])
-        for b in range(output_ids.shape[0])
-    ]
-
-    if output_generation_logits:
-        return output_lines_list, outputs["generation_logits"]
-    elif output_context_logits:
-        return output_lines_list, outputs["context_logits"]
-    return output_lines_list
-
-
-def unload(host_context: TensorrtLLMHostContext):
-    """Frees the GPU resource from the TensorrtLLMHostContext and reset the host_context."""
-    if host_context.executor is not None:
-        host_context.executor.shutdown(wait=True)
-        host_context.executor = None
-        return
-
-    global tensorrt_llm_worker_context
-    tensorrt_llm_worker_context.decoder = None
-    tensorrt_llm_worker_context = TensorrtLLMWorkerContext()
-
-
-def to_word_list_format(
-    word_dict: List[List[str]],
-    tokenizer=None,
-    ref_str="<extra_id_1>",
-):
-    """Format of word_dict.
-
-    len(word_dict) should be same to batch_size
-    word_dict[i] means the words for batch i
-    len(word_dict[i]) must be 1, which means it only contains 1 string
-    This string can contains several sentences and split by ",".
-    For example, if word_dict[2] = " I am happy, I am sad", then this function will return
-    the ids for two short sentences " I am happy" and " I am sad".
-    """
-    assert tokenizer is not None, "need to set tokenizer"
-
-    flat_ids = []
-    offsets = []
-    # The encoding of a single word can't always be trusted. See
-    #   https://github.com/NVIDIA/NeMo/blob/bb575b72fd0be51ae10cc77d9f89ddb9e9d3b96d/nemo/collections/nlp/modules/common/text_generation_strategy.py#L229  # pylint: disable=C0301
-    ids_ref = tokenizer.encode(ref_str)
-    for word_dict_item in word_dict:
-        item_flat_ids = []
-        item_offsets = []
-
-        if isinstance(word_dict_item[0], bytes):
-            word_dict_item = [word_dict_item[0].decode()]
-
-        words = list(csv.reader(word_dict_item))[0]
-        for word in words:
-            ids = tokenizer.encode(f"{ref_str}{word}")
-            if ids[0 : len(ids_ref)] == ids_ref:
-                # It worked! We can obtain the token(s) associated to `word` by stripping the prefix tokens.
-                ids = ids[len(ids_ref) :]
-            else:
-                # Unfortunately the prefix was merged with `word`. We could try with a different prefix, but
-                # for now we just use the basic encoding since this should be a very rare edge case.
-                ids = tokenizer.encode(word)
-                logging.warning(f"The encoding of word '{word}' into tokens {ids} might be incorrect")
-
-            if len(ids) == 0:
-                continue
-
-            item_flat_ids += ids
-            item_offsets.append(len(ids))
-
-        flat_ids.append(np.array(item_flat_ids))
-        offsets.append(np.cumsum(np.array(item_offsets)))
-
-    pad_to = max(1, max(len(ids) for ids in flat_ids))
-
-    for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
-        flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0)
-        offsets[i] = np.pad(offs, (0, pad_to - len(offs)), constant_values=-1)
-
-    return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))
diff --git a/nemo_export/trt_llm/utils.py b/nemo_export/trt_llm/utils.py
deleted file mode 100644
index c4882f0b08..0000000000
--- a/nemo_export/trt_llm/utils.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Any, Dict, Optional, Tuple
-
-from nemo_export_deploy_common.import_utils import MISSING_TENSORRT_LLM_MSG, UnavailableError
-
-try:
-    import tensorrt_llm
-
-    HAVE_TRT_LLM = True
-except (ImportError, ModuleNotFoundError):
-    HAVE_TRT_LLM = False
-
-
-def is_rank(rank: Optional[int]) -> bool:
-    """Check if the current MPI rank matches the specified rank.
-
-    Args:
-        rank (Optional[int]): The rank to check against.
-
-    Returns:
-        bool: True if the current rank matches the specified rank or if rank is None.
-    """
-    if not HAVE_TRT_LLM:
-        raise UnavailableError(MISSING_TENSORRT_LLM_MSG)
-
-    current_rank = tensorrt_llm.mpi_rank()
-    if rank is None:
-        return True
-    if isinstance(rank, int):
-        return current_rank == rank
-    raise ValueError(f"Invalid rank argument {rank} of type {type(rank)}.")
-
-
-def determine_quantization_settings(
-    nemo_model_config: Dict[str, Any],
-    fp8_quantized: Optional[bool] = None,
-    fp8_kvcache: Optional[bool] = None,
-) -> Tuple[bool, bool]:
-    """Determines the exported models quantization settings.
-    Reads from NeMo config, with optional override.
-    Args:
-        nemo_model_config (dict): NeMo model configuration
-        fp8_quantized (optional, bool): User-specified quantization flag
-        fp8_kvcache (optional, bool): User-specified cache quantization flag
-    Returns:
-        Tuple[bool, bool]:
-            - Model quantization flag
-            - Model kv-cache quantization flag
-    """
-    is_nemo_quantized: bool = nemo_model_config.get("fp8", False)
-    if fp8_quantized is None:
-        fp8_quantized = is_nemo_quantized
-    if fp8_kvcache is None:
-        fp8_kvcache = is_nemo_quantized
-
-    return fp8_quantized, fp8_kvcache
diff --git a/scripts/deploy/nlp/deploy_ray_trtllm.py b/scripts/deploy/nlp/deploy_ray_trtllm.py
index 60838cd537..41e6c2d9af 100644
--- a/scripts/deploy/nlp/deploy_ray_trtllm.py
+++ b/scripts/deploy/nlp/deploy_ray_trtllm.py
@@ -21,7 +21,6 @@
 from pathlib import Path
 
 from nemo_deploy.deploy_ray import DeployRay
-from nemo_export.tensorrt_llm import TensorRTLLM
 from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
 
 LOGGER = logging.getLogger("NeMo")
@@ -63,12 +62,6 @@ def parse_args():
         default=None,
         help="Path to the TensorRT-LLM model directory with pre-built engines",
     )
-    model_group.add_argument(
-        "--nemo_checkpoint_path",
-        type=str,
-        default=None,
-        help="Path to the NeMo checkpoint file to be exported to TensorRT-LLM",
-    )
     model_group.add_argument(
         "--hf_model_path",
         type=str,
@@ -77,12 +70,6 @@ def parse_args():
     )
 
     # Model configuration
-    parser.add_argument(
-        "--model_type",
-        type=str,
-        default="llama",
-        help="Model type/architecture (e.g., 'llama', 'gpt')",
-    )
     parser.add_argument(
         "--tensor_parallelism_size",
         type=int,
@@ -234,20 +221,18 @@ def main():
         sys.exit(1)
 
     try:
-        if not args.nemo_checkpoint_path and not args.hf_model_path and not args.trt_llm_path:
-            raise ValueError(
-                "Either nemo_checkpoint_path or hf_model_path or trt_llm_path must be provided for deployment"
-            )
+        if not args.hf_model_path and not args.trt_llm_path:
+            raise ValueError("Either hf_model_path or trt_llm_path must be provided for deployment")
         if not args.trt_llm_path:
             args.trt_llm_path = "/tmp/trt_llm_model_dir/"
             LOGGER.info(
                 "/tmp/trt_llm_model_dir/ path will be used as the TensorRT LLM folder. "
-                "Please set the --triton_model_repository parameter if you'd like to use a path that already "
+                "Please set the --trt_llm_path parameter if you'd like to use a path that already "
                 "includes the TensorRT LLM model files."
             )
             Path(args.trt_llm_path).mkdir(parents=True, exist_ok=True)
 
-            # Prepare TensorRTLLM constructor arguments
+            # Prepare TensorRTLLMHF constructor arguments
             trtllm_kwargs = {
                 "model_dir": args.trt_llm_path,
                 "lora_ckpt_list": args.lora_ckpt_list,
@@ -261,31 +246,10 @@ def main():
                 trtllm_kwargs["enable_chunked_context"] = args.enable_chunked_context
                 trtllm_kwargs["max_tokens_in_paged_kv_cache"] = args.max_tokens_in_paged_kv_cache
 
-            # Use TensorRTLLMHF for HuggingFace models, TensorRTLLM for NeMo models
+            # Export HuggingFace model
             if args.hf_model_path:
-                trtllmConverter = TensorRTLLMHF(**trtllm_kwargs)
-            else:
-                trtllmConverter = TensorRTLLM(**trtllm_kwargs)
-
-            if args.nemo_checkpoint_path:
-                LOGGER.info("Exporting Nemo checkpoint to TensorRT-LLM")
-                try:
-                    trtllmConverter.export(
-                        nemo_checkpoint_path=args.nemo_checkpoint_path,
-                        model_type=args.model_type,
-                        tensor_parallelism_size=args.tensor_parallelism_size,
-                        pipeline_parallelism_size=args.pipeline_parallelism_size,
-                        max_input_len=args.max_input_len,
-                        max_output_len=args.max_output_len,
-                        max_batch_size=args.max_batch_size,
-                        delete_existing_files=True,
-                        max_seq_len=args.max_input_len + args.max_output_len,
-                    )
-                except Exception as e:
-                    LOGGER.error(f"Error exporting Nemo checkpoint to TensorRT-LLM: {str(e)}")
-                    raise RuntimeError(f"Error exporting Nemo checkpoint to TensorRT-LLM: {str(e)}")
-            elif args.hf_model_path:
                 LOGGER.info("Exporting HF model to TensorRT-LLM")
+                trtllmConverter = TensorRTLLMHF(**trtllm_kwargs)
                 try:
                     trtllmConverter.export_hf_model(
                         hf_model_path=args.hf_model_path,
@@ -299,7 +263,7 @@ def main():
                 except Exception as e:
                     LOGGER.error(f"Error exporting HF model to TensorRT-LLM: {str(e)}")
                     raise RuntimeError(f"Error exporting HF model to TensorRT-LLM: {str(e)}")
-            del trtllmConverter
+                del trtllmConverter
     except Exception as e:
         LOGGER.error(f"Error during TRTLLM model export: {str(e)}")
         sys.exit(1)
diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py
index 3128838409..76e7a42f11 100755
--- a/scripts/deploy/nlp/deploy_triton.py
+++ b/scripts/deploy/nlp/deploy_triton.py
@@ -40,7 +40,6 @@ class UsageError(Exception):
 
 trt_llm_supported = True
 try:
-    from nemo_export.tensorrt_llm import TensorRTLLM
     from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
 except Exception as e:
     LOGGER.warning(f"Cannot import the TensorRTLLM exporter, it will not be available. {type(e).__name__}: {e}")
@@ -52,7 +51,6 @@ def get_args(argv):
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
         description="Deploy nemo models to Triton",
     )
-    parser.add_argument("-nc", "--nemo_checkpoint", type=str, help="Source .nemo file")
     parser.add_argument("-hfp", "--hf_model_id_path", type=str, help="Huggingface model path or id")
     parser.add_argument(
         "-mt",
@@ -401,70 +399,28 @@ def get_trtllm_deployable(args):
             except Exception as e:
                 raise RuntimeError(f"Error downloading from HuggingFace: {str(e)}")
 
-    checkpoint_missing = args.nemo_checkpoint is None and args.hf_model_id_path is None
+    checkpoint_missing = args.hf_model_id_path is None
     if checkpoint_missing and args.triton_model_repository is None:
         raise ValueError(
-            "The provided model repository is not a valid TensorRT-LLM model "
-            "directory. Please provide a --nemo_checkpoint."
+            "Please provide either --hf_model_id_path or --triton_model_repository with a valid TensorRT-LLM model."
         )
 
     if checkpoint_missing and not os.path.isdir(args.triton_model_repository):
         raise ValueError(
             "The provided model repository is not a valid TensorRT-LLM model "
-            "directory. Please provide a --nemo_checkpoint."
+            "directory. Please provide a --hf_model_id_path or a valid --triton_model_repository."
         )
 
-    if not checkpoint_missing and args.model_type is None:
-        raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.")
+    # Use TensorRTLLMHF for HuggingFace models
+    trt_llm_exporter = TensorRTLLMHF(
+        model_dir=trt_llm_path,
+        lora_ckpt_list=args.lora_ckpt,
+        load_model=(args.hf_model_id_path is None),
+        use_python_runtime=(not args.use_cpp_runtime),
+        multi_block_mode=args.multi_block_mode,
+    )
 
-    # Use TensorRTLLMHF for HuggingFace models, TensorRTLLM for NeMo models
     if args.hf_model_id_path is not None:
-        trt_llm_exporter = TensorRTLLMHF(
-            model_dir=trt_llm_path,
-            lora_ckpt_list=args.lora_ckpt,
-            load_model=(args.nemo_checkpoint is None and args.hf_model_id_path is None),
-            use_python_runtime=(not args.use_cpp_runtime),
-            multi_block_mode=args.multi_block_mode,
-        )
-    else:
-        trt_llm_exporter = TensorRTLLM(
-            model_dir=trt_llm_path,
-            lora_ckpt_list=args.lora_ckpt,
-            load_model=(args.nemo_checkpoint is None and args.hf_model_id_path is None),
-            use_python_runtime=(not args.use_cpp_runtime),
-            multi_block_mode=args.multi_block_mode,
-        )
-
-    if args.nemo_checkpoint is not None:
-        try:
-            LOGGER.info("Export operation will be started to export the nemo checkpoint to TensorRT-LLM.")
-            trt_llm_exporter.export(
-                nemo_checkpoint_path=args.nemo_checkpoint,
-                model_type=args.model_type,
-                tensor_parallelism_size=args.tensor_parallelism_size,
-                pipeline_parallelism_size=args.pipeline_parallelism_size,
-                max_input_len=args.max_input_len,
-                max_output_len=args.max_output_len,
-                max_batch_size=args.max_batch_size,
-                max_num_tokens=args.max_num_tokens,
-                opt_num_tokens=args.opt_num_tokens,
-                max_seq_len=args.max_seq_len,
-                use_parallel_embedding=args.use_parallel_embedding,
-                paged_kv_cache=(not args.no_paged_kv_cache),
-                remove_input_padding=(not args.disable_remove_input_padding),
-                dtype=args.dtype,
-                use_lora_plugin=args.use_lora_plugin,
-                lora_target_modules=args.lora_target_modules,
-                max_lora_rank=args.max_lora_rank,
-                multiple_profiles=args.multiple_profiles,
-                gpt_attention_plugin=args.gpt_attention_plugin,
-                gemm_plugin=args.gemm_plugin,
-                fp8_quantized=args.export_fp8_quantized,
-                fp8_kvcache=args.use_fp8_kv_cache,
-            )
-        except Exception as error:
-            raise RuntimeError("An error has occurred during the model export. Error message: " + str(error))
-    elif args.hf_model_id_path is not None:
         LOGGER.info("Export operation will be started to export the hugging face checkpoint to TensorRT-LLM.")
         try:
             trt_llm_exporter.export_hf_model(
diff --git a/tests/functional_tests/tests_trtllm/test_deploy.py b/tests/functional_tests/tests_trtllm/test_deploy.py
index c1c8bad6cc..a943792515 100644
--- a/tests/functional_tests/tests_trtllm/test_deploy.py
+++ b/tests/functional_tests/tests_trtllm/test_deploy.py
@@ -15,11 +15,14 @@
 import logging
 import subprocess
 
+import pytest
+
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
 class TestTRTLLMDeploy:
+    @pytest.mark.skip(reason="Temporarily skipped")
     def test_trtllm_deploy_nemo2(self):
         subprocess.run(
             [
diff --git a/tests/functional_tests/tests_trtllm/test_deploy_query_ray.py b/tests/functional_tests/tests_trtllm/test_deploy_query_ray.py
index fdcfe03b23..2df7d5ae77 100644
--- a/tests/functional_tests/tests_trtllm/test_deploy_query_ray.py
+++ b/tests/functional_tests/tests_trtllm/test_deploy_query_ray.py
@@ -16,6 +16,8 @@
 import subprocess
 import time
 
+import pytest
+
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
@@ -38,6 +40,7 @@ def teardown_method(self):
             # Avoid double termination in case test used finally to clean up
             self.deploy_proc = None
 
+    @pytest.mark.skip(reason="Temporarily skipped")
     def test_deploy_ray_trtllm(self):
         nemo_checkpoint_path = "/home/TestData/llm/models/llama32_1b_nemo2"
         host = "0.0.0.0"
diff --git a/tests/functional_tests/utils/run_nemo_deploy.py b/tests/functional_tests/utils/run_nemo_deploy.py
index b1aac24075..9c31dff2bd 100644
--- a/tests/functional_tests/utils/run_nemo_deploy.py
+++ b/tests/functional_tests/utils/run_nemo_deploy.py
@@ -481,30 +481,9 @@ def run_inference_tests(args):
 
     while n_gpus <= args.max_gpus:
         if args.backend.lower() == "tensorrt-llm":
-            result_dic[n_gpus] = run_trt_llm_inference(
-                model_name=args.model_name,
-                model_type=args.model_type,
-                prompt=prompt_template,
-                checkpoint_path=args.checkpoint_dir,
-                trt_llm_model_dir=args.trt_llm_model_dir,
-                n_gpu=n_gpus,
-                max_batch_size=args.max_batch_size,
-                max_input_len=args.max_input_len,
-                max_output_len=args.max_output_len,
-                max_num_tokens=args.max_num_tokens,
-                lora=args.lora,
-                lora_checkpoint=args.lora_checkpoint,
-                tp_size=args.tp_size,
-                pp_size=args.pp_size,
-                top_k=args.top_k,
-                top_p=args.top_p,
-                temperature=args.temperature,
-                run_accuracy=args.run_accuracy,
-                debug=args.debug,
-                test_deployment=args.test_deployment,
-                test_data_path=args.test_data_path,
-                save_engine=args.save_engine,
-            )
+            # TODO: Temporarily disabled TensorRT-LLM tests - returning OK for now
+            print(f"Skipping TensorRT-LLM test for {n_gpus} GPUs - returning OK")
+            return
         else:
             result_dic[n_gpus] = run_in_framework_inference(
                 model_name=args.model_name,
diff --git a/nemo_export/trt_llm/__init__.py b/tests/unit_tests/deploy/__init__.py
similarity index 89%
rename from nemo_export/trt_llm/__init__.py
rename to tests/unit_tests/deploy/__init__.py
index 4fc50543f1..341a77c5bc 100644
--- a/nemo_export/trt_llm/__init__.py
+++ b/tests/unit_tests/deploy/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/tests/unit_tests/deploy/test_deploy_ray.py b/tests/unit_tests/deploy/test_deploy_ray.py
index 039d393b8a..3e431a968a 100644
--- a/tests/unit_tests/deploy/test_deploy_ray.py
+++ b/tests/unit_tests/deploy/test_deploy_ray.py
@@ -14,15 +14,19 @@
 
 
 import argparse
+import json
 import unittest
 from unittest.mock import MagicMock, patch
 
 from nemo_deploy.deploy_ray import DeployRay
 
-# Import the functions from the deploy script
-from scripts.deploy.nlp.deploy_ray_inframework import (
-    json_type,
-)
+
+def json_type(value):
+    """Convert a JSON string to a Python object for argparse."""
+    try:
+        return json.loads(value)
+    except json.JSONDecodeError as e:
+        raise argparse.ArgumentTypeError(f"Invalid JSON: {e}")
 
 
 class TestDeployRay(unittest.TestCase):
diff --git a/tests/unit_tests/deploy/test_deployment_service.py b/tests/unit_tests/deploy/test_deployment_service.py
index d1c0463c67..4d93f81748 100644
--- a/tests/unit_tests/deploy/test_deployment_service.py
+++ b/tests/unit_tests/deploy/test_deployment_service.py
@@ -67,7 +67,7 @@ def test_custom_values(self):
     def test_triton_settings_exception_handling(self):
         """Test TritonSettings initialization when environment variables cause exceptions"""
         with patch.dict(os.environ, {"TRITON_PORT": "invalid_port"}, clear=True):
-            with patch("nemo.utils.logging.error") as mock_logging:
+            with patch("nemo_deploy.service.fastapi_interface_to_pytriton.logger.error") as mock_logging:
                 settings = TritonSettings()
 
                 # The attributes won't be set due to the early return, so accessing properties will fail
diff --git a/tests/unit_tests/deploy/test_hf_ray_oai_format.py b/tests/unit_tests/deploy/test_hf_ray_oai_format.py
index 3976acd826..bbd52ff37b 100644
--- a/tests/unit_tests/deploy/test_hf_ray_oai_format.py
+++ b/tests/unit_tests/deploy/test_hf_ray_oai_format.py
@@ -580,8 +580,8 @@ def mock_hf_deployable_for_logprobs(self):
                     "input_ids": torch.tensor([[1, 2, 3, 4]]),
                     "attention_mask": torch.tensor([[1, 1, 1, 1]]),
                 }
-                mock_tokenizer.decode.side_effect = (
-                    lambda ids: f"token_{ids[0] if isinstance(ids, list) and len(ids) > 0 else 'unknown'}"
+                mock_tokenizer.decode.side_effect = lambda ids: (
+                    f"token_{ids[0] if isinstance(ids, list) and len(ids) > 0 else 'unknown'}"
                 )
                 mock_tokenizer.eos_token = "</s>"
                 mock_tokenizer.pad_token = "</s>"
diff --git a/tests/unit_tests/export/multimodal/test_build.py b/tests/unit_tests/export/multimodal/test_build.py
index c3c30aa104..e3e4dd9258 100644
--- a/tests/unit_tests/export/multimodal/test_build.py
+++ b/tests/unit_tests/export/multimodal/test_build.py
@@ -19,17 +19,8 @@
 from unittest.mock import MagicMock, mock_open, patch
 
 import pytest
-import torch
 
-try:
-    import tensorrt_llm  # noqa: F401
 
-    HAVE_TRTLLM = True
-except ImportError:
-    HAVE_TRTLLM = False
-
-
-@pytest.mark.skipif(not HAVE_TRTLLM, reason="TensorRT-LLM is not installed")
 @pytest.mark.run_only_on("GPU")
 class TestBuild(unittest.TestCase):
     @pytest.mark.run_only_on("GPU")
@@ -47,12 +38,6 @@ def setUp(self):
             "hidden_size": 4096,
             "data": {"num_frames": 4},
         }
-        self.mock_weights = {
-            "model.embedding.word_embeddings.adapter_layer.mm_projector_adapter.mm_projector.weight": torch.randn(
-                4096, 768
-            ),
-            "model.embedding.word_embeddings.adapter_layer.mm_projector_adapter.mm_projector.bias": torch.randn(4096),
-        }
 
     @pytest.mark.run_only_on("GPU")
     def tearDown(self):
@@ -65,56 +50,6 @@ def tearDown(self):
                     os.rmdir(os.path.join(root, name))
             os.rmdir(self.temp_dir)
 
-    @pytest.mark.skipif(not HAVE_TRTLLM, reason="trtllm is not installed")
-    @pytest.mark.run_only_on("GPU")
-    @patch("nemo_export.multimodal.build.TensorRTLLM")
-    def test_build_trtllm_engine(self, mock_trtllm):
-        # Test basic functionality
-        mock_exporter = MagicMock()
-        mock_trtllm.return_value = mock_exporter
-
-        from nemo_export.multimodal.build import build_trtllm_engine
-
-        build_trtllm_engine(
-            model_dir=self.temp_dir,
-            visual_checkpoint_path="test_path",
-            model_type="neva",
-            tensor_parallelism_size=1,
-            max_input_len=256,
-            max_output_len=256,
-            max_batch_size=1,
-            max_multimodal_len=1024,
-            dtype="bfloat16",
-        )
-
-        mock_exporter.export.assert_called_once()
-
-    @pytest.mark.skipif(not HAVE_TRTLLM, reason="trtllm is not installed")
-    @pytest.mark.run_only_on("GPU")
-    @patch("nemo_export.multimodal.build.MLLaMAForCausalLM")
-    @patch("nemo_export.multimodal.build.build_trtllm")
-    def test_build_mllama_trtllm_engine(self, mock_build_trtllm, mock_mllama):
-        # Test basic functionality
-        mock_model = MagicMock()
-        mock_mllama.from_hugging_face.return_value = mock_model
-        mock_build_trtllm.return_value = MagicMock()
-
-        from nemo_export.multimodal.build import build_mllama_trtllm_engine
-
-        build_mllama_trtllm_engine(
-            model_dir=self.temp_dir,
-            hf_model_path="test_path",
-            tensor_parallelism_size=1,
-            max_input_len=256,
-            max_output_len=256,
-            max_batch_size=1,
-            max_multimodal_len=1024,
-            dtype="bfloat16",
-        )
-
-        mock_mllama.from_hugging_face.assert_called_once()
-        mock_build_trtllm.assert_called_once()
-
     @pytest.mark.run_only_on("GPU")
     @patch("nemo_export.multimodal.build.torch.onnx.export")
     @patch("nemo_export.multimodal.build.os.makedirs")
@@ -170,83 +105,6 @@ def test_build_trt_engine(self, mock_file, mock_rmtree, mock_trt_builder, mock_b
 
         mock_rmtree.assert_called_once()
 
-    @pytest.mark.run_only_on("GPU")
-    @patch("nemo_export.multimodal.build.build_trt_engine")
-    @patch("nemo_export.multimodal.build.export_visual_wrapper_onnx")
-    @patch("nemo_export.multimodal.build.AutoModel.from_pretrained")
-    @patch("nemo_export.multimodal.build.load_nemo_model")
-    @patch("nemo_export.multimodal.build.torch.cuda.is_available", return_value=True)
-    def test_build_neva_engine(
-        self,
-        mock_cuda,
-        mock_load_nemo,
-        mock_auto_model,
-        mock_export_onnx,
-        mock_build_trt,
-    ):
-        from nemo_export.multimodal.build import build_neva_engine
-
-        # Setup mocks
-        mock_load_nemo.return_value = (self.mock_weights, self.mock_config, None)
-
-        mock_encoder = MagicMock()
-        mock_encoder.vision_model = MagicMock()
-        mock_encoder.config.vision_config.image_size = 224
-        mock_encoder.config.torch_dtype = torch.bfloat16
-        mock_auto_model.return_value = mock_encoder
-
-        build_neva_engine(
-            model_type="neva",
-            model_dir=self.temp_dir,
-            visual_checkpoint_path="test_checkpoint.nemo",
-            vision_max_batch_size=1,
-        )
-
-        mock_load_nemo.assert_called_once()
-        mock_auto_model.assert_called_once()
-        mock_export_onnx.assert_called_once()
-        mock_build_trt.assert_called_once()
-
-    @pytest.mark.run_only_on("GPU")
-    @patch("nemo_export.multimodal.build.build_trt_engine")
-    @patch("nemo_export.multimodal.build.export_visual_wrapper_onnx")
-    @patch("nemo_export.multimodal.build.AutoModel.from_pretrained")
-    @patch("nemo_export.multimodal.build.tarfile.open")
-    @patch("nemo_export.multimodal.build.torch.cuda.is_available", return_value=True)
-    def test_build_video_neva_engine(self, mock_cuda, mock_tarfile, mock_auto_model, mock_export_onnx, mock_build_trt):
-        from nemo_export.multimodal.build import build_video_neva_engine
-
-        # Setup mocks
-        mock_tar = MagicMock()
-        mock_tarfile.return_value.__enter__.return_value = mock_tar
-        mock_tar.extractfile.side_effect = [
-            mock_open(
-                read_data="mm_cfg:\n  vision_encoder:\n    from_pretrained: test\n    hidden_size: 768\n  mm_mlp_adapter_type: linear\nhidden_size: 4096\ndata:\n  num_frames: 4"
-            )().read(),
-            self.mock_weights,
-        ]
-
-        mock_encoder = MagicMock()
-        mock_encoder.vision_model = MagicMock()
-        mock_encoder.config.vision_config.image_size = 224
-        mock_encoder.config.torch_dtype = torch.bfloat16
-        mock_auto_model.return_value = mock_encoder
-
-        with patch("nemo_export.multimodal.build.yaml.safe_load", return_value=self.mock_config):
-            with patch(
-                "nemo_export.multimodal.build.torch.load",
-                return_value=self.mock_weights,
-            ):
-                build_video_neva_engine(
-                    model_dir=self.temp_dir,
-                    visual_checkpoint_path="test_checkpoint.nemo",
-                    vision_max_batch_size=1,
-                )
-
-        mock_auto_model.assert_called_once()
-        mock_export_onnx.assert_called_once()
-        mock_build_trt.assert_called_once()
-
     @pytest.mark.run_only_on("GPU")
     @patch("nemo_export.multimodal.build.MultimodalEngineBuilder")
     @patch("nemo_export.multimodal.build.AutoProcessor.from_pretrained")
@@ -273,82 +131,6 @@ def test_build_mllama_visual_engine(self, mock_listdir, mock_copy, mock_processo
         mock_processor_instance.save_pretrained.assert_called_once()
         mock_builder_instance.build.assert_called_once()
 
-    @pytest.mark.run_only_on("GPU")
-    @patch("nemo_export.multimodal.build.build_neva_engine")
-    @patch("nemo_export.multimodal.build.build_video_neva_engine")
-    def test_build_visual_engine(self, mock_build_video_neva, mock_build_neva):
-        from nemo_export.multimodal.build import build_visual_engine
-
-        # Test neva model
-        build_visual_engine(
-            model_dir=self.temp_dir,
-            visual_checkpoint_path="test_path",
-            model_type="neva",
-            vision_max_batch_size=1,
-        )
-        mock_build_neva.assert_called_once()
-
-        # Test video-neva model
-        build_visual_engine(
-            model_dir=self.temp_dir,
-            visual_checkpoint_path="test_path",
-            model_type="video-neva",
-            vision_max_batch_size=1,
-        )
-        mock_build_video_neva.assert_called_once()
-
-        # Test invalid model type
-        with self.assertRaises(RuntimeError):
-            build_visual_engine(
-                model_dir=self.temp_dir,
-                visual_checkpoint_path="test_path",
-                model_type="invalid",
-                vision_max_batch_size=1,
-            )
-
-    @pytest.mark.run_only_on("GPU")
-    @patch("nemo_export.multimodal.build.tarfile.open")
-    @patch("nemo_export.multimodal.build.torch.save")
-    @patch("nemo_export.multimodal.build.torch.load")
-    @patch("nemo_export.multimodal.build.os.path.exists")
-    def test_extract_lora_ckpt(self, mock_exists, mock_torch_load, mock_torch_save, mock_tarfile):
-        from nemo_export.multimodal.build import extract_lora_ckpt
-
-        # Test with direct model_weights.ckpt
-        def mock_exists_side_effect(path):
-            return ("model_weights.ckpt" in path and "mp_rank_00" not in path) or "model_config.yaml" in path
-
-        mock_exists.side_effect = mock_exists_side_effect
-        mock_torch_load.return_value = self.mock_weights
-
-        result = extract_lora_ckpt("test_lora_path", self.temp_dir)
-
-        self.assertTrue(result.endswith("llm_lora.nemo"))
-        mock_torch_load.assert_called()
-        mock_torch_save.assert_called()
-
-    @pytest.mark.run_only_on("GPU")
-    @patch("nemo_export.multimodal.build.build_mllama_trtllm_engine")
-    @patch("nemo_export.multimodal.build.build_mllama_visual_engine")
-    @patch("nemo_export.multimodal.build.llm.export_ckpt")
-    def test_build_mllama_engine(self, mock_export_ckpt, mock_build_visual, mock_build_trtllm):
-        from nemo_export.multimodal.build import build_mllama_engine
-
-        build_mllama_engine(
-            model_dir=self.temp_dir,
-            checkpoint_path="test_checkpoint",
-            tensor_parallelism_size=1,
-            max_input_len=256,
-            max_output_len=256,
-            max_batch_size=1,
-            max_multimodal_len=1024,
-            dtype="bfloat16",
-        )
-
-        mock_export_ckpt.assert_called_once()
-        mock_build_visual.assert_called_once()
-        mock_build_trtllm.assert_called_once()
-
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/unit_tests/export/test_model_loading.py b/tests/unit_tests/export/test_model_loading.py
deleted file mode 100644
index b78883dbfc..0000000000
--- a/tests/unit_tests/export/test_model_loading.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import shutil
-from pathlib import Path
-from unittest.mock import MagicMock, patch
-
-import pytest
-from nemo.collections import llm
-
-HF_PATH = "/home/TestData/nlp/megatron_llama/llama-ci-hf"
-OUTPUT_PATH = "/tmp/imported_nemo2"
-
-dummy_module = MagicMock()
-dummy_module.torch_to_numpy = lambda torch_tensor: torch_tensor.detach().cpu().numpy()
-
-
-@pytest.mark.pleasefixme  # disabled since it required data
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_model_loading() -> None:
-    """
-    Test if model loading works for tensorrt_llm export.
-    """
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    model = llm.LlamaModel(config=llm.Llama2Config7B)
-    nemo_path = llm.import_ckpt(model, "hf://" + HF_PATH, output_path=Path(OUTPUT_PATH))
-
-    assert nemo_path.exists()
-    assert (nemo_path / "weights").exists()
-    assert (nemo_path / "context").exists()
-
-    export_path = Path("/tmp/trtllm_exported_model")
-    export_path.mkdir(parents=True, exist_ok=True)
-    export_path_mcore = export_path / "mcore_export"
-
-    with patch.dict(
-        "sys.modules",
-        {
-            "tensorrt_llm": dummy_module,
-            "tensorrt_llm._utils": dummy_module,
-        },
-    ):
-        from nemo_export.trt_llm.nemo_ckpt_loader.nemo_file import load_nemo_model
-
-        load_nemo_model(nemo_path, export_path_mcore)
-
-    shutil.rmtree(OUTPUT_PATH, ignore_errors=True)
diff --git a/tests/unit_tests/export/test_nemo_file.py b/tests/unit_tests/export/test_nemo_file.py
deleted file mode 100644
index 2a9db56ce7..0000000000
--- a/tests/unit_tests/export/test_nemo_file.py
+++ /dev/null
@@ -1,376 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pickle
-from unittest.mock import Mock, patch
-
-import pytest
-import torch
-import yaml
-
-from nemo_export.trt_llm.nemo_ckpt_loader.nemo_file import (
-    build_tokenizer,
-    get_model_type,
-    get_tokenizer,
-    get_weights_dtype,
-    load_distributed_model_weights,
-    load_extra_state_from_bytes,
-    load_nemo_config,
-    load_nemo_model,
-    rename_extra_states,
-    update_tokenizer_paths,
-)
-
-
-class TestLoadExtraStateFromBytes:
-    """Test cases for load_extra_state_from_bytes function."""
-
-    def test_load_extra_state_from_bytes_none(self):
-        """Test loading extra state from None."""
-        result = load_extra_state_from_bytes(None)
-        assert result is None
-
-    def test_load_extra_state_from_bytes_empty_tensor(self):
-        """Test loading extra state from empty tensor."""
-        empty_tensor = torch.tensor([])
-        result = load_extra_state_from_bytes(empty_tensor)
-        assert result is None
-
-    def test_load_extra_state_from_bytes_tensor(self):
-        """Test loading extra state from tensor."""
-        test_data = {"test_key": "test_value"}
-        serialized_data = pickle.dumps(test_data)
-        tensor_data = torch.tensor(list(serialized_data), dtype=torch.uint8)
-
-        result = load_extra_state_from_bytes(tensor_data)
-        assert result == test_data
-
-
-class TestRenameExtraStates:
-    """Test cases for rename_extra_states function."""
-
-    def test_rename_extra_states_no_extra_state(self):
-        """Test renaming with no extra state keys."""
-        state_dict = {"layer1.weight": torch.randn(10, 10)}
-        result = rename_extra_states(state_dict)
-        assert result == state_dict
-
-    def test_rename_extra_states_with_valid_keys(self):
-        """Test renaming with valid extra state keys."""
-        state_dict = {
-            "model.layers.attention._extra_state/shard_0_2": torch.randn(10),
-            "model.layers.attention._extra_state/shard_1_2": torch.randn(10),
-            "normal_layer.weight": torch.randn(10, 10),
-        }
-
-        result = rename_extra_states(state_dict)
-
-        # Check that normal layers are preserved
-        assert "normal_layer.weight" in result
-        # Check that extra states are renamed
-        assert "model.layers.0.attention._extra_state" in result
-        assert "model.layers.1.attention._extra_state" in result
-
-    def test_rename_extra_states_with_list_values(self):
-        """Test renaming with list values."""
-        state_dict = {
-            "model.layers.attention._extra_state/shard_0_2": [torch.randn(10)],
-            "normal_layer.weight": torch.randn(10, 10),
-        }
-
-        result = rename_extra_states(state_dict)
-        assert "model.layers.0.attention._extra_state" in result
-        assert isinstance(result["model.layers.0.attention._extra_state"], torch.Tensor)
-
-
-class TestUpdateTokenizerPaths:
-    """Test cases for update_tokenizer_paths function."""
-
-    def test_update_tokenizer_paths(self):
-        """Test updating tokenizer paths."""
-        tokenizer_config = {
-            "model": "/old/path/tokenizer.model",
-            "vocab_file": "/old/path/vocab.txt",
-            "merge_file": "/old/path/merges.txt",
-        }
-
-        mock_unpacked_dir = Mock()
-        mock_unpacked_dir.get_tokenizer_file_path.side_effect = lambda key, file_key, pattern: f"/new/path/{file_key}"
-
-        result = update_tokenizer_paths(tokenizer_config, mock_unpacked_dir)
-
-        assert result["model"] == "/new/path/model"
-        assert result["vocab_file"] == "/new/path/vocab_file"
-        assert result["merge_file"] == "/new/path/merge_file"
-
-
-class TestBuildTokenizer:
-    """Test cases for build_tokenizer function."""
-
-    def test_build_tokenizer_sentencepiece(self):
-        """Test building SentencePiece tokenizer."""
-        config = {"library": "sentencepiece", "model": "/path/to/tokenizer.model"}
-
-        with patch("nemo_export.trt_llm.nemo_ckpt_loader.nemo_file.SentencePieceTokenizer") as mock_sp:
-            mock_tokenizer = Mock()
-            mock_sp.return_value = mock_tokenizer
-
-            result = build_tokenizer(config)
-
-            mock_sp.assert_called_once_with(model_path="/path/to/tokenizer.model")
-            assert result == mock_tokenizer
-
-    def test_build_tokenizer_tiktoken(self):
-        """Test building Tiktoken tokenizer."""
-        config = {"library": "tiktoken", "vocab_file": "/path/to/vocab.json"}
-
-        with patch("nemo_export.trt_llm.nemo_ckpt_loader.nemo_file.TiktokenTokenizer") as mock_tiktoken:
-            mock_tokenizer = Mock()
-            mock_tiktoken.return_value = mock_tokenizer
-
-            result = build_tokenizer(config)
-
-            mock_tiktoken.assert_called_once_with(vocab_file="/path/to/vocab.json")
-            assert result == mock_tokenizer
-
-
-class TestLoadNemoConfig:
-    """Test cases for load_nemo_config function."""
-
-    def test_load_nemo_config_nemo2_structure(self, tmp_path):
-        """Test loading config from NeMo 2.0 structure."""
-        # Create NeMo 2.0 directory structure
-        nemo_dir = tmp_path / "nemo2_checkpoint"
-        weights_dir = nemo_dir / "weights"
-        context_dir = nemo_dir / "context"
-        weights_dir.mkdir(parents=True)
-        context_dir.mkdir(parents=True)
-
-        config_data = {"model_type": "llama", "hidden_size": 4096}
-        with open(context_dir / "model.yaml", "w") as f:
-            yaml.dump(config_data, f)
-
-        result = load_nemo_config(nemo_dir)
-        assert result == config_data
-
-
-class TestGetModelType:
-    """Test cases for get_model_type function."""
-
-    def test_get_model_type_nemo2_llama(self):
-        """Test getting model type for NeMo 2.0 Llama model."""
-        config = {"_target_": "nemo.collections.llm.gpt.model.llama.LlamaModel"}
-
-        with patch("nemo_export.trt_llm.nemo_ckpt_loader.nemo_file.load_nemo_config") as mock_load:
-            mock_load.return_value = config
-
-            result = get_model_type("/path/to/checkpoint")
-            assert result == "llama"
-
-    def test_get_model_type_nemo2_mistral(self):
-        """Test getting model type for NeMo 2.0 Mistral model."""
-        config = {"_target_": "nemo.collections.llm.gpt.model.mistral.MistralModel"}
-
-        with patch("nemo_export.trt_llm.nemo_ckpt_loader.nemo_file.load_nemo_config") as mock_load:
-            mock_load.return_value = config
-
-            result = get_model_type("/path/to/checkpoint")
-            assert result == "llama"
-
-    def test_get_model_type_nemo2_mixtral_vllm(self):
-        """Test getting model type for NeMo 2.0 Mixtral model with vLLM type."""
-        config = {"_target_": "nemo.collections.llm.gpt.model.mixtral.MixtralModel"}
-
-        with patch("nemo_export.trt_llm.nemo_ckpt_loader.nemo_file.load_nemo_config") as mock_load:
-            mock_load.return_value = config
-
-            result = get_model_type("/path/to/checkpoint", use_vllm_type=True)
-            assert result == "mixtral"
-
-    def test_get_model_type_unknown_model(self):
-        """Test getting model type for unknown model."""
-        config = {"_target_": "nemo.collections.llm.gpt.model.unknown.UnknownModel"}
-
-        with patch("nemo_export.trt_llm.nemo_ckpt_loader.nemo_file.load_nemo_config") as mock_load:
-            mock_load.return_value = config
-
-            with pytest.raises(KeyError):
-                get_model_type("/path/to/checkpoint")
-
-
-class TestGetWeightsDtype:
-    """Test cases for get_weights_dtype function."""
-
-    def test_get_weights_dtype_nemo2(self):
-        """Test getting weights dtype for NeMo 2.0 model."""
-        config = {
-            "_target_": "nemo.collections.llm.gpt.model.llama.LlamaModel",
-            "config": {"params_dtype": {"_target_": "torch.float16"}},
-        }
-
-        with patch("nemo_export.trt_llm.nemo_ckpt_loader.nemo_file.load_nemo_config") as mock_load:
-            mock_load.return_value = config
-
-            result = get_weights_dtype("/path/to/checkpoint")
-            assert result == "float16"
-
-    def test_get_weights_dtype_nemo1(self):
-        """Test getting weights dtype for NeMo 1.0 model."""
-        config = {"precision": "16-mixed"}
-
-        with patch("nemo_export.trt_llm.nemo_ckpt_loader.nemo_file.load_nemo_config") as mock_load:
-            mock_load.return_value = config
-
-            with patch("nemo_export.trt_llm.nemo_ckpt_loader.nemo_file.torch_dtype_from_precision") as mock_convert:
-                mock_convert.return_value = torch.float16
-
-                result = get_weights_dtype("/path/to/checkpoint")
-                assert result == "float16"
-
-    def test_get_weights_dtype_not_found(self):
-        """Test getting weights dtype when not found."""
-        config = {}
-
-        with patch("nemo_export.trt_llm.nemo_ckpt_loader.nemo_file.load_nemo_config") as mock_load:
-            mock_load.return_value = config
-
-            result = get_weights_dtype("/path/to/checkpoint")
-            assert result is None
-
-
-class TestLoadDistributedModelWeights:
-    """Test cases for load_distributed_model_weights function."""
-
-    def test_load_distributed_model_weights_torch_tensor(self):
-        """Test loading distributed model weights as torch tensors."""
-        mock_state_dict = {"layer1.weight": torch.randn(10, 10), "layer2.bias": torch.randn(10)}
-
-        with patch("nemo_export.trt_llm.nemo_ckpt_loader.nemo_file.load_model_weights") as mock_load:
-            mock_load.return_value = mock_state_dict
-
-            with patch("nemo_export.trt_llm.nemo_ckpt_loader.nemo_file.rename_extra_states") as mock_rename:
-                mock_rename.return_value = mock_state_dict
-
-                result = load_distributed_model_weights("/path/to/checkpoint")
-
-                assert result == mock_state_dict
-                mock_load.assert_called_once_with("/path/to/checkpoint", load_extra_states=True)
-
-
-class TestLoadNemoModel:
-    """Test cases for load_nemo_model function."""
-
-    def test_load_nemo_model_nemo2_structure(self, tmp_path):
-        """Test loading NeMo 2.0 model."""
-        nemo_ckpt = tmp_path / "nemo2_checkpoint"
-        nemo_ckpt.mkdir()
-        (nemo_ckpt / "weights").mkdir()
-        (nemo_ckpt / "context").mkdir()
-
-        export_dir = tmp_path / "export"
-        export_dir.mkdir()
-
-        config_data = {
-            "config": {
-                "activation_func": {"_target_": "torch.nn.functional.silu"},
-                "num_moe_experts": 8,
-                "add_bias_linear": True,
-            }
-        }
-
-        with open(nemo_ckpt / "context" / "model.yaml", "w") as f:
-            yaml.dump(config_data, f)
-
-        mock_state_dict = {"layer1.weight": torch.randn(10, 10)}
-
-        with patch(
-            "nemo_export.trt_llm.nemo_ckpt_loader.nemo_file.load_distributed_model_weights"
-        ) as mock_load_weights:
-            mock_load_weights.return_value = mock_state_dict
-
-            model, config, tokenizer = load_nemo_model(nemo_ckpt, export_dir)
-
-            assert model == mock_state_dict
-            assert config["activation"] == "fast-swiglu"
-            assert config["bias"] is True
-            assert config["num_moe_experts"] == 8
-
-    def test_load_nemo_model_nonexistent_path(self):
-        """Test loading model with nonexistent path."""
-        with pytest.raises(TypeError):
-            load_nemo_model("/nonexistent/path", "/export/path")
-
-
-class TestGetTokenizer:
-    """Test cases for get_tokenizer function."""
-
-    def test_get_tokenizer_nemo2_context(self, tmp_path):
-        """Test getting tokenizer from NeMo 2.0 context."""
-        tokenizer_dir = tmp_path / "tokenizer"
-        tokenizer_dir.mkdir()
-        (tokenizer_dir / "nemo_context").mkdir()
-
-        with patch("nemo_export.trt_llm.nemo_ckpt_loader.nemo_file.get_tokenizer_from_nemo2_context") as mock_get:
-            mock_tokenizer = Mock()
-            mock_get.return_value = mock_tokenizer
-
-            result = get_tokenizer(tokenizer_dir)
-
-            assert result == mock_tokenizer
-
-    def test_get_tokenizer_huggingface(self, tmp_path):
-        """Test getting HuggingFace tokenizer."""
-        tokenizer_dir = tmp_path / "tokenizer"
-        tokenizer_dir.mkdir()
-        (tokenizer_dir / "tokenizer_config.json").touch()
-
-        with patch("nemo_export.trt_llm.nemo_ckpt_loader.nemo_file.AutoTokenizer") as mock_auto:
-            mock_tokenizer = Mock()
-            mock_auto.from_pretrained.return_value = mock_tokenizer
-
-            result = get_tokenizer(tokenizer_dir)
-
-            assert result == mock_tokenizer
-
-    def test_get_tokenizer_tiktoken(self, tmp_path):
-        """Test getting Tiktoken tokenizer."""
-        tokenizer_dir = tmp_path / "tokenizer"
-        tokenizer_dir.mkdir()
-        (tokenizer_dir / "vocab.json").touch()
-
-        with patch("nemo_export.trt_llm.nemo_ckpt_loader.nemo_file.build_tokenizer") as mock_build:
-            mock_tokenizer = Mock()
-            mock_build.return_value = mock_tokenizer
-
-            result = get_tokenizer(tokenizer_dir)
-
-            assert result == mock_tokenizer
-
-    def test_get_tokenizer_sentencepiece(self, tmp_path):
-        """Test getting SentencePiece tokenizer."""
-        tokenizer_dir = tmp_path / "tokenizer"
-        tokenizer_dir.mkdir()
-        (tokenizer_dir / "tokenizer.model").touch()
-
-        with patch("nemo_export.trt_llm.nemo_ckpt_loader.nemo_file.build_tokenizer") as mock_build:
-            mock_tokenizer = Mock()
-            mock_build.return_value = mock_tokenizer
-
-            result = get_tokenizer(tokenizer_dir)
-
-            assert result == mock_tokenizer
-
-
-if __name__ == "__main__":
-    pytest.main([__file__])
diff --git a/tests/unit_tests/export/test_tensorrt_llm.py b/tests/unit_tests/export/test_tensorrt_llm.py
deleted file mode 100644
index 41b63e8505..0000000000
--- a/tests/unit_tests/export/test_tensorrt_llm.py
+++ /dev/null
@@ -1,844 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-import os
-from unittest.mock import (
-    mock_open,
-    patch,
-)
-
-import pytest
-import torch
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_initialization():
-    """Test TensorRTLLM class initialization with various parameters."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm import TensorRTLLM
-
-    # Test basic initialization
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-    assert trt_llm.model_dir == model_dir
-    assert trt_llm.engine_dir == os.path.join(model_dir, "trtllm_engine")
-    assert trt_llm.model is None
-    assert trt_llm.tokenizer is None
-    assert trt_llm.config is None
-
-    # Test initialization with lora checkpoints
-    lora_ckpt_list = ["/path/to/lora1", "/path/to/lora2"]
-    trt_llm = TensorRTLLM(model_dir=model_dir, lora_ckpt_list=lora_ckpt_list, load_model=False)
-    assert trt_llm.lora_ckpt_list == lora_ckpt_list
-
-    # Test initialization with python runtime options
-    trt_llm = TensorRTLLM(
-        model_dir=model_dir,
-        use_python_runtime=False,
-        enable_chunked_context=False,
-        max_tokens_in_paged_kv_cache=None,
-        load_model=False,
-    )
-    assert trt_llm.use_python_runtime is False
-    assert trt_llm.enable_chunked_context is False
-    assert trt_llm.max_tokens_in_paged_kv_cache is None
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_supported_models():
-    """Test supported models list for NeMo models."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm import TensorRTLLM
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-
-    # Test supported models list
-    supported_models = trt_llm.get_supported_models_list
-    assert isinstance(supported_models, list)
-    assert len(supported_models) > 0
-    assert all(isinstance(model, str) for model in supported_models)
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_supported_models():
-    """Test supported HF models list."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm_hf = TensorRTLLMHF(model_dir=model_dir, load_model=False)
-
-    # Test HF model mapping
-    hf_mapping = trt_llm_hf.get_supported_hf_model_mapping
-    assert isinstance(hf_mapping, dict)
-    assert len(hf_mapping) > 0
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hidden_size():
-    """Test hidden size property retrieval."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm import TensorRTLLM
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-
-    # Test hidden size property
-    hidden_size = trt_llm.get_hidden_size
-    if hidden_size is not None:
-        assert isinstance(hidden_size, int)
-        assert hidden_size > 0
-    else:
-        assert hidden_size is None
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_triton_io():
-    """Test Triton input/output configuration."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm import TensorRTLLM
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-
-    # Test Triton input configuration
-    triton_input = trt_llm.get_triton_input
-    assert isinstance(triton_input, tuple)
-    assert triton_input[0].name == "prompts"
-    assert triton_input[1].name == "max_output_len"
-    assert triton_input[2].name == "top_k"
-    assert triton_input[3].name == "top_p"
-    assert triton_input[4].name == "temperature"
-    assert triton_input[5].name == "random_seed"
-    assert triton_input[6].name == "stop_words_list"
-    assert triton_input[7].name == "bad_words_list"
-
-    # Test Triton output configuration
-    triton_output = trt_llm.get_triton_output
-    assert isinstance(triton_output, tuple)
-    assert triton_output[0].name == "outputs"
-    assert triton_output[1].name == "generation_logits"
-    assert triton_output[2].name == "context_logits"
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_pad_logits():
-    """Test logits padding functionality."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm import TensorRTLLM
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-
-    # Create a sample logits tensor
-    batch_size = 2
-    seq_len = 3
-    vocab_size = 1000
-    logits = torch.randn(batch_size, seq_len, vocab_size)
-
-    # Test padding logits
-    padded_logits = trt_llm._pad_logits(logits)
-    assert isinstance(padded_logits, torch.Tensor)
-    assert padded_logits.shape[0] == batch_size
-    assert padded_logits.shape[1] == seq_len
-    # Should be padded to a multiple of 8
-    assert padded_logits.shape[2] >= vocab_size
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_ray_infer_fn_basic():
-    """Test basic functionality of ray_infer_fn method."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm import TensorRTLLM
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-
-    # Mock the forward method
-    with patch.object(trt_llm, "forward") as mock_forward:
-        mock_forward.return_value = ["Generated text 1", "Generated text 2"]
-
-        inputs = {
-            "prompts": ["Hello", "World"],
-            "max_output_len": 256,
-            "temperature": 0.8,
-            "top_k": 50,
-            "top_p": 0.9,
-        }
-
-        result = trt_llm.ray_infer_fn(inputs)
-
-        # Verify the result structure
-        assert "sentences" in result
-        assert result["sentences"] == ["Generated text 1", "Generated text 2"]
-
-        # Verify forward was called with correct parameters
-        mock_forward.assert_called_once()
-        call_kwargs = mock_forward.call_args[1]
-        assert call_kwargs["input_texts"] == ["Hello", "World"]
-        assert call_kwargs["max_output_len"] == 256
-        assert call_kwargs["temperature"] == 0.8
-        assert call_kwargs["top_k"] == 50
-        assert call_kwargs["top_p"] == 0.9
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_ray_infer_fn_with_single_string_prompt():
-    """Test ray_infer_fn method with a single string prompt (not in a list)."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm import TensorRTLLM
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-
-    # Mock the forward method
-    with patch.object(trt_llm, "forward") as mock_forward:
-        mock_forward.return_value = ["Generated response"]
-
-        inputs = {
-            "prompts": "Hello world",  # Single string instead of list
-            "temperature": 1.0,
-        }
-
-        result = trt_llm.ray_infer_fn(inputs)
-
-        # Verify the result
-        assert result["sentences"] == ["Generated response"]
-
-        # Verify forward was called with prompts converted to list
-        mock_forward.assert_called_once()
-        call_kwargs = mock_forward.call_args[1]
-        assert call_kwargs["input_texts"] == ["Hello world"]
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_ray_infer_fn_with_stop_words():
-    """Test ray_infer_fn method with stop words list."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm import TensorRTLLM
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-
-    # Mock the forward method
-    with patch.object(trt_llm, "forward") as mock_forward:
-        mock_forward.return_value = ["Generated text"]
-
-        inputs = {
-            "prompts": ["Test prompt"],
-            "stop_words_list": ["stop", "end"],
-            "bad_words_list": ["bad", "word"],
-        }
-
-        result = trt_llm.ray_infer_fn(inputs)
-
-        # Verify the result
-        assert result["sentences"] == ["Generated text"]
-
-        # Verify forward was called with properly formatted word lists
-        mock_forward.assert_called_once()
-        call_kwargs = mock_forward.call_args[1]
-        assert call_kwargs["stop_words_list"] == [["stop"], ["end"]]
-        assert call_kwargs["bad_words_list"] == [["bad"], ["word"]]
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_ray_infer_fn_with_and_lora():
-    """Test ray_infer_fn method with task IDs and LoRA UIDs."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm import TensorRTLLM
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-
-    # Mock the forward method
-    with patch.object(trt_llm, "forward") as mock_forward:
-        mock_forward.return_value = ["Generated text with LoRA"]
-
-        inputs = {
-            "prompts": ["Test prompt"],
-            "lora_uids": ["lora_uid_1"],
-            "random_seed": 42,
-        }
-
-        result = trt_llm.ray_infer_fn(inputs)
-
-        # Verify the result
-        assert result["sentences"] == ["Generated text with LoRA"]
-
-        # Verify forward was called with all parameters
-        mock_forward.assert_called_once()
-        call_kwargs = mock_forward.call_args[1]
-        assert call_kwargs["lora_uids"] == ["lora_uid_1"]
-        assert call_kwargs["random_seed"] == 42
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_ray_infer_fn_empty_prompts():
-    """Test ray_infer_fn method with empty prompts."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm import TensorRTLLM
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-
-    # Mock the forward method
-    with patch.object(trt_llm, "forward") as mock_forward:
-        mock_forward.return_value = []
-
-        inputs = {}  # No prompts provided
-
-        result = trt_llm.ray_infer_fn(inputs)
-
-        # Verify the result
-        assert result["sentences"] == []
-
-        # Verify forward was called with empty input_texts
-        mock_forward.assert_called_once()
-        call_kwargs = mock_forward.call_args[1]
-        assert call_kwargs["input_texts"] == []
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_ray_infer_fn_error_handling():
-    """Test ray_infer_fn method error handling."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm import TensorRTLLM
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-
-    # Mock the forward method to raise an exception
-    with patch.object(trt_llm, "forward") as mock_forward:
-        mock_forward.side_effect = Exception("Model inference failed")
-
-        inputs = {
-            "prompts": ["Test prompt 1", "Test prompt 2"],
-        }
-
-        result = trt_llm.ray_infer_fn(inputs)
-
-        # Verify error handling
-        assert "sentences" in result
-        assert "error" in result
-        # Should match number of prompts
-        assert len(result["sentences"]) == 2
-        assert all("An error occurred" in sentence for sentence in result["sentences"])
-        assert "Model inference failed" in result["error"]
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_ray_infer_fn_all_parameters():
-    """Test ray_infer_fn method with all possible parameters."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm import TensorRTLLM
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-
-    # Mock the forward method
-    with patch.object(trt_llm, "forward") as mock_forward:
-        mock_forward.return_value = ["Comprehensive test response"]
-
-        inputs = {
-            "prompts": ["Comprehensive test prompt"],
-            "max_output_len": 512,
-            "top_k": 50,
-            "top_p": 0.9,
-            "temperature": 0.7,
-            "random_seed": 123,
-            "stop_words_list": [["stop"], ["end"]],  # Already in correct format
-            "bad_words_list": [["bad"], ["inappropriate"]],  # Already in correct format
-            "lora_uids": ["comprehensive_lora"],
-            "output_log_probs": True,
-        }
-
-        result = trt_llm.ray_infer_fn(inputs)
-
-        # Verify the result
-        assert result["sentences"] == ["Comprehensive test response"]
-
-        # Verify forward was called with all parameters
-        mock_forward.assert_called_once()
-        call_kwargs = mock_forward.call_args[1]
-        expected_params = [
-            "input_texts",
-            "max_output_len",
-            "top_k",
-            "top_p",
-            "temperature",
-            "random_seed",
-            "stop_words_list",
-            "bad_words_list",
-            "lora_uids",
-            "output_log_probs",
-        ]
-
-        for param in expected_params:
-            assert param in call_kwargs, f"Parameter {param} not found in forward call"
-
-        # Verify specific values
-        assert call_kwargs["input_texts"] == ["Comprehensive test prompt"]
-        assert call_kwargs["max_output_len"] == 512
-        assert call_kwargs["top_k"] == 50
-        assert call_kwargs["top_p"] == 0.9
-        assert call_kwargs["temperature"] == 0.7
-        assert call_kwargs["random_seed"] == 123
-        assert call_kwargs["stop_words_list"] == [["stop"], ["end"]]
-        assert call_kwargs["bad_words_list"] == [["bad"], ["inappropriate"]]
-        assert call_kwargs["lora_uids"] == ["comprehensive_lora"]
-        assert call_kwargs["output_log_probs"] is True
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test__infer_fn_basic():
-    """Test basic functionality of _infer_fn method."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm import TensorRTLLM
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-
-    # Mock the forward method
-    with patch.object(trt_llm, "forward") as mock_forward:
-        mock_forward.return_value = ["Generated text 1", "Generated text 2"]
-
-        prompts = ["Hello", "World"]
-        inputs = {
-            "max_output_len": 256,
-            "temperature": 0.8,
-            "top_k": 50,
-            "top_p": 0.9,
-        }
-
-        result = trt_llm._infer_fn(prompts, inputs)
-
-        # Verify the result
-        assert result == ["Generated text 1", "Generated text 2"]
-
-        # Verify forward was called with correct parameters
-        mock_forward.assert_called_once()
-        call_kwargs = mock_forward.call_args[1]
-        assert call_kwargs["input_texts"] == ["Hello", "World"]
-        assert call_kwargs["max_output_len"] == 256
-        assert call_kwargs["temperature"] == 0.8
-        assert call_kwargs["top_k"] == 50
-        assert call_kwargs["top_p"] == 0.9
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test__infer_fn_with_stop_words():
-    """Test _infer_fn method with stop words and bad words processing."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm import TensorRTLLM
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-
-    # Mock the forward method
-    with patch.object(trt_llm, "forward") as mock_forward:
-        mock_forward.return_value = ["Generated text"]
-
-        prompts = ["Test prompt"]
-        inputs = {
-            "stop_words_list": ["stop", "end"],  # String format
-            "bad_words_list": ["bad", "word"],  # String format
-        }
-
-        result = trt_llm._infer_fn(prompts, inputs)
-
-        # Verify the result
-        assert result == ["Generated text"]
-
-        # Verify forward was called with properly formatted word lists
-        mock_forward.assert_called_once()
-        call_kwargs = mock_forward.call_args[1]
-        assert call_kwargs["input_texts"] == ["Test prompt"]
-        assert call_kwargs["stop_words_list"] == [["stop"], ["end"]]
-        assert call_kwargs["bad_words_list"] == [["bad"], ["word"]]
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test__infer_fn_with_preformatted_word_lists():
-    """Test _infer_fn method with already properly formatted word lists."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm import TensorRTLLM
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-
-    # Mock the forward method
-    with patch.object(trt_llm, "forward") as mock_forward:
-        mock_forward.return_value = ["Generated text"]
-
-        prompts = ["Test prompt"]
-        inputs = {
-            "stop_words_list": [["stop"], ["end"]],  # Already in correct format
-            "bad_words_list": [["bad"], ["word"]],  # Already in correct format
-        }
-
-        result = trt_llm._infer_fn(prompts, inputs)
-
-        # Verify the result
-        assert result == ["Generated text"]
-
-        # Verify forward was called with word lists unchanged
-        mock_forward.assert_called_once()
-        call_kwargs = mock_forward.call_args[1]
-        assert call_kwargs["input_texts"] == ["Test prompt"]
-        assert call_kwargs["stop_words_list"] == [["stop"], ["end"]]
-        assert call_kwargs["bad_words_list"] == [["bad"], ["word"]]
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test__infer_fn_with_all_parameters():
-    """Test _infer_fn method with all possible parameters."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm import TensorRTLLM
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-
-    # Mock the forward method
-    with patch.object(trt_llm, "forward") as mock_forward:
-        mock_forward.return_value = ["Comprehensive test response"]
-
-        prompts = ["Comprehensive test prompt"]
-        inputs = {
-            "max_output_len": 512,
-            "top_k": 50,
-            "top_p": 0.9,
-            "temperature": 0.7,
-            "random_seed": 123,
-            "stop_words_list": ["stop", "end"],
-            "bad_words_list": ["bad", "inappropriate"],
-            "lora_uids": ["comprehensive_lora"],
-            "output_log_probs": True,
-        }
-
-        result = trt_llm._infer_fn(prompts, inputs)
-
-        # Verify the result
-        assert result == ["Comprehensive test response"]
-
-        # Verify forward was called with all parameters
-        mock_forward.assert_called_once()
-        call_kwargs = mock_forward.call_args[1]
-        expected_params = [
-            "input_texts",
-            "max_output_len",
-            "top_k",
-            "top_p",
-            "temperature",
-            "random_seed",
-            "stop_words_list",
-            "bad_words_list",
-            "lora_uids",
-            "output_log_probs",
-        ]
-
-        for param in expected_params:
-            assert param in call_kwargs, f"Parameter {param} not found in forward call"
-
-        # Verify specific values
-        assert call_kwargs["input_texts"] == ["Comprehensive test prompt"]
-        assert call_kwargs["max_output_len"] == 512
-        assert call_kwargs["top_k"] == 50
-        assert call_kwargs["top_p"] == 0.9
-        assert call_kwargs["temperature"] == 0.7
-        assert call_kwargs["random_seed"] == 123
-        assert call_kwargs["stop_words_list"] == [["stop"], ["end"]]
-        assert call_kwargs["bad_words_list"] == [["bad"], ["inappropriate"]]
-        assert call_kwargs["lora_uids"] == ["comprehensive_lora"]
-        assert call_kwargs["output_log_probs"] is True
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test__infer_fn_empty_inputs():
-    """Test _infer_fn method with minimal inputs."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm import TensorRTLLM
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-
-    # Mock the forward method
-    with patch.object(trt_llm, "forward") as mock_forward:
-        mock_forward.return_value = ["Basic response"]
-
-        prompts = ["Basic prompt"]
-        inputs = {}  # No additional inputs
-
-        result = trt_llm._infer_fn(prompts, inputs)
-
-        # Verify the result
-        assert result == ["Basic response"]
-
-        # Verify forward was called with just input_texts
-        mock_forward.assert_called_once()
-        call_kwargs = mock_forward.call_args[1]
-        assert call_kwargs["input_texts"] == ["Basic prompt"]
-        # Should only have input_texts, no other parameters
-        assert len(call_kwargs) == 1
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_forward_without_model():
-    """Test forward pass when model is not loaded."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm import TensorRTLLM
-
-    trt_llm = TensorRTLLM(model_dir="/tmp/test_model", load_model=False)
-
-    with pytest.raises(Exception) as exc_info:
-        trt_llm.forward(
-            input_texts=["Hello"],
-            max_output_len=128,
-            top_k=50,
-            top_p=0.9,
-            temperature=0.7,
-            stop_words_list=["stop"],
-            bad_words_list=["bad"],
-            output_log_probs=True,
-        )
-
-    assert "A nemo checkpoint should be exported" in str(exc_info.value)
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_unload_engine():
-    """Test engine unloading functionality."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm import TensorRTLLM
-
-    trt_llm = TensorRTLLM(model_dir="/tmp/test_model")
-
-    # Mock the unload_engine function
-    with patch("nemo_export.tensorrt_llm.unload_engine") as mock_unload:
-        trt_llm.unload_engine()
-        mock_unload.assert_called_once()
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_get_hf_model_type():
-    """Test getting model type from HF config."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    trt_llm_hf = TensorRTLLMHF(model_dir="/tmp/test_model", load_model=False)
-
-    # Mock AutoConfig
-    with patch("transformers.AutoConfig.from_pretrained") as mock_config:
-        mock_config.return_value.architectures = ["LlamaForCausalLM"]
-        model_type = trt_llm_hf.get_hf_model_type("/tmp/model")
-        assert model_type == "LlamaForCausalLM"
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_get_hf_model_type_ambiguous():
-    """Test getting model type with ambiguous architecture."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    trt_llm_hf = TensorRTLLMHF(model_dir="/tmp/test_model", load_model=False)
-
-    # Mock AutoConfig with multiple architectures
-    with patch("transformers.AutoConfig.from_pretrained") as mock_config:
-        mock_config.return_value.architectures = ["Model1", "Model2"]
-        with pytest.raises(ValueError) as exc_info:
-            trt_llm_hf.get_hf_model_type("/tmp/model")
-        assert "Ambiguous architecture choice" in str(exc_info.value)
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_get_hf_model_dtype():
-    """Test getting model dtype from HF config."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    trt_llm_hf = TensorRTLLMHF(model_dir="/tmp/test_model", load_model=False)
-
-    # Mock config file reading
-    mock_config = {
-        "torch_dtype": "float16",
-        "fp16": True,
-        "bf16": False,
-    }
-
-    with (
-        patch("pathlib.Path.exists", return_value=True),
-        patch("builtins.open", mock_open(read_data=json.dumps(mock_config))),
-    ):
-        dtype = trt_llm_hf.get_hf_model_dtype("/tmp/model")
-        assert dtype == "float16"
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_get_hf_model_dtype_not_found():
-    """Test getting model dtype when config file doesn't exist."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    trt_llm_hf = TensorRTLLMHF(model_dir="/tmp/test_model", load_model=False)
-
-    with patch("pathlib.Path.exists", return_value=False):
-        with pytest.raises(FileNotFoundError) as exc_info:
-            trt_llm_hf.get_hf_model_dtype("/tmp/model")
-        assert "Config file not found" in str(exc_info.value)
diff --git a/tests/unit_tests/export/test_tensorrt_llm_hf.py b/tests/unit_tests/export/test_tensorrt_llm_hf.py
deleted file mode 100644
index d78b820169..0000000000
--- a/tests/unit_tests/export/test_tensorrt_llm_hf.py
+++ /dev/null
@@ -1,640 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-import os
-from unittest.mock import (
-    MagicMock,
-    mock_open,
-    patch,
-)
-
-import pytest
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_initialization():
-    """Test TensorRTLLMHF class initialization with various parameters."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    # Test basic initialization
-    model_dir = "/tmp/test_hf_model_dir"
-    trt_llm_hf = TensorRTLLMHF(model_dir=model_dir, load_model=False)
-    assert trt_llm_hf.model_dir == model_dir
-    assert trt_llm_hf.engine_dir == os.path.join(model_dir, "trtllm_engine")
-    assert trt_llm_hf.model is None
-    assert trt_llm_hf.tokenizer is None
-    assert trt_llm_hf.config is None
-
-    # Test initialization with lora checkpoints
-    lora_ckpt_list = ["/path/to/hf_lora1", "/path/to/hf_lora2"]
-    trt_llm_hf = TensorRTLLMHF(model_dir=model_dir, lora_ckpt_list=lora_ckpt_list, load_model=False)
-    assert trt_llm_hf.lora_ckpt_list == lora_ckpt_list
-
-    # Test initialization with python runtime options
-    trt_llm_hf = TensorRTLLMHF(
-        model_dir=model_dir,
-        use_python_runtime=False,
-        enable_chunked_context=True,
-        max_tokens_in_paged_kv_cache=2048,
-        multi_block_mode=True,
-        load_model=False,
-    )
-    assert trt_llm_hf.use_python_runtime is False
-    assert trt_llm_hf.enable_chunked_context is True
-    assert trt_llm_hf.max_tokens_in_paged_kv_cache == 2048
-    assert trt_llm_hf.multi_block_mode is True
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_get_hf_model_type():
-    """Test getting model type from HF config."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    trt_llm_hf = TensorRTLLMHF(model_dir="/tmp/test_model", load_model=False)
-
-    # Test with LlamaForCausalLM architecture
-    with patch("transformers.AutoConfig.from_pretrained") as mock_config:
-        mock_config.return_value.architectures = ["LlamaForCausalLM"]
-        model_type = trt_llm_hf.get_hf_model_type("/tmp/model")
-        assert model_type == "LlamaForCausalLM"
-
-    # Test with different model architectures
-    test_architectures = [
-        "GPT2LMHeadModel",
-        "MistralForCausalLM",
-        "Phi3ForCausalLM",
-        "QWenForCausalLM",
-    ]
-
-    for arch in test_architectures:
-        with patch("transformers.AutoConfig.from_pretrained") as mock_config:
-            mock_config.return_value.architectures = [arch]
-            model_type = trt_llm_hf.get_hf_model_type("/tmp/model")
-            assert model_type == arch
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_get_hf_model_type_ambiguous():
-    """Test getting model type with ambiguous architecture."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    trt_llm_hf = TensorRTLLMHF(model_dir="/tmp/test_model", load_model=False)
-
-    # Test with multiple architectures
-    with patch("transformers.AutoConfig.from_pretrained") as mock_config:
-        mock_config.return_value.architectures = ["Model1", "Model2"]
-        with pytest.raises(ValueError) as exc_info:
-            trt_llm_hf.get_hf_model_type("/tmp/model")
-        assert "Ambiguous architecture choice" in str(exc_info.value)
-
-    # Test with empty architectures list
-    with patch("transformers.AutoConfig.from_pretrained") as mock_config:
-        mock_config.return_value.architectures = []
-        with pytest.raises(ValueError) as exc_info:
-            trt_llm_hf.get_hf_model_type("/tmp/model")
-        assert "Ambiguous architecture choice" in str(exc_info.value)
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_get_hf_model_dtype_torch_dtype():
-    """Test getting model dtype from HF config with torch_dtype field."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    trt_llm_hf = TensorRTLLMHF(model_dir="/tmp/test_model", load_model=False)
-
-    # Test with torch_dtype field
-    mock_config = {"torch_dtype": "float16"}
-
-    with (
-        patch("pathlib.Path.exists", return_value=True),
-        patch("builtins.open", mock_open(read_data=json.dumps(mock_config))),
-    ):
-        dtype = trt_llm_hf.get_hf_model_dtype("/tmp/model")
-        assert dtype == "float16"
-
-    # Test with bfloat16
-    mock_config = {"torch_dtype": "bfloat16"}
-
-    with (
-        patch("pathlib.Path.exists", return_value=True),
-        patch("builtins.open", mock_open(read_data=json.dumps(mock_config))),
-    ):
-        dtype = trt_llm_hf.get_hf_model_dtype("/tmp/model")
-        assert dtype == "bfloat16"
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_get_hf_model_dtype_fp16_bf16_flags():
-    """Test getting model dtype from HF config with fp16/bf16 flags."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    trt_llm_hf = TensorRTLLMHF(model_dir="/tmp/test_model", load_model=False)
-
-    # Test with fp16 flag
-    mock_config = {"fp16": True, "bf16": False}
-
-    with (
-        patch("pathlib.Path.exists", return_value=True),
-        patch("builtins.open", mock_open(read_data=json.dumps(mock_config))),
-    ):
-        dtype = trt_llm_hf.get_hf_model_dtype("/tmp/model")
-        assert dtype == "float16"
-
-    # Test with bf16 flag
-    mock_config = {"fp16": False, "bf16": True}
-
-    with (
-        patch("pathlib.Path.exists", return_value=True),
-        patch("builtins.open", mock_open(read_data=json.dumps(mock_config))),
-    ):
-        dtype = trt_llm_hf.get_hf_model_dtype("/tmp/model")
-        assert dtype == "bfloat16"
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_get_hf_model_dtype_direct_dtype_field():
-    """Test getting model dtype from HF config with direct dtype field."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    trt_llm_hf = TensorRTLLMHF(model_dir="/tmp/test_model", load_model=False)
-
-    # Test with direct dtype field
-    mock_config = {"dtype": "float32"}
-
-    with (
-        patch("pathlib.Path.exists", return_value=True),
-        patch("builtins.open", mock_open(read_data=json.dumps(mock_config))),
-    ):
-        dtype = trt_llm_hf.get_hf_model_dtype("/tmp/model")
-        assert dtype == "float32"
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_get_hf_model_dtype_pretrained_config():
-    """Test getting model dtype from HF config with pretrained_config field."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    trt_llm_hf = TensorRTLLMHF(model_dir="/tmp/test_model", load_model=False)
-
-    # Test with pretrained_config field
-    mock_config = {"pretrained_config": {"dtype": "float16"}}
-
-    with (
-        patch("pathlib.Path.exists", return_value=True),
-        patch("builtins.open", mock_open(read_data=json.dumps(mock_config))),
-    ):
-        dtype = trt_llm_hf.get_hf_model_dtype("/tmp/model")
-        assert dtype == "float16"
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_get_hf_model_dtype_not_found():
-    """Test getting model dtype when config file doesn't exist."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    trt_llm_hf = TensorRTLLMHF(model_dir="/tmp/test_model", load_model=False)
-
-    with patch("pathlib.Path.exists", return_value=False):
-        with pytest.raises(FileNotFoundError) as exc_info:
-            trt_llm_hf.get_hf_model_dtype("/tmp/model")
-        assert "Config file not found" in str(exc_info.value)
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_get_hf_model_dtype_no_dtype():
-    """Test getting model dtype when no dtype information is available."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    trt_llm_hf = TensorRTLLMHF(model_dir="/tmp/test_model", load_model=False)
-
-    # Test with config that has no dtype information
-    mock_config = {"model_type": "llama"}
-
-    with (
-        patch("pathlib.Path.exists", return_value=True),
-        patch("builtins.open", mock_open(read_data=json.dumps(mock_config))),
-    ):
-        dtype = trt_llm_hf.get_hf_model_dtype("/tmp/model")
-        assert dtype is None
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_get_hf_model_dtype_invalid_json():
-    """Test getting model dtype with invalid JSON in config file."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    trt_llm_hf = TensorRTLLMHF(model_dir="/tmp/test_model", load_model=False)
-
-    with (
-        patch("pathlib.Path.exists", return_value=True),
-        patch("builtins.open", mock_open(read_data="invalid json {")),
-    ):
-        with pytest.raises(ValueError) as exc_info:
-            trt_llm_hf.get_hf_model_dtype("/tmp/model")
-        assert "Invalid JSON in config file" in str(exc_info.value)
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_supported_models():
-    """Test supported HF models mapping."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm_hf = TensorRTLLMHF(model_dir=model_dir, load_model=False)
-
-    # Test HF model mapping
-    hf_mapping = trt_llm_hf.get_supported_hf_model_mapping
-    assert isinstance(hf_mapping, dict)
-    assert len(hf_mapping) > 0
-
-    # Test specific model mappings
-    expected_models = [
-        "LlamaForCausalLM",
-        "MistralForCausalLM",
-        "GPT2LMHeadModel",
-        "Phi3ForCausalLM",
-        "QWenForCausalLM",
-        "GEMMA",
-        "FalconForCausalLM",
-        "MambaForCausalLM",
-    ]
-
-    for model in expected_models:
-        assert model in hf_mapping, f"Model {model} not found in supported HF models"
-
-    # Verify all values are valid TensorRT-LLM model classes
-    for key, value in hf_mapping.items():
-        assert value is not None
-        assert hasattr(value, "__name__")
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_export_hf_model_unsupported_model():
-    """Test exporting an unsupported HF model type."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    trt_llm_hf = TensorRTLLMHF(model_dir="/tmp/test_model", load_model=False)
-
-    with (
-        patch.object(trt_llm_hf, "get_hf_model_type", return_value="UnsupportedModel"),
-        pytest.raises(ValueError) as exc_info,
-    ):
-        trt_llm_hf.export_hf_model(hf_model_path="/tmp/hf_model", model_type="UnsupportedModel")
-
-    assert "is not currently a supported model type" in str(exc_info.value)
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_export_hf_model_no_dtype():
-    """Test exporting HF model when dtype cannot be determined."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    trt_llm_hf = TensorRTLLMHF(model_dir="/tmp/test_model", load_model=False)
-
-    with (
-        patch.object(trt_llm_hf, "get_hf_model_type", return_value="LlamaForCausalLM"),
-        patch.object(trt_llm_hf, "get_hf_model_dtype", return_value=None),
-        pytest.raises(ValueError) as exc_info,
-    ):
-        trt_llm_hf.export_hf_model(hf_model_path="/tmp/hf_model")
-
-    assert "No dtype found in hf model config" in str(exc_info.value)
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_export_hf_model_basic():
-    """Test basic HF model export functionality."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    trt_llm_hf = TensorRTLLMHF(model_dir="/tmp/test_model", load_model=False)
-
-    mock_model = MagicMock()
-    mock_engine = MagicMock()
-
-    with (
-        patch.object(trt_llm_hf, "get_hf_model_type", return_value="LlamaForCausalLM"),
-        patch.object(trt_llm_hf, "get_hf_model_dtype", return_value="float16"),
-        patch("nemo_export.tensorrt_llm_hf.prepare_directory_for_export"),
-        patch("nemo_export.tensorrt_llm_hf.build_trtllm", return_value=mock_engine),
-        patch("nemo_export.tensorrt_llm_hf.LLaMAForCausalLM.from_hugging_face", return_value=mock_model),
-        patch("glob.glob", return_value=[]),
-        patch.object(trt_llm_hf, "_load"),
-    ):
-        trt_llm_hf.export_hf_model(
-            hf_model_path="/tmp/hf_model",
-            max_batch_size=8,
-            tensor_parallelism_size=1,
-            max_input_len=256,
-            max_output_len=256,
-        )
-
-        # Verify engine was saved
-        mock_engine.save.assert_called_once()
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_export_hf_model_with_params():
-    """Test HF model export with various parameters."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    trt_llm_hf = TensorRTLLMHF(model_dir="/tmp/test_model", load_model=False)
-
-    mock_model = MagicMock()
-    mock_engine = MagicMock()
-
-    with (
-        patch.object(trt_llm_hf, "get_hf_model_type", return_value="MistralForCausalLM"),
-        patch.object(trt_llm_hf, "get_hf_model_dtype", return_value="bfloat16"),
-        patch("nemo_export.tensorrt_llm_hf.prepare_directory_for_export"),
-        patch("nemo_export.tensorrt_llm_hf.build_trtllm", return_value=mock_engine),
-        patch("nemo_export.tensorrt_llm_hf.LLaMAForCausalLM.from_hugging_face", return_value=mock_model),
-        patch("glob.glob", return_value=[]),
-        patch.object(trt_llm_hf, "_load"),
-    ):
-        trt_llm_hf.export_hf_model(
-            hf_model_path="/tmp/hf_model",
-            max_batch_size=16,
-            tensor_parallelism_size=2,
-            max_input_len=512,
-            max_output_len=512,
-            dtype="bfloat16",
-            gemm_plugin="auto",
-            remove_input_padding=True,
-            use_paged_context_fmha=True,
-            paged_kv_cache=True,
-            tokens_per_block=64,
-            multiple_profiles=True,
-            reduce_fusion=True,
-            max_beam_width=4,
-            use_refit=True,
-        )
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_export_hf_model_batch_size_adjustment():
-    """Test HF model export with batch size < 4 gets adjusted to 4."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    trt_llm_hf = TensorRTLLMHF(model_dir="/tmp/test_model", load_model=False)
-
-    mock_model = MagicMock()
-    mock_engine = MagicMock()
-
-    with (
-        patch.object(trt_llm_hf, "get_hf_model_type", return_value="LlamaForCausalLM"),
-        patch.object(trt_llm_hf, "get_hf_model_dtype", return_value="float16"),
-        patch("nemo_export.tensorrt_llm_hf.prepare_directory_for_export"),
-        patch("nemo_export.tensorrt_llm_hf.build_trtllm", return_value=mock_engine),
-        patch("nemo_export.tensorrt_llm_hf.LLaMAForCausalLM.from_hugging_face", return_value=mock_model),
-        patch("glob.glob", return_value=[]),
-        patch.object(trt_llm_hf, "_load"),
-        patch("builtins.print") as mock_print,
-    ):
-        trt_llm_hf.export_hf_model(
-            hf_model_path="/tmp/hf_model",
-            max_batch_size=2,  # Less than 4
-        )
-
-        # Verify warning was printed
-        mock_print.assert_called_once()
-        assert "Force set to 4" in str(mock_print.call_args)
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_export_hf_model_multi_rank():
-    """Test HF model export with multiple ranks (tensor parallelism)."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    trt_llm_hf = TensorRTLLMHF(model_dir="/tmp/test_model", load_model=False)
-
-    mock_model = MagicMock()
-    mock_engine = MagicMock()
-
-    with (
-        patch.object(trt_llm_hf, "get_hf_model_type", return_value="LlamaForCausalLM"),
-        patch.object(trt_llm_hf, "get_hf_model_dtype", return_value="float16"),
-        patch("nemo_export.tensorrt_llm_hf.prepare_directory_for_export"),
-        patch("nemo_export.tensorrt_llm_hf.build_trtllm", return_value=mock_engine),
-        patch("nemo_export.tensorrt_llm_hf.LLaMAForCausalLM.from_hugging_face", return_value=mock_model),
-        patch("glob.glob", return_value=[]),
-        patch.object(trt_llm_hf, "_load"),
-    ):
-        trt_llm_hf.export_hf_model(
-            hf_model_path="/tmp/hf_model",
-            tensor_parallelism_size=4,  # Test with 4 ranks
-        )
-
-        # Verify engine was saved 4 times (once per rank)
-        assert mock_engine.save.call_count == 4
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_export_copies_tokenizer_files():
-    """Test that HF model export copies tokenizer files."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    trt_llm_hf = TensorRTLLMHF(model_dir="/tmp/test_model", load_model=False)
-
-    mock_model = MagicMock()
-    mock_engine = MagicMock()
-
-    with (
-        patch.object(trt_llm_hf, "get_hf_model_type", return_value="LlamaForCausalLM"),
-        patch.object(trt_llm_hf, "get_hf_model_dtype", return_value="float16"),
-        patch("nemo_export.tensorrt_llm_hf.prepare_directory_for_export"),
-        patch("nemo_export.tensorrt_llm_hf.build_trtllm", return_value=mock_engine),
-        patch("nemo_export.tensorrt_llm_hf.LLaMAForCausalLM.from_hugging_face", return_value=mock_model),
-        patch(
-            "glob.glob",
-            side_effect=lambda x: ["/tmp/hf_model/tokenizer.json"]
-            if "*.json" in x
-            else ["/tmp/hf_model/tokenizer.model"],
-        ),
-        patch("shutil.copy"),
-        patch.object(trt_llm_hf, "_load"),
-    ):
-        trt_llm_hf.export_hf_model(hf_model_path="/tmp/hf_model")
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_inherits_parent_methods():
-    """Test that TensorRTLLMHF inherits methods from TensorRTLLM."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-    trt_llm_hf = TensorRTLLMHF(model_dir="/tmp/test_model", load_model=False)
-
-    # Verify inherited methods exist
-    assert hasattr(trt_llm_hf, "forward")
-    assert hasattr(trt_llm_hf, "_infer_fn")
-    assert hasattr(trt_llm_hf, "ray_infer_fn")
-    assert hasattr(trt_llm_hf, "unload_engine")
-    assert hasattr(trt_llm_hf, "_load")
-    assert hasattr(trt_llm_hf, "get_triton_input")
-    assert hasattr(trt_llm_hf, "get_triton_output")
-    assert hasattr(trt_llm_hf, "_pad_logits")
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_tensorrt_llm_hf_unavailable_error():
-    """Test that TensorRTLLMHF raises UnavailableError when TensorRT-LLM is not installed."""
-    try:
-        import tensorrt_llm  # noqa: F401
-
-        pytest.skip("TensorRT-LLM is installed, skipping unavailable test")
-    except ImportError:
-        pass
-
-    from nemo_export_deploy_common.import_utils import UnavailableError
-
-    # Mock HAVE_TENSORRT_LLM to be False
-    with patch("nemo_export.tensorrt_llm_hf.HAVE_TENSORRT_LLM", False):
-        from nemo_export.tensorrt_llm_hf import TensorRTLLMHF
-
-        with pytest.raises(UnavailableError):
-            TensorRTLLMHF(model_dir="/tmp/test_model", load_model=False)
diff --git a/tests/unit_tests/export/test_tensorrt_llm_run.py b/tests/unit_tests/export/test_tensorrt_llm_run.py
deleted file mode 100644
index 6b5733f6c7..0000000000
--- a/tests/unit_tests/export/test_tensorrt_llm_run.py
+++ /dev/null
@@ -1,294 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from unittest.mock import (
-    MagicMock,
-)
-
-import numpy as np
-import pytest
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_to_word_list_format_basic():
-    """Test basic functionality of to_word_list_format function."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.trt_llm.tensorrt_llm_run import to_word_list_format
-
-    # Create a mock tokenizer
-    mock_tokenizer = MagicMock()
-    mock_tokenizer.encode.side_effect = lambda x: {
-        "<extra_id_1>": [100],
-        "<extra_id_1>hello": [100, 200],
-        "<extra_id_1>world": [100, 300],
-        "hello": [200],
-        "world": [300],
-    }.get(x, [])
-
-    # Test basic functionality
-    word_dict = [["hello,world"]]
-    result = to_word_list_format(word_dict, tokenizer=mock_tokenizer)
-
-    # Check result shape and format
-    assert result.shape[0] == 1  # batch_size
-    assert result.shape[1] == 2  # flat_ids and offsets
-    assert result.dtype == np.int32
-
-    # Check that the function processed the CSV format correctly
-    flat_ids = result[0, 0]
-
-    # Should have tokens for "hello" and "world"
-    assert 200 in flat_ids  # token for "hello"
-    assert 300 in flat_ids  # token for "world"
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_to_word_list_format_multiple_batches():
-    """Test to_word_list_format with multiple batches."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.trt_llm.tensorrt_llm_run import to_word_list_format
-
-    # Create a mock tokenizer
-    mock_tokenizer = MagicMock()
-    mock_tokenizer.encode.side_effect = lambda x: {
-        "<extra_id_1>": [100],
-        "<extra_id_1>hello": [100, 200],
-        "<extra_id_1>world": [100, 300],
-        "<extra_id_1>foo": [100, 400],
-        "<extra_id_1>bar": [100, 500],
-        "hello": [200],
-        "world": [300],
-        "foo": [400],
-        "bar": [500],
-    }.get(x, [])
-
-    # Test with multiple batches
-    word_dict = [["hello,world"], ["foo,bar"]]
-    result = to_word_list_format(word_dict, tokenizer=mock_tokenizer)
-
-    # Check result shape
-    assert result.shape[0] == 2  # batch_size = 2
-    assert result.shape[1] == 2  # flat_ids and offsets
-    assert result.dtype == np.int32
-
-    # Check first batch
-    flat_ids_0 = result[0, 0]
-    assert 200 in flat_ids_0  # token for "hello"
-    assert 300 in flat_ids_0  # token for "world"
-
-    # Check second batch
-    flat_ids_1 = result[1, 0]
-    assert 400 in flat_ids_1  # token for "foo"
-    assert 500 in flat_ids_1  # token for "bar"
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_to_word_list_format_bytes_input():
-    """Test to_word_list_format with bytes input."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.trt_llm.tensorrt_llm_run import to_word_list_format
-
-    # Create a mock tokenizer
-    mock_tokenizer = MagicMock()
-    mock_tokenizer.encode.side_effect = lambda x: {
-        "<extra_id_1>": [100],
-        "<extra_id_1>hello": [100, 200],
-        "hello": [200],
-    }.get(x, [])
-
-    # Test with bytes input
-    word_dict = [[b"hello"]]
-    result = to_word_list_format(word_dict, tokenizer=mock_tokenizer)
-
-    # Check that bytes were properly decoded and processed
-    assert result.shape[0] == 1  # batch_size
-    assert result.shape[1] == 2  # flat_ids and offsets
-    assert result.dtype == np.int32
-
-    flat_ids = result[0, 0]
-    assert 200 in flat_ids  # token for "hello"
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_to_word_list_format_empty_words():
-    """Test to_word_list_format with empty words."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.trt_llm.tensorrt_llm_run import to_word_list_format
-
-    # Create a mock tokenizer that returns empty list for empty string
-    mock_tokenizer = MagicMock()
-    mock_tokenizer.encode.side_effect = lambda x: {
-        "<extra_id_1>": [100],
-        "<extra_id_1>": [100],  # Empty word after prefix
-        "": [],  # Empty string
-    }.get(x, [])
-
-    # Test with empty words
-    word_dict = [["hello,"]]  # This will create "hello" and empty string
-    result = to_word_list_format(word_dict, tokenizer=mock_tokenizer)
-
-    # Should still work and handle empty words gracefully
-    assert result.shape[0] == 1  # batch_size
-    assert result.shape[1] == 2  # flat_ids and offsets
-    assert result.dtype == np.int32
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_to_word_list_format_custom_ref_string():
-    """Test to_word_list_format with custom reference string."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.trt_llm.tensorrt_llm_run import to_word_list_format
-
-    # Create a mock tokenizer
-    mock_tokenizer = MagicMock()
-    mock_tokenizer.encode.side_effect = lambda x: {
-        "<custom_ref>": [999],
-        "<custom_ref>hello": [999, 200],
-        "hello": [200],
-    }.get(x, [])
-
-    # Test with custom reference string
-    word_dict = [["hello"]]
-    result = to_word_list_format(word_dict, tokenizer=mock_tokenizer, ref_str="<custom_ref>")
-
-    # Check that custom ref string was used
-    assert result.shape[0] == 1  # batch_size
-    assert result.shape[1] == 2  # flat_ids and offsets
-    assert result.dtype == np.int32
-
-    flat_ids = result[0, 0]
-    assert 200 in flat_ids  # token for "hello"
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_to_word_list_format_prefix_merge_fallback():
-    """Test to_word_list_format fallback when prefix merges with word."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.trt_llm.tensorrt_llm_run import to_word_list_format
-
-    # Create a mock tokenizer that simulates prefix merging
-    mock_tokenizer = MagicMock()
-    mock_tokenizer.encode.side_effect = lambda x: {
-        "<extra_id_1>": [100],
-        "<extra_id_1>hello": [888],  # Merged token, different from [100, 200]
-        "hello": [200],  # Fallback encoding
-    }.get(x, [])
-
-    # Test with prefix merge scenario
-    word_dict = [["hello"]]
-    result = to_word_list_format(word_dict, tokenizer=mock_tokenizer)
-
-    # Should use fallback encoding when prefix merges
-    assert result.shape[0] == 1  # batch_size
-    assert result.shape[1] == 2  # flat_ids and offsets
-    assert result.dtype == np.int32
-
-    flat_ids = result[0, 0]
-    assert 200 in flat_ids  # Should use fallback token for "hello"
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_to_word_list_format_no_tokenizer():
-    """Test to_word_list_format raises error when no tokenizer is provided."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.trt_llm.tensorrt_llm_run import to_word_list_format
-
-    # Test that function raises assertion error when no tokenizer is provided
-    word_dict = [["hello"]]
-    with pytest.raises(AssertionError, match="need to set tokenizer"):
-        to_word_list_format(word_dict, tokenizer=None)
-
-
-@pytest.mark.run_only_on("GPU")
-@pytest.mark.unit
-def test_to_word_list_format_padding():
-    """Test to_word_list_format padding behavior."""
-    try:
-        import tensorrt_llm  # noqa: F401
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    from nemo_export.trt_llm.tensorrt_llm_run import to_word_list_format
-
-    # Create a mock tokenizer with different length tokens
-    mock_tokenizer = MagicMock()
-    mock_tokenizer.encode.side_effect = lambda x: {
-        "<extra_id_1>": [100],
-        "<extra_id_1>short": [100, 200],
-        "<extra_id_1>verylongword": [100, 300, 301, 302, 303],
-        "short": [200],
-        "verylongword": [300, 301, 302, 303],
-    }.get(x, [])
-
-    # Test with words of different lengths
-    word_dict = [["short"], ["verylongword"]]
-    result = to_word_list_format(word_dict, tokenizer=mock_tokenizer)
-
-    # Check that padding was applied correctly
-    assert result.shape[0] == 2  # batch_size
-    assert result.shape[1] == 2  # flat_ids and offsets
-    assert result.shape[2] == 4  # Should be padded to max length (4 tokens for "verylongword")
-    assert result.dtype == np.int32
-
-    # Check that shorter sequences are padded with zeros
-    flat_ids_0 = result[0, 0]
-    assert 200 in flat_ids_0  # token for "short"
-    assert 0 in flat_ids_0  # Should have padding zeros
-
-    # Check that offsets are padded with -1
-    offsets_0 = result[0, 1]
-    assert -1 in offsets_0  # Should have padding -1s
diff --git a/tests/unit_tests/export/test_tensorrt_mm_exporter.py b/tests/unit_tests/export/test_tensorrt_mm_exporter.py
deleted file mode 100644
index bef56da08a..0000000000
--- a/tests/unit_tests/export/test_tensorrt_mm_exporter.py
+++ /dev/null
@@ -1,471 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from unittest.mock import Mock, patch
-
-import numpy as np
-import pytest
-
-
-@pytest.fixture
-def model_dir(tmp_path):
-    return str(tmp_path / "model_dir")
-
-
-@pytest.fixture
-def mock_runner():
-    runner = Mock()
-    runner.model_type = "neva"
-    runner.load_test_media = Mock(return_value=np.zeros((1, 224, 224, 3)))
-    runner.run = Mock(return_value="Test response")
-    return runner
-
-
-@pytest.fixture
-def mock_trtllm_runner():
-    runner = Mock()
-    runner.model_type = "mllama"
-    runner.args = Mock()
-    runner.load_test_data = Mock(return_value=np.zeros((1, 224, 224, 3)))
-    runner.run = Mock(return_value=["", "Test response"])
-    return runner
-
-
-try:
-    import tensorrt_llm  # noqa: F401
-
-    HAVE_TRTLLM = True
-except ImportError:
-    HAVE_TRTLLM = False
-
-
-@pytest.mark.skipif(not HAVE_TRTLLM, reason="Skipping TensorRTMMExporter tests due to lack of trtllm")
-class TestTensorRTMMExporter:
-    @pytest.mark.run_only_on("GPU")
-    def test_init(self, model_dir):
-        # Test basic initialization
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        assert exporter.model_dir == model_dir
-        assert exporter.runner is None
-        assert exporter.modality == "vision"
-
-    @pytest.mark.run_only_on("GPU")
-    def test_init_invalid_modality(self, model_dir):
-        # Test initialization with invalid modality
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        with pytest.raises(AssertionError):
-            TensorRTMMExporter(model_dir, modality="invalid")
-
-    @pytest.mark.run_only_on("GPU")
-    @patch("nemo_export.tensorrt_mm_exporter.build_mllama_engine")
-    def test_export_mllama(self, mock_build, model_dir):
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        exporter.export(
-            visual_checkpoint_path="dummy/path",
-            model_type="mllama",
-            tensor_parallel_size=1,
-            load_model=False,
-        )
-        mock_build.assert_called_once()
-
-    @pytest.mark.run_only_on("GPU")
-    @patch("nemo_export.tensorrt_mm_exporter.build_trtllm_engine")
-    @patch("nemo_export.tensorrt_mm_exporter.build_visual_engine")
-    def test_export_neva(self, mock_visual, mock_trtllm, model_dir):
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        exporter.export(
-            visual_checkpoint_path="dummy/path",
-            model_type="neva",
-            tensor_parallel_size=1,
-            load_model=False,
-        )
-        mock_trtllm.assert_called_once()
-        mock_visual.assert_called_once()
-
-    @pytest.mark.run_only_on("GPU")
-    @patch("nemo_export.tensorrt_mm_exporter.build_trtllm_engine")
-    @patch("nemo_export.tensorrt_mm_exporter.build_visual_engine")
-    @patch("nemo_export.tensorrt_mm_exporter.extract_lora_ckpt")
-    @patch("os.path.isdir")
-    def test_export_with_lora(self, mock_isdir, mock_extract, mock_visual, mock_trtllm, model_dir):
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        # Mock the LoRA path handling
-        mock_isdir.return_value = True  # Treat as directory
-        mock_extract.return_value = "dummy/lora/ckpt"
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        exporter.export(
-            visual_checkpoint_path="dummy/path",
-            model_type="neva",
-            tensor_parallel_size=1,
-            load_model=False,
-            lora_checkpoint_path="dummy/lora/path",
-            use_lora_plugin="lora_plugin",
-            lora_target_modules=["q_proj", "v_proj"],
-            max_lora_rank=32,
-        )
-        mock_trtllm.assert_called_once()
-        mock_visual.assert_called_once()
-        mock_extract.assert_called_once()
-
-    @pytest.mark.run_only_on("GPU")
-    @patch("nemo_export.tensorrt_mm_exporter.build_trtllm_engine")
-    @patch("nemo_export.tensorrt_mm_exporter.build_visual_engine")
-    @patch("nemo_export.tensorrt_mm_exporter.extract_lora_ckpt")
-    @patch("os.path.isdir")
-    def test_export_with_lora_directory(self, mock_isdir, mock_extract, mock_visual, mock_trtllm, model_dir):
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        # Mock the LoRA path handling - treat as directory
-        mock_isdir.return_value = True  # Treat as directory
-        mock_extract.return_value = "dummy/lora/ckpt"
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        exporter.export(
-            visual_checkpoint_path="dummy/path",
-            model_type="neva",
-            tensor_parallel_size=1,
-            load_model=False,
-            lora_checkpoint_path="dummy/lora/dir",
-            use_lora_plugin="lora_plugin",
-            lora_target_modules=["q_proj", "v_proj"],
-            max_lora_rank=32,
-        )
-        mock_trtllm.assert_called_once()
-        mock_visual.assert_called_once()
-        mock_extract.assert_called_once()
-
-    @pytest.mark.run_only_on("GPU")
-    @patch("nemo_export.tensorrt_mm_exporter.build_trtllm_engine")
-    @patch("nemo_export.tensorrt_mm_exporter.build_visual_engine")
-    @patch("os.path.isdir")
-    def test_export_with_lora_not_directory(self, mock_isdir, mock_visual, mock_trtllm, model_dir):
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        # Mock the LoRA path handling - treat as file (not directory)
-        mock_isdir.return_value = False
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        with pytest.raises(ValueError, match="lora_checkpoint_path in nemo1 is not supported. It must be a directory"):
-            exporter.export(
-                visual_checkpoint_path="dummy/path",
-                model_type="neva",
-                tensor_parallel_size=1,
-                load_model=False,
-                lora_checkpoint_path="dummy/lora/file.tar",
-                use_lora_plugin="lora_plugin",
-                lora_target_modules=["q_proj", "v_proj"],
-                max_lora_rank=32,
-            )
-
-    @pytest.mark.run_only_on("GPU")
-    @patch("nemo_export.tensorrt_mm_exporter.build_trtllm_engine")
-    @patch("nemo_export.tensorrt_mm_exporter.build_visual_engine")
-    def test_export_vila(self, mock_visual, mock_trtllm, model_dir):
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        exporter.export(
-            visual_checkpoint_path="dummy/path",
-            model_type="vila",
-            tensor_parallel_size=1,
-            load_model=False,
-        )
-        mock_trtllm.assert_called_once()
-        mock_visual.assert_called_once()
-
-    @pytest.mark.run_only_on("GPU")
-    @patch("nemo_export.tensorrt_mm_exporter.build_trtllm_engine")
-    @patch("nemo_export.tensorrt_mm_exporter.build_visual_engine")
-    def test_export_video_neva(self, mock_visual, mock_trtllm, model_dir):
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        exporter.export(
-            visual_checkpoint_path="dummy/path",
-            model_type="video-neva",
-            tensor_parallel_size=1,
-            load_model=False,
-        )
-        mock_trtllm.assert_called_once()
-        mock_visual.assert_called_once()
-
-    @pytest.mark.run_only_on("GPU")
-    @patch("nemo_export.tensorrt_mm_exporter.build_trtllm_engine")
-    @patch("nemo_export.tensorrt_mm_exporter.build_visual_engine")
-    def test_export_lita(self, mock_visual, mock_trtllm, model_dir):
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        exporter.export(
-            visual_checkpoint_path="dummy/path",
-            model_type="lita",
-            tensor_parallel_size=1,
-            load_model=False,
-        )
-        mock_trtllm.assert_called_once()
-        mock_visual.assert_called_once()
-
-    @pytest.mark.run_only_on("GPU")
-    @patch("nemo_export.tensorrt_mm_exporter.build_trtllm_engine")
-    @patch("nemo_export.tensorrt_mm_exporter.build_visual_engine")
-    def test_export_vita(self, mock_visual, mock_trtllm, model_dir):
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        exporter.export(
-            visual_checkpoint_path="dummy/path",
-            model_type="vita",
-            tensor_parallel_size=1,
-            load_model=False,
-        )
-        mock_trtllm.assert_called_once()
-        mock_visual.assert_called_once()
-
-    @pytest.mark.run_only_on("GPU")
-    def test_forward_without_loading(self, model_dir):
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        with pytest.raises(Exception) as exc_info:
-            exporter.forward("test prompt", "test_image.jpg")
-        assert "should be exported and" in str(exc_info.value)
-
-    @pytest.mark.run_only_on("GPU")
-    def test_forward(self, model_dir, mock_runner):
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        exporter.runner = mock_runner
-
-        result = exporter.forward(
-            input_text="What's in this image?",
-            input_media="test_image.jpg",
-            batch_size=1,
-            max_output_len=30,
-        )
-
-        assert result == "Test response"
-        mock_runner.load_test_media.assert_called_once()
-        mock_runner.run.assert_called_once()
-
-    @pytest.mark.run_only_on("GPU")
-    @patch("nemo_export.tensorrt_mm_exporter.isinstance")
-    def test_forward_with_trtllm_runner(self, mock_isinstance, model_dir):
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        # Create a mock runner
-        mock_runner = Mock()
-        mock_runner.model_type = "mllama"
-        mock_runner.args = Mock()
-        mock_runner.load_test_data = Mock(return_value=np.zeros((1, 224, 224, 3)))
-        mock_runner.run = Mock(return_value=["", "Test response"])
-
-        # Make isinstance return True for TRTLLMRunner check
-        mock_isinstance.return_value = True
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        exporter.runner = mock_runner
-
-        result = exporter.forward(
-            input_text="What's in this image?",
-            input_media="test_image.jpg",
-            batch_size=2,
-            max_output_len=50,
-            top_k=5,
-            top_p=0.9,
-            temperature=0.7,
-            repetition_penalty=1.2,
-            num_beams=4,
-        )
-
-        assert result == "Test response"
-        assert mock_runner.args.image_path == "test_image.jpg"
-        assert mock_runner.args.batch_size == 2
-        assert mock_runner.args.top_k == 5
-        assert mock_runner.args.top_p == 0.9
-        assert mock_runner.args.temperature == 0.7
-        assert mock_runner.args.repetition_penalty == 1.2
-        assert mock_runner.args.num_beams == 4
-        mock_runner.load_test_data.assert_called_once_with("test_image.jpg")
-        mock_runner.run.assert_called_once()
-
-    @pytest.mark.run_only_on("GPU")
-    def test_get_triton_input(self, model_dir):
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        inputs = exporter.get_triton_input
-
-        # Verify we have the expected number of inputs
-        assert len(inputs) == 10  # 1 text input + 1 media input + 8 optional parameters
-
-        # Verify the first input is for text
-        assert inputs[0].name == "input_text"
-        assert inputs[0].dtype == bytes
-
-    @pytest.mark.run_only_on("GPU")
-    def test_get_triton_output(self, model_dir):
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        outputs = exporter.get_triton_output
-
-        assert len(outputs) == 1
-        assert outputs[0].name == "outputs"
-        assert outputs[0].dtype == bytes
-
-    @pytest.mark.run_only_on("GPU")
-    def test_forward_with_all_params(self, model_dir, mock_runner):
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        exporter.runner = mock_runner
-
-        result = exporter.forward(
-            input_text="What's in this image?",
-            input_media="test_image.jpg",
-            batch_size=2,
-            max_output_len=50,
-            top_k=5,
-            top_p=0.9,
-            temperature=0.7,
-            repetition_penalty=1.2,
-            num_beams=4,
-            lora_uids=["lora1", "lora2"],
-        )
-
-        assert result == "Test response"
-        mock_runner.load_test_media.assert_called_once()
-        mock_runner.run.assert_called_once_with(
-            "What's in this image?",
-            mock_runner.load_test_media.return_value,
-            50,
-            2,
-            5,
-            0.9,
-            0.7,
-            1.2,
-            4,
-            ["lora1", "lora2"],
-        )
-
-    @pytest.mark.run_only_on("GPU")
-    def test_get_input_media_tensors_vision(self, model_dir):
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False, modality="vision")
-        tensors = exporter.get_input_media_tensors()
-
-        assert len(tensors) == 1
-        assert tensors[0].name == "input_media"
-        assert tensors[0].shape == (-1, -1, -1, 3)
-        assert tensors[0].dtype == np.uint8
-
-    @pytest.mark.run_only_on("GPU")
-    def test_get_input_media_tensors_audio(self, model_dir):
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False, modality="audio")
-        tensors = exporter.get_input_media_tensors()
-
-        assert len(tensors) == 0
-
-    @pytest.mark.run_only_on("GPU")
-    def test_export_with_invalid_model_type(self, model_dir):
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        with pytest.raises(Exception):
-            exporter.export(
-                visual_checkpoint_path="dummy/path",
-                model_type="invalid_model_type",
-                tensor_parallel_size=1,
-                load_model=False,
-            )
-
-    @pytest.mark.run_only_on("GPU")
-    def test_export_with_existing_files(self, model_dir):
-        import os
-
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        # Create some files in the model directory
-        os.makedirs(model_dir, exist_ok=True)
-        with open(os.path.join(model_dir, "test.txt"), "w") as f:
-            f.write("test")
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        with pytest.raises(Exception) as exc_info:
-            exporter.export(
-                visual_checkpoint_path="dummy/path",
-                model_type="neva",
-                tensor_parallel_size=1,
-                load_model=False,
-                delete_existing_files=False,
-            )
-        assert "There are files in this folder" in str(exc_info.value)
-
-    @pytest.mark.run_only_on("GPU")
-    @patch("os.path.exists")
-    def test_load_no_llm_dir(self, mock_exists, model_dir):
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        mock_exists.return_value = False
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        exporter._load()
-        assert exporter.runner is None
-
-    @pytest.mark.run_only_on("GPU")
-    @patch("os.path.exists")
-    @patch("builtins.open", create=True)
-    @patch("json.load")
-    def test_load_mllama_model(self, mock_json_load, mock_open, mock_exists, model_dir):
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        mock_exists.return_value = True
-        mock_json_load.return_value = {"builder_config": {"model_type": "mllama"}}
-        mock_open.return_value.__enter__ = lambda x: x
-        mock_open.return_value.__exit__ = lambda x, y, z, w: None
-
-        with patch("nemo_export.tensorrt_mm_exporter.TRTLLMRunner") as mock_trtllm_runner:
-            exporter = TensorRTMMExporter(model_dir, load_model=False)
-            exporter._load()
-            mock_trtllm_runner.assert_called_once()
-
-    @pytest.mark.run_only_on("GPU")
-    @patch("os.path.exists")
-    @patch("builtins.open", create=True)
-    @patch("json.load")
-    def test_load_other_model(self, mock_json_load, mock_open, mock_exists, model_dir):
-        from nemo_export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        mock_exists.return_value = True
-        mock_json_load.return_value = {"builder_config": {"model_type": "neva"}}
-        mock_open.return_value.__enter__ = lambda x: x
-        mock_open.return_value.__exit__ = lambda x, y, z, w: None
-
-        with patch("nemo_export.tensorrt_mm_exporter.MultimodalModelRunner") as mock_multimodal_runner:
-            exporter = TensorRTMMExporter(model_dir, load_model=False)
-            exporter._load()
-            mock_multimodal_runner.assert_called_once()