This repository was archived by the owner on Nov 19, 2025. It is now read-only.
Merged
4 changes: 3 additions & 1 deletion docs/user-guide/aligner-algo-header.rst
@@ -1,4 +1,6 @@
.. important::
Before starting this tutorial, be sure to review the :ref:`introduction <nemo-aligner-getting-started>` for tips on setting up your NeMo-Aligner environment.

If you run into any problems, refer to NeMo's `Known Issues page <https://docs.nvidia.com/nemo-framework/user-guide/latest/knownissues.html>`__. The page enumerates known issues and provides suggested workarounds where appropriate.

After completing this tutorial, refer to the :ref:`evaluation documentation <nemo-aligner-eval>` for tips on evaluating a trained model.
39 changes: 39 additions & 0 deletions docs/user-guide/evaluation.rst
@@ -0,0 +1,39 @@
.. include:: /content/nemo.rsts

.. _nemo-aligner-eval:

Evaluate a Trained Model
@@@@@@@@@@@@@@@@@@@@@@@@

After training a model, you may want to run evaluation to understand how the model performs on unseen tasks. You can use Eleuther AI's `Language Model Evaluation Harness <https://github.com/EleutherAI/lm-evaluation-harness>`_
to quickly run a variety of popular benchmarks, including MMLU, SuperGLUE, HellaSwag, and WinoGrande.
A full list of supported tasks can be found `here <https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/README.md>`_.

Install the LM Evaluation Harness
#################################

Run the following commands inside a NeMo container to install the LM Evaluation Harness:

.. code-block:: bash

git clone --depth 1 https://github.com/EleutherAI/lm-evaluation-harness
cd lm-evaluation-harness
pip install -e .


Run Evaluations
###############

A detailed description of running evaluation with ``.nemo`` models can be found in Eleuther AI's `documentation <https://github.com/EleutherAI/lm-evaluation-harness?tab=readme-ov-file#nvidia-nemo-models>`_.
Single- and multi-GPU evaluation is supported. The following example uses 8 GPUs to evaluate a ``.nemo`` file from NeMo-Aligner on the ``lambada_openai``, ``super-glue-lm-eval-v1``, and ``winogrande`` tasks.
Note that unzipping the ``.nemo`` file before running evaluation is recommended, but not required.

.. code-block:: bash

mkdir unzipped_checkpoint
tar -xvf /path/to/model.nemo -C unzipped_checkpoint

torchrun --nproc-per-node=8 --no-python lm_eval --model nemo_lm \
--model_args path='unzipped_checkpoint',devices=8,tensor_model_parallel_size=8 \
--tasks lambada_openai,super-glue-lm-eval-v1,winogrande \
--batch_size 8
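The ``devices`` and ``tensor_model_parallel_size`` settings must multiply out to the number of ranks that ``torchrun`` launches. A quick sanity check of that arithmetic (an illustrative helper, not part of the harness):

```python
def required_gpus(tensor_parallel: int, pipeline_parallel: int, data_parallel: int = 1) -> int:
    """World size needed for a Megatron-style parallel layout."""
    return tensor_parallel * pipeline_parallel * data_parallel

# The example above uses TP=8 on one node, with no pipeline or data parallelism.
print(required_gpus(8, 1))  # 8
```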
2 changes: 1 addition & 1 deletion examples/nlp/data/sft/remove_long_dialogues.py
@@ -25,7 +25,7 @@
Usage:
python3 remove_long_dialogues.py \
--tokenizer_path <PATH TO TOKENIZER MODEL> \
-        --tokenizer_type sentencepiece
+        --tokenizer_type sentencepiece \
--dataset_file <PATH TO DATASET TO PREPROCESS> \
--output_file <WHERE TO SAVE PREPROCESSED DATASET> \
--seq_len <MAX_SEQ_LEN TO USE DURING TRAINING>
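The fix above adds the line continuation that the usage string was missing. The script itself filters out dialogues that would exceed the training sequence length; a minimal sketch of that idea (hypothetical helper, using whitespace splitting in place of the SentencePiece tokenizer the real script loads):

```python
def remove_long_dialogues(records, max_seq_len, tokenize=str.split):
    """Drop records whose tokenized 'text' exceeds max_seq_len tokens.

    `tokenize` defaults to whitespace splitting purely for illustration;
    the real script tokenizes with the model passed via --tokenizer_path.
    """
    return [r for r in records if len(tokenize(r["text"])) <= max_seq_len]

data = [{"text": "a short dialogue"}, {"text": "word " * 100}]
print(len(remove_long_dialogues(data, max_seq_len=16)))  # 1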
@@ -72,7 +72,7 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_
required_keys.update(("tokens", "position_ids"))

if parallel_state.is_pipeline_last_stage():
-        required_keys.update(("labels", "loss_mask"))
+        required_keys.update(("labels", "loss_mask", "topk_logits", "topk_token_ids"))

batch = {key: val.cuda(non_blocking=True) if key in required_keys else None for key, val in batch.items()}

@@ -83,7 +83,9 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_

tokens = batch["tokens"]
labels = batch["labels"]
-    loss_mask = batch["loss_mask"].clamp(min=0, max=1)
+    loss_mask = batch["loss_mask"]
+    if loss_mask is not None:
+        loss_mask = loss_mask.clamp(min=0, max=1)
target_topk_logits = batch["topk_logits"]
target_topk_token_ids = batch["topk_token_ids"]
# Model forward pass
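This change guards against ``loss_mask`` being ``None``: with pipeline parallelism, keys such as ``labels`` and ``loss_mask`` are materialized only on the last stage, so the batch on intermediate stages carries ``None`` and calling ``.clamp`` on it would raise. A minimal sketch of the guard pattern (plain Python standing in for the tensor ``clamp``):

```python
def normalize_loss_mask(loss_mask):
    """Clamp mask entries into [0, 1]; pass None through untouched.

    Non-last pipeline stages never receive last-stage-only keys, so
    the batch carries None for them and clamping would fail.
    """
    if loss_mask is None:
        return None
    return [min(max(v, 0), 1) for v in loss_mask]

print(normalize_loss_mask([2, 1, 0, -1]))  # [1, 1, 0, 0]
print(normalize_loss_mask(None))           # None
```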
2 changes: 1 addition & 1 deletion tests/functional/kd.sh
@@ -83,7 +83,7 @@ torchrun --nproc-per-node 2 ${GPFS}/examples/nlp/gpt/train_gpt_knowledge_distill
exp_manager.create_checkpoint_callback=False \
model.data.num_workers=2 \
++model.tensor_model_parallel_size=1 \
-  ++model.pipeline_model_parallel_size=1 \
+  ++model.pipeline_model_parallel_size=2 \
exp_manager.explicit_log_dir=${RESULTS_DIR} \
++model.activations_checkpoint_granularity=full \
++model.activations_checkpoint_method=uniform \
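With ``--nproc-per-node 2``, tensor parallel size 1 and pipeline parallel size 2 consume both ranks, leaving a data-parallel size of 1. A small sketch of that factoring (illustrative helper, assuming Megatron-style world-size accounting):

```python
def data_parallel_size(world_size: int, tp: int, pp: int) -> int:
    """Ranks left over for data parallelism after model parallelism."""
    model_parallel = tp * pp
    if world_size % model_parallel != 0:
        raise ValueError("world size must be divisible by tp * pp")
    return world_size // model_parallel

# The test above launches 2 ranks with tp=1 and pp=2.
print(data_parallel_size(2, tp=1, pp=2))  # 1
```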