diff --git a/README.md b/README.md index 10b91c2a..42569b9e 100644 --- a/README.md +++ b/README.md @@ -37,20 +37,20 @@ We developed a wide range of speedup techniques, including improving beam searc If you use fairseq or transformers, you only need to install one of them. If you use both, you need to install both. -### Install from PIP package - -`fastseq` Python package can be directly installed with pip using +### Install from the source ```bash -$ pip install fastseq -``` +# when fairseq and/or transformers have been installed +$ pip install git+https://github.com/microsoft/fastseq.git -### Install from the source +# install fastseq + transformers +$ pip install git+https://github.com/microsoft/fastseq.git#egg=project[transformers] -```bash -$ git clone https://github.com/microsoft/fastseq -$ cd fastseq -$ pip install --editable ./ +# install fastseq + fairseq +$ pip install git+https://github.com/microsoft/fastseq.git#egg=project[fairseq] + +# install fastseq + transformers + fairseq +$ pip install git+https://github.com/microsoft/fastseq.git#egg=project[transformers,fairseq] ``` ## Usage diff --git a/azure-pipelines.yml b/azure-pipelines.yml index aff30f15..328e01c8 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -21,7 +21,14 @@ jobs: steps: - script: | #install fastseq - pip install --editable . + pip install --editable .[transformers,fairseq] + + echo "******* Running fastseq unittests *******" + pip install pytorch-transformers==1.0.0 + bash tests/run_fastseq_tests.sh + + #cd benchmarks/ + #bash run_all_benchmarks.sh #show environment which python @@ -32,11 +39,14 @@ jobs: echo "******* Running fairseq unittests *******" bash tests/run_fairseq_tests.sh + echo "******* Running transformers unittests *******" bash tests/run_transformers_tests.sh - echo "******* Running fastseq unittests *******" - pip install pytorch-transformers==1.0.0 - python -m unittest discover -s tests/ -p 'test_*.py' -v - #cd benchmarks/ - #bash run_all_benchmarks.sh + displayName: 'run fastseq unit tests' + - task: PublishTestResults@2 + condition: succeededOrFailed() + inputs: + testRunTitle: 'Publish test results for Python $(python.version)' + testResultsFiles: 'tests/log_xml/*.xml' + failTaskOnFailedTests: true diff --git a/benchmarks/models/fast_test.sh b/benchmarks/models/fast_test.sh index e02f2d3d..3de68d67 100755 --- a/benchmarks/models/fast_test.sh +++ b/benchmarks/models/fast_test.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Run it at its parent folder, and check result at ../perf. -# USAGE -./benchmark.sh +# Run it at its parent folder, and check result at ../perf.
+# USAGE -./benchmark.sh # [fairseq|fairseq+fastseq|transformers|transformers+fastseq] # # diff --git a/fastseq/config.py b/fastseq/config.py index 412712cc..7d5f269d 100644 --- a/fastseq/config.py +++ b/fastseq/config.py @@ -9,9 +9,11 @@ FASTSEQ_DEFAULT_LOG_LEVEL = 'INFO' FASTSEQ_LOG_LEVEL = os.getenv('FASTSEQ_LOG_LEVEL', FASTSEQ_DEFAULT_LOG_LEVEL) FASTSEQ_CACHE_DIR = os.getenv('FASTSEQ_CACHE_DIR', os.path.join(os.sep, 'tmp')) +FASTSEQ_UNITTEST_LOG_XML_DIR = os.getenv( + 'FASTSEQ_UNITTEST_LOG_XML_DIR', os.path.join('tests', 'log_xml')) FASTSEQ_LOG_FORMAT = ( - '%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s') + '%(levelname)s %(asctime)s %(pathname)s:%(lineno)d] %(message)s') FASTSEQ_VERSION = '0.0.4' diff --git a/fastseq/models/prophetnet_fs/bert_dictionary.py b/fastseq/models/prophetnet_fs/bert_dictionary.py index c63bc1a8..b4d0a372 100644 --- a/fastseq/models/prophetnet_fs/bert_dictionary.py +++ b/fastseq/models/prophetnet_fs/bert_dictionary.py @@ -7,14 +7,16 @@ from collections import Counter from multiprocessing import Pool +import logging import os - import torch from fairseq.tokenizer import tokenize_line from fairseq.binarizer import safe_readline from fairseq.data import data_utils, Dictionary +from fastseq.logging import get_logger +logger = get_logger(__name__, logging.INFO) class BertDictionary(Dictionary): """A mapping from symbols to consecutive integers""" @@ -37,11 +39,17 @@ def load_from_file(cls, filename): d.count = [] d.indices = {} + line_cnt = 0 with open( filename, 'r', encoding='utf-8', errors='ignore') as input_file: for line in input_file: - k, v = line.split() - d.add_symbol(k) + line_cnt += 1 + try: + k, v = line.split(" ") + d.add_symbol(k) + except: + logger.error("Bad line at line: %d (1-based), content: '%s'." % (line_cnt, line)) + raise d.unk_word = '[UNK]' d.pad_word = '[PAD]' diff --git a/fastseq/ops/__init__.py b/fastseq/ops/__init__.py new file mode 100644 index 00000000..59e481eb --- /dev/null +++ b/fastseq/ops/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. diff --git a/fastseq/optimizer/fairseq/__init__.py b/fastseq/optimizer/fairseq/__init__.py index 96f56b99..05f635d1 100644 --- a/fastseq/optimizer/fairseq/__init__.py +++ b/fastseq/optimizer/fairseq/__init__.py @@ -40,7 +40,9 @@ def apply_fairseq_optimization(): f"fairseq(v{fairseq.__version__}) is not supported by fastseq(v" f"{FASTSEQ_VERSION}) yet, please change fairseq to " f"v{MIN_FAIRSEQ_VERSION} ~ v{MAX_FAIRSEQ_VERSION}, or check other " - "versions of fastseq.") + "versions of fastseq. Currently, no optimization in fastseq has " + "been applied. Please ignore this warning if you are not using " + "fairseq") return import fastseq.optimizer.fairseq.beam_search_optimizer # pylint: disable=import-outside-toplevel @@ -68,15 +70,20 @@ def _update_fairseq_model_registration(): "Update the register model arch {} from {} to {}".format( arch_name, model_class, OPTIMIZED_CLASSES[model_class])) +is_fairseq_installed = True try: import fairseq # pylint: disable=ungrouped-imports from fairseq.models import ARCH_MODEL_REGISTRY, MODEL_REGISTRY # pylint: disable=ungrouped-imports from fairseq.sequence_generator import SequenceGenerator # pylint: disable=ungrouped-imports - apply_fairseq_optimization() except ImportError as error: + is_fairseq_installed = False logger.warning('fairseq can not be imported. 
Please ignore this warning if ' - 'you are not using fairseq') -except: - logger.error("Unexpected error: {}".format(sys.exc_info()[0])) - raise + 'you are not using fairseq: {}'.format(error)) + +if is_fairseq_installed: + try: + apply_fairseq_optimization() + except: + logger.error("Unexpected error: {}".format(sys.exc_info()[0])) + raise diff --git a/fastseq/optimizer/fairseq/beam_search_optimizer.py b/fastseq/optimizer/fairseq/beam_search_optimizer.py index 69fc1788..f984f175 100644 --- a/fastseq/optimizer/fairseq/beam_search_optimizer.py +++ b/fastseq/optimizer/fairseq/beam_search_optimizer.py @@ -13,10 +13,40 @@ from fairseq import utils from fairseq.models.transformer import TransformerEncoder, TransformerModel from fairseq.modules.multihead_attention import MultiheadAttention +from fairseq.search import BeamSearch from fairseq.sequence_generator import SequenceGenerator from fastseq.ops.ngram_repeat_block import NGramRepeatBlock from fastseq.utils.api_decorator import replace +@replace(BeamSearch) +class BeamSearchV2(BeamSearch): + + def step(self, step, lprobs, scores): + super()._init_buffers(lprobs) + bsz, beam_size, vocab_size = lprobs.size() + + if step == 0: + # at the first step all hypotheses are equally likely, so use + # only the first beam + lprobs = lprobs[:, ::beam_size, :].contiguous() + else: + # make probs contain cumulative scores for each hypothesis + lprobs.add_(scores[:, :, step - 1].unsqueeze(-1)) + + torch.topk( + lprobs.view(bsz, -1), + k=min( + # Take the best 2 x beam_size predictions. We'll choose the first + # beam_size of these which don't predict eos to continue with. + beam_size * 2, + lprobs.view(bsz, -1).size(1) - 1, # -1 so we never select pad + ), + out=(self.scores_buf, self.indices_buf), + ) + self.beams_buf = torch.floor_divide(self.indices_buf, vocab_size) + self.indices_buf.fmod_(vocab_size) + return self.scores_buf, self.indices_buf, self.beams_buf + @replace(TransformerEncoder) class TransformerEncoderV2(TransformerEncoder): """ @@ -494,6 +524,49 @@ def is_finished(sent, step, unfin_idx): return True return False + def apply_no_repeat_ngram_cpu(self, tokens,lprobs, bsz,step, + beam_size, no_repeat_ngram_size): + """ Fairseq implementation of blocking + repeated ngrams + """ + banned_list = [[] for bbsz_idx in range(bsz * beam_size)] + cpu_tokens = tokens.cpu()[:, :step + 1].numpy() + check_start_pos = step + 2 - no_repeat_ngram_size + for bbsz_idx in range(bsz * beam_size): + for i in range(check_start_pos): + is_banned = True + for k in range(no_repeat_ngram_size - 1): + if cpu_tokens[bbsz_idx, i + k] != cpu_tokens[ + bbsz_idx, check_start_pos + k]: + is_banned = False + break + if is_banned: + banned_list[bbsz_idx].append( + cpu_tokens[bbsz_idx, + i + no_repeat_ngram_size - 1]) + + def calculate_banned_tokens(bbsz_idx): + """before decoding the next token, prevent decoding + of ngrams that have already appeared + """ + banned_tokens_per_sample = [ + (bbsz_idx, t) for t in banned_list[bbsz_idx] + ] + return banned_tokens_per_sample + + banned_tokens = [] + if step + 2 - no_repeat_ngram_size >= 0: + for bbsz_idx in range(bsz * beam_size): + banned_tokens.extend(calculate_banned_tokens(bbsz_idx)) + + if banned_tokens: + banned_tokens = torch.LongTensor(banned_tokens) + lprobs.index_put_( + tuple(banned_tokens.t()), + lprobs.new_tensor([-math.inf] * len(banned_tokens))) + + return lprobs + def finalize_hypos(step, bbsz_idx, eos_scores): """ Finalize the given hypotheses at this step, while keeping the total @@ -658,8 +731,12 @@ def 
replicate_first_beam(tensor, mask): if self.no_repeat_ngram_size > 0: #Applying Cuda Op for NGram repeat Blocking - lprobs = self.no_repeat_ngram_op(tokens,lprobs, bsz, step, - beam_size, self.no_repeat_ngram_size) + if (tokens.is_cuda and lprobs.is_cuda): + lprobs = self.no_repeat_ngram_op(tokens,lprobs, bsz, step, + beam_size, self.no_repeat_ngram_size) + else: + lprobs = self.apply_no_repeat_ngram_cpu(tokens, lprobs, bsz, + step, beam_size, self.no_repeat_ngram_size) cand_scores, cand_indices, cand_beams = self.search.step( step, @@ -678,18 +755,16 @@ def replicate_first_beam(tensor, mask): eos_mask[:, :beam_size][blacklist] = 0 # only consider eos when it's among the top beam_size indices - torch.masked_select( + eos_bbsz_idx = torch.masked_select( cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size], - out=eos_bbsz_idx, ) finalized_sents = set() if eos_bbsz_idx.numel() > 0: - torch.masked_select( + eos_scores = torch.masked_select( cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size], - out=eos_scores, ) finalized_sents = finalize_hypos(step, eos_bbsz_idx, eos_scores) @@ -706,7 +781,7 @@ def replicate_first_beam(tensor, mask): # construct batch_idxs which holds indices of batches to keep for the next pass batch_mask = cand_indices.new_ones(bsz) batch_mask[cand_indices.new(finalized_sents)] = 0 - batch_idxs = batch_mask.nonzero().squeeze(-1) + batch_idxs = torch.nonzero(batch_mask).squeeze(-1) eos_mask = eos_mask[batch_idxs] cand_beams = cand_beams[batch_idxs] @@ -739,10 +814,9 @@ def replicate_first_beam(tensor, mask): # candidate active hypos. active_mask = buffer('active_mask') eos_mask[:, :beam_size] |= blacklist - torch.add( + active_mask = torch.add( eos_mask.type_as(cand_offsets) * cand_size, cand_offsets[:eos_mask.size(1)], - out=active_mask, ) # get the top beam_size active hypotheses, which are just the hypos diff --git a/fastseq/optimizer/jit/__init__.py b/fastseq/optimizer/jit/__init__.py new file mode 100644 index 00000000..59e481eb --- /dev/null +++ b/fastseq/optimizer/jit/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. diff --git a/fastseq/optimizer/jit/einsum_rewriter.py b/fastseq/optimizer/jit/einsum_rewriter.py new file mode 100644 index 00000000..d46094cf --- /dev/null +++ b/fastseq/optimizer/jit/einsum_rewriter.py @@ -0,0 +1,63 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License.
+ +"""Optmize einsum operation in the graph""" + +from typing import List + +import torch +from torch import Tensor + +from fastseq.optimizer.jit.utils import graph_pattern, rewrite_graph + +def einsum_pattern_0(t0: str, t1: List[Tensor]): + r = torch.einsum(t0, t1) + return r + +def einsum_rewrite_pattern_0(eqn: str, operands: List[Tensor]): + # eqn = eqn.replace(' ', '') # TODO: fix the issue: ValueError: stoll + # for cases like "bmhtd,bnhsd->bmhts" + if (len(eqn) == 18 and eqn[0:4] == eqn[13:17] and eqn[0] == eqn[6] and + eqn[2] == eqn[8] and eqn[4] == eqn[10] and eqn[9] == eqn[17]): + t0 = operands[0] + t1 = operands[1] + b, m, h, t, d = t0.shape + s = t1.size(3) + n = t1.size(1) + t1 = t1.permute(0, 2, 3, 4, 1) # (b, h, s, d, n) + if n > 1: + t1 = t1.sum(dim=4, keepdim=True) # (b, h, s, d, 1) + + t0 = t0.permute(0, 2, 1, 3, 4) # (b, h, m, t, d) + t1 = t1.permute(0, 1, 3, 4, 2) # (b, h, d, 1, s) + t0 = t0.reshape(b*h, m*t, d) + t1 = t1.view(b*h, d, s) + r = torch.bmm(t0, t1).view(b, h, m, t, s).permute(0, 2, 1, 3, 4) + return r + + # for cases like "bmhts,bnhsd->bmhtd" + if (len(eqn) == 18 and eqn[0:4] == eqn[13:17] and eqn[0] == eqn[6] and + eqn[2] == eqn[8] and eqn[4] == eqn[9] and eqn[10] == eqn[17]): + t0 = operands[0] + t1 = operands[1] + b, m, h, t, s = t0.shape + n = t1.size(1) + d = t1.size(4) + t1 = t1.permute(0, 2, 4, 3, 1) # (b, h, d, s, n) + if n > 1: + t1 = t1.sum(dim=4, keepdim=True) # (b, h, d, s, 1) + # t1 = t1.squeeze(1) # (b, h, s, d) + t0 = t0.permute(0, 2, 1, 3, 4) # (b, h, m, t, s) + t1 = t1.permute(0, 1, 3, 4, 2) # (b, h, s, 1, d) + t0 = t0.reshape(b*h, m*t, s) + t1 = t1.view(b*h, s, d) + r = torch.bmm(t0, t1).view(b, h, m, t, d).permute(0, 2, 1, 3, 4) + return r + + return torch.einsum(eqn, operands) + +EINSUM_PATTERN_STR = graph_pattern(einsum_pattern_0)() +EINSUM_REWRITE_PATTERN_STR = graph_pattern(einsum_rewrite_pattern_0)() + +def rewrite_einsum(input_graph: torch._C.Graph): + rewrite_graph(EINSUM_PATTERN_STR, EINSUM_REWRITE_PATTERN_STR, input_graph) diff --git a/fastseq/optimizer/jit/graph_rewriter.py b/fastseq/optimizer/jit/graph_rewriter.py new file mode 100644 index 00000000..c77c6ad1 --- /dev/null +++ b/fastseq/optimizer/jit/graph_rewriter.py @@ -0,0 +1,11 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Load and apply the registered graph rewrite patterns""" + +import torch + +from fastseq.optimizer.jit.einsum_rewriter import rewrite_einsum + +def optimize_graph(input_graph: torch._C.Graph): + rewrite_einsum(input_graph) diff --git a/fastseq/optimizer/jit/utils.py b/fastseq/optimizer/jit/utils.py new file mode 100644 index 00000000..958113d7 --- /dev/null +++ b/fastseq/optimizer/jit/utils.py @@ -0,0 +1,20 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Utilities for graph rewriting""" + +import torch + +def rewrite_graph(pattern: str, + rewrite_pattern: str, + input_graph: torch._C.Graph): + torch._C._jit_pass_custom_pattern_based_rewrite_graph( + pattern, rewrite_pattern, input_graph) + + +def graph_pattern(obj): + def convert_to_graph_pattern(): + script = torch.jit.script(obj) + return script.graph.str() + + return convert_to_graph_pattern diff --git a/fastseq/optimizer/transformers/__init__.py b/fastseq/optimizer/transformers/__init__.py index 38dc845e..ed4d131f 100644 --- a/fastseq/optimizer/transformers/__init__.py +++ b/fastseq/optimizer/transformers/__init__.py @@ -5,8 +5,10 @@ Automatically apply the optimizations if the supported versions of transformers are detected. 
""" +import sys from packaging import version +import sys from fastseq.config import MIN_TRANSFORMERS_VERSION, MAX_TRANSFORMER_VERSION from fastseq.logging import get_logger @@ -36,7 +38,9 @@ def apply_transformers_optimization(): logger.warning( f"transformers == {v} is not supported yet, please change it to " f"v{MIN_TRANSFORMERS_VERSION} to v{MAX_TRANSFORMER_VERSION}, or try" - f" other versions of fastseq.") + f" other versions of fastseq. Currently, no optimization provided " + "by fastseq has been applied. Please ignore this warning if you are" + " not using transformers") return import fastseq.optimizer.transformers.modeling_bart_optimizer # pylint: disable=import-outside-toplevel @@ -45,13 +49,17 @@ def apply_transformers_optimization(): logger.debug(f"transformers == {v} has been optimized.") - +is_transformers_installed = True try: import transformers - apply_transformers_optimization() except ImportError as error: + is_transformers_installed = False logger.warning('transformers can not be imported. Please ignore this ' 'warning if you are not using transformers') -except: - logger.error("Unexpected error: {}".format(sys.exc_info()[0])) - raise + +if is_transformers_installed: + try: + apply_transformers_optimization() + except: + logger.error("Unexpected error: {}".format(sys.exc_info()[0])) + raise diff --git a/fastseq/optimizer/transformers/beam_search_optimizer.py b/fastseq/optimizer/transformers/beam_search_optimizer.py index b9102801..df8061bc 100644 --- a/fastseq/optimizer/transformers/beam_search_optimizer.py +++ b/fastseq/optimizer/transformers/beam_search_optimizer.py @@ -650,8 +650,18 @@ def _update_scores(banned_tokens): cpu_input_ids = input_ids.cpu() if no_repeat_ngram_size > 0: #custom op for Ngram repeat blocking - scores = self.no_repeat_ngram_op(input_ids,scores.float(), - batch_size, cur_len-1, num_beams, no_repeat_ngram_size) + if (input_ids.is_cuda and scores.is_cuda): + scores = self.no_repeat_ngram_op(input_ids,scores.float(), + batch_size, cur_len-1, num_beams, no_repeat_ngram_size) + else: + num_batch_hypotheses = batch_size * num_beams + banned_ngram_tokens = calc_banned_ngram_tokens_v2( + cpu_input_ids, + num_batch_hypotheses, + no_repeat_ngram_size, + cur_len, + self.config.pad_token_id) + _update_scores(banned_ngram_tokens) if bad_words_ids is not None: # calculate a list of banned tokens according to bad words diff --git a/fastseq/optimizer/transformers/modeling_bart_optimizer.py b/fastseq/optimizer/transformers/modeling_bart_optimizer.py index 851bf46b..4f3b11ec 100755 --- a/fastseq/optimizer/transformers/modeling_bart_optimizer.py +++ b/fastseq/optimizer/transformers/modeling_bart_optimizer.py @@ -6,21 +6,24 @@ from typing import Dict, Optional, Tuple import torch -from torch import Tensor +from torch import Tensor, nn from torch.nn import functional as F +from transformers.activations import ACT2FN from transformers.configuration_auto import BartConfig from transformers.modeling_auto import MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING from transformers.modeling_bart import (BartForConditionalGeneration, + DecoderLayer, EncoderLayer, LayerNorm, SelfAttention, _reorder_buffer) from fastseq.logging import get_logger from fastseq.utils.api_decorator import replace +from fastseq.optimizer.jit.graph_rewriter import optimize_graph logger = get_logger(__name__) @replace(SelfAttention) -class SelfAttentionV2(SelfAttention): +class SelfAttentionV2(nn.Module): """" The BART Model with a language modeling head. Can be used for summarization. 
""" @@ -34,52 +37,68 @@ def __init__( encoder_decoder_attention=False, # otherwise self_attention num_beams=1, ): - super().__init__( - embed_dim, num_heads, dropout, bias, encoder_decoder_attention) + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, ( + "embed_dim must be divisible by num_heads") + self.scaling = self.head_dim ** -0.5 + + self.encoder_decoder_attention: bool = encoder_decoder_attention + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.cache_key = "encoder_decoder" if ( + self.encoder_decoder_attention) else "self" self.num_beams = num_beams + def _shape(self, tensor: Tensor, dim_0: int, bsz: int) -> Tensor: + return tensor.contiguous().view( + dim_0, bsz * self.num_heads, self.head_dim).transpose(0, 1) + def forward( self, - query, + query: Tensor, key: Optional[Tensor], key_padding_mask: Optional[Tensor] = None, - layer_state: Optional[Dict[str, Optional[Tensor]]] = None, + layer_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, attn_mask: Optional[Tensor] = None, - output_attentions=False, - ) -> Tuple[Tensor, Optional[Tensor]]: + output_attentions: bool=False, + ) -> Tuple[Tensor, + Optional[Tensor], + Optional[Dict[str, Dict[str, Optional[Tensor]]]]]: """Input shape: Time(SeqLen) x Batch x Channel""" static_kv: bool = self.encoder_decoder_attention tgt_len, bsz, embed_dim = query.size() assert embed_dim == self.embed_dim assert list(query.size()) == [tgt_len, bsz, embed_dim] + saved_state: Dict[str, Optional[Tensor]] = {} # get here for encoder decoder cause of static_kv if layer_state is not None: # reuse k,v and encoder_padding_mask - saved_state = layer_state.get(self.cache_key, {}) - if "prev_key" in saved_state and static_kv: + if self.cache_key in layer_state: + tmp_saved_state = layer_state.get(self.cache_key) + assert tmp_saved_state is not None + saved_state = tmp_saved_state + if static_kv and "prev_key" in saved_state: # previous time steps are cached - no need to recompute key and # value if they are static key = None - else: - saved_state = None - layer_state = {} - q = self.q_proj(query) * self.scaling - if static_kv: - if key is None: - k = v = None - else: - k = self.k_proj(key) - v = self.v_proj(key) - else: - k = self.k_proj(query) - v = self.v_proj(query) + q = self.q_proj(query) * self.scaling q = self._shape(q, tgt_len, bsz) - if k is not None: + + k: Optional[Tensor] = None + v: Optional[Tensor] = None + if key is not None: + k = self.k_proj(key) k = self._shape(k, -1, bsz) - if v is not None: + v = self.v_proj(key) v = self._shape(v, -1, bsz) - if saved_state is not None: + if len(saved_state) > 0: k, v, key_padding_mask = self._use_saved_state( k, v, saved_state, key_padding_mask, static_kv, bsz) @@ -87,35 +106,46 @@ def forward( cache_bsz = (bsz // self.num_beams if self.encoder_decoder_attention else bsz) + assert k is not None + assert v is not None if self.encoder_decoder_attention and ("prev_key" not in saved_state): cache_shape = ( cache_bsz, self.num_beams, self.num_heads, -1, self.head_dim) k = k.view(cache_shape)[:, 0 : 1, :, :, :].contiguous() v = v.view(cache_shape)[:, 0 : 1, :, :, :].contiguous() + prev_k: Optional[Tensor] = k + prev_v: Optional[Tensor] = v + prev_key_padding_mask: 
Optional[Tensor] = None if ( + static_kv) else key_padding_mask + assert layer_state is not None layer_state[self.cache_key] = { - "prev_key": k, - "prev_value": v, - "prev_key_padding_mask": - key_padding_mask if not static_kv else None, + "prev_key": prev_k, + "prev_value": prev_v, + "prev_key_padding_mask": prev_key_padding_mask, } - if not self.encoder_decoder_attention: + + if not self.encoder_decoder_attention and layer_state is not None: cache_shape = (bsz, self.num_heads, -1, self.head_dim) + prev_k: Optional[Tensor] = k.view(cache_shape) + prev_v: Optional[Tensor] = v.view(cache_shape) + prev_key_padding_mask: Optional[Tensor] = None if ( + static_kv) else key_padding_mask + assert layer_state is not None layer_state[self.cache_key] = { - "prev_key": k.view(cache_shape), - "prev_value": v.view(cache_shape), - "prev_key_padding_mask": - key_padding_mask if not static_kv else None, + "prev_key": prev_k, + "prev_value": prev_v, + "prev_key_padding_mask": prev_key_padding_mask, } - assert k is not None + # assert q is not None if self.encoder_decoder_attention: q = q.view(cache_bsz, self.num_beams, self.num_heads, tgt_len, self.head_dim) src_len = k.size(3) - attn_weights = torch.einsum("bmhtd,bnhsd->bmhts", q, - k).reshape(-1, tgt_len, src_len) - assert attn_weights.size() == (bsz * self.num_heads, tgt_len, - src_len) + attn_weights = torch.einsum("bmhtd,bnhsd->bmhts", q, k).reshape( + -1, tgt_len, src_len) + assert attn_weights.size() == ( + bsz * self.num_heads, tgt_len, src_len) else: src_len = k.size(1) attn_weights = torch.bmm(q, k.transpose(1, 2)) @@ -163,15 +193,23 @@ def forward( attn_output = attn_output.transpose(0, 1).contiguous().view( tgt_len, bsz, embed_dim) attn_output = self.out_proj(attn_output) + if output_attentions: attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + return attn_output, attn_weights, layer_state else: - attn_weights = None - return attn_output, attn_weights + return attn_output, None, layer_state - def _use_saved_state(self, k, v, saved_state, key_padding_mask, static_kv, - bsz): + def _use_saved_state( + self, + k: Optional[Tensor], + v: Optional[Tensor], + saved_state: Dict[str, Optional[Tensor]], + key_padding_mask: Optional[Tensor], + static_kv: bool, + bsz: int) -> Tuple[ + Optional[Tensor], Optional[Tensor], Optional[Tensor]]: # saved states are stored with shape (bsz, num_heads, seq_len, head_dim) # note that for self-attn, bsz=input_bsz * beam_size; for # encoder-decoder-attn, bsz=input_bsz. 
@@ -199,11 +237,13 @@ def _use_saved_state(self, k, v, saved_state, key_padding_mask, static_kv, assert k is not None and v is not None prev_key_padding_mask: Optional[Tensor] = saved_state.get( - "prev_key_padding_mask", None) + "prev_key_padding_mask") if prev_key_padding_mask is not None: if static_kv: new_key_padding_mask = prev_key_padding_mask else: + assert prev_key_padding_mask is not None + assert key_padding_mask is not None new_key_padding_mask = torch.cat( [prev_key_padding_mask, key_padding_mask], dim=1) else: @@ -211,6 +251,159 @@ def _use_saved_state(self, k, v, saved_state, key_padding_mask, static_kv, return k, v, new_key_padding_mask +@replace(EncoderLayer) +class EncoderLayerV2(nn.Module): + def __init__(self, config: BartConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = SelfAttention( + self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, + ) + self.self_attn = torch.jit.script(self.self_attn) + optimize_graph(self.self_attn.graph) + self.normalize_before = config.normalize_before + self.self_attn_layer_norm = LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = LayerNorm(self.embed_dim) + + def forward(self, x, encoder_padding_mask, output_attentions=False): + """ + Args: + x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_padding_mask (ByteTensor): binary ByteTensor of shape + `(batch, src_len)` where padding elements are indicated by ``1``. + for t_tgt, t_src is excluded (or masked out), =0 means it is + included in attention + + Returns: + encoded output of shape `(seq_len, batch, embed_dim)` + """ + residual = x + if self.normalize_before: + x = self.self_attn_layer_norm(x) + x, attn_weights, layer_state = self.self_attn( + query=x, + key=x, + key_padding_mask=encoder_padding_mask, + output_attentions=output_attentions, + ) + x = F.dropout(x, p=self.dropout, training=self.training) + x = residual + x + if not self.normalize_before: + x = self.self_attn_layer_norm(x) + + residual = x + if self.normalize_before: + x = self.final_layer_norm(x) + x = self.activation_fn(self.fc1(x)) + x = F.dropout(x, p=self.activation_dropout, training=self.training) + x = self.fc2(x) + x = F.dropout(x, p=self.dropout, training=self.training) + x = residual + x + if not self.normalize_before: + x = self.final_layer_norm(x) + return x, attn_weights + + +@replace(DecoderLayer) +class DecoderLayerV2(nn.Module): + def __init__(self, config: BartConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = SelfAttention( + embed_dim=self.embed_dim, num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, + ) + self.self_attn = torch.jit.script(self.self_attn) + optimize_graph(self.self_attn.graph) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.normalize_before = config.normalize_before + + self.self_attn_layer_norm = LayerNorm(self.embed_dim) + self.encoder_attn = SelfAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + encoder_decoder_attention=True, + ) + self.encoder_attn = torch.jit.script(self.encoder_attn) + 
optimize_graph(self.encoder_attn.graph) + self.encoder_attn_layer_norm = LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = LayerNorm(self.embed_dim) + + def forward( + self, + x, + encoder_hidden_states, + encoder_attn_mask=None, + layer_state=None, + causal_mask=None, + decoder_padding_mask=None, + output_attentions=False, + ): + residual = x + + if layer_state is None: + layer_state = {} + if self.normalize_before: + x = self.self_attn_layer_norm(x) + # Self Attention + + x, self_attn_weights, layer_state = self.self_attn( + query=x, + key=x, + layer_state=layer_state, # adds keys to layer state + key_padding_mask=decoder_padding_mask, + attn_mask=causal_mask, + output_attentions=output_attentions, + ) + + x = F.dropout(x, p=self.dropout, training=self.training) + x = residual + x + if not self.normalize_before: + x = self.self_attn_layer_norm(x) + + # Cross attention + residual = x + assert self.encoder_attn.cache_key != self.self_attn.cache_key + if self.normalize_before: + x = self.encoder_attn_layer_norm(x) + x, attn_weights, layer_state = self.encoder_attn( + query=x, + key=encoder_hidden_states, + key_padding_mask=encoder_attn_mask, + layer_state=layer_state, # mutates layer state + ) + x = F.dropout(x, p=self.dropout, training=self.training) + x = residual + x + if not self.normalize_before: + x = self.encoder_attn_layer_norm(x) + + # Fully Connected + residual = x + if self.normalize_before: + x = self.final_layer_norm(x) + x = self.activation_fn(self.fc1(x)) + x = F.dropout(x, p=self.activation_dropout, training=self.training) + x = self.fc2(x) + x = F.dropout(x, p=self.dropout, training=self.training) + x = residual + x + if not self.normalize_before: + x = self.final_layer_norm(x) + return ( + x, + self_attn_weights, + layer_state, + ) # just self_attn weights for now, following t5, layer_state = cache for decoding + + @replace(BartForConditionalGeneration) class BartForConditionalGenerationV2(BartForConditionalGeneration): """ diff --git a/fastseq/utils/test_utils.py b/fastseq/utils/test_utils.py index 2ef9d5a6..0a44a3ea 100644 --- a/fastseq/utils/test_utils.py +++ b/fastseq/utils/test_utils.py @@ -3,18 +3,31 @@ """Utilities to make it easy to add unit tests""" +from inspect import getframeinfo, stack import os from statistics import mean, stdev import time -from absl.testing import parameterized +from absl import flags +from absl.testing import absltest, parameterized -from fastseq.config import FASTSEQ_CACHE_DIR +from fastseq.config import FASTSEQ_CACHE_DIR, FASTSEQ_UNITTEST_LOG_XML_DIR from fastseq.logging import get_logger from fastseq.utils.api_decorator import get_class logger = get_logger(__name__) +FLAGS = flags.FLAGS + +def fastseq_test_main(): + caller = getframeinfo(stack()[1][0]) + suffix = '_' + time.strftime("%Y%m%d%H%M%S") + '.xml' + log_xml_file = caller.filename.replace(os.sep, '_').replace('.py', suffix) + log_xml_file = os.path.join(FASTSEQ_UNITTEST_LOG_XML_DIR, log_xml_file) + FLAGS.xml_output_file = log_xml_file + logger.info(f"Fastseq unit test log output filepath: {log_xml_file}") + absltest.main() + class TestCaseBase(parameterized.TestCase): """Base class used for unittest.""" diff --git a/setup.py b/setup.py index 6f80569b..45ebef29 100644 --- a/setup.py +++ b/setup.py @@ -4,23 +4,29 @@ from setuptools import find_packages, setup from torch.utils.cpp_extension import BuildExtension, CUDAExtension -from fastseq.config import 
FASTSEQ_VERSION +FASTSEQ_VERSION = '0.0.4' +MIN_FAIRSEQ_VERSION = '0.9.0' +MAX_FAIRSEQ_VERSION = '0.9.0' +MIN_TRANSFORMERS_VERSION = '3.0.2' +MAX_TRANSFORMER_VERSION = '3.0.2' def get_fastseq_version(): return FASTSEQ_VERSION extras = {} -extras["torch"] = ["torch>=1.4.0"] -extras["fairseq"] = ["fairseq>=0.9.0"] -extras["transformers"] = ["transformers>=3.0.2"] +extras["transformers"] = ["transformers >= {}, <= {}".format( + MIN_TRANSFORMERS_VERSION, MAX_TRANSFORMER_VERSION)] +extras["fairseq"] = ["fairseq >= {}, <= {}".format( + MIN_FAIRSEQ_VERSION, MAX_FAIRSEQ_VERSION)] +extras["gitpython"] = ["gitpython>=3.1.7"] +extras["editdistance"] = ["editdistance>=0.5.3"] extensions = [ - CUDAExtension('ngram_repeat_block_cuda', [ - 'fastseq/clib/cuda/ngram_repeat_block_cuda.cpp', - 'fastseq/clib/cuda/ngram_repeat_block_cuda_kernel.cu', - ]), - ] + CUDAExtension('ngram_repeat_block_cuda', [ + 'fastseq/clib/cuda/ngram_repeat_block_cuda.cpp', + 'fastseq/clib/cuda/ngram_repeat_block_cuda_kernel.cu',]), +] setup( name="fastseq", @@ -41,9 +47,13 @@ def get_fastseq_version(): ], install_requires=[ "absl-py", + "filelock", "numpy", "requests", + "rouge-score>=0.0.4", "packaging", + "torch>=1.4.0", + "pytorch-transformers==1.0.0", ], extras_require=extras, python_requires=">=3.6.0", diff --git a/tests/models/test_prophetnet_fs.py b/tests/models/test_prophetnet_fs.py index 7e02f155..5eed8c82 100644 --- a/tests/models/test_prophetnet_fs.py +++ b/tests/models/test_prophetnet_fs.py @@ -19,7 +19,7 @@ from fastseq.utils.file_utils import decompress_file, make_dirs, wget from fastseq.utils.test_utils import (PROPHETNET_MODEL_URLS, CACHED_PROPHETNET_MODEL_PATHS, - TestCaseBase) + fastseq_test_main, TestCaseBase) logger = get_logger(__name__) @@ -136,4 +136,4 @@ def test_beam_search_optimizer(self, beam_size, batch_size, need_attn, self.assertEqual(output, self.expected_outputs[i]) if __name__ == "__main__": - absltest.main() + fastseq_test_main() diff --git a/tests/optimizer/fairseq/benchmark_fairseq_optimizer.py b/tests/optimizer/fairseq/benchmark_fairseq_optimizer.py index 98975574..28f78b00 100644 --- a/tests/optimizer/fairseq/benchmark_fairseq_optimizer.py +++ b/tests/optimizer/fairseq/benchmark_fairseq_optimizer.py @@ -15,7 +15,7 @@ from fastseq.utils.file_utils import decompress_file, make_dirs, wget from fastseq.utils.test_utils import (BART_MODEL_URLS, CACHED_BART_MODEL_DIR, CACHED_BART_MODEL_PATHS, BenchmarkBase, - benchmark) + benchmark, fastseq_test_main) logger = get_logger(__name__) @@ -128,4 +128,4 @@ def test_beam_search_optimizer(self, beam_size, batch_size, need_attn, if __name__ == "__main__": - absltest.main() + fastseq_test_main() diff --git a/tests/optimizer/fairseq/test_fairseq_optimizer.py b/tests/optimizer/fairseq/test_fairseq_optimizer.py index e476a927..0fb73bb3 100644 --- a/tests/optimizer/fairseq/test_fairseq_optimizer.py +++ b/tests/optimizer/fairseq/test_fairseq_optimizer.py @@ -16,7 +16,8 @@ from fastseq.logging import get_logger from fastseq.utils.file_utils import decompress_file, make_dirs, wget from fastseq.utils.test_utils import (BART_MODEL_URLS, CACHED_BART_MODEL_DIR, - CACHED_BART_MODEL_PATHS, TestCaseBase) + CACHED_BART_MODEL_PATHS, + fastseq_test_main, TestCaseBase) logger = get_logger(__name__) @@ -117,4 +118,4 @@ def test_beam_search_optimizer(self, beam_size, batch_size, need_attn, if __name__ == "__main__": - absltest.main() + fastseq_test_main() diff --git a/tests/optimizer/jit/__init__.py b/tests/optimizer/jit/__init__.py new file mode 100644 index 00000000..59e481eb 
--- /dev/null +++ b/tests/optimizer/jit/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. diff --git a/tests/optimizer/jit/test_einsum_rewriter.py b/tests/optimizer/jit/test_einsum_rewriter.py new file mode 100644 index 00000000..79d63b1e --- /dev/null +++ b/tests/optimizer/jit/test_einsum_rewriter.py @@ -0,0 +1,83 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import logging +import time + +from absl.testing import absltest, parameterized +import torch +from torch import Tensor + +from fastseq.logging import get_logger +from fastseq.optimizer.jit.einsum_rewriter import rewrite_einsum, einsum_rewrite_pattern_0 +from fastseq.utils.test_utils import TestCaseBase + +logger = get_logger(__name__, logging.INFO) + +class EinsumRewriterTest(TestCaseBase): + + @parameterized.parameters( + {'eqn': "bmhtd,bnhsd->bmhts", + 'shape0': [128, 4, 16, 5, 64], + 'shape1': [128, 2, 16, 1024, 64]}, + {'eqn': "kmijd,knisd->kmijs", + 'shape0': [128, 4, 16, 1, 64], + 'shape1': [128, 2, 16, 1024, 64]}, + {'eqn': "bmhts,bnhsd->bmhtd", + 'shape0': [128, 4, 16, 5, 64], + 'shape1': [128, 2, 16, 64, 1024]}, + {'eqn': "impts,inpsw->imptw", + 'shape0': [128, 4, 16, 3, 64], + 'shape1': [128, 2, 16, 64, 7]}, + ) + def test_einsum_rewriter(self, eqn, shape0, shape1): + + def run_einsum(eqn: str, t0: Tensor, t1: Tensor): + r = torch.einsum(eqn, t0, t1) + return r + + t0 = torch.randn(shape0, dtype=torch.float32).cuda() + t1 = torch.randn(shape1, dtype=torch.float32).cuda() + repeat_times = 1024 + + r0 = run_einsum(eqn, t0, t1) + torch.cuda.synchronize() + start0 = time.time() + for _ in range(repeat_times): + run_einsum(eqn, t0, t1) + torch.cuda.synchronize() + end0 = time.time() + + script_run_einsum = torch.jit.script(run_einsum) + logger.debug(f"Original graph: \n{script_run_einsum.graph.str()}") + rewrite_einsum(script_run_einsum.graph) + logger.debug(f"Optimized graph: \n{script_run_einsum.graph.str()}") + self.assertTrue('bmm' in script_run_einsum.graph.str()) + + r1 = script_run_einsum(eqn, t0, t1) + torch.cuda.synchronize() + start1 = time.time() + for _ in range(repeat_times): + script_run_einsum(eqn, t0, t1) + torch.cuda.synchronize() + end1 = time.time() + + r2 = einsum_rewrite_pattern_0(eqn, [t0, t1]) + torch.cuda.synchronize() + start2 = time.time() + for _ in range(repeat_times): + einsum_rewrite_pattern_0(eqn, [t0, t1]) + torch.cuda.synchronize() + end2 = time.time() + + self.assertTrue(torch.equal(r0, r1)) + self.assertTrue(torch.equal(r0, r2)) + self.assertEqual( + r0.is_contiguous(), r1.is_contiguous(), r2.is_contiguous()) + logger.info(f"einsum took: {end0 - start0};" + f"optimized einsum torchscript took: {end1 - start1};" + f"optimized einsum python took: {end2 - start2};") + + +if __name__ == "__main__": + absltest.main() diff --git a/tests/optimizer/transformers/__init__.py b/tests/optimizer/transformers/__init__.py new file mode 100644 index 00000000..59e481eb --- /dev/null +++ b/tests/optimizer/transformers/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
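Reviewer note: for context on the JIT pass exercised by `test_einsum_rewriter.py` above, here is a minimal usage sketch, not part of this patch. It assumes only the `rewrite_einsum` helper added in `fastseq/optimizer/jit/einsum_rewriter.py` and mirrors the `'bmm'` check used in the test:

```python
# Minimal sketch: rewrite a scripted einsum into bmm via the new JIT pass.
import torch
from torch import Tensor

from fastseq.optimizer.jit.einsum_rewriter import rewrite_einsum


def run_einsum(eqn: str, t0: Tensor, t1: Tensor) -> Tensor:
    return torch.einsum(eqn, t0, t1)


scripted = torch.jit.script(run_einsum)
rewrite_einsum(scripted.graph)               # replaces the einsum node with the bmm-based pattern
assert "bmm" in scripted.graph.str()         # same check as the unit test above

q = torch.randn(2, 4, 16, 5, 64)             # (b, m, h, t, d)
k = torch.randn(2, 1, 16, 128, 64)           # (b, n, h, s, d)
out = scripted("bmhtd,bnhsd->bmhts", q, k)   # (b, m, h, t, s), now computed with torch.bmm
```

Equations that the rewrite pattern does not cover still fall back to `torch.einsum` inside the scripted function.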
diff --git a/tests/optimizer/transformers/test_bart_optimizer.py b/tests/optimizer/transformers/test_bart_optimizer.py index bb4b809a..e5d4ed0c 100644 --- a/tests/optimizer/transformers/test_bart_optimizer.py +++ b/tests/optimizer/transformers/test_bart_optimizer.py @@ -14,7 +14,7 @@ from absl.testing import absltest, parameterized from transformers import BartForConditionalGeneration, BartTokenizer -from fastseq.utils.test_utils import TestCaseBase +from fastseq.utils.test_utils import fastseq_test_main, TestCaseBase class BARTOptimizerTest(TestCaseBase): @@ -183,4 +183,4 @@ def test_beam_search_optimizer(self, if __name__ == "__main__": - absltest.main() + fastseq_test_main() diff --git a/tests/optimizer/transformers/test_t5_optimizer.py b/tests/optimizer/transformers/test_t5_optimizer.py index 9d38a913..43b155ab 100644 --- a/tests/optimizer/transformers/test_t5_optimizer.py +++ b/tests/optimizer/transformers/test_t5_optimizer.py @@ -12,7 +12,7 @@ import fastseq from fastseq.logging import get_logger -from fastseq.utils.test_utils import TestCaseBase +from fastseq.utils.test_utils import fastseq_test_main, TestCaseBase from transformers import (T5ForConditionalGeneration, T5Tokenizer) @@ -184,4 +184,4 @@ def test_beam_search_optimizer(self, if __name__ == "__main__": - absltest.main() + fastseq_test_main() diff --git a/tests/run_fairseq_tests.py b/tests/run_fairseq_tests.py index 980d87b6..2747c9f0 100644 --- a/tests/run_fairseq_tests.py +++ b/tests/run_fairseq_tests.py @@ -3,15 +3,19 @@ """ script for importing fairseq tests """ import glob -import sys -import os -import argparse +import io import logging +import os import shutil +import sys +import time import unittest + +import xmlrunner +from absl.testing import parameterized from git import Repo -from absl.testing import absltest, parameterized from pip._internal import main as pipmain +from xmlrunner.extra.xunit_plugin import transform FASTSEQ_PATH = os.sep.join(os.path.realpath(__file__).split('/')[0:-2]) FAIRSEQ_PATH = '/tmp/fairseq/' @@ -32,7 +36,7 @@ def clone_and_build_fairseq(self, repo, version): if os.path.isdir(FAIRSEQ_PATH): shutil.rmtree(FAIRSEQ_PATH) Repo.clone_from(FAIRSEQ_GIT_URL, FAIRSEQ_PATH, branch=version) - pipmain(['install', 'git+https://github.com/pytorch/fairseq.git@' + + pipmain(['install', 'git+https://github.com/pytorch/fairseq.git@' + version]) original_pythonpath = os.environ[ 'PYTHONPATH'] if 'PYTHONPATH' in os.environ else '' @@ -54,12 +58,9 @@ def get_test_suites(self, test_files_path, blocked_tests): return suites @parameterized.named_parameters({ - 'testcase_name': - 'Normal', - 'without_fastseq_opt': - False, - 'fairseq_version': - 'v0.9.0', + 'testcase_name': 'Normal', + 'without_fastseq_opt': False, + 'fairseq_version': 'v0.9.0', 'blocked_tests': [ 'test_binaries.py', 'test_bmuf.py', 'test_reproducibility.py'] }) @@ -67,14 +68,28 @@ def test_suites(self, without_fastseq_opt, fairseq_version, blocked_tests): """"run test suites""" self.clone_and_build_fairseq(FAIRSEQ_GIT_URL, fairseq_version) if not without_fastseq_opt: - import fastseq #pylint: disable=import-outside-toplevel + import fastseq # pylint: disable=import-outside-toplevel self.prepare_env() test_files_path = FAIRSEQ_PATH + '/tests/test_*.py' suites = self.get_test_suites(test_files_path, blocked_tests) test_suite = unittest.TestSuite(suites) test_runner = unittest.TextTestRunner() test_result = test_runner.run(test_suite) - assert len(test_result.errors) == 0 + assert len(test_result.errors) == 0 if __name__ == "__main__": - 
absltest.main() + log_xml_dir = os.getenv( + 'FASTSEQ_UNITTEST_LOG_XML_DIR', + os.path.join(os.getcwd(), 'tests', 'log_xml')) + os.makedirs(log_xml_dir, exist_ok=True) + suffix = '_' + time.strftime("%Y%m%d%H%M%S") + '.xml' + log_xml_file = __file__.replace(os.sep, '_').replace('.py', suffix) + log_xml_file = os.path.join(log_xml_dir, log_xml_file) + + out = io.BytesIO() + unittest.main( + testRunner=xmlrunner.XMLTestRunner(output=out), + failfast=False, buffer=False, catchbreak=False, exit=False) + with open(log_xml_file, 'wb') as report: + report.write(transform(out.getvalue())) + print("Save the log of fairseq unit tests into %s" % (log_xml_file)) diff --git a/tests/run_fairseq_tests.sh b/tests/run_fairseq_tests.sh index 23b12d88..bf4b4369 100755 --- a/tests/run_fairseq_tests.sh +++ b/tests/run_fairseq_tests.sh @@ -6,9 +6,12 @@ source ${ENV_PATH}/testing_env/bin/activate pip install gitpython pip install absl-py pip install packaging +pip install unittest-xml-reporting +pip install lxml cd ${FASTSEQ_TEST_PATH}/../ pip install torch==1.5.0+cu101 torchvision==0.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html +rm -rf build/ +rm ngram_repeat_block_cuda*.so pip install --editable . -cd tests -python run_fairseq_tests.py +python tests/run_fairseq_tests.py deactivate diff --git a/tests/run_fastseq_tests.sh b/tests/run_fastseq_tests.sh new file mode 100755 index 00000000..aff0ddc3 --- /dev/null +++ b/tests/run_fastseq_tests.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +for fastseq_py_test_file in $(find tests/ -name "test_*.py") +do + echo "Running $fastseq_py_test_file" + python $fastseq_py_test_file +done diff --git a/tests/run_transformers_tests.py b/tests/run_transformers_tests.py index cc23d8b0..ffa7e679 100644 --- a/tests/run_transformers_tests.py +++ b/tests/run_transformers_tests.py @@ -2,15 +2,18 @@ # Licensed under the MIT License. 
""" script for importing transformers tests """ -import glob -import sys +import io import os -import argparse -import logging import shutil +import sys +import time +import unittest + +import xmlrunner +from absl.testing import parameterized from git import Repo -from absl.testing import absltest, parameterized from pip._internal import main as pipmain +from xmlrunner.extra.xunit_plugin import transform FASTSEQ_PATH = os.sep.join(os.path.realpath(__file__).split('/')[0:-2]) TRANSFORMERS_PATH = '/tmp/transformers/' @@ -44,7 +47,10 @@ def clone_and_build_transformers(self, repo, version): 'testcase_name': 'Normal', 'without_fastseq_opt': False, 'transformers_version': 'v3.0.2', - 'blocked_tests': ['test_modeling_reformer.py'] + 'blocked_tests': ['modeling_reformer', + 'multigpu', + 'HfApiEndpoints' + ] }) def test_suites(self, without_fastseq_opt, transformers_version, blocked_tests): @@ -56,10 +62,26 @@ def test_suites(self, without_fastseq_opt, transformers_version, import pytest #pylint: disable=import-outside-toplevel self.prepare_env() os.chdir(TRANSFORMERS_PATH) - blocked_tests_string = (' not '+ - ' not '.join([test[5:-3] for test in blocked_tests])) - exit_code = pytest.main(['-sv', '-k'+blocked_tests_string, './tests/']) + blocked_tests_string = ( + ' and '.join([' not '+ test for test in blocked_tests])) + exit_code = pytest.main( + ['-sv', '-k' + blocked_tests_string, './tests/']) assert str(exit_code).strip() == 'ExitCode.OK' if __name__ == "__main__": - absltest.main() + log_xml_dir = os.getenv( + 'FASTSEQ_UNITTEST_LOG_XML_DIR', + os.path.join(os.getcwd(), 'tests', 'log_xml')) + os.makedirs(log_xml_dir, exist_ok=True) + suffix = '_' + time.strftime("%Y%m%d%H%M%S") + '.xml' + log_xml_file = __file__.replace(os.sep, '_').replace('.py', suffix) + log_xml_file = os.path.join(log_xml_dir, log_xml_file) + + out = io.BytesIO() + unittest.main( + testRunner=xmlrunner.XMLTestRunner(output=out), + failfast=False, buffer=False, catchbreak=False, exit=False) + with open(log_xml_file, 'wb') as report: + report.write(transform(out.getvalue())) + print( + "Save the log of transformers unit tests into %s" % (log_xml_file)) diff --git a/tests/run_transformers_tests.sh b/tests/run_transformers_tests.sh index be5c9db9..200635df 100644 --- a/tests/run_transformers_tests.sh +++ b/tests/run_transformers_tests.sh @@ -9,8 +9,11 @@ pip install packaging pip install pytest pip install timeout-decorator pip install torch torchvision +pip install unittest-xml-reporting +pip install lxml cd ${FASTSEQ_TEST_PATH}/../ +rm -rf build/ +rm ngram_repeat_block_cuda*.so pip install --editable . 
-cd tests -python run_transformers_tests.py +python tests/run_transformers_tests.py deactivate diff --git a/tests/utils/test_api_decorator.py b/tests/utils/test_api_decorator.py index a4d2a238..f07a250f 100644 --- a/tests/utils/test_api_decorator.py +++ b/tests/utils/test_api_decorator.py @@ -5,7 +5,7 @@ from absl.testing import absltest, parameterized from fastseq.utils.api_decorator import get_class, override_method, add_method, export_api, replace -from fastseq.utils.test_utils import TestCaseBase +from fastseq.utils.test_utils import fastseq_test_main, TestCaseBase class A: @@ -152,4 +152,4 @@ def name(self): if __name__ == "__main__": - absltest.main() + fastseq_test_main() diff --git a/tests/utils/test_file_utils.py b/tests/utils/test_file_utils.py index 2b48a305..bc94c1fd 100644 --- a/tests/utils/test_file_utils.py +++ b/tests/utils/test_file_utils.py @@ -8,7 +8,7 @@ from absl.testing import absltest, parameterized from fastseq.utils.file_utils import decompress_file, get_temp_dir, make_dirs, wget -from fastseq.utils.test_utils import TestCaseBase +from fastseq.utils.test_utils import fastseq_test_main, TestCaseBase class FileUtilsTest(TestCaseBase): @@ -90,4 +90,4 @@ def test_wget_and_decompress_file(self, tar_file_url, tar_file_name, if __name__ == "__main__": - absltest.main() + fastseq_test_main()
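Reviewer note: the new `fastseq_test_main()` helper and the `FASTSEQ_UNITTEST_LOG_XML_DIR` setting are what feed the `tests/log_xml/*.xml` glob published by the new `PublishTestResults@2` task in `azure-pipelines.yml`. Below is a minimal sketch of a test module wired into this flow; the module name is hypothetical, and only `TestCaseBase`, `fastseq_test_main`, and the environment variable come from this patch.

```python
# Hypothetical module, e.g. tests/utils/test_example.py (not part of this patch).
# fastseq_test_main() points absltest's xml_output_file at a timestamped report
# under FASTSEQ_UNITTEST_LOG_XML_DIR (default: tests/log_xml), which is the
# location the CI task publishes via the 'tests/log_xml/*.xml' glob.
from fastseq.utils.test_utils import TestCaseBase, fastseq_test_main


class ExampleTest(TestCaseBase):

    def test_addition(self):
        self.assertEqual(1 + 1, 2)


if __name__ == "__main__":
    fastseq_test_main()
```

Running the module directly, or letting `tests/run_fastseq_tests.sh` discover it through its `test_*.py` glob, drops the XML report into `tests/log_xml/`; setting `FASTSEQ_UNITTEST_LOG_XML_DIR` redirects it elsewhere.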