From bd3db82f1064f5ea1c481d1614970dcb3b9453d8 Mon Sep 17 00:00:00 2001 From: godweiyang Date: Mon, 19 Jul 2021 16:00:52 +0800 Subject: [PATCH 01/24] add python wrapper for ls transformer --- lightseq/training/__init__.py | 1 + lightseq/training/ops/pytorch/transformer.py | 200 +++++++++++++++++++ 2 files changed, 201 insertions(+) create mode 100644 lightseq/training/ops/pytorch/transformer.py diff --git a/lightseq/training/__init__.py b/lightseq/training/__init__.py index d075ad4f..5c623d0c 100644 --- a/lightseq/training/__init__.py +++ b/lightseq/training/__init__.py @@ -7,5 +7,6 @@ from lightseq.training.ops.pytorch.transformer_decoder_layer import ( LSTransformerDecoderLayer, ) +from lightseq.training.ops.pytorch.transformer import LSTransformer from lightseq.training.ops.pytorch.cross_entropy_layer import LSCrossEntropyLayer from lightseq.training.ops.pytorch.adam import LSAdam diff --git a/lightseq/training/ops/pytorch/transformer.py b/lightseq/training/ops/pytorch/transformer.py new file mode 100644 index 00000000..ee9a663b --- /dev/null +++ b/lightseq/training/ops/pytorch/transformer.py @@ -0,0 +1,200 @@ +from dataclasses import dataclass + +import torch +import torch.nn as nn + +from lightseq.training import LSTransformerEmbeddingLayer, LSTransformerEncoderLayer, LSTransformerDecoderLayer +from lightseq.training.ops.pytorch.util import MODEL_ARCH + + +class LSTransformer(nn.Module): + def __init__(self, config): + super(LSTransformer, self).__init__() + self.config = config + + print("Lightseq Transformer config is ", self.config.__dict__) + + if self.config.local_rank >= 0: + torch.cuda.set_device(self.config.local_rank) + + self.build_model(self.config) + + @staticmethod + def get_config(**kwargs): + @dataclass + class Config: + max_batch_tokens: int # max batch token numbers + max_seq_len: int # max sequence length + vocab_size: int # vocabulary size + padding_idx: int # index of padding token + num_encoder_layer: int # number of encoder layer + num_decoder_layer: int # number of decoder layer + hidden_size: int # size of transformer hidden layers + intermediate_size: int # size of ffn inner size + nhead: int # number of heads in attention + attn_prob_dropout_ratio: float # attention score dropout ratio + activation_dropout_ratio: float # ffn activation dropout ratio + hidden_dropout_ratio: float # dropout ration before residual + pre_layer_norm: bool # pre layer norm or post + fp16: bool # fp16 presion + local_rank: int # rank in local node + + if "model" in kwargs: + if kwargs["model"] not in MODEL_ARCH: + raise ValueError("{} architecture is not supported.") + MODEL_ARCH[kwargs["model"]](kwargs) + del kwargs["model"] + + return Config(**kwargs) + + def build_model(self, config): + encoder_embed_tokens = self.build_embedding(config) + decoder_embed_tokens = self.build_embedding(config) + + self.encoder = self.build_encoder(config, encoder_embed_tokens) + self.decoder = self.build_decoder(config, decoder_embed_tokens) + + def build_embedding(self, config): + emb_config = LSTransformerEmbeddingLayer.get_config( + vocab_size=config.vocab_size, + embedding_dim=config.hidden_size, + max_batch_tokens=config.max_batch_tokens, + max_seq_len=config.max_seq_len, + padding_idx=config.padding_idx, + dropout=config.hidden_dropout_ratio, + fp16=config.fp16, + local_rank=config.local_rank, + ) + emb = LSTransformerEmbeddingLayer(emb_config) + return emb + + def build_encoder(self, config, embed_tokens): + return LSTransformerEncoder(config, embed_tokens) + + def build_decoder(self, config, 
embed_tokens): + return LSTransformerDecoder(config, embed_tokens) + + def forward(self, src_tokens, trg_tokens): + encoder_out, encoder_padding_mask = self.encoder(src_tokens) + decoder_out = self.decoder(trg_tokens, encoder_out, encoder_padding_mask) + return decoder_out + + +class LSTransformerEncoder(nn.Module): + def __init__(self, config, embed_tokens): + super(LSTransformerEncoder, self).__init__() + self.config = config + + embed_dim = embed_tokens.config.embedding_dim + self.embed_tokens = embed_tokens + self.padding_idx = self.config.padding_idx + + self.layers = nn.ModuleList( + [self.build_encoder_layer(config) for _ in range(config.num_encoder_layer)] + ) + self.num_layers = len(self.layers) + + self.layer_norm = nn.LayerNorm(embed_dim) + + def build_encoder_layer(self, config): + enc_config = LSTransformerEncoderLayer.get_config( + max_batch_tokens=config.max_batch_tokens, + max_seq_len=config.max_seq_len, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + nhead=config.nhead, + attn_prob_dropout_ratio=config.attn_prob_dropout_ratio, + activation_dropout_ratio=config.activation_dropout_ratio, + hidden_dropout_ratio=config.hidden_dropout_ratio, + pre_layer_norm=config.pre_layer_norm, + fp16=config.fp16, + local_rank=config.local_rank, + ) + return LSTransformerEncoderLayer(enc_config) + + def forward_embedding(self, src_tokens): + x = self.embed_tokens(src_tokens) + return x + + def forward(self, src_tokens): + x = self.forward_embedding(src_tokens) + + encoder_padding_mask = src_tokens.eq(self.padding_idx) + + for layer in self.layers: + x = layer(x, encoder_padding_mask) + + x = self.layer_norm(x) + x = x.transpose(0, 1) + + return x, encoder_padding_mask + + +class LSTransformerDecoder(nn.Module): + def __init__(self, config, embed_tokens): + super(LSTransformerDecoder, self).__init__() + self.config = config + + embed_dim = embed_tokens.config.embedding_dim + self.embed_tokens = embed_tokens + self.padding_idx = self.config.padding_idx + + self.layers = nn.ModuleList( + [self.build_decoder_layer(config) for _ in range(config.num_decoder_layer)] + ) + self.num_layers = len(self.layers) + + self.layer_norm = nn.LayerNorm(embed_dim) + + self.output_projection = nn.Linear( + self.embed_tokens.embeddings.shape[1], + self.embed_tokens.embeddings.shape[0], + bias=False, + ) + self.output_projection.weight = self.embed_tokens.embeddings + + def build_decoder_layer(self, config): + dec_config = LSTransformerDecoderLayer.get_config( + max_batch_tokens=config.max_batch_tokens, + max_seq_len=config.max_seq_len, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + nhead=config.nhead, + attn_prob_dropout_ratio=config.attn_prob_dropout_ratio, + activation_dropout_ratio=config.activation_dropout_ratio, + hidden_dropout_ratio=config.hidden_dropout_ratio, + pre_layer_norm=config.pre_layer_norm, + fp16=config.fp16, + local_rank=config.local_rank, + nlayer=config.num_decoder_layer, + ) + return LSTransformerDecoderLayer(dec_config) + + def forward_embedding(self, trg_tokens, cache=None): + step = 0 + if cache is not None: + step = trg_tokens.size(1) - 1 + trg_tokens = trg_tokens[:, -1:] + + x = self.embed_tokens(trg_tokens, step) + return x + + def forward( + self, trg_tokens, encoder_out, encoder_padding_mask, cache=None + ): + x = self.forward_embedding( + trg_tokens, cache + ) + + for layer in self.layers: + x = layer( + x, + encoder_out, + encoder_padding_mask, + cache, + ) + + x = self.layer_norm(x) + + x = 
self.output_projection(x) + return x From c693ced6f43b69f3363c4c00580dd169ceb6a060 Mon Sep 17 00:00:00 2001 From: godweiyang Date: Mon, 19 Jul 2021 20:33:49 +0800 Subject: [PATCH 02/24] add default activation_fn config --- lightseq/training/ops/pytorch/transformer.py | 4 +++- lightseq/training/ops/pytorch/util.py | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/lightseq/training/ops/pytorch/transformer.py b/lightseq/training/ops/pytorch/transformer.py index ee9a663b..3827cd33 100644 --- a/lightseq/training/ops/pytorch/transformer.py +++ b/lightseq/training/ops/pytorch/transformer.py @@ -36,6 +36,7 @@ class Config: activation_dropout_ratio: float # ffn activation dropout ratio hidden_dropout_ratio: float # dropout ration before residual pre_layer_norm: bool # pre layer norm or post + activation_fn: str # relu or gelu fp16: bool # fp16 presion local_rank: int # rank in local node @@ -107,6 +108,7 @@ def build_encoder_layer(self, config): activation_dropout_ratio=config.activation_dropout_ratio, hidden_dropout_ratio=config.hidden_dropout_ratio, pre_layer_norm=config.pre_layer_norm, + activation_fn=config.activation_fn, fp16=config.fp16, local_rank=config.local_rank, ) @@ -164,6 +166,7 @@ def build_decoder_layer(self, config): activation_dropout_ratio=config.activation_dropout_ratio, hidden_dropout_ratio=config.hidden_dropout_ratio, pre_layer_norm=config.pre_layer_norm, + activation_fn=config.activation_fn, fp16=config.fp16, local_rank=config.local_rank, nlayer=config.num_decoder_layer, @@ -175,7 +178,6 @@ def forward_embedding(self, trg_tokens, cache=None): if cache is not None: step = trg_tokens.size(1) - 1 trg_tokens = trg_tokens[:, -1:] - x = self.embed_tokens(trg_tokens, step) return x diff --git a/lightseq/training/ops/pytorch/util.py b/lightseq/training/ops/pytorch/util.py index 16bdafc8..a9612d8b 100644 --- a/lightseq/training/ops/pytorch/util.py +++ b/lightseq/training/ops/pytorch/util.py @@ -13,6 +13,7 @@ def base_architecture(args): args.setdefault("activation_dropout_ratio", 0.0) args.setdefault("hidden_dropout_ratio", 0.1) args.setdefault("pre_layer_norm", True) + args.setdefault("activation_fn", "relu") def transformer_base(args): @@ -35,11 +36,13 @@ def bert_base(args): args.setdefault("attn_prob_dropout_ratio", 0.1) args.setdefault("activation_dropout_ratio", 0.1) args.setdefault("pre_layer_norm", False) + args.setdefault("activation_fn", "gelu") base_architecture(args) def bert_big(args): args.setdefault("pre_layer_norm", False) + args.setdefault("activation_fn", "gelu") transformer_big(args) From f8e57c46b6be7cc4baf962bf9a6ab059585f5a87 Mon Sep 17 00:00:00 2001 From: godweiyang Date: Mon, 19 Jul 2021 21:44:27 +0800 Subject: [PATCH 03/24] add demo example using pure ls layers --- examples/training/custom/demo.py | 70 ++++++++++++++++++++ lightseq/training/ops/pytorch/transformer.py | 14 ++-- 2 files changed, 77 insertions(+), 7 deletions(-) create mode 100644 examples/training/custom/demo.py diff --git a/examples/training/custom/demo.py b/examples/training/custom/demo.py new file mode 100644 index 00000000..b8c5fdf6 --- /dev/null +++ b/examples/training/custom/demo.py @@ -0,0 +1,70 @@ +import torch + +from lightseq.training import LSTransformer, LSCrossEntropyLayer, LSAdam + +vocab_size, padding_idx = 1000, 0 +batch_size, src_seq_len, trg_seq_len = 6, 10, 15 + +def create_data(): + src_tokens = torch.randint(padding_idx, vocab_size, (batch_size, src_seq_len), dtype=torch.long, device=torch.device("cuda:0")) + trg_tokens = torch.randint(padding_idx, 
vocab_size, (batch_size, trg_seq_len), dtype=torch.long, device=torch.device("cuda:0")) + target = trg_tokens.clone()[:, 1:] + eos = torch.zeros((batch_size, 1), dtype=torch.long, device=torch.device("cuda:0")) + target = torch.cat([target, eos], dim=-1) + return src_tokens, trg_tokens, target + +def create_model(): + transformer_config = LSTransformer.get_config( + model="transformer-base", + max_batch_tokens=4096, + max_seq_len=256, + vocab_size=vocab_size, + padding_idx=padding_idx, + num_encoder_layer=6, + num_decoder_layer=6, + fp16=True, + local_rank=0 + ) + model = LSTransformer(transformer_config) + model.to(dtype=torch.half, device=torch.device("cuda:0")) + return model + +def create_criterion(): + ce_config = LSCrossEntropyLayer.get_config( + max_batch_tokens=4096, + padding_idx=padding_idx, + epsilon=0.0, + fp16=True, + local_rank=0 + ) + loss_fn = LSCrossEntropyLayer(ce_config) + loss_fn.to(dtype=torch.half, device=torch.device("cuda:0")) + return loss_fn + +if __name__ == "__main__": + src_tokens, trg_tokens, target = create_data() + model = create_model() + loss_fn = create_criterion() + opt = LSAdam(model.parameters(), lr=1e-5) + + print("========================TRAIN========================") + model.train() + for epoch in range(2000): + output = model(src_tokens, trg_tokens) + loss, _ = loss_fn(output, target) + if epoch % 200 == 0: + print("epoch {:03d}: {:.3f}".format(epoch, loss.item())) + loss.backward() + opt.step() + + print("========================TEST========================") + model.eval() + encoder_out, encoder_padding_mask = model.encoder(src_tokens) + predict_tokens = trg_tokens[:, :1] + for _ in range(trg_seq_len): + output = model.decoder(predict_tokens, encoder_out, encoder_padding_mask) + output = torch.reshape(torch.argmax(output, dim=-1), (batch_size, -1)) + predict_tokens = torch.cat([predict_tokens, output[:, -1:]], dim=-1) + predict_tokens = predict_tokens[:, 1:] + print("target:\n", target) + print("predict_tokens:\n", predict_tokens) \ No newline at end of file diff --git a/lightseq/training/ops/pytorch/transformer.py b/lightseq/training/ops/pytorch/transformer.py index 3827cd33..3d174bc8 100644 --- a/lightseq/training/ops/pytorch/transformer.py +++ b/lightseq/training/ops/pytorch/transformer.py @@ -3,7 +3,11 @@ import torch import torch.nn as nn -from lightseq.training import LSTransformerEmbeddingLayer, LSTransformerEncoderLayer, LSTransformerDecoderLayer +from lightseq.training import ( + LSTransformerEmbeddingLayer, + LSTransformerEncoderLayer, + LSTransformerDecoderLayer, +) from lightseq.training.ops.pytorch.util import MODEL_ARCH @@ -181,12 +185,8 @@ def forward_embedding(self, trg_tokens, cache=None): x = self.embed_tokens(trg_tokens, step) return x - def forward( - self, trg_tokens, encoder_out, encoder_padding_mask, cache=None - ): - x = self.forward_embedding( - trg_tokens, cache - ) + def forward(self, trg_tokens, encoder_out, encoder_padding_mask, cache=None): + x = self.forward_embedding(trg_tokens, cache) for layer in self.layers: x = layer( From 3d490aaec11d1ca71eb5206747da507e281285d8 Mon Sep 17 00:00:00 2001 From: godweiyang Date: Tue, 20 Jul 2021 01:00:17 +0800 Subject: [PATCH 04/24] add new line --- examples/training/custom/demo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/training/custom/demo.py b/examples/training/custom/demo.py index b8c5fdf6..0f5f630d 100644 --- a/examples/training/custom/demo.py +++ b/examples/training/custom/demo.py @@ -67,4 +67,4 @@ def create_criterion(): predict_tokens = 
torch.cat([predict_tokens, output[:, -1:]], dim=-1) predict_tokens = predict_tokens[:, 1:] print("target:\n", target) - print("predict_tokens:\n", predict_tokens) \ No newline at end of file + print("predict_tokens:\n", predict_tokens) From 6b69495c4622d454a4e0d4b385f2ca62dcee2870 Mon Sep 17 00:00:00 2001 From: godweiyang Date: Tue, 20 Jul 2021 01:03:56 +0800 Subject: [PATCH 05/24] format demo.py --- examples/training/custom/demo.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/examples/training/custom/demo.py b/examples/training/custom/demo.py index 0f5f630d..ce9552e2 100644 --- a/examples/training/custom/demo.py +++ b/examples/training/custom/demo.py @@ -5,14 +5,28 @@ vocab_size, padding_idx = 1000, 0 batch_size, src_seq_len, trg_seq_len = 6, 10, 15 + def create_data(): - src_tokens = torch.randint(padding_idx, vocab_size, (batch_size, src_seq_len), dtype=torch.long, device=torch.device("cuda:0")) - trg_tokens = torch.randint(padding_idx, vocab_size, (batch_size, trg_seq_len), dtype=torch.long, device=torch.device("cuda:0")) + src_tokens = torch.randint( + padding_idx, + vocab_size, + (batch_size, src_seq_len), + dtype=torch.long, + device=torch.device("cuda:0"), + ) + trg_tokens = torch.randint( + padding_idx, + vocab_size, + (batch_size, trg_seq_len), + dtype=torch.long, + device=torch.device("cuda:0"), + ) target = trg_tokens.clone()[:, 1:] eos = torch.zeros((batch_size, 1), dtype=torch.long, device=torch.device("cuda:0")) target = torch.cat([target, eos], dim=-1) return src_tokens, trg_tokens, target + def create_model(): transformer_config = LSTransformer.get_config( model="transformer-base", @@ -23,24 +37,26 @@ def create_model(): num_encoder_layer=6, num_decoder_layer=6, fp16=True, - local_rank=0 + local_rank=0, ) model = LSTransformer(transformer_config) model.to(dtype=torch.half, device=torch.device("cuda:0")) return model + def create_criterion(): ce_config = LSCrossEntropyLayer.get_config( max_batch_tokens=4096, padding_idx=padding_idx, epsilon=0.0, fp16=True, - local_rank=0 + local_rank=0, ) loss_fn = LSCrossEntropyLayer(ce_config) loss_fn.to(dtype=torch.half, device=torch.device("cuda:0")) return loss_fn + if __name__ == "__main__": src_tokens, trg_tokens, target = create_data() model = create_model() From 54fc32015f24e45b162b4208286f96f0f36d2ae2 Mon Sep 17 00:00:00 2001 From: godweiyang Date: Tue, 20 Jul 2021 10:53:50 +0800 Subject: [PATCH 06/24] fix embedding block allocation bug --- .../csrc/kernels/embedding_kernels.cu | 44 ++++++++++--------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/lightseq/training/csrc/kernels/embedding_kernels.cu b/lightseq/training/csrc/kernels/embedding_kernels.cu index 8b181aab..74f17354 100644 --- a/lightseq/training/csrc/kernels/embedding_kernels.cu +++ b/lightseq/training/csrc/kernels/embedding_kernels.cu @@ -10,8 +10,8 @@ lookup table, scale, add position embedding and dropout. 
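A word on the launch-geometry change in the hunk below (and in the matching backward-kernel hunk later in this patch): the old code sized the thread block with a ceiling division, tokens_per_thread = ceil(MAX_THREADS / embedding_dim), so for per-token widths smaller than MAX_THREADS that do not divide it evenly the requested block holds more than MAX_THREADS threads and the kernel launch is invalid. The patch floors instead, tokens_per_block = MAX_THREADS / threads_per_token, which can never overshoot. A minimal Python sketch of the two schemes, assuming MAX_THREADS is 1024 (the usual CUDA per-block limit); the helper names and the example numbers are illustrative only, not part of the patch:

MAX_THREADS = 1024  # assumed per-block thread budget

def old_launch_dims(seq_len, embedding_dim):
    # Ceiling division: block_x * block_y can exceed MAX_THREADS.
    tokens_per_thread = (MAX_THREADS + embedding_dim - 1) // embedding_dim
    threads_per_seq = (seq_len + tokens_per_thread - 1) // tokens_per_thread
    return threads_per_seq, tokens_per_thread, min(embedding_dim, MAX_THREADS)

def new_launch_dims(seq_len, embedding_dim):
    # Floor division: block_x * block_y never exceeds MAX_THREADS.
    threads_per_token = min(embedding_dim, MAX_THREADS)
    tokens_per_block = MAX_THREADS // threads_per_token
    blocks_per_seq = (seq_len + tokens_per_block - 1) // tokens_per_block
    return blocks_per_seq, tokens_per_block, threads_per_token

# embedding_dim = 192 lanes (e.g. hidden size 768 after the fp32 "embedding_dim >>= 2" shift):
# old scheme -> block of 6 * 192 = 1152 threads (over the limit),
# new scheme -> block of 5 * 192 = 960 threads (valid).
print(old_launch_dims(50, 192))  # (9, 6, 192)
print(new_launch_dims(50, 192))  # (10, 5, 192)

For widths that divide MAX_THREADS exactly (128 or 256 lanes, i.e. hidden sizes like 512 or 1024) the two formulas agree, which is why only some model shapes, such as hidden size 768, trip the old configuration.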
@thread gridDim.x = batch_size -gridDim.y = threads_per_seq -blockDim.x = tokens_per_thread +gridDim.y = blocks_per_seq +blockDim.x = tokens_per_block blockDim.y = min(embedding_dim, MAX_THREADS) @param @@ -178,10 +178,11 @@ void launch_lookup_scale_pos_dropout( float emb_scale = sqrt(embedding_dim); embedding_dim >>= 2; - int tokens_per_thread = (MAX_THREADS + embedding_dim - 1) / embedding_dim; - int threads_per_seq = (seq_len + tokens_per_thread - 1) / tokens_per_thread; - dim3 grid_dim(batch_size, threads_per_seq); - dim3 block_dim(tokens_per_thread, min(embedding_dim, MAX_THREADS)); + int threads_per_token = min(embedding_dim, MAX_THREADS); + int tokens_per_block = MAX_THREADS / threads_per_token; + int blocks_per_seq = (seq_len + tokens_per_block - 1) / tokens_per_block; + dim3 grid_dim(batch_size, blocks_per_seq); + dim3 block_dim(tokens_per_block, threads_per_token); int seed = std::chrono::duration_cast( std::chrono::system_clock::now().time_since_epoch()) .count(); @@ -200,10 +201,11 @@ void launch_lookup_scale_pos_dropout<__half>( float emb_scale = sqrt(embedding_dim); embedding_dim >>= 3; - int tokens_per_thread = (MAX_THREADS + embedding_dim - 1) / embedding_dim; - int threads_per_seq = (seq_len + tokens_per_thread - 1) / tokens_per_thread; - dim3 grid_dim(batch_size, threads_per_seq); - dim3 block_dim(tokens_per_thread, min(embedding_dim, MAX_THREADS)); + int threads_per_token = min(embedding_dim, MAX_THREADS); + int tokens_per_block = MAX_THREADS / threads_per_token; + int blocks_per_seq = (seq_len + tokens_per_block - 1) / tokens_per_block; + dim3 grid_dim(batch_size, blocks_per_seq); + dim3 block_dim(tokens_per_block, threads_per_token); int seed = std::chrono::duration_cast( std::chrono::system_clock::now().time_since_epoch()) .count(); @@ -219,8 +221,8 @@ backward of embedding layer in fairseq. 
@thread gridDim.x = batch_size -gridDim.y = threads_per_seq -blockDim.x = tokens_per_thread +gridDim.y = blocks_per_seq +blockDim.x = tokens_per_block blockDim.y = min(embedding_dim, MAX_THREADS) @param @@ -360,10 +362,11 @@ void launch_d_lookup_scale_pos_dropout( zero_grads <<>>(grad_embeddings, total_nums); - int tokens_per_thread = (MAX_THREADS + embedding_dim - 1) / embedding_dim; - int threads_per_seq = (seq_len + tokens_per_thread - 1) / tokens_per_thread; - dim3 grid_dim(batch_size, threads_per_seq); - dim3 block_dim(tokens_per_thread, min(embedding_dim, MAX_THREADS)); + int threads_per_token = min(embedding_dim, MAX_THREADS); + int tokens_per_block = MAX_THREADS / threads_per_token; + int blocks_per_seq = (seq_len + tokens_per_block - 1) / tokens_per_block; + dim3 grid_dim(batch_size, blocks_per_seq); + dim3 block_dim(tokens_per_block, threads_per_token); d_lookup_scale_pos_dropout<<>>( grad_embeddings, grad_output, input, dropout_mask, seq_len, embedding_dim, @@ -386,10 +389,11 @@ void launch_d_lookup_scale_pos_dropout<__half>( zero_grads<__half> <<>>(grad_embeddings, total_nums); - int tokens_per_thread = (MAX_THREADS + embedding_dim - 1) / embedding_dim; - int threads_per_seq = (seq_len + tokens_per_thread - 1) / tokens_per_thread; - dim3 grid_dim(batch_size, threads_per_seq); - dim3 block_dim(tokens_per_thread, min(embedding_dim, MAX_THREADS)); + int threads_per_token = min(embedding_dim, MAX_THREADS); + int tokens_per_block = MAX_THREADS / threads_per_token; + int blocks_per_seq = (seq_len + tokens_per_block - 1) / tokens_per_block; + dim3 grid_dim(batch_size, blocks_per_seq); + dim3 block_dim(tokens_per_block, threads_per_token); d_lookup_scale_pos_dropout<__half><<>>( grad_embeddings, grad_output, input, dropout_mask, seq_len, embedding_dim, From d8022338fabbf720790649f9ea0a4d1ee4a4a4f8 Mon Sep 17 00:00:00 2001 From: godweiyang Date: Tue, 20 Jul 2021 16:49:01 +0800 Subject: [PATCH 07/24] polish unit test code --- tests/fairseq_layers.py | 306 +++++--------- tests/gen_test_layers.py | 223 +++++++++++ tests/test_ls_ops.py | 832 ++++++++++----------------------------- tests/util.py | 130 +++--- 4 files changed, 586 insertions(+), 905 deletions(-) create mode 100644 tests/gen_test_layers.py diff --git a/tests/fairseq_layers.py b/tests/fairseq_layers.py index 45026dc7..92d9e50e 100644 --- a/tests/fairseq_layers.py +++ b/tests/fairseq_layers.py @@ -4,9 +4,8 @@ We use layers from Facebook Fairseq as our baseline for unit test """ -from typing import Dict, List, Optional, Callable +from typing import Dict, List, Optional import math -from copy import deepcopy import torch import torch.nn as nn @@ -18,7 +17,7 @@ from torch import Tensor -class TransformerEncoderLayer(nn.Module): +class FSTransformerEncoderLayer(nn.Module): """Encoder layer implemented by fairseq. This version only removes the "args" parameter, no other changes @@ -165,128 +164,7 @@ def forward(self, x, encoder_padding_mask, attn_mask: Optional[Tensor] = None): return x -class TransformerSentenceEncoderLayer(nn.Module): - """ - Implements a Transformer Encoder Layer used in BERT/XLM style pre-trained - models. 
- """ - - def __init__( - self, - embedding_dim: int = 768, - ffn_embedding_dim: int = 3072, - num_attention_heads: int = 8, - dropout: float = 0.1, - attention_dropout: float = 0.1, - activation_dropout: float = 0.1, - activation_fn: str = "relu", - export: bool = False, - q_noise: float = 0.0, - qn_block_size: int = 8, - init_fn: Callable = None, - ) -> None: - super().__init__() - - if init_fn is not None: - init_fn() - - # Initialize parameters - self.embedding_dim = embedding_dim - self.dropout_module = FairseqDropout( - dropout, module_name=self.__class__.__name__ - ) - self.activation_dropout_module = FairseqDropout( - activation_dropout, module_name=self.__class__.__name__ - ) - - # Initialize blocks - self.activation_fn = utils.get_activation_fn(activation_fn) - self.self_attn = self.build_self_attention( - self.embedding_dim, - num_attention_heads, - dropout=attention_dropout, - self_attention=True, - q_noise=q_noise, - qn_block_size=qn_block_size, - ) - - # layer norm associated with the self attention layer - self.self_attn_layer_norm = LayerNorm(self.embedding_dim, export=export) - - self.fc1 = self.build_fc1( - self.embedding_dim, - ffn_embedding_dim, - q_noise=q_noise, - qn_block_size=qn_block_size, - ) - self.fc2 = self.build_fc2( - ffn_embedding_dim, - self.embedding_dim, - q_noise=q_noise, - qn_block_size=qn_block_size, - ) - - # layer norm associated with the position wise feed-forward NN - self.final_layer_norm = LayerNorm(self.embedding_dim, export=export) - - def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size): - return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size) - - def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size): - return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size) - - def build_self_attention( - self, - embed_dim, - num_attention_heads, - dropout, - self_attention, - q_noise, - qn_block_size, - ): - return MultiheadAttention( - embed_dim, - num_attention_heads, - dropout=dropout, - self_attention=True, - q_noise=q_noise, - qn_block_size=qn_block_size, - ) - - def forward( - self, - x: torch.Tensor, - self_attn_mask: Optional[torch.Tensor] = None, - self_attn_padding_mask: Optional[torch.Tensor] = None, - ): - """ - LayerNorm is applied either before or after the self-attention/ffn - modules similar to the original Transformer implementation. - """ - residual = x - x, attn = self.self_attn( - query=x, - key=x, - value=x, - key_padding_mask=self_attn_padding_mask, - need_weights=False, - attn_mask=self_attn_mask, - ) - x = self.dropout_module(x) - x = residual + x - x = self.self_attn_layer_norm(x) - - residual = x - x = self.activation_fn(self.fc1(x)) - x = self.activation_dropout_module(x) - x = self.fc2(x) - x = self.dropout_module(x) - x = residual + x - x = self.final_layer_norm(x) - return x, attn - - -class TransformerDecoderLayer(nn.Module): +class FSTransformerDecoderLayer(nn.Module): """Decoder layer implemented by fairseq. 
This version only removes the "args" parameter, no other changes """ @@ -544,72 +422,6 @@ def make_generation_fast_(self, need_attn: bool = False, **kwargs): self.need_attn = need_attn -def generate_enc_layer(): - hidden_size = 1024 - intermediate_size = 1024 * 4 - heads = 16 - hidden_dropout_ratio = 0.0 - attn_dropout_ratio = 0.0 - activation_dropout_ratio = 0.0 - pre_layer_norm = True - layer = TransformerEncoderLayer( - hidden_size, - intermediate_size, - heads, - hidden_dropout_ratio, - attn_dropout_ratio, - activation_dropout_ratio, - pre_layer_norm, - activation_fn="relu", - ) - layer.to(torch.device("cuda:0"), dtype=torch.half) - return layer - - -def generate_dec_layer(): - hidden_size = 1024 - intermediate_size = 1024 * 4 - heads = 16 - hidden_dropout_ratio = 0.0 - attn_dropout_ratio = 0.0 - activation_dropout_ratio = 0.0 - pre_layer_norm = True - layer = TransformerDecoderLayer( - embed_dim=hidden_size, - ffn_embed_dim=intermediate_size, - nhead=heads, - encoder_embed_dim=hidden_size, - dropout=hidden_dropout_ratio, - attn_dropout=attn_dropout_ratio, - activation_dropout=activation_dropout_ratio, - normalize_before=pre_layer_norm, - activation_fn="relu", - ) - - layer.to(torch.device("cuda:0"), dtype=torch.half) - return layer - - -def generate_bert_enc_layer(): - hidden_size = 1024 - intermediate_size = 1024 * 4 - heads = 16 - hidden_dropout_ratio = 0.0 - attn_dropout_ratio = 0.0 - activation_dropout_ratio = 0.0 - layer = TransformerSentenceEncoderLayer( - hidden_size, - intermediate_size, - heads, - hidden_dropout_ratio, - attn_dropout_ratio, - activation_dropout_ratio, - activation_fn="gelu", - ) - layer.to(torch.device("cuda:0")) - return layer - - class SinusoidalPositionalEmbedding(nn.Module): """This module produces sinusoidal positional embeddings of any length. 
@@ -674,7 +486,7 @@ def forward( ).detach() -class TransformerEmbeddingLayer(nn.Module): +class FSTransformerEmbeddingLayer(nn.Module): def __init__( self, vocab_size, embedding_dim, max_seq_len, padding_idx, dropout, fp16 ): @@ -703,21 +515,97 @@ def forward(self, input): return x -def generate_emb_layer(ls_emb_config): - layer = TransformerEmbeddingLayer( - ls_emb_config.vocab_size, - ls_emb_config.embedding_dim, - ls_emb_config.max_seq_len, - ls_emb_config.padding_idx, - ls_emb_config.dropout, - ls_emb_config.fp16, - ) - dtype = torch.float16 if ls_emb_config.fp16 else torch.float32 - layer.to(torch.device("cuda:0"), dtype=dtype) - - return layer - +class FSCrossEntropyLayer(nn.Module): + def __init__(self, epsilon, ignore_index): + super().__init__() -if __name__ == "__main__": - generate_enc_layer() - generate_dec_layer() + self.epsilon = epsilon + self.ignore_index = ignore_index + + def label_smoothed_nll_loss(self, lprobs, target, reduce=True): + if target.dim() == lprobs.dim() - 1: + target = target.unsqueeze(-1) + nll_loss = -lprobs.gather(dim=-1, index=target) + smooth_loss = -lprobs.sum(dim=-1, keepdim=True) + if self.ignore_index is not None: + pad_mask = target.eq(self.ignore_index) + nll_loss.masked_fill_(pad_mask, 0.0) + smooth_loss.masked_fill_(pad_mask, 0.0) + else: + nll_loss = nll_loss.squeeze(-1) + smooth_loss = smooth_loss.squeeze(-1) + if reduce: + nll_loss = nll_loss.sum() + smooth_loss = smooth_loss.sum() + eps_i = self.epsilon / (lprobs.size(-1) - 1) + loss = (1.0 - self.epsilon - eps_i) * nll_loss + eps_i * smooth_loss + return loss, nll_loss + + def forward(self, inputs, targets): + x = torch.nn.functional.log_softmax(inputs, dim=-1, dtype=torch.float32) + loss, nll_loss = self.label_smoothed_nll_loss(x, targets) + loss = loss.to(inputs) + nll_loss = nll_loss.to(inputs) + + return loss, nll_loss + + +def get_fairseq_enc_params(fairseq_layer): + initial_weights = [] + initial_biases = [] + + initial_weights.append(fairseq_layer.self_attn.q_proj.weight.detach().clone()) + initial_biases.append(fairseq_layer.self_attn.q_proj.bias.detach().clone()) + initial_weights.append(fairseq_layer.self_attn.k_proj.weight.detach().clone()) + initial_biases.append(fairseq_layer.self_attn.k_proj.bias.detach().clone()) + initial_weights.append(fairseq_layer.self_attn.v_proj.weight.detach().clone()) + initial_biases.append(fairseq_layer.self_attn.v_proj.bias.detach().clone()) + initial_weights.append(fairseq_layer.self_attn.out_proj.weight.detach().clone()) + initial_biases.append(fairseq_layer.self_attn.out_proj.bias.detach().clone()) + initial_weights.append(fairseq_layer.self_attn_layer_norm.weight.detach().clone()) + initial_biases.append(fairseq_layer.self_attn_layer_norm.bias.detach().clone()) + + initial_weights.append(fairseq_layer.fc1.weight.detach().clone()) + initial_biases.append(fairseq_layer.fc1.bias.detach().clone()) + initial_weights.append(fairseq_layer.fc2.weight.detach().clone()) + initial_biases.append(fairseq_layer.fc2.bias.detach().clone()) + initial_weights.append(fairseq_layer.final_layer_norm.weight.detach().clone()) + initial_biases.append(fairseq_layer.final_layer_norm.bias.detach().clone()) + return initial_weights, initial_biases + + +def get_fairseq_dec_params(fairseq_layer): + initial_weights = [] + initial_biases = [] + + initial_weights.append(fairseq_layer.self_attn.q_proj.weight.detach().clone()) + initial_biases.append(fairseq_layer.self_attn.q_proj.bias.detach().clone()) + 
initial_weights.append(fairseq_layer.self_attn.k_proj.weight.detach().clone()) + initial_biases.append(fairseq_layer.self_attn.k_proj.bias.detach().clone()) + initial_weights.append(fairseq_layer.self_attn.v_proj.weight.detach().clone()) + initial_biases.append(fairseq_layer.self_attn.v_proj.bias.detach().clone()) + initial_weights.append(fairseq_layer.self_attn.out_proj.weight.detach().clone()) + initial_biases.append(fairseq_layer.self_attn.out_proj.bias.detach().clone()) + initial_weights.append(fairseq_layer.self_attn_layer_norm.weight.detach().clone()) + initial_biases.append(fairseq_layer.self_attn_layer_norm.bias.detach().clone()) + + initial_weights.append(fairseq_layer.encodec_attn.q_proj.weight.detach().clone()) + initial_biases.append(fairseq_layer.encodec_attn.q_proj.bias.detach().clone()) + initial_weights.append(fairseq_layer.encodec_attn.k_proj.weight.detach().clone()) + initial_biases.append(fairseq_layer.encodec_attn.k_proj.bias.detach().clone()) + initial_weights.append(fairseq_layer.encodec_attn.v_proj.weight.detach().clone()) + initial_biases.append(fairseq_layer.encodec_attn.v_proj.bias.detach().clone()) + initial_weights.append(fairseq_layer.encodec_attn.out_proj.weight.detach().clone()) + initial_biases.append(fairseq_layer.encodec_attn.out_proj.bias.detach().clone()) + initial_weights.append( + fairseq_layer.encodec_attn_layer_norm.weight.detach().clone() + ) + initial_biases.append(fairseq_layer.encodec_attn_layer_norm.bias.detach().clone()) + + initial_weights.append(fairseq_layer.fc1.weight.detach().clone()) + initial_biases.append(fairseq_layer.fc1.bias.detach().clone()) + initial_weights.append(fairseq_layer.fc2.weight.detach().clone()) + initial_biases.append(fairseq_layer.fc2.bias.detach().clone()) + initial_weights.append(fairseq_layer.final_layer_norm.weight.detach().clone()) + initial_biases.append(fairseq_layer.final_layer_norm.bias.detach().clone()) + return initial_weights, initial_biases diff --git a/tests/gen_test_layers.py b/tests/gen_test_layers.py new file mode 100644 index 00000000..40651ffd --- /dev/null +++ b/tests/gen_test_layers.py @@ -0,0 +1,223 @@ +import torch + +from tests.fairseq_layers import ( + FSTransformerEncoderLayer, + FSTransformerDecoderLayer, + FSTransformerEmbeddingLayer, + FSCrossEntropyLayer, + get_fairseq_enc_params, + get_fairseq_dec_params, +) +from lightseq.training import ( + LSTransformerEncoderLayer, + LSTransformerEmbeddingLayer, + LSCrossEntropyLayer, +) +from examples.training.fairseq.fs_modules.ls_fs_transformer_decoder_layer import ( + LSFSTransformerDecoderLayer, +) + + +###################### encoder layer ###################### +def gen_enc_layer(global_config): + def gen_ls_enc_layer(initial_weights=None, initial_biases=None): + config = LSTransformerEncoderLayer.get_config( + max_batch_tokens=global_config.max_batch_tokens, + max_seq_len=global_config.max_seq_len, + hidden_size=global_config.hidden_size, + intermediate_size=global_config.intermediate_size, + nhead=global_config.nhead, + attn_prob_dropout_ratio=global_config.attn_prob_dropout_ratio, + activation_dropout_ratio=global_config.activation_dropout_ratio, + hidden_dropout_ratio=global_config.hidden_dropout_ratio, + pre_layer_norm=global_config.pre_layer_norm, + fp16=global_config.fp16, + local_rank=global_config.local_rank, + activation_fn=global_config.activation_fn, + ) + layer = LSTransformerEncoderLayer( + config, + initial_weights, + initial_biases + ) + layer.to(torch.device("cuda:{}".format(global_config.local_rank))) + layer.train() + return 
layer + + def gen_fs_enc_layer(): + layer = FSTransformerEncoderLayer( + embed_dim=global_config.hidden_size, + ffn_embed_dim=global_config.intermediate_size, + nhead=global_config.nhead, + dropout=global_config.hidden_dropout_ratio, + attn_dropout=global_config.attn_prob_dropout_ratio, + activation_dropout=global_config.activation_dropout_ratio, + normalize_before=global_config.pre_layer_norm, + activation_fn=global_config.activation_fn, + ) + layer.to(torch.device("cuda:{}".format(global_config.local_rank))) + layer.train() + return layer + + custom_enc_layer_list = [] + fairseq_enc_layer_list = [] + + for _ in range(global_config.num_layers): + fairseq_enc_layer = gen_fs_enc_layer() + initial_enc_weights, initial_enc_biases = get_fairseq_enc_params(fairseq_enc_layer) + custom_enc_layer = gen_ls_enc_layer(initial_enc_weights, initial_enc_biases) + custom_enc_layer_list.append(custom_enc_layer) + fairseq_enc_layer_list.append(fairseq_enc_layer) + + return torch.nn.ModuleList(custom_enc_layer_list), torch.nn.ModuleList(fairseq_enc_layer_list) + + +###################### decoder layer ###################### +def gen_dec_layer(global_config): + def gen_ls_dec_layer(initial_weights=None, initial_biases=None): + config = LSFSTransformerDecoderLayer.get_config( + max_batch_tokens=global_config.max_batch_tokens, + max_seq_len=global_config.max_seq_len, + hidden_size=global_config.hidden_size, + intermediate_size=global_config.intermediate_size, + nhead=global_config.nhead, + attn_prob_dropout_ratio=global_config.attn_prob_dropout_ratio, + activation_dropout_ratio=global_config.activation_dropout_ratio, + hidden_dropout_ratio=global_config.hidden_dropout_ratio, + pre_layer_norm=global_config.pre_layer_norm, + fp16=global_config.fp16, + local_rank=global_config.local_rank, + nlayer=global_config.num_layers, + activation_fn=global_config.activation_fn, + ) + layer = LSFSTransformerDecoderLayer( + config, + initial_weights, + initial_biases, + ) + layer.to(torch.device("cuda:{}".format(global_config.local_rank))) + layer.train() + return layer + + def gen_fs_dec_layer(): + layer = FSTransformerDecoderLayer( + embed_dim=global_config.hidden_size, + ffn_embed_dim=global_config.intermediate_size, + nhead=global_config.nhead, + encoder_embed_dim=global_config.hidden_size, + dropout=global_config.hidden_dropout_ratio, + attn_dropout=global_config.attn_prob_dropout_ratio, + activation_dropout=global_config.activation_dropout_ratio, + normalize_before=global_config.pre_layer_norm, + activation_fn=global_config.activation_fn, + ) + layer.to(torch.device("cuda:{}".format(global_config.local_rank))) + layer.train() + return layer + + custom_dec_layer_list = [] + fairseq_dec_layer_list = [] + _initial_dec_weights_list = [] + _initial_dec_biases_list = [] + _initial_encdec_attn_kvw_list = [] + _initial_encdec_attn_kvb_list = [] + + for _ in range(global_config.num_layers): + fairseq_dec_layer = gen_fs_dec_layer() + initial_dec_weights, initial_dec_biases = get_fairseq_dec_params(fairseq_dec_layer) + fairseq_dec_layer_list.append(fairseq_dec_layer) + _initial_dec_weights_list.append(initial_dec_weights) + _initial_dec_biases_list.append(initial_dec_biases) + _initial_encdec_attn_kvw_list.append(initial_dec_weights[6]) + _initial_encdec_attn_kvw_list.append(initial_dec_weights[7]) + _initial_encdec_attn_kvb_list.append(initial_dec_biases[6]) + _initial_encdec_attn_kvb_list.append(initial_dec_biases[7]) + + _initial_encdec_attn_kvw = torch.cat(_initial_encdec_attn_kvw_list, dim=0) + _initial_encdec_attn_kvb = 
torch.cat(_initial_encdec_attn_kvb_list, dim=0) + for i in range(global_config.num_layers): + _initial_dec_weights_list[i].pop(7) + _initial_dec_weights_list[i].pop(6) + if i == 0: + _initial_dec_weights_list[i].append(_initial_encdec_attn_kvw) + _initial_dec_biases_list[i].pop(7) + _initial_dec_biases_list[i].pop(6) + if i == 0: + _initial_dec_biases_list[i].append(_initial_encdec_attn_kvb) + custom_dec_layer = gen_ls_dec_layer( + _initial_dec_weights_list[i], _initial_dec_biases_list[i] + ) + custom_dec_layer_list.append(custom_dec_layer) + + return torch.nn.ModuleList(custom_dec_layer_list), torch.nn.ModuleList(fairseq_dec_layer_list) + + +###################### embedding layer ###################### +def gen_emb_layer(global_config): + def gen_ls_emb_layer(initial_embedding=None): + config = LSTransformerEmbeddingLayer.get_config( + vocab_size=global_config.vocab_size, + embedding_dim=global_config.hidden_size, + max_batch_tokens=global_config.max_batch_tokens, + max_seq_len=global_config.max_seq_len, + padding_idx=global_config.padding_idx, + dropout=global_config.hidden_dropout_ratio, + fp16=global_config.fp16, + local_rank=global_config.local_rank, + ) + layer = LSTransformerEmbeddingLayer( + config, + initial_embedding + ) + layer.to(torch.device("cuda:{}".format(global_config.local_rank))) + layer.train() + return layer + + def gen_fs_emb_layer(): + layer = FSTransformerEmbeddingLayer( + vocab_size=global_config.vocab_size, + embedding_dim=global_config.hidden_size, + max_seq_len=global_config.max_seq_len, + padding_idx=global_config.padding_idx, + dropout=global_config.hidden_dropout_ratio, + fp16=global_config.fp16 + ) + layer.to(torch.device("cuda:{}".format(global_config.local_rank))) + layer.train() + return layer + + fairseq_emb_layer = gen_fs_emb_layer() + initial_embedding = fairseq_emb_layer.embeddings.weight.detach().clone() + custom_emb_layer = gen_ls_emb_layer(initial_embedding) + + return custom_emb_layer, fairseq_emb_layer + + +###################### cross entropy layer ###################### +def gen_ce_layer(global_config): + def gen_ls_ce_layer(): + config = LSCrossEntropyLayer.get_config( + max_batch_tokens=global_config.max_batch_tokens, + padding_idx=global_config.padding_idx, + epsilon=global_config.label_smooth, + fp16=global_config.fp16, + local_rank=global_config.local_rank, + ) + layer = LSCrossEntropyLayer(config) + layer.to(torch.device("cuda:{}".format(global_config.local_rank))) + layer.train() + return layer + + def gen_fs_ce_layer(): + layer = FSCrossEntropyLayer( + epsilon=global_config.label_smooth, + ignore_index=global_config.padding_idx, + ) + layer.to(torch.device("cuda:{}".format(global_config.local_rank))) + layer.train() + return layer + + fairseq_ce_layer = gen_fs_ce_layer() + custom_ce_layer = gen_ls_ce_layer() + + return custom_ce_layer, fairseq_ce_layer diff --git a/tests/test_ls_ops.py b/tests/test_ls_ops.py index 36233848..301b57ac 100644 --- a/tests/test_ls_ops.py +++ b/tests/test_ls_ops.py @@ -1,445 +1,101 @@ import random -import math -from copy import deepcopy -from dataclasses import dataclass import torch -import torch.nn as nn - +from torch.nn.functional import nll_loss from tests.util import ( TestDecorator, - get_fairseq_enc_params, - get_fairseq_dec_params, - max_batch_tokens, - max_seq_len, + global_config, split_custom_layer_grad, copy_grad_from_paras, ) - -from tests import fairseq_layers -from lightseq.training.ops.pytorch.transformer_encoder_layer import ( - LSTransformerEncoderLayer, -) -from 
lightseq.training.ops.pytorch.transformer_embedding_layer import ( - LSTransformerEmbeddingLayer, -) -from lightseq.training.ops.pytorch.cross_entropy_layer import LSCrossEntropyLayer -from examples.training.fairseq.fs_modules.ls_fs_transformer_decoder_layer import ( - LSFSTransformerDecoderLayer, +from tests.gen_test_layers import ( + gen_enc_layer, + gen_dec_layer, + gen_emb_layer, + gen_ce_layer, ) -kt = TestDecorator() - -num_layers = 1 - -###################### encoding layer ###################### - - -def generate_enc_layer(initial_weights=None, initial_biases=None): - config = LSTransformerEncoderLayer.get_config( - max_batch_tokens=max_batch_tokens, - max_seq_len=max_seq_len, - hidden_size=1024, - intermediate_size=4096, - nhead=16, - attn_prob_dropout_ratio=0.0, - activation_dropout_ratio=0.0, - hidden_dropout_ratio=0.0, - pre_layer_norm=True, - fp16=True, - local_rank=0, - activation_fn="relu", - ) - layer = LSTransformerEncoderLayer(config, initial_weights, initial_biases) - layer.to(torch.device("cuda:0"), dtype=torch.half) - return layer - - -custom_enc_layer_list = [] -fairseq_enc_layer_list = [] - - -def gen_enc_layer_pair(): - fairseq_enc_layer = fairseq_layers.generate_enc_layer() - fairseq_enc_layer.train() - initial_enc_weights, initial_enc_biases = get_fairseq_enc_params(fairseq_enc_layer) - custom_enc_layer = generate_enc_layer(initial_enc_weights, initial_enc_biases) - custom_enc_layer.train() - return fairseq_enc_layer, custom_enc_layer - - -for _ in range(num_layers): - fairseq_enc_layer, custom_enc_layer = gen_enc_layer_pair() - custom_enc_layer_list.append(custom_enc_layer) - fairseq_enc_layer_list.append(fairseq_enc_layer) - - -###################### bert encoder layer ###################### - - -def get_test_bert_encoder(num_layers): - def ls_generate_bert_enc_layer(initial_weights=None, initial_biases=None): - config = LSTransformerEncoderLayer.get_config( - max_batch_tokens=max_batch_tokens, - max_seq_len=max_seq_len, - hidden_size=1024, - intermediate_size=4096, - nhead=16, - attn_prob_dropout_ratio=0.0, - activation_dropout_ratio=0.0, - hidden_dropout_ratio=0.0, - pre_layer_norm=False, - fp16=True, - local_rank=0, - activation_fn="gelu", - ) - layer = LSTransformerEncoderLayer(config, initial_weights, initial_biases) - layer.to(torch.device("cuda:0")) - return layer - - def gen_bert_enc_layer_pair(): - fairseq_enc_layer = fairseq_layers.generate_bert_enc_layer() - fairseq_enc_layer.train() - initial_enc_weights, initial_enc_biases = get_fairseq_enc_params( - fairseq_enc_layer - ) - custom_enc_layer = ls_generate_bert_enc_layer( - initial_enc_weights, initial_enc_biases - ) - custom_enc_layer.train() - return fairseq_enc_layer, custom_enc_layer - - custom_bert_enc_layer_list = [] - fairseq_bert_enc_layer_list = [] - for _ in range(num_layers): - fairseq_enc_layer, custom_enc_layer = gen_bert_enc_layer_pair() - custom_bert_enc_layer_list.append(custom_enc_layer) - fairseq_bert_enc_layer_list.append(fairseq_enc_layer) - - return torch.nn.ModuleList(custom_bert_enc_layer_list), torch.nn.ModuleList( - fairseq_bert_enc_layer_list - ) - - -###################### decoding layer ###################### - - -def generate_dec_layer(initial_weights=None, initial_biases=None): - config = LSFSTransformerDecoderLayer.get_config( - max_batch_tokens=max_batch_tokens, - max_seq_len=max_seq_len, - hidden_size=1024, - intermediate_size=4096, - nhead=16, - attn_prob_dropout_ratio=0.0, - activation_dropout_ratio=0.0, - hidden_dropout_ratio=0.0, - pre_layer_norm=True, - fp16=True, 
- local_rank=0, - nlayer=num_layers, - activation_fn="relu", - ) - layer = LSFSTransformerDecoderLayer( - config, - initial_weights, - initial_biases, - ) - layer.to(torch.device("cuda:0"), dtype=torch.half) - return layer - - -custom_dec_layer_list = [] -fairseq_dec_layer_list = [] -_initial_dec_weights_list = [] -_initial_dec_biases_list = [] -_initial_encdec_attn_kvw_list = [] -_initial_encdec_attn_kvb_list = [] - -for _ in range(num_layers): - fairseq_dec_layer = fairseq_layers.generate_dec_layer() - fairseq_dec_layer.train() - initial_dec_weights, initial_dec_biases = get_fairseq_dec_params(fairseq_dec_layer) - fairseq_dec_layer_list.append(fairseq_dec_layer) - _initial_dec_weights_list.append(initial_dec_weights) - _initial_dec_biases_list.append(initial_dec_biases) - _initial_encdec_attn_kvw_list.append(initial_dec_weights[6]) - _initial_encdec_attn_kvw_list.append(initial_dec_weights[7]) - _initial_encdec_attn_kvb_list.append(initial_dec_biases[6]) - _initial_encdec_attn_kvb_list.append(initial_dec_biases[7]) - -_initial_encdec_attn_kvw = torch.cat(_initial_encdec_attn_kvw_list, dim=0) -_initial_encdec_attn_kvb = torch.cat(_initial_encdec_attn_kvb_list, dim=0) -for i in range(num_layers): - _initial_dec_weights_list[i].pop(7) - _initial_dec_weights_list[i].pop(6) - if i == 0: - _initial_dec_weights_list[i].append(_initial_encdec_attn_kvw) - _initial_dec_biases_list[i].pop(7) - _initial_dec_biases_list[i].pop(6) - if i == 0: - _initial_dec_biases_list[i].append(_initial_encdec_attn_kvb) - custom_dec_layer = generate_dec_layer( - _initial_dec_weights_list[i], _initial_dec_biases_list[i] - ) - custom_dec_layer.train() - custom_dec_layer_list.append(custom_dec_layer) - -# ###################### embedding layer ###################### - -ls_emb_config_fp16 = LSTransformerEmbeddingLayer.get_config( - vocab_size=40480, - embedding_dim=1024, - max_batch_tokens=9216, - max_seq_len=256, - padding_idx=2, - dropout=0.0, - fp16=True, - local_rank=0, -) -ls_emb_config_fp32 = deepcopy(ls_emb_config_fp16) -ls_emb_config_fp32.fp16 = False - -fs_emb_layer_fp32 = fairseq_layers.generate_emb_layer(ls_emb_config_fp32) -fs_emb_layer_fp16 = fairseq_layers.generate_emb_layer(ls_emb_config_fp16) -fs_emb_layer_fp32.train() -fs_emb_layer_fp16.train() - - -def generate_emb_layer(config, initial_weights=None): - custom_layer = LSTransformerEmbeddingLayer(config, initial_weights) - dtype = torch.float16 if config.fp16 else torch.float32 - custom_layer.to(torch.device("cuda:0"), dtype=dtype) - return custom_layer - - -custom_emb_layer_fp32 = generate_emb_layer( - ls_emb_config_fp32, fs_emb_layer_fp32.embeddings.weight.detach().clone() -) -custom_emb_layer_fp16 = generate_emb_layer( - ls_emb_config_fp16, fs_emb_layer_fp16.embeddings.weight.detach().clone() -) -custom_emb_layer_fp32.train() -custom_emb_layer_fp16.train() - -###################### cross entropy layer ###################### - -ce_config_fp16 = LSCrossEntropyLayer.get_config( - max_batch_tokens=9216, - padding_idx=2, - epsilon=0.1, - fp16=True, - local_rank=0, -) -ce_config_fp32 = deepcopy(ce_config_fp16) -ce_config_fp32.fp16 = False - - -def generate_cross_entropy_layer(config): - dtype = torch.float16 if config.fp16 else torch.float32 - custom_layer = LSCrossEntropyLayer(config) - custom_layer.to(torch.device("cuda:0"), dtype=dtype) - return custom_layer +kt = TestDecorator() -custom_cross_entropy_layer_fp32 = generate_cross_entropy_layer(ce_config_fp32) -custom_cross_entropy_layer_fp16 = generate_cross_entropy_layer(ce_config_fp16) 
-custom_cross_entropy_layer_fp32.train() -custom_cross_entropy_layer_fp16.train() +custom_enc_layers, fairseq_enc_layers = gen_enc_layer(global_config) +custom_dec_layers, fairseq_dec_layers = gen_dec_layer(global_config) +custom_emb_layer, fairseq_emb_layer = gen_emb_layer(global_config) +custom_ce_layer, fairseq_ce_layer = gen_ce_layer(global_config) -@kt.case(dtypes=[torch.half], rtol=1e-3, atol=1e-2, ntest=10) +@kt.case(rtol=1e-3, atol=1e-2, ntest=10) def test_encoder_layer_forward(): batch_size, seq_len = kt.bs_sl() print(f"(batch_size, seq_len): ({batch_size}, {seq_len})") - hidden_states = kt.rand((batch_size, seq_len, 1024)) - self_attn_padding_mask = kt.attn_mask(batch_size, seq_len, dtype=torch.bool) - - def custom(): - res = hidden_states.clone() - for i in range(num_layers): - res = custom_enc_layer_list[i](res, self_attn_padding_mask) - return [ - res.contiguous().detach(), - ] - - def baseline(): - res = hidden_states.transpose(0, 1).contiguous().clone() - for i in range(num_layers): - res = fairseq_enc_layer_list[i](res, self_attn_padding_mask) - return [ - res.transpose(0, 1).contiguous().detach(), - ] - - return custom, baseline - - -@kt.case(dtypes=[torch.half], rtol=1e-2, atol=1e-2, ntest=10) -def test_encoder_layer_backward(): - batch_size, seq_len = kt.bs_sl() - print(f"(batch_size, seq_len): ({batch_size}, {seq_len})") - hidden_size = 1024 - shs = hidden_size * hidden_size - + hidden_size = global_config.hidden_size hidden_states = kt.rand((batch_size, seq_len, hidden_size)) self_attn_padding_mask = kt.attn_mask(batch_size, seq_len, dtype=torch.bool) - loss_data = torch.randn(1, dtype=hidden_states.dtype).sum() - def custom(): - for i in range(num_layers): - custom_enc_layer_list[i].zero_grad() - res = hidden_states.clone() - for i in range(num_layers): - res = custom_enc_layer_list[i](res, self_attn_padding_mask) - custom_loss = (res / 1000).sum() - custom_loss.data.copy_(loss_data) - custom_loss.backward() - grad_list = [] - for i in range(num_layers - 1, -1, -1): - """ - attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, - inter_w, inter_b, output_w, output_b, ffn_nw, ffn_nb - """ - grads = split_custom_layer_grad(custom_enc_layer_list[i]) - grad_list.extend( - [ - grads[8], - grads[9], - grads[6], - grads[7], - grads[10], - grads[11], - grads[2], - grads[3], - grads[0][:shs], - grads[1][:hidden_size], - grads[0][shs : shs * 2], - grads[1][hidden_size : hidden_size * 2], - grads[0][shs * 2 : shs * 3], - grads[1][hidden_size * 2 : hidden_size * 3], - grads[4], - grads[5], - ] - ) - return grad_list - - def baseline(): - for i in range(num_layers): - fairseq_enc_layer_list[i].zero_grad() - res = hidden_states.transpose(0, 1).clone() - for i in range(num_layers): - res = fairseq_enc_layer_list[i](res, self_attn_padding_mask) - fairseq_loss = (res / 1000).sum() - fairseq_loss.data.copy_(loss_data) - fairseq_loss.backward() - grad_list = [] - for i in range(num_layers - 1, -1, -1): - curl = fairseq_enc_layer_list[i] - cur_grads = copy_grad_from_paras( - [ - curl.fc2.weight, - curl.fc2.bias, - curl.fc1.weight, - curl.fc1.bias, - curl.final_layer_norm.weight, - curl.final_layer_norm.bias, - curl.self_attn.out_proj.weight, - curl.self_attn.out_proj.bias, - curl.self_attn.q_proj.weight, - curl.self_attn.q_proj.bias, - curl.self_attn.k_proj.weight, - curl.self_attn.k_proj.bias, - curl.self_attn.v_proj.weight, - curl.self_attn.v_proj.bias, - curl.self_attn_layer_norm.weight, - curl.self_attn_layer_norm.bias, - ] - ) - grad_list.extend(cur_grads) - return grad_list - - 
return custom, baseline - - -@kt.case(dtypes=[torch.float, torch.half], rtol=1e-3, atol=1e-2, ntest=10) -def test_bert_encoder_layer_forward(): - batch_size, seq_len = kt.bs_sl() - print(f"(batch_size, seq_len): ({batch_size}, {seq_len})") - - hidden_states = kt.rand((batch_size, seq_len, 1024)) - self_attn_padding_mask = kt.attn_mask(batch_size, seq_len, dtype=torch.bool) - num_layers = 1 - - custom_bert_enc_layer_list, fairseq_bert_enc_layer_list = get_test_bert_encoder( - num_layers - ) - - custom_bert_enc_layer_list = custom_bert_enc_layer_list.to(kt.dtype) - fairseq_bert_enc_layer_list = fairseq_bert_enc_layer_list.to(kt.dtype) + custom_enc_layers.to(kt.dtype) + fairseq_enc_layers.to(kt.dtype) def custom(): res = hidden_states.clone() - for i in range(num_layers): - res = custom_bert_enc_layer_list[i](res, self_attn_padding_mask) + for i in range(global_config.num_layers): + res = custom_enc_layers[i](res, self_attn_padding_mask) return [ res.contiguous().detach(), ] def baseline(): res = hidden_states.transpose(0, 1).contiguous().clone() - for i in range(num_layers): - res = fairseq_bert_enc_layer_list[i]( - res, self_attn_padding_mask=self_attn_padding_mask - )[0] + for i in range(global_config.num_layers): + res = fairseq_enc_layers[i](res, self_attn_padding_mask) return [ res.transpose(0, 1).contiguous().detach(), ] - del custom_bert_enc_layer_list, fairseq_bert_enc_layer_list return custom, baseline -@kt.case(dtypes=[torch.float, torch.half], rtol=1e-2, atol=1e-2, ntest=10) -def test_bert_encoder_layer_backward(): +@kt.case(rtol=1e-2, atol=1e-2, ntest=10) +def test_encoder_layer_backward(): batch_size, seq_len = kt.bs_sl() print(f"(batch_size, seq_len): ({batch_size}, {seq_len})") - hidden_size = 1024 - shs = hidden_size * hidden_size + hidden_size = global_config.hidden_size + shs = hidden_size * hidden_size hidden_states = kt.rand((batch_size, seq_len, hidden_size)) self_attn_padding_mask = kt.attn_mask(batch_size, seq_len, dtype=torch.bool) + loss_data = torch.randn(1, dtype=hidden_states.dtype).sum() - num_layers = 1 - custom_bert_enc_layer_list, fairseq_bert_enc_layer_list = get_test_bert_encoder( - num_layers - ) - custom_bert_enc_layer_list = custom_bert_enc_layer_list.to(kt.dtype).train() - fairseq_bert_enc_layer_list = fairseq_bert_enc_layer_list.to(kt.dtype).train() - - cus_x = hidden_states.clone() - for i in range(num_layers): - cus_x = custom_bert_enc_layer_list[i](cus_x, self_attn_padding_mask) - custom_loss = (cus_x / 1000).sum() - - base_x = hidden_states.transpose(0, 1).clone() - for i in range(num_layers): - base_x = fairseq_bert_enc_layer_list[i]( - base_x, self_attn_padding_mask=self_attn_padding_mask - )[0] - fairseq_loss = (base_x.transpose(0, 1) / 1000).sum() + # custom fw + custom_enc_layers.to(kt.dtype) + custom_enc_layers.zero_grad() + res = hidden_states.clone() + for i in range(global_config.num_layers): + res = custom_enc_layers[i](res, self_attn_padding_mask) + custom_loss = (res / 1000).sum() + custom_loss.data.copy_(loss_data) + + # fairseq fw + fairseq_enc_layers.to(kt.dtype) + fairseq_enc_layers.zero_grad() + res = hidden_states.transpose(0, 1).clone() + for i in range(global_config.num_layers): + res = fairseq_enc_layers[i](res, self_attn_padding_mask) + fairseq_loss = (res / 1000).sum() + fairseq_loss.data.copy_(loss_data) def custom(): - custom_bert_enc_layer_list.zero_grad() + custom_enc_layers.zero_grad() custom_loss.backward(retain_graph=True) + grad_list = [] - for i in range(num_layers - 1, -1, -1): + for i in 
range(global_config.num_layers - 1, -1, -1): """ attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w, inter_b, output_w, output_b, ffn_nw, ffn_nb """ - grads = split_custom_layer_grad(custom_bert_enc_layer_list[i]) + grads = split_custom_layer_grad(custom_enc_layers[i]) grad_list.extend( [ grads[8], @@ -463,11 +119,12 @@ def custom(): return grad_list def baseline(): - fairseq_bert_enc_layer_list.zero_grad() + fairseq_enc_layers.zero_grad() fairseq_loss.backward(retain_graph=True) + grad_list = [] - for i in range(num_layers - 1, -1, -1): - curl = fairseq_bert_enc_layer_list[i] + for i in range(global_config.num_layers - 1, -1, -1): + curl = fairseq_enc_layers[i] cur_grads = copy_grad_from_paras( [ curl.fc2.weight, @@ -491,11 +148,10 @@ def baseline(): grad_list.extend(cur_grads) return grad_list - del custom_bert_enc_layer_list, fairseq_bert_enc_layer_list return custom, baseline -@kt.case(dtypes=[torch.half], rtol=1e-3, atol=1e-2, ntest=10) +@kt.case(rtol=1e-3, atol=1e-2, ntest=10) def test_decoder_layer_forward(): batch_size, enc_seq_len = kt.bs_sl() _, dec_seq_len = kt.bs_sl(batch_size) @@ -503,16 +159,20 @@ def test_decoder_layer_forward(): f"(batch_size, enc_seq_len, dec_seq_len): ({batch_size}, {enc_seq_len}, {dec_seq_len})" ) - hidden_states = kt.rand((batch_size, dec_seq_len, 1024)) - encoder_out = kt.rand((enc_seq_len, batch_size, 1024)) + hidden_size = global_config.hidden_size + hidden_states = kt.rand((batch_size, dec_seq_len, hidden_size)) + encoder_out = kt.rand((enc_seq_len, batch_size, hidden_size)) incremental_state = None encoder_padding_mask = kt.attn_mask(batch_size, enc_seq_len, dtype=torch.bool) self_attn_mask = kt.dec_self_attn_mask(dec_seq_len) * -1e8 + custom_dec_layers.to(kt.dtype) + fairseq_dec_layers.to(kt.dtype) + def custom(): res = hidden_states.clone() - for i in range(num_layers): - res, _, _ = custom_dec_layer_list[i]( + for i in range(global_config.num_layers): + res, _, _ = custom_dec_layers[i]( res, encoder_out=encoder_out, encoder_padding_mask=encoder_padding_mask, @@ -524,8 +184,8 @@ def custom(): def baseline(): res = hidden_states.transpose(0, 1).clone() - for i in range(num_layers): - res, _, _ = fairseq_dec_layer_list[i]( + for i in range(global_config.num_layers): + res, _, _ = fairseq_dec_layers[i]( res, encoder_out=encoder_out, encoder_padding_mask=encoder_padding_mask, @@ -539,14 +199,15 @@ def baseline(): return custom, baseline -@kt.case(dtypes=[torch.half], rtol=1e-2, atol=1e-2, ntest=10) +@kt.case(rtol=1e-2, atol=1e-2, ntest=10) def test_decoder_layer_backward(): batch_size, enc_seq_len = kt.bs_sl() _, dec_seq_len = kt.bs_sl(batch_size) print( f"(batch_size, enc_seq_len, dec_seq_len): ({batch_size}, {enc_seq_len}, {dec_seq_len})" ) - hidden_size = 1024 + + hidden_size = global_config.hidden_size shs = hidden_size * hidden_size hidden_states = kt.rand((batch_size, dec_seq_len, hidden_size)) encoder_out = kt.rand((enc_seq_len, batch_size, hidden_size)) @@ -555,29 +216,46 @@ def test_decoder_layer_backward(): self_attn_mask = kt.dec_self_attn_mask(dec_seq_len) * -1e8 loss_data = torch.randn(1, dtype=hidden_states.dtype).sum() + custom_dec_layers.to(kt.dtype) + custom_dec_layers.zero_grad() + res = hidden_states.clone() + for i in range(global_config.num_layers): + res, _, _ = custom_dec_layers[i]( + res, + encoder_out=encoder_out, + encoder_padding_mask=encoder_padding_mask, + incremental_state=incremental_state, + ) + custom_loss = (res / 1000).sum() + custom_loss.data.copy_(loss_data) + + fairseq_dec_layers.to(kt.dtype) + 
fairseq_dec_layers.zero_grad() + res = hidden_states.transpose(0, 1).clone() + for i in range(global_config.num_layers): + res, _, _ = fairseq_dec_layers[i]( + res, + encoder_out=encoder_out, + encoder_padding_mask=encoder_padding_mask, + self_attn_mask=self_attn_mask, + incremental_state=incremental_state, + ) + fairseq_loss = (res / 1000).sum() + fairseq_loss.data.copy_(loss_data) + def custom(): - for i in range(num_layers): - custom_dec_layer_list[i].zero_grad() - res = hidden_states.clone() - for i in range(num_layers): - res, _, _ = custom_dec_layer_list[i]( - res, - encoder_out=encoder_out, - encoder_padding_mask=encoder_padding_mask, - incremental_state=incremental_state, - ) - custom_loss = (res / 1000).sum() - custom_loss.data.copy_(loss_data) - custom_loss.backward() + custom_dec_layers.zero_grad() + custom_loss.backward(retain_graph=True) + grad_list = [] - for i in range(num_layers - 1, -1, -1): + for i in range(global_config.num_layers - 1, -1, -1): """ 0 attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, 6 encdec_attn_qw, encdec_attn_qb, encdec_attn_ow, encdec_attn_ob, encdec_attn_nw, encdec_attn_nb, 12 inter_w, inter_b, output_w, output_b, ffn_nw, ffn_nb 18 encdec_attn_kvw, encdec_attn_kvb, """ - grads = split_custom_layer_grad(custom_dec_layer_list[i]) + grads = split_custom_layer_grad(custom_dec_layers[i]) grad_list.extend( [ grads[14], @@ -618,127 +296,76 @@ def custom(): return grad_list def baseline(): - for i in range(num_layers): - fairseq_dec_layer_list[i].zero_grad() - res = hidden_states.transpose(0, 1).clone() - for i in range(num_layers): - res, _, _ = fairseq_dec_layer_list[i]( - res, - encoder_out=encoder_out, - encoder_padding_mask=encoder_padding_mask, - self_attn_mask=self_attn_mask, - incremental_state=incremental_state, - ) - fairseq_loss = (res / 1000).sum() - fairseq_loss.data.copy_(loss_data) - fairseq_loss.backward() + fairseq_dec_layers.zero_grad() + fairseq_loss.backward(retain_graph=True) + grad_list = [] - for i in range(num_layers - 1, -1, -1): - grad_list.extend( + for i in range(global_config.num_layers - 1, -1, -1): + curl = fairseq_dec_layers[i] + cur_grads = copy_grad_from_paras( [ - fairseq_dec_layer_list[i].fc2.weight.grad.contiguous().detach(), - fairseq_dec_layer_list[i].fc2.bias.grad.contiguous().detach(), - fairseq_dec_layer_list[i].fc1.weight.grad.contiguous().detach(), - fairseq_dec_layer_list[i].fc1.bias.grad.contiguous().detach(), - fairseq_dec_layer_list[i] - .final_layer_norm.weight.grad.contiguous() - .detach(), - fairseq_dec_layer_list[i] - .final_layer_norm.bias.grad.contiguous() - .detach(), - fairseq_dec_layer_list[i] - .self_attn.out_proj.weight.grad.contiguous() - .detach(), - fairseq_dec_layer_list[i] - .self_attn.out_proj.bias.grad.contiguous() - .detach(), - fairseq_dec_layer_list[i] - .self_attn.q_proj.weight.grad.contiguous() - .detach(), - fairseq_dec_layer_list[i] - .self_attn.q_proj.bias.grad.contiguous() - .detach(), - fairseq_dec_layer_list[i] - .self_attn.k_proj.weight.grad.contiguous() - .detach(), - fairseq_dec_layer_list[i] - .self_attn.k_proj.bias.grad.contiguous() - .detach(), - fairseq_dec_layer_list[i] - .self_attn.v_proj.weight.grad.contiguous() - .detach(), - fairseq_dec_layer_list[i] - .self_attn.v_proj.bias.grad.contiguous() - .detach(), - fairseq_dec_layer_list[i] - .self_attn_layer_norm.weight.grad.contiguous() - .detach(), - fairseq_dec_layer_list[i] - .self_attn_layer_norm.bias.grad.contiguous() - .detach(), - # encdec weights grad - fairseq_dec_layer_list[i] - 
.encodec_attn.q_proj.weight.grad.contiguous() - .detach(), - fairseq_dec_layer_list[i] - .encodec_attn.q_proj.bias.grad.contiguous() - .detach(), - fairseq_dec_layer_list[i] - .encodec_attn.out_proj.weight.grad.contiguous() - .detach(), - fairseq_dec_layer_list[i] - .encodec_attn.out_proj.bias.grad.contiguous() - .detach(), - fairseq_dec_layer_list[i] - .encodec_attn_layer_norm.weight.grad.contiguous() - .detach(), - fairseq_dec_layer_list[i] - .encodec_attn_layer_norm.bias.grad.contiguous() - .detach(), + curl.fc2.weight, + curl.fc2.bias, + curl.fc1.weight, + curl.fc1.bias, + curl.final_layer_norm.weight, + curl.final_layer_norm.bias, + curl.self_attn.out_proj.weight, + curl.self_attn.out_proj.bias, + curl.self_attn.q_proj.weight, + curl.self_attn.q_proj.bias, + curl.self_attn.k_proj.weight, + curl.self_attn.k_proj.bias, + curl.self_attn.v_proj.weight, + curl.self_attn.v_proj.bias, + curl.self_attn_layer_norm.weight, + curl.self_attn_layer_norm.bias, + curl.self_attn.q_proj.weight, + curl.self_attn.q_proj.bias, + curl.self_attn.out_proj.weight, + curl.self_attn.out_proj.bias, + curl.encodec_attn_layer_norm.weight, + curl.encodec_attn_layer_norm.bias, ] ) + grad_list.extend(cur_grads) if i == 0: - grad_list.extend( + cur_grads = copy_grad_from_paras( [ - # encdec kv grad - fairseq_dec_layer_list[i] - .encodec_attn.k_proj.weight.grad.contiguous() - .detach(), - fairseq_dec_layer_list[i] - .encodec_attn.k_proj.bias.grad.contiguous() - .detach(), - fairseq_dec_layer_list[i] - .encodec_attn.v_proj.weight.grad.contiguous() - .detach(), - fairseq_dec_layer_list[i] - .encodec_attn.v_proj.bias.grad.contiguous() - .detach(), + curl.encodec_attn.k_proj.weight, + curl.encodec_attn.k_proj.bias, + curl.encodec_attn.v_proj.weight, + curl.encodec_attn.v_proj.bias, ] ) + grad_list.extend(cur_grads) return grad_list return custom, baseline -@kt.case(dtypes=[torch.half], rtol=1e-3, atol=1e-2, ntest=10, nrepeat=1) +@kt.case(rtol=1e-3, atol=1e-2, ntest=10) def test_decoder_layer_forward_inference(): batch_size, enc_seq_len = kt.bs_sl() print(f"(batch_size, enc_seq_len): ({batch_size}, {enc_seq_len})") - # beam_size = random.randint(2,5) - # print(f"(batch_size, enc_seq_len, beam_size): ({batch_size}, {enc_seq_len}, {beam_size})") - # ls_encoder_out = kt.rand((batch_size, enc_seq_len, 1024)) - # fs_encoder_out = ls_encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).reshape(-1, enc_seq_len, 1024) + hidden_size = global_config.hidden_size + + # beam_size = random.randint(2, 5) + # print(f"(batch_size, enc_seq_len, beam_size): ({batch_size}, {enc_seq_len}, {beam_size})") + # ls_encoder_out = kt.rand((batch_size, enc_seq_len, hidden_size)) + # fs_encoder_out = ls_encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).reshape(-1, enc_seq_len, hidden_size) # ls_enc_mask = kt.attn_mask(batch_size, enc_seq_len, dtype=torch.bool) # fs_enc_mask = ls_enc_mask.unsqueeze(1).repeat(1, beam_size, 1).reshape(-1, enc_seq_len) - encoder_out = kt.rand((enc_seq_len, batch_size, 1024)) + + encoder_out = kt.rand((enc_seq_len, batch_size, hidden_size)) encoder_padding_mask = kt.attn_mask(batch_size, enc_seq_len, dtype=torch.bool) hidden_states_list = [] max_step = 10 for i in range(max_step): - # hidden_states = kt.rand((batch_size * beam_size, 1, 1024)) - hidden_states = kt.rand((batch_size, 1, 1024)) + # hidden_states = kt.rand((batch_size*beam_size, 1, hidden_size)) + hidden_states = kt.rand((batch_size, 1, hidden_size)) hidden_states_list.append(hidden_states) def custom(): @@ -746,10 +373,10 @@ def custom(): res_list = [] for i 
in range(max_step): res = hidden_states_list[i].clone() - for i in range(num_layers): - res, _, _ = custom_dec_layer_list[i]( + for i in range(global_config.num_layers): + res, _, _ = custom_dec_layers[i]( res, - # encoder_out=ls_encoder_out.transpose(0,1), + # encoder_out=ls_encoder_out.transpose(0, 1), # encoder_padding_mask=ls_enc_mask, encoder_out=encoder_out, encoder_padding_mask=encoder_padding_mask, @@ -763,8 +390,8 @@ def baseline(): res_list = [] for i in range(max_step): res = hidden_states_list[i].transpose(0, 1).clone() - for i in range(num_layers): - res, _, _ = fairseq_dec_layer_list[i]( + for i in range(global_config.num_layers): + res, _, _ = fairseq_dec_layers[i]( res, encoder_out=encoder_out, encoder_padding_mask=encoder_padding_mask, @@ -776,32 +403,29 @@ def baseline(): return custom, baseline -@kt.case(ntest=10) +@kt.case(rtol=1e-3, atol=1e-3, ntest=10) def test_embedding_layer_forward(): batch_size, seq_len = kt.bs_sl() print(f"(batch_size, seq_len): ({batch_size}, {seq_len})") padding_mask = kt.attn_mask(batch_size, seq_len, dtype=torch.int) # TODO: can not generate PAD in the middle of the sentences. - config = ls_emb_config_fp16 - input = kt.randint(config.padding_idx + 1, config.vocab_size, (batch_size, seq_len)) - input = input * (1 - padding_mask) + config.padding_idx * padding_mask + input = kt.randint( + global_config.padding_idx + 1, global_config.vocab_size, (batch_size, seq_len) + ) + input = input * (1 - padding_mask) + global_config.padding_idx * padding_mask - if kt.dtype == torch.float: - custom_layer = custom_emb_layer_fp32 - fs_layer = fs_emb_layer_fp32 - else: - custom_layer = custom_emb_layer_fp16 - fs_layer = fs_emb_layer_fp16 + custom_emb_layer.to(kt.dtype) + fairseq_emb_layer.to(kt.dtype) def custom(): - res = custom_layer(input) + res = custom_emb_layer(input) return [ res.contiguous().detach(), ] def baseline(): - x = fs_layer(input) + x = fairseq_emb_layer(input) return [ x.contiguous().detach(), ] @@ -815,64 +439,45 @@ def test_embedding_layer_backward(): print(f"(batch_size, seq_len): ({batch_size}, {seq_len})") padding_mask = kt.attn_mask(batch_size, seq_len, dtype=torch.int) - config = ls_emb_config_fp16 - input = kt.randint(config.padding_idx + 1, config.vocab_size, (batch_size, seq_len)) - input = input * (1 - padding_mask) + config.padding_idx * padding_mask + input = kt.randint( + global_config.padding_idx + 1, global_config.vocab_size, (batch_size, seq_len) + ) + input = input * (1 - padding_mask) + global_config.padding_idx * padding_mask + loss_data = torch.randn(1, dtype=kt.dtype).sum() - if kt.dtype == torch.float: - custom_layer = custom_emb_layer_fp32 - fs_layer = fs_emb_layer_fp32 - else: - custom_layer = custom_emb_layer_fp16 - fs_layer = fs_emb_layer_fp16 + custom_emb_layer.to(kt.dtype) + custom_emb_layer.zero_grad() + custom_input = input.clone() + res = custom_emb_layer(custom_input) + custom_loss = (res / 1000).sum() + custom_loss.data.copy_(loss_data) - loss_data = torch.randn(1, dtype=kt.dtype).sum() + fairseq_emb_layer.to(kt.dtype) + fairseq_emb_layer.zero_grad() + fs_input = input.clone() + res = fairseq_emb_layer(fs_input) + fs_loss = (res / 1000).sum() + fs_loss.data.copy_(loss_data) def custom(): - custom_layer.zero_grad() - custom_input = input.clone() - res = custom_layer(custom_input) - custom_loss = (res / 1000).sum() - custom_loss.data.copy_(loss_data) - custom_loss.backward() + custom_emb_layer.zero_grad() + custom_loss.backward(retain_graph=True) + return [ - 
custom_layer.embeddings.grad.contiguous().detach(), + custom_emb_layer.embeddings.grad.contiguous().detach(), ] def baseline(): - fs_layer.zero_grad() - fs_input = input.clone() - res = fs_layer(fs_input) - fs_loss = (res / 1000).sum() - fs_loss.data.copy_(loss_data) - fs_loss.backward() + fairseq_emb_layer.zero_grad() + fs_loss.backward(retain_graph=True) + return [ - fs_layer.embeddings.weight.grad.contiguous().detach(), + fairseq_emb_layer.embeddings.weight.grad.contiguous().detach(), ] return custom, baseline -def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=None, reduce=True): - if target.dim() == lprobs.dim() - 1: - target = target.unsqueeze(-1) - nll_loss = -lprobs.gather(dim=-1, index=target) - smooth_loss = -lprobs.sum(dim=-1, keepdim=True) - if ignore_index is not None: - pad_mask = target.eq(ignore_index) - nll_loss.masked_fill_(pad_mask, 0.0) - smooth_loss.masked_fill_(pad_mask, 0.0) - else: - nll_loss = nll_loss.squeeze(-1) - smooth_loss = smooth_loss.squeeze(-1) - if reduce: - nll_loss = nll_loss.sum() - smooth_loss = smooth_loss.sum() - eps_i = epsilon / (lprobs.size(-1) - 1) - loss = (1.0 - epsilon - eps_i) * nll_loss + eps_i * smooth_loss - return loss, nll_loss - - @kt.case(ntest=10) def test_cross_entropy_layer_forward(): batch_size, seq_len = kt.bs_sl() @@ -881,32 +486,24 @@ def test_cross_entropy_layer_forward(): inputs = kt.rand((batch_size, seq_len, vocab_size)) targets = kt.randint( - ce_config_fp16.padding_idx - 1, vocab_size, (batch_size, seq_len) + global_config.padding_idx - 1, vocab_size, (batch_size, seq_len) ) targets_32 = targets.to(torch.int32) - if kt.dtype == torch.float: - custom_layer = custom_cross_entropy_layer_fp32 - else: - custom_layer = custom_cross_entropy_layer_fp16 + custom_ce_layer.to(kt.dtype) + fairseq_ce_layer.to(kt.dtype) def custom(): - res, cus_nll_loss = custom_layer(inputs, targets_32) + loss, cus_nll_loss = custom_ce_layer(inputs, targets_32) return [ - res.contiguous().detach(), + loss.contiguous().detach(), cus_nll_loss.contiguous().detach(), ] def baseline(): - - x = torch.nn.functional.log_softmax(inputs, dim=-1, dtype=torch.float32) - x, base_nll_loss = label_smoothed_nll_loss( - x, targets, ce_config_fp16.epsilon, ignore_index=ce_config_fp16.padding_idx - ) - x = x.to(inputs) - base_nll_loss = base_nll_loss.to(inputs) + loss, base_nll_loss = fairseq_ce_layer(inputs, targets) return [ - x.contiguous().detach(), + loss.contiguous().detach(), base_nll_loss.contiguous().detach(), ] @@ -922,33 +519,30 @@ def test_cross_entropy_layer_backward(): base_inputs = kt.rand((batch_size, seq_len, vocab_size)).requires_grad_() cus_inputs = base_inputs.clone().detach().requires_grad_() targets = kt.randint( - ce_config_fp16.padding_idx - 1, vocab_size, (batch_size, seq_len) + global_config.padding_idx - 1, vocab_size, (batch_size, seq_len) ) targets_32 = targets.to(torch.int32) - if kt.dtype == torch.float: - custom_layer = custom_cross_entropy_layer_fp32 - else: - custom_layer = custom_cross_entropy_layer_fp16 - cus_res = custom_layer(cus_inputs, targets_32)[0].to(kt.dtype) - x = torch.nn.functional.log_softmax(base_inputs, dim=-1, dtype=torch.float32) - base_res, _ = label_smoothed_nll_loss( - x, targets, ce_config_fp16.epsilon, ignore_index=ce_config_fp16.padding_idx - ) - base_res = base_res.to(kt.dtype) + custom_ce_layer.to(kt.dtype) + custom_ce_layer.zero_grad() + custom_loss, _ = custom_ce_layer(base_inputs, targets_32) + + fairseq_ce_layer.to(kt.dtype) + fairseq_ce_layer.zero_grad() + base_loss, _ = 
fairseq_ce_layer(cus_inputs, targets) def custom(): - if cus_inputs.grad is not None: - cus_inputs.grad.zero_() - cus_res.backward(retain_graph=True) + custom_ce_layer.zero_grad() + custom_loss.backward(retain_graph=True) + return [ cus_inputs.grad.contiguous().detach(), ] def baseline(): - if base_inputs.grad is not None: - base_inputs.grad.zero_() - base_res.backward(retain_graph=True) + fairseq_ce_layer.zero_grad() + base_loss.backward(retain_graph=True) + return [ base_inputs.grad.contiguous().detach(), ] @@ -957,19 +551,19 @@ def baseline(): if __name__ == "__main__": - kt.init(device="cuda:0", nhead=16) + kt.init( + device="cuda:{}".format(global_config.local_rank), nhead=global_config.nhead + ) kt.run( [ - "test_encoder_layer_forward", - "test_encoder_layer_backward", - "test_bert_encoder_layer_forward", - "test_bert_encoder_layer_backward", - "test_decoder_layer_forward", - "test_decoder_layer_backward", - "test_decoder_layer_forward_inference", + # "test_encoder_layer_forward", + # "test_encoder_layer_backward", + # "test_decoder_layer_forward", + # "test_decoder_layer_backward", + # "test_decoder_layer_forward_inference", "test_embedding_layer_forward", - "test_embedding_layer_backward", - "test_cross_entropy_layer_forward", - "test_cross_entropy_layer_backward", + # "test_embedding_layer_backward", + # "test_cross_entropy_layer_forward", + # "test_cross_entropy_layer_backward", ] ) diff --git a/tests/util.py b/tests/util.py index 46b228fe..26b4430b 100644 --- a/tests/util.py +++ b/tests/util.py @@ -1,25 +1,50 @@ import random import time from collections import OrderedDict +from dataclasses import dataclass import numpy as np import torch -def cast_fp32_tensor(tlist): - return [ele.to(torch.float32) for ele in tlist] - - -def is_nan(x): - return x.isnan().any().item() - - -def is_inf(x): - return x.isinf().any().item() - - -max_batch_tokens = 9216 -max_seq_len = 256 +@dataclass +class Config: + max_batch_tokens: int + max_seq_len: int + vocab_size: int + padding_idx: int + hidden_size: int + intermediate_size: int + nhead: int + attn_prob_dropout_ratio: float + activation_dropout_ratio: float + hidden_dropout_ratio: float + pre_layer_norm: bool + fp16: bool + local_rank: int + activation_fn: str + num_layers: int + label_smooth: float + + +global_config = Config( + max_batch_tokens=9216, + max_seq_len=256, + vocab_size=40480, + padding_idx=0, + hidden_size=1024, + intermediate_size=1024 * 4, + nhead=16, + attn_prob_dropout_ratio=0.0, + activation_dropout_ratio=0.0, + hidden_dropout_ratio=0.0, + pre_layer_norm=True, + fp16=True, + local_rank=0, + activation_fn="relu", + num_layers=1, + label_smooth=0.1, +) class TestDecorator(object): @@ -27,8 +52,8 @@ def __init__(self): self.all_case = OrderedDict() self.dtypes = [torch.float, torch.half] self.dtype = None - self.max_batch_tokens = max_batch_tokens - self.max_seq_len = max_seq_len + self.max_batch_tokens = global_config.max_batch_tokens + self.max_seq_len = global_config.max_seq_len def init(self, device, nhead): # device: str. e.g. 
"cuda:0" @@ -183,6 +208,18 @@ def run(self, case_names=None): self.test(custom, baseline, nrepeat, rtol, atol) +def cast_fp32_tensor(tlist): + return [ele.to(torch.float32) for ele in tlist] + + +def is_nan(x): + return x.isnan().any().item() + + +def is_inf(x): + return x.isinf().any().item() + + def flat_dim(idxs, dims): assert len(idxs) == len(dims) or len(idxs) == len(dims) + 1 base = 1 @@ -209,67 +246,6 @@ def expand_dim(idx, dims): return res[::-1] -def get_fairseq_enc_params(fairseq_layer): - initial_weights = [] - initial_biases = [] - - initial_weights.append(fairseq_layer.self_attn.q_proj.weight.detach().clone()) - initial_biases.append(fairseq_layer.self_attn.q_proj.bias.detach().clone()) - initial_weights.append(fairseq_layer.self_attn.k_proj.weight.detach().clone()) - initial_biases.append(fairseq_layer.self_attn.k_proj.bias.detach().clone()) - initial_weights.append(fairseq_layer.self_attn.v_proj.weight.detach().clone()) - initial_biases.append(fairseq_layer.self_attn.v_proj.bias.detach().clone()) - initial_weights.append(fairseq_layer.self_attn.out_proj.weight.detach().clone()) - initial_biases.append(fairseq_layer.self_attn.out_proj.bias.detach().clone()) - initial_weights.append(fairseq_layer.self_attn_layer_norm.weight.detach().clone()) - initial_biases.append(fairseq_layer.self_attn_layer_norm.bias.detach().clone()) - - initial_weights.append(fairseq_layer.fc1.weight.detach().clone()) - initial_biases.append(fairseq_layer.fc1.bias.detach().clone()) - initial_weights.append(fairseq_layer.fc2.weight.detach().clone()) - initial_biases.append(fairseq_layer.fc2.bias.detach().clone()) - initial_weights.append(fairseq_layer.final_layer_norm.weight.detach().clone()) - initial_biases.append(fairseq_layer.final_layer_norm.bias.detach().clone()) - return initial_weights, initial_biases - - -def get_fairseq_dec_params(fairseq_layer): - initial_weights = [] - initial_biases = [] - - initial_weights.append(fairseq_layer.self_attn.q_proj.weight.detach().clone()) - initial_biases.append(fairseq_layer.self_attn.q_proj.bias.detach().clone()) - initial_weights.append(fairseq_layer.self_attn.k_proj.weight.detach().clone()) - initial_biases.append(fairseq_layer.self_attn.k_proj.bias.detach().clone()) - initial_weights.append(fairseq_layer.self_attn.v_proj.weight.detach().clone()) - initial_biases.append(fairseq_layer.self_attn.v_proj.bias.detach().clone()) - initial_weights.append(fairseq_layer.self_attn.out_proj.weight.detach().clone()) - initial_biases.append(fairseq_layer.self_attn.out_proj.bias.detach().clone()) - initial_weights.append(fairseq_layer.self_attn_layer_norm.weight.detach().clone()) - initial_biases.append(fairseq_layer.self_attn_layer_norm.bias.detach().clone()) - - initial_weights.append(fairseq_layer.encodec_attn.q_proj.weight.detach().clone()) - initial_biases.append(fairseq_layer.encodec_attn.q_proj.bias.detach().clone()) - initial_weights.append(fairseq_layer.encodec_attn.k_proj.weight.detach().clone()) - initial_biases.append(fairseq_layer.encodec_attn.k_proj.bias.detach().clone()) - initial_weights.append(fairseq_layer.encodec_attn.v_proj.weight.detach().clone()) - initial_biases.append(fairseq_layer.encodec_attn.v_proj.bias.detach().clone()) - initial_weights.append(fairseq_layer.encodec_attn.out_proj.weight.detach().clone()) - initial_biases.append(fairseq_layer.encodec_attn.out_proj.bias.detach().clone()) - initial_weights.append( - fairseq_layer.encodec_attn_layer_norm.weight.detach().clone() - ) - 
initial_biases.append(fairseq_layer.encodec_attn_layer_norm.bias.detach().clone()) - - initial_weights.append(fairseq_layer.fc1.weight.detach().clone()) - initial_biases.append(fairseq_layer.fc1.bias.detach().clone()) - initial_weights.append(fairseq_layer.fc2.weight.detach().clone()) - initial_biases.append(fairseq_layer.fc2.bias.detach().clone()) - initial_weights.append(fairseq_layer.final_layer_norm.weight.detach().clone()) - initial_biases.append(fairseq_layer.final_layer_norm.bias.detach().clone()) - return initial_weights, initial_biases - - def split_custom_layer_grad(layer): res = [] for i in range(1, len(layer.para_offset)): From 2eb6cf2d910ca0539b7e26a76401221bd0ac41e8 Mon Sep 17 00:00:00 2001 From: godweiyang Date: Tue, 20 Jul 2021 17:13:08 +0800 Subject: [PATCH 08/24] convert fp16 when dtype changes --- tests/test_ls_ops.py | 68 ++++++++++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/tests/test_ls_ops.py b/tests/test_ls_ops.py index 301b57ac..ec2ea42a 100644 --- a/tests/test_ls_ops.py +++ b/tests/test_ls_ops.py @@ -35,20 +35,22 @@ def test_encoder_layer_forward(): self_attn_padding_mask = kt.attn_mask(batch_size, seq_len, dtype=torch.bool) custom_enc_layers.to(kt.dtype) + for layer in custom_enc_layers: + layer.config.fp16 = (kt.dtype == torch.half) fairseq_enc_layers.to(kt.dtype) def custom(): res = hidden_states.clone() - for i in range(global_config.num_layers): - res = custom_enc_layers[i](res, self_attn_padding_mask) + for layer in custom_enc_layers: + res = layer(res, self_attn_padding_mask) return [ res.contiguous().detach(), ] def baseline(): res = hidden_states.transpose(0, 1).contiguous().clone() - for i in range(global_config.num_layers): - res = fairseq_enc_layers[i](res, self_attn_padding_mask) + for layer in fairseq_enc_layers: + res = layer(res, self_attn_padding_mask) return [ res.transpose(0, 1).contiguous().detach(), ] @@ -69,10 +71,12 @@ def test_encoder_layer_backward(): # custom fw custom_enc_layers.to(kt.dtype) + for layer in custom_enc_layers: + layer.config.fp16 = (kt.dtype == torch.half) custom_enc_layers.zero_grad() res = hidden_states.clone() - for i in range(global_config.num_layers): - res = custom_enc_layers[i](res, self_attn_padding_mask) + for layer in custom_enc_layers: + res = layer(res, self_attn_padding_mask) custom_loss = (res / 1000).sum() custom_loss.data.copy_(loss_data) @@ -80,8 +84,8 @@ def test_encoder_layer_backward(): fairseq_enc_layers.to(kt.dtype) fairseq_enc_layers.zero_grad() res = hidden_states.transpose(0, 1).clone() - for i in range(global_config.num_layers): - res = fairseq_enc_layers[i](res, self_attn_padding_mask) + for layer in fairseq_enc_layers: + res = layer(res, self_attn_padding_mask) fairseq_loss = (res / 1000).sum() fairseq_loss.data.copy_(loss_data) @@ -90,7 +94,7 @@ def custom(): custom_loss.backward(retain_graph=True) grad_list = [] - for i in range(global_config.num_layers - 1, -1, -1): + for i in range(global_config.num_layers-1, -1, -1): """ attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w, inter_b, output_w, output_b, ffn_nw, ffn_nb @@ -123,7 +127,7 @@ def baseline(): fairseq_loss.backward(retain_graph=True) grad_list = [] - for i in range(global_config.num_layers - 1, -1, -1): + for i in range(global_config.num_layers-1, -1, -1): curl = fairseq_enc_layers[i] cur_grads = copy_grad_from_paras( [ @@ -167,12 +171,14 @@ def test_decoder_layer_forward(): self_attn_mask = kt.dec_self_attn_mask(dec_seq_len) * -1e8 custom_dec_layers.to(kt.dtype) + 
for layer in custom_dec_layers: + layer.config.fp16 = (kt.dtype == torch.half) fairseq_dec_layers.to(kt.dtype) def custom(): res = hidden_states.clone() - for i in range(global_config.num_layers): - res, _, _ = custom_dec_layers[i]( + for layer in custom_dec_layers: + res, _, _ = layer( res, encoder_out=encoder_out, encoder_padding_mask=encoder_padding_mask, @@ -184,8 +190,8 @@ def custom(): def baseline(): res = hidden_states.transpose(0, 1).clone() - for i in range(global_config.num_layers): - res, _, _ = fairseq_dec_layers[i]( + for layer in fairseq_dec_layers: + res, _, _ = layer( res, encoder_out=encoder_out, encoder_padding_mask=encoder_padding_mask, @@ -217,10 +223,12 @@ def test_decoder_layer_backward(): loss_data = torch.randn(1, dtype=hidden_states.dtype).sum() custom_dec_layers.to(kt.dtype) + for layer in custom_dec_layers: + layer.config.fp16 = (kt.dtype == torch.half) custom_dec_layers.zero_grad() res = hidden_states.clone() - for i in range(global_config.num_layers): - res, _, _ = custom_dec_layers[i]( + for layer in custom_dec_layers: + res, _, _ = layer( res, encoder_out=encoder_out, encoder_padding_mask=encoder_padding_mask, @@ -232,8 +240,8 @@ def test_decoder_layer_backward(): fairseq_dec_layers.to(kt.dtype) fairseq_dec_layers.zero_grad() res = hidden_states.transpose(0, 1).clone() - for i in range(global_config.num_layers): - res, _, _ = fairseq_dec_layers[i]( + for layer in fairseq_dec_layers: + res, _, _ = layer( res, encoder_out=encoder_out, encoder_padding_mask=encoder_padding_mask, @@ -248,7 +256,7 @@ def custom(): custom_loss.backward(retain_graph=True) grad_list = [] - for i in range(global_config.num_layers - 1, -1, -1): + for i in range(global_config.num_layers-1, -1, -1): """ 0 attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, 6 encdec_attn_qw, encdec_attn_qb, encdec_attn_ow, encdec_attn_ob, encdec_attn_nw, encdec_attn_nb, @@ -300,7 +308,7 @@ def baseline(): fairseq_loss.backward(retain_graph=True) grad_list = [] - for i in range(global_config.num_layers - 1, -1, -1): + for i in range(global_config.num_layers-1, -1, -1): curl = fairseq_dec_layers[i] cur_grads = copy_grad_from_paras( [ @@ -416,6 +424,7 @@ def test_embedding_layer_forward(): input = input * (1 - padding_mask) + global_config.padding_idx * padding_mask custom_emb_layer.to(kt.dtype) + custom_emb_layer.config.fp16 = (kt.dtype == torch.half) fairseq_emb_layer.to(kt.dtype) def custom(): @@ -446,6 +455,7 @@ def test_embedding_layer_backward(): loss_data = torch.randn(1, dtype=kt.dtype).sum() custom_emb_layer.to(kt.dtype) + custom_emb_layer.config.fp16 = (kt.dtype == torch.half) custom_emb_layer.zero_grad() custom_input = input.clone() res = custom_emb_layer(custom_input) @@ -491,6 +501,7 @@ def test_cross_entropy_layer_forward(): targets_32 = targets.to(torch.int32) custom_ce_layer.to(kt.dtype) + custom_ce_layer.config.fp16 = (kt.dtype == torch.half) fairseq_ce_layer.to(kt.dtype) def custom(): @@ -524,6 +535,7 @@ def test_cross_entropy_layer_backward(): targets_32 = targets.to(torch.int32) custom_ce_layer.to(kt.dtype) + custom_ce_layer.config.fp16 = (kt.dtype == torch.half) custom_ce_layer.zero_grad() custom_loss, _ = custom_ce_layer(base_inputs, targets_32) @@ -556,14 +568,14 @@ def baseline(): ) kt.run( [ - # "test_encoder_layer_forward", - # "test_encoder_layer_backward", - # "test_decoder_layer_forward", - # "test_decoder_layer_backward", - # "test_decoder_layer_forward_inference", + "test_encoder_layer_forward", + "test_encoder_layer_backward", + "test_decoder_layer_forward", + 
"test_decoder_layer_backward", + "test_decoder_layer_forward_inference", "test_embedding_layer_forward", - # "test_embedding_layer_backward", - # "test_cross_entropy_layer_forward", - # "test_cross_entropy_layer_backward", + "test_embedding_layer_backward", + "test_cross_entropy_layer_forward", + "test_cross_entropy_layer_backward", ] ) From 5dc5740a808bf56805014a530d63df1e2827bebe Mon Sep 17 00:00:00 2001 From: godweiyang Date: Tue, 20 Jul 2021 17:14:42 +0800 Subject: [PATCH 09/24] style format --- tests/gen_test_layers.py | 33 +++++++++++++++++---------------- tests/test_ls_ops.py | 24 ++++++++++++------------ 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/tests/gen_test_layers.py b/tests/gen_test_layers.py index 40651ffd..4a2d88e2 100644 --- a/tests/gen_test_layers.py +++ b/tests/gen_test_layers.py @@ -35,11 +35,7 @@ def gen_ls_enc_layer(initial_weights=None, initial_biases=None): local_rank=global_config.local_rank, activation_fn=global_config.activation_fn, ) - layer = LSTransformerEncoderLayer( - config, - initial_weights, - initial_biases - ) + layer = LSTransformerEncoderLayer(config, initial_weights, initial_biases) layer.to(torch.device("cuda:{}".format(global_config.local_rank))) layer.train() return layer @@ -64,12 +60,16 @@ def gen_fs_enc_layer(): for _ in range(global_config.num_layers): fairseq_enc_layer = gen_fs_enc_layer() - initial_enc_weights, initial_enc_biases = get_fairseq_enc_params(fairseq_enc_layer) + initial_enc_weights, initial_enc_biases = get_fairseq_enc_params( + fairseq_enc_layer + ) custom_enc_layer = gen_ls_enc_layer(initial_enc_weights, initial_enc_biases) custom_enc_layer_list.append(custom_enc_layer) fairseq_enc_layer_list.append(fairseq_enc_layer) - - return torch.nn.ModuleList(custom_enc_layer_list), torch.nn.ModuleList(fairseq_enc_layer_list) + + return torch.nn.ModuleList(custom_enc_layer_list), torch.nn.ModuleList( + fairseq_enc_layer_list + ) ###################### decoder layer ###################### @@ -124,7 +124,9 @@ def gen_fs_dec_layer(): for _ in range(global_config.num_layers): fairseq_dec_layer = gen_fs_dec_layer() - initial_dec_weights, initial_dec_biases = get_fairseq_dec_params(fairseq_dec_layer) + initial_dec_weights, initial_dec_biases = get_fairseq_dec_params( + fairseq_dec_layer + ) fairseq_dec_layer_list.append(fairseq_dec_layer) _initial_dec_weights_list.append(initial_dec_weights) _initial_dec_biases_list.append(initial_dec_biases) @@ -148,8 +150,10 @@ def gen_fs_dec_layer(): _initial_dec_weights_list[i], _initial_dec_biases_list[i] ) custom_dec_layer_list.append(custom_dec_layer) - - return torch.nn.ModuleList(custom_dec_layer_list), torch.nn.ModuleList(fairseq_dec_layer_list) + + return torch.nn.ModuleList(custom_dec_layer_list), torch.nn.ModuleList( + fairseq_dec_layer_list + ) ###################### embedding layer ###################### @@ -165,10 +169,7 @@ def gen_ls_emb_layer(initial_embedding=None): fp16=global_config.fp16, local_rank=global_config.local_rank, ) - layer = LSTransformerEmbeddingLayer( - config, - initial_embedding - ) + layer = LSTransformerEmbeddingLayer(config, initial_embedding) layer.to(torch.device("cuda:{}".format(global_config.local_rank))) layer.train() return layer @@ -180,7 +181,7 @@ def gen_fs_emb_layer(): max_seq_len=global_config.max_seq_len, padding_idx=global_config.padding_idx, dropout=global_config.hidden_dropout_ratio, - fp16=global_config.fp16 + fp16=global_config.fp16, ) layer.to(torch.device("cuda:{}".format(global_config.local_rank))) layer.train() diff --git 
a/tests/test_ls_ops.py b/tests/test_ls_ops.py index ec2ea42a..c5fa55a3 100644 --- a/tests/test_ls_ops.py +++ b/tests/test_ls_ops.py @@ -36,7 +36,7 @@ def test_encoder_layer_forward(): custom_enc_layers.to(kt.dtype) for layer in custom_enc_layers: - layer.config.fp16 = (kt.dtype == torch.half) + layer.config.fp16 = kt.dtype == torch.half fairseq_enc_layers.to(kt.dtype) def custom(): @@ -72,7 +72,7 @@ def test_encoder_layer_backward(): # custom fw custom_enc_layers.to(kt.dtype) for layer in custom_enc_layers: - layer.config.fp16 = (kt.dtype == torch.half) + layer.config.fp16 = kt.dtype == torch.half custom_enc_layers.zero_grad() res = hidden_states.clone() for layer in custom_enc_layers: @@ -94,7 +94,7 @@ def custom(): custom_loss.backward(retain_graph=True) grad_list = [] - for i in range(global_config.num_layers-1, -1, -1): + for i in range(global_config.num_layers - 1, -1, -1): """ attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w, inter_b, output_w, output_b, ffn_nw, ffn_nb @@ -127,7 +127,7 @@ def baseline(): fairseq_loss.backward(retain_graph=True) grad_list = [] - for i in range(global_config.num_layers-1, -1, -1): + for i in range(global_config.num_layers - 1, -1, -1): curl = fairseq_enc_layers[i] cur_grads = copy_grad_from_paras( [ @@ -172,7 +172,7 @@ def test_decoder_layer_forward(): custom_dec_layers.to(kt.dtype) for layer in custom_dec_layers: - layer.config.fp16 = (kt.dtype == torch.half) + layer.config.fp16 = kt.dtype == torch.half fairseq_dec_layers.to(kt.dtype) def custom(): @@ -224,7 +224,7 @@ def test_decoder_layer_backward(): custom_dec_layers.to(kt.dtype) for layer in custom_dec_layers: - layer.config.fp16 = (kt.dtype == torch.half) + layer.config.fp16 = kt.dtype == torch.half custom_dec_layers.zero_grad() res = hidden_states.clone() for layer in custom_dec_layers: @@ -256,7 +256,7 @@ def custom(): custom_loss.backward(retain_graph=True) grad_list = [] - for i in range(global_config.num_layers-1, -1, -1): + for i in range(global_config.num_layers - 1, -1, -1): """ 0 attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, 6 encdec_attn_qw, encdec_attn_qb, encdec_attn_ow, encdec_attn_ob, encdec_attn_nw, encdec_attn_nb, @@ -308,7 +308,7 @@ def baseline(): fairseq_loss.backward(retain_graph=True) grad_list = [] - for i in range(global_config.num_layers-1, -1, -1): + for i in range(global_config.num_layers - 1, -1, -1): curl = fairseq_dec_layers[i] cur_grads = copy_grad_from_paras( [ @@ -424,7 +424,7 @@ def test_embedding_layer_forward(): input = input * (1 - padding_mask) + global_config.padding_idx * padding_mask custom_emb_layer.to(kt.dtype) - custom_emb_layer.config.fp16 = (kt.dtype == torch.half) + custom_emb_layer.config.fp16 = kt.dtype == torch.half fairseq_emb_layer.to(kt.dtype) def custom(): @@ -455,7 +455,7 @@ def test_embedding_layer_backward(): loss_data = torch.randn(1, dtype=kt.dtype).sum() custom_emb_layer.to(kt.dtype) - custom_emb_layer.config.fp16 = (kt.dtype == torch.half) + custom_emb_layer.config.fp16 = kt.dtype == torch.half custom_emb_layer.zero_grad() custom_input = input.clone() res = custom_emb_layer(custom_input) @@ -501,7 +501,7 @@ def test_cross_entropy_layer_forward(): targets_32 = targets.to(torch.int32) custom_ce_layer.to(kt.dtype) - custom_ce_layer.config.fp16 = (kt.dtype == torch.half) + custom_ce_layer.config.fp16 = kt.dtype == torch.half fairseq_ce_layer.to(kt.dtype) def custom(): @@ -535,7 +535,7 @@ def test_cross_entropy_layer_backward(): targets_32 = targets.to(torch.int32) custom_ce_layer.to(kt.dtype) - 
custom_ce_layer.config.fp16 = (kt.dtype == torch.half) + custom_ce_layer.config.fp16 = kt.dtype == torch.half custom_ce_layer.zero_grad() custom_loss, _ = custom_ce_layer(base_inputs, targets_32) From c6193d81dc2a7f34afbb109ffd71c6b89cd3f7bd Mon Sep 17 00:00:00 2001 From: godweiyang Date: Tue, 20 Jul 2021 20:35:06 +0800 Subject: [PATCH 10/24] polish unit test code --- .../ops/pytorch/transformer_decoder_layer.py | 2 +- .../pytorch/transformer_embedding_layer.py | 2 - tests/gen_test_layers.py | 212 ++++++++++-------- tests/test_ls_ops.py | 193 +++++++++------- 4 files changed, 235 insertions(+), 174 deletions(-) diff --git a/lightseq/training/ops/pytorch/transformer_decoder_layer.py b/lightseq/training/ops/pytorch/transformer_decoder_layer.py index 3687f8d1..a7972587 100644 --- a/lightseq/training/ops/pytorch/transformer_decoder_layer.py +++ b/lightseq/training/ops/pytorch/transformer_decoder_layer.py @@ -95,7 +95,7 @@ def backward(ctx, grad_output): grad_enc_out = None grad = _all_layer_grads[ctx.config.layer_id] - ctx.config = None + return (grad_input, grad_enc_out, None, grad, None, None) diff --git a/lightseq/training/ops/pytorch/transformer_embedding_layer.py b/lightseq/training/ops/pytorch/transformer_embedding_layer.py index b7a00ff3..f071fd3d 100644 --- a/lightseq/training/ops/pytorch/transformer_embedding_layer.py +++ b/lightseq/training/ops/pytorch/transformer_embedding_layer.py @@ -49,8 +49,6 @@ def backward(ctx, grad_output): grad = _all_layer_grads[ctx.config.layer_id] - ctx.config = None - return (None, None, grad, None) diff --git a/tests/gen_test_layers.py b/tests/gen_test_layers.py index 4a2d88e2..07b7edd6 100644 --- a/tests/gen_test_layers.py +++ b/tests/gen_test_layers.py @@ -10,6 +10,7 @@ ) from lightseq.training import ( LSTransformerEncoderLayer, + LSTransformerDecoderLayer, LSTransformerEmbeddingLayer, LSCrossEntropyLayer, ) @@ -19,46 +20,52 @@ ###################### encoder layer ###################### -def gen_enc_layer(global_config): +def gen_enc_layer(config): def gen_ls_enc_layer(initial_weights=None, initial_biases=None): - config = LSTransformerEncoderLayer.get_config( - max_batch_tokens=global_config.max_batch_tokens, - max_seq_len=global_config.max_seq_len, - hidden_size=global_config.hidden_size, - intermediate_size=global_config.intermediate_size, - nhead=global_config.nhead, - attn_prob_dropout_ratio=global_config.attn_prob_dropout_ratio, - activation_dropout_ratio=global_config.activation_dropout_ratio, - hidden_dropout_ratio=global_config.hidden_dropout_ratio, - pre_layer_norm=global_config.pre_layer_norm, - fp16=global_config.fp16, - local_rank=global_config.local_rank, - activation_fn=global_config.activation_fn, - ) - layer = LSTransformerEncoderLayer(config, initial_weights, initial_biases) - layer.to(torch.device("cuda:{}".format(global_config.local_rank))) + enc_config = LSTransformerEncoderLayer.get_config( + max_batch_tokens=config.max_batch_tokens, + max_seq_len=config.max_seq_len, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + nhead=config.nhead, + attn_prob_dropout_ratio=config.attn_prob_dropout_ratio, + activation_dropout_ratio=config.activation_dropout_ratio, + hidden_dropout_ratio=config.hidden_dropout_ratio, + pre_layer_norm=config.pre_layer_norm, + fp16=config.fp16, + local_rank=config.local_rank, + activation_fn=config.activation_fn, + ) + layer = LSTransformerEncoderLayer(enc_config, initial_weights, initial_biases) + layer.to( + torch.device("cuda:{}".format(config.local_rank)), + dtype=(torch.half 
if config.fp16 else torch.float), + ) layer.train() return layer def gen_fs_enc_layer(): layer = FSTransformerEncoderLayer( - embed_dim=global_config.hidden_size, - ffn_embed_dim=global_config.intermediate_size, - nhead=global_config.nhead, - dropout=global_config.hidden_dropout_ratio, - attn_dropout=global_config.attn_prob_dropout_ratio, - activation_dropout=global_config.activation_dropout_ratio, - normalize_before=global_config.pre_layer_norm, - activation_fn=global_config.activation_fn, - ) - layer.to(torch.device("cuda:{}".format(global_config.local_rank))) + embed_dim=config.hidden_size, + ffn_embed_dim=config.intermediate_size, + nhead=config.nhead, + dropout=config.hidden_dropout_ratio, + attn_dropout=config.attn_prob_dropout_ratio, + activation_dropout=config.activation_dropout_ratio, + normalize_before=config.pre_layer_norm, + activation_fn=config.activation_fn, + ) + layer.to( + torch.device("cuda:{}".format(config.local_rank)), + dtype=(torch.half if config.fp16 else torch.float), + ) layer.train() return layer custom_enc_layer_list = [] fairseq_enc_layer_list = [] - for _ in range(global_config.num_layers): + for _ in range(config.num_layers): fairseq_enc_layer = gen_fs_enc_layer() initial_enc_weights, initial_enc_biases = get_fairseq_enc_params( fairseq_enc_layer @@ -73,45 +80,51 @@ def gen_fs_enc_layer(): ###################### decoder layer ###################### -def gen_dec_layer(global_config): +def gen_dec_layer(config): def gen_ls_dec_layer(initial_weights=None, initial_biases=None): - config = LSFSTransformerDecoderLayer.get_config( - max_batch_tokens=global_config.max_batch_tokens, - max_seq_len=global_config.max_seq_len, - hidden_size=global_config.hidden_size, - intermediate_size=global_config.intermediate_size, - nhead=global_config.nhead, - attn_prob_dropout_ratio=global_config.attn_prob_dropout_ratio, - activation_dropout_ratio=global_config.activation_dropout_ratio, - hidden_dropout_ratio=global_config.hidden_dropout_ratio, - pre_layer_norm=global_config.pre_layer_norm, - fp16=global_config.fp16, - local_rank=global_config.local_rank, - nlayer=global_config.num_layers, - activation_fn=global_config.activation_fn, + dec_config = LSFSTransformerDecoderLayer.get_config( + max_batch_tokens=config.max_batch_tokens, + max_seq_len=config.max_seq_len, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + nhead=config.nhead, + attn_prob_dropout_ratio=config.attn_prob_dropout_ratio, + activation_dropout_ratio=config.activation_dropout_ratio, + hidden_dropout_ratio=config.hidden_dropout_ratio, + pre_layer_norm=config.pre_layer_norm, + fp16=config.fp16, + local_rank=config.local_rank, + nlayer=config.num_layers, + activation_fn=config.activation_fn, ) layer = LSFSTransformerDecoderLayer( - config, + dec_config, initial_weights, initial_biases, ) - layer.to(torch.device("cuda:{}".format(global_config.local_rank))) + layer.to( + torch.device("cuda:{}".format(config.local_rank)), + dtype=(torch.half if config.fp16 else torch.float), + ) layer.train() return layer def gen_fs_dec_layer(): layer = FSTransformerDecoderLayer( - embed_dim=global_config.hidden_size, - ffn_embed_dim=global_config.intermediate_size, - nhead=global_config.nhead, - encoder_embed_dim=global_config.hidden_size, - dropout=global_config.hidden_dropout_ratio, - attn_dropout=global_config.attn_prob_dropout_ratio, - activation_dropout=global_config.activation_dropout_ratio, - normalize_before=global_config.pre_layer_norm, - activation_fn=global_config.activation_fn, - ) - 
layer.to(torch.device("cuda:{}".format(global_config.local_rank))) + embed_dim=config.hidden_size, + ffn_embed_dim=config.intermediate_size, + nhead=config.nhead, + encoder_embed_dim=config.hidden_size, + dropout=config.hidden_dropout_ratio, + attn_dropout=config.attn_prob_dropout_ratio, + activation_dropout=config.activation_dropout_ratio, + normalize_before=config.pre_layer_norm, + activation_fn=config.activation_fn, + ) + layer.to( + torch.device("cuda:{}".format(config.local_rank)), + dtype=(torch.half if config.fp16 else torch.float), + ) layer.train() return layer @@ -122,7 +135,7 @@ def gen_fs_dec_layer(): _initial_encdec_attn_kvw_list = [] _initial_encdec_attn_kvb_list = [] - for _ in range(global_config.num_layers): + for _ in range(config.num_layers): fairseq_dec_layer = gen_fs_dec_layer() initial_dec_weights, initial_dec_biases = get_fairseq_dec_params( fairseq_dec_layer @@ -137,7 +150,7 @@ def gen_fs_dec_layer(): _initial_encdec_attn_kvw = torch.cat(_initial_encdec_attn_kvw_list, dim=0) _initial_encdec_attn_kvb = torch.cat(_initial_encdec_attn_kvb_list, dim=0) - for i in range(global_config.num_layers): + for i in range(config.num_layers): _initial_dec_weights_list[i].pop(7) _initial_dec_weights_list[i].pop(6) if i == 0: @@ -157,33 +170,42 @@ def gen_fs_dec_layer(): ###################### embedding layer ###################### -def gen_emb_layer(global_config): +def gen_emb_layer(config): def gen_ls_emb_layer(initial_embedding=None): - config = LSTransformerEmbeddingLayer.get_config( - vocab_size=global_config.vocab_size, - embedding_dim=global_config.hidden_size, - max_batch_tokens=global_config.max_batch_tokens, - max_seq_len=global_config.max_seq_len, - padding_idx=global_config.padding_idx, - dropout=global_config.hidden_dropout_ratio, - fp16=global_config.fp16, - local_rank=global_config.local_rank, - ) - layer = LSTransformerEmbeddingLayer(config, initial_embedding) - layer.to(torch.device("cuda:{}".format(global_config.local_rank))) + emb_config = LSTransformerEmbeddingLayer.get_config( + vocab_size=config.vocab_size, + embedding_dim=config.hidden_size, + max_batch_tokens=config.max_batch_tokens, + max_seq_len=config.max_seq_len, + padding_idx=config.padding_idx, + dropout=config.hidden_dropout_ratio, + fp16=config.fp16, + local_rank=config.local_rank, + ) + layer = LSTransformerEmbeddingLayer( + emb_config, + initial_embedding, + ) + layer.to( + torch.device("cuda:{}".format(config.local_rank)), + dtype=(torch.half if config.fp16 else torch.float), + ) layer.train() return layer def gen_fs_emb_layer(): layer = FSTransformerEmbeddingLayer( - vocab_size=global_config.vocab_size, - embedding_dim=global_config.hidden_size, - max_seq_len=global_config.max_seq_len, - padding_idx=global_config.padding_idx, - dropout=global_config.hidden_dropout_ratio, - fp16=global_config.fp16, - ) - layer.to(torch.device("cuda:{}".format(global_config.local_rank))) + vocab_size=config.vocab_size, + embedding_dim=config.hidden_size, + max_seq_len=config.max_seq_len, + padding_idx=config.padding_idx, + dropout=config.hidden_dropout_ratio, + fp16=config.fp16, + ) + layer.to( + torch.device("cuda:{}".format(config.local_rank)), + dtype=(torch.half if config.fp16 else torch.float), + ) layer.train() return layer @@ -195,26 +217,32 @@ def gen_fs_emb_layer(): ###################### cross entropy layer ###################### -def gen_ce_layer(global_config): +def gen_ce_layer(config): def gen_ls_ce_layer(): - config = LSCrossEntropyLayer.get_config( - max_batch_tokens=global_config.max_batch_tokens, 
- padding_idx=global_config.padding_idx, - epsilon=global_config.label_smooth, - fp16=global_config.fp16, - local_rank=global_config.local_rank, - ) - layer = LSCrossEntropyLayer(config) - layer.to(torch.device("cuda:{}".format(global_config.local_rank))) + ce_config = LSCrossEntropyLayer.get_config( + max_batch_tokens=config.max_batch_tokens, + padding_idx=config.padding_idx, + epsilon=config.label_smooth, + fp16=config.fp16, + local_rank=config.local_rank, + ) + layer = LSCrossEntropyLayer(ce_config) + layer.to( + torch.device("cuda:{}".format(config.local_rank)), + dtype=(torch.half if config.fp16 else torch.float), + ) layer.train() return layer def gen_fs_ce_layer(): layer = FSCrossEntropyLayer( - epsilon=global_config.label_smooth, - ignore_index=global_config.padding_idx, + epsilon=config.label_smooth, + ignore_index=config.padding_idx, + ) + layer.to( + torch.device("cuda:{}".format(config.local_rank)), + dtype=(torch.half if config.fp16 else torch.float), ) - layer.to(torch.device("cuda:{}".format(global_config.local_rank))) layer.train() return layer diff --git a/tests/test_ls_ops.py b/tests/test_ls_ops.py index c5fa55a3..f316b352 100644 --- a/tests/test_ls_ops.py +++ b/tests/test_ls_ops.py @@ -1,4 +1,5 @@ import random +from copy import deepcopy import torch from torch.nn.functional import nll_loss @@ -19,13 +20,22 @@ kt = TestDecorator() -custom_enc_layers, fairseq_enc_layers = gen_enc_layer(global_config) -custom_dec_layers, fairseq_dec_layers = gen_dec_layer(global_config) -custom_emb_layer, fairseq_emb_layer = gen_emb_layer(global_config) -custom_ce_layer, fairseq_ce_layer = gen_ce_layer(global_config) +# config_32 = deepcopy(global_config) +# config_32.fp16 = False +config_16 = deepcopy(global_config) +config_16.fp16 = True +# custom_enc_layers_32, fairseq_enc_layers_32 = gen_enc_layer(config_32) +# custom_dec_layers_32, fairseq_dec_layers_32 = gen_dec_layer(config_32) +# custom_emb_layer_32, fairseq_emb_layer_32 = gen_emb_layer(config_32) +# custom_ce_layer_32, fairseq_ce_layer_32 = gen_ce_layer(config_32) +custom_enc_layers_16, fairseq_enc_layers_16 = gen_enc_layer(config_16) +custom_dec_layers_16, fairseq_dec_layers_16 = gen_dec_layer(config_16) +custom_emb_layer_16, fairseq_emb_layer_16 = gen_emb_layer(config_16) +custom_ce_layer_16, fairseq_ce_layer_16 = gen_ce_layer(config_16) -@kt.case(rtol=1e-3, atol=1e-2, ntest=10) + +@kt.case(dtypes=[torch.half], rtol=1e-3, atol=1e-2, ntest=10) def test_encoder_layer_forward(): batch_size, seq_len = kt.bs_sl() print(f"(batch_size, seq_len): ({batch_size}, {seq_len})") @@ -34,10 +44,12 @@ def test_encoder_layer_forward(): hidden_states = kt.rand((batch_size, seq_len, hidden_size)) self_attn_padding_mask = kt.attn_mask(batch_size, seq_len, dtype=torch.bool) - custom_enc_layers.to(kt.dtype) - for layer in custom_enc_layers: - layer.config.fp16 = kt.dtype == torch.half - fairseq_enc_layers.to(kt.dtype) + # if kt.dtype == torch.float: + # custom_enc_layers = custom_enc_layers_32 + # fairseq_enc_layers = fairseq_enc_layers_32 + # else: + custom_enc_layers = custom_enc_layers_16 + fairseq_enc_layers = fairseq_enc_layers_16 def custom(): res = hidden_states.clone() @@ -58,7 +70,7 @@ def baseline(): return custom, baseline -@kt.case(rtol=1e-2, atol=1e-2, ntest=10) +@kt.case(dtypes=[torch.half], rtol=1e-2, atol=1e-2, ntest=10) def test_encoder_layer_backward(): batch_size, seq_len = kt.bs_sl() print(f"(batch_size, seq_len): ({batch_size}, {seq_len})") @@ -69,10 +81,14 @@ def test_encoder_layer_backward(): self_attn_padding_mask = 
kt.attn_mask(batch_size, seq_len, dtype=torch.bool) loss_data = torch.randn(1, dtype=hidden_states.dtype).sum() + # if kt.dtype == torch.float: + # custom_enc_layers = custom_enc_layers_32 + # fairseq_enc_layers = fairseq_enc_layers_32 + # else: + custom_enc_layers = custom_enc_layers_16 + fairseq_enc_layers = fairseq_enc_layers_16 + # custom fw - custom_enc_layers.to(kt.dtype) - for layer in custom_enc_layers: - layer.config.fp16 = kt.dtype == torch.half custom_enc_layers.zero_grad() res = hidden_states.clone() for layer in custom_enc_layers: @@ -81,7 +97,6 @@ def test_encoder_layer_backward(): custom_loss.data.copy_(loss_data) # fairseq fw - fairseq_enc_layers.to(kt.dtype) fairseq_enc_layers.zero_grad() res = hidden_states.transpose(0, 1).clone() for layer in fairseq_enc_layers: @@ -155,7 +170,7 @@ def baseline(): return custom, baseline -@kt.case(rtol=1e-3, atol=1e-2, ntest=10) +@kt.case(dtypes=[torch.half], rtol=1e-3, atol=1e-2, ntest=10) def test_decoder_layer_forward(): batch_size, enc_seq_len = kt.bs_sl() _, dec_seq_len = kt.bs_sl(batch_size) @@ -170,10 +185,12 @@ def test_decoder_layer_forward(): encoder_padding_mask = kt.attn_mask(batch_size, enc_seq_len, dtype=torch.bool) self_attn_mask = kt.dec_self_attn_mask(dec_seq_len) * -1e8 - custom_dec_layers.to(kt.dtype) - for layer in custom_dec_layers: - layer.config.fp16 = kt.dtype == torch.half - fairseq_dec_layers.to(kt.dtype) + # if kt.dtype == torch.float: + # custom_dec_layers = custom_dec_layers_32 + # fairseq_dec_layers = fairseq_dec_layers_32 + # else: + custom_dec_layers = custom_dec_layers_16 + fairseq_dec_layers = fairseq_dec_layers_16 def custom(): res = hidden_states.clone() @@ -205,7 +222,7 @@ def baseline(): return custom, baseline -@kt.case(rtol=1e-2, atol=1e-2, ntest=10) +@kt.case(dtypes=[torch.half], rtol=1e-2, atol=1e-2, ntest=10) def test_decoder_layer_backward(): batch_size, enc_seq_len = kt.bs_sl() _, dec_seq_len = kt.bs_sl(batch_size) @@ -222,38 +239,26 @@ def test_decoder_layer_backward(): self_attn_mask = kt.dec_self_attn_mask(dec_seq_len) * -1e8 loss_data = torch.randn(1, dtype=hidden_states.dtype).sum() - custom_dec_layers.to(kt.dtype) - for layer in custom_dec_layers: - layer.config.fp16 = kt.dtype == torch.half - custom_dec_layers.zero_grad() - res = hidden_states.clone() - for layer in custom_dec_layers: - res, _, _ = layer( - res, - encoder_out=encoder_out, - encoder_padding_mask=encoder_padding_mask, - incremental_state=incremental_state, - ) - custom_loss = (res / 1000).sum() - custom_loss.data.copy_(loss_data) - - fairseq_dec_layers.to(kt.dtype) - fairseq_dec_layers.zero_grad() - res = hidden_states.transpose(0, 1).clone() - for layer in fairseq_dec_layers: - res, _, _ = layer( - res, - encoder_out=encoder_out, - encoder_padding_mask=encoder_padding_mask, - self_attn_mask=self_attn_mask, - incremental_state=incremental_state, - ) - fairseq_loss = (res / 1000).sum() - fairseq_loss.data.copy_(loss_data) + # if kt.dtype == torch.float: + # custom_dec_layers = custom_dec_layers_32 + # fairseq_dec_layers = fairseq_dec_layers_32 + # else: + custom_dec_layers = custom_dec_layers_16 + fairseq_dec_layers = fairseq_dec_layers_16 def custom(): custom_dec_layers.zero_grad() - custom_loss.backward(retain_graph=True) + res = hidden_states.clone() + for layer in custom_dec_layers: + res, _, _ = layer( + res, + encoder_out=encoder_out, + encoder_padding_mask=encoder_padding_mask, + incremental_state=incremental_state, + ) + custom_loss = (res / 1000).sum() + custom_loss.data.copy_(loss_data) + custom_loss.backward() 
grad_list = [] for i in range(global_config.num_layers - 1, -1, -1): @@ -305,7 +310,18 @@ def custom(): def baseline(): fairseq_dec_layers.zero_grad() - fairseq_loss.backward(retain_graph=True) + res = hidden_states.transpose(0, 1).clone() + for layer in fairseq_dec_layers: + res, _, _ = layer( + res, + encoder_out=encoder_out, + encoder_padding_mask=encoder_padding_mask, + self_attn_mask=self_attn_mask, + incremental_state=incremental_state, + ) + fairseq_loss = (res / 1000).sum() + fairseq_loss.data.copy_(loss_data) + fairseq_loss.backward() grad_list = [] for i in range(global_config.num_layers - 1, -1, -1): @@ -328,10 +344,10 @@ def baseline(): curl.self_attn.v_proj.bias, curl.self_attn_layer_norm.weight, curl.self_attn_layer_norm.bias, - curl.self_attn.q_proj.weight, - curl.self_attn.q_proj.bias, - curl.self_attn.out_proj.weight, - curl.self_attn.out_proj.bias, + curl.encodec_attn.q_proj.weight, + curl.encodec_attn.q_proj.bias, + curl.encodec_attn.out_proj.weight, + curl.encodec_attn.out_proj.bias, curl.encodec_attn_layer_norm.weight, curl.encodec_attn_layer_norm.bias, ] @@ -352,7 +368,7 @@ def baseline(): return custom, baseline -@kt.case(rtol=1e-3, atol=1e-2, ntest=10) +@kt.case(dtypes=[torch.half], rtol=1e-3, atol=1e-2, ntest=10) def test_decoder_layer_forward_inference(): batch_size, enc_seq_len = kt.bs_sl() print(f"(batch_size, enc_seq_len): ({batch_size}, {enc_seq_len})") @@ -376,6 +392,13 @@ def test_decoder_layer_forward_inference(): hidden_states = kt.rand((batch_size, 1, hidden_size)) hidden_states_list.append(hidden_states) + # if kt.dtype == torch.float: + # custom_dec_layers = custom_dec_layers_32 + # fairseq_dec_layers = fairseq_dec_layers_32 + # else: + custom_dec_layers = custom_dec_layers_16 + fairseq_dec_layers = fairseq_dec_layers_16 + def custom(): incremental_state = {} res_list = [] @@ -411,7 +434,7 @@ def baseline(): return custom, baseline -@kt.case(rtol=1e-3, atol=1e-3, ntest=10) +@kt.case(dtypes=[torch.half], ntest=10) def test_embedding_layer_forward(): batch_size, seq_len = kt.bs_sl() print(f"(batch_size, seq_len): ({batch_size}, {seq_len})") @@ -423,9 +446,12 @@ def test_embedding_layer_forward(): ) input = input * (1 - padding_mask) + global_config.padding_idx * padding_mask - custom_emb_layer.to(kt.dtype) - custom_emb_layer.config.fp16 = kt.dtype == torch.half - fairseq_emb_layer.to(kt.dtype) + # if kt.dtype == torch.float: + # custom_emb_layer = custom_emb_layer_32 + # fairseq_emb_layer = fairseq_emb_layer_32 + # else: + custom_emb_layer = custom_emb_layer_16 + fairseq_emb_layer = fairseq_emb_layer_16 def custom(): res = custom_emb_layer(input) @@ -442,7 +468,7 @@ def baseline(): return custom, baseline -@kt.case(ntest=10) +@kt.case(dtypes=[torch.half], ntest=10) def test_embedding_layer_backward(): batch_size, seq_len = kt.bs_sl() print(f"(batch_size, seq_len): ({batch_size}, {seq_len})") @@ -454,15 +480,19 @@ def test_embedding_layer_backward(): input = input * (1 - padding_mask) + global_config.padding_idx * padding_mask loss_data = torch.randn(1, dtype=kt.dtype).sum() - custom_emb_layer.to(kt.dtype) - custom_emb_layer.config.fp16 = kt.dtype == torch.half + # if kt.dtype == torch.float: + # custom_emb_layer = custom_emb_layer_32 + # fairseq_emb_layer = fairseq_emb_layer_32 + # else: + custom_emb_layer = custom_emb_layer_16 + fairseq_emb_layer = fairseq_emb_layer_16 + custom_emb_layer.zero_grad() custom_input = input.clone() res = custom_emb_layer(custom_input) custom_loss = (res / 1000).sum() custom_loss.data.copy_(loss_data) - 
fairseq_emb_layer.to(kt.dtype) fairseq_emb_layer.zero_grad() fs_input = input.clone() res = fairseq_emb_layer(fs_input) @@ -488,24 +518,27 @@ def baseline(): return custom, baseline -@kt.case(ntest=10) +@kt.case(dtypes=[torch.half], ntest=10) def test_cross_entropy_layer_forward(): batch_size, seq_len = kt.bs_sl() vocab_size = random.randint(30413, 40519) print(f"(batch_size, seq_len, vocab_size): ({batch_size}, {seq_len}, {vocab_size})") inputs = kt.rand((batch_size, seq_len, vocab_size)) - targets = kt.randint( - global_config.padding_idx - 1, vocab_size, (batch_size, seq_len) - ) + targets = kt.randint(0, vocab_size, (batch_size, seq_len)) targets_32 = targets.to(torch.int32) - custom_ce_layer.to(kt.dtype) - custom_ce_layer.config.fp16 = kt.dtype == torch.half - fairseq_ce_layer.to(kt.dtype) + # if kt.dtype == torch.float: + # custom_ce_layer = custom_ce_layer_32 + # fairseq_ce_layer = fairseq_ce_layer_32 + # else: + custom_ce_layer = custom_ce_layer_16 + fairseq_ce_layer = fairseq_ce_layer_16 def custom(): loss, cus_nll_loss = custom_ce_layer(inputs, targets_32) + loss = loss.to(inputs) + cus_nll_loss = cus_nll_loss.to(inputs) return [ loss.contiguous().detach(), cus_nll_loss.contiguous().detach(), @@ -521,7 +554,7 @@ def baseline(): return custom, baseline -@kt.case(ntest=10) +@kt.case(dtypes=[torch.half], ntest=10) def test_cross_entropy_layer_backward(): batch_size, seq_len = kt.bs_sl() vocab_size = random.randint(30413, 40519) @@ -529,19 +562,21 @@ def test_cross_entropy_layer_backward(): base_inputs = kt.rand((batch_size, seq_len, vocab_size)).requires_grad_() cus_inputs = base_inputs.clone().detach().requires_grad_() - targets = kt.randint( - global_config.padding_idx - 1, vocab_size, (batch_size, seq_len) - ) + targets = kt.randint(0, vocab_size, (batch_size, seq_len)) targets_32 = targets.to(torch.int32) - custom_ce_layer.to(kt.dtype) - custom_ce_layer.config.fp16 = kt.dtype == torch.half + # if kt.dtype == torch.float: + # custom_ce_layer = custom_ce_layer_32 + # fairseq_ce_layer = fairseq_ce_layer_32 + # else: + custom_ce_layer = custom_ce_layer_16 + fairseq_ce_layer = fairseq_ce_layer_16 + custom_ce_layer.zero_grad() - custom_loss, _ = custom_ce_layer(base_inputs, targets_32) + custom_loss, _ = custom_ce_layer(cus_inputs, targets_32) - fairseq_ce_layer.to(kt.dtype) fairseq_ce_layer.zero_grad() - base_loss, _ = fairseq_ce_layer(cus_inputs, targets) + base_loss, _ = fairseq_ce_layer(base_inputs, targets) def custom(): custom_ce_layer.zero_grad() From 0682ec653a34530a8e9e4b79f411b9dcedae4632 Mon Sep 17 00:00:00 2001 From: godweiyang Date: Wed, 21 Jul 2021 00:00:10 +0800 Subject: [PATCH 11/24] generate random config for unit test --- tests/gen_test_layers.py | 1 - tests/test_ls_kernels.py | 105 +++++++++++++--------------- tests/test_ls_ops.py | 143 +++++++++------------------------------ tests/util.py | 56 +++++++++------ 4 files changed, 115 insertions(+), 190 deletions(-) diff --git a/tests/gen_test_layers.py b/tests/gen_test_layers.py index 07b7edd6..1ed51590 100644 --- a/tests/gen_test_layers.py +++ b/tests/gen_test_layers.py @@ -10,7 +10,6 @@ ) from lightseq.training import ( LSTransformerEncoderLayer, - LSTransformerDecoderLayer, LSTransformerEmbeddingLayer, LSCrossEntropyLayer, ) diff --git a/tests/test_ls_kernels.py b/tests/test_ls_kernels.py index 6600e9df..293e24f8 100644 --- a/tests/test_ls_kernels.py +++ b/tests/test_ls_kernels.py @@ -14,8 +14,7 @@ @kt.case() def test_launch_bias_add_transform_20314(): batch_size, seq_len = kt.bs_sl() - hidden_dim = kt.hidden_dim - 
nhead = kt.nhead + hidden_dim, nhead = kt.h_nh head_dim = int(hidden_dim / nhead) count = random.randint(1, 20) print( @@ -56,8 +55,7 @@ def baseline(): @kt.case() def test_launch_transform_0213(): batch_size, seq_len = kt.bs_sl() - hidden_dim = kt.hidden_dim - nhead = kt.nhead + hidden_dim, nhead = kt.h_nh head_dim = int(hidden_dim / nhead) print( "(batch_size, seq_len, hidden_dim, nhead): " @@ -93,8 +91,7 @@ def baseline(): @kt.case() def test_launch_transform4d_0213(): batch_size, seq_len = kt.bs_sl() - hidden_dim = kt.hidden_dim - nhead = kt.nhead + hidden_dim, nhead = kt.h_nh head_dim = int(hidden_dim / nhead) trans_count = random.choice([1, 3]) print( @@ -128,7 +125,7 @@ def baseline(): return custom, baseline -@kt.case(atol=1e-3, rtol=1e-3, ntest=20) +@kt.case(rtol=1e-3, atol=1e-3) def test_launch_attn_softmax(): batch_size, from_len = kt.bs_sl() is_dec_self_attn = random.choice([True, False]) @@ -145,7 +142,7 @@ def test_launch_attn_softmax(): beam_size = random.choice([3, 4, 5]) batch_size *= beam_size - nhead = kt.nhead + _, nhead = kt.h_nh print( "(batch_size, nhead, from_len, to_len, is_dec_self_attn): " f"({batch_size}, {nhead}, {from_len}, {to_len}, {is_dec_self_attn})" @@ -199,9 +196,9 @@ def baseline(): return custom, baseline -@kt.case(atol=1e-2, rtol=1e-3) +@kt.case(rtol=1e-3, atol=1e-2) def test_launch_attn_softmax_bw(): - nhead = kt.nhead + _, nhead = kt.h_nh batch_size, from_len = kt.bs_sl() _, to_len = kt.bs_sl(batch_size) print( @@ -243,7 +240,7 @@ def baseline(): @kt.case() def test_launch_fused_add2(): batch_size, seq_len = kt.bs_sl() - hidden_dim = kt.hidden_dim + hidden_dim, _ = kt.h_nh print( "(batch_size, seq_len, hidden_dim): " f"({batch_size}, {seq_len}, {hidden_dim})" ) @@ -275,11 +272,11 @@ def baseline(): return custom, baseline -@kt.case(atol=1e-2, rtol=1e-3) +@kt.case(rtol=1e-3, atol=1e-2) def test_launch_layer_norm(): batch_size, seq_len = kt.bs_sl() bsz_seq = batch_size * seq_len - hidden_dim = kt.hidden_dim + hidden_dim, _ = kt.h_nh with_mean = random.choice([True, False]) print( "(batch_token_num, hidden_dim, with_mean): " @@ -316,11 +313,11 @@ def baseline(): return custom, baseline -@kt.case(atol=1e-3, rtol=1e-2) +@kt.case(rtol=1e-2, atol=1e-3) def test_launch_ln_bw(): batch_size, seq_len = kt.bs_sl() bsz_seq = batch_size * seq_len - hidden_dim = kt.hidden_dim + hidden_dim, _ = kt.h_nh with_mean = random.choice([True, False]) fuse_add = random.choice([True, False]) print( @@ -396,10 +393,10 @@ def baseline(): return custom, baseline -@kt.case() +@kt.case(rtol=1e-3, atol=1e-4) def test_launch_ffn_bias_bwd(): batch_size, seq_len = kt.bs_sl() - hidden_dim = kt.hidden_dim + hidden_dim, _ = kt.h_nh coef = random.randint(1, 4) print("(rows, cols): " f"({batch_size*seq_len}, {coef*hidden_dim})") @@ -434,8 +431,7 @@ def baseline(): @kt.case() def test_launch_concat3_dim1(): batch_size, seq_len = kt.bs_sl() - hidden_dim = kt.hidden_dim - nhead = kt.nhead + hidden_dim, nhead = kt.h_nh head_dim = int(hidden_dim / nhead) assert seq_len > 1 sl1 = random.randint(1, seq_len - 1) @@ -465,10 +461,10 @@ def baseline(): return custom, baseline -@kt.case(dtypes=[torch.float32]) +@kt.case(dtypes=[torch.float]) def test_adam(): batch_size, seq_len = kt.bs_sl() - hidden_dim = kt.hidden_dim + hidden_dim, _ = kt.h_nh cus_p = kt.rand((batch_size, seq_len, hidden_dim * 32)) cus_out_p = kt.rand((batch_size, seq_len, hidden_dim * 32)) cus_exp_avg = kt.rand((batch_size, seq_len, hidden_dim * 32)) @@ -524,20 +520,15 @@ def baseline(): return custom, baseline 
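For orientation, the fused optimizer exercised by test_adam above has to reproduce the standard Adam update on the (param, exp_avg, exp_avg_sq) buffers that the test allocates. The reference step below is textbook Adam written with plain PyTorch tensor ops, not code taken from the LightSeq kernel or from tests/util.py; the helper name adam_step is ours and the hyperparameter defaults are illustrative only.

import torch


def adam_step(p, grad, exp_avg, exp_avg_sq, step, lr=1e-3,
              betas=(0.9, 0.999), eps=1e-8):
    # Update first and second moment running averages in place.
    exp_avg.mul_(betas[0]).add_(grad, alpha=1 - betas[0])
    exp_avg_sq.mul_(betas[1]).addcmul_(grad, grad, value=1 - betas[1])
    # Bias correction, then the parameter update.
    bias_c1 = 1 - betas[0] ** step
    bias_c2 = 1 - betas[1] ** step
    denom = (exp_avg_sq / bias_c2).sqrt().add_(eps)
    p.add_(exp_avg / bias_c1 / denom, alpha=-lr)
    return p

A fused kernel performs the same arithmetic in a single launch over all parameters, which is what the float32-only comparison in test_adam checks.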
-@kt.case(dtypes=[torch.float, torch.half], ntest=5, atol=1e-2, rtol=1e-2) +@kt.case(rtol=1e-2, atol=1e-2) def test_launch_dropout_relu_bias(): batch_size, seq_len = kt.bs_sl() - hidden_dim = kt.hidden_dim + hidden_dim, _ = kt.h_nh print("test shape:", (batch_size, seq_len, hidden_dim)) test_input = kt.rand((batch_size, seq_len, hidden_dim)) test_bias = kt.rand((hidden_dim,)) - test_out_base = kt.rand((batch_size, seq_len, hidden_dim)) test_out_cus = kt.rand((batch_size, seq_len, hidden_dim)) - test_mask_base = torch.rand((batch_size, seq_len, hidden_dim)).to( - dtype=torch.uint8, - device="cuda:0", - ) test_mask_cus = torch.rand((batch_size, seq_len, hidden_dim)).to( dtype=torch.uint8, device="cuda:0" ) @@ -569,21 +560,15 @@ def baseline(): return custom, baseline -@kt.case(dtypes=[torch.float, torch.half], ntest=5, atol=1e-2, rtol=1e-2) +@kt.case(rtol=1e-2, atol=1e-2) def test_launch_dropout_gelu_bias(): batch_size, seq_len = kt.bs_sl() - hidden_dim = kt.hidden_dim + hidden_dim, _ = kt.h_nh print("test shape:", (batch_size, seq_len, hidden_dim)) test_input = kt.rand((batch_size, seq_len, hidden_dim)) test_bias = kt.rand((hidden_dim,)) - test_out_base = kt.rand((batch_size, seq_len, hidden_dim)) test_out_cus = kt.rand((batch_size, seq_len, hidden_dim)) - temp = kt.rand((batch_size, seq_len, hidden_dim)) - test_mask_base = torch.rand((batch_size, seq_len, hidden_dim)).to( - dtype=torch.uint8, - device="cuda:0", - ) test_mask_cus = torch.rand((batch_size, seq_len, hidden_dim)).to( dtype=torch.uint8, device="cuda:0" ) @@ -615,10 +600,11 @@ def baseline(): return custom, baseline -@kt.case(dtypes=[torch.float, torch.half], ntest=5, atol=1e-2, rtol=1e-2) +@kt.case(rtol=1e-2, atol=1e-2) def test_launch_dropout_relu_bias_bwd(): batch_size, seq_len = kt.bs_sl() - hidden_dim = kt.hidden_dim * 4 + hidden_dim, _ = kt.h_nh + hidden_dim *= 4 print("test shape:", (batch_size, seq_len, hidden_dim)) test_input = kt.rand((batch_size, seq_len, hidden_dim)) @@ -661,10 +647,11 @@ def baseline(): return custom, baseline -@kt.case(dtypes=[torch.float, torch.half], ntest=5, atol=1e-2, rtol=1e-2) +@kt.case(rtol=1e-2, atol=1e-2) def test_launch_dropout_gelu_bias_bwd(): batch_size, seq_len = kt.bs_sl() - hidden_dim = kt.hidden_dim * 4 + hidden_dim, _ = kt.h_nh + hidden_dim *= 4 print("test shape:", (batch_size, seq_len, hidden_dim)) test_input = kt.rand((batch_size, seq_len, hidden_dim)) @@ -719,22 +706,22 @@ def baseline(): if __name__ == "__main__": - kt.init(device="cuda:0", nhead=16) - kernel_list = [ - "test_launch_transform_0213", - "test_launch_bias_add_transform_20314", - "test_launch_transform4d_0213", - "test_launch_fused_add2", - "test_launch_ffn_bias_bwd", - "test_launch_attn_softmax", - "test_launch_attn_softmax_bw", - "test_launch_layer_norm", - "test_launch_ln_bw", - "test_launch_concat3_dim1", - "test_adam", - "test_launch_dropout_gelu_bias", - "test_launch_dropout_relu_bias", - "test_launch_dropout_relu_bias_bwd", - "test_launch_dropout_gelu_bias_bwd", - ] - kt.run(kernel_list) + kt.run( + [ + "test_launch_transform_0213", + "test_launch_bias_add_transform_20314", + "test_launch_transform4d_0213", + "test_launch_fused_add2", + "test_launch_ffn_bias_bwd", + "test_launch_attn_softmax", + "test_launch_attn_softmax_bw", + "test_launch_layer_norm", + "test_launch_ln_bw", + "test_launch_concat3_dim1", + "test_adam", + "test_launch_dropout_gelu_bias", + "test_launch_dropout_relu_bias", + "test_launch_dropout_relu_bias_bwd", + "test_launch_dropout_gelu_bias_bwd", + ] + ) diff --git a/tests/test_ls_ops.py 
b/tests/test_ls_ops.py index f316b352..bd8f5cad 100644 --- a/tests/test_ls_ops.py +++ b/tests/test_ls_ops.py @@ -2,11 +2,9 @@ from copy import deepcopy import torch -from torch.nn.functional import nll_loss from tests.util import ( TestDecorator, - global_config, split_custom_layer_grad, copy_grad_from_paras, ) @@ -19,38 +17,24 @@ kt = TestDecorator() +config = kt.generate_config(use_default=False) +kt.dtypes = [torch.half if config.fp16 else torch.float] -# config_32 = deepcopy(global_config) -# config_32.fp16 = False -config_16 = deepcopy(global_config) -config_16.fp16 = True +custom_enc_layers, fairseq_enc_layers = gen_enc_layer(config) +custom_dec_layers, fairseq_dec_layers = gen_dec_layer(config) +custom_emb_layer, fairseq_emb_layer = gen_emb_layer(config) +custom_ce_layer, fairseq_ce_layer = gen_ce_layer(config) -# custom_enc_layers_32, fairseq_enc_layers_32 = gen_enc_layer(config_32) -# custom_dec_layers_32, fairseq_dec_layers_32 = gen_dec_layer(config_32) -# custom_emb_layer_32, fairseq_emb_layer_32 = gen_emb_layer(config_32) -# custom_ce_layer_32, fairseq_ce_layer_32 = gen_ce_layer(config_32) -custom_enc_layers_16, fairseq_enc_layers_16 = gen_enc_layer(config_16) -custom_dec_layers_16, fairseq_dec_layers_16 = gen_dec_layer(config_16) -custom_emb_layer_16, fairseq_emb_layer_16 = gen_emb_layer(config_16) -custom_ce_layer_16, fairseq_ce_layer_16 = gen_ce_layer(config_16) - -@kt.case(dtypes=[torch.half], rtol=1e-3, atol=1e-2, ntest=10) +@kt.case(rtol=1e-3, atol=1e-2) def test_encoder_layer_forward(): batch_size, seq_len = kt.bs_sl() print(f"(batch_size, seq_len): ({batch_size}, {seq_len})") - hidden_size = global_config.hidden_size + hidden_size = config.hidden_size hidden_states = kt.rand((batch_size, seq_len, hidden_size)) self_attn_padding_mask = kt.attn_mask(batch_size, seq_len, dtype=torch.bool) - # if kt.dtype == torch.float: - # custom_enc_layers = custom_enc_layers_32 - # fairseq_enc_layers = fairseq_enc_layers_32 - # else: - custom_enc_layers = custom_enc_layers_16 - fairseq_enc_layers = fairseq_enc_layers_16 - def custom(): res = hidden_states.clone() for layer in custom_enc_layers: @@ -70,24 +54,17 @@ def baseline(): return custom, baseline -@kt.case(dtypes=[torch.half], rtol=1e-2, atol=1e-2, ntest=10) +@kt.case(rtol=1e-2, atol=1e-2) def test_encoder_layer_backward(): batch_size, seq_len = kt.bs_sl() print(f"(batch_size, seq_len): ({batch_size}, {seq_len})") - hidden_size = global_config.hidden_size + hidden_size = config.hidden_size shs = hidden_size * hidden_size hidden_states = kt.rand((batch_size, seq_len, hidden_size)) self_attn_padding_mask = kt.attn_mask(batch_size, seq_len, dtype=torch.bool) loss_data = torch.randn(1, dtype=hidden_states.dtype).sum() - # if kt.dtype == torch.float: - # custom_enc_layers = custom_enc_layers_32 - # fairseq_enc_layers = fairseq_enc_layers_32 - # else: - custom_enc_layers = custom_enc_layers_16 - fairseq_enc_layers = fairseq_enc_layers_16 - # custom fw custom_enc_layers.zero_grad() res = hidden_states.clone() @@ -109,7 +86,7 @@ def custom(): custom_loss.backward(retain_graph=True) grad_list = [] - for i in range(global_config.num_layers - 1, -1, -1): + for i in range(config.num_layers - 1, -1, -1): """ attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w, inter_b, output_w, output_b, ffn_nw, ffn_nb @@ -142,7 +119,7 @@ def baseline(): fairseq_loss.backward(retain_graph=True) grad_list = [] - for i in range(global_config.num_layers - 1, -1, -1): + for i in range(config.num_layers - 1, -1, -1): curl = fairseq_enc_layers[i] 
cur_grads = copy_grad_from_paras( [ @@ -170,7 +147,7 @@ def baseline(): return custom, baseline -@kt.case(dtypes=[torch.half], rtol=1e-3, atol=1e-2, ntest=10) +@kt.case(rtol=1e-3, atol=1e-2) def test_decoder_layer_forward(): batch_size, enc_seq_len = kt.bs_sl() _, dec_seq_len = kt.bs_sl(batch_size) @@ -178,20 +155,13 @@ def test_decoder_layer_forward(): f"(batch_size, enc_seq_len, dec_seq_len): ({batch_size}, {enc_seq_len}, {dec_seq_len})" ) - hidden_size = global_config.hidden_size + hidden_size = config.hidden_size hidden_states = kt.rand((batch_size, dec_seq_len, hidden_size)) encoder_out = kt.rand((enc_seq_len, batch_size, hidden_size)) incremental_state = None encoder_padding_mask = kt.attn_mask(batch_size, enc_seq_len, dtype=torch.bool) self_attn_mask = kt.dec_self_attn_mask(dec_seq_len) * -1e8 - # if kt.dtype == torch.float: - # custom_dec_layers = custom_dec_layers_32 - # fairseq_dec_layers = fairseq_dec_layers_32 - # else: - custom_dec_layers = custom_dec_layers_16 - fairseq_dec_layers = fairseq_dec_layers_16 - def custom(): res = hidden_states.clone() for layer in custom_dec_layers: @@ -222,7 +192,7 @@ def baseline(): return custom, baseline -@kt.case(dtypes=[torch.half], rtol=1e-2, atol=1e-2, ntest=10) +@kt.case(rtol=1e-2, atol=1e-2) def test_decoder_layer_backward(): batch_size, enc_seq_len = kt.bs_sl() _, dec_seq_len = kt.bs_sl(batch_size) @@ -230,7 +200,7 @@ def test_decoder_layer_backward(): f"(batch_size, enc_seq_len, dec_seq_len): ({batch_size}, {enc_seq_len}, {dec_seq_len})" ) - hidden_size = global_config.hidden_size + hidden_size = config.hidden_size shs = hidden_size * hidden_size hidden_states = kt.rand((batch_size, dec_seq_len, hidden_size)) encoder_out = kt.rand((enc_seq_len, batch_size, hidden_size)) @@ -239,13 +209,6 @@ def test_decoder_layer_backward(): self_attn_mask = kt.dec_self_attn_mask(dec_seq_len) * -1e8 loss_data = torch.randn(1, dtype=hidden_states.dtype).sum() - # if kt.dtype == torch.float: - # custom_dec_layers = custom_dec_layers_32 - # fairseq_dec_layers = fairseq_dec_layers_32 - # else: - custom_dec_layers = custom_dec_layers_16 - fairseq_dec_layers = fairseq_dec_layers_16 - def custom(): custom_dec_layers.zero_grad() res = hidden_states.clone() @@ -261,7 +224,7 @@ def custom(): custom_loss.backward() grad_list = [] - for i in range(global_config.num_layers - 1, -1, -1): + for i in range(config.num_layers - 1, -1, -1): """ 0 attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, 6 encdec_attn_qw, encdec_attn_qb, encdec_attn_ow, encdec_attn_ob, encdec_attn_nw, encdec_attn_nb, @@ -324,7 +287,7 @@ def baseline(): fairseq_loss.backward() grad_list = [] - for i in range(global_config.num_layers - 1, -1, -1): + for i in range(config.num_layers - 1, -1, -1): curl = fairseq_dec_layers[i] cur_grads = copy_grad_from_paras( [ @@ -368,12 +331,12 @@ def baseline(): return custom, baseline -@kt.case(dtypes=[torch.half], rtol=1e-3, atol=1e-2, ntest=10) +@kt.case(rtol=1e-3, atol=1e-2) def test_decoder_layer_forward_inference(): batch_size, enc_seq_len = kt.bs_sl() print(f"(batch_size, enc_seq_len): ({batch_size}, {enc_seq_len})") - hidden_size = global_config.hidden_size + hidden_size = config.hidden_size # beam_size = random.randint(2, 5) # print(f"(batch_size, enc_seq_len, beam_size): ({batch_size}, {enc_seq_len}, {beam_size})") @@ -392,19 +355,12 @@ def test_decoder_layer_forward_inference(): hidden_states = kt.rand((batch_size, 1, hidden_size)) hidden_states_list.append(hidden_states) - # if kt.dtype == torch.float: - # custom_dec_layers = 
custom_dec_layers_32 - # fairseq_dec_layers = fairseq_dec_layers_32 - # else: - custom_dec_layers = custom_dec_layers_16 - fairseq_dec_layers = fairseq_dec_layers_16 - def custom(): incremental_state = {} res_list = [] for i in range(max_step): res = hidden_states_list[i].clone() - for i in range(global_config.num_layers): + for i in range(config.num_layers): res, _, _ = custom_dec_layers[i]( res, # encoder_out=ls_encoder_out.transpose(0, 1), @@ -421,7 +377,7 @@ def baseline(): res_list = [] for i in range(max_step): res = hidden_states_list[i].transpose(0, 1).clone() - for i in range(global_config.num_layers): + for i in range(config.num_layers): res, _, _ = fairseq_dec_layers[i]( res, encoder_out=encoder_out, @@ -434,24 +390,15 @@ def baseline(): return custom, baseline -@kt.case(dtypes=[torch.half], ntest=10) +@kt.case(rtol=1e-3, atol=1e-3) def test_embedding_layer_forward(): batch_size, seq_len = kt.bs_sl() print(f"(batch_size, seq_len): ({batch_size}, {seq_len})") padding_mask = kt.attn_mask(batch_size, seq_len, dtype=torch.int) # TODO: can not generate PAD in the middle of the sentences. - input = kt.randint( - global_config.padding_idx + 1, global_config.vocab_size, (batch_size, seq_len) - ) - input = input * (1 - padding_mask) + global_config.padding_idx * padding_mask - - # if kt.dtype == torch.float: - # custom_emb_layer = custom_emb_layer_32 - # fairseq_emb_layer = fairseq_emb_layer_32 - # else: - custom_emb_layer = custom_emb_layer_16 - fairseq_emb_layer = fairseq_emb_layer_16 + input = kt.randint(config.padding_idx + 1, config.vocab_size, (batch_size, seq_len)) + input = input * (1 - padding_mask) + config.padding_idx * padding_mask def custom(): res = custom_emb_layer(input) @@ -468,25 +415,16 @@ def baseline(): return custom, baseline -@kt.case(dtypes=[torch.half], ntest=10) +@kt.case(rtol=1e-3, atol=1e-3) def test_embedding_layer_backward(): batch_size, seq_len = kt.bs_sl() print(f"(batch_size, seq_len): ({batch_size}, {seq_len})") padding_mask = kt.attn_mask(batch_size, seq_len, dtype=torch.int) - input = kt.randint( - global_config.padding_idx + 1, global_config.vocab_size, (batch_size, seq_len) - ) - input = input * (1 - padding_mask) + global_config.padding_idx * padding_mask + input = kt.randint(config.padding_idx + 1, config.vocab_size, (batch_size, seq_len)) + input = input * (1 - padding_mask) + config.padding_idx * padding_mask loss_data = torch.randn(1, dtype=kt.dtype).sum() - # if kt.dtype == torch.float: - # custom_emb_layer = custom_emb_layer_32 - # fairseq_emb_layer = fairseq_emb_layer_32 - # else: - custom_emb_layer = custom_emb_layer_16 - fairseq_emb_layer = fairseq_emb_layer_16 - custom_emb_layer.zero_grad() custom_input = input.clone() res = custom_emb_layer(custom_input) @@ -518,23 +456,16 @@ def baseline(): return custom, baseline -@kt.case(dtypes=[torch.half], ntest=10) +@kt.case() def test_cross_entropy_layer_forward(): batch_size, seq_len = kt.bs_sl() - vocab_size = random.randint(30413, 40519) + vocab_size = random.randint(1000, 42000) print(f"(batch_size, seq_len, vocab_size): ({batch_size}, {seq_len}, {vocab_size})") inputs = kt.rand((batch_size, seq_len, vocab_size)) targets = kt.randint(0, vocab_size, (batch_size, seq_len)) targets_32 = targets.to(torch.int32) - # if kt.dtype == torch.float: - # custom_ce_layer = custom_ce_layer_32 - # fairseq_ce_layer = fairseq_ce_layer_32 - # else: - custom_ce_layer = custom_ce_layer_16 - fairseq_ce_layer = fairseq_ce_layer_16 - def custom(): loss, cus_nll_loss = custom_ce_layer(inputs, targets_32) loss = 
loss.to(inputs) @@ -554,10 +485,10 @@ def baseline(): return custom, baseline -@kt.case(dtypes=[torch.half], ntest=10) +@kt.case() def test_cross_entropy_layer_backward(): batch_size, seq_len = kt.bs_sl() - vocab_size = random.randint(30413, 40519) + vocab_size = random.randint(1000, 42000) print(f"(batch_size, seq_len, vocab_size): ({batch_size}, {seq_len}, {vocab_size})") base_inputs = kt.rand((batch_size, seq_len, vocab_size)).requires_grad_() @@ -565,13 +496,6 @@ def test_cross_entropy_layer_backward(): targets = kt.randint(0, vocab_size, (batch_size, seq_len)) targets_32 = targets.to(torch.int32) - # if kt.dtype == torch.float: - # custom_ce_layer = custom_ce_layer_32 - # fairseq_ce_layer = fairseq_ce_layer_32 - # else: - custom_ce_layer = custom_ce_layer_16 - fairseq_ce_layer = fairseq_ce_layer_16 - custom_ce_layer.zero_grad() custom_loss, _ = custom_ce_layer(cus_inputs, targets_32) @@ -598,9 +522,6 @@ def baseline(): if __name__ == "__main__": - kt.init( - device="cuda:{}".format(global_config.local_rank), nhead=global_config.nhead - ) kt.run( [ "test_encoder_layer_forward", diff --git a/tests/util.py b/tests/util.py index 26b4430b..36469117 100644 --- a/tests/util.py +++ b/tests/util.py @@ -2,6 +2,7 @@ import time from collections import OrderedDict from dataclasses import dataclass +from copy import deepcopy import numpy as np import torch @@ -27,10 +28,10 @@ class Config: label_smooth: float -global_config = Config( +default_config = Config( max_batch_tokens=9216, max_seq_len=256, - vocab_size=40480, + vocab_size=32000, padding_idx=0, hidden_size=1024, intermediate_size=1024 * 4, @@ -42,7 +43,7 @@ class Config: fp16=True, local_rank=0, activation_fn="relu", - num_layers=1, + num_layers=2, label_smooth=0.1, ) @@ -52,30 +53,47 @@ def __init__(self): self.all_case = OrderedDict() self.dtypes = [torch.float, torch.half] self.dtype = None - self.max_batch_tokens = global_config.max_batch_tokens - self.max_seq_len = global_config.max_seq_len - - def init(self, device, nhead): - # device: str. e.g. 
"cuda:0" - self.device = torch.device(device) - assert nhead % 4 == 0 - self.nhead = nhead + self.device = torch.device("cuda:{}".format(default_config.local_rank)) + + def generate_config(self, use_default=False): + if use_default: + return deepcopy(default_config) + config = deepcopy(default_config) + config.vocab_size = random.randint(1000, 42000) + hidden_size, nhead = self.h_nh + config.hidden_size = hidden_size + config.intermediate_size = hidden_size * 4 + config.nhead = nhead + config.pre_layer_norm = random.choice([True, False]) + config.activation_fn = self.act_fn + config.num_layers = random.randint(1, 2) + return config def bs_sl(self, batch_size=None): if batch_size is None: - seq_len = random.randint(1, self.max_seq_len) - max_batch_size = self.max_batch_tokens // seq_len + seq_len = random.randint(1, default_config.max_seq_len) + max_batch_size = default_config.max_batch_tokens // seq_len batch_size = random.randint(1, max_batch_size) else: - max_seq_len = min(self.max_batch_tokens // batch_size, self.max_seq_len) + max_seq_len = min( + default_config.max_batch_tokens // batch_size, + default_config.max_seq_len, + ) seq_len = random.randint(1, max_seq_len) return batch_size, seq_len @property - def hidden_dim(self): - hs = random.choice([512, 768, 1024, 1536]) - assert hs % (self.nhead * 8) == 0 - return hs + def h_nh(self): + while True: + hs = random.choice([512, 768, 1024, 1536, 2048, 4096]) + nhead = random.choice([8, 12, 16]) + if hs % (nhead * 8) == 0: + return hs, nhead + + @property + def act_fn(self): + act = random.choice(["relu", "gelu"]) + return act def move(self, data): return data.to(self.device, dtype=self.dtype) @@ -120,7 +138,7 @@ def dec_self_attn_mask(self, seq_len, dtype=None): mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1) return mask.to(self.device, dtype=dtype) - def case(self, dtypes=list(), ntest=5, nrepeat=5, rtol=1e-5, atol=1e-5): + def case(self, dtypes=list(), ntest=10, nrepeat=5, rtol=1e-5, atol=1e-5): if not dtypes: dtypes = self.dtypes From 91b80f081bd44082a35a589ef4973e15381998be Mon Sep 17 00:00:00 2001 From: godweiyang Date: Wed, 21 Jul 2021 01:59:35 +0800 Subject: [PATCH 12/24] modify demo example using huggingface tokenizer --- examples/training/custom/demo.py | 86 ++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 33 deletions(-) diff --git a/examples/training/custom/demo.py b/examples/training/custom/demo.py index ce9552e2..025995fd 100644 --- a/examples/training/custom/demo.py +++ b/examples/training/custom/demo.py @@ -1,39 +1,46 @@ import torch +from transformers import BertTokenizer from lightseq.training import LSTransformer, LSCrossEntropyLayer, LSAdam -vocab_size, padding_idx = 1000, 0 -batch_size, src_seq_len, trg_seq_len = 6, 10, 15 - def create_data(): - src_tokens = torch.randint( - padding_idx, - vocab_size, - (batch_size, src_seq_len), - dtype=torch.long, - device=torch.device("cuda:0"), - ) - trg_tokens = torch.randint( - padding_idx, + tokenizer = BertTokenizer.from_pretrained("bert-base-cased") + vocab_size = tokenizer.vocab_size + + src_text = "Which company do you work for?" + src_tokens = tokenizer.encode(src_text, return_tensors="pt") + src_tokens = src_tokens.to(torch.device("cuda:0")) + batch_size, src_seq_len = src_tokens.size(0), src_tokens.size(1) + + trg_text = "I guess it must be LightSeq, because ByteDance is the fastest." 
+ trg_tokens = tokenizer.encode(trg_text, return_tensors="pt") + trg_tokens = trg_tokens.to(torch.device("cuda:0")) + trg_seq_len = trg_tokens.size(1) + + target = trg_tokens.clone()[:, 1:] + trg_tokens = trg_tokens[:, :-1] + return ( + tokenizer, + src_text, + src_tokens, + trg_text, + trg_tokens, + target, vocab_size, - (batch_size, trg_seq_len), - dtype=torch.long, - device=torch.device("cuda:0"), + batch_size, + src_seq_len, + trg_seq_len, ) - target = trg_tokens.clone()[:, 1:] - eos = torch.zeros((batch_size, 1), dtype=torch.long, device=torch.device("cuda:0")) - target = torch.cat([target, eos], dim=-1) - return src_tokens, trg_tokens, target -def create_model(): +def create_model(vocab_size): transformer_config = LSTransformer.get_config( model="transformer-base", - max_batch_tokens=4096, - max_seq_len=256, + max_batch_tokens=2048, + max_seq_len=512, vocab_size=vocab_size, - padding_idx=padding_idx, + padding_idx=0, num_encoder_layer=6, num_decoder_layer=6, fp16=True, @@ -46,8 +53,8 @@ def create_model(): def create_criterion(): ce_config = LSCrossEntropyLayer.get_config( - max_batch_tokens=4096, - padding_idx=padding_idx, + max_batch_tokens=2048, + padding_idx=0, epsilon=0.0, fp16=True, local_rank=0, @@ -58,17 +65,28 @@ def create_criterion(): if __name__ == "__main__": - src_tokens, trg_tokens, target = create_data() - model = create_model() + ( + tokenizer, + src_text, + src_tokens, + trg_text, + trg_tokens, + target, + vocab_size, + batch_size, + src_seq_len, + trg_seq_len, + ) = create_data() + model = create_model(vocab_size) loss_fn = create_criterion() opt = LSAdam(model.parameters(), lr=1e-5) print("========================TRAIN========================") model.train() - for epoch in range(2000): + for epoch in range(1000): output = model(src_tokens, trg_tokens) loss, _ = loss_fn(output, target) - if epoch % 200 == 0: + if epoch % 100 == 0: print("epoch {:03d}: {:.3f}".format(epoch, loss.item())) loss.backward() opt.step() @@ -77,10 +95,12 @@ def create_criterion(): model.eval() encoder_out, encoder_padding_mask = model.encoder(src_tokens) predict_tokens = trg_tokens[:, :1] - for _ in range(trg_seq_len): + for _ in range(trg_seq_len - 1): output = model.decoder(predict_tokens, encoder_out, encoder_padding_mask) output = torch.reshape(torch.argmax(output, dim=-1), (batch_size, -1)) predict_tokens = torch.cat([predict_tokens, output[:, -1:]], dim=-1) - predict_tokens = predict_tokens[:, 1:] - print("target:\n", target) - print("predict_tokens:\n", predict_tokens) + predict_tokens = torch.squeeze(predict_tokens) + predict_text = tokenizer.decode(predict_tokens, skip_special_tokens=True) + print("source:\n", src_text) + print("target:\n", trg_text) + print("predict:\n", predict_text) From 0508e333e25755d2514cb706337f15205814d1ce Mon Sep 17 00:00:00 2001 From: godweiyang Date: Wed, 21 Jul 2021 02:14:27 +0800 Subject: [PATCH 13/24] modify demo example using huggingface tokenizer --- examples/training/custom/demo.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/training/custom/demo.py b/examples/training/custom/demo.py index 025995fd..7f4f4ff4 100644 --- a/examples/training/custom/demo.py +++ b/examples/training/custom/demo.py @@ -5,21 +5,26 @@ def create_data(): + # create Hugging Face tokenizer tokenizer = BertTokenizer.from_pretrained("bert-base-cased") vocab_size = tokenizer.vocab_size + # source text to id src_text = "Which company do you work for?" 
src_tokens = tokenizer.encode(src_text, return_tensors="pt") src_tokens = src_tokens.to(torch.device("cuda:0")) batch_size, src_seq_len = src_tokens.size(0), src_tokens.size(1) + # target text to id trg_text = "I guess it must be LightSeq, because ByteDance is the fastest." trg_tokens = tokenizer.encode(trg_text, return_tensors="pt") trg_tokens = trg_tokens.to(torch.device("cuda:0")) trg_seq_len = trg_tokens.size(1) + # left shift 1 token as the target output target = trg_tokens.clone()[:, 1:] trg_tokens = trg_tokens[:, :-1] + return ( tokenizer, src_text, @@ -93,13 +98,19 @@ def create_criterion(): print("========================TEST========================") model.eval() + # obtain encoder output and mask encoder_out, encoder_padding_mask = model.encoder(src_tokens) + # use the first token as initial target input predict_tokens = trg_tokens[:, :1] for _ in range(trg_seq_len - 1): + # TODO: use cache to accelerate the inference output = model.decoder(predict_tokens, encoder_out, encoder_padding_mask) + # predict the next token output = torch.reshape(torch.argmax(output, dim=-1), (batch_size, -1)) + # concatenate the next token with previous tokens predict_tokens = torch.cat([predict_tokens, output[:, -1:]], dim=-1) predict_tokens = torch.squeeze(predict_tokens) + # predict id to text predict_text = tokenizer.decode(predict_tokens, skip_special_tokens=True) print("source:\n", src_text) print("target:\n", trg_text) From 5b706f03eb493ce862b718c3ed1bd20209446dc4 Mon Sep 17 00:00:00 2001 From: godweiyang Date: Wed, 21 Jul 2021 11:43:25 +0800 Subject: [PATCH 14/24] use cache to accelerate inference in demo example --- examples/training/custom/demo.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/training/custom/demo.py b/examples/training/custom/demo.py index 7f4f4ff4..1b28afd2 100644 --- a/examples/training/custom/demo.py +++ b/examples/training/custom/demo.py @@ -102,13 +102,14 @@ def create_criterion(): encoder_out, encoder_padding_mask = model.encoder(src_tokens) # use the first token as initial target input predict_tokens = trg_tokens[:, :1] + cache = {} for _ in range(trg_seq_len - 1): - # TODO: use cache to accelerate the inference - output = model.decoder(predict_tokens, encoder_out, encoder_padding_mask) + # use cache to accelerate the inference + output = model.decoder(predict_tokens[:, -1:], encoder_out, encoder_padding_mask, cache) # predict the next token output = torch.reshape(torch.argmax(output, dim=-1), (batch_size, -1)) # concatenate the next token with previous tokens - predict_tokens = torch.cat([predict_tokens, output[:, -1:]], dim=-1) + predict_tokens = torch.cat([predict_tokens, output], dim=-1) predict_tokens = torch.squeeze(predict_tokens) # predict id to text predict_text = tokenizer.decode(predict_tokens, skip_special_tokens=True) From c68a3e4c139568071a2627c1fb09f98bcf985309 Mon Sep 17 00:00:00 2001 From: godweiyang Date: Wed, 21 Jul 2021 11:48:10 +0800 Subject: [PATCH 15/24] use cache to accelerate inference in demo example --- examples/training/custom/demo.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/training/custom/demo.py b/examples/training/custom/demo.py index 1b28afd2..43cb0780 100644 --- a/examples/training/custom/demo.py +++ b/examples/training/custom/demo.py @@ -105,7 +105,9 @@ def create_criterion(): cache = {} for _ in range(trg_seq_len - 1): # use cache to accelerate the inference - output = model.decoder(predict_tokens[:, -1:], encoder_out, encoder_padding_mask, cache) + output = 
model.decoder( + predict_tokens[:, -1:], encoder_out, encoder_padding_mask, cache + ) # predict the next token output = torch.reshape(torch.argmax(output, dim=-1), (batch_size, -1)) # concatenate the next token with previous tokens From c7b77c80e4a039fcedf7bf351e32aee0abaef49d Mon Sep 17 00:00:00 2001 From: "xiongying.taka" Date: Wed, 21 Jul 2021 13:58:54 +0800 Subject: [PATCH 16/24] refactor op test --- tests/test_ls_ops.py | 81 +++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 38 deletions(-) diff --git a/tests/test_ls_ops.py b/tests/test_ls_ops.py index bd8f5cad..4d2d3a7e 100644 --- a/tests/test_ls_ops.py +++ b/tests/test_ls_ops.py @@ -29,9 +29,11 @@ @kt.case(rtol=1e-3, atol=1e-2) def test_encoder_layer_forward(): batch_size, seq_len = kt.bs_sl() - print(f"(batch_size, seq_len): ({batch_size}, {seq_len})") - hidden_size = config.hidden_size + + print( + f"(batch_size, seq_len, hidden_size): ({batch_size}, {seq_len}, {hidden_size})" + ) hidden_states = kt.rand((batch_size, seq_len, hidden_size)) self_attn_padding_mask = kt.attn_mask(batch_size, seq_len, dtype=torch.bool) @@ -57,13 +59,14 @@ def baseline(): @kt.case(rtol=1e-2, atol=1e-2) def test_encoder_layer_backward(): batch_size, seq_len = kt.bs_sl() - print(f"(batch_size, seq_len): ({batch_size}, {seq_len})") - hidden_size = config.hidden_size + print( + f"(batch_size, seq_len, hidden_size): ({batch_size}, {seq_len}, {hidden_size})" + ) + shs = hidden_size * hidden_size hidden_states = kt.rand((batch_size, seq_len, hidden_size)) self_attn_padding_mask = kt.attn_mask(batch_size, seq_len, dtype=torch.bool) - loss_data = torch.randn(1, dtype=hidden_states.dtype).sum() # custom fw custom_enc_layers.zero_grad() @@ -71,7 +74,6 @@ def test_encoder_layer_backward(): for layer in custom_enc_layers: res = layer(res, self_attn_padding_mask) custom_loss = (res / 1000).sum() - custom_loss.data.copy_(loss_data) # fairseq fw fairseq_enc_layers.zero_grad() @@ -79,7 +81,6 @@ def test_encoder_layer_backward(): for layer in fairseq_enc_layers: res = layer(res, self_attn_padding_mask) fairseq_loss = (res / 1000).sum() - fairseq_loss.data.copy_(loss_data) def custom(): custom_enc_layers.zero_grad() @@ -151,11 +152,13 @@ def baseline(): def test_decoder_layer_forward(): batch_size, enc_seq_len = kt.bs_sl() _, dec_seq_len = kt.bs_sl(batch_size) + hidden_size = config.hidden_size + print( - f"(batch_size, enc_seq_len, dec_seq_len): ({batch_size}, {enc_seq_len}, {dec_seq_len})" + f"(batch_size, enc_seq_len, dec_seq_len, hidden_size): " + "({batch_size}, {enc_seq_len}, {dec_seq_len}, {hidden_size})" ) - hidden_size = config.hidden_size hidden_states = kt.rand((batch_size, dec_seq_len, hidden_size)) encoder_out = kt.rand((enc_seq_len, batch_size, hidden_size)) incremental_state = None @@ -196,33 +199,44 @@ def baseline(): def test_decoder_layer_backward(): batch_size, enc_seq_len = kt.bs_sl() _, dec_seq_len = kt.bs_sl(batch_size) + hidden_size = config.hidden_size print( - f"(batch_size, enc_seq_len, dec_seq_len): ({batch_size}, {enc_seq_len}, {dec_seq_len})" + f"(batch_size, enc_seq_len, dec_seq_len, hidden_size):({batch_size}, {enc_seq_len}, {dec_seq_len}, {hidden_size})" ) - hidden_size = config.hidden_size shs = hidden_size * hidden_size hidden_states = kt.rand((batch_size, dec_seq_len, hidden_size)) encoder_out = kt.rand((enc_seq_len, batch_size, hidden_size)) incremental_state = None encoder_padding_mask = kt.attn_mask(batch_size, enc_seq_len, dtype=torch.bool) self_attn_mask = kt.dec_self_attn_mask(dec_seq_len) * -1e8 - 
loss_data = torch.randn(1, dtype=hidden_states.dtype).sum() + + cus_res = hidden_states.clone() + cus_encoder_out = encoder_out.clone() + for layer in custom_dec_layers: + cus_res, _, _ = layer( + cus_res, + encoder_out=cus_encoder_out, + encoder_padding_mask=encoder_padding_mask, + incremental_state=incremental_state, + ) + custom_loss = (cus_res / 1000).sum() + + base_res = hidden_states.transpose(0, 1).clone() + base_encoder_out = encoder_out.clone() + for layer in fairseq_dec_layers: + base_res, _, _ = layer( + base_res, + encoder_out=base_encoder_out, + encoder_padding_mask=encoder_padding_mask, + self_attn_mask=self_attn_mask, + incremental_state=incremental_state, + ) + fairseq_loss = (base_res / 1000).sum() def custom(): custom_dec_layers.zero_grad() - res = hidden_states.clone() - for layer in custom_dec_layers: - res, _, _ = layer( - res, - encoder_out=encoder_out, - encoder_padding_mask=encoder_padding_mask, - incremental_state=incremental_state, - ) - custom_loss = (res / 1000).sum() - custom_loss.data.copy_(loss_data) - custom_loss.backward() - + custom_loss.backward(retain_graph=True) grad_list = [] for i in range(config.num_layers - 1, -1, -1): """ @@ -273,18 +287,7 @@ def custom(): def baseline(): fairseq_dec_layers.zero_grad() - res = hidden_states.transpose(0, 1).clone() - for layer in fairseq_dec_layers: - res, _, _ = layer( - res, - encoder_out=encoder_out, - encoder_padding_mask=encoder_padding_mask, - self_attn_mask=self_attn_mask, - incremental_state=incremental_state, - ) - fairseq_loss = (res / 1000).sum() - fairseq_loss.data.copy_(loss_data) - fairseq_loss.backward() + fairseq_loss.backward(retain_graph=True) grad_list = [] for i in range(config.num_layers - 1, -1, -1): @@ -334,9 +337,11 @@ def baseline(): @kt.case(rtol=1e-3, atol=1e-2) def test_decoder_layer_forward_inference(): batch_size, enc_seq_len = kt.bs_sl() - print(f"(batch_size, enc_seq_len): ({batch_size}, {enc_seq_len})") - hidden_size = config.hidden_size + print( + f"(batch_size, enc_seq_len, hidden_size): " + "({batch_size}, {enc_seq_len}, {hidden_size})" + ) # beam_size = random.randint(2, 5) # print(f"(batch_size, enc_seq_len, beam_size): ({batch_size}, {enc_seq_len}, {beam_size})") From 9edbdcbf4e94867bba8de689f925858422ce59db Mon Sep 17 00:00:00 2001 From: godweiyang Date: Thu, 22 Jul 2021 11:54:12 +0800 Subject: [PATCH 17/24] fix test_decoder_bw bug --- tests/test_ls_ops.py | 48 +++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/tests/test_ls_ops.py b/tests/test_ls_ops.py index 4d2d3a7e..edf353af 100644 --- a/tests/test_ls_ops.py +++ b/tests/test_ls_ops.py @@ -211,32 +211,19 @@ def test_decoder_layer_backward(): encoder_padding_mask = kt.attn_mask(batch_size, enc_seq_len, dtype=torch.bool) self_attn_mask = kt.dec_self_attn_mask(dec_seq_len) * -1e8 - cus_res = hidden_states.clone() - cus_encoder_out = encoder_out.clone() - for layer in custom_dec_layers: - cus_res, _, _ = layer( - cus_res, - encoder_out=cus_encoder_out, - encoder_padding_mask=encoder_padding_mask, - incremental_state=incremental_state, - ) - custom_loss = (cus_res / 1000).sum() - - base_res = hidden_states.transpose(0, 1).clone() - base_encoder_out = encoder_out.clone() - for layer in fairseq_dec_layers: - base_res, _, _ = layer( - base_res, - encoder_out=base_encoder_out, - encoder_padding_mask=encoder_padding_mask, - self_attn_mask=self_attn_mask, - incremental_state=incremental_state, - ) - fairseq_loss = (base_res / 1000).sum() - def custom(): 
custom_dec_layers.zero_grad() - custom_loss.backward(retain_graph=True) + cus_res = hidden_states.clone() + cus_encoder_out = encoder_out.clone() + for layer in custom_dec_layers: + cus_res, _, _ = layer( + cus_res, + encoder_out=cus_encoder_out, + encoder_padding_mask=encoder_padding_mask, + incremental_state=incremental_state, + ) + custom_loss = (cus_res / 1000).sum() + custom_loss.backward() grad_list = [] for i in range(config.num_layers - 1, -1, -1): """ @@ -287,7 +274,18 @@ def custom(): def baseline(): fairseq_dec_layers.zero_grad() - fairseq_loss.backward(retain_graph=True) + base_res = hidden_states.transpose(0, 1).clone() + base_encoder_out = encoder_out.clone() + for layer in fairseq_dec_layers: + base_res, _, _ = layer( + base_res, + encoder_out=base_encoder_out, + encoder_padding_mask=encoder_padding_mask, + self_attn_mask=self_attn_mask, + incremental_state=incremental_state, + ) + fairseq_loss = (base_res / 1000).sum() + fairseq_loss.backward() grad_list = [] for i in range(config.num_layers - 1, -1, -1): From 33df7683c85276e4360d667dcd5935c0e01226f3 Mon Sep 17 00:00:00 2001 From: godweiyang Date: Thu, 22 Jul 2021 14:54:05 +0800 Subject: [PATCH 18/24] add multiprocessing for different shapunit test --- tests/test_ls_ops.py | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/tests/test_ls_ops.py b/tests/test_ls_ops.py index edf353af..fd8902cf 100644 --- a/tests/test_ls_ops.py +++ b/tests/test_ls_ops.py @@ -1,5 +1,5 @@ +import multiprocessing as mp import random -from copy import deepcopy import torch @@ -17,13 +17,13 @@ kt = TestDecorator() -config = kt.generate_config(use_default=False) -kt.dtypes = [torch.half if config.fp16 else torch.float] +kt.dtypes = [torch.half] -custom_enc_layers, fairseq_enc_layers = gen_enc_layer(config) -custom_dec_layers, fairseq_dec_layers = gen_dec_layer(config) -custom_emb_layer, fairseq_emb_layer = gen_emb_layer(config) -custom_ce_layer, fairseq_ce_layer = gen_ce_layer(config) +config = None +custom_enc_layers, fairseq_enc_layers = None, None +custom_dec_layers, fairseq_dec_layers = None, None +custom_emb_layer, fairseq_emb_layer = None, None +custom_ce_layer, fairseq_ce_layer = None, None @kt.case(rtol=1e-3, atol=1e-2) @@ -524,17 +524,39 @@ def baseline(): return custom, baseline -if __name__ == "__main__": +def main(epoch): + print(">>>>>>>>>>>>>>>>>>>>>>Test epoch: {}>>>>>>>>>>>>>>>>>>>>>>".format(epoch)) + global config + global custom_enc_layers, fairseq_enc_layers + global custom_dec_layers, fairseq_dec_layers + global custom_emb_layer, fairseq_emb_layer + global custom_ce_layer, fairseq_ce_layer + + config = kt.generate_config(use_default=False) + print(config) + custom_enc_layers, fairseq_enc_layers = gen_enc_layer(config) + custom_dec_layers, fairseq_dec_layers = gen_dec_layer(config) + custom_emb_layer, fairseq_emb_layer = gen_emb_layer(config) + custom_ce_layer, fairseq_ce_layer = gen_ce_layer(config) + kt.run( [ "test_encoder_layer_forward", "test_encoder_layer_backward", "test_decoder_layer_forward", "test_decoder_layer_backward", - "test_decoder_layer_forward_inference", + # "test_decoder_layer_forward_inference", "test_embedding_layer_forward", "test_embedding_layer_backward", "test_cross_entropy_layer_forward", "test_cross_entropy_layer_backward", ] ) + + +if __name__ == "__main__": + ctx = mp.get_context("spawn") + for i in range(50): + p = ctx.Process(target=main, args=(i,)) + p.start() + p.join() From 52d75ab6692f38baf2f6023cdf92d9cea7494f69 Mon Sep 17 00:00:00 2001 
From: godweiyang Date: Thu, 22 Jul 2021 20:42:38 +0800 Subject: [PATCH 19/24] move layer creation o th begining --- tests/test_ls_ops.py | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/tests/test_ls_ops.py b/tests/test_ls_ops.py index fd8902cf..659bf735 100644 --- a/tests/test_ls_ops.py +++ b/tests/test_ls_ops.py @@ -17,13 +17,14 @@ kt = TestDecorator() -kt.dtypes = [torch.half] -config = None -custom_enc_layers, fairseq_enc_layers = None, None -custom_dec_layers, fairseq_dec_layers = None, None -custom_emb_layer, fairseq_emb_layer = None, None -custom_ce_layer, fairseq_ce_layer = None, None +config = kt.generate_config() +kt.dtypes = [torch.half if config.fp16 else torch.float] + +custom_enc_layers, fairseq_enc_layers = gen_enc_layer(config) +custom_dec_layers, fairseq_dec_layers = gen_dec_layer(config) +custom_emb_layer, fairseq_emb_layer = gen_emb_layer(config) +custom_ce_layer, fairseq_ce_layer = gen_ce_layer(config) @kt.case(rtol=1e-3, atol=1e-2) @@ -526,19 +527,6 @@ def baseline(): def main(epoch): print(">>>>>>>>>>>>>>>>>>>>>>Test epoch: {}>>>>>>>>>>>>>>>>>>>>>>".format(epoch)) - global config - global custom_enc_layers, fairseq_enc_layers - global custom_dec_layers, fairseq_dec_layers - global custom_emb_layer, fairseq_emb_layer - global custom_ce_layer, fairseq_ce_layer - - config = kt.generate_config(use_default=False) - print(config) - custom_enc_layers, fairseq_enc_layers = gen_enc_layer(config) - custom_dec_layers, fairseq_dec_layers = gen_dec_layer(config) - custom_emb_layer, fairseq_emb_layer = gen_emb_layer(config) - custom_ce_layer, fairseq_ce_layer = gen_ce_layer(config) - kt.run( [ "test_encoder_layer_forward", From 0823b4025e5a159aa435a1be5fa7889be0cc8b78 Mon Sep 17 00:00:00 2001 From: "weiyang.god" Date: Mon, 9 Aug 2021 16:06:42 +0800 Subject: [PATCH 20/24] modify inference unit test --- tests/test_ls_ops.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/test_ls_ops.py b/tests/test_ls_ops.py index 6f9db524..f4120f7a 100644 --- a/tests/test_ls_ops.py +++ b/tests/test_ls_ops.py @@ -428,16 +428,11 @@ def test_embedding_layer_backward(): padding_mask = kt.attn_mask(batch_size, seq_len, dtype=torch.int) input = kt.randint(config.padding_idx + 1, config.vocab_size, (batch_size, seq_len)) -<<<<<<< HEAD - input = input * (1 - padding_mask) + config.padding_idx * padding_mask - loss_data = torch.randn(1, dtype=kt.dtype).sum() -======= pad_left = random.choice([True, False]) if pad_left: input = input * padding_mask + config.padding_idx * (1 - padding_mask) else: input = input * (1 - padding_mask) + config.padding_idx * padding_mask ->>>>>>> master custom_emb_layer.zero_grad() custom_input = input.clone() @@ -543,7 +538,7 @@ def main(epoch): "test_encoder_layer_backward", "test_decoder_layer_forward", "test_decoder_layer_backward", - # "test_decoder_layer_forward_inference", + "test_decoder_layer_forward_inference", "test_embedding_layer_forward", "test_embedding_layer_backward", "test_cross_entropy_layer_forward", From e0962de3d8bbcacbf5dd45c746d8739123407a71 Mon Sep 17 00:00:00 2001 From: "weiyang.god" Date: Mon, 9 Aug 2021 16:14:23 +0800 Subject: [PATCH 21/24] modify inference unit test --- .../fairseq/fs_cli/lightseq_fairseq_train_cli.py | 14 -------------- tests/test_ls_ops.py | 2 -- 2 files changed, 16 deletions(-) diff --git a/examples/training/fairseq/fs_cli/lightseq_fairseq_train_cli.py b/examples/training/fairseq/fs_cli/lightseq_fairseq_train_cli.py index efe602e5..e69de29b 100644 --- 
a/examples/training/fairseq/fs_cli/lightseq_fairseq_train_cli.py +++ b/examples/training/fairseq/fs_cli/lightseq_fairseq_train_cli.py @@ -1,14 +0,0 @@ -import pathlib -import sys - -from fairseq_cli.train import cli_main - - -def ls_cli_main(*args, **kwargs): - user_path = pathlib.Path(__file__).parent.parent.joinpath("fs_modules") - sys.argv.extend(["--user-dir", str(user_path)]) - cli_main(*args, **kwargs) - - -if __name__ == "__main__": - ls_cli_main() diff --git a/tests/test_ls_ops.py b/tests/test_ls_ops.py index f4120f7a..b5c952bd 100644 --- a/tests/test_ls_ops.py +++ b/tests/test_ls_ops.py @@ -438,13 +438,11 @@ def test_embedding_layer_backward(): custom_input = input.clone() res = custom_emb_layer(custom_input) custom_loss = (res / 1000).sum() - custom_loss.data.copy_(loss_data) fairseq_emb_layer.zero_grad() fs_input = input.clone() res = fairseq_emb_layer(fs_input) fs_loss = (res / 1000).sum() - fs_loss.data.copy_(loss_data) def custom(): custom_emb_layer.zero_grad() From 51e77b355a2b8d7ae4e7ffaddf63e2238da2908b Mon Sep 17 00:00:00 2001 From: "weiyang.god" Date: Mon, 9 Aug 2021 16:27:04 +0800 Subject: [PATCH 22/24] modify unit test --- tests/test_ls_ops.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tests/test_ls_ops.py b/tests/test_ls_ops.py index b5c952bd..9c38a8e7 100644 --- a/tests/test_ls_ops.py +++ b/tests/test_ls_ops.py @@ -157,7 +157,7 @@ def test_decoder_layer_forward(): print( f"(batch_size, enc_seq_len, dec_seq_len, hidden_size): " - "({batch_size}, {enc_seq_len}, {dec_seq_len}, {hidden_size})" + f"({batch_size}, {enc_seq_len}, {dec_seq_len}, {hidden_size})" ) hidden_states = kt.rand((batch_size, dec_seq_len, hidden_size)) @@ -202,7 +202,8 @@ def test_decoder_layer_backward(): _, dec_seq_len = kt.bs_sl(batch_size) hidden_size = config.hidden_size print( - f"(batch_size, enc_seq_len, dec_seq_len, hidden_size):({batch_size}, {enc_seq_len}, {dec_seq_len}, {hidden_size})" + f"(batch_size, enc_seq_len, dec_seq_len, hidden_size): " + f"({batch_size}, {enc_seq_len}, {dec_seq_len}, {hidden_size})" ) shs = hidden_size * hidden_size @@ -339,7 +340,8 @@ def test_decoder_layer_forward_inference(): beam_size = random.randint(2, 5) hidden_size = config.hidden_size print( - f"(batch_size, enc_seq_len, beam_size, hidden_size): ({batch_size}, {enc_seq_len}, {beam_size}, {hidden_size})" + f"(batch_size, enc_seq_len, beam_size, hidden_size): " + f"({batch_size}, {enc_seq_len}, {beam_size}, {hidden_size})" ) ls_encoder_out = kt.rand((enc_seq_len, batch_size, hidden_size)) @@ -532,22 +534,22 @@ def main(epoch): print(">>>>>>>>>>>>>>>>>>>>>>Test epoch: {}>>>>>>>>>>>>>>>>>>>>>>".format(epoch)) kt.run( [ - "test_encoder_layer_forward", - "test_encoder_layer_backward", - "test_decoder_layer_forward", + # "test_encoder_layer_forward", + # "test_encoder_layer_backward", + # "test_decoder_layer_forward", "test_decoder_layer_backward", - "test_decoder_layer_forward_inference", - "test_embedding_layer_forward", - "test_embedding_layer_backward", - "test_cross_entropy_layer_forward", - "test_cross_entropy_layer_backward", + # "test_decoder_layer_forward_inference", + # "test_embedding_layer_forward", + # "test_embedding_layer_backward", + # "test_cross_entropy_layer_forward", + # "test_cross_entropy_layer_backward", ] ) if __name__ == "__main__": ctx = mp.get_context("spawn") - for i in range(50): + for i in range(1): p = ctx.Process(target=main, args=(i,)) p.start() p.join() From 3daaef797a59d982256e838f326a514852621e78 Mon Sep 17 00:00:00 2001 From: 
"weiyang.god" Date: Mon, 9 Aug 2021 16:32:01 +0800 Subject: [PATCH 23/24] add fairseq training cli --- .../fairseq/fs_cli/lightseq_fairseq_train_cli.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/examples/training/fairseq/fs_cli/lightseq_fairseq_train_cli.py b/examples/training/fairseq/fs_cli/lightseq_fairseq_train_cli.py index e69de29b..efe602e5 100644 --- a/examples/training/fairseq/fs_cli/lightseq_fairseq_train_cli.py +++ b/examples/training/fairseq/fs_cli/lightseq_fairseq_train_cli.py @@ -0,0 +1,14 @@ +import pathlib +import sys + +from fairseq_cli.train import cli_main + + +def ls_cli_main(*args, **kwargs): + user_path = pathlib.Path(__file__).parent.parent.joinpath("fs_modules") + sys.argv.extend(["--user-dir", str(user_path)]) + cli_main(*args, **kwargs) + + +if __name__ == "__main__": + ls_cli_main() From ab9dd43e7f45c32f7de1c582c41d588f8c16b8a2 Mon Sep 17 00:00:00 2001 From: "weiyang.god" Date: Mon, 9 Aug 2021 16:33:51 +0800 Subject: [PATCH 24/24] modify unit test --- tests/test_ls_ops.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/test_ls_ops.py b/tests/test_ls_ops.py index 9c38a8e7..20013b53 100644 --- a/tests/test_ls_ops.py +++ b/tests/test_ls_ops.py @@ -534,22 +534,22 @@ def main(epoch): print(">>>>>>>>>>>>>>>>>>>>>>Test epoch: {}>>>>>>>>>>>>>>>>>>>>>>".format(epoch)) kt.run( [ - # "test_encoder_layer_forward", - # "test_encoder_layer_backward", - # "test_decoder_layer_forward", + "test_encoder_layer_forward", + "test_encoder_layer_backward", + "test_decoder_layer_forward", "test_decoder_layer_backward", - # "test_decoder_layer_forward_inference", - # "test_embedding_layer_forward", - # "test_embedding_layer_backward", - # "test_cross_entropy_layer_forward", - # "test_cross_entropy_layer_backward", + "test_decoder_layer_forward_inference", + "test_embedding_layer_forward", + "test_embedding_layer_backward", + "test_cross_entropy_layer_forward", + "test_cross_entropy_layer_backward", ] ) if __name__ == "__main__": ctx = mp.get_context("spawn") - for i in range(1): + for i in range(50): p = ctx.Process(target=main, args=(i,)) p.start() p.join()