Pytorch-Transformer
Model Parallelism using Transformers and PyTorch
-
Loading the data
-
Instantiate a model
-
Create torch Dataset and DataLoader
class myDataset(torch.utils.data.Dataset):
    ...

Split the data into train and val sets:
1 2from sklearn.model_selection import train_test_split df_train, df_val = train_test_split(imdb_df, test_size=0.3, random_state=2021create DataLoader for train set and val set:
-
Make a wrapper for the model:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27class MultiGPUClassifier(torch.nn.Module): def __init__(self, roberta_model): super(MultiGPUClassifier, self).__init__() # Embedding layer --> cuda:0 self.embedding = roberta_model.roberta.embeddings.to('cuda:0') # Encoder Layer --> cuda:1 self.encoder = roberta_model.roberta.encoder.to('cuda:1') # Classifier --> cuda:1 self.classifier = roberta_model.classifier.to('cuda:1') def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): # Pass the input_ids to cuda:0 since embedding layer in cuda:0 emb_out = self.embedding(input_ids.to('cuda:0')) # Move the outputs of embedding layer to cuda:1 as input to encoder layer enc_out = self.encoder(emb_out.to('cuda:1')) classifier_out = self.classifier(enc_out[0]) return classifier_out # Initialize the model multi_gpu_roberta = MultiGPUClassifier(roberta_model)Upon constructing the model, the memory usage can be seen using
nvidia-smi. -
Create optimizer and loss function for the model:
from transformers import get_linear_schedule_with_warmup, AdamW

# Training hyper-parameters.
EPOCHS = 2
LR = 1e-5

optimizer = AdamW(multi_gpu_roberta.parameters(), lr=LR)

# The scheduler steps once per batch, so the total step count is
# batches-per-epoch times the number of epochs.
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# The loss lives on cuda:1 so it matches the device of the classifier output.
loss_fn = torch.nn.CrossEntropyLoss().to('cuda:1')
Create a helper function for training the model and returning accuracy and losses:
def train_model(model, data_loader, loss_fn, optimizer, scheduler, n_examples):
    """Run one training epoch over ``data_loader``.

    Args:
        model: the (multi-GPU) classifier; called as model(input_ids=..., attention_mask=...).
        data_loader: yields dicts with 'input_ids', 'attention_mask', 'labels'.
        loss_fn: loss placed on cuda:1 (same device as the model outputs).
        optimizer: optimizer over the model parameters.
        scheduler: per-batch learning-rate scheduler.
        n_examples: total number of training examples (for the accuracy denominator).

    Returns:
        (accuracy, mean_loss) for the epoch.
    """
    model = model.train()
    # BUG FIX: this initialization was commented out in the original even
    # though losses.append() is called below (NameError at runtime).
    losses = []
    correct_predictions = 0
    for d in data_loader:  # take a batch
        input_ids = d['input_ids']
        attention_mask = d['attention_mask']
        # Reshape the mask to the extended (batch, 1, 1, seq_len) form the
        # encoder expects, since the wrapper bypasses the usual preprocessing.
        reshaped_attention_mask = attention_mask.reshape(
            attention_mask.shape[0], 1, 1, attention_mask.shape[1])
        targets = d['labels']
        outputs = model(input_ids=input_ids, attention_mask=reshaped_attention_mask)
        _, preds = torch.max(outputs, dim=1)
        # Move targets to cuda:1 to match the device of outputs/loss_fn.
        loss = loss_fn(outputs, targets.to('cuda:1'))
        # BUG FIX: the original wrote `correct_prediction` (undefined name).
        correct_predictions += torch.sum(preds == targets.to('cuda:1'))
        losses.append(loss.item())
        loss.backward()
        # Clip gradients to prevent exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()       # gradient descent
        scheduler.step()       # lr decay
        optimizer.zero_grad()
    return correct_predictions.double() / n_examples, np.mean(losses)
Create a helper function for evaluating the model:
def eval_model(model, data_loader, loss_fn, n_examples):
    """Evaluate the model over ``data_loader`` without gradient tracking.

    Args:
        model: the (multi-GPU) classifier.
        data_loader: yields dicts with 'input_ids', 'attention_mask', 'labels'.
        loss_fn: loss placed on cuda:1 (same device as the model outputs).
        n_examples: total number of evaluation examples.

    Returns:
        (accuracy, mean_loss) over the whole loader.
    """
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids']
            attention_mask = d['attention_mask']
            # BUG FIX: the original called `.reshaped(...)`, which is not a
            # tensor method (AttributeError); `.reshape(...)` is correct.
            reshaped_attention_mask = attention_mask.reshape(
                attention_mask.shape[0], 1, 1, attention_mask.shape[1])
            targets = d['labels']
            outputs = model(input_ids=input_ids, attention_mask=reshaped_attention_mask)
            _, preds = torch.max(outputs, dim=1)
            # Targets move to cuda:1 to match the device of the outputs.
            loss = loss_fn(outputs, targets.to('cuda:1'))
            correct_predictions += torch.sum(preds == targets.to('cuda:1'))
            losses.append(loss.item())
    return correct_predictions.double() / n_examples, np.mean(losses)
Create the training loop and only store the best one:
from collections import defaultdict

# Track per-epoch metrics; defaultdict(list) avoids pre-declaring each key.
history = defaultdict(list)
best_accuracy = 0

# NOTE: the original notebook used the `%%time` cell magic here; it is not
# valid outside IPython, so it is omitted in this plain-Python listing.
for epoch in range(EPOCHS):
    # BUG FIX: the original f-string was missing its closing quote.
    print(f'Epoch {epoch+1}/{EPOCHS}')
    print('-' * 10)
    train_acc, train_loss = train_model(
        multi_gpu_roberta, train_data_loader, loss_fn, optimizer, scheduler, len(df_train))
    print(f'Train Loss:{train_loss}; Train Accuracy: {train_acc}')
    val_acc, val_loss = eval_model(multi_gpu_roberta, val_data_loader, loss_fn, len(df_val))
    print(f'Val Loss: {val_loss}; Val Accuracy: {val_acc}')
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    # Checkpoint only the best model, judged by validation accuracy.
    if val_acc > best_accuracy:
        torch.save(multi_gpu_roberta.state_dict(), 'multi_gpu_roberta_best_model_state.bin')
        best_accuracy = val_acc
Visualizing model performance
Combining DDP with Model Parallelism
|
|
DDP wraps a multi-GPU model:
|
|
Apply Model Parallel to Existing Modules
SINGLE-MACHINE MODEL PARALLEL BEST PRACTICES ResNet50
nn.Sequential
GNT model.py
|
|
In a new file: model_parallel.py
|
|
In train.py:
|
|
Pytorch-DDP-RPC
Invited Talk: PyTorch Distributed (DDP, RPC) - By Facebook Research Scientist Shen Li (YouTube)
(DDG search: tensorflow model split distributed parallel)