@@ -183,7 +183,9 @@ def __init__(self, *super_args, **super_kwargs):
 
         if self.is_last_stage():
             self.loss_model = self.module.loss_fn
-
+
+        self.has_attention_mask = self.module.__class__.__name__ == 'GPT2ModelPipe'
+
         # Initialize pipeline communicators. Just send a 0.
         if is_even(self.stage_id):
             if not self.is_last_stage():
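The context lines here also show the deadlock-avoiding handshake that the "just send a 0" comment describes: even stages send before they receive, odd stages receive before they send, so every neighbouring pair has one sender and one receiver at any moment. A minimal sketch of that ordering, assuming blocking `p2p.send(tensor, stage)`/`p2p.recv(tensor, stage)` helpers as used in the diff (the function name and parameters below are illustrative, not part of this change):

    import torch
    from deepspeed.runtime.pipe import p2p  # assumed import path

    def init_pipe_comm(stage_id, num_stages, prev_stage, next_stage):
        dummy = torch.zeros(1)                 # "just send a 0"
        if stage_id % 2 == 0:
            if stage_id != num_stages - 1:
                p2p.send(dummy, next_stage)    # even stage: send first...
            if stage_id != 0:
                p2p.recv(dummy, prev_stage)    # ...then receive
        else:
            if stage_id != 0:
                p2p.recv(dummy, prev_stage)    # odd stage: receive first...
            if stage_id != num_stages - 1:
                p2p.send(dummy, next_stage)    # ...then send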
@@ -196,6 +198,10 @@ def __init__(self, *super_args, **super_kwargs):
             if not self.is_last_stage():
                 p2p.send(self.loss, self.next_stage)
 
+    def set_has_attention_mask(self, value):
+        assert isinstance(value, bool)
+        self.has_attention_mask = value
+
     def _build_data_iter(self, dataset):
         sampler = torch.utils.data.distributed.DistributedSampler(
             dataset,
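With the new setter, a pipeline module that is not named `GPT2ModelPipe` but still passes a boolean attention mask as the last element of its activation tuple can opt in explicitly instead of relying on the class-name check in `__init__`. A hypothetical usage sketch; `MyMaskedPipe`, `ds_config`, and the exact `deepspeed.initialize` arguments are placeholders, not part of this change:

    import deepspeed

    model = MyMaskedPipe(num_layers=24)    # hypothetical PipelineModule subclass
    engine, _, _, _ = deepspeed.initialize(model=model,
                                           config=ds_config,
                                           model_parameters=model.parameters())
    engine.set_has_attention_mask(True)    # enable the mask cast/restore path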
@@ -919,7 +925,7 @@ def _exec_send_activations(self, buffer_id):
         # NCCL does not like to send torch.BoolTensor types, so cast the mask to half().
         # We could do char, but with half() we can eventually flatten with other fp16
         # messages (TODO)
-        if self.module.__class__.__name__ == 'GPT2ModelPipe':
+        if self.has_attention_mask:
             outputs = list(outputs)
             outputs[-1] = outputs[-1].half()
             outputs = tuple(outputs)
@@ -938,7 +944,7 @@ def _exec_send_activations(self, buffer_id):
                                       f'{type(outputs)}')
 
         # Restore the boolean tensor
-        if self.module.__class__.__name__ == 'GPT2ModelPipe':
+        if self.has_attention_mask:
             outputs = list(outputs)
             outputs[-1] = outputs[-1].bool()
             outputs = tuple(outputs)
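The cast-and-restore pair above works because a boolean mask only holds 0/1 values, and those round-trip exactly through fp16. A standalone sketch of the trick with plain torch, no pipeline engine needed:

    import torch

    mask = torch.tensor([True, False, True])   # what the model produces
    wire = mask.half()                         # what NCCL actually sends
    restored = wire.bool()                     # what the receiver works with
    assert torch.equal(mask, restored)         # 0.0 and 1.0 are exact in fp16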
@@ -968,7 +974,7 @@ def _exec_send_grads(self, buffer_id):
         # a grad that needs to be communicated. We free the buffer immediately
         # after, so no need to restore it. The receiver also has a hack that skips
         # the recv. This is because NCCL does not let us send torch.BoolTensor :-(.
-        if self.module.__class__.__name__ == 'GPT2ModelPipe':
+        if self.has_attention_mask:
             inputs = list(inputs)
             inputs.pop()
             inputs = tuple(inputs)
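Dropping the mask here is safe because a boolean tensor can never require grad, so there is nothing for the previous stage to accumulate. A minimal sketch of the pop, with an illustrative activation tuple:

    import torch

    acts = torch.randn(4, 8, requires_grad=True)
    mask = torch.zeros(4, 8, dtype=torch.bool)  # bool tensors cannot require
                                                # grad; only float tensors can
    inputs = (acts, mask)

    send_list = list(inputs)
    send_list.pop()              # strip the mask before communicating grads
    inputs = tuple(send_list)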
@@ -1030,7 +1036,7 @@ def _exec_recv_activations(self, buffer_id):
 
         # NCCL does not like to send torch.BoolTensor types, so un-cast the
         # attention mask
-        if self.module.__class__.__name__ == 'GPT2ModelPipe':
+        if self.has_attention_mask:
             recvd[-1] = recvd[-1].bool()
 
         recvd = tuple(recvd)