Commit 8900aa0
Author: server-jack

    remove compressed allreduce

1 parent: cfe1bab
1 file changed: 4 additions & 12 deletions

deepspeed/runtime/engine.py

@@ -26,7 +26,6 @@
 from deepspeed.runtime.config import DeepSpeedConfig, DEEPSPEED_OPTIMIZERS, \
     ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, ONEBIT_LAMB_OPTIMIZER, \
     TORCH_ADAM_PARAM, ADAM_W_MODE, ADAM_W_MODE_DEFAULT
-from deepspeed.runtime.comm import compressed_all_reduce

 from deepspeed.runtime.dataloader import DeepSpeedDataLoader
 from deepspeed.runtime.constants import \
@@ -139,7 +138,6 @@ def __init__(self,
         self.store_gradients = False
         self.store_gradients_cpu = False
         self.stored_gradients = None
-        self.bf16_compressed_allreduce = False  # hardcode for now - it's not really working

         if dist_init_required is None:
             dist_init_required = not dist.is_initialized()
@@ -1292,29 +1290,23 @@ def allreduce_bucket(self, bucket):

         tensor_to_allreduce = tensor

-        if self.allreduce_always_fp32() and not self.bf16_compressed_allreduce:
+        if self.allreduce_always_fp32():
             tensor_to_allreduce = tensor.float()

         if self.postscale_gradients():
             if self.gradient_predivide_factor() != 1.0:
                 tensor_to_allreduce.mul_(1. / self.gradient_predivide_factor())
-            if self.bf16_compressed_allreduce and self.precision() == torch.bfloat16:
-                compressed_all_reduce(tensor_to_allreduce, group=self.data_parallel_group)
-            else:
-                dist.all_reduce(tensor_to_allreduce, group=self.data_parallel_group)
+            dist.all_reduce(tensor_to_allreduce, group=self.data_parallel_group)

             if self.gradient_average:
                 if self.gradient_predivide_factor() != self.dp_world_size:
                     tensor_to_allreduce.mul_(self.gradient_predivide_factor() /
                                              self.dp_world_size)
         else:
             tensor_to_allreduce.div_(self.dp_world_size)
-            if self.bf16_compressed_allreduce and self.precision() == torch.bfloat16:
-                compressed_all_reduce(tensor_to_allreduce, group=self.data_parallel_group)
-            else:
-                dist.all_reduce(tensor_to_allreduce, group=self.data_parallel_group)
+            dist.all_reduce(tensor_to_allreduce, group=self.data_parallel_group)

-        if self.allreduce_always_fp32() and tensor is not tensor_to_allreduce and not self.bf16_compressed_allreduce:
+        if self.allreduce_always_fp32() and tensor is not tensor_to_allreduce:
             tensor.copy_(tensor_to_allreduce)

         return tensor
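
For context, here is a minimal, self-contained sketch of the plain allreduce path this commit leaves in place. It mirrors the postscale branch of allreduce_bucket: upcast to fp32 (as allreduce_always_fp32 would), pre-divide by a predivide factor, sum with dist.all_reduce, then rescale so the net effect is an average over the data-parallel world size. The function name averaged_all_reduce and the single-process gloo setup are illustrative assumptions, not DeepSpeed API.

import os
import torch
import torch.distributed as dist

def averaged_all_reduce(tensor, predivide_factor, world_size, group=None):
    # Upcast for the reduction, mirroring allreduce_always_fp32.
    reduced = tensor.float()
    # Pre-scale before the reduce to limit overflow in low precision.
    if predivide_factor != 1.0:
        reduced.mul_(1. / predivide_factor)
    # Sum across all ranks in the group.
    dist.all_reduce(reduced, group=group)
    # Finish the average: net scaling is 1 / world_size.
    if predivide_factor != world_size:
        reduced.mul_(predivide_factor / world_size)
    # Copy back into the original (possibly lower-precision) tensor.
    if tensor is not reduced:
        tensor.copy_(reduced)
    return tensor

if __name__ == "__main__":
    # Single-process gloo group, only so the sketch runs standalone.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=0, world_size=1)
    grads = torch.ones(4, dtype=torch.bfloat16)
    print(averaged_all_reduce(grads, 1.0, dist.get_world_size()))
    dist.destroy_process_group()

The prescale branch in the diff reaches the same average the other way around: it divides by dp_world_size before the reduce instead of rescaling afterwards.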
