
Commit 91aab2a

Merge pull request #116 from FDecaYed/deyuf/fp16_with_apex
Change to use apex for better fp16 and multi-gpu support
2 parents 32a227f + 3b0a14b commit 91aab2a
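At a high level, the commit removes the hand-rolled fp16 and `--optimize_on_cpu` machinery from `run_classifier.py` and delegates mixed precision and multi-GPU handling to NVIDIA apex: `FusedAdam` wrapped in `FP16_Optimizer` replaces the CPU master-weight bookkeeping, apex's `DistributedDataParallel` replaces `torch.nn.parallel.DistributedDataParallel`, and BERT's warmup schedule is applied to the learning rate by hand each step. The sketch below condenses the resulting fp16 path onto a toy model; it assumes a CUDA GPU and an apex build contemporary with this commit that still ships `FP16_Optimizer`, and the toy model, data, and learning rate are placeholders rather than part of the diff.

```python
# Hedged sketch of the apex-based fp16 path introduced by this commit,
# shrunk onto a toy classifier. Requires a CUDA GPU and an apex version
# that still provides apex.optimizers.FP16_Optimizer / FusedAdam.
import torch
from apex.optimizers import FP16_Optimizer, FusedAdam

model = torch.nn.Linear(16, 2).cuda().half()          # stand-in for the BERT classifier
optimizer = FP16_Optimizer(FusedAdam(model.parameters(), lr=5e-5),
                           dynamic_loss_scale=True)   # default --loss_scale 0 behaviour

inputs = torch.randn(4, 16).cuda().half()
labels = torch.randint(0, 2, (4,)).cuda()
loss = torch.nn.functional.cross_entropy(model(inputs).float(), labels)

optimizer.backward(loss)   # replaces loss.backward(); applies/adjusts the loss scale
optimizer.step()           # updates fp32 master weights, copies back to fp16 params
optimizer.zero_grad()
```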

File tree

6 files changed (+155, -187 lines)


README.md

Lines changed: 1 addition & 1 deletion
@@ -338,7 +338,7 @@ The optimizer accepts the following arguments:
 - `b1` : Adams b1. Default : `0.9`
 - `b2` : Adams b2. Default : `0.999`
 - `e` : Adams epsilon. Default : `1e-6`
-- `weight_decay_rate:` Weight decay. Default : `0.01`
+- `weight_decay:` Weight decay. Default : `0.01`
 - `max_grad_norm` : Maximum norm for the gradients (`-1` means no clipping). Default : `1.0`

 ## Examples
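The single README edit tracks a rename in the optimizer itself: the `weight_decay_rate` argument of `BertAdam` becomes `weight_decay`, and the same key is used in the per-parameter groups that `run_classifier.py` builds below. A minimal sketch of the new spelling, with a toy model and an assumed step count standing in for a real training setup:

```python
# Sketch only: `weight_decay` (not `weight_decay_rate`) is now the key BertAdam
# expects, both as a constructor argument and inside parameter groups.
import torch
from pytorch_pretrained_bert.optimization import BertAdam

model = torch.nn.Linear(768, 2)      # placeholder for a BERT-based classifier
num_train_steps = 1000               # assumed total number of optimizer steps

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=5e-5, warmup=0.1, t_total=num_train_steps)
```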

examples/run_classifier.py

Lines changed: 49 additions & 76 deletions
@@ -1,5 +1,6 @@
 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -35,6 +36,13 @@
 from pytorch_pretrained_bert.optimization import BertAdam
 from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE

+try:
+    from apex.optimizers import FP16_Optimizer
+    from apex.optimizers import FusedAdam
+    from apex.parallel import DistributedDataParallel as DDP
+except ImportError:
+    raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this.")
+
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
                     level = logging.INFO)
@@ -295,34 +303,10 @@ def accuracy(out, labels):
     outputs = np.argmax(out, axis=1)
     return np.sum(outputs == labels)

-def copy_optimizer_params_to_model(named_params_model, named_params_optimizer):
-    """ Utility function for optimize_on_cpu and 16-bits training.
-        Copy the parameters optimized on CPU/RAM back to the model on GPU
-    """
-    for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
-        if name_opti != name_model:
-            logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
-            raise ValueError
-        param_model.data.copy_(param_opti.data)
-
-def set_optimizer_params_grad(named_params_optimizer, named_params_model, test_nan=False):
-    """ Utility function for optimize_on_cpu and 16-bits training.
-        Copy the gradient of the GPU parameters to the CPU/RAMM copy of the model
-    """
-    is_nan = False
-    for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
-        if name_opti != name_model:
-            logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
-            raise ValueError
-        if param_model.grad is not None:
-            if test_nan and torch.isnan(param_model.grad).sum() > 0:
-                is_nan = True
-            if param_opti.grad is None:
-                param_opti.grad = torch.nn.Parameter(param_opti.data.new().resize_(*param_opti.data.size()))
-            param_opti.grad.data.copy_(param_model.grad.data)
-        else:
-            param_opti.grad = None
-    return is_nan
+def warmup_linear(x, warmup=0.002):
+    if x < warmup:
+        return x/warmup
+    return 1.0 - x

 def main():
     parser = argparse.ArgumentParser()
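The CPU-copy helpers are gone; in their place, `warmup_linear` gives the learning-rate multiplier that the training loop below applies by hand: a linear ramp from 0 toward 1 over the first `warmup` fraction of training, then a linear decay (`1 - x`) down to 0 at the end. A pure-Python illustration of the multiplier; the 0.1 warmup proportion matches the script's default `--warmup_proportion`:

```python
# The schedule as added in this commit, plus a few sample values.
def warmup_linear(x, warmup=0.002):
    # x = global_step / t_total, the fraction of training completed
    if x < warmup:
        return x / warmup       # linear ramp: 0 -> 1 over the warmup fraction
    return 1.0 - x              # linear decay down to 0 at the end of training

for frac in (0.0, 0.05, 0.1, 0.5, 1.0):
    print(frac, warmup_linear(frac, warmup=0.1))
# prints multipliers of roughly 0.0, 0.5, 0.9, 0.5 and 0.0 -- note the schedule
# drops from ~1.0 to 1 - warmup once the ramp ends, since the decay is 1 - x.
```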
@@ -403,17 +387,15 @@ def main():
                         type=int,
                         default=1,
                         help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument('--optimize_on_cpu',
-                        default=False,
-                        action='store_true',
-                        help="Whether to perform optimization and keep the optimizer averages on CPU")
     parser.add_argument('--fp16',
                         default=False,
                         action='store_true',
                         help="Whether to use 16-bit float precision instead of 32-bit")
     parser.add_argument('--loss_scale',
-                        type=float, default=128,
-                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
+                        type=float, default=0,
+                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
+                             "0 (default value): dynamic loss scaling.\n"
+                             "Positive power of 2: static loss scaling value.\n")

     args = parser.parse_args()
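`--optimize_on_cpu` disappears, and `--loss_scale` changes meaning: the loop no longer multiplies the loss by the flag itself; the value is handed to apex's `FP16_Optimizer` further down, with 0 selecting dynamic loss scaling and any positive power of 2 a static scale. The point of scaling at all is that fp16 gradients underflow easily. The sketch below is plain PyTorch on CPU, independent of apex and of this script, shown only to illustrate the numerics: scale the loss before `backward()`, then divide the scale back out in fp32, the way a master-weight copy would.

```python
# Why loss scaling matters: a gradient of 1e-8 is below fp16's smallest
# representable magnitude, so it vanishes unless the loss is scaled up first
# and the scale divided out again in fp32. Illustrative only; requires a
# PyTorch build with half-precision autograd on CPU.
import torch

def tiny_grad(scale):
    w = torch.ones(1, dtype=torch.float16, requires_grad=True)
    a = torch.tensor([1e-4], dtype=torch.float16)
    b = torch.tensor([1e-4], dtype=torch.float16)
    loss = (w * a * b).sum()          # d(loss)/dw = a * b = 1e-8
    (loss * scale).backward()         # scale the loss before backward
    return w.grad.float() / scale     # unscale in fp32, as a master copy would

print(tiny_grad(1.0))      # tensor([0.])  -- the gradient underflowed in fp16
print(tiny_grad(1024.0))   # roughly 1e-08 -- survives thanks to the scaling
```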

@@ -433,13 +415,11 @@ def main():
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
         n_gpu = torch.cuda.device_count()
     else:
+        torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
         n_gpu = 1
         # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
         torch.distributed.init_process_group(backend='nccl')
-        if args.fp16:
-            logger.info("16-bits training currently not supported in distributed training")
-            args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496)
     logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

     if args.gradient_accumulation_steps < 1:
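Two things change here: the process now pins its GPU with `torch.cuda.set_device(args.local_rank)` before creating the NCCL process group, and the old guard that force-disabled fp16 under distributed training is removed, since apex's `FP16_Optimizer` and `DistributedDataParallel` work together. For context, a hedged sketch of how such a script is typically launched and initialized, one process per GPU; the launcher command and flag values are illustrative, not taken from this repository's docs.

```python
# Typical launch (one process per GPU); torch.distributed.launch injects
# --local_rank into each process's argument list, e.g.:
#
#   python -m torch.distributed.launch --nproc_per_node=4 run_classifier.py \
#       --fp16 --task_name MRPC ...   # remaining flags omitted
#
# Per-process setup mirroring the hunk above:
import argparse
import torch

parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', type=int, default=-1)
args, _ = parser.parse_known_args()

if args.local_rank == -1:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
else:
    torch.cuda.set_device(args.local_rank)        # pin this process to its GPU
    device = torch.device("cuda", args.local_rank)
    # env:// rendezvous: MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE come from the launcher
    torch.distributed.init_process_group(backend='nccl')
```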
@@ -487,32 +467,35 @@ def main():
         model.half()
     model.to(device)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank)
+        model = DDP(model)
     elif n_gpu > 1:
         model = torch.nn.DataParallel(model)

     # Prepare optimizer
-    if args.fp16:
-        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
-                            for n, param in model.named_parameters()]
-    elif args.optimize_on_cpu:
-        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
-                            for n, param in model.named_parameters()]
-    else:
-        param_optimizer = list(model.named_parameters())
-    no_decay = ['bias', 'gamma', 'beta']
+    param_optimizer = list(model.named_parameters())
+    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
     optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
+        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
         ]
     t_total = num_train_steps
     if args.local_rank != -1:
         t_total = t_total // torch.distributed.get_world_size()
-    optimizer = BertAdam(optimizer_grouped_parameters,
-                         lr=args.learning_rate,
-                         warmup=args.warmup_proportion,
-                         t_total=t_total)
+    if args.fp16:
+        optimizer = FusedAdam(optimizer_grouped_parameters,
+                              lr=args.learning_rate,
+                              bias_correction=False,
+                              max_grad_norm=1.0)
+        if args.loss_scale == 0:
+            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+        else:
+            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+
+    else:
+        optimizer = BertAdam(optimizer_grouped_parameters,
+                             lr=args.learning_rate,
+                             warmup=args.warmup_proportion,
+                             t_total=t_total)

     global_step = 0
     if args.do_train:
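The optimizer path is now split on `--fp16`: `FusedAdam` wrapped in `FP16_Optimizer` for mixed precision, `BertAdam` otherwise; `bias_correction=False` mirrors BertAdam, which does not apply Adam bias correction, and the warmup is applied externally in the training loop. Note also that the no-decay filter now matches the model's `LayerNorm.weight`/`LayerNorm.bias` parameter names instead of the old `gamma`/`beta`. The toy module below is an assumption for illustration only, with an attribute deliberately named `LayerNorm` as in the BERT modeling code, to show which parameters land in which group.

```python
# Which parameters skip weight decay after this change. Toy module for
# illustration; the attribute is named `LayerNorm` on purpose, matching how
# the BERT modeling code names its layer-norm submodules.
import torch

class Toy(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = torch.nn.Linear(8, 8)
        self.LayerNorm = torch.nn.LayerNorm(8)

model = Toy()
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
decayed = [n for n, p in model.named_parameters()
           if not any(nd in n for nd in no_decay)]
not_decayed = [n for n, p in model.named_parameters()
               if any(nd in n for nd in no_decay)]
print(decayed)      # ['dense.weight']  -> weight_decay 0.01
print(not_decayed)  # ['dense.bias', 'LayerNorm.weight', 'LayerNorm.bias'] -> 0.0
```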
@@ -543,34 +526,24 @@ def main():
                 loss = model(input_ids, segment_ids, input_mask, label_ids)
                 if n_gpu > 1:
                     loss = loss.mean() # mean() to average on multi-gpu.
-                if args.fp16 and args.loss_scale != 1.0:
-                    # rescale loss for fp16 training
-                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
-                    loss = loss * args.loss_scale
                 if args.gradient_accumulation_steps > 1:
                     loss = loss / args.gradient_accumulation_steps
-                loss.backward()
+
+                if args.fp16:
+                    optimizer.backward(loss)
+                else:
+                    loss.backward()
+
                 tr_loss += loss.item()
                 nb_tr_examples += input_ids.size(0)
                 nb_tr_steps += 1
                 if (step + 1) % args.gradient_accumulation_steps == 0:
-                    if args.fp16 or args.optimize_on_cpu:
-                        if args.fp16 and args.loss_scale != 1.0:
-                            # scale down gradients for fp16 training
-                            for param in model.parameters():
-                                if param.grad is not None:
-                                    param.grad.data = param.grad.data / args.loss_scale
-                        is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
-                        if is_nan:
-                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
-                            args.loss_scale = args.loss_scale / 2
-                            model.zero_grad()
-                            continue
-                        optimizer.step()
-                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
-                    else:
-                        optimizer.step()
-                    model.zero_grad()
+                    # modify learning rate with special warm up BERT uses
+                    lr_this_step = args.learning_rate * warmup_linear(global_step/t_total, args.warmup_proportion)
+                    for param_group in optimizer.param_groups:
+                        param_group['lr'] = lr_this_step
+                    optimizer.step()
+                    optimizer.zero_grad()
                     global_step += 1

     if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
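With apex in charge of scaling, the loop shrinks: `optimizer.backward(loss)` lets `FP16_Optimizer` scale the loss (and, under dynamic scaling, back off and skip a step when gradients overflow), the warmup multiplier is written into every parameter group just before `optimizer.step()`, and `optimizer.zero_grad()` replaces `model.zero_grad()`. A condensed, hedged sketch of one update, assuming `model`, `optimizer`, `warmup_linear`, `args`, and `t_total` are defined as in the diff above; multi-GPU loss averaging and the accumulation gate are left out for brevity.

```python
# Hedged sketch of a single optimization step after this commit (not a
# verbatim excerpt from the script).
def training_step(model, optimizer, batch, args, global_step, t_total):
    input_ids, input_mask, segment_ids, label_ids = batch
    loss = model(input_ids, segment_ids, input_mask, label_ids)
    if args.gradient_accumulation_steps > 1:
        loss = loss / args.gradient_accumulation_steps

    if args.fp16:
        optimizer.backward(loss)      # FP16_Optimizer applies the loss scale
    else:
        loss.backward()

    # BERT's warmup schedule, applied by hand (FusedAdam does not schedule lr itself)
    lr_this_step = args.learning_rate * warmup_linear(global_step / t_total,
                                                      args.warmup_proportion)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr_this_step

    optimizer.step()
    optimizer.zero_grad()
    return global_step + 1
```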
