# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
from pytorch_pretrained_bert.optimization import BertAdam
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE

+ try:
+     from apex.optimizers import FP16_Optimizer
+     from apex.optimizers import FusedAdam
+     from apex.parallel import DistributedDataParallel as DDP
+ except ImportError:
+     raise ImportError("Please install apex from https://www.github.com/nvidia/apex to run this.")
+
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
@@ -295,34 +303,10 @@ def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)

- def copy_optimizer_params_to_model(named_params_model, named_params_optimizer):
-     """ Utility function for optimize_on_cpu and 16-bits training.
-         Copy the parameters optimized on CPU/RAM back to the model on GPU
-     """
-     for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
-         if name_opti != name_model:
-             logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
-             raise ValueError
-         param_model.data.copy_(param_opti.data)
-
- def set_optimizer_params_grad(named_params_optimizer, named_params_model, test_nan=False):
-     """ Utility function for optimize_on_cpu and 16-bits training.
-         Copy the gradient of the GPU parameters to the CPU/RAMM copy of the model
-     """
-     is_nan = False
-     for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model):
-         if name_opti != name_model:
-             logger.error("name_opti != name_model: {} {}".format(name_opti, name_model))
-             raise ValueError
-         if param_model.grad is not None:
-             if test_nan and torch.isnan(param_model.grad).sum() > 0:
-                 is_nan = True
-             if param_opti.grad is None:
-                 param_opti.grad = torch.nn.Parameter(param_opti.data.new().resize_(*param_opti.data.size()))
-             param_opti.grad.data.copy_(param_model.grad.data)
-         else:
-             param_opti.grad = None
-     return is_nan
+ def warmup_linear(x, warmup=0.002):
+     if x < warmup:
+         return x / warmup
+     return 1.0 - x

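For intuition, a quick sketch of how the new warmup_linear schedule behaves (not part of the commit; the base learning rate, step count, and warmup fraction below are illustrative values): the multiplier ramps up linearly over the first warmup fraction of training and then decays linearly toward zero.

learning_rate = 5e-5   # illustrative base learning rate
t_total = 1000         # illustrative total number of optimization steps
for global_step in (0, 50, 100, 500, 999):
    # uses warmup_linear exactly as defined above
    lr_this_step = learning_rate * warmup_linear(global_step / t_total, warmup=0.1)
    print(global_step, lr_this_step)
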
def main():
    parser = argparse.ArgumentParser()
@@ -403,17 +387,15 @@ def main():
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-     parser.add_argument('--optimize_on_cpu',
-                         default=False,
-                         action='store_true',
-                         help="Whether to perform optimization and keep the optimizer averages on CPU")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
-                         type=float, default=128,
-                         help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
+                         type=float, default=0,
+                         help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
+                              "0 (default value): dynamic loss scaling.\n"
+                              "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

@@ -433,13 +415,11 @@ def main():
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
+         torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
-         if args.fp16:
-             logger.info("16-bits training currently not supported in distributed training")
-             args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
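Distilled out of the hunk above, a standalone sketch of the per-process device setup (assuming the script is launched via torch.distributed.launch, which passes --local_rank to each worker; the snippet is illustrative, not the full script):

import argparse
import torch

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int, default=-1)
parser.add_argument("--no_cuda", action="store_true")
args = parser.parse_args()

if args.local_rank == -1 or args.no_cuda:
    # single-process mode: use all visible GPUs (or the CPU)
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
else:
    # one process per GPU: pin this process to its GPU before any CUDA work,
    # then join the NCCL process group used for gradient synchronization
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    n_gpu = 1
    torch.distributed.init_process_group(backend="nccl")
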
@@ -487,32 +467,35 @@ def main():
        model.half()
    model.to(device)
    if args.local_rank != -1:
-         model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                           output_device=args.local_rank)
+         model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
-     if args.fp16:
-         param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
-                             for n, param in model.named_parameters()]
-     elif args.optimize_on_cpu:
-         param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
-                             for n, param in model.named_parameters()]
-     else:
-         param_optimizer = list(model.named_parameters())
-     no_decay = ['bias', 'gamma', 'beta']
+     param_optimizer = list(model.named_parameters())
+     no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
-         {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
-         {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
+         {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+         {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
-     optimizer = BertAdam(optimizer_grouped_parameters,
-                          lr=args.learning_rate,
-                          warmup=args.warmup_proportion,
-                          t_total=t_total)
+     if args.fp16:
+         optimizer = FusedAdam(optimizer_grouped_parameters,
+                               lr=args.learning_rate,
+                               bias_correction=False,
+                               max_grad_norm=1.0)
+         if args.loss_scale == 0:
+             optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+         else:
+             optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+
+     else:
+         optimizer = BertAdam(optimizer_grouped_parameters,
+                              lr=args.learning_rate,
+                              warmup=args.warmup_proportion,
+                              t_total=t_total)

    global_step = 0
    if args.do_train:
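The regrouping above keys off BERT's parameter names ('bias', 'LayerNorm.bias', 'LayerNorm.weight') and switches the group key to 'weight_decay', which is what FusedAdam and BertAdam expect. A minimal sketch of the same pattern on a toy module (the Block class and the torch.optim.Adam stand-in are assumptions for illustration only):

import torch

class Block(torch.nn.Module):
    # toy module whose attribute names mimic BERT's (dense, LayerNorm)
    def __init__(self):
        super().__init__()
        self.dense = torch.nn.Linear(8, 8)
        self.LayerNorm = torch.nn.LayerNorm(8)

model = Block()
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
param_optimizer = list(model.named_parameters())
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},  # only dense.weight in this toy case
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},   # biases and LayerNorm parameters
]
optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=5e-5)  # per-group weight_decay works the same way
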
@@ -543,34 +526,24 @@ def main():
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
-                 if args.fp16 and args.loss_scale != 1.0:
-                     # rescale loss for fp16 training
-                     # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
-                     loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
-                 loss.backward()
+
+                 if args.fp16:
+                     optimizer.backward(loss)
+                 else:
+                     loss.backward()
+
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
-                     if args.fp16 or args.optimize_on_cpu:
-                         if args.fp16 and args.loss_scale != 1.0:
-                             # scale down gradients for fp16 training
-                             for param in model.parameters():
-                                 if param.grad is not None:
-                                     param.grad.data = param.grad.data / args.loss_scale
-                         is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
-                         if is_nan:
-                             logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
-                             args.loss_scale = args.loss_scale / 2
-                             model.zero_grad()
-                             continue
-                         optimizer.step()
-                         copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
-                     else:
-                         optimizer.step()
-                     model.zero_grad()
+                     # modify learning rate with special warm up BERT uses
+                     lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion)
+                     for param_group in optimizer.param_groups:
+                         param_group['lr'] = lr_this_step
+                     optimizer.step()
+                     optimizer.zero_grad()
                    global_step += 1

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
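Finally, a rough standalone sketch of the new fp16 path end to end (assumes a CUDA GPU and the apex revision imported at the top of this file; FP16_Optimizer is apex's wrapper that keeps fp32 master weights and applies loss scaling, and the toy model below is illustrative only):

import torch
from apex.optimizers import FP16_Optimizer, FusedAdam

model = torch.nn.Linear(16, 2).cuda().half()
optimizer = FusedAdam(model.parameters(), lr=5e-5,
                      bias_correction=False, max_grad_norm=1.0)
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)  # or static_loss_scale=128

inputs = torch.randn(4, 16).cuda().half()
labels = torch.randint(0, 2, (4,)).cuda()
loss = torch.nn.functional.cross_entropy(model(inputs), labels)

optimizer.backward(loss)  # scales the loss, then backpropagates (replaces loss.backward())
optimizer.step()          # unscales gradients and updates the fp32 master weights;
                          # with dynamic scaling, the scale adjusts when gradients overflow
optimizer.zero_grad()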