
Finetune flag #540

Merged · 5 commits · Apr 20, 2019
README.md (5 changes: 2 additions & 3 deletions)
@@ -153,10 +153,10 @@ To see the full set of available params, see [config/defaults.conf](config/defau

We also include an experimental option to use a shared [Transformer](https://arxiv.org/abs/1706.03762) in place of the shared BiLSTM by setting ``sent_enc = transformer``. When using a Transformer, we use the [Noam learning rate scheduler](https://github.com/allenai/allennlp/blob/master/allennlp/training/learning_rate_schedulers.py#L84), as that seems important to training the Transformer thoroughly.

-We also support using pretrained Transformer language models. To use the OpenAI transformer model, set `openai_transformer = 1` and `openai_transformer_fine_tune = 1`.
+We also support using pretrained Transformer language models. To use the OpenAI transformer model, set `openai_transformer = 1`.
To use [BERT](https://arxiv.org/abs/1810.04805) architecture, set ``bert_model_name`` to one of the models listed [here](https://github.com/huggingface/pytorch-pretrained-BERT#loading-google-ai-or-openai-pre-trained-weigths-or-pytorch-dump), e.g. ``bert-base-cased``. You should also set ``tokenizer`` to be the BERT model used in order to ensure you are using the same tokenization and vocabulary.

-When using BERT, we follow the procedures set out in the original work as closely as possible: For pair sentence tasks, we concatenate the sentences with a special `[SEP]` token. Rather than max-pooling, we take the first representation of the sequence (corresponding to the special `[CLS]` token) as the representation of the entire sequence. To fine-tune BERT, set `bert_fine_tune = 1`.
+When using BERT, we follow the procedures set out in the original work as closely as possible: For pair sentence tasks, we concatenate the sentences with a special `[SEP]` token. Rather than max-pooling, we take the first representation of the sequence (corresponding to the special `[CLS]` token) as the representation of the entire sequence.
We also have support for the version of Adam that was used in training BERT (``optimizer = bert_adam``).
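Taken together, a BERT-based run combining the options described above might look roughly like the sketch below, written in the same .conf style as the files in this PR. The specific values (model name, embeddings mode) are illustrative choices, not ones mandated by this change.

```hocon
// Illustrative sketch only; values are examples, not project defaults.
bert_model_name = "bert-base-cased"  // which pretrained BERT to load
tokenizer = "bert-base-cased"        // match the BERT model so tokenization and vocab agree
bert_embeddings_mode = "top"         // use the top-layer activations
optimizer = "bert_adam"              // the Adam variant used to train BERT
```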

## Trainer
@@ -181,7 +181,6 @@ Note: if you want to train and evaluate on a task, that task must be in both ``p
We support two modes of adapting pretrained models to target tasks.
Setting `transfer_paradigm = finetune` will fine-tune the entire model while training for a target task.
This mode will create a copy of the model _per target task_.
-If using a pretrained model such as BERT or GPT, be sure to also set the corresponding fine-tune flag, e.g. `bert_fine_tune = 1`.
Setting `transfer_paradigm = frozen` will only train the target-task specific components while training for a target task.
If using ELMo and `sep_embs_for_skip = 1`, we will also learn a task-specific set of layer-mixing weights.
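After this PR, the choice between the two paradigms is a single setting that also controls whether downloaded encoder weights (ELMo/GPT/BERT) are updated; the per-model flags (`bert_fine_tune`, `openai_transformer_fine_tune`) go away. A minimal sketch of the two options:

```hocon
// Fine-tune the whole pretrained encoder, one model copy per target task.
transfer_paradigm = "finetune"

// Or: keep the encoder fixed and train only the target-task components.
// transfer_paradigm = "frozen"
```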

config/defaults.conf (19 changes: 4 additions & 15 deletions)
@@ -71,7 +71,10 @@ do_full_eval = 1 // Evaluate the model on the tasks on target_tasks.
// Related configuration
load_model = 1 // If true, restore from checkpoint when starting do_pretrain. No impact on do_target_task_training.
transfer_paradigm = "frozen" // How to do learning on top of the pretrained model
// this option also affects the updating of downloaded parameters for models like ELMo/GPT/BERT
+// Options: "frozen", "finetune"
+// "frozen" will train the downstream models on fixed representations from the encoder model
+// "finetune" will update the parameters of the encoder model as well as the downstream models
load_eval_checkpoint = none // If not "none", load the specified model_state checkpoint file when starting do_target_task_training.
allow_untrained_encoder_parameters = 0 // Set for experiments involving random untrained encoders only. Allows do_target_task_training
// and do_full_eval to proceed without pretraining.
@@ -203,13 +206,6 @@ openai_embeddings_mode = "none" // How to handle the embedding layer of the
// "only" returns only lexical layer
// "mix" uses ELMo-style scalar mixing (with
// learned weights) across all layers
-openai_transformer_fine_tune = 0 // If false, OpenAI Transformer weights will
-// be frozen during training. If enabled,
-// train according to global settings for
-// training a token embedder module.
-// Note that this might require a
-// high amount of GPU memory, so consider
-// lowering the batch size if set to true.
bert_model_name = "" // If nonempty, use this BERT model for representations.
// Available values are:
// bert-base-uncased, bert-large-cased, ...
@@ -224,14 +220,7 @@ bert_embeddings_mode = "none" // How to handle the embedding layer of the
// lexical layer
// "only" returns only lexical layer
// "mix" uses ELMo-style scalar mixing (with
-// learned weights) across all layers
-bert_fine_tune = 0 // If false, BERT transformer weights will
-// be frozen during training. If enabled,
-// train according to global settings for
-// training a token embedder module.
-// Note that this might require a
-// high amount of GPU memory, so consider
-// lowering the batch size if set to true.
+// learned weights) across all layers.

// Sentence Encoder //
sent_enc = rnn // Type of sentence encoder: 'bow', 'rnn' (for LSTM), or 'transformer'
config/final-bert.conf (1 change: 0 additions & 1 deletion)
@@ -12,7 +12,6 @@ bert_embeddings_mode = "top" // how to use the outputs of the BERT module
// set as "top", we use only the top-layer activation
// other options: "only" uses the lexical layer (first layer)
// "cat" uses lexical layer + top layer
-bert_fine_tune = 1
elmo = 0
elmo_chars_only = 0
pair_attn = 0 // shouldn't be needed but JIC
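For reference, the embeddings-mode values documented in the comments above ("top", "only", "cat", "mix") amount to different ways of combining per-layer activations. The toy selector below is our own illustration of those documented semantics, not code from the repo; the "none" mode is omitted because its behavior is not described in this diff.

```python
import torch

def combine_layers(layer_reps, mode, scalar_mix=None):
    """Toy selector for the *_embeddings_mode values documented above.
    layer_reps: list of [batch, seq_len, dim] tensors, lexical layer first."""
    if mode == "top":    # only the top-layer activation
        return layer_reps[-1]
    if mode == "only":   # only the lexical (first) layer
        return layer_reps[0]
    if mode == "cat":    # lexical layer + top layer
        return torch.cat([layer_reps[0], layer_reps[-1]], dim=-1)
    if mode == "mix":    # ELMo-style scalar mix with learned weights
        return scalar_mix(torch.stack(layer_reps))
    raise ValueError(f"Unhandled mode: {mode}")

# Example with random stand-in layer activations:
layers = [torch.randn(2, 5, 16) for _ in range(4)]
print(combine_layers(layers, "cat").shape)  # torch.Size([2, 5, 32])
```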
config/spring19_seminar/cola_gpt.conf (2 changes: 1 addition & 1 deletion)
@@ -15,7 +15,7 @@ elmo = 0
tokenizer = "OpenAI.BPE" // gpt must use this tokenizer
openai_transformer = 1
openai_embedding_mode = "none" // only use repr from the last layer
-openai_transformer_fine_tune = 1 // fine tune the transformers in gpt
+transfer_paradigm = "finetune" // fine tune the transformers in gpt
sent_enc = "null" // we feed the gpt repr into the mlp classifier, no more layer in between
sep_embs_for_skip = 1 // otherwise some assert will refuse to build the model
lr = 1e-5 // we need to keep lr small for fine-tuning gpt, especially so because our BS is smaller than that in gpt paper
src/bert/utils.py (4 changes: 2 additions & 2 deletions)
@@ -57,11 +57,11 @@ def __init__(self, args, cache_dir=None):

        # Set trainability of this module.
        for param in self.model.parameters():
-            param.requires_grad = bool(args.bert_fine_tune)
+            param.requires_grad = bool(args.transfer_paradigm == 'finetune')

        # Configure scalar mixing, ELMo-style.
        if self.embeddings_mode == "mix":
-            if not args.bert_fine_tune:
+            if args.transfer_paradigm == 'frozen':
                log.warning("NOTE: bert_embeddings_mode='mix', so scalar "
                            "mixing weights will be fine-tuned even if BERT "
                            "model is frozen.")
src/openai_transformer_lm/utils.py (4 changes: 2 additions & 2 deletions)
@@ -198,7 +198,7 @@ def __init__(self, args, n_special=3, n_ctx=512):

        # Set trainability of this module.
        for param in self.model.parameters():
-            param.requires_grad = bool(args.openai_transformer_fine_tune)
+            param.requires_grad = bool(args.transfer_paradigm == 'finetune')

        # Configure scalar mixing, ELMo-style.
        if args.openai_embeddings_mode == "mix":
@@ -209,7 +209,7 @@ def __init__(self, args, n_special=3, n_ctx=512):
("openai_embeddings_mode='mix' only supports a single set of "
"scalars (but if you need this feature, see the TODO in "
"the code!)")
if not args.openai_transformer_fine_tune:
if args.transfer_paradigm == 'frozen':
log.warning("NOTE: openai_embeddings_mode='mix', so scalar "
"mixing weights will be fine-tuned even if "
"transformer weights are frozen.")