Adding Masked Language Modelling #1030

Merged
merged 154 commits on Apr 10, 2020
Changes from 136 commits

Commits (154 commits)
430f942
misc run scripts
phu-pmh Oct 30, 2019
39603c3
sbatch
phu-pmh Oct 31, 2019
9b324f9
sweep scripts
phu-pmh Nov 4, 2019
d3cc769
Merge branch 'master' of https://github.com/nyu-mll/jiant
phu-pmh Nov 5, 2019
00bc40c
Merge branch 'master' of https://github.com/nyu-mll/jiant
phu-pmh Nov 9, 2019
4e297b1
update
phu-pmh Nov 9, 2019
b75d0f5
qa
phu-pmh Nov 10, 2019
1aadf48
update
phu-pmh Nov 10, 2019
8993b9e
Merge branch 'master' of https://github.com/nyu-mll/jiant
phu-pmh Nov 10, 2019
a3f10e2
update
phu-pmh Nov 13, 2019
aa0d8b4
update
phu-pmh Nov 13, 2019
275d7a3
Merge branch 'master' of https://github.com/nyu-mll/jiant
phu-pmh Nov 13, 2019
4b6b939
Merge branch 'master' of https://github.com/nyu-mll/jiant
phu-pmh Nov 16, 2019
7252ea5
update
phu-pmh Nov 16, 2019
f0d9c56
update
phu-pmh Nov 20, 2019
00223c6
Merge branch 'master' of https://github.com/nyu-mll/jiant
phu-pmh Nov 27, 2019
b0a8ec3
sb file
phu-pmh Dec 12, 2019
c4d2601
moving update_metrics to outside scope of dataparallel
Jan 14, 2020
acb9d24
fixing micro_avg calculation
Jan 16, 2020
8bdec95
undo debugging
Jan 16, 2020
0d879b1
Merge branch 'master' of https://github.com/nyu-mll/jiant
phu-pmh Jan 17, 2020
4f0a169
Merge branch 'master' into fix_dataparallel_metric_calculation
Jan 17, 2020
5bb8389
Fixing tests, moving update_metrics out of other tasks
Jan 17, 2020
fb59ecc
Merge branch 'master' of https://github.com/nyu-mll/jiant into fix_da…
Jan 17, 2020
04dbbda
Merge branch 'fix_dataparallel_metric_calculation' of https://github.…
Jan 17, 2020
3ddf564
remove extraneous change
Jan 17, 2020
e588909
MLM task
phu-pmh Jan 21, 2020
dfa9fd9
Added MLM task
phu-pmh Jan 21, 2020
46182a9
update
phu-pmh Jan 24, 2020
607bcd2
Merge branch 'MLM' of https://github.com/nyu-mll/jiant into MLM
phu-pmh Jan 24, 2020
d1daf23
fix multiple choice dataparallel forward
Jan 25, 2020
9539302
Merge branch 'master' into fix_dataparallel_metric_calculation
Jan 25, 2020
fc5f026
update
phu-pmh Jan 27, 2020
ce7f5c2
add _mask_id to transformers
HaokunLiu Jan 28, 2020
ffc7354
Update
phu-pmh Jan 30, 2020
c50d75b
Merge branch 'master' of https://github.com/nyu-mll/jiant into MLM
phu-pmh Jan 30, 2020
9649224
Merge branch 'master' into fix_dataparallel_metric_calculation
Jan 30, 2020
69a9364
MLM update
phu-pmh Jan 30, 2020
697d62c
Merge branch 'add-_mask_id-to-transformers' into MLM
HaokunLiu Jan 30, 2020
a4666da
adding update_metrics abstraction
Jan 30, 2020
fa13f6f
delete update_metrics_ notation
Jan 30, 2020
6b61e8b
fixed wrong index problem
phu-pmh Jan 30, 2020
3e10e3b
Merge branch 'MLM' of https://github.com/nyu-mll/jiant into MLM
phu-pmh Jan 30, 2020
afc0938
removed unrelated files
phu-pmh Jan 31, 2020
dcff7e7
removed unrelated files
phu-pmh Jan 31, 2020
1c1e6fb
removed unrelated files
phu-pmh Jan 31, 2020
f25ee99
fix PEP8
phu-pmh Jan 31, 2020
3f35212
Fixed get_pretained_lm_head for BERT and ALBERT
phu-pmh Jan 31, 2020
fc85270
spelling check
Feb 1, 2020
321bda8
black formatting
Feb 1, 2020
ae92b78
fixing tests
Feb 2, 2020
4f36878
bug fix
phu-pmh Feb 3, 2020
0467871
Adding batch_size constraints to multi-GPU setting
Feb 5, 2020
e3c5c79
adding documentation
Feb 5, 2020
6e96fd0
adding batch size test
Feb 5, 2020
845bf4f
Merge branch 'master' of https://github.com/nyu-mll/jiant into fix_da…
Feb 5, 2020
b41c268
black correct version
Feb 5, 2020
6f82412
Fixing batch size assertion
Feb 5, 2020
c749ea7
generalize batch size assertion for more than 2 GPU setting
Feb 5, 2020
73222a5
reducing label loops in code
Feb 6, 2020
fe39525
fixing span forward
Feb 8, 2020
745836d
Fixing span prediction forward for multi-GPU
invalid-email-address Feb 8, 2020
14caaab
fix commonsenseQA forward
invalid-email-address Feb 8, 2020
4271a7a
Merge branch 'master' of https://github.com/nyu-mll/jiant into MLM
phu-pmh Feb 10, 2020
918c0df
MLM
phu-pmh Feb 10, 2020
5ed0691
adding function documentation
Feb 11, 2020
ffac8bf
Merge branch 'master' into fix_dataparallel_metric_calculation
Feb 11, 2020
fe86d96
resolving nits, fixing seq_gen forward
Feb 11, 2020
eee439f
Merge branch 'fix_dataparallel_metric_calculation' of https://github.…
Feb 11, 2020
b61fa7c
remove nit
Feb 11, 2020
55312e8
fixing batch_size assert and SpanPrediction task
Feb 12, 2020
7d165cf
Remove debugging
Feb 12, 2020
52f66c7
Fix batch size mismatch multi-GPU test
Feb 12, 2020
a0220f8
Fix order of assert checking for batch size mismatch
Feb 12, 2020
fe89674
mlm training
phu-pmh Feb 12, 2020
2218e5b
update
phu-pmh Feb 14, 2020
cd75715
Merge branch 'fix_dataparallel_metric_calculation' of https://github.…
phu-pmh Feb 14, 2020
58b2914
sbatch
phu-pmh Feb 16, 2020
052b1c0
update
phu-pmh Feb 17, 2020
b26927a
data parallel
phu-pmh Feb 17, 2020
cd4b5a6
update data parallel stuffs
phu-pmh Feb 19, 2020
0d6d691
update MLM
phu-pmh Feb 20, 2020
b3617fa
using sequencelabel, using 1 paragraph per example
Feb 23, 2020
0af6476
update label mapping
phu-pmh Feb 24, 2020
e9f863c
adding exmaples-porportion-mixing
Feb 24, 2020
89e44c5
changing dataloader to work with wikitext103
Feb 24, 2020
0752771
weight sampling
Feb 24, 2020
5482ac2
add early stopping only onb one task
Mar 5, 2020
6d85b27
commit
phu-pmh Mar 6, 2020
d67e195
Merge branch 'MLM' of https://github.com/nyu-mll/jiant into MLM
phu-pmh Mar 6, 2020
921e717
Merge branch 'master' of https://github.com/nyu-mll/jiant into MLM
Mar 8, 2020
05d5750
Cleaning up code
Mar 8, 2020
ddcd357
Removing unecessarily tracked git folders
Mar 8, 2020
9e4e3a7
Removing unnecesary changes
Mar 8, 2020
b9b5f57
revert README
Mar 8, 2020
6b4c9d5
revert README.md again
Mar 8, 2020
35130ca
Making more general for Transformer-based embedders
Mar 8, 2020
20de779
torch.uint8 -> torch.bool
Mar 8, 2020
4020c81
Merge branch 'MLM' of https://github.com/nyu-mll/jiant into MLM
Mar 8, 2020
09f5903
Fixing indexing issues
Mar 8, 2020
4f45826
get rid of unecessary changes
Mar 8, 2020
8ac8c70
black cleanup
Mar 8, 2020
6cee66e
update
phu-pmh Mar 8, 2020
3709696
Prevent updating update_metrics twice in one step
Mar 10, 2020
3fb4e3e
ALBERT SOP update
phu-pmh Mar 16, 2020
a56b7c7
update
phu-pmh Mar 18, 2020
b84da1d
update
phu-pmh Mar 18, 2020
2a19c2c
update
phu-pmh Mar 18, 2020
e7acb76
add base_roberta
Mar 20, 2020
b1ac702
update
phu-pmh Mar 20, 2020
c5fddf0
reverting CCG edit added for debugging
phu-pmh Mar 20, 2020
9774b61
refactor defaults.conf
phu-pmh Mar 20, 2020
194c2d4
Merge branch 'MLM' of https://github.com/nyu-mll/jiant into MLM
Mar 22, 2020
4be35b3
Merge branch 'MLM' of https://github.com/nyu-mll/jiant into MLM
Mar 22, 2020
429be9a
black formatting
Mar 22, 2020
a9555b1
merge
Mar 22, 2020
13002f6
removed SOP task and mlm_manual_scaling
phu-pmh Mar 22, 2020
9e6bc5d
Fixing label namespace vocabulary creation, mergeing from master
Mar 22, 2020
a0aad25
Merge branch 'MLM' of https://github.com/nyu-mll/jiant into MLM
Mar 22, 2020
85db63e
Deleting MLM weight
Mar 22, 2020
4536433
Merge branch 'master' into MLM
Mar 22, 2020
85b081b
black formatting
Mar 22, 2020
eabe292
Merge branch 'MLM' of https://github.com/nyu-mll/jiant into MLM
Mar 22, 2020
09caf0f
Adding early_stopping_method to defaults.conf
Mar 22, 2020
a7f8f16
Fixing MLM with preprocessed wikitext103
Mar 24, 2020
94c32ae
Deleting intermediate class hierarchy for MLM
Mar 24, 2020
74d474b
Merge branch 'MLM' of https://github.com/nyu-mll/jiant into MLM
Mar 24, 2020
1d20684
Correcting black
Mar 24, 2020
12c0da1
LanguageModelingTask -> AutoregressiveModelingTask
Mar 24, 2020
960cf63
code style
Mar 24, 2020
f0d3b6d
fixing MaskedLanguageModelTask
Mar 25, 2020
cd2042e
Fixing typo
Mar 25, 2020
cf7612a
Fixing label namespace
Mar 27, 2020
f21165c
extracting out masking portion
Mar 28, 2020
1f25078
Revert "extracting out masking portion"
Apr 2, 2020
fec67cb
Code cleanup
Apr 2, 2020
c766706
Adding tests for early_stpping_method
Apr 2, 2020
1a5c06b
Merge branch 'master' of https://github.com/nyu-mll/jiant into MLM
Apr 2, 2020
5c3ff7b
Adding pretrain_stop_metric
Apr 3, 2020
8ca1eba
Reverting get_data_iter
Apr 3, 2020
9b377ab
Reverting to get_data_iter
Apr 6, 2020
bf841e9
Fixing get_pretrained_lm_head for all embedder types
Apr 6, 2020
2349464
Extracting out MLM probability masking
Apr 7, 2020
cf223a4
Merge branch 'MLM' of https://github.com/nyu-mll/jiant into MLM
Apr 7, 2020
a3465c1
Move dynamic masking function to Task for easier testing
Apr 8, 2020
0f5b849
Adding unit tests for MLM
Apr 8, 2020
fb9ce83
Adding change to MLM forward function to expose more intermediate ste…
Apr 8, 2020
a59c762
Fixing code style
Apr 9, 2020
e9eb5f0
Adding more detailed instructions of how to generate Wikipedia data
Apr 10, 2020
1a76df0
Adding rest of MLM data generation code
Apr 10, 2020
34c924b
Black style and remove comment
Apr 10, 2020
da5fe19
black style
Apr 10, 2020
9446cb7
updating repro code for MLM data
phu-pmh Apr 10, 2020
3f6eb92
updating repro code for MLM data
phu-pmh Apr 10, 2020
12 changes: 10 additions & 2 deletions jiant/__main__.py
@@ -551,7 +551,6 @@ def main(cl_arguments):
tasks = sorted(set(pretrain_tasks + target_tasks), key=lambda x: x.name)
log.info("\tFinished loading tasks in %.3fs", time.time() - start_time)
log.info("\t Tasks: {}".format([task.name for task in tasks]))

# Build model
log.info("Building model...")
start_time = time.time()
@@ -567,7 +566,16 @@
if args.do_pretrain:
# Train on pretrain tasks
log.info("Training...")
stop_metric = pretrain_tasks[0].val_metric if len(pretrain_tasks) == 1 else "macro_avg"
if args.early_stopping_method != "auto":
[Review comment — Contributor]: How hard would it be to test this logic? Could we run this logic with a variety of task lists and early stopping config options, and make sure we always get the expected method back?
Errors here would be pretty hard to catch/debug. Seems worth doing even if it means that we'd have to pull this out as a method.
[Reply — Contributor Author]: It'll be reasonable to test. Adding that to to-do list.

pretrain_names = [task.name for task in pretrain_tasks]
if args.early_stopping_method in pretrain_names:
index = pretrain_names.index(args.early_stopping_method)
stop_metric = pretrain_tasks[index].val_metric
else:
raise ValueError("args.early_stopping_method must be either 'auto' or a task name")

else:
stop_metric = pretrain_tasks[0].val_metric if len(pretrain_tasks) == 1 else "macro_avg"
should_decrease = (
pretrain_tasks[0].val_metric_decreases if len(pretrain_tasks) == 1 else False
)
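A reviewer asks above whether this selection logic could be unit-tested, and the PR does add such tests in later commits. Purely as an illustration, here is a minimal sketch of what such a test could look like if the branch were pulled out into a helper; the helper name, task names, and metric names below are hypothetical, not the PR's actual API.

```python
# Sketch only: mirrors the stop-metric selection above, with hypothetical names.
from types import SimpleNamespace

import pytest


def get_pretrain_stop_metric(early_stopping_method, pretrain_tasks):
    # A task name selects that task's val_metric; "auto" falls back to the single
    # task's metric, or macro_avg when there are several pretraining tasks.
    if early_stopping_method != "auto":
        names = [task.name for task in pretrain_tasks]
        if early_stopping_method not in names:
            raise ValueError("args.early_stopping_method must be either 'auto' or a task name")
        return pretrain_tasks[names.index(early_stopping_method)].val_metric
    return pretrain_tasks[0].val_metric if len(pretrain_tasks) == 1 else "macro_avg"


def make_task(name):
    return SimpleNamespace(name=name, val_metric=name + "_accuracy")


def test_stop_metric_selection():
    tasks = [make_task("sst"), make_task("wikipedia_mlm")]
    assert get_pretrain_stop_metric("auto", tasks) == "macro_avg"
    assert get_pretrain_stop_metric("auto", tasks[:1]) == "sst_accuracy"
    assert get_pretrain_stop_metric("wikipedia_mlm", tasks) == "wikipedia_mlm_accuracy"
    with pytest.raises(ValueError):
        get_pretrain_stop_metric("not_a_task", tasks)
```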
19 changes: 19 additions & 0 deletions jiant/config/base_mlm_roberta.conf
@@ -0,0 +1,19 @@
// Base config file for MLM experiments with RoBERTa
include "defaults.conf"

early_stopping_method=auto // Early stopping method. Options: task_name to only do early stopping based
// on a specific task, 'auto': use the macro_avg

// Multi-task Training
weighting_method = proportional // Weighting method for task sampling, relative to the number of
// training examples in each task:
// Options: uniform, power_<power>, softmax_<temp>
// proportional, proportional_log_batch, and
// proportional_log_example (plus the less-useful inverse,
// inverse_log_example, and inverse_log_batch).
// Additionally, we include the T5 method of examples-proportional-mixing.
// See relevant source code for details.
scaling_method = uniform // Method for scaling loss:
// Options: uniform, max_power_<power>, max_proportional,
// max_proportional_log, max_inverse, max_inverse_log
// max_epoch_<E1_E2_..._En>
4 changes: 4 additions & 0 deletions jiant/config/defaults.conf
@@ -178,6 +178,8 @@ max_epochs = -1 // If positive, maximum number of epochs (full pass over a task'
// especially if it's higher than one epoch's worth of steps, it's possible to
// significantly overshoot the intended number of epochs.

early_stopping_method=auto // Early stopping method. Options: task_name to only do early stopping based
// on a specific task, 'auto': use the macro_avg
patience = 5 // Patience in early stopping. Training will stop if performance does not improve at
// all in patience + 1 validations.
keep_all_checkpoints = 0 // If set, keep checkpoint files from every validation. Otherwise, keep
@@ -196,6 +198,8 @@ weighting_method = proportional // Weighting method for task sampling, relative
// proportional, proportional_log_batch, and
// proportional_log_example (plus the less-useful inverse,
// inverse_log_example, and inverse_log_batch).
// Additionally, we include the T5 method of examples_proportional_mixing.
// To use this, set weighting_method=examples_proportional_mixingK=104857
// See relevant source code for details.
scaling_method = uniform // Method for scaling loss:
// Options: uniform, max_power_<power>, max_proportional,
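The examples-proportional-mixing method referenced in the comments above follows the T5 mixing rule: each task is sampled in proportion to its number of training examples, clipped at an artificial limit K, so a very large corpus (such as an MLM corpus) cannot completely drown out smaller tasks. A standalone sketch of the weight computation, not jiant's actual implementation (the function name and example counts are illustrative):

```python
# T5-style examples-proportional mixing: weight_m = min(e_m, K) / sum_n min(e_n, K),
# where e_m is task m's number of training examples and K caps very large tasks.
def examples_proportional_mixing(example_counts, K):
    clipped = [min(count, K) for count in example_counts]
    total = sum(clipped)
    return [c / total for c in clipped]


# e.g. a 10k-example task next to a 5M-example MLM corpus with K = 104857:
weights = examples_proportional_mixing([10_000, 5_000_000], K=104857)
# -> roughly [0.087, 0.913]; without the cap the small task would get ~0.002
```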
15 changes: 10 additions & 5 deletions jiant/huggingface_transformers_interface/modules.py
@@ -39,6 +39,7 @@ def __init__(self, args):
self._sep_id = None
self._pad_id = None
self._unk_id = None
self._mask_id = None

# If set, treat these special tokens as part of input segments other than A/B.
self._SEG_ID_CLS = None
@@ -270,6 +271,7 @@ def __init__(self, args):
self._cls_id = self.tokenizer.convert_tokens_to_ids("[CLS]")
self._pad_id = self.tokenizer.convert_tokens_to_ids("[PAD]")
self._unk_id = self.tokenizer.convert_tokens_to_ids("[UNK]")
self._mask_id = self.tokenizer.convert_tokens_to_ids("[MASK]")

self.parameter_setup(args)

@@ -327,6 +329,7 @@ def __init__(self, args):
self._cls_id = self.tokenizer.convert_tokens_to_ids("<s>")
self._pad_id = self.tokenizer.convert_tokens_to_ids("<pad>")
self._unk_id = self.tokenizer.convert_tokens_to_ids("<unk>")
self._mask_id = self.tokenizer.convert_tokens_to_ids("<mask>")

self.parameter_setup(args)

@@ -358,8 +361,8 @@ def get_pretrained_lm_head(self):
self.input_module, cache_dir=self.cache_dir
)
lm_head = model_with_lm_head.lm_head
lm_head.predictions.decoder.weight = self.model.embeddings.word_embeddings.weight
return nn.Sequential(lm_head, nn.LogSoftmax(dim=-1))
lm_head.decoder.weight = self.model.embeddings.word_embeddings.weight
return lm_head


class AlbertEmbedderModule(HuggingfaceTransformersEmbedderModule):
@@ -381,6 +384,7 @@ def __init__(self, args):
self._cls_id = self.tokenizer.convert_tokens_to_ids("[CLS]")
self._pad_id = self.tokenizer.convert_tokens_to_ids("<pad>")
self._unk_id = self.tokenizer.convert_tokens_to_ids("<unk>")
self._mask_id = self.tokenizer.convert_tokens_to_ids("[MASK]")

self.parameter_setup(args)

@@ -437,6 +441,7 @@ def __init__(self, args):
self._cls_id = self.tokenizer.convert_tokens_to_ids("<cls>")
self._pad_id = self.tokenizer.convert_tokens_to_ids("<pad>")
self._unk_id = self.tokenizer.convert_tokens_to_ids("<unk>")
self._mask_id = self.tokenizer.convert_tokens_to_ids("<mask>")

self.parameter_setup(args)

@@ -478,7 +483,7 @@ def get_pretrained_lm_head(self, args):
)
lm_head = model_with_lm_head.lm_loss
lm_head.weight = self.model.word_embedding.weight
return nn.Sequential(lm_head, nn.LogSoftmax(dim=-1))
return lm_head


class OpenAIGPTEmbedderModule(HuggingfaceTransformersEmbedderModule):
@@ -541,7 +546,7 @@ def get_pretrained_lm_head(self, args):
)
lm_head = model_with_lm_head.lm_head
lm_head.weight = self.model.tokens_embed.weight[: lm_head.weight.size()[0]]
return nn.Sequential(lm_head, nn.LogSoftmax(dim=-1))
return lm_head


class GPT2EmbedderModule(HuggingfaceTransformersEmbedderModule):
@@ -603,7 +608,7 @@ def get_pretrained_lm_head(self):
)
lm_head = model_with_lm_head.lm_head
lm_head.weight = self.model.wte.weight[: lm_head.weight.size()[0]]
return nn.Sequential(lm_head, nn.LogSoftmax(dim=-1))
[Review comment — Contributor]: It's worth mentioning these cuts in the PR description.
return lm_head


class TransfoXLEmbedderModule(HuggingfaceTransformersEmbedderModule):
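Several get_pretrained_lm_head implementations above drop their trailing nn.LogSoftmax and return the bare head (the "cuts" the reviewer mentions). A plausible reading, not stated in the diff, is that the new MLM forward pass computes its loss with F.cross_entropy, which applies log-softmax internally, so a LogSoftmax baked into the head would be applied twice. A small self-contained check of that equivalence:

```python
# cross_entropy(logits) == nll_loss(log_softmax(logits)); adding another LogSoftmax
# in front of cross_entropy would therefore double-apply it.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
logits = torch.randn(4, 10)           # 4 token positions, vocabulary of 10
labels = torch.tensor([1, 3, 5, 7])

loss_from_logits = F.cross_entropy(logits, labels)
loss_from_log_probs = F.nll_loss(F.log_softmax(logits, dim=-1), labels)
assert torch.allclose(loss_from_logits, loss_from_log_probs)
```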
105 changes: 91 additions & 14 deletions jiant/models.py
@@ -45,7 +45,7 @@
from jiant.modules.span_modules import SpanClassifierModule
from jiant.huggingface_transformers_interface import input_module_uses_transformers
from jiant.tasks.edge_probing import EdgeProbingTask
from jiant.tasks.lm import LanguageModelingTask
from jiant.tasks.lm import AutoregressiveLanguageModelingTask, MaskedLanguageModelingTask
from jiant.tasks.lm_parsing import LanguageModelingParsingTask
from jiant.tasks.qa import MultiRCTask, ReCoRDTask
from jiant.tasks.seq2seq import Seq2SeqTask
@@ -76,6 +76,7 @@
format_output,
uses_cuda,
)
from jiant.utils.data_loaders import get_tokenizer

# Elmo stuff
# Look in $ELMO_SRC_DIR (e.g. /usr/share/jsalt/elmo) or download from web
@@ -158,18 +159,22 @@ def build_sent_encoder(args, vocab, d_emb, tasks, embedder, cove_layer):
)
d_sent = args.d_word
log.info("Using PRPN sentence encoder!")
elif any(isinstance(task, LanguageModelingTask) for task in tasks) or args.sent_enc == "bilm":
elif (
any(isinstance(task, AutoregressiveLanguageModelingTask) for task in tasks)
or args.sent_enc == "bilm"
):
assert_for_log(args.sent_enc in ["rnn", "bilm"], "Only RNNLM supported!")
assert_for_log(
not (
args.input_module == "elmo"
or args.input_module.startswith("bert")
or args.input_module.startswith("xlnet")
),
f"Using input_module = {args.input_module} for language modeling is probably not a "
"good idea, since it allows the language model to use information from the right-hand "
"context.",
)
if any(isinstance(task, AutoregressiveLanguageModelingTask) for task in tasks):
assert_for_log(
not (
args.input_module == "elmo"
or args.input_module.startswith("bert")
or args.input_module.startswith("xlnet")
),
f"Using input_module = {args.input_module} for language modeling is probably not a "
"good idea, since it allows the language model to use information from the right-hand "
"context.",
)
bilm = BiLMEncoder(d_emb, args.d_hid, args.d_hid, args.n_layers_enc)
sent_encoder = SentenceEncoder(
vocab,
@@ -549,7 +554,10 @@ def build_task_specific_modules(task, model, d_sent, d_emb, vocab, embedder, arg
hid2voc = build_lm(task, d_sent, args)
setattr(model, "%s_hid2voc" % task.name, hid2voc)
setattr(model, "%s_mdl" % task.name, hid2voc)
elif isinstance(task, LanguageModelingTask):
elif isinstance(task, MaskedLanguageModelingTask):
module = build_mlm(model.sent_encoder._text_field_embedder)
setattr(model, "%s_mdl" % task.name, module)
elif isinstance(task, AutoregressiveLanguageModelingTask):
assert not input_module_uses_transformers(args.input_module), (
"our LM Task does not support transformers, if you need them, try to update",
"corresponding parts of the code. You may find get_pretrained_lm_head and",
@@ -746,6 +754,12 @@ def build_lm(task, d_inp, args):
return hid2voc


def build_mlm(embedder):
" Build MLM components "
lm_head = embedder.get_pretrained_lm_head()
return lm_head


def build_span_classifier(task, d_sent, task_params):
module = SpanClassifierModule(task, d_sent, task_params, num_spans=task.num_spans)
return module
@@ -853,7 +867,9 @@ def forward(self, task, batch, predict=False):
task, (PairClassificationTask, PairRegressionTask, PairOrdinalRegressionTask)
):
out = self._pair_sentence_forward(batch, task, predict)
elif isinstance(task, LanguageModelingTask):
elif isinstance(task, MaskedLanguageModelingTask):
out = self._masked_lm_forward(batch, task, predict)
elif isinstance(task, AutoregressiveLanguageModelingTask):
if isinstance(self.sent_encoder._phrase_layer, ONLSTMStack) or isinstance(
self.sent_encoder._phrase_layer, PRPN
):
@@ -1160,6 +1176,67 @@ def _lm_forward(self, batch, task, predict):
pass
return out

def _masked_lm_forward(self, batch, task, predict):
"""
We currently only support RoBERTa-style dynamic masking, with the exact
setup and parameters as RoBERTa.
"""
mlm_probability = 0.15
out = {}
sent_encoder = self.sent_encoder
tokenizer_name = self.sent_encoder._text_field_embedder.input_module
vocab_size = (
self.sent_encoder._text_field_embedder.model.embeddings.word_embeddings.num_embeddings
)
tokenizer = get_tokenizer(tokenizer_name)
input_key = self.sent_encoder._text_field_embedder.tokenizer_required
mask_idx = self.sent_encoder._text_field_embedder._mask_id
b_size, seq_len = batch["targs"].size()
inputs = batch["input"][input_key]
labels = batch["targs"]
# Masking code from https://github.com/huggingface/transformers/blob/master/examples/run_language_modeling.py
probability_matrix = torch.full(labels.shape, mlm_probability, device=inputs.device)
padding_mask = labels.eq(0)
probability_matrix.masked_fill_(padding_mask, value=0.0)

masked_indices = torch.bernoulli(probability_matrix).to(
device=inputs.device, dtype=torch.uint8
)
tokenizer_name = self.sent_encoder._text_field_embedder.tokenizer_required
labels, _ = self.sent_encoder._text_field_embedder.correct_sent_indexing(
[Review comment — Contributor @pyeres, Apr 7, 2020]: It looks like the labels will be modified (as a result of correct_sent_indexing()). It looks like the inputs aren't getting the same adjustment here. Is that intentional/correct?
{tokenizer_name: labels}
)
# We only compute loss on masked tokens
# nn.CrossEntropy ignores the indices with value = -100 by default.
# Therefore, we replace non-masked indices with -100 so that they get ignored
# in loss computation.
labels[~masked_indices] = -100

# 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
bernoulli_mask = torch.bernoulli(torch.full(labels.shape, 0.8)).to(
device=inputs.device, dtype=torch.uint8
)
indices_replaced = bernoulli_mask & masked_indices
inputs[indices_replaced] = mask_idx

# 10% of the time, we replace masked input tokens with random word
bernoulli_mask = torch.bernoulli(torch.full(labels.shape, 0.5)).to(
device=inputs.device, dtype=torch.uint8
)
indices_random = bernoulli_mask & masked_indices & ~indices_replaced
random_words = torch.randint(
len(tokenizer), labels.shape, dtype=torch.long, device=inputs.device
)
inputs[indices_random] = random_words[indices_random]
batch["input"][input_key] = inputs
sent_embs, sent_mask = self.sent_encoder(batch["input"], task)
module = getattr(self, "%s_mdl" % task.name)
logits = module.forward(sent_embs)
out["logits"] = logits
out["loss"] = F.cross_entropy(logits.view(-1, vocab_size), labels.view(-1))
out["n_exs"] = format_output(b_size, self._cuda_device)
return out

def _mc_forward(self, batch, task, predict):
""" Forward for a multiple choice question answering task """
out = {}
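_masked_lm_forward above follows the standard BERT/RoBERTa dynamic-masking recipe: 15% of non-padding positions are sampled as prediction targets, and of those, 80% are replaced with the mask token, 10% with a random token, and 10% left unchanged; only the sampled positions contribute to the loss (via cross_entropy's ignore_index of -100). A self-contained sketch of that recipe, decoupled from jiant's batch format (tensor names and the pad/mask ids are illustrative):

```python
# Standalone sketch of BERT/RoBERTa-style dynamic masking (80/10/10).
import torch


def dynamic_mask(inputs, pad_id, mask_id, vocab_size, mlm_probability=0.15):
    labels = inputs.clone()
    # Sample 15% of non-padding positions as prediction targets.
    prob = torch.full(labels.shape, mlm_probability)
    prob.masked_fill_(inputs.eq(pad_id), 0.0)
    masked = torch.bernoulli(prob).bool()
    labels[~masked] = -100  # ignored by F.cross_entropy

    # 80% of the targets are replaced with the mask token.
    replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked
    inputs[replaced] = mask_id

    # Half of the remaining targets (10% overall) get a random token; the rest stay.
    randomized = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked & ~replaced
    inputs[randomized] = torch.randint(vocab_size, labels.shape)[randomized]
    return inputs, labels  # note: inputs is modified in place, as in the PR's forward


# Example batch of token ids (pad_id=0, mask_id=4, vocabulary of 100).
ids = torch.randint(5, 100, (2, 8))
masked_ids, targets = dynamic_mask(ids, pad_id=0, mask_id=4, vocab_size=100)
```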
6 changes: 2 additions & 4 deletions jiant/preprocess.py
@@ -54,6 +54,7 @@
from jiant.tasks import REGISTRY as TASKS_REGISTRY
from jiant.tasks.seq2seq import Seq2SeqTask
from jiant.tasks.tasks import SequenceGenerationTask, Task
from jiant.tasks.lm import MaskedLanguageModelingTask
from jiant.utils import config, serialize, utils, options
from jiant.utils.options import parse_task_list_arg

@@ -261,6 +262,7 @@ def _build_vocab(args: config.Params, tasks: List[Task], vocab_path: str):
for task in tasks: # add custom label namespaces
# TODO: surface more docs for add_task_label_vocab:
add_task_label_vocab(vocab, task)

if args.force_include_wsj_vocabulary:
# Add WSJ full vocabulary for PTB F1 parsing tasks.
add_wsj_vocab(vocab, args.data_dir)
@@ -661,10 +663,6 @@ def add_task_label_vocab(vocab, task):
return
log.info("\tTask '%s': adding vocab namespace '%s'", task.name, namespace)

if isinstance(task, SequenceGenerationTask):
for special in SPECIALS:
vocab.add_token_to_namespace(special, namespace)

for label in task.get_all_labels():
vocab.add_token_to_namespace(label, namespace)
