Text generation improvement (UI client, data parallel support) #5437

Merged · 98 commits · Dec 9, 2022

Commits
2dec00d
Squashed commit of the following:
yidong72 Oct 13, 2022
e2dd840
Merge branch 'main' into universal_prompt_fix
yidong72 Oct 13, 2022
3d4f8d4
fix LGTM
yidong72 Oct 13, 2022
6308f97
fix validation
yidong72 Oct 13, 2022
fa7a720
change for the lm eval
yidong72 Oct 13, 2022
301a8b7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 13, 2022
5101e06
make text generation work in data parallel environment
yidong72 Oct 14, 2022
349cdfe
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 14, 2022
50d9970
implement the service with rest service
yidong72 Oct 15, 2022
951f520
Merge branch 'universal_prompt_fix' of github.com:NVIDIA/NeMo into un…
yidong72 Oct 15, 2022
3231a48
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 15, 2022
b64f1ba
suppress log
yidong72 Oct 15, 2022
da54820
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 15, 2022
d4970d3
Fix
MaximumEntropy Oct 18, 2022
9676a69
Fix
MaximumEntropy Oct 19, 2022
e5aef83
Merge branch 'main' of github.com:NVIDIA/NeMo into t0_dataset_fixes
MaximumEntropy Oct 19, 2022
d4d51f6
Fixes
MaximumEntropy Oct 19, 2022
bb7b44c
Update config
MaximumEntropy Oct 19, 2022
ec8df6a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 19, 2022
2e16243
Restore function needed for NMT
MaximumEntropy Oct 19, 2022
0a60990
Merge branch 't0_dataset_fixes' of github.com:NVIDIA/NeMo into t0_dat…
MaximumEntropy Oct 19, 2022
7395dd7
Merge branch 'main' into universal_prompt_fix
yidong72 Oct 20, 2022
2f348ba
handles no answer only
yidong72 Oct 20, 2022
1387925
Fix config
MaximumEntropy Oct 21, 2022
f7f844d
added knn to web
yidong72 Oct 21, 2022
86798a3
fix lgtm.com comments
yidong72 Oct 21, 2022
97b8dcc
output the retrieved context
yidong72 Oct 22, 2022
1cd4ac0
allow no neighbor query
yidong72 Oct 25, 2022
3718fd6
remove the imports
yidong72 Oct 25, 2022
ba1e50b
warn only once
yidong72 Oct 25, 2022
011e6a9
Change output file format from JSON to JSONL
MaximumEntropy Oct 27, 2022
f17545d
Merge branch 't0_dataset_fixes' into universal_prompt_newdata
yidong72 Oct 28, 2022
c062103
new t0 dataset
yidong72 Oct 31, 2022
92485bb
Add T0 data preproc scripts
MaximumEntropy Nov 1, 2022
4600377
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 1, 2022
177a81f
Merge and multiprocessing
MaximumEntropy Nov 1, 2022
257548d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 1, 2022
9a9b735
Fix for is_correct
MaximumEntropy Nov 1, 2022
b44b4e1
Merge branch 't0_dataset_fixes' of github.com:NVIDIA/NeMo into t0_dat…
MaximumEntropy Nov 1, 2022
aab8679
fix epoch > 2
yidong72 Nov 1, 2022
fd54348
handles multiple dataloader
yidong72 Nov 1, 2022
76658f9
remove template
yidong72 Nov 1, 2022
8ebff3d
Refactor T0 dataset
MaximumEntropy Nov 2, 2022
ea663bd
Add script to merge train folder into individual training files to mi…
MaximumEntropy Nov 2, 2022
3e266b1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 2, 2022
77e6917
Merge branch 'main' into t0_dataset_fixes
MaximumEntropy Nov 2, 2022
98a75be
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 2, 2022
1709ddf
added on the fly service
yidong72 Nov 2, 2022
d9c169c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 2, 2022
a4eba25
add combo instance
yidong72 Nov 2, 2022
83eccf4
Merge branch 'universal_prompt_fix' of github.com:NVIDIA/NeMo into un…
yidong72 Nov 2, 2022
87c17e6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 2, 2022
5df10dc
added combo service
yidong72 Nov 2, 2022
3682322
Merge branch 'universal_prompt_fix' of github.com:NVIDIA/NeMo into un…
yidong72 Nov 2, 2022
0b33b49
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 2, 2022
83bf269
send weights back to server
yidong72 Nov 2, 2022
816a4f3
Merge branch 'universal_prompt_fix' of github.com:NVIDIA/NeMo into un…
yidong72 Nov 2, 2022
c20df14
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 2, 2022
06e25a8
fix index store
yidong72 Nov 2, 2022
ea69455
Merge branch 'universal_prompt_fix' of github.com:NVIDIA/NeMo into un…
yidong72 Nov 2, 2022
52b37a4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 2, 2022
a1a3bf4
Minor changes
MaximumEntropy Nov 2, 2022
06625db
Merge branch 't0_dataset_fixes' of github.com:NVIDIA/NeMo into t0_dat…
MaximumEntropy Nov 2, 2022
65da5d6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 2, 2022
54b5556
add reset button
yidong72 Nov 3, 2022
31d7aa3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 3, 2022
5371163
add add eos
yidong72 Nov 3, 2022
f9e1bab
Merge branch 'universal_prompt_fix' of github.com:NVIDIA/NeMo into un…
yidong72 Nov 3, 2022
f52f88b
use a separate bert service
yidong72 Nov 3, 2022
7717163
no loss of accuracy
yidong72 Nov 3, 2022
def6ac1
pin the gradio version
yidong72 Nov 3, 2022
7d20338
Remove bin compat
MaximumEntropy Nov 4, 2022
12ed3eb
Merge
MaximumEntropy Nov 4, 2022
999d242
Fix header lines
MaximumEntropy Nov 4, 2022
9d98f83
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 4, 2022
143ed80
Merge branch 'universal_prompt_fix' into universal_prompt_newdata
yidong72 Nov 4, 2022
eb67182
Merge branch 't0_dataset_fixes' into universal_prompt_newdata
yidong72 Nov 4, 2022
41da78d
evaluate based on text generation
yidong72 Nov 4, 2022
3ffe51f
exact match result aggregation
yidong72 Nov 5, 2022
374865a
working SP and SA
yidong72 Nov 7, 2022
d4adef0
sync
yidong72 Nov 7, 2022
93236ac
fix checkpoint
yidong72 Nov 8, 2022
1cc6c55
fix eval
yidong72 Nov 8, 2022
1dd1be1
backup states
yidong72 Nov 8, 2022
09af294
backup states reset
yidong72 Nov 8, 2022
9ef26c9
fix the bug
yidong72 Nov 8, 2022
84e8df9
fix evaluation for sentence piece
yidong72 Nov 10, 2022
7f4aa82
fix a bug
yidong72 Nov 12, 2022
b4903a8
Merge branch 'main' into universal_prompt_newdata
yidong72 Nov 14, 2022
43cec8b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 14, 2022
f94a374
potential fix in the future
yidong72 Nov 15, 2022
d25650b
Merge branch 'universal_prompt_newdata' of github.com:NVIDIA/NeMo int…
yidong72 Nov 15, 2022
791682c
Merge branch 'main' into text_generation_improvement
yidong72 Nov 16, 2022
b0b06a1
remove the universal codes
yidong72 Nov 16, 2022
8680ef3
remove universal strategy
yidong72 Nov 16, 2022
1db6582
Merge branch 'main' into text_generation_improvement
okuchaiev Nov 16, 2022
09d5854
Merge branch 'main' into text_generation_improvement
yidong72 Dec 8, 2022
5f33b3b
address reviewer comment
yidong72 Dec 8, 2022
Files changed
@@ -28,6 +28,9 @@ hparams_file: null # model configuration file, only used for PTL checkpoint load
 prompts: # prompts for GPT inference
   - "Q: How are you?"
   - "Q: How big is the universe?"
-server: False # whether launch the inference server
+server: False # whether launch the API server
 port: 5555 # the port number for the inference server

+web_server: False # whether launch the web inference server
+share: False # whether create a public URL
+username: test # user name for web client
+password: test2 # password for web client
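
With server: True the model is exposed over REST, and web_server: True additionally launches the web UI client. As a sanity check, a client along these lines should be able to query the API server; this is a sketch that assumes MegatronServer's PUT /generate endpoint and its usual request fields, which may differ across NeMo versions:

import json

import requests

# Minimal client for the inference server configured above (port 5555).
data = {
    "sentences": ["Q: How big is the universe?"],
    "tokens_to_generate": 64,  # number of tokens to append to the prompt
    "temperature": 1.0,
    "add_BOS": True,
    "top_k": 0,
    "top_p": 0.9,
    "greedy": False,
    "all_probs": False,
    "repetition_penalty": 1.2,
    "min_tokens_to_generate": 2,
}
response = requests.put(
    "http://localhost:5555/generate",
    data=json.dumps(data),
    headers={"Content-Type": "application/json"},
)
print(response.json()["sentences"])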
29 changes: 23 additions & 6 deletions examples/nlp/language_modeling/conf/megatron_retro_inference.yaml
@@ -34,11 +34,28 @@ prompts: # prompts for RETRO model inference

 ########### Faiss service parameters ########
 retrieval_service:
-  faiss_devices: '0,1,2'
-  faiss_index: null # the faiss index file that is used to find KNN
-  nprobe: 100
-  retrieval_index: null
-  sentence_bert: 'all-mpnet-base-v2'
-  sentence_bert_batch: 4
   neighbors: 4
+  frequent_query: False # for the current token generation, frequently update the retrieval context. If false, update it every 64 tokens
+  pad_tokens: True # pad the tokens at the beginning to make it minimum of 64 tokens for retrieving at least once
+  store_retrieved: False # whether store the retrieved documents, so it can be checked
+  weights: [0.5, 0.5] # weight for different retrieval services
+  sentence_bert:
+    devices: '0,1,2'
+    sentence_bert: 'all-mpnet-base-v2'
+    sentence_bert_batch: 4
+  services:
+    - type: FaissRetrievalService
+      faiss_devices: '0,1,2'
+      faiss_index: null # the faiss index file that is used to find KNN
+      nprobe: 100
+      retrieval_index: null
+    - type: DynamicFaissRetrievalService
+      faiss_devices: '0,1,2'
+      chunk_size: 64
+      stride: 32
+server: False # whether launch the API server
+port: 5555 # the port number for the inference server
+web_server: False # whether launch the web inference server
+share: False # whether create a public URL
+username: test # user name for web client
+password: test2 # password for web client
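
The services list combines a static Faiss index with a dynamically updated one, and the weights field hints that their results are blended. The actual combination logic lives in the combo service added in this PR (see the "add combo instance" and "added combo service" commits) and is not shown here; the following is only an illustrative sketch of one plausible weighting scheme, with all names hypothetical:

from typing import List

def combine_neighbors(per_service: List[List[str]], weights: List[float], k: int) -> List[str]:
    # Split the neighbor budget k across services in proportion to their weights.
    total = sum(weights)
    combined: List[str] = []
    for neighbors, weight in zip(per_service, weights):
        quota = max(1, round(k * weight / total))
        combined.extend(neighbors[:quota])
    return combined[:k]

# e.g. hits from FaissRetrievalService and DynamicFaissRetrievalService
static_hits = ["chunk A", "chunk B", "chunk C"]
dynamic_hits = ["chunk X", "chunk Y"]
print(combine_neighbors([static_hits, dynamic_hits], weights=[0.5, 0.5], k=4))
# ['chunk A', 'chunk B', 'chunk X', 'chunk Y']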
5 changes: 5 additions & 0 deletions examples/nlp/language_modeling/megatron_gpt_eval.py
@@ -13,6 +13,7 @@
 # limitations under the License.

 import os
+import threading

 import torch
 from omegaconf import OmegaConf, open_dict
@@ -21,6 +22,7 @@

 from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
 from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel
+from nemo.collections.nlp.modules.common.megatron_web_server import get_demo
 from nemo.collections.nlp.modules.common.text_generation_server import MegatronServer
 from nemo.collections.nlp.modules.common.text_generation_utils import generate
 from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam
@@ -253,6 +255,9 @@ def main(cfg) -> None:
     # Third method of running text generation, use inference server
     if cfg.server:
         if parallel_state.is_pipeline_first_stage() and parallel_state.get_tensor_model_parallel_rank() == 0:
+            if cfg.web_server:
+                thread = threading.Thread(target=get_demo, daemon=True, args=(cfg.share, cfg.username, cfg.password))
+                thread.start()
             server = MegatronServer(model.cuda())
             server.run("0.0.0.0", port=cfg.port)
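
get_demo runs on a daemon thread so the REST server can keep the main thread. Its implementation (in megatron_web_server.py) is not part of this diff; the sketch below shows roughly what such a Gradio client could look like, with the layout and the hard-coded endpoint being assumptions rather than the actual NeMo code:

import json
import threading

import gradio as gr
import requests

def get_demo(share: bool, username: str, password: str):
    # Prompt box wired to the REST inference server.
    def ask(prompt):
        data = {"sentences": [prompt], "tokens_to_generate": 64}
        resp = requests.put(
            "http://localhost:5555/generate",
            data=json.dumps(data),
            headers={"Content-Type": "application/json"},
        )
        return resp.json()["sentences"][0]

    with gr.Blocks() as demo:
        prompt = gr.Textbox(label="Prompt")
        output = gr.Textbox(label="Generated text")
        gr.Button("Generate").click(ask, inputs=prompt, outputs=output)
    demo.launch(share=share, auth=(username, password))

threading.Thread(target=get_demo, daemon=True, args=(False, "test", "test2")).start()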
78 changes: 59 additions & 19 deletions examples/nlp/language_modeling/megatron_retro_eval.py
@@ -13,17 +13,28 @@
 # limitations under the License.

 import os
+import threading

 import torch
 from examples.nlp.language_modeling.megatron_gpt_eval import RequestDataSet
 from omegaconf.omegaconf import OmegaConf, open_dict
 from pytorch_lightning import Trainer
 from torch.utils.data import DataLoader

 from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel
+from nemo.collections.nlp.modules.common.megatron_web_server import get_retro_demo
+from nemo.collections.nlp.modules.common.text_generation_server import MegatronServer
+from nemo.collections.nlp.modules.common.text_generation_utils import generate
 from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam
 from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector
 from nemo.core.config import hydra_runner

+try:
+    from apex.transformer import parallel_state
+
+    HAVE_APEX = True
+except (ImportError, ModuleNotFoundError):
+    HAVE_APEX = False

Check notice (Code scanning / CodeQL, flagged on both HAVE_APEX assignments): Unused global variable: the global variable 'HAVE_APEX' is not used.
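
The CodeQL notice is benign: HAVE_APEX follows the usual optional-dependency pattern, and a consumer of the flag simply has not been added to this file yet. A typical guard (not present in the diff) would look like:

if not HAVE_APEX:
    raise ImportError("apex is required to run RETRO text generation")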

"""
This is the script to run RETRO Model text generation.
@@ -86,26 +97,55 @@ def main(cfg) -> None:
         "compute_logprob": cfg.inference.compute_logprob,
     }

-    if not cfg.use_predict_method:
-        # First method of running text generation, call model.generate method
-        response = model.generate(
-            inputs=OmegaConf.to_container(cfg.prompts),
-            length_params=length_params,
-            sampling_params=sampling_params,
-            **cfg.retrieval_service,
-        )
+    # check whether the DDP is initialized
+    if parallel_state.is_unitialized():
+
+        def dummy():
+            return
+
+        if model.trainer.strategy.launcher is not None:
+            model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer)
+        model.trainer.strategy.setup_environment()
+
+    config = OmegaConf.to_container(cfg.inference)
+    retrieval_service = OmegaConf.to_container(cfg.retrieval_service)
+    model.set_inference_config(config, retrieval_service)
+
+    # running text generation, use inference server
+    if cfg.server:
+        if parallel_state.is_pipeline_first_stage() and parallel_state.get_tensor_model_parallel_rank() == 0:
+            if cfg.web_server:
+                thread = threading.Thread(
+                    target=get_retro_demo, daemon=True, args=(cfg.share, cfg.username, cfg.password)
+                )
+                thread.start()
+            server = MegatronServer(model.cuda(), inference_strategy=model.inference_strategy)
+            server.run("0.0.0.0", port=cfg.port)
+
+        while True:
+            choice = torch.cuda.LongTensor(1)
+            torch.distributed.broadcast(choice, 0)
+            if choice[0].item() == 0:
+                generate(model.cuda(), strategy=model.inference_strategy)
     else:
-        # Second method of running text generation, call trainer.predict
-        ds = RequestDataSet(OmegaConf.to_container(cfg.prompts))
-        request_dl = DataLoader(dataset=ds, batch_size=cfg.inference_batch_size)
-        config = OmegaConf.to_container(cfg.inference)
-        retrieval_service = OmegaConf.to_container(cfg.retrieval_service)
-        model.set_inference_config(config, retrieval_service)
-        response = trainer.predict(model, request_dl)
-
-    print("***************************")
-    print(response)
-    print("***************************")
+        if not cfg.use_predict_method:
+            # First method of running text generation, call model.generate method
+            response = model.generate(
+                inputs=OmegaConf.to_container(cfg.prompts),
+                length_params=length_params,
+                sampling_params=sampling_params,
+                strategy=model.inference_strategy,
+            )
+        else:
+            # Second method of running text generation, call trainer.predict
+            ds = RequestDataSet(OmegaConf.to_container(cfg.prompts))
+            request_dl = DataLoader(dataset=ds, batch_size=cfg.inference_batch_size)
+            response = trainer.predict(model, request_dl)
+
+        print("***************************")
+        print(response)
+        print("***************************")


 if __name__ == '__main__':
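
The while True loop is the heart of the data-parallel support: only the first pipeline/tensor rank runs the REST server, and every rank blocks on a broadcast until rank 0 announces a request, so all ranks enter generate() collectively. The same handshake in isolation (a sketch; it assumes torch.distributed and CUDA are already initialized):

import torch
import torch.distributed as dist

from nemo.collections.nlp.modules.common.text_generation_utils import generate

GENERATE_CMD = 0  # command code rank 0 broadcasts when a request arrives

def worker_loop(model, strategy):
    # Every rank blocks here; rank 0 fills `choice` from the server thread,
    # after which all ranks run the collective generation step together.
    while True:
        choice = torch.cuda.LongTensor(1)
        dist.broadcast(choice, src=0)
        if choice[0].item() == GENERATE_CMD:
            generate(model, strategy=strategy)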
@@ -340,6 +340,8 @@ def validation_step(self, batch, batch_idx):
         return reduced_loss

     def validation_epoch_end(self, outputs):
+        if len(outputs) == 0:
+            return
         averaged_loss = torch.stack(outputs).mean()
         self.log('val_loss', averaged_loss, prog_bar=True)
         # formula to compute the perplexity
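
The new guard matters because torch.stack requires a non-empty list, and validation_epoch_end can legitimately receive no outputs (for example, when a rank sees no validation batches). A standalone illustration of the failure mode, outside Lightning:

import torch

def validation_epoch_end(outputs):
    if len(outputs) == 0:
        # torch.stack([]) raises "stack expects a non-empty TensorList"
        return
    averaged_loss = torch.stack(outputs).mean()
    print(f"val_loss: {averaged_loss.item():.4f}")

validation_epoch_end([])  # no-op instead of a RuntimeError
validation_epoch_end([torch.tensor(2.0), torch.tensor(4.0)])  # val_loss: 3.0000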
@@ -457,7 +459,7 @@ def setup(self, stage=None):

     def set_inference_config(self, inference_config, retrieval_config):
         self._inference_config = inference_config
-        self._inference_strategy = model_inference_strategy_dispatcher(self, **retrieval_config)
+        self.inference_strategy = model_inference_strategy_dispatcher(self, **retrieval_config)

     def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] = None) -> Any:
         inference_config = self._inference_config
@@ -474,13 +476,13 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] = None) -> Any:
             inference_config['all_probs'] = True
             inference_config["add_BOS"] = False
             inference_config['greedy'] = True
-            response = generate(self, **inference_config, strategy=self._inference_strategy)
+            response = generate(self, **inference_config, strategy=self.inference_strategy)
             compute_prob_response = get_computeprob_response(self.tokenizer, response, batch)
             return compute_prob_response
         else:
             del inference_config['compute_logprob']
             inference_config['inputs'] = batch
-            return generate(self, **inference_config, strategy=self._inference_strategy)
+            return generate(self, **inference_config, strategy=self.inference_strategy)

     def generate(
         self,
5 changes: 4 additions & 1 deletion nemo/collections/nlp/modules/common/megatron/mup/layer.py
@@ -71,13 +71,16 @@ def __init__(self, mpu_vocab_size, parallel_output):
         self.bias.partition_dim = 0
         self.bias.stride = 1
         self.parallel_output = parallel_output
+        self.warn_once = False

     def forward(self, hidden_states, word_embeddings_weight):
         if hasattr(word_embeddings_weight, 'infshape'):
             width_mult = word_embeddings_weight.infshape.width_mult()
         else:
             width_mult = 1.0
-            logging.warning("need to set_shape before use mu-Transfer readout layer")
+            if not self.warn_once:
+                logging.warning("need to set_shape before use mu-Transfer readout layer")
+                self.warn_once = True
         async_tensor_model_parallel_allreduce = parallel_state.get_tensor_model_parallel_world_size() > 1
         output = parallel_lm_logits(
             hidden_states / width_mult,
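
This implements the earlier "warn only once" commit with an instance flag. A reusable alternative sketch, should the pattern be needed elsewhere, deduplicates by message with functools.lru_cache:

import functools
import logging

@functools.lru_cache(maxsize=None)
def warn_once(message: str) -> None:
    # lru_cache memoizes by argument, so each distinct message is logged exactly once per process
    logging.warning(message)

warn_once("need to set_shape before use mu-Transfer readout layer")  # logs
warn_once("need to set_shape before use mu-Transfer readout layer")  # skipped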