Fixed errors caused by upper-case model names, and changed the description #82

Merged
merged 85 commits, Aug 29, 2022

Changes from 6 commits (85 commits total)

Commits
83fceae
saved workd
Anhforth Jun 29, 2022
f113f7c
saved workd
Anhforth Jun 29, 2022
708ce15
saved work on 6.29
Anhforth Jun 30, 2022
0d1b079
transformed tokenizer: progressing
Anhforth Jul 1, 2022
2763b5d
Opt 30b (#16)
920232796 Jul 1, 2022
3e52907
fix bert tokenizer issue (#18)
Anhforth Jul 1, 2022
deb2612
reconstruct the tokenizer structure
ZhaodongYan1 Jul 3, 2022
c2c6e9d
tested the new tokenizer
Anhforth Jul 4, 2022
fc2b5d8
removed some redundant codes and added sp model
Anhforth Jul 4, 2022
7da1757
updated the tokenizer
ZhaodongYan1 Jul 4, 2022
7c8c0b1
saved work
Anhforth Jul 5, 2022
3a0c8cb
Opt 66b (#19)
920232796 Jul 6, 2022
265d35a
saved work on 7.6
Anhforth Jul 6, 2022
4f8d715
updated release version
Anhforth Jul 6, 2022
efc1310
fix tokenizer issue
Anhforth Jul 6, 2022
59531e7
temp save
Anhforth Jul 6, 2022
3b6c16a
tokenizer test passed
Anhforth Jul 6, 2022
a7ff8f3
fixed some errors
Anhforth Jul 7, 2022
f4ff1a8
test of tokenizer transform
Anhforth Jul 7, 2022
811d9e9
fixed conflicts
Anhforth Jul 7, 2022
1406d89
fixed error
Anhforth Jul 7, 2022
b30eefa
add encode_plus
Anhforth Jul 8, 2022
9b81869
fix bug multi_gpu_training
920232796 Jul 8, 2022
7ad38a0
Merge pull request #21 from baai-open-internal/fix_multi_gpu_training
Anhforth Jul 8, 2022
72ffd6a
changed the version
Anhforth Jul 8, 2022
e6f89a6
fix_validation_bug (#24)
920232796 Jul 11, 2022
29ea850
updated the version
Anhforth Jul 11, 2022
4c68936
updated
Anhforth Jul 15, 2022
4834f23
modified encoder_plus
Anhforth Jul 15, 2022
8d44329
add vit and examples
920232796 Jul 15, 2022
81c438d
vit and examples
920232796 Jul 15, 2022
da24628
Update base_model.py
marscrazy Jul 15, 2022
aff728b
Update vit.py
marscrazy Jul 15, 2022
e5a0ddb
modify readme.md
920232796 Jul 15, 2022
fe56b8b
modify readme.md
920232796 Jul 15, 2022
fc6c32e
delete annotating code
920232796 Jul 15, 2022
cd45e5c
Vit xzh (#25)
920232796 Jul 15, 2022
5448084
updated
Anhforth Jul 17, 2022
eb555fc
updated
Anhforth Jul 17, 2022
9649aa4
performing tests on examples
Anhforth Jul 17, 2022
67c1288
finished example testing
Anhforth Jul 18, 2022
faee281
Merge branch 'develop' into vit_xzh
BAAI-OpenPlatform Jul 19, 2022
06f0b69
Merge pull request #28 from baai-open-internal/vit_xzh
BAAI-OpenPlatform Jul 19, 2022
deaa120
Merge pull request #27 from baai-open-internal/develop
marscrazy Jul 20, 2022
9558a47
env trainer
920232796 Jul 20, 2022
c35d4b6
Merge pull request #29 from baai-open-internal/env_args
marscrazy Jul 20, 2022
437caa4
vit-checkpoint-activations
920232796 Jul 21, 2022
dc6fc3d
vit-checkpoint-activations
920232796 Jul 21, 2022
c1cec9f
Merge pull request #33 from baai-open-internal/vit-checkpointing-acti…
marscrazy Jul 21, 2022
d74cf92
update
jongjyh Jul 25, 2022
044bc80
Merge pull request #34 from baai-open-internal/fix_eval_loss
marscrazy Jul 25, 2022
d85f8af
merged the master
Anhforth Jul 26, 2022
1b5ecc6
inference and train
wchh-2000 Jul 29, 2022
1fe6d3e
fix bug bert model
xuanricheng Aug 5, 2022
0c243d6
add autoloader and example training data
wchh-2000 Aug 15, 2022
2c28a7d
updated seq2seq
shunxing1234 Aug 16, 2022
e03247e
update
wchh-2000 Aug 16, 2022
4a4b003
Merge pull request #52 from baai-open-internal/add_clip
marscrazy Aug 17, 2022
ce5fd31
Merge branch 'master' into transform_tokenizer
Anhforth Aug 18, 2022
8353cd3
Update train.py
marscrazy Aug 18, 2022
5d5e135
Delete tst_superglue.py
marscrazy Aug 18, 2022
4c6ba56
updated according to comments
BAAI-OpenPlatform Aug 19, 2022
6076287
Merge pull request #50 from baai-open-internal/bert_model
BAAI-OpenPlatform Aug 19, 2022
c11e232
merged the clip tokenizer
BAAI-OpenPlatform Aug 22, 2022
6e135ef
merged clip tokenizer
BAAI-OpenPlatform Aug 23, 2022
fd06e4d
Update inference_clip.py
marscrazy Aug 25, 2022
b61b708
Update auto_loader.py
marscrazy Aug 25, 2022
25b659b
Update glm_10b_en_tokenizer.py
marscrazy Aug 25, 2022
8cffa38
Merge pull request #20 from baai-open-internal/transform_tokenizer
marscrazy Aug 25, 2022
9117f78
swinv1v2
920232796 Aug 25, 2022
f3186d9
Merge pull request #58 from baai-open-internal/swinv1v2_checkpoint_ac…
marscrazy Aug 25, 2022
4bd211d
updated the version
Anhforth Aug 25, 2022
6ef4190
updated the requirement packages list
Anhforth Aug 25, 2022
036e337
fixed some issues
BAAI-OpenPlatform Aug 26, 2022
edfd518
fixed some issues
BAAI-OpenPlatform Aug 26, 2022
497d709
tried to fix the data directory not found error
BAAI-OpenPlatform Aug 26, 2022
1ac43c0
fixed issues in running glm_seq2seq
BAAI-OpenPlatform Aug 26, 2022
351fba7
Update test_glm_seq2seq.py
marscrazy Aug 26, 2022
35b5d9a
Merge pull request #59 from baai-open-internal/fix_issues
marscrazy Aug 26, 2022
b5a14ed
fix glm tokenizer bug
920232796 Aug 29, 2022
9f786e0
fix a glm tokenizer bug
920232796 Aug 29, 2022
18c95e2
Update tokenizer.py
marscrazy Aug 29, 2022
56c081f
Merge branch 'master' into fix_glm_tokenizer
marscrazy Aug 29, 2022
c3c3569
Merge pull request #60 from baai-open-internal/fix_glm_tokenizer
marscrazy Aug 29, 2022
1c28821
merged upstream
Anhforth Aug 29, 2022
2 changes: 1 addition & 1 deletion README.md
@@ -11,7 +11,7 @@ FlagAI (Fast LArge-scale General AI models) is a fast, easy-to-use and extensib

* Now it supports **WuDao GLM** with a maximum of 10 billion parameters (see [Introduction to GLM](/docs/GLM.md)). It also supports **BERT**, **RoBERTa**, **GPT2**, **T5**, and models from Huggingface Transformers.

* It provides APIs to quickly download and use those pre-trained models on a given text, fine-tune them on widely-used datasets collected from [SuperGLUE](https://super.gluebenchmark.com/) and [CLUE](https://github.com/CLUEbenchmark/CLUE) benchmarks, and then share them with the community on our model hub. It also provides [prompt-learning](/docs/TUTORIAL_7_PROMPT_LERANING.md) toolkit for few shot tasks.
* It provides APIs to quickly download and use those pre-trained models on a given text, fine-tune them on widely-used datasets collected from [SuperGLUE](https://super.gluebenchmark.com/) and [CLUE](https://github.com/CLUEbenchmark/CLUE) benchmarks, and then share them with the community on our model hub. It also provides [prompt-learning](/docs/TUTORIAL_7_PROMPT_LEARNING.md) toolkit for few shot tasks.

* These models can be applied to (Chinese/English) Text, for tasks like text classification, information extraction, question answering, summarization, and text generation.

2 changes: 1 addition & 1 deletion README_zh.md
@@ -207,7 +207,7 @@ for text_pair in test_data:
* [Tutorial 4: Customize a trainer for model- and data-parallel training](/doc_zh/TUTORIAL_4_TRAINER.md)
* [Tutorial 5: Simplify model and tokenizer initialization with Autoloader](/doc_zh/TUTORIAL_5_INSTRUCTIONS_FOR_AutoLoader.md)
* [Tutorial 6: Use off-the-shelf inference algorithms with Predictor](/doc_zh/TUTORIAL_6_INSTRUCTIONS_FOR_PREDICTOR.md)
* [Tutorial 7: Use the FlagAI prompt-learning toolkit to improve performance on SuperGLUE tasks](/doc_zh/TUTORIAL_7_PROMPT_LERANING.md)
* [Tutorial 7: Use the FlagAI prompt-learning toolkit to improve performance on SuperGLUE tasks](/doc_zh/TUTORIAL_7_PROMPT_LEARNING.md)
* [Tutorial 8: Set up the environment for multi-node training](/doc_zh/TUTORIAL_8_ENVIRONMENT_SETUP.md)
* [Tutorial 9: Text generation with encoder/decoder/encoder-decoder models](/doc_zh/TUTORIAL_9_SEQ2SEQ_METHOD.md)

12 changes: 8 additions & 4 deletions examples/glm_title_generation/train.py
@@ -27,12 +27,16 @@
                  num_checkpoints=1,
                  )

cur_dir = os.path.dirname(os.path.abspath(__file__))
src_dir = cur_dir + '/data/train.src'
tgt_dir = cur_dir + '/data/train.tgt'
# cur_dir = os.path.dirname(os.path.abspath(__file__))
# src_dir = cur_dir + '/data/train.src'
# tgt_dir = cur_dir + '/data/train.tgt'

src_dir = "./data/train.src"
tgt_dir = "./data/train.tgt"


maxlen = 256
auto_loader = AutoLoader("seq2seq",
auto_loader = AutoLoader("lm",
model_name="GLM-large-ch",
model_dir="./state_dict/")
model = auto_loader.get_model()
98 changes: 97 additions & 1 deletion examples/opt/README.md
@@ -52,4 +52,100 @@ out = predictor.predict_generate_randomsample(text,
                                              repetition_penalty=3.0)

print(f"input is {text} \n out is {out}")
```

# Multi-GPU inference
## OPT-30b

To run inference with multiple GPUs and model parallelism, we use torch DDP and the Megatron-LM library.
### Basic steps
1. Set up the model-parallel parameters, such as ```model_parallel_size``` and ```world_size```
2. Initialize torch DDP
3. Initialize Megatron-LM model parallelism
4. Set the random seed
5. Initialize the model and tokenizer
6. Run prediction
### Code
```python
import torch
import os
import argparse
from flagai import mpu
from flagai.auto_model.auto_loader import AutoLoader
import random
import numpy as np
from flagai.model.predictor.predictor import Predictor

# run script : python -m torch.distributed.launch --nproc_per_node=4 --nnodes=1 opt_30b_en_mutigpu.py
os.environ["ENV_TYPE"] = "deepspeed+mpu"
model_parallel_size = 4
world_size = 4

os.environ["MODEL_PARALLEL_SIZE"] = str(model_parallel_size)
os.environ["WORLD_SIZE"] = str(world_size)

def set_random_seed(seed):
"""Set random seed for reproducability."""
if seed is not None and seed > 0:
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
mpu.model_parallel_cuda_manual_seed(seed)

parser = argparse.ArgumentParser()
parser.add_argument('--local_rank',
                    type=int,
                    default=0,
                    help="local_rank")

ds_args = parser.parse_args()
local_rank = ds_args.local_rank

master_addr = os.environ.get('MASTER_ADDR', '127.0.0.1')
master_port = os.environ.get('MASTER_PORT', '17501')

device = torch.device("cuda", local_rank)

def initialize_distributed():
"""Initialize torch.distributed."""
torch.backends.cudnn.enabled = False
# Manually set the device ids.
torch.cuda.set_device(device)
# Call the init process
init_method = 'tcp://'

init_method += master_addr + ':' + master_port
torch.distributed.init_process_group(
backend='nccl', # gloo
world_size=world_size,
rank=local_rank,
init_method=init_method)
mpu.initialize_model_parallel(model_parallel_size)

initialize_distributed()

set_random_seed(123)

print(f"building model...")
loader = AutoLoader("lm", model_name="opt-30b-en")
model = loader.get_model()
tokenizer = loader.get_tokenizer()
model.half()

model.parallel_output = False
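# (parallel_output=False, set above, is taken here to make the model return
# gathered, full-vocab logits on every rank, the usual Megatron-style
# convention, so the predictor can sample directly)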
model.eval()
model.to(device)

torch.distributed.barrier(group=mpu.get_model_parallel_group())

text = """I think The Old Man and the Sea is a very good book, what do you think? I think """

predictor = Predictor(model, tokenizer)
out = predictor.predict_generate_randomsample(text)
if mpu.get_model_parallel_rank() == 0:
print(f"pred is {out}")
```
### Run script
```commandline
python -m torch.distributed.launch --nproc_per_node=4 --nnodes=1 opt_30b_en_mutigpu.py
```
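
Note that ```--nproc_per_node``` should equal the ```model_parallel_size``` and ```world_size``` set in the script (4 here), so that one process drives each model-parallel partition.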
22 changes: 4 additions & 18 deletions examples/opt/generate_opt_30b.py
@@ -1,27 +1,13 @@
from flagai.model.predictor.predictor import Predictor
from flagai.auto_model.auto_loader import AutoLoader
import torch
import os
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
# loader = AutoLoader(task_name="lm",
# model_name="opt-30b")
# loader.load_pretrain_params("/mnt/models_xingzhaohu/opt_30b")

loader = AutoLoader(task_name="lm",
model_name="opt-30b-en")

from flagai.model.opt_model import OPTModel
from flagai.data.tokenizer.opt.opt_en_tokenizer import OPTTokenizer
print(f"正在构建模型")
model = OPTModel.init_from_json(os.path.join("/mnt/models_xingzhaohu/opt_30b", "config.json"))
tokenizer = OPTTokenizer()
model.load_weights("/mnt/models_xingzhaohu/opt_30b/pytorch_model.bin")



# model = loader.get_model()
# tokenizer = loader.get_tokenizer()
model = loader.get_model()
tokenizer = loader.get_tokenizer()
model.eval()
model.to(device)

text = "The trophy doesn’t fit in the suitcase because "
predictor = Predictor(model, tokenizer)
22 changes: 22 additions & 0 deletions examples/opt/generate_opt_66b.py
@@ -0,0 +1,22 @@
from flagai.model.predictor.predictor import Predictor
from flagai.auto_model.auto_loader import AutoLoader
import torch

loader = AutoLoader(task_name="lm",
model_name="opt-66b-en")

model = loader.get_model()
tokenizer = loader.get_tokenizer()
model.eval()

text = """I think The Old Man and the Sea is a very good book, what do you think? Thank you for your question, I think """

predictor = Predictor(model, tokenizer)
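# top_k/top_p restrict sampling to the most likely tokens, and
# repetition_penalty > 1 discourages repeated tokens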
out = predictor.predict_generate_randomsample(text,
                                              input_max_length=100,
                                              out_max_length=300,
                                              top_k=50,
                                              top_p=0.9,
                                              repetition_penalty=3.0)

print(f"input is {text} \n out is {out}")
15 changes: 9 additions & 6 deletions examples/opt/opt_30b_en_mutigpu.py
@@ -1,4 +1,4 @@
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,2"

import torch
import os
import argparse
@@ -7,8 +7,9 @@
import random
import numpy as np
from flagai.model.predictor.predictor import Predictor
import glob
import time

# run script : python -m torch.distributed.launch --nproc_per_node=2 --nnodes=1 glm_blank_filling_QA_ch_mutigpu.py
os.environ["ENV_TYPE"] = "deepspeed+mpu"
model_parallel_size = 4
world_size = 4
@@ -58,11 +59,14 @@ def initialize_distributed():

set_random_seed(123)

loader = AutoLoader("lm", model_name="opt-350m-en")
print(f"building model...")
loader = AutoLoader("lm", model_name="opt-30b-en")
model = loader.get_model()
model.half()
tokenizer = loader.get_tokenizer()
# model.parallel_output = False
model.half()

model.parallel_output = False

model.eval()
model.to(device)

@@ -75,4 +79,3 @@ def initialize_distributed():
if mpu.get_model_parallel_rank() == 0:
print(f"pred is {out}")


108 changes: 108 additions & 0 deletions examples/opt/opt_66b_en_mutigpu.py
@@ -0,0 +1,108 @@
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,2"
import torch
import os
import time
os.environ["ENV_TYPE"] = "deepspeed+mpu"
os.environ["MODEL_PARALLEL_SIZE"] = '8'
os.environ["WORLD_SIZE"] = '8'
import argparse
from flagai import mpu
import random
import numpy as np
from flagai.model.predictor.predictor import Predictor
from flagai.model.opt_model import OPTModel
from flagai.data.tokenizer import OPTTokenizer
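
# The helpers below coordinate staggered checkpoint loading across the ranks
# on this node through two small files in the working directory:
# 'current_rank' records which rank should load next, and 'current_pool'
# caps how many ranks may read their weight shard from disk at the same time.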

def get_current_rank():
    with open('current_rank','r',encoding='utf8') as infile:
        line = infile.readline().strip()
        return int(line)

def set_current_rank(rank):
    with open('current_rank','w',encoding='utf8') as outfile:
        outfile.write(str(rank))

def get_current_pool():
    with open('current_pool','r',encoding='utf8') as infile:
        line = infile.readline().strip()
        return int(line)

def set_current_pool(rank):
    with open('current_pool','w',encoding='utf8') as outfile:
        outfile.write(str(rank))

# run script : python -m torch.distributed.launch --nproc_per_node=2 --nnodes=1 opt_66b_en_mutigpu.py
parser = argparse.ArgumentParser()
parser.add_argument('--local_rank',
                    type=int,
                    default=0,
                    help="local_rank")

def set_random_seed(seed):
"""Set random seed for reproducability."""
if seed is not None and seed > 0:
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
mpu.model_parallel_cuda_manual_seed(seed)

ds_args = parser.parse_args()
local_rank = ds_args.local_rank

master_addr = os.environ.get('MASTER_ADDR', '127.0.0.1')
master_port = os.environ.get('MASTER_PORT', '17501')

device = torch.device("cuda", local_rank)
model_parallel_size = 8
world_size = 8

def initialize_distributed():
"""Initialize torch.distributed."""
torch.backends.cudnn.enabled = False
# Manually set the device ids.
torch.cuda.set_device(device)
# Call the init process
init_method = 'tcp://'

init_method += master_addr + ':' + master_port
torch.distributed.init_process_group(
backend='nccl', # gloo
world_size=world_size,
rank=local_rank,
init_method=init_method)
mpu.initialize_model_parallel(model_parallel_size)

initialize_distributed()

set_current_pool(4)
set_current_rank(0)
set_random_seed(123)
torch.distributed.barrier(group=mpu.get_model_parallel_group())
tokenizer = OPTTokenizer()
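
# Wait for this rank's turn, then take one slot from the pool so that only a
# limited number of ranks read their checkpoint shard concurrently.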

while get_current_rank() != local_rank:
    time.sleep(10)
while get_current_pool() == 0:
    time.sleep(10)
set_current_pool(get_current_pool()-1)
print("loading rank {}".format(local_rank))
set_current_rank(local_rank + 1)

model = OPTModel.init_from_json('/mnt/models_xingzhaohu/opt-66b-en/config.json')
checkpoint_path = '/mnt/models_xingzhaohu/opt-66b-en/pytorch_model_{:02d}.bin'.format(local_rank)
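# each of the 8 model-parallel ranks will load only its own weight shard
# (pytorch_model_00.bin ... pytorch_model_07.bin) below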
model.half()
model.eval()
model.to(device)
model.load_weights(checkpoint_path)

print("loading rank {} finished".format(local_rank))
set_current_pool(get_current_pool()+1)
print('current rank setting is {}'.format(get_current_pool()))

torch.distributed.barrier(group=mpu.get_model_parallel_group())
text = """I think The Old Man and the Sea is a very good book, what do you think? I think """

predictor = Predictor(model, tokenizer)
out = predictor.predict_generate_randomsample(text)
if mpu.get_model_parallel_rank() == 0:
    print(f"pred is {out}")
