Dev #5

Merged 2 commits on Jun 9, 2023
16 changes: 11 additions & 5 deletions convert_gptneox_from_hf.py
@@ -12,13 +12,19 @@
                     help='model-name')
 parser.add_argument('--save-dir', type=str, default=DIR,
                     help='model-name')
+parser.add_argument('--save-path', type=str, default=None,
+                    help='model-name')
 args = parser.parse_args()

-if not os.path.exists(args.save_dir):
-    os.mkdir(args.save_dir)
-save_path = os.path.join(args.save_dir, args.model_name.replace('/', '_'))
-if not os.path.exists(save_path):
-    os.mkdir(save_path)
+if args.save_path is None:
+    if not os.path.exists(args.save_dir):
+        os.mkdir(args.save_dir)
+    save_path = os.path.join(args.save_dir, args.model_name.replace('/', '_'))
+    if not os.path.exists(save_path):
+        os.mkdir(save_path)
+else:
+    save_path = args.save_path

 config = AutoConfig.from_pretrained(args.model_name)
 config.save_pretrained(save_path)
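A usage sketch of the new flag (the model name and output path below are only illustrative; the --save-path and --save-dir options themselves come from the diff above):

# Write the converted checkpoint to an explicit directory via the new --save-path flag
python convert_gptneox_from_hf.py --model-name EleutherAI/gpt-neox-20b --save-path ./pretrained_models/gpt-neox-20b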
17 changes: 11 additions & 6 deletions convert_llama_from_hf.py
@@ -8,17 +8,22 @@

 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Convert HF checkpoints')
-    parser.add_argument('--model-name', type=str, default='EleutherAI/gpt-neox-20b',
+    parser.add_argument('--model-name', type=str, default='huggyllama/llama-7b',
                         help='model-name')
     parser.add_argument('--save-dir', type=str, default=DIR,
                         help='model-name')
+    parser.add_argument('--save-path', type=str, default=None,
+                        help='model-name')
     args = parser.parse_args()

-    if not os.path.exists(args.save_dir):
-        os.mkdir(args.save_dir)
-    save_path = os.path.join(args.save_dir, args.model_name.replace('/', '_'))
-    if not os.path.exists(save_path):
-        os.mkdir(save_path)
+    if args.save_path is None:
+        if not os.path.exists(args.save_dir):
+            os.mkdir(args.save_dir)
+        save_path = os.path.join(args.save_dir, args.model_name.replace('/', '_'))
+        if not os.path.exists(save_path):
+            os.mkdir(save_path)
+    else:
+        save_path = args.save_path

     config = LlamaConfig.from_pretrained(args.model_name)
     config.save_pretrained(save_path)
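The LLaMA converter gains the same option. A hedged sketch of both modes (paths are illustrative):

# Without --save-path, the previous behaviour is kept: output goes to
# <save-dir>/<model name with '/' replaced by '_'>, e.g. ./pretrained_models/huggyllama_llama-7b
python convert_llama_from_hf.py --model-name huggyllama/llama-7b --save-dir ./pretrained_models

# With --save-path, the given directory is used as-is
python convert_llama_from_hf.py --model-name huggyllama/llama-7b --save-path ./llama-7b-converted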
2 changes: 1 addition & 1 deletion convert_opt_checkpoint.py → convert_opt_from_hf.py
@@ -11,7 +11,7 @@
 parser = argparse.ArgumentParser(description='Convert HF checkpoints')
 parser.add_argument('--model-name', type=str, default='facebook/opt-1.3b',
                     help='model-name')
-parser.add_argument('--save-path', type=str, default='./pretrained_models',
+parser.add_argument('--save-path', type=str, default=None,
                     help='model-name')
 args = parser.parse_args()

52 changes: 52 additions & 0 deletions example_scripts/finetune_allreduce_llama7b.sh
@@ -0,0 +1,52 @@

export WANDB_NAME=llama-7b-instruct-single-node

netif=lo
master_ip=127.0.0.1
export GLOO_SOCKET_IFNAME=${netif}
export NCCL_SOCKET_IFNAME=${netif}
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
export QUANT_BITS=4
export TOPK_RATIO=0.2
export RANDOMP_RATIO=0.1
export SHOW_DATA=0

ARGS="--model-name /root/fm/models/llama-7b-shard \
--tokenizer-name /root/fm/models/llama-7b-shard \
--project-name demo \
--model-type flash_llama \
--optimizer adam \
--seed 42 \
--load-pretrained-model true \
--task-name ni_dehelm:0.2,p3_dehelm:0.2,pile:0.6 \
--checkpoint-path ./model_ckpts/$WANDB_NAME \
--num-layers 8 --embedding-dim 4096 \
--total-steps 10000 --warmup-steps 10 --train-warmup-steps 0 \
--checkpoint-steps 1000 \
--lr 1e-5 --seq-length 2048 --batch-size 32 --micro-batch-size 2 --gradient-accumulate-step 1 \
--dist-url tcp://${master_ip}:7033 \
--world-size 8 --pipeline-group-size 4 --data-group-size 2 \
--job-id 0 --net-interface ${netif} \
--fp16 \
--dp-backend nccl \
--dp-mode allreduce \
--pp-mode gpipe --profiling no-profiling"

(trap 'kill 0' SIGINT; \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \
& \
wait)
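Since each process's rank equals its local CUDA id on a single node, the launch block above could equivalently be written as a loop (a sketch, not part of the original script):

(trap 'kill 0' SIGINT; \
for i in 0 1 2 3 4 5 6 7; do \
python dist_lm_train.py ${ARGS} --cuda-id $i --rank $i & \
done; \
wait)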
52 changes: 52 additions & 0 deletions example_scripts/finetune_allreduce_llama7b_node0.sh
@@ -0,0 +1,52 @@

export WANDB_NAME=llama-7b-instruct-two-nodes

netif=enp19s0
master_ip=172.27.6.23
export GLOO_SOCKET_IFNAME=${netif}
export NCCL_SOCKET_IFNAME=${netif}
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
export QUANT_BITS=4
export TOPK_RATIO=0.2
export RANDOMP_RATIO=0.1
export SHOW_DATA=0

ARGS="--model-name ./llama-7b-shard \
--tokenizer-name ./llama-7b-shard \
--project-name demo \
--model-type flash_llama \
--optimizer adam \
--seed 42 \
--load-pretrained-model true \
--task-name ni_dehelm:0.2,p3_dehelm:0.2,pile:0.6 \
--checkpoint-path ./model_ckpts/$WANDB_NAME \
--num-layers 8 --embedding-dim 4096 \
--total-steps 10000 --warmup-steps 10 --train-warmup-steps 0 \
--checkpoint-steps 1000 \
--lr 2e-5 --seq-length 2048 --batch-size 32 --micro-batch-size 2 --gradient-accumulate-step 1 \
--dist-url tcp://${master_ip}:7033 \
--world-size 16 --pipeline-group-size 4 --data-group-size 4 \
--job-id 0 --net-interface ${netif} \
--fp16 \
--dp-backend nccl \
--dp-mode allreduce \
--pp-mode gpipe --profiling no-profiling"

(trap 'kill 0' SIGINT; \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \
& \
wait)
52 changes: 52 additions & 0 deletions example_scripts/finetune_allreduce_llama7b_node1.sh
@@ -0,0 +1,52 @@

export WANDB_NAME=llama-7b-instruct-two-nodes

netif=enp19s0
master_ip=172.27.6.23
export GLOO_SOCKET_IFNAME=${netif}
export NCCL_SOCKET_IFNAME=${netif}
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
export QUANT_BITS=4
export TOPK_RATIO=0.2
export RANDOMP_RATIO=0.1
export SHOW_DATA=0

ARGS="--model-name ./llama-7b-shard \
--tokenizer-name ./llama-7b-shard \
--project-name demo \
--model-type flash_llama \
--optimizer adam \
--seed 42 \
--load-pretrained-model true \
--task-name ni_dehelm:0.2,p3_dehelm:0.2,pile:0.6 \
--checkpoint-path ./model_ckpts/$WANDB_NAME \
--num-layers 8 --embedding-dim 4096 \
--total-steps 10000 --warmup-steps 10 --train-warmup-steps 0 \
--checkpoint-steps 1000 \
--lr 2e-5 --seq-length 2048 --batch-size 32 --micro-batch-size 2 --gradient-accumulate-step 1 \
--dist-url tcp://${master_ip}:7033 \
--world-size 16 --pipeline-group-size 4 --data-group-size 4 \
--job-id 0 --net-interface ${netif} \
--fp16 \
--dp-backend nccl \
--dp-mode allreduce \
--pp-mode gpipe --profiling no-profiling"

(trap 'kill 0' SIGINT; \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 8 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 9 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 10 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 11 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 12 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 13 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 14 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 15 \
& \
wait)
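On the second node, each global rank is the local CUDA id plus 8 (ranks 8-15 of the 16-process world). An equivalent loop form, where RANK_OFFSET is an illustrative variable that does not appear in the original script:

(trap 'kill 0' SIGINT; \
RANK_OFFSET=8; \
for i in 0 1 2 3 4 5 6 7; do \
python dist_lm_train.py ${ARGS} --cuda-id $i --rank $((i + RANK_OFFSET)) & \
done; \
wait)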
44 changes: 44 additions & 0 deletions setup_env.sh
@@ -0,0 +1,44 @@
#!/bin/bash

# Abort immediately if any command fails.
set -e

wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
sudo sh cuda_11.8.0_520.61.05_linux.run --silent --toolkit

export CUDA_HOME=/usr/local/cuda-11.8

mamba create -n cocktail python=3.10
mamba activate cocktail

mamba install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -y
mamba install -c conda-forge cupy nccl cudatoolkit=11.8 -y

pip install --upgrade pip
pip install --no-input transformers
pip install --no-input datasets
pip install --no-input netifaces
pip install --no-input zstandard
pip install --no-input wandb

rm -rf flash-attention
git clone https://github.com/HazyResearch/flash-attention.git
cd flash-attention
git checkout tags/v1.0.4
pip install .
cd ..

# Build the rotary and cross-entropy CUDA extensions bundled with FlashAttention,
# returning to the repository root after each build
cd flash-attention/csrc/rotary && pip install . && cd ../../..
cd flash-attention/csrc/xentropy && pip install . && cd ../../..

git clone https://github.com/NVIDIA/apex
cd apex
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" ./
cd ..

git clone https://github.com/facebookresearch/xformers.git
cd xformers
git submodule update --init --recursive
pip install .
cd ..
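A quick sanity check once the script finishes (a sketch; it only verifies that the compiled packages import and a GPU is visible):

python -c "import torch, flash_attn, apex, xformers; print(torch.cuda.is_available())"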