Dev #5

Merged 2 commits on Jun 9, 2023
16 changes: 11 additions & 5 deletions convert_gptneox_from_hf.py
@@ -12,13 +12,19 @@
                     help='model-name')
 parser.add_argument('--save-dir', type=str, default=DIR,
                     help='model-name')
+parser.add_argument('--save-path', type=str, default=None,
+                    help='model-name')
 args = parser.parse_args()

-if not os.path.exists(args.save_dir):
-    os.mkdir(args.save_dir)
-save_path = os.path.join(args.save_dir, args.model_name.replace('/', '_'))
-if not os.path.exists(save_path):
-    os.mkdir(save_path)
+if args.save_path is None:
+    if not os.path.exists(args.save_dir):
+        os.mkdir(args.save_dir)
+    save_path = os.path.join(args.save_dir, args.model_name.replace('/', '_'))
+    if not os.path.exists(save_path):
+        os.mkdir(save_path)
+else:
+    save_path = args.save_path

 config = AutoConfig.from_pretrained(args.model_name)
 config.save_pretrained(save_path)
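A usage sketch of the new flag (the model name and output path below are only illustrative; the --save-path and --save-dir options themselves come from the diff above):

# Write the converted checkpoint to an explicit directory via the new --save-path flag
python convert_gptneox_from_hf.py --model-name EleutherAI/gpt-neox-20b --save-path ./pretrained_models/gpt-neox-20b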
17 changes: 11 additions & 6 deletions convert_llama_from_hf.py
@@ -8,17 +8,22 @@

 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Convert HF checkpoints')
-    parser.add_argument('--model-name', type=str, default='EleutherAI/gpt-neox-20b',
+    parser.add_argument('--model-name', type=str, default='huggyllama/llama-7b',
                         help='model-name')
     parser.add_argument('--save-dir', type=str, default=DIR,
                         help='model-name')
+    parser.add_argument('--save-path', type=str, default=None,
+                        help='model-name')
     args = parser.parse_args()

-    if not os.path.exists(args.save_dir):
-        os.mkdir(args.save_dir)
-    save_path = os.path.join(args.save_dir, args.model_name.replace('/', '_'))
-    if not os.path.exists(save_path):
-        os.mkdir(save_path)
+    if args.save_path is None:
+        if not os.path.exists(args.save_dir):
+            os.mkdir(args.save_dir)
+        save_path = os.path.join(args.save_dir, args.model_name.replace('/', '_'))
+        if not os.path.exists(save_path):
+            os.mkdir(save_path)
+    else:
+        save_path = args.save_path

     config = LlamaConfig.from_pretrained(args.model_name)
     config.save_pretrained(save_path)
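The LLaMA converter gains the same option. A hedged sketch of both modes (paths are illustrative):

# Without --save-path, the previous behaviour is kept: output goes to
# <save-dir>/<model name with '/' replaced by '_'>, e.g. ./pretrained_models/huggyllama_llama-7b
python convert_llama_from_hf.py --model-name huggyllama/llama-7b --save-dir ./pretrained_models

# With --save-path, the given directory is used as-is
python convert_llama_from_hf.py --model-name huggyllama/llama-7b --save-path ./llama-7b-converted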
2 changes: 1 addition & 1 deletion convert_opt_checkpoint.py → convert_opt_from_hf.py
@@ -11,7 +11,7 @@
 parser = argparse.ArgumentParser(description='Convert HF checkpoints')
 parser.add_argument('--model-name', type=str, default='facebook/opt-1.3b',
                     help='model-name')
-parser.add_argument('--save-path', type=str, default='./pretrained_models',
+parser.add_argument('--save-path', type=str, default=None,
                     help='model-name')
 args = parser.parse_args()

52 changes: 52 additions & 0 deletions example_scripts/finetune_allreduce_llama7b.sh
@@ -0,0 +1,52 @@

export WANDB_NAME=llama-7b-instruct-single-node

netif=lo
master_ip=127.0.0.1
export GLOO_SOCKET_IFNAME=${netif}
export NCCL_SOCKET_IFNAME=${netif}
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
export QUANT_BITS=4
export TOPK_RATIO=0.2
export RANDOMP_RATIO=0.1
export SHOW_DATA=0

ARGS="--model-name /root/fm/models/llama-7b-shard \
--tokenizer-name /root/fm/models/llama-7b-shard \
--project-name demo \
--model-type flash_llama \
--optimizer adam \
--seed 42 \
--load-pretrained-model true \
--task-name ni_dehelm:0.2,p3_dehelm:0.2,pile:0.6 \
--checkpoint-path ./model_ckpts/$WANDB_NAME \
--num-layers 8 --embedding-dim 4096 \
--total-steps 10000 --warmup-steps 10 --train-warmup-steps 0 \
--checkpoint-steps 1000 \
--lr 1e-5 --seq-length 2048 --batch-size 32 --micro-batch-size 2 --gradient-accumulate-step 1 \
--dist-url tcp://${master_ip}:7033 \
--world-size 8 --pipeline-group-size 4 --data-group-size 2 \
--job-id 0 --net-interface ${netif} \
--fp16 \
--dp-backend nccl \
--dp-mode allreduce \
--pp-mode gpipe --profiling no-profiling"

(trap 'kill 0' SIGINT; \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \
& \
wait)
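Since each process's rank equals its local CUDA id on a single node, the launch block above could equivalently be written as a loop (a sketch, not part of the original script):

(trap 'kill 0' SIGINT; \
for i in 0 1 2 3 4 5 6 7; do \
python dist_lm_train.py ${ARGS} --cuda-id $i --rank $i & \
done; \
wait)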
52 changes: 52 additions & 0 deletions example_scripts/finetune_allreduce_llama7b_node0.sh
@@ -0,0 +1,52 @@

export WANDB_NAME=llama-7b-instruct-two-nodes

netif=enp19s0
master_ip=172.27.6.23
export GLOO_SOCKET_IFNAME=${netif}
export NCCL_SOCKET_IFNAME=${netif}
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
export QUANT_BITS=4
export TOPK_RATIO=0.2
export RANDOMP_RATIO=0.1
export SHOW_DATA=0

ARGS="--model-name ./llama-7b-shard \
--tokenizer-name ./llama-7b-shard \
--project-name demo \
--model-type flash_llama \
--optimizer adam \
--seed 42 \
--load-pretrained-model true \
--task-name ni_dehelm:0.2,p3_dehelm:0.2,pile:0.6 \
--checkpoint-path ./model_ckpts/$WANDB_NAME \
--num-layers 8 --embedding-dim 4096 \
--total-steps 10000 --warmup-steps 10 --train-warmup-steps 0 \
--checkpoint-steps 1000 \
--lr 2e-5 --seq-length 2048 --batch-size 32 --micro-batch-size 2 --gradient-accumulate-step 1 \
--dist-url tcp://${master_ip}:7033 \
--world-size 16 --pipeline-group-size 4 --data-group-size 4 \
--job-id 0 --net-interface ${netif} \
--fp16 \
--dp-backend nccl \
--dp-mode allreduce \
--pp-mode gpipe --profiling no-profiling"

(trap 'kill 0' SIGINT; \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 0 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 1 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 2 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 3 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 4 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 5 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 6 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 7 \
& \
wait)
52 changes: 52 additions & 0 deletions example_scripts/finetune_allreduce_llama7b_node1.sh
@@ -0,0 +1,52 @@

export WANDB_NAME=llama-7b-instruct-two-nodes

netif=enp19s0
master_ip=172.27.6.23
export GLOO_SOCKET_IFNAME=${netif}
export NCCL_SOCKET_IFNAME=${netif}
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
export QUANT_BITS=4
export TOPK_RATIO=0.2
export RANDOMP_RATIO=0.1
export SHOW_DATA=0

ARGS="--model-name ./llama-7b-shard \
--tokenizer-name ./llama-7b-shard \
--project-name demo \
--model-type flash_llama \
--optimizer adam \
--seed 42 \
--load-pretrained-model true \
--task-name ni_dehelm:0.2,p3_dehelm:0.2,pile:0.6 \
--checkpoint-path ./model_ckpts/$WANDB_NAME \
--num-layers 8 --embedding-dim 4096 \
--total-steps 10000 --warmup-steps 10 --train-warmup-steps 0 \
--checkpoint-steps 1000 \
--lr 2e-5 --seq-length 2048 --batch-size 32 --micro-batch-size 2 --gradient-accumulate-step 1 \
--dist-url tcp://${master_ip}:7033 \
--world-size 16 --pipeline-group-size 4 --data-group-size 4 \
--job-id 0 --net-interface ${netif} \
--fp16 \
--dp-backend nccl \
--dp-mode allreduce \
--pp-mode gpipe --profiling no-profiling"

(trap 'kill 0' SIGINT; \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 0 --rank 8 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 1 --rank 9 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 2 --rank 10 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 3 --rank 11 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 4 --rank 12 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 5 --rank 13 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 6 --rank 14 \
& \
python dist_lm_train.py $(echo ${ARGS}) --cuda-id 7 --rank 15 \
& \
wait)
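On the second node, each global rank is the local CUDA id plus 8 (ranks 8-15 of the 16-process world). An equivalent loop form, where RANK_OFFSET is an illustrative variable that does not appear in the original script:

(trap 'kill 0' SIGINT; \
RANK_OFFSET=8; \
for i in 0 1 2 3 4 5 6 7; do \
python dist_lm_train.py ${ARGS} --cuda-id $i --rank $((i + RANK_OFFSET)) & \
done; \
wait)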
44 changes: 44 additions & 0 deletions setup_env.sh
@@ -0,0 +1,44 @@
#!/bin/bash

# Abort immediately if any command fails.
set -e

wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
sudo sh cuda_11.8.0_520.61.05_linux.run --silent --toolkit

export CUDA_HOME=/usr/local/cuda-11.8

mamba create -n cocktail python=3.10
mamba activate cocktail

mamba install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -y
mamba install -c conda-forge cupy nccl cudatoolkit=11.8 -y

pip install --upgrade pip
pip install --no-input transformers
pip install --no-input datasets
pip install --no-input netifaces
pip install --no-input zstandard
pip install --no-input wandb

rm -rf flash-attention
git clone https://github.com/HazyResearch/flash-attention.git
cd flash-attention
git checkout tags/v1.0.4
pip install .
cd ..

# Build the rotary and cross-entropy CUDA extensions bundled with FlashAttention,
# returning to the repository root after each build
cd flash-attention/csrc/rotary && pip install . && cd ../../..
cd flash-attention/csrc/xentropy && pip install . && cd ../../..

git clone https://github.com/NVIDIA/apex
cd apex
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" ./
cd ..

git clone https://github.com/facebookresearch/xformers.git
cd xformers
git submodule update --init --recursive
pip install .
cd ..
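A quick sanity check once the script finishes (a sketch; it only verifies that the compiled packages import and a GPU is visible):

python -c "import torch, flash_attn, apex, xformers; print(torch.cuda.is_available())"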