models : change convert-pt-to-ggml to use .tiktoken tokenizer files (g…
ivan-gorin authored Apr 14, 2023
1 parent 6112887 commit 62b51c3
18 changes: 7 additions & 11 deletions models/convert-pt-to-ggml.py
@@ -39,6 +39,7 @@
 import code
 import torch
 import numpy as np
+import base64
 
 #from transformers import GPTJForCausalLM
 #from transformers import GPT2TokenizerFast
@@ -224,18 +225,14 @@ def bytes_to_unicode():
 #code.interact(local=locals())
 
 multilingual = hparams["n_vocab"] == 51865
-dir_tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual" or "gpt2")
-
-#tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2")
-#print(tokenizer)
-#print(tokenizer.name_or_path)
-#print(len(tokenizer.additional_special_tokens))
+tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
 
 # output in the same directory as the model
 fname_out = dir_out + "/ggml-model.bin"
 
-with open(dir_tokenizer + "/vocab.json", "r", encoding="utf8") as f:
-    tokens = json.load(f)
+with open(tokenizer, "rb") as f:
+    contents = f.read()
+    tokens = {base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line)}
 
 # use 16-bit or 32-bit floats
 use_f16 = True
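
For reference, each line of a .tiktoken vocabulary file is a base64-encoded token followed by its integer rank, separated by whitespace, which is exactly what the new dict comprehension parses. A minimal sketch of that round trip (the token b"ab" and rank 464 are made-up illustration values, not entries from the real files):

    import base64

    # One line of a .tiktoken file has the form b"<base64 token> <rank>".
    line = base64.b64encode(b"ab") + b" 464"

    # Same parsing as the new code: split, decode the token, parse the rank.
    token_b64, rank = line.split()
    token = base64.b64decode(token_b64)  # -> b"ab" (raw token bytes)
    rank = int(rank)                     # -> 464

    assert (token, rank) == (b"ab", 464)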
@@ -271,9 +268,8 @@ def bytes_to_unicode():
 fout.write(struct.pack("i", len(tokens)))
 
 for key in tokens:
-    text = bytearray([byte_decoder[c] for c in key])
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
+    fout.write(struct.pack("i", len(key)))
+    fout.write(key)
 
 for name in list_vars.keys():
     data = list_vars[name].squeeze().numpy()
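
Since the tokens are now raw bytes, the writer no longer needs the byte_decoder mapping; it emits each token as a native-endian int32 length followed by the token bytes. A hypothetical reader for that vocab section (read_vocab is an illustrative helper, not part of whisper.cpp, and assumes the file offset already sits past the header fields written earlier):

    import struct

    def read_vocab(f, n_vocab):
        # Mirror of the writer: int32 length, then that many token bytes.
        tokens = []
        for _ in range(n_vocab):
            (length,) = struct.unpack("i", f.read(4))
            tokens.append(f.read(length))
        return tokens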
