| author | Georgi Gerganov <ggerganov@gmail.com> | 2023-03-11 10:47:09 +0200 |
|---|---|---|
| committer | Georgi Gerganov <ggerganov@gmail.com> | 2023-03-11 11:28:30 +0200 |
| commit | 007a8f6f459c6eb56678fdee4c09219ddb85b640 (patch) | |
| tree | 9fc8e018b0acab10f4728f45df7a7527fd0d40cc /convert-pth-to-ggml.py | |
| parent | 5f2f970d51a04b783799bc92fd1d006408269f26 (diff) | |
Support all LLaMA models + change Q4_0 quantization storage
Diffstat (limited to 'convert-pth-to-ggml.py')
| -rw-r--r-- | convert-pth-to-ggml.py | 172 |
1 file changed, 99 insertions, 73 deletions
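
The change derives the number of checkpoint parts from the model's embedding dimension and writes one ggml file per part, suffixing every part after the first. A minimal standalone sketch of that mapping and of the resulting file names (it mirrors the logic in the diff below; the 7B/13B/30B/65B size labels and the example paths are assumptions, not taken from this commit):

```python
# Sketch: how the conversion maps the "dim" hyperparameter to the number of
# consolidated.0N.pth parts, and which output files the per-part loop produces.
# Mirrors the logic in the diff below; the size labels are assumptions.

def get_n_parts(dim: int) -> int:
    # dim -> number of checkpoint parts (4096: 7B, 5120: 13B, 6656: 30B, 8192: 65B)
    parts = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
    if dim not in parts:
        raise ValueError("Invalid dim: " + str(dim))
    return parts[dim]

def output_files(dir_model: str, ftype_str: str, dim: int) -> list[str]:
    # The first part keeps the plain name; later parts get a ".1", ".2", ... suffix.
    base = dir_model + "/ggml-model-" + ftype_str + ".bin"
    return [base if p == 0 else base + "." + str(p) for p in range(get_n_parts(dim))]

print(output_files("models/30B", "f16", 6656))
# ['models/30B/ggml-model-f16.bin', 'models/30B/ggml-model-f16.bin.1',
#  'models/30B/ggml-model-f16.bin.2', 'models/30B/ggml-model-f16.bin.3']
```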
```diff
diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
index bd0a9d0..fc217c7 100644
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -33,12 +33,23 @@ if len(sys.argv) < 3:
 # output in the same directory as the model
 dir_model = sys.argv[1]
-fname_out = sys.argv[1] + "/ggml-model.bin"
 
 fname_hparams = sys.argv[1] + "/params.json"
-fname_model = sys.argv[1] + "/consolidated.00.pth"
 fname_tokenizer = sys.argv[1] + "/../tokenizer.model"
 
+def get_n_parts(dim):
+    if dim == 4096:
+        return 1
+    elif dim == 5120:
+        return 2
+    elif dim == 6656:
+        return 4
+    elif dim == 8192:
+        return 8
+    else:
+        print("Invalid dim: " + str(dim))
+        sys.exit(1)
+
 # possible data types
 # ftype == 0 -> float32
 # ftype == 1 -> float16
@@ -61,76 +72,91 @@ tokenizer = SentencePieceProcessor(fname_tokenizer)
 
 hparams.update({"vocab_size": tokenizer.vocab_size()})
 
+n_parts = get_n_parts(hparams["dim"])
+
 print(hparams)
+print('n_parts = ', n_parts)
 
-model = torch.load(fname_model, map_location="cpu")
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["vocab_size"]))
-fout.write(struct.pack("i", hparams["dim"]))
-fout.write(struct.pack("i", hparams["multiple_of"]))
-fout.write(struct.pack("i", hparams["n_heads"]))
-fout.write(struct.pack("i", hparams["n_layers"]))
-fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete)
-fout.write(struct.pack("i", ftype))
-
-# Is this correct??
-for i in range(32000):
-    # TODO: this is probably wrong - not sure how this tokenizer works
-    text = tokenizer.decode([29889, i]).encode('utf-8')
-    # remove the first byte (it's always '.')
-    text = text[1:]
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-
-for k, v in model.items():
-    name = k
-    shape = v.shape
-
-    # skip layers.X.attention.inner_attention.rope.freqs
-    if name[-5:] == "freqs":
-        continue
-
-    print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
-
-    #data = tf.train.load_variable(dir_model, name).squeeze()
-    data = v.numpy().squeeze()
-    n_dims = len(data.shape);
-
-    # for efficiency - transpose some matrices
-    # "model/h.*/attn/c_attn/w"
-    # "model/h.*/attn/c_proj/w"
-    # "model/h.*/mlp/c_fc/w"
-    # "model/h.*/mlp/c_proj/w"
-    #if name[-14:] == "/attn/c_attn/w" or \
-    #   name[-14:] == "/attn/c_proj/w" or \
-    #   name[-11:] == "/mlp/c_fc/w" or \
-    #   name[-13:] == "/mlp/c_proj/w":
-    #    print("  Transposing")
-    #    data = data.transpose()
-
-    dshape = data.shape
-
-    # default type is fp16
-    ftype_cur = 1
-    if ftype == 0 or n_dims == 1:
-        print("  Converting to float32")
-        data = data.astype(np.float32)
-        ftype_cur = 0
-
-    # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
-    fout.write(str);
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")
+for p in range(n_parts):
+    print('Processing part ', p)
+
+    #fname_model = sys.argv[1] + "/consolidated.00.pth"
+    fname_model = sys.argv[1] + "/consolidated.0" + str(p) + ".pth"
+    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
+    if (p > 0):
+        fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + "." + str(p)
+
+    model = torch.load(fname_model, map_location="cpu")
+
+    fout = open(fname_out, "wb")
+
+    fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
+    fout.write(struct.pack("i", hparams["vocab_size"]))
+    fout.write(struct.pack("i", hparams["dim"]))
+    fout.write(struct.pack("i", hparams["multiple_of"]))
+    fout.write(struct.pack("i", hparams["n_heads"]))
+    fout.write(struct.pack("i", hparams["n_layers"]))
+    fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete)
+    fout.write(struct.pack("i", ftype))
+
+    # Is this correct??
+    for i in range(32000):
+        # TODO: this is probably wrong - not sure how this tokenizer works
+        text = tokenizer.decode([29889, i]).encode('utf-8')
+        # remove the first byte (it's always '.')
+        text = text[1:]
+        fout.write(struct.pack("i", len(text)))
+        fout.write(text)
+
+    for k, v in model.items():
+        name = k
+        shape = v.shape
+
+        # skip layers.X.attention.inner_attention.rope.freqs
+        if name[-5:] == "freqs":
+            continue
+
+        print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
+
+        #data = tf.train.load_variable(dir_model, name).squeeze()
+        data = v.numpy().squeeze()
+        n_dims = len(data.shape);
+
+        # for efficiency - transpose some matrices
+        # "model/h.*/attn/c_attn/w"
+        # "model/h.*/attn/c_proj/w"
+        # "model/h.*/mlp/c_fc/w"
+        # "model/h.*/mlp/c_proj/w"
+        #if name[-14:] == "/attn/c_attn/w" or \
+        #   name[-14:] == "/attn/c_proj/w" or \
+        #   name[-11:] == "/mlp/c_fc/w" or \
+        #   name[-13:] == "/mlp/c_proj/w":
+        #    print("  Transposing")
+        #    data = data.transpose()
+
+        dshape = data.shape
+
+        # default type is fp16
+        ftype_cur = 1
+        if ftype == 0 or n_dims == 1:
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+
+        # header
+        sname = name.encode('utf-8')
+        fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
+        for i in range(n_dims):
+            fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
+        fout.write(sname);
+
+        # data
+        data.tofile(fout)
+
+    # I hope this deallocates the memory ..
+    model = None
+
+    fout.close()
+
+    print("Done. Output file: " + fname_out + ", (part ", p, ")")
+    print("")
```
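
Every part file begins with the same header the conversion loop writes: the 0x67676d6c magic followed by seven native-order 32-bit integers. A small sketch, assuming a file produced by this version of the script, for reading that header back and sanity-checking a converted part (the field order follows the struct.pack calls in the diff above):

```python
# Sketch: read back the fixed-size header that convert-pth-to-ggml.py writes at
# the start of every part file, and verify the magic. Assumes the layout shown
# in the diff above (eight native-order 32-bit ints, magic first).
import struct
import sys

def read_ggml_header(path: str) -> dict:
    with open(path, "rb") as f:
        magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = \
            struct.unpack("8i", f.read(8 * 4))
    if magic != 0x67676d6c:
        raise ValueError("not a ggml model file: bad magic")
    return {
        "vocab_size": vocab_size, "dim": dim, "multiple_of": multiple_of,
        "n_heads": n_heads, "n_layers": n_layers, "rot": rot, "ftype": ftype,
    }

if __name__ == "__main__":
    print(read_ggml_header(sys.argv[1]))
```

For a multi-part model, running this over ggml-model-f16.bin and ggml-model-f16.bin.1 should report identical hyperparameters, since the loop writes the same header from hparams into every part.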