author    Georgi Gerganov <ggerganov@gmail.com>  2023-03-11 10:47:09 +0200
committer Georgi Gerganov <ggerganov@gmail.com>  2023-03-11 11:28:30 +0200
commit    007a8f6f459c6eb56678fdee4c09219ddb85b640 (patch)
tree      9fc8e018b0acab10f4728f45df7a7527fd0d40cc /convert-pth-to-ggml.py
parent    5f2f970d51a04b783799bc92fd1d006408269f26 (diff)
Support all LLaMA models + change Q4_0 quantization storage
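The crux of the change: instead of converting only consolidated.00.pth, the converter now derives the number of checkpoint parts from the model dimension (4096, 5120, 6656, 8192, i.e. the 7B/13B/30B/65B LLaMA models) and writes one ggml output file per part. A minimal sketch of that logic, pulled out of the diff below; get_n_parts mirrors the function added in the diff, while part_filenames is a hypothetical helper used here only for illustration:

# Sketch only: mirrors the get_n_parts() and per-part filename logic added below.
def get_n_parts(dim):
    # model dimension -> number of consolidated.0N.pth checkpoint parts
    return {4096: 1, 5120: 2, 6656: 4, 8192: 8}[dim]

def part_filenames(dir_model, ftype_str, ftype, dim):
    # hypothetical helper: yields (input checkpoint, ggml output) per part
    for p in range(get_n_parts(dim)):
        fname_in  = dir_model + "/consolidated.0" + str(p) + ".pth"
        fname_out = dir_model + "/ggml-model-" + ftype_str[ftype] + ".bin"
        if p > 0:
            fname_out += "." + str(p)
        yield fname_in, fname_out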
Diffstat (limited to 'convert-pth-to-ggml.py')
-rw-r--r--  convert-pth-to-ggml.py  172
1 file changed, 99 insertions, 73 deletions
diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
index bd0a9d0..fc217c7 100644
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -33,12 +33,23 @@ if len(sys.argv) < 3:
# output in the same directory as the model
dir_model = sys.argv[1]
-fname_out = sys.argv[1] + "/ggml-model.bin"
fname_hparams = sys.argv[1] + "/params.json"
-fname_model = sys.argv[1] + "/consolidated.00.pth"
fname_tokenizer = sys.argv[1] + "/../tokenizer.model"
+def get_n_parts(dim):
+ if dim == 4096:
+ return 1
+ elif dim == 5120:
+ return 2
+ elif dim == 6656:
+ return 4
+ elif dim == 8192:
+ return 8
+ else:
+ print("Invalid dim: " + str(dim))
+ sys.exit(1)
+
# possible data types
# ftype == 0 -> float32
# ftype == 1 -> float16
@@ -61,76 +72,91 @@ tokenizer = SentencePieceProcessor(fname_tokenizer)
hparams.update({"vocab_size": tokenizer.vocab_size()})
+n_parts = get_n_parts(hparams["dim"])
+
print(hparams)
+print('n_parts = ', n_parts)
-model = torch.load(fname_model, map_location="cpu")
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["vocab_size"]))
-fout.write(struct.pack("i", hparams["dim"]))
-fout.write(struct.pack("i", hparams["multiple_of"]))
-fout.write(struct.pack("i", hparams["n_heads"]))
-fout.write(struct.pack("i", hparams["n_layers"]))
-fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete)
-fout.write(struct.pack("i", ftype))
-
-# Is this correct??
-for i in range(32000):
- # TODO: this is probably wrong - not sure how this tokenizer works
- text = tokenizer.decode([29889, i]).encode('utf-8')
- # remove the first byte (it's always '.')
- text = text[1:]
- fout.write(struct.pack("i", len(text)))
- fout.write(text)
-
-for k, v in model.items():
- name = k
- shape = v.shape
-
- # skip layers.X.attention.inner_attention.rope.freqs
- if name[-5:] == "freqs":
- continue
-
- print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
-
- #data = tf.train.load_variable(dir_model, name).squeeze()
- data = v.numpy().squeeze()
- n_dims = len(data.shape);
-
- # for efficiency - transpose some matrices
- # "model/h.*/attn/c_attn/w"
- # "model/h.*/attn/c_proj/w"
- # "model/h.*/mlp/c_fc/w"
- # "model/h.*/mlp/c_proj/w"
- #if name[-14:] == "/attn/c_attn/w" or \
- # name[-14:] == "/attn/c_proj/w" or \
- # name[-11:] == "/mlp/c_fc/w" or \
- # name[-13:] == "/mlp/c_proj/w":
- # print(" Transposing")
- # data = data.transpose()
-
- dshape = data.shape
-
- # default type is fp16
- ftype_cur = 1
- if ftype == 0 or n_dims == 1:
- print(" Converting to float32")
- data = data.astype(np.float32)
- ftype_cur = 0
-
- # header
- str = name.encode('utf-8')
- fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
- for i in range(n_dims):
- fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
- fout.write(str);
-
- # data
- data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")
+for p in range(n_parts):
+ print('Processing part ', p)
+
+ #fname_model = sys.argv[1] + "/consolidated.00.pth"
+ fname_model = sys.argv[1] + "/consolidated.0" + str(p) + ".pth"
+ fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
+ if (p > 0):
+ fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + "." + str(p)
+
+ model = torch.load(fname_model, map_location="cpu")
+
+ fout = open(fname_out, "wb")
+
+ fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
+ fout.write(struct.pack("i", hparams["vocab_size"]))
+ fout.write(struct.pack("i", hparams["dim"]))
+ fout.write(struct.pack("i", hparams["multiple_of"]))
+ fout.write(struct.pack("i", hparams["n_heads"]))
+ fout.write(struct.pack("i", hparams["n_layers"]))
+ fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete)
+ fout.write(struct.pack("i", ftype))
+
+ # Is this correct??
+ for i in range(32000):
+ # TODO: this is probably wrong - not sure how this tokenizer works
+ text = tokenizer.decode([29889, i]).encode('utf-8')
+ # remove the first byte (it's always '.')
+ text = text[1:]
+ fout.write(struct.pack("i", len(text)))
+ fout.write(text)
+
+ for k, v in model.items():
+ name = k
+ shape = v.shape
+
+ # skip layers.X.attention.inner_attention.rope.freqs
+ if name[-5:] == "freqs":
+ continue
+
+ print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
+
+ #data = tf.train.load_variable(dir_model, name).squeeze()
+ data = v.numpy().squeeze()
+ n_dims = len(data.shape);
+
+ # for efficiency - transpose some matrices
+ # "model/h.*/attn/c_attn/w"
+ # "model/h.*/attn/c_proj/w"
+ # "model/h.*/mlp/c_fc/w"
+ # "model/h.*/mlp/c_proj/w"
+ #if name[-14:] == "/attn/c_attn/w" or \
+ # name[-14:] == "/attn/c_proj/w" or \
+ # name[-11:] == "/mlp/c_fc/w" or \
+ # name[-13:] == "/mlp/c_proj/w":
+ # print(" Transposing")
+ # data = data.transpose()
+
+ dshape = data.shape
+
+ # default type is fp16
+ ftype_cur = 1
+ if ftype == 0 or n_dims == 1:
+ print(" Converting to float32")
+ data = data.astype(np.float32)
+ ftype_cur = 0
+
+ # header
+ sname = name.encode('utf-8')
+ fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
+ for i in range(n_dims):
+ fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
+ fout.write(sname);
+
+ # data
+ data.tofile(fout)
+
+ # I hope this deallocates the memory ..
+ model = None
+
+ fout.close()
+
+ print("Done. Output file: " + fname_out + ", (part ", p, ")")
+ print("")