aboutsummaryrefslogtreecommitdiff
path: root/convert-gpt4all-to-ggml.py
diff options
context:
space:
mode:
authorGeorgi Gerganov <ggerganov@gmail.com>2023-03-29 19:29:26 +0300
committerGeorgi Gerganov <ggerganov@gmail.com>2023-03-29 19:29:52 +0300
commit53635c081c49321d523567112f9fddfbba6b787b (patch)
tree3514a2f79bf3ef0ea0f455c768a0236856f1517c /convert-gpt4all-to-ggml.py
parent41318d708ed196ff727dce14d263a64b23c7333d (diff)
py : add GPT4All conversion script
For now: copy-paste. Too much time for me to deduplicate the Python code.
Diffstat (limited to 'convert-gpt4all-to-ggml.py')
-rw-r--r--convert-gpt4all-to-ggml.py107
1 files changed, 107 insertions, 0 deletions
diff --git a/convert-gpt4all-to-ggml.py b/convert-gpt4all-to-ggml.py
new file mode 100644
index 0000000..f1d9d7a
--- /dev/null
+++ b/convert-gpt4all-to-ggml.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+
+#
+# TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py
+#
+
+# Original by https://github.com/eiz
+# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
+import argparse
+import glob
+import os
+import struct
+import sys
+from sentencepiece import SentencePieceProcessor
+
# Hyperparameter names stored in the ggml file header, in on-disk order.
# (The chained `keys =` alias served no purpose and polluted the module
# namespace, so it has been removed.)
HPARAMS = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
+
def parse_args():
    """Build and parse the command-line arguments for the conversion script."""
    ap = argparse.ArgumentParser(
        description='Upgrade a GPT4All model to the current format')
    ap.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
    ap.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
    return ap.parse_args()
+
def read_header(f_in):
    """Read and unpack the old-style ggml header from f_in.

    The header is (3 + len(HPARAMS)) little 32-bit ints: magic, the
    hyperparameters, rot, and ftype. Returns the unpacked tuple.
    """
    fmt = "i" * (3 + len(HPARAMS))
    raw = f_in.read(struct.calcsize(fmt))
    return struct.unpack(fmt, raw)
+
def write_header(f_out, header):
    """Write the new versioned ggml header to f_out.

    `header` is the 8-tuple returned by read_header. Raises if the input
    magic is not the old unversioned 'ggml' magic (0x67676d6c).
    """
    magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = header

    # Only unversioned 'ggml' files may be upgraded by this script.
    if magic != 0x67676d6c:
        raise Exception('Invalid file magic. Must be an old style ggml file.')

    fields = (
        0x67676d66,   # new magic: 'ggmf' in hex
        1,            # file format version
        vocab_size,
        dim,
        multiple_of,
        n_heads,
        n_layers,
        rot,
        ftype,
    )
    f_out.write(struct.pack("%di" % len(fields), *fields))
+
def write_tokens(fout, tokenizer):
    """Write the vocabulary to fout as length-prefixed bytes plus a float score.

    Each entry is: int32 length, the token's UTF-8 bytes, then its
    SentencePiece score as a float32.
    """
    for tid in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(tid):
            text = " \u2047 ".encode("utf-8")
        elif tokenizer.is_control(tid):
            text = b""
        elif tokenizer.is_byte(tid):
            piece = tokenizer.id_to_piece(tid)
            # Byte tokens look like "<0xNN>"; anything else is malformed.
            if len(piece) != 6:
                print(f"Invalid token: {piece}")
                sys.exit(1)
            text = struct.pack("B", int(piece[3:-1], 16))
        else:
            # SentencePiece marks word boundaries with U+2581; map it to space.
            text = tokenizer.id_to_piece(tid).replace("\u2581", " ").encode("utf-8")
        fout.write(struct.pack("i", len(text)))
        fout.write(text)
        fout.write(struct.pack("f", tokenizer.get_score(tid)))

    # TODO: GPT4All - add extra <pad> token
    pad = "<pad>".encode("utf-8")
    fout.write(struct.pack("i", len(pad)))
    fout.write(pad)
    fout.write(struct.pack("f", 0.0))
+
def read_tokens(f_in, tokenizer):
    """Skip past the old file's vocabulary section.

    The old format stores one length-prefixed entry per vocab id (no
    scores); this advances f_in to the first byte after the vocab.
    """
    for _ in range(tokenizer.vocab_size()):
        (length,) = struct.unpack("i", f_in.read(4))
        f_in.read(length)
+
def copy_all_data(f_out, f_in):
    """Stream everything remaining in f_in to f_out in 1 MiB chunks."""
    # iter() with a b"" sentinel stops exactly when read() hits EOF.
    for chunk in iter(lambda: f_in.read(1024 * 1024), b""):
        f_out.write(chunk)
+
def convert_one_file(path_in, tokenizer):
    """Upgrade one model file in place; the untouched input survives as .orig.

    Writes the converted output to a .tmp file first, then swaps it into
    place, so a failure mid-conversion never clobbers the original.
    """
    tmp_path = path_in + ".tmp"
    orig_path = path_in + ".orig"
    print(f"converting {path_in}")
    with open(path_in, "rb") as src:
        with open(tmp_path, "wb") as dst:
            # New header, then the rewritten vocab, then the raw tensor data.
            write_header(dst, read_header(src))
            read_tokens(src, tokenizer)
            write_tokens(dst, tokenizer)
            copy_all_data(dst, src)
    os.rename(path_in, orig_path)
    os.rename(tmp_path, path_in)
+
def main():
    """Entry point: load the LLaMA tokenizer and convert the given model."""
    args = parse_args()
    sp = SentencePieceProcessor(args.tokenizer_model)
    convert_one_file(args.gpt4all_model, sp)

if __name__ == "__main__":
    main()