Add tokenizer test + revert to C++11 (#355)

* Add test-tokenizer-0 to do a few tokenizations - feel free to expand * Added option to convert-pth-to-ggml.py script to dump just the vocabulary * Added ./models/ggml-vocab.bin containing just LLaMA vocab data (used for tests) * Added utility to load vocabulary file from previous point (temporary implementation) * Avoid using std::string_view and drop back to C++11 (hope I didn't break something) * Rename gpt_vocab -> llama_vocab * All CMake binaries go into ./bin/ now
author: Georgi Gerganov <ggerganov@gmail.com> 2023-03-21 17:29:41 +0200
committer: GitHub <noreply@github.com> 2023-03-21 17:29:41 +0200
commit: eb34620aeceaf9d9df7fcb19acc17ad41b9f60f8 (patch)
tree: efa10a024845c09ed2630bbe0675618cc7ba0a50 /convert-pth-to-ggml.py
parent: 2e664f1ff413995506c9a54f3a8d5b8c64e37a91 (diff)
1 files changed, 25 insertions, 3 deletions
diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
index 108eb1f..c506676 100644
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -10,12 +10,10 @@
 #   - Name (char[name_length])
 #   - Data (float[n_dims])
 #
-# By default, the bigger matrices are converted to 16-bit floats.
-# This can be disabled by adding the "use-f32" CLI argument.
-#
 # At the start of the ggml file we write the model parameters
 # and vocabulary.
 #
+
 import argparse
 import os
 import sys
@@ -23,6 +21,7 @@ import json
 import struct
 import numpy as np
 import torch
+
 from sentencepiece import SentencePieceProcessor
 
 def parse_args():
@@ -30,6 +29,7 @@ def parse_args():
     parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
     parser.add_argument('dir_model', help='directory containing the model checkpoint')
     parser.add_argument('ftype', type=int, choices=[0, 1], default=1, help='file type (0: float32, 1: float16)')
+    parser.add_argument('vocab_only', type=bool, default=False, help='only write vocab to file')
     return parser.parse_args()
 
 def get_n_parts(dim):
@@ -134,6 +134,27 @@ def main():
     ftype_str = ["f32", "f16"]
 
     hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
+
+    # if only writing vocab to file
+    if args.vocab_only:
+
+        fname_model = f"{dir_model}/consolidated.00.pth"
+        fname_out = f"{dir_model}/ggml-vocab.bin"
+
+        print(f"Extracting only the vocab from '{fname_model}'\n")
+
+        model = torch.load(fname_model, map_location="cpu")
+
+        with open(fname_out, "wb") as fout:
+            fout.write(struct.pack("i", hparams["vocab_size"]))
+            write_tokens(fout, tokenizer)
+
+        del model
+
+        print(f"Done. Output file: {fname_out}\n")
+
+        return
+
     n_parts = get_n_parts(hparams["dim"])
 
     for p in range(n_parts):
@@ -151,6 +172,7 @@ def main():
             process_and_write_variables(fout, model, ftype)
 
         del model
+
         print(f"Done. Output file: {fname_out}, (part {p})\n")
 
 if __name__ == "__main__":
author	Georgi Gerganov <ggerganov@gmail.com>	2023-03-21 17:29:41 +0200
committer	GitHub <noreply@github.com>	2023-03-21 17:29:41 +0200
commit	eb34620aeceaf9d9df7fcb19acc17ad41b9f60f8 (patch)
tree	efa10a024845c09ed2630bbe0675618cc7ba0a50 /convert-pth-to-ggml.py
parent	2e664f1ff413995506c9a54f3a8d5b8c64e37a91 (diff)