about | summary | refs | log | tree | commit | diff
path: root/convert-pth-to-ggml.py
diff options
context:
space:
mode:
author Mack Straight <eiz@users.noreply.github.com> 2023-03-20 03:17:23 -0700
committer GitHub <noreply@github.com> 2023-03-20 03:17:23 -0700
commit 074bea2eb1f1349a0118239c4152914aecaa1be4 (patch)
tree 41ce911ac28d858cabfeff650b10521b30838656 /convert-pth-to-ggml.py
parent 5cb63e2493c49bc2c3b9b355696e8dc26cdd0380 (diff)
sentencepiece bpe compatible tokenizer (#252)
* potential out of bounds read
* fix quantize
* style
* Update convert-pth-to-ggml.py
* mild cleanup
* don't need the space-prefixing here rn since main.cpp already does it
* new file magic + version header field
* readme notice
* missing newlines

Co-authored-by: slaren <2141330+slaren@users.noreply.github.com>
Diffstat (limited to 'convert-pth-to-ggml.py')
-rw-r--r-- convert-pth-to-ggml.py 4
1 files changed, 3 insertions, 1 deletions
diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
index c1941a8..42f5377 100644
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -60,7 +60,8 @@ def write_header(fout, hparams, ftype):
keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
values = [
- 0x67676d6c, # magic: ggml in hex
+ 0x67676d66, # magic: ggmf in hex
+ 1, # file version
*[hparams[key] for key in keys],
hparams["dim"] // hparams["n_heads"], # rot (obsolete)
ftype
@@ -85,6 +86,7 @@ def write_tokens(fout, tokenizer):
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
fout.write(struct.pack("i", len(text)))
fout.write(text)
+ fout.write(struct.pack("f", tokenizer.get_score(i)))
def process_and_write_variables(fout, model, ftype):