Diffstat (limited to 'convert-pth-to-ggml.py')
-rw-r--r-- | convert-pth-to-ggml.py | 201
1 file changed, 149 insertions, 52 deletions
diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
index d83f8a1..7d46115 100644
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -24,8 +24,57 @@ import torch
 
 from sentencepiece import SentencePieceProcessor
 
-def parse_args():
+QK = 32
+
+GGML_TYPE_Q4_0 = 0
+GGML_TYPE_Q4_1 = 1
+GGML_TYPE_I8 = 2
+GGML_TYPE_I16 = 3
+GGML_TYPE_I32 = 4
+GGML_TYPE_F16 = 5
+GGML_TYPE_F32 = 6
+
+WTYPES = {
+    0: GGML_TYPE_F32,
+    1: GGML_TYPE_F16,
+    2: GGML_TYPE_Q4_0,
+    3: GGML_TYPE_Q4_1,
+}
+
+GGML_BLCK_SIZE = {
+    GGML_TYPE_Q4_0: QK,
+    GGML_TYPE_Q4_1: QK,
+    GGML_TYPE_I8: 1,
+    GGML_TYPE_I16: 1,
+    GGML_TYPE_I32: 1,
+    GGML_TYPE_F16: 1,
+    GGML_TYPE_F32: 1,
+}
+
+GGML_TYPE_SIZE = {
+    GGML_TYPE_Q4_0: 4 + QK/2,
+    GGML_TYPE_Q4_1: 4*2 + QK/2,
+    GGML_TYPE_I8: 1,
+    GGML_TYPE_I16: 2,
+    GGML_TYPE_I32: 4,
+    GGML_TYPE_F16: 2,
+    GGML_TYPE_F32: 4,
+}
+
+def ggml_nelements(shape):
+    r = 1
+    for i in shape:
+        r *= i
+    return r
+
+def ggml_nbytes(shape, ftype):
+    x = ggml_nelements(shape)
+    t = WTYPES[ftype]
+    x *= GGML_TYPE_SIZE[t]
+    x //= GGML_BLCK_SIZE[t]
+    return x
 
+def parse_args():
     parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
     parser.add_argument('dir_model', help='directory containing the model checkpoint')
     parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
@@ -33,7 +82,6 @@ def parse_args():
     return parser.parse_args()
 
 def get_n_parts(dim):
-
     mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
     n_parts = mappings.get(dim)
     if n_parts is None:
@@ -44,30 +92,24 @@ def get_n_parts(dim):
     return n_parts
 
 def load_hparams_and_tokenizer(dir_model):
-
     # `dir_model` is something like `models/7B` or `models/7B/`.
     # "tokenizer.model" is expected under model's parent dir.
     # When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
     # Let's use the model's parent dir directly.
     model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
-
     fname_hparams = f"{dir_model}/params.json"
     fname_tokenizer = f"{model_parent_dir}/tokenizer.model"
-
     with open(fname_hparams, "r") as f:
         hparams = json.load(f)
         print(hparams)
-
     tokenizer = SentencePieceProcessor(fname_tokenizer)
     hparams.update({"vocab_size": tokenizer.vocab_size()})
-
     return hparams, tokenizer
 
 def write_header(fout, hparams, ftype):
-
     keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
     values = [
-        0x67676d66, # magic: ggmf in hex
+        0x67676a74, # magic: ggjt in hex
         1, # file version
         *[hparams[key] for key in keys],
         hparams["dim"] // hparams["n_heads"], # rot (obsolete)
@@ -76,7 +118,6 @@ def write_header(fout, hparams, ftype):
     fout.write(struct.pack("i" * len(values), *values))
 
 def write_tokens(fout, tokenizer):
-
     for i in range(tokenizer.vocab_size()):
         if tokenizer.is_unknown(i):
             text = " \u2047 ".encode("utf-8")
@@ -95,85 +136,141 @@ def write_tokens(fout, tokenizer):
         fout.write(text)
         fout.write(struct.pack("f", tokenizer.get_score(i)))
 
-def process_and_write_variables(fout, model, ftype):
-
+def process_and_write_variables(fout, model, ftype, part_id, n_parts):
     for name, datao in model.items():
-
         if name.endswith("freqs"):
             continue
-
-        shape = datao.shape
-
-        print(f"Processing variable: {name} with shape: {shape} and type: {datao.dtype}")
-
+        # remove dimensions with a single element
         data = datao.numpy().squeeze()
-        n_dims = len(shape)
+        partshape = data.shape
+        n_dims = len(data.shape)
+        assert n_dims in (1, 2)
+
+        print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}")
 
-        # default type is fp16
+        # coerce single-dimensional tensors from float16 to float32
         ftype_cur = 1
         if ftype == 0 or n_dims == 1:
             print("  Converting to float32")
             data = data.astype(np.float32)
             ftype_cur = 0
-
-        # header
+        blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
+        type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]
+
+        # determine dimension along which multipart tensor is sharded
+        #
+        # split_dim 0 regex:
+        #   - output.*
+        #   - layers.*.attention.wq.weight
+        #   - layers.*.attention.wk.weight
+        #   - layers.*.attention.wv.weight
+        #   - layers.*.feed_forward.w1.weight
+        #   - layers.*.feed_forward.w3.weight
+        #
+        # split_dim 1 regex:
+        #   - tok_embeddings.*
+        #   - layers.*.attention.wo.weight
+        #   - layers.*.feed_forward.w2.weight
+        #
+        if n_dims > 1:
+            split_dim = 1
+            if "tok_embeddings" in name:
+                split_dim = 1
+            elif "layers" in name:
+                if "attention.wo.weight" in name:
+                    split_dim = 1
+                elif "feed_forward.w2.weight" in name:
+                    split_dim = 1
+                else:
+                    split_dim = 0
+            elif "output" in name:
+                split_dim = 0
+
+        # output tensor header
+        fullshape = list(partshape)
+        if n_dims > 1:
+            fullshape[split_dim] *= n_parts
         sname = name.encode('utf-8')
-        fout.write(struct.pack("iii", len(data.shape), len(sname), ftype_cur))
-        for dim in reversed(data.shape):
+        fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
+        for dim in reversed(fullshape):
             fout.write(struct.pack("i", dim))
         fout.write(sname)
 
-        # data output to file
-        data.tofile(fout)
+        # ensure tensor data is aligned
+        tensor_data_offset = fout.tell()
+        while tensor_data_offset % QK != 0:
+            fout.write(struct.pack("B", 0))
+            tensor_data_offset += 1
+
+        # output unified mappable tensor data
+        if n_dims == 1 or n_parts == 1:
+            # copy tensor which we thankfully received in one piece
+            if part_id == 0:
+                data.tofile(fout)
+        elif split_dim == 0:
+            # reassemble multifile tensor containing some of the rows
+            rows_per_chunk = partshape[0]
+            current_row = part_id * rows_per_chunk
+            bytes_per_row = fullshape[1] // blck_size * type_size
+            offset = current_row * bytes_per_row
+            fout.seek(tensor_data_offset + offset)
+            data.tofile(fout)
+        elif split_dim == 1:
+            # reassemble multifile tensor containing some of the cols
+            cols_per_chunk = partshape[1]
+            current_col = part_id * cols_per_chunk
+            bytes_per_row = fullshape[1] // blck_size * type_size
+            offset_current_col = current_col // blck_size * type_size
+            for row in range(partshape[0]):
+                offset_row = row * bytes_per_row
+                offset = offset_row + offset_current_col
+                fout.seek(tensor_data_offset + offset)
+                data[row].tofile(fout)
+
+        # advance file position to next tensor
+        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur))
 
 def main():
-
     args = parse_args()
     dir_model = args.dir_model
     ftype = args.ftype
     ftype_str = ["f32", "f16"]
-
     hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
 
     print(args)
 
     # if only writing vocab to file
     if args.vocab_only:
-
         fname_model = f"{dir_model}/consolidated.00.pth"
         fname_out = f"{dir_model}/ggml-vocab.bin"
-
         print(f"Extracting only the vocab from '{fname_model}'\n")
-
-
+        model = torch.load(fname_model, map_location="cpu")
         with open(fname_out, "wb") as fout:
             write_header(fout, hparams, ftype)
             write_tokens(fout, tokenizer)
-
-
+        del model
         print(f"Done. Output file: {fname_out}\n")
-
         return
 
     n_parts = get_n_parts(hparams["dim"])
-
-    for p in range(n_parts):
-
-        print(f"Processing part {p+1} of {n_parts}\n")
-
-        fname_model = f"{dir_model}/consolidated.0{p}.pth"
-        fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin{'' if p == 0 else '.' + str(p)}"
-
-        model = torch.load(fname_model, map_location="cpu")
-
-        with open(fname_out, "wb") as fout:
-            write_header(fout, hparams, ftype)
-            write_tokens(fout, tokenizer)
-            process_and_write_variables(fout, model, ftype)
-
-        del model
-
-        print(f"Done. Output file: {fname_out}, (part {p})\n")
+    fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin"
+
+    # we output a single file for ggml
+    with open(fname_out, "wb") as fout:
+        write_header(fout, hparams, ftype)
+        write_tokens(fout, tokenizer)
+        offset_of_tensors = fout.tell()
+        # the tensors we load could be split across multiple files
+        for part_id in range(n_parts):
+            fout.seek(offset_of_tensors)
+            print(f"Processing part {part_id+1} of {n_parts}\n")
+            fname_model = f"{dir_model}/consolidated.0{part_id}.pth"
+            model = torch.load(fname_model, map_location="cpu")
+            process_and_write_variables(fout, model, ftype, part_id, n_parts)
+            del model
+
+    print(f"Done. Output file: {fname_out}\n")
 
 if __name__ == "__main__":
     main()
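The offset arithmetic the new writer relies on can be sanity-checked in isolation. The sketch below is not part of the commit: the part count and shapes are made-up values, align_padding and nbytes_f16 are hypothetical helpers, and only the f16 case (block size 1) is covered. It reproduces the QK alignment and the split_dim == 0 (row-sharded) placement from the diff:

    QK = 32   # tensor data is padded to a 32-byte boundary in the output file

    def align_padding(pos):
        # zero bytes emitted after a tensor header so the data is QK-aligned,
        # equivalent to the while-loop in the diff
        return (QK - pos % QK) % QK

    def nbytes_f16(shape):
        # mirrors ggml_nbytes for GGML_TYPE_F16: 2 bytes per element, block size 1
        n = 1
        for d in shape:
            n *= d
        return 2 * n

    # hypothetical two-part tensor sharded along dimension 0 (rows)
    n_parts = 2
    partshape = (4096, 4096)                             # shape stored in each .pth part
    fullshape = (partshape[0] * n_parts, partshape[1])   # shape written to the ggml file

    print(align_padding(123))   # a header ending at offset 123 needs 5 pad bytes

    bytes_per_row = fullshape[1] // 1 * 2   # fullshape[1] // blck_size * type_size
    for part_id in range(n_parts):
        offset = part_id * partshape[0] * bytes_per_row   # current_row * bytes_per_row
        print(f"part {part_id}: seek +{offset}, write {nbytes_f16(partshape)} bytes")

    # the parts tile the destination exactly, so the writer can then seek
    # past ggml_nbytes(fullshape) to the next tensor
    assert n_parts * nbytes_f16(partshape) == nbytes_f16(fullshape)

The same bytes_per_row term drives the split_dim == 1 branch as well, which seeks once per row because a column shard is not contiguous in the destination file.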