path: root/convert-ggml-to-pth.py
author    comex <comexk@gmail.com>    2023-04-14 00:03:03 -0700
committer GitHub <noreply@github.com> 2023-04-14 10:03:03 +0300
commit    723dac55fa2ba7adc6e3fc8609781d1ad0378906 (patch)
tree      e1ecb4e02708d1be78484b87fd867ba5afb1ecb0 /convert-ggml-to-pth.py
parent    0f07cacb05f49704d35a39aa27cfd4b419eb6f8d (diff)
py : new conversion script (#545)
Current status: Working, except for the latest GPTQ-for-LLaMa format that includes `g_idx`. This turns out to require changes to GGML, so for now it only works if you use the `--outtype` option to dequantize it back to f16 (which is pointless except for debugging).

I also included some cleanup for the C++ code.

This script is meant to replace all the existing conversion scripts (including the ones that convert from older GGML formats), while also adding support for some new formats. Specifically, I've tested with:

- [x] `LLaMA` (original)
- [x] `llama-65b-4bit`
- [x] `alpaca-native`
- [x] `alpaca-native-4bit`
- [x] LLaMA converted to 'transformers' format using `convert_llama_weights_to_hf.py`
- [x] `alpaca-native` quantized with `--true-sequential --act-order --groupsize 128` (dequantized only)
- [x] same as above plus `--save_safetensors`
- [x] GPT4All
- [x] stock unversioned ggml
- [x] ggmh

There's enough overlap in the logic needed to handle these different cases that it seemed best to move to a single script. I haven't tried this with Alpaca-LoRA because I don't know where to find it.

Useful features:

- Uses multiple threads for a speedup in some cases (though the Python GIL limits the gain, and sometimes it's disk-bound anyway).
- Combines split models into a single file (both the intra-tensor split of the original and the inter-tensor split of 'transformers' format files). Single files are more convenient to work with and more friendly to future changes to use memory mapping on the C++ side. To accomplish this without increasing memory requirements, it has some custom loading code which avoids loading whole input files into memory at once.
- Because of the custom loading code, it no longer depends on PyTorch, which might make installing dependencies slightly easier or faster... although it still depends on NumPy and sentencepiece, so I don't know if there's any meaningful difference. In any case, I also added a requirements.txt file to lock the dependency versions in case of any future breaking changes.
- Type annotations checked with mypy.
- Some attempts to be extra user-friendly:
  - The script tries to be forgiving with arguments, e.g. you can specify either the model file itself or the directory containing it.
  - The script doesn't depend on config.json / params.json, just in case the user downloaded files individually and doesn't have those handy. But you still need tokenizer.model and, for Alpaca, added_tokens.json.
  - The script tries to give a helpful error message if added_tokens.json is missing.
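The "custom loading code" mentioned above is not part of the file deleted below, but the technique is worth sketching. A minimal sketch, assuming a simplified layout of unaligned f32 tensors only (`LazyTensor` and `scan_f32_tensors` are illustrative names, not the new script's actual API): scan each tensor's header, remember where its bytes live, and seek past the payload so nothing large is read until a tensor is actually needed.

```python
import struct
from dataclasses import dataclass
from typing import Callable, List, Tuple

import numpy as np


@dataclass
class LazyTensor:
    name: str
    shape: Tuple[int, ...]
    load: Callable[[], np.ndarray]  # reads this tensor's bytes on demand


def scan_f32_tensors(path: str) -> List[LazyTensor]:
    """Record each tensor's file offset instead of reading its data."""
    tensors = []
    with open(path, "rb") as f:
        while True:
            header = f.read(12)
            if len(header) < 12:
                break  # end of file
            n_dims, name_len, _ftype = struct.unpack("iii", header)
            shape = struct.unpack("i" * n_dims, f.read(4 * n_dims))[::-1]
            name = f.read(name_len).decode()
            offset = f.tell()
            size = int(np.prod(shape)) * 4  # f32 bytes

            def load(offset=offset, shape=shape, size=size):
                # reopen so the closure stays valid after the scan finishes
                with open(path, "rb") as g:
                    g.seek(offset)
                    return np.frombuffer(g.read(size), dtype=np.float32).reshape(shape)

            tensors.append(LazyTensor(name, shape, load))
            f.seek(offset + size)  # skip the payload without reading it
    return tensors
```

Merging a multi-part checkpoint then amounts to scanning every shard this way and concatenating per tensor, so peak memory stays near the size of the largest single tensor rather than the whole model.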
Diffstat (limited to 'convert-ggml-to-pth.py')
-rw-r--r--  convert-ggml-to-pth.py | 299 ----------------------------------
1 file changed, 0 insertions(+), 299 deletions(-)
diff --git a/convert-ggml-to-pth.py b/convert-ggml-to-pth.py
deleted file mode 100644
index 25a4423..0000000
--- a/convert-ggml-to-pth.py
+++ /dev/null
@@ -1,299 +0,0 @@
-# Author: github.com/ductai199x
-import argparse
-import os
-import struct
-
-import numpy as np
-import torch
-from numba import njit
-from tqdm.auto import tqdm
-
-
-def read_header(fin):
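-    # The header is nine little-endian int32 values; the first two (the file
-    # magic and, presumably, a format version) are discarded here.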
-    values = struct.unpack("i" * 9, fin.read(4 * 9))
-    _, _, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = values
-    return {
-        "vocab_size": vocab_size,
-        "dim": dim,
-        "multiple_of": multiple_of,
-        "n_heads": n_heads,
-        "n_layers": n_layers,
-    }, ftype
-
-
-def read_tokens(fin, vocab_size):
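-    # Each vocab entry is an int32 byte length, the token text, and a float32
-    # score; raw byte tokens that aren't valid UTF-8 are decoded with replacement.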
-    tokens = []
-    for _ in range(vocab_size):
-        text_len = struct.unpack("i", fin.read(4))[0]
-        text_bytes = fin.read(text_len)
-        try:
-            text = text_bytes.decode()
-        except UnicodeDecodeError:
-            text = text_bytes.decode(errors="replace")
-        score = struct.unpack("f", fin.read(4))[0]
-        tokens.append((text, score))
-    return tokens
-
-
-@njit
-def dequantize_weights_numba(fin_data, n_rows, n_cols):
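-    # Q4_0 layout: each block of qk=32 weights is one float32 scale d followed
-    # by 16 bytes of packed 4-bit values; a nibble decodes to (nibble - 8) * d,
-    # with the low nibble of byte i landing at element 2*i and the high nibble
-    # at element 2*i + 1.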
-    qk = 32
-    nb = n_cols // qk
-    bs = 4 + (qk // 2)
-
-    weights = np.zeros((n_rows, n_cols), dtype=np.float32)
-    data_pos = 0
-
-    for row in range(n_rows):
-        for block in range(nb):
-            d = np.frombuffer(fin_data[data_pos : data_pos + 4], dtype=np.float32)[0]
-            data_pos += 4
-            packed_values = fin_data[data_pos : data_pos + (qk // 2)]
-            data_pos += qk // 2
-
-            for i in range(qk // 2):
-                packed_value = packed_values[i]
-                v0 = np.float32((packed_value & 0b00001111) - 8) * d
-                v1 = np.float32((packed_value >> 4) - 8) * d
-
-                weights[row, block * qk + 2 * i] = v0
-                weights[row, block * qk + 2 * i + 1] = v1
-
-    return weights
-
-
-def dequantize_weights(fin, n_rows, n_cols):
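-    # Each row holds nb blocks of 4 + qk/2 bytes (float32 scale + packed
-    # nibbles), which is where data_size below comes from.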
-    qk = 32
-    nb = n_cols // qk
-    data_size = n_rows * n_cols // 2 + n_rows * nb * 4
-    fin_data = fin.read(data_size)
-    return dequantize_weights_numba(fin_data, n_rows, n_cols)
-
-
-def read_variables(fin):
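-    # Tensor records: int32 n_dims / name length / ftype, then the dims, the
-    # name bytes, and the tensor data aligned to a 32-byte boundary.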
-    model = {}
-    pbar = tqdm(total=os.path.getsize(fin.name), unit="B", unit_scale=True, desc="Reading variables")
-    while True:
-        start_pos = fin.tell()
-        try:
-            n_dims, name_length, ftype_cur = struct.unpack("iii", fin.read(4 * 3))
-        except struct.error:
-            break
-
-        shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims)))
-        shape = shape[::-1]
-        name = fin.read(name_length).decode()
-
-        # ensure tensor data is aligned to a 32-byte boundary
-        tensor_data_offset = fin.tell()
-        tensor_data_offset = (tensor_data_offset + 31) & -32
-        fin.seek(tensor_data_offset)
-
-        if ftype_cur == 2:
-            # 4-bit quantized weights
-            dtype = np.uint8
-            data = dequantize_weights(fin, shape[0], shape[1])
-            data = data.reshape(shape)
-        elif ftype_cur == 0:
-            dtype = np.float32
-            data_size = np.prod(shape)
-            data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
-        elif ftype_cur == 1:
-            dtype = np.float16
-            data_size = np.prod(shape)
-            data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
-        else:
-            raise ValueError(f"unsupported ftype {ftype_cur} for tensor {name}")
-
-        # (for q4_0 the dequantized float32 data is downcast to float16 here)
-        model[name] = torch.tensor(data, dtype=torch.float32 if dtype == np.float32 else torch.float16)
-
-        pbar.update(fin.tell() - start_pos)
-
-    return model
-
-
-def convert_to_hf_format(model, hparams):
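-    # Maps the original Meta-style tensor names (layers.N.attention.wq, ...) to
-    # the state_dict keys expected by Hugging Face's LlamaForCausalLM.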
-    # This works for llama 7B; other model sizes still need testing
-    n_layers = hparams["n_layers"]
-    n_heads = hparams["n_heads"]
-    dim = hparams["dim"]
-    dims_per_head = dim // n_heads
-    base = 10000.0
-    inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
-
-    # permute for sliced rotary: HF's LlamaAttention rotates the two halves of
-    # each head, while the original weights interleave even/odd pairs, so the
-    # rows of wq/wk have to be reordered
-    def permute(w):
-        return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
-
-    state_dict = {}
-    for layer_i in range(n_layers):
-        state_dict.update(
-            {
-                f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
-                    model[f"layers.{layer_i}.attention.wq.weight"]
-                ),
-                f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
-                    model[f"layers.{layer_i}.attention.wk.weight"]
-                ),
-                f"model.layers.{layer_i}.self_attn.v_proj.weight": model[
-                    f"layers.{layer_i}.attention.wv.weight"
-                ],
-                f"model.layers.{layer_i}.self_attn.o_proj.weight": model[
-                    f"layers.{layer_i}.attention.wo.weight"
-                ],
-                f"model.layers.{layer_i}.mlp.gate_proj.weight": model[
-                    f"layers.{layer_i}.feed_forward.w1.weight"
-                ],
-                f"model.layers.{layer_i}.mlp.down_proj.weight": model[
-                    f"layers.{layer_i}.feed_forward.w2.weight"
-                ],
-                f"model.layers.{layer_i}.mlp.up_proj.weight": model[
-                    f"layers.{layer_i}.feed_forward.w3.weight"
-                ],
-                f"model.layers.{layer_i}.input_layernorm.weight": model[
-                    f"layers.{layer_i}.attention_norm.weight"
-                ],
-                f"model.layers.{layer_i}.post_attention_layernorm.weight": model[
-                    f"layers.{layer_i}.ffn_norm.weight"
-                ],
-            }
-        )
-        state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
-    state_dict.update(
-        {
-            "model.embed_tokens.weight": model["tok_embeddings.weight"],
-            "model.norm.weight": model["norm.weight"],
-            "lm_head.weight": model["output.weight"],
-        }
-    )
-
-    return state_dict
-
-
-def chat(model, hparams, llama_dir):
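-    # transformers is imported lazily so it's only required when --chat is used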
-    from transformers import (GenerationConfig, LlamaForCausalLM,
-                              LlamaTokenizer, StoppingCriteria,
-                              StoppingCriteriaList)
-    from transformers.models.llama.configuration_llama import LlamaConfig
-
-    class StoppingCriteriaSub(StoppingCriteria):
-        def __init__(self):
-            super().__init__()
-
-        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=[]):
-            print(tokenizer.decode(input_ids[0]), end="", flush=True)
-            # token id 13 is "\n" in the LLaMA vocabulary: stop at end of line
-            if input_ids[0][-1] == 13:
-                return True
-
-            return False
-
-    config = LlamaConfig(
-        vocab_size=hparams["vocab_size"],
-        # LlamaConfig has no "dim" argument; the embedding width is hidden_size
-        hidden_size=hparams["dim"],
-        num_hidden_layers=hparams["n_layers"],
-        num_attention_heads=hparams["n_heads"],
-    )
-
-    llama = LlamaForCausalLM(config=config)
-    llama.load_state_dict(state_dict=model, strict=True)
-    tokenizer = LlamaTokenizer.from_pretrained(llama_dir)
-
-    device = torch.device("cpu")
-    llama = llama.to(device)
-
-    ctx = """You are AI.
-This is a dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, respectful, direct, concise, should try to protect User's privacy, and knows its own limits. Also, AI must answer User and AI cannot stop the conversation by itself.
-User: Hello, AI.
-AI: Hello! How can I assist you today?
-"""
-    print(ctx.rstrip("\n"))
-    while True:
-        print("-" * 60)
-        prompt = input("User: ")
-        if ctx != "":
-            ctx = f"{ctx}User: {prompt}\n"
-        else:
-            ctx = f"{prompt}\nAI:"
-
-        ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx
-
-        print("-" * 60)
-        if len(ctx.strip()) > 0:
-            input_ids = tokenizer(ctx, return_tensors="pt")["input_ids"].to(device)
-            generation_config = GenerationConfig(
-                temperature=0.8,
-                top_p=0.95,
-                top_k=50,
-                repetition_penalty=1.1764,
-            )
-            with torch.no_grad():
-                generation_output = llama.generate(
-                    input_ids=input_ids,
-                    generation_config=generation_config,
-                    return_dict_in_generate=True,
-                    output_scores=True,
-                    max_length=2048,
-                    do_sample=True,
-                    stopping_criteria=StoppingCriteriaList([StoppingCriteriaSub()]),
-                )
-            s = generation_output.sequences[0]
-            decoded = tokenizer.decode(s)
-            ctx = f"{decoded}\n"
-
-
-def main():
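-    # Collect all shards matching the prefix, merge their tensors into one
-    # dict, save a single .pth checkpoint, and optionally start a chat REPL.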
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--input_dir", "-i", type=str, required=True, help="The input directory containing the ggml files."
-    )
-    parser.add_argument(
-        "--prefix",
-        "-p",
-        type=str,
-        required=True,
-        help="The prefix of the ggml files (ggml-model-f16 or ggml-model-q4_0).",
-    )
-    parser.add_argument(
-        "--hf",
-        action="store_true",
-        help="Whether to save the model in the Hugging Face format. (default: False)",
-    )
-    parser.add_argument(
-        "--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
-    )
-    args = parser.parse_args()
-
-    llama_dir = os.path.abspath(f"{args.input_dir}/../")
-
-    ggml_files = sorted(
-        [f"{args.input_dir}/{f}" for f in os.listdir(args.input_dir) if f.startswith(args.prefix)]
-    )
-
-    with open(ggml_files[0], "rb") as fin:
-        hparams, ftype = read_header(fin)
-        tokens = read_tokens(fin, hparams["vocab_size"])
-        model = read_variables(fin)
-
-    # remaining shards share the header/vocab; only their tensors are merged in
-    for f in tqdm(ggml_files[1:]):
-        with open(f, "rb") as fin:
-            read_header(fin)
-            read_tokens(fin, hparams["vocab_size"])
-            model.update(read_variables(fin))
-
-    if args.hf:
-        model = convert_to_hf_format(model, hparams)
-
-    pth_ckpt = {
-        "state_dict": model,
-        "hparams": hparams,
-        "tokens": tokens,
-    }
-
-    torch.save(pth_ckpt, f"{args.input_dir}/{args.prefix}-to-torch.pth")
-
-    if args.chat:
-        if not args.hf:
-            model = convert_to_hf_format(model, hparams)
-        chat(model, hparams, llama_dir)
-
-
-if __name__ == "__main__":
-    main()