author     Pavol Rusnak <pavol@rusnak.io>  2023-03-31 00:52:06 +0200
committer  Pavol Rusnak <pavol@rusnak.io>  2023-03-31 01:07:32 +0200
commit     9733104be5389ebb1ff05095eca2a70280cd875a (patch)
tree       284b9e2b26e39271889e0137e3fcafde2b0da414
parent     3df890aef432ce68143cfafcd7caf828bc4c3e55 (diff)
drop quantize.py (now that models are using a single file)
-rw-r--r--  README.md    |   4
-rw-r--r--  quantize.py  | 131
2 files changed, 2 insertions, 133 deletions
diff --git a/README.md b/README.md
index cefcfb7..07066cd 100644
--- a/README.md
+++ b/README.md
@@ -155,8 +155,8 @@ python3 -m pip install torch numpy sentencepiece
# convert the 7B model to ggml FP16 format
python3 convert-pth-to-ggml.py models/7B/ 1
-# quantize the model to 4-bits
-python3 quantize.py 7B
+# quantize the model to 4-bits (using method 2 = q4_0)
+./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
# run the inference
./main -m ./models/7B/ggml-model-q4_0.bin -n 128
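For anyone who scripted around the removed wrapper, its job now fits in a few
lines. A minimal stand-in sketch (not part of this commit), assuming the
quantize binary has been built in the repository root and each model is the
single ggml-model-f16.bin file the commit message describes:

#!/usr/bin/env python3
# Sketch of a stand-in for the removed quantize.py: run the ./quantize
# binary on each requested single-file f16 model.
import subprocess

for model in ("7B",):  # extend with "13B", "30B", "65B" as needed
    f16_path = f"./models/{model}/ggml-model-f16.bin"
    q4_path = f"./models/{model}/ggml-model-q4_0.bin"
    subprocess.run(["./quantize", f16_path, q4_path, "2"], check=True)

The trailing "2" selects quantization method 2 (q4_0), matching the updated
README command above.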
diff --git a/quantize.py b/quantize.py
deleted file mode 100644
index 641df8d..0000000
--- a/quantize.py
+++ /dev/null
@@ -1,131 +0,0 @@
-#!/usr/bin/env python3
-
-"""Script to execute the "quantize" script on a given set of models."""
-
-import subprocess
-import argparse
-import glob
-import sys
-import os
-
-
-def main():
- """Update the quantize binary name depending on the platform and parse
- the command line arguments and execute the script.
- """
-
- if "linux" in sys.platform or "darwin" in sys.platform:
- quantize_script_binary = "quantize"
-
- elif "win32" in sys.platform or "cygwin" in sys.platform:
- quantize_script_binary = "quantize.exe"
-
- else:
- print("WARNING: Unknown platform. Assuming a UNIX-like OS.\n")
- quantize_script_binary = "quantize"
-
- parser = argparse.ArgumentParser(
- prog='python3 quantize.py',
- description='This script quantizes the given models by applying the '
- f'"{quantize_script_binary}" script on them.'
- )
- parser.add_argument(
- 'models', nargs='+', choices=('7B', '13B', '30B', '65B'),
- help='The models to quantize.'
- )
- parser.add_argument(
- '-r', '--remove-16', action='store_true', dest='remove_f16',
- help='Remove the f16 model after quantizing it.'
- )
- parser.add_argument(
- '-m', '--models-path', dest='models_path',
- default=os.path.join(os.getcwd(), "models"),
- help='Specify the directory where the models are located.'
- )
- parser.add_argument(
- '-q', '--quantize-script-path', dest='quantize_script_path',
- default=os.path.join(os.getcwd(), quantize_script_binary),
- help='Specify the path to the "quantize" script.'
- )
-
- # TODO: Revise this code
- # parser.add_argument(
- # '-t', '--threads', dest='threads', type='int',
- # default=os.cpu_count(),
- # help='Specify the number of threads to use to quantize many models at '
- # 'once. Defaults to os.cpu_count().'
- # )
-
- args = parser.parse_args()
- args.models_path = os.path.abspath(args.models_path)
-
- if not os.path.isfile(args.quantize_script_path):
- print(
- f'The "{quantize_script_binary}" script was not found in the '
- "current location.\nIf you want to use it from another location, "
- "set the --quantize-script-path argument from the command line."
- )
- sys.exit(1)
-
- for model in args.models:
- # The model is separated in various parts
- # (ggml-model-f16.bin, ggml-model-f16.bin.0, ggml-model-f16.bin.1...)
- f16_model_path_base = os.path.join(
- args.models_path, model, "ggml-model-f16.bin"
- )
-
- if not os.path.isfile(f16_model_path_base):
- print(f'The file %s was not found' % f16_model_path_base)
- sys.exit(1)
-
- f16_model_parts_paths = map(
- lambda filename: os.path.join(f16_model_path_base, filename),
- glob.glob(f"{f16_model_path_base}*")
- )
-
- for f16_model_part_path in f16_model_parts_paths:
- if not os.path.isfile(f16_model_part_path):
- print(
- f"The f16 model {os.path.basename(f16_model_part_path)} "
- f"was not found in {args.models_path}{os.path.sep}{model}"
- ". If you want to use it from another location, set the "
- "--models-path argument from the command line."
- )
- sys.exit(1)
-
- __run_quantize_script(
- args.quantize_script_path, f16_model_part_path
- )
-
- if args.remove_f16:
- os.remove(f16_model_part_path)
-
-
-# This was extracted to a top-level function for parallelization, if
-# implemented. See https://github.com/ggerganov/llama.cpp/pull/222/commits/f8db3d6cd91bf1a1342db9d29e3092bc12dd783c#r1140496406
-
-def __run_quantize_script(script_path, f16_model_part_path):
- """Run the quantize script specifying the path to it and the path to the
- f16 model to quantize.
- """
-
- new_quantized_model_path = f16_model_part_path.replace("f16", "q4_0")
- subprocess.run(
- [script_path, f16_model_part_path, new_quantized_model_path, "2"],
- check=True
- )
-
-
-if __name__ == "__main__":
- try:
- main()
-
- except subprocess.CalledProcessError:
- print("\nAn error ocurred while trying to quantize the models.")
- sys.exit(1)
-
- except KeyboardInterrupt:
- sys.exit(0)
-
- else:
- print("\nSuccesfully quantized all models.")