aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore5
-rw-r--r--ci/README.md20
-rw-r--r--ci/run.sh262
-rw-r--r--examples/common.cpp7
-rw-r--r--examples/common.h1
-rw-r--r--examples/perplexity/perplexity.cpp6
-rw-r--r--llama.cpp15
-rw-r--r--tests/test-sampling.cpp2
8 files changed, 312 insertions, 6 deletions
diff --git a/.gitignore b/.gitignore
index faec869..a23ac59 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,8 @@ build/
build-em/
build-debug/
build-release/
+build-ci-debug/
+build-ci-release/
build-static/
build-cublas/
build-opencl/
@@ -25,9 +27,10 @@ build-no-accel/
build-sanitize-addr/
build-sanitize-thread/
out/
+tmp/
models/*
-*.bin
+models-mnt
/main
/quantize
diff --git a/ci/README.md b/ci/README.md
new file mode 100644
index 0000000..6c74c81
--- /dev/null
+++ b/ci/README.md
@@ -0,0 +1,20 @@
+# CI
+
+In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework:
+
+https://github.com/ggml-org/ci
+
+It monitors the `master` branch for new commits and runs the
+[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
+to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
+to cover various hardware architectures, including GPU and Apple Silicon instances.
+
+Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
+Only the branches of this repo are monitored for this keyword.
+
+It is a good practice, before publishing changes to execute the full CI locally on your machine:
+
+```bash
+mkdir tmp
+bash ./ci/run.sh ./tmp/results ./tmp/mnt
+```
diff --git a/ci/run.sh b/ci/run.sh
new file mode 100644
index 0000000..c823bc4
--- /dev/null
+++ b/ci/run.sh
@@ -0,0 +1,262 @@
+#/bin/bash
+
+if [ -z "$2" ]; then
+ echo "usage: $0 <output-dir> <mnt-dir>"
+ exit 1
+fi
+
+mkdir -p "$1"
+mkdir -p "$2"
+
+OUT=$(realpath "$1")
+MNT=$(realpath "$2")
+
+rm -v $OUT/*.log
+rm -v $OUT/*.exit
+rm -v $OUT/*.md
+
+sd=`dirname $0`
+cd $sd/../
+SRC=`pwd`
+
+## helpers
+
+# download a file if it does not exist or if it is outdated
+function gg_wget {
+ local out=$1
+ local url=$2
+
+ local cwd=`pwd`
+
+ mkdir -p $out
+ cd $out
+
+ # should not re-download if file is the same
+ wget -nv -N $url
+
+ cd $cwd
+}
+
+function gg_printf {
+ printf -- "$@" >> $OUT/README.md
+}
+
+function gg_run {
+ ci=$1
+
+ set -o pipefail
+ set -x
+
+ gg_run_$ci | tee $OUT/$ci.log
+ cur=$?
+ echo "$cur" > $OUT/$ci.exit
+
+ set +x
+ set +o pipefail
+
+ gg_sum_$ci
+
+ ret=$((ret | cur))
+}
+
+## ci
+
+# ctest_debug
+
+function gg_run_ctest_debug {
+ cd ${SRC}
+
+ rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
+
+ set -e
+
+ (time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+ (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+ (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+
+ set +e
+}
+
+function gg_sum_ctest_debug {
+ gg_printf '### %s\n\n' "${ci}"
+
+ gg_printf 'Runs ctest in debug mode\n'
+ gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+ gg_printf '```\n'
+ gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
+ gg_printf '```\n'
+ gg_printf '\n'
+}
+
+# ctest_release
+
+function gg_run_ctest_release {
+ cd ${SRC}
+
+ rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+ set -e
+
+ (time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+ (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+ if [ -z $GG_BUILD_LOW_PERF ]; then
+ (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+ else
+ (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+ fi
+
+ set +e
+}
+
+function gg_sum_ctest_release {
+ gg_printf '### %s\n\n' "${ci}"
+
+ gg_printf 'Runs ctest in release mode\n'
+ gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+ gg_printf '```\n'
+ gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
+ gg_printf '```\n'
+}
+
+# open_llama_3b_v2
+
+function gg_run_open_llama_3b_v2 {
+ cd ${SRC}
+
+ gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
+ gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
+ gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
+ gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
+ gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
+ gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
+
+ gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
+ unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
+ head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
+
+ path_models="../models-mnt/open-llama/3B-v2"
+ path_wiki="../models-mnt/wikitext/wikitext-2-raw"
+
+ rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+ set -e
+
+ (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+ (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+ python3 ../convert.py ${path_models}
+
+ model_f16="${path_models}/ggml-model-f16.bin"
+ model_q8_0="${path_models}/ggml-model-q8_0.bin"
+ model_q4_0="${path_models}/ggml-model-q4_0.bin"
+ model_q4_1="${path_models}/ggml-model-q4_1.bin"
+ model_q5_0="${path_models}/ggml-model-q5_0.bin"
+ model_q5_1="${path_models}/ggml-model-q5_1.bin"
+ model_q3_k="${path_models}/ggml-model-q3_k.bin"
+ model_q4_k="${path_models}/ggml-model-q4_k.bin"
+ model_q5_k="${path_models}/ggml-model-q5_k.bin"
+ model_q6_k="${path_models}/ggml-model-q6_k.bin"
+
+ wiki_test_60="${path_wiki}/wiki.test-60.raw"
+
+ ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+ ./bin/quantize ${model_f16} ${model_q4_0} q4_0
+ ./bin/quantize ${model_f16} ${model_q4_1} q4_1
+ ./bin/quantize ${model_f16} ${model_q5_0} q5_0
+ ./bin/quantize ${model_f16} ${model_q5_1} q5_1
+ ./bin/quantize ${model_f16} ${model_q3_k} q3_k
+ ./bin/quantize ${model_f16} ${model_q4_k} q4_k
+ ./bin/quantize ${model_f16} ${model_q5_k} q5_k
+ ./bin/quantize ${model_f16} ${model_q6_k} q6_k
+
+ (time ./bin/main --model ${model_f16} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+ (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+ (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+ (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+ (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+ (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+ (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+ (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+ (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+ (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+ (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+ (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+ (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+ (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+ (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+ (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+ (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+ (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+ (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+ (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+ function check_ppl {
+ qnt="$1"
+ ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+ if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
+ printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
+ return 20
+ fi
+
+ printf ' - %s @ %s OK\n' "$qnt" "$ppl"
+ return 0
+ }
+
+ check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+
+ set +e
+}
+
+function gg_sum_open_llama_3b_v2 {
+ gg_printf '### %s\n\n' "${ci}"
+
+ gg_printf 'OpenLLaMA 3B-v2:\n'
+ gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+ gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+ gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+ gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+ gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
+ gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
+ gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
+ gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
+ gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
+ gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
+ gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
+ gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
+}
+
+## main
+
+if [ -z $GG_BUILD_LOW_PERF ]; then
+ rm -rf ${SRC}/models-mnt
+
+ mnt_models=$(realpath ${MNT}/models)
+ mkdir -p ${mnt_models}
+ ln -sfn ${mnt_models} ${SRC}/models-mnt
+
+ python3 -m pip install -r ${SRC}/requirements.txt
+fi
+
+ret=0
+
+#test $ret -eq 0 && gg_run ctest_debug
+#test $ret -eq 0 && gg_run ctest_release
+
+if [ -z $GG_BUILD_LOW_PERF ]; then
+ test $ret -eq 0 && gg_run open_llama_3b_v2
+fi
+
+exit $ret
diff --git a/examples/common.cpp b/examples/common.cpp
index 8705127..fd6dbc0 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -279,6 +279,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.n_keep = std::stoi(argv[i]);
+ } else if (arg == "--chunks") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.n_chunks = std::stoi(argv[i]);
} else if (arg == "-m" || arg == "--model") {
if (++i >= argc) {
invalid_param = true;
@@ -515,6 +521,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+ fprintf(stderr, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
if (llama_mlock_supported()) {
fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
}
diff --git a/examples/common.h b/examples/common.h
index f52fef6..037a4ee 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -28,6 +28,7 @@ struct gpt_params {
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
+ int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 7e120ff..bfad999 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -32,13 +32,15 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
// BOS tokens will be added for each chunk before eval
auto tokens = ::llama_tokenize(ctx, params.prompt, true);
- int count = 0;
+ const int n_chunk_max = tokens.size() / params.n_ctx;
- const int n_chunk = tokens.size() / params.n_ctx;
+ const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_n_vocab(ctx);
const int n_batch = params.n_batch;
+ int count = 0;
double nll = 0.0;
+
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
for (int i = 0; i < n_chunk; ++i) {
diff --git a/llama.cpp b/llama.cpp
index 0f9d534..fa3b7c0 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2024,9 +2024,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
}
// Normalize the second derivatives
- float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
- for (float & value : second_derivatives) {
- value /= second_derivatives_sum;
+ {
+ const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+
+ if (second_derivatives_sum > 1e-6f) {
+ for (float & value : second_derivatives) {
+ value /= second_derivatives_sum;
+ }
+ } else {
+ for (float & value : second_derivatives) {
+ value = 1.0f / second_derivatives.size();
+ }
+ }
}
float cum_sum = 0.0f;
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 64f9455..4437c39 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -200,4 +200,6 @@ int main(void) {
test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f);
printf("OK\n");
+
+ return 0;
}