diff options
author | Georgi Gerganov <ggerganov@gmail.com> | 2023-07-18 14:24:43 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-07-18 14:24:43 +0300 |
commit | d01bccde9f759b24449fdaa16306b406a50eb367 (patch) | |
tree | a1c351c6732f399f540a1e91f957eb61db535bbc /examples | |
parent | 6cbf9dfb32f0e23ed3afd02d30ab066ed53e2c4d (diff) |
ci : integrate with ggml-org/ci (#2250)
* ci : run ctest
ggml-ci
* ci : add open llama 3B-v2 tests
ggml-ci
* ci : disable wget progress output
ggml-ci
* ci : add open llama 3B-v2 tg tests for q4 and q5 quantizations
ggml-ci
* tests : try to fix tail free sampling test
ggml-ci
* ci : add K-quants
ggml-ci
* ci : add short perplexity tests
ggml-ci
* ci : add README.md
* ppl : add --chunks argument to limit max number of chunks
ggml-ci
* ci : update README
Diffstat (limited to 'examples')
-rw-r--r-- | examples/common.cpp | 7 | ||||
-rw-r--r-- | examples/common.h | 1 | ||||
-rw-r--r-- | examples/perplexity/perplexity.cpp | 6 |
3 files changed, 12 insertions, 2 deletions
diff --git a/examples/common.cpp b/examples/common.cpp index 8705127..fd6dbc0 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -279,6 +279,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.n_keep = std::stoi(argv[i]); + } else if (arg == "--chunks") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_chunks = std::stoi(argv[i]); } else if (arg == "-m" || arg == "--model") { if (++i >= argc) { invalid_param = true; @@ -515,6 +521,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --perplexity compute perplexity over the prompt\n"); fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); + fprintf(stderr, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); if (llama_mlock_supported()) { fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); } diff --git a/examples/common.h b/examples/common.h index f52fef6..037a4ee 100644 --- a/examples/common.h +++ b/examples/common.h @@ -28,6 +28,7 @@ struct gpt_params { int32_t n_ctx = 512; // context size int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_gpu_layers = 0; // number of layers to store in VRAM int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 7e120ff..bfad999 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -32,13 +32,15 @@ void perplexity(llama_context * ctx, const gpt_params & params) { // BOS tokens will be added for each chunk before eval auto tokens = ::llama_tokenize(ctx, params.prompt, true); - int count = 0; + const int n_chunk_max = tokens.size() / params.n_ctx; - const int n_chunk = tokens.size() / params.n_ctx; + const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); const int n_vocab = llama_n_vocab(ctx); const int n_batch = params.n_batch; + int count = 0; double nll = 0.0; + fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); for (int i = 0; i < n_chunk; ++i) { |