3 files changed, 12 insertions, 2 deletions
diff --git a/examples/common.cpp b/examples/common.cpp
index 8705127..fd6dbc0 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -279,6 +279,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_keep = std::stoi(argv[i]);
+        } else if (arg == "--chunks") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_chunks = std::stoi(argv[i]);
         } else if (arg == "-m" || arg == "--model") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -515,6 +521,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
     fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+    fprintf(stderr, "  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
     if (llama_mlock_supported()) {
         fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
diff --git a/examples/common.h b/examples/common.h
index f52fef6..037a4ee 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -28,6 +28,7 @@ struct gpt_params {
     int32_t n_ctx                           = 512; // context size
     int32_t n_batch                         = 512; // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep                          = 0;   // number of tokens to keep from initial prompt
+    int32_t n_chunks                        = -1;  // max number of chunks to process (-1 = unlimited)
     int32_t n_gpu_layers                    = 0;   // number of layers to store in VRAM
     int32_t main_gpu                        = 0;   // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 7e120ff..bfad999 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -32,13 +32,15 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
     // BOS tokens will be added for each chunk before eval
     auto tokens = ::llama_tokenize(ctx, params.prompt, true);
 
-    int count   = 0;
+    const int n_chunk_max = tokens.size() / params.n_ctx;
 
-    const int n_chunk = tokens.size() / params.n_ctx;
+    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
     const int n_vocab = llama_n_vocab(ctx);
     const int n_batch = params.n_batch;
 
+    int count = 0;
     double nll = 0.0;
+
     fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
 
     for (int i = 0; i < n_chunk; ++i) {