author    Georgi Gerganov <ggerganov@gmail.com>  2023-03-25 20:26:40 +0200
committer Georgi Gerganov <ggerganov@gmail.com>  2023-03-25 20:26:40 +0200
commit    a316a425d04027453dc0fd45f003b647c12f66f9
tree      b33d7c55741f10f1cc84f489df05e1fad96f0417
parent    ecbe466a364876927994e2f1ec14f4d82301d201
Overhaul the examples structure
- main -> examples
- utils -> examples (renamed to "common")
- quantize -> examples
- separate tools for "perplexity" and "embedding"

Hope I didn't break something!
-rw-r--r--  .gitignore                                                       1
-rw-r--r--  CMakeLists.txt                                                  29
-rw-r--r--  Makefile                                                        19
-rw-r--r--  examples/CMakeLists.txt                                         36
-rw-r--r--  examples/common.cpp (renamed from utils.cpp)                     4
-rw-r--r--  examples/common.h (renamed from utils.h)                         0
-rw-r--r--  examples/embedding/CMakeLists.txt                                4
-rw-r--r--  examples/embedding/README.md                                     3
-rw-r--r--  examples/embedding/embedding.cpp                               106
-rw-r--r--  examples/main/CMakeLists.txt                                     4
-rw-r--r--  examples/main/README.md                                          3
-rw-r--r--  examples/main/main.cpp (renamed from main.cpp)                 119
-rw-r--r--  examples/perplexity/CMakeLists.txt                               4
-rw-r--r--  examples/perplexity/README.md                                    3
-rw-r--r--  examples/perplexity/perplexity.cpp                             146
-rw-r--r--  examples/quantize/CMakeLists.txt                                 4
-rw-r--r--  examples/quantize/README.md                                      3
-rw-r--r--  examples/quantize/quantize.cpp (renamed from quantize.cpp)       0
-rw-r--r--  ggml.c                                                          26
-rw-r--r--  tests/CMakeLists.txt                                             2
-rw-r--r--  tests/test-tokenizer-0.cpp                                       6
21 files changed, 361 insertions, 161 deletions
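
For orientation, the new source layout implied by the file list above is:

    examples/
    ├── CMakeLists.txt
    ├── common.cpp       (renamed from utils.cpp)
    ├── common.h         (renamed from utils.h)
    ├── embedding/       embedding.cpp, CMakeLists.txt, README.md
    ├── main/            main.cpp (moved from ./main.cpp), CMakeLists.txt, README.md
    ├── perplexity/      perplexity.cpp, CMakeLists.txt, README.md
    └── quantize/        quantize.cpp (moved from ./quantize.cpp), CMakeLists.txt, README.md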
diff --git a/.gitignore b/.gitignore
index 3087b0e..ce01fd5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,7 @@ models/*
/main
/quantize
/result
+/perplexity
arm_neon.h
compile_commands.json
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 51af97c..a1ff5a4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -211,17 +211,6 @@ endif()
# Build libraries
#
-add_library(utils OBJECT
- utils.cpp
- utils.h)
-
-target_include_directories(utils PUBLIC .)
-target_compile_features(utils PUBLIC cxx_std_11) # don't bump
-target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS})
-if (BUILD_SHARED_LIBS)
- set_target_properties(utils PROPERTIES POSITION_INDEPENDENT_CODE ON)
-endif()
-
add_library(ggml OBJECT
ggml.c
ggml.h)
@@ -239,23 +228,13 @@ add_library(llama
target_include_directories(llama PUBLIC .)
target_compile_features(llama PUBLIC cxx_std_11) # don't bump
-target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS})
+target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS})
if (BUILD_SHARED_LIBS)
set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
endif()
#
-# Executables
-#
-
-add_executable(main main.cpp)
-target_link_libraries(main PRIVATE llama ggml utils)
-
-add_executable(quantize quantize.cpp)
-target_link_libraries(quantize PRIVATE llama ggml utils)
-
-#
# programs, examples and tests
#
@@ -264,6 +243,6 @@ if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
add_subdirectory(tests)
endif ()
-#if (LLAMA_BUILD_EXAMPLES)
-# add_subdirectory(examples)
-#endif()
+if (LLAMA_BUILD_EXAMPLES)
+ add_subdirectory(examples)
+endif()
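
With the examples directory now hooked into the top-level build, a CMake build that picks up the new example targets might look like the following. This is only a sketch: it assumes LLAMA_BUILD_EXAMPLES is enabled (pass it explicitly if in doubt), and the exact output location of the binaries depends on the CMake configuration.

    cmake -B build -DLLAMA_BUILD_EXAMPLES=ON
    cmake --build build --config Release
    # main, quantize, perplexity and embedding end up under the build tree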
diff --git a/Makefile b/Makefile
index e8b128c..98a2d85 100644
--- a/Makefile
+++ b/Makefile
@@ -212,7 +212,7 @@ $(info I CC: $(CCV))
$(info I CXX: $(CXXV))
$(info )
-default: main quantize
+default: main quantize perplexity
#
# Build library
@@ -224,20 +224,23 @@ ggml.o: ggml.c ggml.h
llama.o: llama.cpp llama.h
$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
-utils.o: utils.cpp utils.h
- $(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o
+common.o: examples/common.cpp examples/common.h
+ $(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
clean:
- rm -f *.o main quantize
+ rm -vf *.o main quantize perplexity
-main: main.cpp ggml.o llama.o utils.o
- $(CXX) $(CXXFLAGS) main.cpp ggml.o llama.o utils.o -o main $(LDFLAGS)
+main: examples/main/main.cpp ggml.o llama.o common.o
+ $(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
@echo
@echo '==== Run ./main -h for help. ===='
@echo
-quantize: quantize.cpp ggml.o llama.o utils.o
- $(CXX) $(CXXFLAGS) quantize.cpp ggml.o llama.o utils.o -o quantize $(LDFLAGS)
+quantize: examples/quantize/quantize.cpp ggml.o llama.o
+ $(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
+
+perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
+ $(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
#
# Tests
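
On the Makefile side, the default target now covers the new tool as well, so a plain make builds everything listed above:

    make              # builds main, quantize and perplexity (the new default target)
    make perplexity   # or build just the standalone perplexity tool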
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
new file mode 100644
index 0000000..ce3a347
--- /dev/null
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,36 @@
+# dependencies
+
+find_package(Threads REQUIRED)
+
+# third-party
+
+# ...
+
+# common
+
+set(TARGET common)
+
+add_library(${TARGET} OBJECT
+ common.h
+ common.cpp
+ )
+
+if (BUILD_SHARED_LIBS)
+ set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif()
+
+target_include_directories(${TARGET} PUBLIC .)
+target_compile_features(${TARGET} PUBLIC cxx_std_11)
+target_link_libraries(${TARGET} PRIVATE llama)
+
+# examples
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+if (EMSCRIPTEN)
+else()
+ add_subdirectory(main)
+ add_subdirectory(quantize)
+ add_subdirectory(perplexity)
+ add_subdirectory(embedding)
+endif()
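
Every example uses the same four-line CMake recipe, so adding another tool mostly means copying it. A hypothetical examples/mytool/CMakeLists.txt (the name is made up for illustration) would be:

    set(TARGET mytool)
    add_executable(${TARGET} mytool.cpp)
    target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
    target_compile_features(${TARGET} PRIVATE cxx_std_11)

plus a matching add_subdirectory(mytool) entry in examples/CMakeLists.txt above.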
diff --git a/utils.cpp b/examples/common.cpp
index cea3096..afa7d40 100644
--- a/utils.cpp
+++ b/examples/common.cpp
@@ -1,6 +1,6 @@
-#include "ggml.h"
+#include "common.h"
-#include "utils.h"
+#include "ggml.h"
#include <cassert>
#include <cstring>
diff --git a/utils.h b/examples/common.h
index dede803..dede803 100644
--- a/utils.h
+++ b/examples/common.h
diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt
new file mode 100644
index 0000000..88c425d
--- /dev/null
+++ b/examples/embedding/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET embedding)
+add_executable(${TARGET} embedding.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/embedding/README.md b/examples/embedding/README.md
new file mode 100644
index 0000000..21d8be6
--- /dev/null
+++ b/examples/embedding/README.md
@@ -0,0 +1,3 @@
+# embedding
+
+TODO
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
new file mode 100644
index 0000000..3015293
--- /dev/null
+++ b/examples/embedding/embedding.cpp
@@ -0,0 +1,106 @@
+#include "common.h"
+#include "llama.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <string>
+#include <vector>
+
+int main(int argc, char ** argv) {
+ gpt_params params;
+ params.model = "models/llama-7B/ggml-model.bin";
+
+ if (gpt_params_parse(argc, argv, params) == false) {
+ return 1;
+ }
+
+ params.embedding = true;
+
+ if (params.n_ctx > 2048) {
+ fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+ "expect poor results\n", __func__, params.n_ctx);
+ }
+
+ if (params.seed <= 0) {
+ params.seed = time(NULL);
+ }
+
+ fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+
+ std::mt19937 rng(params.seed);
+ if (params.random_prompt) {
+ params.prompt = gpt_random_prompt(rng);
+ }
+
+ llama_context * ctx;
+
+ // load the model
+ {
+ auto lparams = llama_context_default_params();
+
+ lparams.n_ctx = params.n_ctx;
+ lparams.n_parts = params.n_parts;
+ lparams.seed = params.seed;
+ lparams.f16_kv = params.memory_f16;
+ lparams.logits_all = params.perplexity;
+ lparams.use_mlock = params.use_mlock;
+ lparams.embedding = params.embedding;
+
+ ctx = llama_init_from_file(params.model.c_str(), lparams);
+
+ if (ctx == NULL) {
+ fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+ return 1;
+ }
+ }
+
+ // print system information
+ {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+ params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+ }
+
+ int n_past = 0;
+
+ // Add a space in front of the first character to match OG llama tokenizer behavior
+ params.prompt.insert(0, 1, ' ');
+
+ // tokenize the prompt
+ auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+
+ // determine newline token
+ auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
+
+ if (params.verbose_prompt) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+ fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+ for (int i = 0; i < (int) embd_inp.size(); i++) {
+ fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+ }
+ fprintf(stderr, "\n");
+ }
+
+ if (params.embedding){
+ if (embd_inp.size() > 0) {
+ if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
+ fprintf(stderr, "%s : failed to eval\n", __func__);
+ return 1;
+ }
+ }
+
+ const auto embeddings = llama_get_embeddings(ctx);
+
+ // TODO: print / use the embeddings
+ }
+
+ llama_print_timings(ctx);
+ llama_free(ctx);
+
+ return 0;
+}
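
The embedding tool stops at a TODO once the vector has been computed. A minimal sketch of the missing print step, which would replace that TODO inside the params.embedding block above and assumes llama_n_embd() is exposed by llama.h at this revision, could be:

    // assumption: llama_n_embd() returns the embedding dimension for this context
    const int n_embd = llama_n_embd(ctx);
    const float * embeddings = llama_get_embeddings(ctx);

    // print the embedding produced by the eval above, one float per dimension
    for (int i = 0; i < n_embd; i++) {
        printf("%f ", embeddings[i]);
    }
    printf("\n");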
diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt
new file mode 100644
index 0000000..b2dcc29
--- /dev/null
+++ b/examples/main/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET main)
+add_executable(${TARGET} main.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/main/README.md b/examples/main/README.md
new file mode 100644
index 0000000..4701aa5
--- /dev/null
+++ b/examples/main/README.md
@@ -0,0 +1,3 @@
+# main
+
+TODO
diff --git a/main.cpp b/examples/main/main.cpp
index 77260bb..b5f1a7b 100644
--- a/main.cpp
+++ b/examples/main/main.cpp
@@ -1,5 +1,4 @@
-#include "utils.h"
-#include "ggml.h"
+#include "common.h"
#include "llama.h"
#include <cassert>
@@ -65,79 +64,6 @@ void set_console_state(console_state new_st)
}
}
-std::vector<double> softmax(const std::vector<float>& logits) {
- std::vector<double> probs(logits.size());
- float max_logit = logits[0];
- for (float v : logits) max_logit = std::max(max_logit, v);
- double sum_exp = 0.0;
- for (size_t i = 0; i < logits.size(); i++) {
- // Subtract the maximum logit value from the current logit value for numerical stability
- float logit = logits[i] - max_logit;
- double exp_logit = std::exp(logit);
- sum_exp += exp_logit;
- probs[i] = exp_logit;
- }
- for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
- return probs;
-}
-
-void perplexity(llama_context * ctx, const gpt_params & params) {
- // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
- // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
- // Output: `perplexity: 13.5106 [114/114]`
- auto tokens = ::llama_tokenize(ctx, params.prompt, true);
-
- int count = 0;
- double nll = 0.0;
- int seq_count = tokens.size() / params.n_ctx;
-
- fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
-
- for (int i = 0; i < seq_count; ++i) {
- int start = i * params.n_ctx;
- int end = start + params.n_ctx - 1;
- std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
- auto start_t = std::chrono::high_resolution_clock::now();
- if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
- fprintf(stderr, "%s : failed to eval\n", __func__);
- return;
- }
- auto end_t = std::chrono::high_resolution_clock::now();
- if (i == 0) {
- double seconds = std::chrono::duration<double>(end_t - start_t).count();
- printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
- }
- // We get the logits for all the tokens in the context window (params.n_ctx)
- // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
- // calculate the perplexity over the last half the window (so the model always has
- // some context to predict the token).
- //
- // We rely on the fact that attention in the forward pass only looks at previous
- // tokens here, so the logits returned for each token are an accurate representation
- // of what the model would have predicted at that point.
- //
- // Example, we have a context window of 512, we will compute perplexity for each of the
- // last 256 tokens. Then, we split the input up into context window size chunks to
- // process the entire prompt.
-
- auto logits = llama_get_logits(ctx);
- for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
- // Calculate probability of next token, given the previous ones.
- int n_vocab = llama_n_vocab(ctx);
- std::vector<float> tok_logits(
- logits + j * n_vocab,
- logits + (j + 1) * n_vocab);
- double prob = softmax(tok_logits)[tokens[start + j + 1]];
- nll += -std::log(prob);
- ++count;
- }
- // perplexity is e^(average negative log-likelihood)
- printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
- fflush(stdout);
- }
- printf("\n");
-}
-
static bool is_interacting = false;
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
@@ -155,9 +81,6 @@ void sigint_handler(int signo) {
#endif
int main(int argc, char ** argv) {
- // has to be called once at the start of the program to init ggml stuff
- ggml_time_init();
-
gpt_params params;
params.model = "models/llama-7B/ggml-model.bin";
@@ -165,6 +88,14 @@ int main(int argc, char ** argv) {
return 1;
}
+ if (params.perplexity) {
+ printf("\n************\n");
+ printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+ printf("************\n\n");
+
+ return 0;
+ }
+
if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
@@ -198,9 +129,7 @@ int main(int argc, char ** argv) {
lparams.n_parts = params.n_parts;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
- lparams.logits_all = params.perplexity;
lparams.use_mlock = params.use_mlock;
- lparams.embedding = params.embedding;
ctx = llama_init_from_file(params.model.c_str(), lparams);
@@ -236,11 +165,6 @@ int main(int argc, char ** argv) {
return 0;
}
- if (params.perplexity) {
- perplexity(ctx, params);
- exit(0);
- }
-
int n_past = 0;
// Add a space in front of the first character to match OG llama tokenizer behavior
@@ -346,27 +270,6 @@ int main(int argc, char ** argv) {
// the first thing we will do is to output the prompt, so set color accordingly
set_console_state(CONSOLE_STATE_PROMPT);
- if (params.embedding){
- embd = embd_inp;
-
- if (embd.size() > 0) {
- if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
- fprintf(stderr, "%s : failed to eval\n", __func__);
- return 1;
- }
- }
-
- const auto embeddings = llama_get_embeddings(ctx);
-
- // TODO: print / use the embeddings
-
- if (params.use_color) {
- printf(ANSI_COLOR_RESET);
- }
-
- return 0;
- }
-
while (remaining_tokens > 0 || params.interactive) {
// predict
if (embd.size() > 0) {
@@ -392,10 +295,6 @@ int main(int argc, char ** argv) {
auto logits = llama_get_logits(ctx);
if (params.ignore_eos) {
- // set the logit of the eos token to zero to avoid sampling it
- //logits[logits.size() - n_vocab + EOS_TOKEN_ID] = 0;
- // TODO: this does not work of params.logits_all == true
- assert(params.perplexity == false);
logits[llama_token_eos()] = 0;
}
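
With the perplexity and embedding code paths removed from main, the old ./main --perplexity invocation (still referenced in the comment kept in perplexity.cpp below) is replaced by the standalone binaries. For example, using the -m/-f/-p flags from the shared common argument parser:

    # previously: ./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw
    ./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw

    # the embedding path likewise lives in its own binary now
    ./embedding -m models/7B/ggml-model-q4_0.bin -p "some text to embed"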
diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt
new file mode 100644
index 0000000..5836df8
--- /dev/null
+++ b/examples/perplexity/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET perplexity)
+add_executable(${TARGET} perplexity.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/perplexity/README.md b/examples/perplexity/README.md
new file mode 100644
index 0000000..a932275
--- /dev/null
+++ b/examples/perplexity/README.md
@@ -0,0 +1,3 @@
+# perplexity
+
+TODO
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
new file mode 100644
index 0000000..f0266a0
--- /dev/null
+++ b/examples/perplexity/perplexity.cpp
@@ -0,0 +1,146 @@
+#include "common.h"
+#include "llama.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <vector>
+
+std::vector<double> softmax(const std::vector<float>& logits) {
+ std::vector<double> probs(logits.size());
+ float max_logit = logits[0];
+ for (float v : logits) max_logit = std::max(max_logit, v);
+ double sum_exp = 0.0;
+ for (size_t i = 0; i < logits.size(); i++) {
+ // Subtract the maximum logit value from the current logit value for numerical stability
+ float logit = logits[i] - max_logit;
+ double exp_logit = std::exp(logit);
+ sum_exp += exp_logit;
+ probs[i] = exp_logit;
+ }
+ for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
+ return probs;
+}
+
+void perplexity(llama_context * ctx, const gpt_params & params) {
+ // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+ // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+ // Output: `perplexity: 13.5106 [114/114]`
+ auto tokens = ::llama_tokenize(ctx, params.prompt, true);
+
+ int count = 0;
+ double nll = 0.0;
+ int seq_count = tokens.size() / params.n_ctx;
+
+ fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
+
+ for (int i = 0; i < seq_count; ++i) {
+ int start = i * params.n_ctx;
+ int end = start + params.n_ctx - 1;
+ std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
+ auto start_t = std::chrono::high_resolution_clock::now();
+ if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
+ fprintf(stderr, "%s : failed to eval\n", __func__);
+ return;
+ }
+ auto end_t = std::chrono::high_resolution_clock::now();
+ if (i == 0) {
+ double seconds = std::chrono::duration<double>(end_t - start_t).count();
+ printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
+ }
+ // We get the logits for all the tokens in the context window (params.n_ctx)
+ // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
+ // calculate the perplexity over the last half the window (so the model always has
+ // some context to predict the token).
+ //
+ // We rely on the fact that attention in the forward pass only looks at previous
+ // tokens here, so the logits returned for each token are an accurate representation
+ // of what the model would have predicted at that point.
+ //
+ // Example, we have a context window of 512, we will compute perplexity for each of the
+ // last 256 tokens. Then, we split the input up into context window size chunks to
+ // process the entire prompt.
+
+ auto logits = llama_get_logits(ctx);
+ for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
+ // Calculate probability of next token, given the previous ones.
+ int n_vocab = llama_n_vocab(ctx);
+ std::vector<float> tok_logits(
+ logits + j * n_vocab,
+ logits + (j + 1) * n_vocab);
+ double prob = softmax(tok_logits)[tokens[start + j + 1]];
+ nll += -std::log(prob);
+ ++count;
+ }
+ // perplexity is e^(average negative log-likelihood)
+ printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+ fflush(stdout);
+ }
+ printf("\n");
+}
+
+int main(int argc, char ** argv) {
+ gpt_params params;
+ params.model = "models/llama-7B/ggml-model.bin";
+
+ if (gpt_params_parse(argc, argv, params) == false) {
+ return 1;
+ }
+
+ params.perplexity = true;
+
+ if (params.n_ctx > 2048) {
+ fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+ "expect poor results\n", __func__, params.n_ctx);
+ }
+
+ if (params.seed <= 0) {
+ params.seed = time(NULL);
+ }
+
+ fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+
+ std::mt19937 rng(params.seed);
+ if (params.random_prompt) {
+ params.prompt = gpt_random_prompt(rng);
+ }
+
+ llama_context * ctx;
+
+ // load the model
+ {
+ auto lparams = llama_context_default_params();
+
+ lparams.n_ctx = params.n_ctx;
+ lparams.n_parts = params.n_parts;
+ lparams.seed = params.seed;
+ lparams.f16_kv = params.memory_f16;
+ lparams.logits_all = params.perplexity;
+ lparams.use_mlock = params.use_mlock;
+ lparams.embedding = params.embedding;
+
+ ctx = llama_init_from_file(params.model.c_str(), lparams);
+
+ if (ctx == NULL) {
+ fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+ return 1;
+ }
+ }
+
+ // print system information
+ {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+ params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+ }
+
+ perplexity(ctx, params);
+
+ llama_print_timings(ctx);
+ llama_free(ctx);
+
+ return 0;
+}
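
As computed in perplexity() above, where nll accumulates -log(prob) and the running result is printed as std::exp(nll / count), the reported value is the exponential of the mean negative log-likelihood over the scored tokens:

    \mathrm{PPL} = \exp\left( -\frac{1}{N} \sum_{i=1}^{N} \log p(x_i \mid x_{<i}) \right)

with N equal to count, i.e. the tokens in the second half of each context window, summed across all chunks.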
diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt
new file mode 100644
index 0000000..fb27d45
--- /dev/null
+++ b/examples/quantize/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET quantize)
+add_executable(${TARGET} quantize.cpp)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/quantize/README.md b/examples/quantize/README.md
new file mode 100644
index 0000000..f349e91
--- /dev/null
+++ b/examples/quantize/README.md
@@ -0,0 +1,3 @@
+# quantize
+
+TODO
diff --git a/quantize.cpp b/examples/quantize/quantize.cpp
index f0230f5..f0230f5 100644
--- a/quantize.cpp
+++ b/examples/quantize/quantize.cpp
diff --git a/ggml.c b/ggml.c
index 291e12a..b566b56 100644
--- a/ggml.c
+++ b/ggml.c
@@ -5741,8 +5741,8 @@ static bool ggml_compute_forward_mul_mat_use_blas(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
- const int ne00 = src0->ne[0];
- const int ne01 = src0->ne[1];
+ //const int ne00 = src0->ne[0];
+ //const int ne01 = src0->ne[1];
const int ne10 = src1->ne[0];
@@ -5776,16 +5776,16 @@ static void ggml_compute_forward_mul_mat_f32(
const int ne10 = src1->ne[0];
const int ne11 = src1->ne[1];
- const int ne12 = src1->ne[2];
- const int ne13 = src1->ne[3];
+ //const int ne12 = src1->ne[2];
+ //const int ne13 = src1->ne[3];
- const int ne0 = dst->ne[0];
- const int ne1 = dst->ne[1];
- const int ne2 = dst->ne[2];
- const int ne3 = dst->ne[3];
- const int ne = ne0*ne1*ne2*ne3;
+ //const int ne0 = dst->ne[0];
+ //const int ne1 = dst->ne[1];
+ //const int ne2 = dst->ne[2];
+ //const int ne3 = dst->ne[3];
+ //const int ne = ne0*ne1*ne2*ne3;
- const int nb00 = src0->nb[0];
+ //const int nb00 = src0->nb[0];
const int nb01 = src0->nb[1];
const int nb02 = src0->nb[2];
const int nb03 = src0->nb[3];
@@ -5947,7 +5947,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
const int ne1 = dst->ne[1];
const int ne2 = dst->ne[2];
const int ne3 = dst->ne[3];
- const int ne = ne0*ne1*ne2*ne3;
+ //const int ne = ne0*ne1*ne2*ne3;
const int nb00 = src0->nb[0];
const int nb01 = src0->nb[1];
@@ -6137,7 +6137,7 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
const int ne1 = dst->ne[1];
const int ne2 = dst->ne[2];
const int ne3 = dst->ne[3];
- const int ne = ne0*ne1*ne2*ne3;
+ //const int ne = ne0*ne1*ne2*ne3;
const int nb00 = src0->nb[0];
const int nb01 = src0->nb[1];
@@ -6322,7 +6322,7 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
const int ne1 = dst->ne[1];
const int ne2 = dst->ne[2];
const int ne3 = dst->ne[3];
- const int ne = ne0*ne1*ne2*ne3;
+ //const int ne = ne0*ne1*ne2*ne3;
const int nb00 = src0->nb[0];
const int nb01 = src0->nb[1];
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 6a4170f..b44d7fe 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,7 +1,7 @@
function(llama_add_test source)
get_filename_component(TEST_TARGET ${source} NAME_WE)
add_executable(${TEST_TARGET} ${source})
- target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils)
+ target_link_libraries(${TEST_TARGET} PRIVATE llama)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
endfunction()
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index 49bc232..3820553 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -1,9 +1,9 @@
-#include "utils.h"
#include "llama.h"
#include <cstdio>
#include <string>
#include <map>
+#include <vector>
static const std::map<std::string, std::vector<llama_token>> k_tests = {
{ "Hello World", { 1, 10994, 2787, }, },
@@ -48,7 +48,9 @@ int main(int argc, char **argv) {
}
for (const auto & test_kv : k_tests) {
- const auto res = ::llama_tokenize(ctx, test_kv.first, true);
+ std::vector<llama_token> res(test_kv.first.size());
+ const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true);
+ res.resize(n);
bool correct = res.size() == test_kv.second.size();