diff options
author | Johannes Gäßler <johannesg@5d6.de> | 2023-07-29 23:04:44 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-07-29 23:04:44 +0200 |
commit | 11f3ca06b8c66b0427aab0a472479da22553b472 (patch) | |
tree | 8e934ff0d93a78447d996b00561f7ff826c3533f /CMakeLists.txt | |
parent | 9baf9ef304f330009d5a93b7390280a0fd27c9a1 (diff) |
CUDA: Quantized matrix matrix multiplication (#2160)
* mmq implementation for non k-quants
* q6_K
* q2_K
* q3_k
* q4_K
* vdr
* q5_K
* faster q8_1 loading
* loop unrolling
* add __restrict__
* q2_K sc_high
* GGML_CUDA_MMQ_Y
* Updated Makefile
* Update Makefile
* DMMV_F16 -> F16
* Updated README, CMakeLists
* Fix CMakeLists.txt
* Fix CMakeLists.txt
* Fix multi GPU out-of-bounds
Diffstat (limited to 'CMakeLists.txt')
-rw-r--r-- | CMakeLists.txt | 8 |
1 files changed, 7 insertions, 1 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index c43e65e..6e1abea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,7 +67,9 @@ endif() option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) option(LLAMA_BLAS "llama: use BLAS" OFF) set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") -option(LLAMA_CUBLAS "llama: use cuBLAS" OFF) +option(LLAMA_CUBLAS "llama: use CUDA" OFF) +option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF) +set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA kernels") option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels") @@ -251,6 +253,10 @@ if (LLAMA_CUBLAS) set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h) add_compile_definitions(GGML_USE_CUBLAS) + if (LLAMA_CUDA_CUBLAS) + add_compile_definitions(GGML_CUDA_CUBLAS) + endif() + add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y}) if (LLAMA_CUDA_FORCE_DMMV) add_compile_definitions(GGML_CUDA_FORCE_DMMV) endif() |