aboutsummaryrefslogtreecommitdiff
path: root/Makefile
diff options
context:
space:
mode:
authorKawrakow <48489457+ikawrakow@users.noreply.github.com>2023-06-16 20:08:44 +0300
committerGitHub <noreply@github.com>2023-06-16 20:08:44 +0300
commit3d0112261042b356621e93db3fa4c6798a5d098f (patch)
tree3634baa70ed23142f86c5a44701bbf4b0971c2fd /Makefile
parent602c748863e15270d80d74aa2c3bf86ab8139e07 (diff)
CUDA : faster k-quant dot kernels (#1862)
* cuda : faster k-quant dot kernels * Imrove Q2_K dot kernel on older GPUs We now have a K_QUANTS_PER_ITERATION macro, which should be set to 1 on older and to 2 on newer GPUs. With this, we preserve the performance of the original PR on RTX-4080, and are faster compared to master on GTX-1660. * Imrove Q6_K dot kernel on older GPUs Using the same K_QUANTS_PER_ITERATION macro as last commit, we preserve performance on RTX-4080 and speed up Q6_K on a GTX-1660. * Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile Allowed values are 1 or 2. 2 gives the best performance on modern GPUs and is set as default. On older GPUs 1 may work better. * PR comments --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'Makefile')
-rw-r--r--Makefile5
1 files changed, 5 insertions, 0 deletions
diff --git a/Makefile b/Makefile
index 09c8834..b24caf8 100644
--- a/Makefile
+++ b/Makefile
@@ -171,6 +171,11 @@ ifdef LLAMA_CUDA_DMMV_Y
else
NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
endif # LLAMA_CUDA_DMMV_Y
+ifdef LLAMA_CUDA_KQUANTS_ITER
+ NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
+else
+ NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
+endif
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
endif # LLAMA_CUBLAS