cuBLAS: use host pinned memory and dequantize while copying (#1207)

* cuBLAS: dequantize simultaneously while copying memory
* cuBLAS: use host pinned memory
* cuBLAS: improve ggml_compute_forward_mul_mat_f16_f32 with pinned memory
* cuBLAS: also pin kv cache
* fix rebase
author: slaren <2141330+slaren@users.noreply.github.com> 2023-04-29 02:04:18 +0200
committer: GitHub <noreply@github.com> 2023-04-29 02:04:18 +0200
commit: 7fc50c051ae8a78e9643fdf172d12e20f2dd9b6c (patch)
tree: cc017db2f3443a39221ad319ab51df0925012e84 /Makefile
parent: b1ee8f59b4101b46999a0995d9a34506f7285466 (diff)
1 files changed, 3 insertions, 2 deletions
diff --git a/Makefile b/Makefile
index 0715e85..5a1cb3e 100644
--- a/Makefile
+++ b/Makefile
@@ -106,6 +106,7 @@ ifdef LLAMA_OPENBLAS
 endif
 ifdef LLAMA_CUBLAS
 	CFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+	CXXFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	OBJS      += ggml-cuda.o
 	NVCC      = nvcc
@@ -164,10 +165,10 @@ $(info )
 # Build library
 #
 
-ggml.o: ggml.c ggml.h
+ggml.o: ggml.c ggml.h ggml-cuda.h
 	$(CC)  $(CFLAGS)   -c $< -o $@
 
-llama.o: llama.cpp ggml.h llama.h llama_util.h
+llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama_util.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 common.o: examples/common.cpp examples/common.h
author	slaren <2141330+slaren@users.noreply.github.com>	2023-04-29 02:04:18 +0200
committer	GitHub <noreply@github.com>	2023-04-29 02:04:18 +0200
commit	7fc50c051ae8a78e9643fdf172d12e20f2dd9b6c (patch)
tree	cc017db2f3443a39221ad319ab51df0925012e84 /Makefile
parent	b1ee8f59b4101b46999a0995d9a34506f7285466 (diff)