From 7fc50c051ae8a78e9643fdf172d12e20f2dd9b6c Mon Sep 17 00:00:00 2001 From: slaren <2141330+slaren@users.noreply.github.com> Date: Sat, 29 Apr 2023 02:04:18 +0200 Subject: cuBLAS: use host pinned memory and dequantize while copying (#1207) * cuBLAS: dequantize simultaneously while copying memory * cuBLAS: use host pinned memory * cuBLAS: improve ggml_compute_forward_mul_mat_f16_f32 with pinned memory * cuBLAS: also pin kv cache * fix rebase --- Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'Makefile') diff --git a/Makefile b/Makefile index 0715e85..5a1cb3e 100644 --- a/Makefile +++ b/Makefile @@ -106,6 +106,7 @@ ifdef LLAMA_OPENBLAS endif ifdef LLAMA_CUBLAS CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include + CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib OBJS += ggml-cuda.o NVCC = nvcc @@ -164,10 +165,10 @@ $(info ) # Build library # -ggml.o: ggml.c ggml.h +ggml.o: ggml.c ggml.h ggml-cuda.h $(CC) $(CFLAGS) -c $< -o $@ -llama.o: llama.cpp ggml.h llama.h llama_util.h +llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama_util.h $(CXX) $(CXXFLAGS) -c $< -o $@ common.o: examples/common.cpp examples/common.h -- cgit v1.2.3