Improve cuBLAS performance by dequantizing on the GPU (#1065)

author: slaren <2141330+slaren@users.noreply.github.com> 2023-04-20 03:14:14 +0200
committer: GitHub <noreply@github.com> 2023-04-20 03:14:14 +0200
commit: 02d6988121510c067e06d498a273a351a888f5b9 (patch)
tree: 98c6204ad4f3db40bc49595bb7705e8bcd699e5d /ggml-cuda.h
parent: 834695fe3a3ed2a962e774c9615e3f7b41d360a8 (diff)
1 files changed, 11 insertions, 0 deletions
diff --git a/ggml-cuda.h b/ggml-cuda.h
new file mode 100644
index 0000000..646caaf
--- /dev/null
+++ b/ggml-cuda.h
@@ -0,0 +1,11 @@
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+
+#ifdef  __cplusplus
+}
+#endif
author	slaren <2141330+slaren@users.noreply.github.com>	2023-04-20 03:14:14 +0200
committer	GitHub <noreply@github.com>	2023-04-20 03:14:14 +0200
commit	02d6988121510c067e06d498a273a351a888f5b9 (patch)
tree	98c6204ad4f3db40bc49595bb7705e8bcd699e5d /ggml-cuda.h
parent	834695fe3a3ed2a962e774c9615e3f7b41d360a8 (diff)