cuBLAS: refactor and optimize f16 mat mul performance (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device
* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16
* fix build
* cuBLAS: update block_q5_1
author: slaren <2141330+slaren@users.noreply.github.com> 2023-05-01 18:11:07 +0200
committer: GitHub <noreply@github.com> 2023-05-01 18:11:07 +0200
commit: 58b367c2d757c0ea12aec672382462b42204c724 (patch)
tree: b2fa89daf71c08788c44e3fb9abf1747ec8ee65d /ggml.h
parent: ea3a0ad6b6b5ca4693b94acd4cb32e2803f66fae (diff)
1 file changed, 11 insertions, 0 deletions
diff --git a/ggml.h b/ggml.h
index d6feacd..ef5a048 100644
--- a/ggml.h
+++ b/ggml.h
@@ -197,6 +197,14 @@
 #define GGML_MAX_OPT           4
 #define GGML_DEFAULT_N_THREADS 4
 
+#define GGML_ASSERT(x) \
+    do { \
+        if (!(x)) { \
+            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            abort(); \
+        } \
+    } while (0)
+
 #ifdef  __cplusplus
 extern "C" {
 #endif
@@ -212,6 +220,9 @@ extern "C" {
     GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
     GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
 
+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
+    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
+
     struct ggml_object;
     struct ggml_context;
author	slaren <2141330+slaren@users.noreply.github.com>	2023-05-01 18:11:07 +0200
committer	GitHub <noreply@github.com>	2023-05-01 18:11:07 +0200
commit	58b367c2d757c0ea12aec672382462b42204c724 (patch)
tree	b2fa89daf71c08788c44e3fb9abf1747ec8ee65d /ggml.h
parent	ea3a0ad6b6b5ca4693b94acd4cb32e2803f66fae (diff)