diff options
author | slaren <2141330+slaren@users.noreply.github.com> | 2023-03-28 16:26:55 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-03-28 17:26:55 +0300 |
commit | a6bdc47cba23713a22ade47dd65b6afeb8009ff4 (patch) | |
tree | f13eebd524401b4dca3b57e698435700348397b0 | |
parent | 7b8dbcb78b2f65c4676e41da215800d65846edd0 (diff) |
Fix usage of F16C intrinsics in AVX code (#563)
* Fix usage of F16C intrinsics in AVX code when F16C is not defined
-rw-r--r-- | ggml.c | 25 |
1 files changed, 24 insertions, 1 deletions
@@ -1122,13 +1122,36 @@ void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) { #define GGML_F16_EPR 8 // F16 arithmetic is not supported by AVX, so we use F32 instead -// we take advantage of the _mm256_cvt intrinsics to convert F16 <-> F32 #define GGML_F32Cx8 __m256 #define GGML_F32Cx8_ZERO _mm256_setzero_ps() #define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x) + +#if defined(__F16C__) +// the _mm256_cvt intrinsics require F16C #define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x))) #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) +#else +static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) { + float tmp[8]; + + for (int i = 0; i < 8; i++) + tmp[i] = GGML_FP16_TO_FP32(x[i]); + + return _mm256_loadu_ps(tmp); +} +static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { + float arr[8]; + + _mm256_storeu_ps(arr, y); + + for (int i = 0; i < 8; i++) + x[i] = GGML_FP16_TO_FP32(arr[i]); +} +#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x) +#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y) +#endif + #define GGML_F32Cx8_FMA GGML_F32x8_FMA #define GGML_F32Cx8_ADD _mm256_add_ps #define GGML_F32Cx8_MUL _mm256_mul_ps |