-rw-r--r--   ggml.c   160
-rw-r--r--   ggml.h    17
2 files changed, 175 insertions, 2 deletions
diff --git a/ggml.c b/ggml.c
--- a/ggml.c
+++ b/ggml.c
@@ -330,7 +330,7 @@ static ggml_fp16_t table_exp_f16[1 << 16];
 // precomputed f32 table for f16 (256 KB)
 static float table_f32_f16[1 << 16];
 
-#if defined(__ARM_NEON)
+#if defined(__ARM_NEON) || defined(__wasm_simd128__)
 #define B1(c,s,n)  0x ## n ## c ,  0x ## n ## s
 #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
 #define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
@@ -1087,7 +1087,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
             const v128_t v  = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id));
             const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));
             const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);
-            const v128_t vc = wasm_i32x4_min_u(vi, wasm_i32x4_splat(15));
+            const v128_t vc = wasm_i32x4_min(vi, wasm_i32x4_splat(15));
 
             y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vc, 0) | (wasm_i32x4_extract_lane(vc, 1) << 4);
             y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vc, 2) | (wasm_i32x4_extract_lane(vc, 3) << 4);
@@ -3180,6 +3180,72 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
     }
 
     *s = vaddvq_f32(sumv);
+#elif defined(__wasm_simd128__)
+    v128_t sumv = wasm_f32x4_splat(0.0f);
+
+    uint64_t tmp[4];
+
+    for (int i = 0; i < nb; ++i) {
+        const block_q5_0 * restrict x0 = &x[i];
+        const block_q8_0 * restrict y0 = &y[i];
+
+        const v128_t m4b  = wasm_i8x16_splat(0x0F);
+        const v128_t s16b = wasm_i8x16_splat(0x10);
+
+        // extract the 5th bit
+        uint32_t qh;
+        memcpy(&qh, x0->qh, sizeof(qh));
+
+        tmp[0] = table_b2b_u[(qh >>  0) & 0xFF];
+        tmp[1] = table_b2b_u[(qh >>  8) & 0xFF];
+        tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_u[(qh >> 24)       ];
+
+        const v128_t qhl = wasm_v128_load(tmp + 0);
+        const v128_t qhh = wasm_v128_load(tmp + 2);
+
+        const v128_t v0 = wasm_v128_load(x0->qs);
+
+        // 4-bit -> 8-bit
+        const v128_t v0l = wasm_v128_and (v0, m4b);
+        const v128_t v0h = wasm_u8x16_shr(v0, 4);
+
+        // interleave
+        const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h,  0, 16,  1, 17,  2, 18,  3, 19,  4, 20,  5, 21,  6, 22,  7, 23);
+        const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h,  8, 24,  9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+
+        // add high bit and sub 16
+        const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b);
+        const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b);
+
+        // load y
+        const v128_t v1l = wasm_v128_load(y0->qs);
+        const v128_t v1h = wasm_v128_load(y0->qs + 16);
+
+        // int8x16 -> int16x8
+        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
+        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
+        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
+        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
+
+        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
+        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
+        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
+        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
+
+        const float x0d = GGML_FP16_TO_FP32(x0->d);
+
+        // dot product
+        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
+                        wasm_i32x4_add(
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
+                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
+                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
+    }
+
+    *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
+         wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
 #elif defined(__AVX2__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
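For reference, the table_b2b_u lookups above spread the 32 packed high bits of qh into 32 bytes, one per quant, with each bit pre-shifted to nibble position 4 so a single wasm_v128_or completes the 5-bit values before the signed recentering by s16b. A minimal scalar sketch of what one table entry is assumed to compute (b2b_u_ref is a hypothetical name; the layout is inferred from how tmp is indexed and loaded):

#include <stdint.h>

// Assumed layout of table_b2b_u[b]: byte k holds ((b >> k) & 1) << 4,
// i.e. the k-th high bit already shifted to bit position 4 of its quant.
static uint64_t b2b_u_ref(uint8_t b) {
    uint64_t r = 0;
    for (int k = 0; k < 8; ++k) {
        r |= (uint64_t)(((b >> k) & 1) << 4) << (8 * k);
    }
    return r;
}

Under that layout, byte j of qhl/qhh carries the 5th bit of element j, matching the element order that the two interleaving shuffles restore from the low/high nibble halves.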
@@ -3311,6 +3377,75 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
     }
 
     *s = vaddvq_f32(sumv) + summs;
+#elif defined(__wasm_simd128__)
+    v128_t sumv = wasm_f32x4_splat(0.0f);
+
+    float summs = 0.0f;
+
+    uint64_t tmp[4];
+
+    for (int i = 0; i < nb; ++i) {
+        const block_q5_1 * restrict x0 = &x[i];
+        const block_q8_1 * restrict y0 = &y[i];
+
+        summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1);
+
+        const v128_t m4b = wasm_i8x16_splat(0x0F);
+
+        // extract the 5th bit
+        uint32_t qh;
+        memcpy(&qh, x0->qh, sizeof(qh));
+
+        tmp[0] = table_b2b_u[(qh >>  0) & 0xFF];
+        tmp[1] = table_b2b_u[(qh >>  8) & 0xFF];
+        tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_u[(qh >> 24)       ];
+
+        const v128_t qhl = wasm_v128_load(tmp + 0);
+        const v128_t qhh = wasm_v128_load(tmp + 2);
+
+        const v128_t v0 = wasm_v128_load(x0->qs);
+
+        // 4-bit -> 8-bit
+        const v128_t v0l = wasm_v128_and (v0, m4b);
+        const v128_t v0h = wasm_u8x16_shr(v0, 4);
+
+        // interleave
+        const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h,  0, 16,  1, 17,  2, 18,  3, 19,  4, 20,  5, 21,  6, 22,  7, 23);
+        const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h,  8, 24,  9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+
+        // add high bit
+        const v128_t v0lf = wasm_v128_or(v0lz, qhl);
+        const v128_t v0hf = wasm_v128_or(v0hz, qhh);
+
+        // load y
+        const v128_t v1l = wasm_v128_load(y0->qs);
+        const v128_t v1h = wasm_v128_load(y0->qs + 16);
+
+        // int8x16 -> int16x8
+        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
+        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
+        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
+        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
+
+        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
+        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
+        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
+        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
+
+        const float x0d = GGML_FP16_TO_FP32(x0->d);
+
+        // dot product
+        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
+                        wasm_i32x4_add(
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
+                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
+                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
+    }
+
+    *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
+         wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
 #elif defined(__AVX2__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
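The q5_1 path differs from q5_0 in one way worth spelling out: elements are stored as unsigned d*q + m with q in [0, 31], so the dot product decomposes as d*yd*sum(q*y) + m*(yd*sum(y)), and the q8_1 block caches the two d-scaled half-sums of y as s0 and s1. That is what the summs accumulation uses, and why this kernel needs no s16b subtraction. A scalar sketch of one block's contribution (q5_1_block_dot_ref is hypothetical; it assumes the interleaved nibble order implied by the shuffles and that ys0/ys1 are the pre-scaled half-sums):

#include <stdint.h>
#include <string.h>

// One q5_1 x q8_1 block: byte j of qs holds quants 2j (low nibble) and
// 2j+1 (high nibble); bit j of qh is the 5th bit of quant j.
static float q5_1_block_dot_ref(const uint8_t qs[16], const uint8_t qh_bytes[4],
                                float d, float m, const int8_t y[32],
                                float yd, float ys0, float ys1) {
    uint32_t qh;
    memcpy(&qh, qh_bytes, sizeof(qh));

    int sumi = 0;
    for (int j = 0; j < 16; ++j) {
        const int q0 = (qs[j] & 0x0F) | (((qh >> (2*j + 0)) & 1) << 4);
        const int q1 = (qs[j] >>   4) | (((qh >> (2*j + 1)) & 1) << 4);
        sumi += q0 * y[2*j + 0] + q1 * y[2*j + 1];
    }

    // (d*q + m) . (yd*y) = d*yd*sum(q*y) + m*(yd*sum(y))
    return d * yd * (float)sumi + m * (ys0 + ys1);
}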
@@ -4057,6 +4192,27 @@ bool ggml_is_quantized(enum ggml_type type) {
     return GGML_IS_QUANTIZED[type];
 }
 
+enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
+    enum ggml_type wtype = GGML_TYPE_COUNT;
+
+    switch (ftype) {
+        case GGML_FTYPE_ALL_F32:              wtype = GGML_TYPE_F32;   break;
+        case GGML_FTYPE_MOSTLY_F16:           wtype = GGML_TYPE_F16;   break;
+        case GGML_FTYPE_MOSTLY_Q4_0:          wtype = GGML_TYPE_Q4_0;  break;
+        case GGML_FTYPE_MOSTLY_Q4_1:          wtype = GGML_TYPE_Q4_1;  break;
+        case GGML_FTYPE_MOSTLY_Q4_2:          wtype = GGML_TYPE_Q4_2;  break;
+        case GGML_FTYPE_MOSTLY_Q5_0:          wtype = GGML_TYPE_Q5_0;  break;
+        case GGML_FTYPE_MOSTLY_Q5_1:          wtype = GGML_TYPE_Q5_1;  break;
+        case GGML_FTYPE_MOSTLY_Q8_0:          wtype = GGML_TYPE_Q8_0;  break;
+        case GGML_FTYPE_UNKNOWN:              wtype = GGML_TYPE_COUNT; break;
+        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
+    }
+
+    GGML_ASSERT(wtype != GGML_TYPE_COUNT);
+
+    return wtype;
+}
+
 static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }
diff --git a/ggml.h b/ggml.h
--- a/ggml.h
+++ b/ggml.h
@@ -232,6 +232,20 @@ extern "C" {
         GGML_TYPE_COUNT,
     };
 
+    // model file types
+    enum ggml_ftype {
+        GGML_FTYPE_UNKNOWN     = -1,
+        GGML_FTYPE_ALL_F32     = 0,
+        GGML_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        GGML_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
+    };
+
     // available tensor operations:
     enum ggml_op {
         GGML_OP_NONE = 0,
@@ -385,6 +399,9 @@ extern "C" {
 
     GGML_API bool    ggml_is_quantized(enum ggml_type type);
 
+    // TODO: temporary until model loading of ggml examples is refactored
+    GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
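The new ggml_ftype_to_ggml_type helper targets the example loaders: they read a single per-file ftype and need the per-tensor storage type. A hypothetical use, with weight_type purely illustrative (the "except 1d tensors" comments mean 1d tensors such as norms and biases stay f32):

#include "ggml.h"

static enum ggml_type weight_type(enum ggml_ftype ftype, int n_dims) {
    if (n_dims == 1) {
        return GGML_TYPE_F32; // 1d tensors are not quantized
    }
    // aborts via GGML_ASSERT for GGML_FTYPE_UNKNOWN and
    // GGML_FTYPE_MOSTLY_Q4_1_SOME_F16, which have no single tensor type
    return ggml_ftype_to_ggml_type(ftype);
}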
