-rw-r--r--   ggml.h   1273
1 file changed, 646 insertions(+), 627 deletions(-)
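The first hunk below introduces the GGML_SHARED / GGML_BUILD / GGML_API export macro. As a rough illustration of how the macro resolves for a consumer (the compile flags and the tiny program are assumptions for the example, not part of this change):

// Hypothetical consumer of ggml built as a shared library.
//
//   library build:  -DGGML_SHARED -DGGML_BUILD  -> GGML_API is
//                   __declspec(dllexport) on Windows (non-MinGW),
//                   __attribute__ ((visibility ("default"))) elsewhere
//   consumer build: -DGGML_SHARED                -> __declspec(dllimport)
//                   on Windows (non-MinGW), default visibility elsewhere
//   static build:   GGML_SHARED undefined        -> GGML_API expands to nothing
#include <stdio.h>
#include "ggml.h"

int main(void) {
    // every GGML_API function is declared the same way regardless of linkage mode
    printf("avx=%d neon=%d blas=%d cublas=%d\n",
           ggml_cpu_has_avx(), ggml_cpu_has_neon(),
           ggml_cpu_has_blas(), ggml_cpu_has_cublas());
    return 0;
}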
| @@ -169,14 +169,27 @@  //  // -#ifdef  __cplusplus -extern "C" { +#ifdef GGML_SHARED +#    if defined(_WIN32) && !defined(__MINGW32__) +#        ifdef GGML_BUILD +#            define GGML_API __declspec(dllexport) +#        else +#            define GGML_API __declspec(dllimport) +#        endif +#    else +#        define GGML_API __attribute__ ((visibility ("default"))) +#    endif +#else +#    define GGML_API  #endif  #include <stdint.h>  #include <stddef.h>  #include <stdbool.h> +#define GGML_FILE_MAGIC   0x67676d6c // "ggml" +#define GGML_FILE_VERSION 1 +  #define GGML_MAX_DIMS          4  #define GGML_MAX_NODES         4096  #define GGML_MAX_PARAMS        16 @@ -184,682 +197,688 @@ extern "C" {  #define GGML_MAX_OPT           4  #define GGML_DEFAULT_N_THREADS 4 +#ifdef  __cplusplus +extern "C" { +#endif +  #ifdef __ARM_NEON -// we use the built-in 16-bit float type -typedef __fp16 ggml_fp16_t; +    // we use the built-in 16-bit float type +    typedef __fp16 ggml_fp16_t;  #else -typedef uint16_t ggml_fp16_t; +    typedef uint16_t ggml_fp16_t;  #endif -// convert FP16 <-> FP32 -float       ggml_fp16_to_fp32(ggml_fp16_t x); -ggml_fp16_t ggml_fp32_to_fp16(float x); - -struct ggml_object; -struct ggml_context; - -enum ggml_type { -    // explicitly numbered values are used in llama.cpp files -    GGML_TYPE_F32  = 0, -    GGML_TYPE_F16  = 1, -    GGML_TYPE_Q4_0 = 2, -    GGML_TYPE_Q4_1 = 3, -    GGML_TYPE_Q4_2 = 4, -    GGML_TYPE_Q4_3 = 5, -    GGML_TYPE_Q8_0 = 6, -    GGML_TYPE_I8, -    GGML_TYPE_I16, -    GGML_TYPE_I32, -    GGML_TYPE_COUNT, -}; - -// available tensor operations: -enum ggml_op { -    GGML_OP_NONE = 0, - -    GGML_OP_DUP, -    GGML_OP_ADD, -    GGML_OP_SUB, -    GGML_OP_MUL, -    GGML_OP_DIV, -    GGML_OP_SQR, -    GGML_OP_SQRT, -    GGML_OP_SUM, -    GGML_OP_MEAN, -    GGML_OP_REPEAT, -    GGML_OP_ABS, -    GGML_OP_SGN, -    GGML_OP_NEG, -    GGML_OP_STEP, -    GGML_OP_RELU, -    GGML_OP_GELU, -    GGML_OP_SILU, -    GGML_OP_NORM, // normalize -    GGML_OP_RMS_NORM, - -    GGML_OP_MUL_MAT, - -    GGML_OP_SCALE, -    GGML_OP_CPY, -    GGML_OP_CONT, -    GGML_OP_RESHAPE, -    GGML_OP_VIEW, -    GGML_OP_PERMUTE, -    GGML_OP_TRANSPOSE, -    GGML_OP_GET_ROWS, -    GGML_OP_DIAG_MASK_INF, -    GGML_OP_SOFT_MAX, -    GGML_OP_ROPE, -    GGML_OP_CONV_1D_1S, -    GGML_OP_CONV_1D_2S, - -    GGML_OP_FLASH_ATTN, -    GGML_OP_FLASH_FF, - -    GGML_OP_MAP_UNARY, -    GGML_OP_MAP_BINARY, - -    GGML_OP_COUNT, -}; - - -// ggml object -struct ggml_object { -    size_t offs; -    size_t size; - -    struct ggml_object * next; - -    char padding[8]; -}; - -static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); - -// n-dimensional tensor -struct ggml_tensor { -    enum ggml_type type; - -    int    n_dims; -    int64_t ne[GGML_MAX_DIMS]; // number of elements -    size_t  nb[GGML_MAX_DIMS]; // stride in bytes: -                               // nb[0] = sizeof(type) -                               // nb[1] = nb[0]   * ne[0] + padding -                               // nb[i] = nb[i-1] * ne[i-1] - -    // compute data -    enum ggml_op op; - -    bool is_param; - -    struct ggml_tensor * grad; -    struct ggml_tensor * src0; -    struct ggml_tensor * src1; -    struct ggml_tensor * opt[GGML_MAX_OPT]; - -    // thread scheduling -    int n_tasks; - -    // performance -    int     perf_runs; -    int64_t perf_cycles; -    int64_t perf_time_us; - -    void * data; -    char padding[8]; -}; - -// computation graph -struct ggml_cgraph { -    int n_nodes; -    int n_leafs; -    int 
n_threads; - -    size_t work_size; -    struct ggml_tensor * work; - -    struct ggml_tensor * nodes[GGML_MAX_NODES]; -    struct ggml_tensor * grads[GGML_MAX_NODES]; -    struct ggml_tensor * leafs[GGML_MAX_NODES]; - -    // performance -    int     perf_runs; -    int64_t perf_cycles; -    int64_t perf_time_us; -}; - -// scratch buffer -struct ggml_scratch { -    size_t offs; -    size_t size; -    void * data; -}; +    // convert FP16 <-> FP32 +    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x); +    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x); + +    struct ggml_object; +    struct ggml_context; + +    enum ggml_type { +        GGML_TYPE_F32  = 0, +        GGML_TYPE_F16  = 1, +        GGML_TYPE_Q4_0 = 2, +        GGML_TYPE_Q4_1 = 3, +        GGML_TYPE_Q4_2 = 4, +        GGML_TYPE_Q4_3 = 5, +        GGML_TYPE_Q8_0 = 6, +        GGML_TYPE_I8, +        GGML_TYPE_I16, +        GGML_TYPE_I32, +        GGML_TYPE_COUNT, +    }; + +    // available tensor operations: +    enum ggml_op { +        GGML_OP_NONE = 0, + +        GGML_OP_DUP, +        GGML_OP_ADD, +        GGML_OP_SUB, +        GGML_OP_MUL, +        GGML_OP_DIV, +        GGML_OP_SQR, +        GGML_OP_SQRT, +        GGML_OP_SUM, +        GGML_OP_MEAN, +        GGML_OP_REPEAT, +        GGML_OP_ABS, +        GGML_OP_SGN, +        GGML_OP_NEG, +        GGML_OP_STEP, +        GGML_OP_RELU, +        GGML_OP_GELU, +        GGML_OP_SILU, +        GGML_OP_NORM, // normalize +        GGML_OP_RMS_NORM, + +        GGML_OP_MUL_MAT, + +        GGML_OP_SCALE, +        GGML_OP_CPY, +        GGML_OP_CONT, +        GGML_OP_RESHAPE, +        GGML_OP_VIEW, +        GGML_OP_PERMUTE, +        GGML_OP_TRANSPOSE, +        GGML_OP_GET_ROWS, +        GGML_OP_DIAG_MASK_INF, +        GGML_OP_SOFT_MAX, +        GGML_OP_ROPE, +        GGML_OP_CONV_1D_1S, +        GGML_OP_CONV_1D_2S, + +        GGML_OP_FLASH_ATTN, +        GGML_OP_FLASH_FF, + +        GGML_OP_MAP_UNARY, +        GGML_OP_MAP_BINARY, + +        GGML_OP_COUNT, +    }; + + +    // ggml object +    struct ggml_object { +        size_t offs; +        size_t size; + +        struct ggml_object * next; + +        char padding[8]; +    }; + +    static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); + +    // n-dimensional tensor +    struct ggml_tensor { +        enum ggml_type type; + +        int     n_dims; +        int64_t ne[GGML_MAX_DIMS]; // number of elements +        size_t  nb[GGML_MAX_DIMS]; // stride in bytes: +                                   // nb[0] = sizeof(type) +                                   // nb[1] = nb[0]   * ne[0] + padding +                                   // nb[i] = nb[i-1] * ne[i-1] + +        // compute data +        enum ggml_op op; + +        bool is_param; + +        struct ggml_tensor * grad; +        struct ggml_tensor * src0; +        struct ggml_tensor * src1; +        struct ggml_tensor * opt[GGML_MAX_OPT]; + +        // thread scheduling +        int n_tasks; + +        // performance +        int     perf_runs; +        int64_t perf_cycles; +        int64_t perf_time_us; + +        void * data; +        char padding[8]; +    }; + +    // computation graph +    struct ggml_cgraph { +        int n_nodes; +        int n_leafs; +        int n_threads; + +        size_t work_size; +        struct ggml_tensor * work; + +        struct ggml_tensor * nodes[GGML_MAX_NODES]; +        struct ggml_tensor * grads[GGML_MAX_NODES]; +        struct ggml_tensor * leafs[GGML_MAX_NODES]; + +        // performance +        int     perf_runs; +        
int64_t perf_cycles; +        int64_t perf_time_us; +    }; + +    // scratch buffer +    struct ggml_scratch { +        size_t offs; +        size_t size; +        void * data; +    }; -struct ggml_init_params { -    // memory pool -    size_t mem_size;   // bytes -    void * mem_buffer; // if NULL, memory will be allocated internally -    bool   no_alloc;   // don't allocate memory for the tensor data -}; +    struct ggml_init_params { +        // memory pool +        size_t mem_size;   // bytes +        void * mem_buffer; // if NULL, memory will be allocated internally +        bool   no_alloc;   // don't allocate memory for the tensor data +    }; -void    ggml_time_init(void); // call this once at the beginning of the program -int64_t ggml_time_ms(void); -int64_t ggml_time_us(void); -int64_t ggml_cycles(void); -int64_t ggml_cycles_per_ms(void); +    // misc -void ggml_print_object (const struct ggml_object * obj); -void ggml_print_objects(const struct ggml_context * ctx); +    GGML_API void    ggml_time_init(void); // call this once at the beginning of the program +    GGML_API int64_t ggml_time_ms(void); +    GGML_API int64_t ggml_time_us(void); +    GGML_API int64_t ggml_cycles(void); +    GGML_API int64_t ggml_cycles_per_ms(void); -int64_t ggml_nelements(const struct ggml_tensor * tensor); -size_t  ggml_nbytes   (const struct ggml_tensor * tensor); +    GGML_API void    ggml_print_object (const struct ggml_object * obj); +    GGML_API void    ggml_print_objects(const struct ggml_context * ctx); -int    ggml_blck_size (enum ggml_type type); -size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block -float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float +    GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor); +    GGML_API size_t  ggml_nbytes   (const struct ggml_tensor * tensor); -const char * ggml_type_name(enum ggml_type type); +    GGML_API int     ggml_blck_size (enum ggml_type type); +    GGML_API size_t  ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block +    GGML_API float   ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float -size_t ggml_element_size(const struct ggml_tensor * tensor); +    GGML_API const char * ggml_type_name(enum ggml_type type); -bool ggml_is_quantized(enum ggml_type type); +    GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor); -struct ggml_context * ggml_init(struct ggml_init_params params); -void ggml_free(struct ggml_context * ctx); +    GGML_API bool    ggml_is_quantized(enum ggml_type type); -size_t ggml_used_mem(const struct ggml_context * ctx); +    // main -size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); +    GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); +    GGML_API void    ggml_free(struct ggml_context * ctx); -struct ggml_tensor * ggml_new_tensor( -        struct ggml_context * ctx, -        enum   ggml_type type, -        int    n_dims, -        const int64_t *ne); - -struct ggml_tensor * ggml_new_tensor_1d( -        struct ggml_context * ctx, -        enum   ggml_type type, -        int64_t ne0); - -struct ggml_tensor * ggml_new_tensor_2d( -        struct ggml_context * ctx, -        enum   ggml_type type, -        int64_t ne0, -        int64_t ne1); - -struct ggml_tensor * ggml_new_tensor_3d( -        struct ggml_context * ctx, -        enum   ggml_type type, -        int64_t ne0, -        int64_t ne1, -        
int64_t ne2); - -struct ggml_tensor * ggml_new_tensor_4d( -        struct ggml_context * ctx, -        enum   ggml_type type, -        int64_t ne0, -        int64_t ne1, -        int64_t ne2, -        int64_t ne3); - -struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); -struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); - -struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); -struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src); - -struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); -struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); -struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); - -int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); -void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); - -float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); -void  ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); - - void * ggml_get_data    (const struct ggml_tensor * tensor); -float * ggml_get_data_f32(const struct ggml_tensor * tensor); - -// -// operations on tensors with backpropagation -// - -struct ggml_tensor * ggml_dup( -        struct ggml_context * ctx, -        struct ggml_tensor  * a); - -struct ggml_tensor * ggml_add( -        struct ggml_context * ctx, -        struct ggml_tensor  * a, -        struct ggml_tensor  * b); +    GGML_API size_t  ggml_used_mem(const struct ggml_context * ctx); +    GGML_API size_t  ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); -struct ggml_tensor * ggml_add_inplace( -        struct ggml_context * ctx, -        struct ggml_tensor  * a, -        struct ggml_tensor  * b); +    GGML_API struct ggml_tensor * ggml_new_tensor( +            struct ggml_context * ctx, +            enum   ggml_type type, +            int    n_dims, +            const int64_t *ne); -struct ggml_tensor * ggml_sub( -        struct ggml_context * ctx, -        struct ggml_tensor  * a, -        struct ggml_tensor  * b); +    GGML_API struct ggml_tensor * ggml_new_tensor_1d( +            struct ggml_context * ctx, +            enum   ggml_type type, +            int64_t ne0); -struct ggml_tensor * ggml_mul( -        struct ggml_context * ctx, -        struct ggml_tensor  * a, -        struct ggml_tensor  * b); +    GGML_API struct ggml_tensor * ggml_new_tensor_2d( +            struct ggml_context * ctx, +            enum   ggml_type type, +            int64_t ne0, +            int64_t ne1); -struct ggml_tensor * ggml_div( -        struct ggml_context * ctx, -        struct ggml_tensor  * a, -        struct ggml_tensor  * b); - -struct ggml_tensor * ggml_sqr( -        struct ggml_context * ctx, -        struct ggml_tensor  * a); - -struct ggml_tensor * ggml_sqrt( -        struct ggml_context * ctx, -        struct ggml_tensor  * a); - -// return scalar -// TODO: compute sum along rows -struct ggml_tensor * ggml_sum( -        struct ggml_context * ctx, -        struct ggml_tensor  * a); - -// mean along rows -struct ggml_tensor * ggml_mean( -        struct ggml_context * ctx, -        struct ggml_tensor  * a); - -// if a is the same shape as b, and a is not parameter, return a -// otherwise, return a new tensor: repeat(a) to fit in b -struct ggml_tensor * ggml_repeat( -        struct ggml_context * ctx, -        struct ggml_tensor  * a, -        struct ggml_tensor  * b); - -struct 
ggml_tensor * ggml_abs( -        struct ggml_context * ctx, -        struct ggml_tensor  * a); - -struct ggml_tensor * ggml_sgn( -        struct ggml_context * ctx, -        struct ggml_tensor  * a); - -struct ggml_tensor * ggml_neg( -        struct ggml_context * ctx, -        struct ggml_tensor  * a); - -struct ggml_tensor * ggml_step( -        struct ggml_context * ctx, -        struct ggml_tensor  * a); - -struct ggml_tensor * ggml_relu( -        struct ggml_context * ctx, -        struct ggml_tensor  * a); - -// TODO: double-check this computation is correct -struct ggml_tensor * ggml_gelu( -        struct ggml_context * ctx, -        struct ggml_tensor  * a); - -struct ggml_tensor * ggml_silu( -        struct ggml_context * ctx, -        struct ggml_tensor  * a); - -// normalize along rows -// TODO: eps is hardcoded to 1e-5 for now -struct ggml_tensor * ggml_norm( -        struct ggml_context * ctx, -        struct ggml_tensor  * a); - -struct ggml_tensor * ggml_rms_norm( -        struct ggml_context * ctx, -        struct ggml_tensor  * a); - -// A: m rows, n columns -// B: p rows, n columns (i.e. we transpose it internally) -// result is m columns, p rows -struct ggml_tensor * ggml_mul_mat( -        struct ggml_context * ctx, -        struct ggml_tensor  * a, -        struct ggml_tensor  * b); - -// -// operations on tensors without backpropagation -// - -// in-place, returns view(a) -struct ggml_tensor * ggml_scale( -        struct ggml_context * ctx, -        struct ggml_tensor  * a, -        struct ggml_tensor  * b); - -// a -> b, return view(b) -struct ggml_tensor * ggml_cpy( -        struct ggml_context * ctx, -        struct ggml_tensor  * a, -        struct ggml_tensor  * b); - -// make contiguous -struct ggml_tensor * ggml_cont( -        struct ggml_context * ctx, -        struct ggml_tensor  * a); - -// return view(a), b specifies the new shape -// TODO: when we start computing gradient, make a copy instead of view -struct ggml_tensor * ggml_reshape( -        struct ggml_context * ctx, -        struct ggml_tensor  * a, -        struct ggml_tensor  * b); - -// return view(a) -// TODO: when we start computing gradient, make a copy instead of view -struct ggml_tensor * ggml_reshape_2d( -        struct ggml_context * ctx, -        struct ggml_tensor  * a, -        int64_t               ne0, -        int64_t               ne1); - -// return view(a) -// TODO: when we start computing gradient, make a copy instead of view -struct ggml_tensor * ggml_reshape_3d( -        struct ggml_context * ctx, -        struct ggml_tensor  * a, -        int64_t               ne0, -        int64_t               ne1, -        int64_t               ne2); - -// offset in bytes -struct ggml_tensor * ggml_view_1d( -        struct ggml_context * ctx, -        struct ggml_tensor  * a, -        int64_t               ne0, -        size_t                offset); - -struct ggml_tensor * ggml_view_2d( -        struct ggml_context * ctx, -        struct ggml_tensor  * a, -        int64_t               ne0, -        int64_t               ne1, -        size_t                nb1, // row stride in bytes -        size_t                offset); - -struct ggml_tensor * ggml_view_3d( -        struct ggml_context * ctx, -        struct ggml_tensor  * a, -        int64_t               ne0, -        int64_t               ne1, -        int64_t               ne2, -        size_t                nb1, // row   stride in bytes -        size_t                nb2, // slice stride in bytes -        size_t                offset); 
- -struct ggml_tensor * ggml_permute( -        struct ggml_context * ctx, -        struct ggml_tensor  * a, -        int                   axis0, -        int                   axis1, -        int                   axis2, -        int                   axis3); - -// alias for ggml_permute(ctx, a, 1, 0, 2, 3) -struct ggml_tensor * ggml_transpose( -        struct ggml_context * ctx, -        struct ggml_tensor  * a); - -struct ggml_tensor * ggml_get_rows( -        struct ggml_context * ctx, -        struct ggml_tensor  * a, -        struct ggml_tensor  * b); - -// set elements above the diagonal to -INF -// in-place, returns view(a) -struct ggml_tensor * ggml_diag_mask_inf( -        struct ggml_context * ctx, -        struct ggml_tensor  * a, -        int                   n_past); - -// in-place, returns view(a) -struct ggml_tensor * ggml_soft_max( -        struct ggml_context * ctx, -        struct ggml_tensor  * a); - -// rotary position embedding -// in-place, returns view(a) -// if mode & 1 == 1, skip n_past elements -// if mode & 2 == 1, GPT-NeoX style -// TODO: avoid creating a new tensor every time -struct ggml_tensor * ggml_rope( -        struct ggml_context * ctx, -        struct ggml_tensor  * a, -        int                   n_past, -        int                   n_dims, -        int                   mode); - -// padding = 1 -// TODO: we don't support extra parameters for now -//       that's why we are hard-coding the stride, padding, and dilation -//       not great .. -struct ggml_tensor * ggml_conv_1d_1s( -        struct ggml_context * ctx, -        struct ggml_tensor  * a, -        struct ggml_tensor  * b); - -struct ggml_tensor * ggml_conv_1d_2s( -        struct ggml_context * ctx, -        struct ggml_tensor  * a, -        struct ggml_tensor  * b); - -struct ggml_tensor * ggml_flash_attn( -        struct ggml_context * ctx, -        struct ggml_tensor  * q, -        struct ggml_tensor  * k, -        struct ggml_tensor  * v, -        bool                  masked); - -struct ggml_tensor * ggml_flash_ff( -        struct ggml_context * ctx, -        struct ggml_tensor  * a, -        struct ggml_tensor  * b0, -        struct ggml_tensor  * b1, -        struct ggml_tensor  * c0, -        struct ggml_tensor  * c1); - -// Mapping operations -typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *); -typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); - -struct ggml_tensor * ggml_map_unary_f32( -        struct ggml_context        * ctx, -        struct ggml_tensor         * a, -        const  ggml_unary_op_f32_t fun); - -struct ggml_tensor * ggml_map_binary_f32( -        struct ggml_context         * ctx, -        struct ggml_tensor          * a, -        struct ggml_tensor          * b, -        const  ggml_binary_op_f32_t fun); - -// -// automatic differentiation -// - -void ggml_set_param( -        struct ggml_context * ctx, -        struct ggml_tensor * tensor); - -void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); - -struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); -struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); - -void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); -void ggml_graph_reset  (struct ggml_cgraph * cgraph); - -// print info and performance information for the graph -void ggml_graph_print(const struct ggml_cgraph * cgraph); - -// dump the graph into a file using the dot 
format -void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); - -// -// optimization -// - -// optimization methods -enum ggml_opt_type { -    GGML_OPT_ADAM, -    GGML_OPT_LBFGS, -}; - -// linesearch methods -enum ggml_linesearch { -    GGML_LINESEARCH_DEFAULT = 1, - -    GGML_LINESEARCH_BACKTRACKING_ARMIJO       = 0, -    GGML_LINESEARCH_BACKTRACKING_WOLFE        = 1, -    GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2, -}; - -// optimization return values -enum ggml_opt_result { -    GGML_OPT_OK = 0, -    GGML_OPT_DID_NOT_CONVERGE, -    GGML_OPT_NO_CONTEXT, -    GGML_OPT_INVALID_WOLFE, -    GGML_OPT_FAIL, +    GGML_API struct ggml_tensor * ggml_new_tensor_3d( +            struct ggml_context * ctx, +            enum   ggml_type type, +            int64_t ne0, +            int64_t ne1, +            int64_t ne2); -    GGML_LINESEARCH_FAIL = -128, -    GGML_LINESEARCH_MINIMUM_STEP, -    GGML_LINESEARCH_MAXIMUM_STEP, -    GGML_LINESEARCH_MAXIMUM_ITERATIONS, -    GGML_LINESEARCH_INVALID_PARAMETERS, -}; +    GGML_API struct ggml_tensor * ggml_new_tensor_4d( +            struct ggml_context * ctx, +            enum   ggml_type type, +            int64_t ne0, +            int64_t ne1, +            int64_t ne2, +            int64_t ne3); -// optimization parameters -// -//   see ggml.c (ggml_opt_default_params) for default values -// -struct ggml_opt_params { -    enum ggml_opt_type type; +    GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); +    GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); + +    GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); +    GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src); + +    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); +    GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); +    GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); + +    GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); +    GGML_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); + +    GGML_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); +    GGML_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); -    int n_threads; +    GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor); +    GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); -    // delta-based convergence test      // -    //   if past == 0 - disabled -    //   if past > 0: -    //     stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|) +    // operations on tensors with backpropagation      // -    int past; -    float delta; -    // maximum number of iterations without improvement +    GGML_API struct ggml_tensor * ggml_dup( +            struct ggml_context * ctx, +            struct ggml_tensor  * a); + +    GGML_API struct ggml_tensor * ggml_add( +            struct ggml_context * ctx, +            struct ggml_tensor  * a, +            struct ggml_tensor  * b); + +    GGML_API struct ggml_tensor * ggml_add_inplace( +            struct ggml_context * ctx, +            struct ggml_tensor  * a, +            struct ggml_tensor  * b); + +    GGML_API struct ggml_tensor * ggml_sub( +            struct ggml_context * ctx, +            struct 
ggml_tensor  * a, +            struct ggml_tensor  * b); + +    GGML_API struct ggml_tensor * ggml_mul( +            struct ggml_context * ctx, +            struct ggml_tensor  * a, +            struct ggml_tensor  * b); + +    GGML_API struct ggml_tensor * ggml_div( +            struct ggml_context * ctx, +            struct ggml_tensor  * a, +            struct ggml_tensor  * b); + +    GGML_API struct ggml_tensor * ggml_sqr( +            struct ggml_context * ctx, +            struct ggml_tensor  * a); + +    GGML_API struct ggml_tensor * ggml_sqrt( +            struct ggml_context * ctx, +            struct ggml_tensor  * a); + +    // return scalar +    // TODO: compute sum along rows +    GGML_API struct ggml_tensor * ggml_sum( +            struct ggml_context * ctx, +            struct ggml_tensor  * a); + +    // mean along rows +    GGML_API struct ggml_tensor * ggml_mean( +            struct ggml_context * ctx, +            struct ggml_tensor  * a); + +    // if a is the same shape as b, and a is not parameter, return a +    // otherwise, return a new tensor: repeat(a) to fit in b +    GGML_API struct ggml_tensor * ggml_repeat( +            struct ggml_context * ctx, +            struct ggml_tensor  * a, +            struct ggml_tensor  * b); + +    GGML_API struct ggml_tensor * ggml_abs( +            struct ggml_context * ctx, +            struct ggml_tensor  * a); + +    GGML_API struct ggml_tensor * ggml_sgn( +            struct ggml_context * ctx, +            struct ggml_tensor  * a); + +    GGML_API struct ggml_tensor * ggml_neg( +            struct ggml_context * ctx, +            struct ggml_tensor  * a); + +    GGML_API struct ggml_tensor * ggml_step( +            struct ggml_context * ctx, +            struct ggml_tensor  * a); + +    GGML_API struct ggml_tensor * ggml_relu( +            struct ggml_context * ctx, +            struct ggml_tensor  * a); + +    // TODO: double-check this computation is correct +    GGML_API struct ggml_tensor * ggml_gelu( +            struct ggml_context * ctx, +            struct ggml_tensor  * a); + +    GGML_API struct ggml_tensor * ggml_silu( +            struct ggml_context * ctx, +            struct ggml_tensor  * a); + +    // normalize along rows +    // TODO: eps is hardcoded to 1e-5 for now +    GGML_API struct ggml_tensor * ggml_norm( +            struct ggml_context * ctx, +            struct ggml_tensor  * a); + +    GGML_API struct ggml_tensor * ggml_rms_norm( +            struct ggml_context * ctx, +            struct ggml_tensor  * a); + +    // A: m rows, n columns +    // B: p rows, n columns (i.e. 
we transpose it internally) +    // result is m columns, p rows +    GGML_API struct ggml_tensor * ggml_mul_mat( +            struct ggml_context * ctx, +            struct ggml_tensor  * a, +            struct ggml_tensor  * b); +      // -    //   if 0 - disabled -    //   if > 0: -    //     assume convergence if no cost improvement in this number of iterations +    // operations on tensors without backpropagation      // -    int max_no_improvement; -    bool print_forward_graph; -    bool print_backward_graph; +    // in-place, returns view(a) +    GGML_API struct ggml_tensor * ggml_scale( +            struct ggml_context * ctx, +            struct ggml_tensor  * a, +            struct ggml_tensor  * b); + +    // a -> b, return view(b) +    GGML_API struct ggml_tensor * ggml_cpy( +            struct ggml_context * ctx, +            struct ggml_tensor  * a, +            struct ggml_tensor  * b); + +    // make contiguous +    GGML_API struct ggml_tensor * ggml_cont( +            struct ggml_context * ctx, +            struct ggml_tensor  * a); + +    // return view(a), b specifies the new shape +    // TODO: when we start computing gradient, make a copy instead of view +    GGML_API struct ggml_tensor * ggml_reshape( +            struct ggml_context * ctx, +            struct ggml_tensor  * a, +            struct ggml_tensor  * b); + +    // return view(a) +    // TODO: when we start computing gradient, make a copy instead of view +    GGML_API struct ggml_tensor * ggml_reshape_2d( +            struct ggml_context * ctx, +            struct ggml_tensor  * a, +            int64_t               ne0, +            int64_t               ne1); + +    // return view(a) +    // TODO: when we start computing gradient, make a copy instead of view +    GGML_API struct ggml_tensor * ggml_reshape_3d( +            struct ggml_context * ctx, +            struct ggml_tensor  * a, +            int64_t               ne0, +            int64_t               ne1, +            int64_t               ne2); + +    // offset in bytes +    GGML_API struct ggml_tensor * ggml_view_1d( +            struct ggml_context * ctx, +            struct ggml_tensor  * a, +            int64_t               ne0, +            size_t                offset); + +    GGML_API struct ggml_tensor * ggml_view_2d( +            struct ggml_context * ctx, +            struct ggml_tensor  * a, +            int64_t               ne0, +            int64_t               ne1, +            size_t                nb1, // row stride in bytes +            size_t                offset); + +    GGML_API struct ggml_tensor * ggml_view_3d( +            struct ggml_context * ctx, +            struct ggml_tensor  * a, +            int64_t               ne0, +            int64_t               ne1, +            int64_t               ne2, +            size_t                nb1, // row   stride in bytes +            size_t                nb2, // slice stride in bytes +            size_t                offset); + +    GGML_API struct ggml_tensor * ggml_permute( +            struct ggml_context * ctx, +            struct ggml_tensor  * a, +            int                   axis0, +            int                   axis1, +            int                   axis2, +            int                   axis3); + +    // alias for ggml_permute(ctx, a, 1, 0, 2, 3) +    GGML_API struct ggml_tensor * ggml_transpose( +            struct ggml_context * ctx, +            struct ggml_tensor  * a); + +    GGML_API struct ggml_tensor * ggml_get_rows( +            struct 
ggml_context * ctx, +            struct ggml_tensor  * a, +            struct ggml_tensor  * b); + +    // set elements above the diagonal to -INF +    // in-place, returns view(a) +    GGML_API struct ggml_tensor * ggml_diag_mask_inf( +            struct ggml_context * ctx, +            struct ggml_tensor  * a, +            int                   n_past); + +    // in-place, returns view(a) +    GGML_API struct ggml_tensor * ggml_soft_max( +            struct ggml_context * ctx, +            struct ggml_tensor  * a); + +    // rotary position embedding +    // in-place, returns view(a) +    // if mode & 1 == 1, skip n_past elements +    // if mode & 2 == 1, GPT-NeoX style +    // TODO: avoid creating a new tensor every time +    GGML_API struct ggml_tensor * ggml_rope( +            struct ggml_context * ctx, +            struct ggml_tensor  * a, +            int                   n_past, +            int                   n_dims, +            int                   mode); + +    // padding = 1 +    // TODO: we don't support extra parameters for now +    //       that's why we are hard-coding the stride, padding, and dilation +    //       not great .. +    GGML_API struct ggml_tensor * ggml_conv_1d_1s( +            struct ggml_context * ctx, +            struct ggml_tensor  * a, +            struct ggml_tensor  * b); + +    GGML_API struct ggml_tensor * ggml_conv_1d_2s( +            struct ggml_context * ctx, +            struct ggml_tensor  * a, +            struct ggml_tensor  * b); + +    GGML_API struct ggml_tensor * ggml_flash_attn( +            struct ggml_context * ctx, +            struct ggml_tensor  * q, +            struct ggml_tensor  * k, +            struct ggml_tensor  * v, +            bool                  masked); + +    GGML_API struct ggml_tensor * ggml_flash_ff( +            struct ggml_context * ctx, +            struct ggml_tensor  * a, +            struct ggml_tensor  * b0, +            struct ggml_tensor  * b1, +            struct ggml_tensor  * c0, +            struct ggml_tensor  * c1); + +    // Mapping operations +    GGML_API typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *); +    GGML_API typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); + +    GGML_API struct ggml_tensor * ggml_map_unary_f32( +            struct ggml_context        * ctx, +            struct ggml_tensor         * a, +            const  ggml_unary_op_f32_t fun); + +    GGML_API struct ggml_tensor * ggml_map_binary_f32( +            struct ggml_context         * ctx, +            struct ggml_tensor          * a, +            struct ggml_tensor          * b, +            const  ggml_binary_op_f32_t fun); -    // ADAM parameters -    struct { -        int n_iter; +    // +    // automatic differentiation +    // -        float alpha; // learning rate -        float beta1; -        float beta2; -        float eps;   // epsilon for numerical stability -        float eps_f; // epsilon for convergence test -        float eps_g; // epsilon for convergence test -    } adam; +    GGML_API void ggml_set_param( +            struct ggml_context * ctx, +            struct ggml_tensor * tensor); -    // LBFGS parameters -    struct { -        int m; // number of corrections to approximate the inv. 
Hessian -        int n_iter; -        int max_linesearch; +    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); -        float eps;      // convergence tolerance -        float ftol;     // line search tolerance -        float wolfe; -        float min_step; -        float max_step; +    GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); +    GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); -        enum ggml_linesearch linesearch; -    } lbfgs; -}; +    GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); +    GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph); -struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); +    // print info and performance information for the graph +    GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph); -// optimize the function defined by the tensor f -enum ggml_opt_result ggml_opt( -        struct ggml_context * ctx, -        struct ggml_opt_params params, -        struct ggml_tensor * f); +    // dump the graph into a file using the dot format +    GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); -// -// quantization -// +    // +    // optimization +    // -size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist); +    // optimization methods +    enum ggml_opt_type { +        GGML_OPT_ADAM, +        GGML_OPT_LBFGS, +    }; + +    // linesearch methods +    enum ggml_linesearch { +        GGML_LINESEARCH_DEFAULT = 1, + +        GGML_LINESEARCH_BACKTRACKING_ARMIJO       = 0, +        GGML_LINESEARCH_BACKTRACKING_WOLFE        = 1, +        GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2, +    }; + +    // optimization return values +    enum ggml_opt_result { +        GGML_OPT_OK = 0, +        GGML_OPT_DID_NOT_CONVERGE, +        GGML_OPT_NO_CONTEXT, +        GGML_OPT_INVALID_WOLFE, +        GGML_OPT_FAIL, + +        GGML_LINESEARCH_FAIL = -128, +        GGML_LINESEARCH_MINIMUM_STEP, +        GGML_LINESEARCH_MAXIMUM_STEP, +        GGML_LINESEARCH_MAXIMUM_ITERATIONS, +        GGML_LINESEARCH_INVALID_PARAMETERS, +    }; + +    // optimization parameters +    // +    //   see ggml.c (ggml_opt_default_params) for default values +    // +    struct ggml_opt_params { +        enum ggml_opt_type type; + +        int n_threads; + +        // delta-based convergence test +        // +        //   if past == 0 - disabled +        //   if past > 0: +        //     stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|) +        // +        int past; +        float delta; + +        // maximum number of iterations without improvement +        // +        //   if 0 - disabled +        //   if > 0: +        //     assume convergence if no cost improvement in this number of iterations +        // +        int max_no_improvement; + +        bool print_forward_graph; +        bool print_backward_graph; + +        // ADAM parameters +        struct { +            int n_iter; + +            float alpha; // learning rate +            float beta1; +            float beta2; +            float eps;   // 
epsilon for numerical stability +            float eps_f; // epsilon for convergence test +            float eps_g; // epsilon for convergence test +        } adam; + +        // LBFGS parameters +        struct { +            int m; // number of corrections to approximate the inv. Hessian +            int n_iter; +            int max_linesearch; + +            float eps;      // convergence tolerance +            float ftol;     // line search tolerance +            float wolfe; +            float min_step; +            float max_step; + +            enum ggml_linesearch linesearch; +        } lbfgs; +    }; + +    GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); + +    // optimize the function defined by the tensor f +    GGML_API enum ggml_opt_result ggml_opt( +            struct ggml_context * ctx, +            struct ggml_opt_params params, +            struct ggml_tensor * f); -size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); +    // +    // quantization +    // -// -// system info -// +    GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); +    GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); +    GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist); +    GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist); -int ggml_cpu_has_avx(void); -int ggml_cpu_has_avx2(void); -int ggml_cpu_has_avx512(void); -int ggml_cpu_has_avx512_vbmi(void); -int ggml_cpu_has_avx512_vnni(void); -int ggml_cpu_has_fma(void); -int ggml_cpu_has_neon(void); -int ggml_cpu_has_arm_fma(void); -int ggml_cpu_has_f16c(void); -int ggml_cpu_has_fp16_va(void); -int ggml_cpu_has_wasm_simd(void); -int ggml_cpu_has_blas(void); -int ggml_cpu_has_cublas(void); -int ggml_cpu_has_sse3(void); -int ggml_cpu_has_vsx(void); +    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); +    // +    // system info +    // -// -// Internal types and functions exposed for tests and benchmarks -// +    GGML_API int ggml_cpu_has_avx        (void); +    GGML_API int ggml_cpu_has_avx2       (void); +    GGML_API int ggml_cpu_has_avx512     (void); +    GGML_API int ggml_cpu_has_avx512_vbmi(void); +    GGML_API int ggml_cpu_has_avx512_vnni(void); +    GGML_API int ggml_cpu_has_fma        (void); +    GGML_API int ggml_cpu_has_neon       (void); +    GGML_API int ggml_cpu_has_arm_fma    (void); +    GGML_API int ggml_cpu_has_f16c       (void); +    GGML_API int ggml_cpu_has_fp16_va    (void); +    GGML_API int ggml_cpu_has_wasm_simd  (void); +    GGML_API int ggml_cpu_has_blas       (void); +    GGML_API int ggml_cpu_has_cublas     (void); +    GGML_API int ggml_cpu_has_sse3       (void); +    GGML_API int ggml_cpu_has_vsx        (void); + + +    // +    // Internal types and functions exposed for tests and benchmarks +    //  #ifdef  __cplusplus -// restrict not standard in C++ +    // restrict not standard in C++  #define GGML_RESTRICT  #else  #define GGML_RESTRICT restrict  #endif -typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); -typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); -typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); - 
-typedef struct { -    dequantize_row_q_t dequantize_row_q; -    quantize_row_q_t   quantize_row_q; -    quantize_row_q_t   quantize_row_q_reference; -    quantize_row_q_t   quantize_row_q_dot; -    vec_dot_q_t        vec_dot_q; -} quantize_fns_t; - -quantize_fns_t ggml_internal_get_quantize_fn(size_t i); +    typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +    typedef void (*quantize_row_q_t)  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +    typedef void (*vec_dot_q_t)       (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); + +    typedef struct { +        dequantize_row_q_t dequantize_row_q; +        quantize_row_q_t   quantize_row_q; +        quantize_row_q_t   quantize_row_q_reference; +        quantize_row_q_t   quantize_row_q_dot; +        vec_dot_q_t        vec_dot_q; +    } quantize_fns_t; + +    quantize_fns_t ggml_internal_get_quantize_fn(size_t i);  #ifdef  __cplusplus  }
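Taken together, the declarations above are enough for a small end-to-end program. A minimal sketch against this header, assuming an arbitrary 16 MiB memory pool and toy tensor shapes (neither value comes from this change):

#include <stdio.h>
#include "ggml.h"

int main(void) {
    // 16 MiB arena is an arbitrary choice for this sketch
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,   // let ggml allocate the pool internally
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // per the ggml_mul_mat comment: A has m=4 rows, n=2 cols; B has p=3 rows, n=2 cols;
    // ne0 is the row length, so dimensions are passed as (cols, rows)
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 4);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
    ggml_set_f32(a, 1.5f);
    ggml_set_f32(b, 2.0f);

    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);

    struct ggml_cgraph gf = ggml_build_forward(c);
    gf.n_threads = 4;
    ggml_graph_compute(ctx, &gf);

    printf("c[0] = %f (%zu bytes used)\n",
           ggml_get_f32_1d(c, 0), ggml_used_mem(ctx));

    ggml_free(ctx);
    return 0;
}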

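The new GGML_OP_MAP_UNARY / GGML_OP_MAP_BINARY ops and the ggml_map_unary_f32 / ggml_map_binary_f32 wrappers accept a plain C callback matching ggml_unary_op_f32_t / ggml_binary_op_f32_t. A sketch using a hypothetical half_relu_f32 callback (the name and the op itself are made up for illustration):

#include <stdio.h>
#include "ggml.h"

// signature matches ggml_unary_op_f32_t: fun(n, dst, src)
static void half_relu_f32(const int n, float * dst, const float * src) {
    for (int i = 0; i < n; ++i) {
        dst[i] = src[i] > 0.0f ? 0.5f*src[i] : 0.0f;  // hypothetical element-wise op
    }
}

int main(void) {
    struct ggml_init_params params = { 4*1024*1024, NULL, false };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    ggml_set_f32(x, -1.0f);
    ggml_set_f32_1d(x, 3, 4.0f);   // one positive element

    struct ggml_tensor * y = ggml_map_unary_f32(ctx, x, half_relu_f32);

    struct ggml_cgraph gf = ggml_build_forward(y);
    gf.n_threads = 1;
    ggml_graph_compute(ctx, &gf);

    printf("y[3] = %f\n", ggml_get_f32_1d(y, 3));  // expect 2.0
    ggml_free(ctx);
    return 0;
}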