diff options
| author | 0cc4m <picard12@live.de> | 2023-04-28 16:57:16 +0200 | 
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-04-28 17:57:16 +0300 | 
| commit | 7296c961d9303010a2b98379f738da2a8a55aa1b (patch) | |
| tree | 398b36fb53bfab4411572cb69f861bbdbdbc2672 /ggml-opencl.c | |
| parent | 78ec543733d10a1629f984fd0302fdaa4e87fe66 (diff) | |
ggml : add CLBlast support (#1164)
* Allow use of OpenCL GPU-based BLAS using ClBlast instead of OpenBLAS for context processing
* Improve ClBlast implementation, avoid recreating buffers, remove redundant transfers
* Finish merge of ClBlast support
* Move CLBlast implementation to separate file
Add buffer reuse code (adapted from slaren's cuda implementation)
* Add q4_2 and q4_3 CLBlast support, improve code
* Double CLBlast speed by disabling OpenBLAS thread workaround
Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
Co-authored-by: slaren <2141330+slaren@users.noreply.github.com>
* Fix device selection env variable names
* Fix cast in opencl kernels
* Add CLBlast to CMakeLists.txt
* Replace buffer pool with static buffers a, b, qb, c
Fix compile warnings
* Fix typos, use GGML_TYPE defines, improve code
* Improve btype dequant kernel selection code, add error if type is unsupported
* Improve code quality
* Move internal stuff out of header
* Use internal enums instead of CLBlast enums
* Remove leftover C++ includes and defines
* Make event use easier to read
Co-authored-by: Henri Vasserman <henv@hot.ee>
* Use c compiler for opencl files
* Simplify code, fix include
* First check error, then release event
* Make globals static, fix indentation
* Rename dequant kernels file to conform with other file names
* Fix import cl file name
---------
Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
Co-authored-by: slaren <2141330+slaren@users.noreply.github.com>
Co-authored-by: Henri Vasserman <henv@hot.ee>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diffstat (limited to 'ggml-opencl.c')
| -rw-r--r-- | ggml-opencl.c | 216 | 
1 files changed, 216 insertions, 0 deletions
diff --git a/ggml-opencl.c b/ggml-opencl.c new file mode 100644 index 0000000..1d68f19 --- /dev/null +++ b/ggml-opencl.c @@ -0,0 +1,216 @@ +#include "ggml-opencl.h" + +#define CL_TARGET_OPENCL_VERSION 110 +#include <clblast_c.h> + +#include <stdio.h> +#include <string.h> + +#include "ggml.h" + +#include "ggml-opencl-dequant.cl" + +#define CL_CHECK(err, name)                                                                     \ +    do {                                                                                        \ +        cl_int err_ = (err);                                                                    \ +        if (err_ != CL_SUCCESS) {                                                               \ +            fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__);   \ +            exit(1);                                                                            \ +        }                                                                                       \ +    } while (0) + +static cl_platform_id platform; +static cl_device_id device; +static cl_context context; +static cl_command_queue queue; +static cl_program program; +static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q4_3; +static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c; +static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0; + +static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) { +    cl_program p; +    char *program_log; +    size_t program_size, log_size; +    int err; + +    program_size = strlen(program_buffer); + +    p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err); +    if(err < 0) { +        fprintf(stderr, "OpenCL error creating program"); +        exit(1); +    } + +    err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL); +    if(err < 0) { + +        clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); +        program_log = (char*) malloc(log_size + 1); +        program_log[log_size] = '\0'; +        clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL); +        printf("%s\n", program_log); +        free(program_log); +        exit(1); +    } + +    return p; +} + +void ggml_cl_init(void) { +    cl_int err = 0; +    char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM"); +    char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE"); +    int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM)); +    int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE)); +    printf("\nInitializing CLBlast (First Run)..."); +    printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num); +    cl_uint num_platforms; +    clGetPlatformIDs(0, NULL, &num_platforms); +    cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); +    clGetPlatformIDs(num_platforms, platforms, NULL); +    platform = platforms[plat_num]; +    char platform_buffer[1024]; +    clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL); +    cl_uint num_devices; +    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); +    cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); +    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); +    device = devices[dev_num]; +    char device_buffer[1024]; +    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL); +    printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer); +    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); +    CL_CHECK(err, "clCreateContext"); +    queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err); +    CL_CHECK(err, "clCreateCommandQueue"); + +    free(platforms); +    free(devices); + +    program = build_program_from_source(context, device, clblast_dequant); + +    // Prepare dequantize kernels +    kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err); +    CL_CHECK(err, "clCreateKernel"); +    kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err); +    CL_CHECK(err, "clCreateKernel"); +    kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err); +    CL_CHECK(err, "clCreateKernel"); +    kernel_q4_3 = clCreateKernel(program, "dequantize_row_q4_3", &err); +    CL_CHECK(err, "clCreateKernel"); +} + +static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) { +    if (req_size <= *cur_size) { +        return; +    } + +    // Reallocate buffer with enough space +    if (*cur_size > 0) { +        clReleaseMemObject(*buf); +    } +    cl_int err; +    *buf = clCreateBuffer(context, flags, req_size, NULL, &err); +    *cur_size = req_size; +    CL_CHECK(err, "clCreateBuffer"); +} + +void ggml_cl_sgemm_wrapper( +        const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, +        const int m, const int n, const int k, +        const float alpha, const void *host_a, const int lda, +        const float *host_b, const int ldb, const float beta, +        float *host_c, const int ldc, const int btype) { +    cl_int err = 0; + +    cl_kernel kernel; +    size_t global = n * k, local, size_qb; +    bool dequant; + +    switch (btype) { +    case GGML_TYPE_F32: +        dequant = false; +        break; +    case GGML_TYPE_Q4_0: +        dequant = true; +        kernel = kernel_q4_0; +        local = 16; +        size_qb = global * (sizeof(float) + local) / 32; +        break; +    case GGML_TYPE_Q4_1: +        dequant = true; +        kernel = kernel_q4_1; +        local = 16; +        size_qb = global * (sizeof(float) * 2 + local) / 32; +        break; +    case GGML_TYPE_Q4_2: +        dequant = true; +        kernel = kernel_q4_2; +        local = 8; +        size_qb = global * (sizeof(short) + local) / 16; +        break; +    case GGML_TYPE_Q4_3: +        dequant = true; +        kernel = kernel_q4_3; +        local = 8; +        size_qb = global * (sizeof(short) * 2 + local) / 16; +        break; +    default: +        fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype); +        abort(); +    } + +    const size_t size_a =  m * k * sizeof(float); +    const size_t size_b =  n * k * sizeof(float); +    const size_t size_c =  m * n * sizeof(float); + +    // Prepare buffers +    ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a); +    if (dequant) { +        ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb); +    } +    ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b); +    ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c); + +    cl_event ev_a, ev_qb, ev_b; + +    if (dequant) { +        err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb); +        err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b); +        CL_CHECK(err, "clSetKernelArg"); +        clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb); +    } else { +        clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b); +    } + +    clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a); +    if (dequant) { +        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b); +        CL_CHECK(err, "clEnqueueNDRangeKernel"); +        clReleaseEvent(ev_qb); +    } +    clWaitForEvents(1, &ev_a); +    clWaitForEvents(1, &ev_b); +    clReleaseEvent(ev_a); +    clReleaseEvent(ev_b); + +    cl_event ev_sgemm; +    CLBlastSgemm((CLBlastLayout)order, +                 (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b, +                 m, n, k, +                 alpha, +                 cl_buffer_a, 0, lda, +                 cl_buffer_b, 0, ldb, +                 beta, +                 cl_buffer_c, 0, ldc, +                 &queue, &ev_sgemm); + +    cl_event ev_c; +    clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c); + +    // Wait for completion +    clWaitForEvents(1, &ev_c); +    clReleaseEvent(ev_sgemm); +    clReleaseEvent(ev_c); +}  | 
