Diffstat (limited to 'ggml-cuda.cu')
-rw-r--r-- | ggml-cuda.cu | 1291
1 file changed, 875 insertions(+), 416 deletions(-)
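The diff below reworks the backend around a single ggml_cuda_op_t callback signature plus a generic ggml_cuda_op driver: the driver picks devices, splits src0 rows according to g_tensor_split, stages the per-device buffers, and copies results back, while each op only computes on the row slice it is handed. A minimal host-side sketch of that dispatch shape, with hypothetical names (my_op_t, run_op, op_scale) standing in for the real ggml symbols:

// Sketch only (assumed names, not the real ggml API): each op works on the row
// slice [row_low, row_high) that the driver has already resolved for the current
// device; the driver owns the split bookkeeping, mirroring ggml_cuda_op below.
#include <cstdint>
#include <cstdio>
#include <vector>

typedef void (*my_op_t)(const float * src, float * dst,
                        int64_t ncols, int64_t row_low, int64_t row_high);

static void op_scale(const float * src, float * dst,
                     int64_t ncols, int64_t row_low, int64_t row_high) {
    for (int64_t r = row_low; r < row_high; ++r) {
        for (int64_t c = 0; c < ncols; ++c) {
            dst[r*ncols + c] = 2.0f*src[r*ncols + c]; // stand-in for a kernel launch
        }
    }
}

// split[id] holds the cumulative start fraction for device id, like g_tensor_split.
static void run_op(my_op_t op, const float * src, float * dst,
                   int64_t nrows, int64_t ncols, const std::vector<float> & split) {
    for (size_t id = 0; id < split.size(); ++id) {
        const int64_t row_low  = (int64_t)(nrows*split[id]);
        const int64_t row_high = id + 1 < split.size() ? (int64_t)(nrows*split[id + 1]) : nrows;
        if (row_high > row_low) {
            op(src, dst, ncols, row_low, row_high); // per-device slice
        }
    }
}

int main() {
    const int64_t nrows = 4, ncols = 3;
    std::vector<float> x(nrows*ncols, 1.5f), y(nrows*ncols, 0.0f);
    run_op(op_scale, x.data(), y.data(), nrows, ncols, {0.0f, 0.5f}); // two "devices"
    printf("y[0] = %.1f, y[last] = %.1f\n", y.front(), y.back()); // both 3.0
    return 0;
}

The real driver additionally rounds the split boundaries down to multiples of GGML_CUDA_DMMV_Y and converts quantized src0 slices to f32 when the op requires it, but the row-slice contract is the same.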
diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 5385e01..c700890 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -24,19 +24,35 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); } \ } while (0) +#if CUDART_VERSION >= 12 #define CUBLAS_CHECK(err) \ do { \ cublasStatus_t err_ = (err); \ if (err_ != CUBLAS_STATUS_SUCCESS) { \ - fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \ + fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \ + err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \ exit(1); \ } \ } while (0) +#else +#define CUBLAS_CHECK(err) \ + do { \ + cublasStatus_t err_ = (err); \ + if (err_ != CUBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \ + exit(1); \ + } \ + } while (0) +#endif // CUDART_VERSION >= 11 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1); typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream); -typedef void (*dequantize_mul_mat_vec_cuda_t)(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream); typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v); +typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +typedef void (*ggml_cuda_op_t)( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i, + float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main); // QK = number of values after dequantization // QR = QK / number of values before dequantization @@ -132,8 +148,10 @@ static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ #define WARP_SIZE 32 +#define CUDA_ADD_BLOCK_SIZE 256 #define CUDA_MUL_BLOCK_SIZE 256 - +#define CUDA_SILU_BLOCK_SIZE 256 +#define CUDA_ROPE_BLOCK_SIZE 256 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 // dmmv = dequantize_mul_mat_vec @@ -144,6 +162,15 @@ static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ #define GGML_CUDA_DMMV_Y 1 #endif +static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + dst[i] = x[i] + y[i]; +} + static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { const int i = blockDim.x*blockIdx.x + threadIdx.x; @@ -153,6 +180,45 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co dst[i] = x[i] * y[i%ky]; } +static __global__ void silu_f32(const float * x, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + dst[i] = x[i] / (1.0f + expf(-x[i])); +} + +static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) { + const int row = blockIdx.x*blockDim.y + threadIdx.y; + const int tid = threadIdx.x; + + const float eps = 1e-6; + + float tmp = 0.0f; // partial sum for thread in warp + + for (int i = 0; i < ncols; i += WARP_SIZE) { + const int col = i + tid; + const float xi = x[row*ncols + col]; + tmp += xi * xi; + } + + // sum up partial sums + __syncthreads(); +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + } + + const float mean = tmp / ncols; + const float scale = 1.0f / 
sqrtf(mean + eps); + + for (int i = 0; i < ncols; i += WARP_SIZE) { + const int col = i + tid; + dst[row*ncols + col] = scale * x[row*ncols + col]; + } +} + static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){ const block_q4_0 * x = (const block_q4_0 *) vx; @@ -565,8 +631,8 @@ static __device__ void vec_dot_q6_k(const void * vx, const int ib, const int iqs static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){ const half * x = (const half *) vx; - v0 = __half2float(x[ib + 0]); - v1 = __half2float(x[ib + 1]); + v0 = __half2float(x[ib + iqs + 0]); + v1 = __half2float(x[ib + iqs + 1]); } template <int qk, int qr, dequantize_kernel_t dequantize_kernel> @@ -599,7 +665,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter const int y_offset = qr == 1 ? 1 : qk/2; - float tmp = 0; // partial sum for thread in warp + float tmp = 0.0f; // partial sum for thread in warp for (int i = 0; i < ncols; i += iter_stride) { const int col = i + vals_per_iter*tid; @@ -671,11 +737,48 @@ static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y } } +static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p, const float theta_scale) { + const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x); + + if (col >= ncols) { + return; + } + + const int row = blockDim.y*blockIdx.y + threadIdx.y; + const int i = row*ncols + col; + + const float theta = p*powf(theta_scale, col/2); + const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + 1]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + 1] = x0*sin_theta + x1*cos_theta; +} + +static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; + add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k); +} + static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) { const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE; mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky); } +static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE; + silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k); +} + +static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % WARP_SIZE == 0); + const dim3 block_dims(WARP_SIZE, 1, 1); + rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols); +} + static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k); @@ -799,7 +902,7 @@ static void dequantize_mul_mat_vec_q6_k_cuda(const void * vx, const float * y, f static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; - 
dequantize_block<32, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k); + dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k); } static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { @@ -839,33 +942,12 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { } } -static dequantize_mul_mat_vec_cuda_t ggml_get_dequantize_mul_mat_vec_cuda(ggml_type type) { - switch (type) { - case GGML_TYPE_Q4_0: - return dequantize_mul_mat_vec_q4_0_cuda; - case GGML_TYPE_Q4_1: - return dequantize_mul_mat_vec_q4_1_cuda; - case GGML_TYPE_Q5_0: - return dequantize_mul_mat_vec_q5_0_cuda; - case GGML_TYPE_Q5_1: - return dequantize_mul_mat_vec_q5_1_cuda; - case GGML_TYPE_Q8_0: - return dequantize_mul_mat_vec_q8_0_cuda; - case GGML_TYPE_Q2_K: - return dequantize_mul_mat_vec_q2_k_cuda; - case GGML_TYPE_Q3_K: - return dequantize_mul_mat_vec_q3_k_cuda; - case GGML_TYPE_Q4_K: - return dequantize_mul_mat_vec_q4_k_cuda; - case GGML_TYPE_Q5_K: - return dequantize_mul_mat_vec_q5_k_cuda; - case GGML_TYPE_Q6_K: - return dequantize_mul_mat_vec_q6_k_cuda; - case GGML_TYPE_F16: - return convert_mul_mat_vec_f16_cuda; - default: - return nullptr; - } +static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float theta_scale, cudaStream_t stream) { + GGML_ASSERT(nrows % 2 == 0); + const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1); + const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const dim3 block_nums(num_blocks_x, nrows, 1); + rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale); } // buffer pool for cuda @@ -890,14 +972,16 @@ struct cuda_buffer { size_t size = 0; }; -static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS]; +static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS]; static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT; static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { scoped_spin_lock lock(g_cuda_pool_lock); + int id; + CUDA_CHECK(cudaGetDevice(&id)); for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { - cuda_buffer& b = g_cuda_buffer_pool[i]; + cuda_buffer& b = g_cuda_buffer_pool[id][i]; if (b.size >= size && b.ptr != nullptr) { void * ptr = b.ptr; *actual_size = b.size; @@ -914,9 +998,11 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { static void ggml_cuda_pool_free(void * ptr, size_t size) { scoped_spin_lock lock(g_cuda_pool_lock); + int id; + CUDA_CHECK(cudaGetDevice(&id)); for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { - cuda_buffer& b = g_cuda_buffer_pool[i]; + cuda_buffer& b = g_cuda_buffer_pool[id][i]; if (b.ptr == nullptr) { b.ptr = ptr; b.size = size; @@ -927,31 +1013,87 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) { CUDA_CHECK(cudaFree(ptr)); } + +static void * g_scratch_buffer = nullptr; +static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default +static size_t g_scratch_offset = 0; + #define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication. 
#define GGML_CUDA_MAX_EVENTS 64 -static cublasHandle_t g_cublasH = nullptr; -static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_STREAMS] = { nullptr }; -static cudaStream_t g_cudaStreams2[GGML_CUDA_MAX_STREAMS] = { nullptr }; -static cudaEvent_t g_cudaEvents[GGML_CUDA_MAX_EVENTS] = { nullptr }; + +static int g_device_count = -1; +static int g_main_device = 0; +static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; + +static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; + +static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr }; + +static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr }; +static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr }; void ggml_init_cublas() { - if (g_cublasH == nullptr) { - // create streams - for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) { - CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[i], cudaStreamNonBlocking)); - CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams2[i], cudaStreamNonBlocking)); + static bool initialized = false; + + if (!initialized) { + CUDA_CHECK(cudaGetDeviceCount(&g_device_count)); + GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES); + int64_t total_vram = 0; + fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count); + for (int id = 0; id < g_device_count; ++id) { + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); + fprintf(stderr, " Device %d: %s\n", id, prop.name); + g_tensor_split[id] = total_vram; + total_vram += prop.totalGlobalMem; } - // create events - for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) { - CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents[i], cudaEventDisableTiming)); + for (int id = 0; id < g_device_count; ++id) { + g_tensor_split[id] /= total_vram; } - // create cublas handle - CUBLAS_CHECK(cublasCreate(&g_cublasH)); - CUBLAS_CHECK(cublasSetMathMode(g_cublasH, CUBLAS_TF32_TENSOR_OP_MATH)); + for (int id = 0; id < g_device_count; ++id) { + CUDA_CHECK(cudaSetDevice(id)); + + // create streams + for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) { + CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking)); + CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking)); + } + // create events + for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) { + CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming)); + } + + // create cublas handle + CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id])); + CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH)); + } // configure logging to stdout // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr)); + + initialized = true; + } +} + +void ggml_cuda_set_tensor_split(const float * tensor_split) { + bool all_zero = true; + for (int i = 0; i < g_device_count; ++i) { + if (tensor_split[i] != 0.0f) { + all_zero = false; + break; + } + } + if (all_zero) { + return; + } + float split_sum = 0.0f; + for (int i = 0; i < g_device_count; ++i) { + g_tensor_split[i] = split_sum; + split_sum += tensor_split[i]; + } + for (int i = 0; i < g_device_count; ++i) { + g_tensor_split[i] /= split_sum; } } @@ -975,26 +1117,29 @@ void ggml_cuda_host_free(void * ptr) { CUDA_CHECK(cudaFreeHost(ptr)); } -static cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) { - const uint64_t ne0 = src->ne[0]; - const 
uint64_t ne1 = src->ne[1]; - const uint64_t nb0 = src->nb[0]; - const uint64_t nb1 = src->nb[1]; - const uint64_t nb2 = src->nb[2]; - const uint64_t nb3 = src->nb[3]; +static cudaError_t ggml_cuda_h2d_tensor_2d( + void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) { + + char * dst_char = (char *) dst; + const int64_t ne0 = src->ne[0]; + const int64_t nb0 = src->nb[0]; + const int64_t nb1 = src->nb[1]; + const int64_t nb2 = src->nb[2]; + const int64_t nb3 = src->nb[3]; const enum ggml_type type = src->type; - const size_t ts = ggml_type_size(type); - const size_t bs = ggml_blck_size(type); + const int64_t ts = ggml_type_size(type); + const int64_t bs = ggml_blck_size(type); + int64_t i1_diff = i1_high - i1_low; - const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3); + const void * x = (const void *) ((const char *) src->data + i1_low*nb1 + i2*nb2 + i3*nb3); if (nb0 == ts && nb1 == ts*ne0/bs) { - return cudaMemcpyAsync(dst, x, ne1*nb1, cudaMemcpyHostToDevice, stream); + return cudaMemcpyAsync(dst_char, x, i1_diff*nb1, cudaMemcpyHostToDevice, stream); } else if (nb0 == ts) { - return cudaMemcpy2DAsync(dst, ts*ne0/bs, x, nb1, ts*ne0/bs, ne1, cudaMemcpyHostToDevice, stream); + return cudaMemcpy2DAsync(dst_char, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, cudaMemcpyHostToDevice, stream); } else { - for (uint64_t i1 = 0; i1 < ne1; i1++) { + for (int64_t i1 = 0; i1 < i1_diff; i1++) { const void * rx = (const void *) ((const char *) x + i1*nb1); - void * rd = (void *) ((char *) dst + i1*ts*ne0/bs); + void * rd = (void *) (dst_char + i1*ts*ne0/bs); // pretend the row is a matrix with cols=1 cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyHostToDevice, stream); if (r != cudaSuccess) return r; @@ -1003,446 +1148,760 @@ static cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor } } -static void ggml_cuda_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src1->backend == GGML_BACKEND_CUDA); +inline void ggml_cuda_op_add( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(src1_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const int64_t ne0 = src0->ne[0]; + const int64_t i01_diff = i01_high - i01_low; + + // compute + add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main); + CUDA_CHECK(cudaGetLastError()); + + (void) src1; + (void) dst; + (void) src0_ddq_i; + (void) i02; + (void) i1; +} + +inline void ggml_cuda_op_mul( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(src1_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[2]; - const int64_t ne0 = ne00 * ne01 * ne02 * ne03; + const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - const int nb2 = dst->nb[2]; - const int nb3 
= dst->nb[3]; - size_t x_size, d_size; - - float * d_X = (float *) ggml_cuda_pool_malloc(ne0 * sizeof(float), &x_size); // src0 - float * d_Y = (float *) src1->data; // src1 is already on device, broadcasted. - float * d_D = (float *) ggml_cuda_pool_malloc(ne0 * sizeof(float), &d_size); // dst - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const int i0 = i03*ne02 + i02; - float * c_X2 = d_X + i0*ne01*ne00; - float * c_D2 = d_D + i0*ne01*ne00; - - cudaStream_t cudaStream = g_cudaStreams[i0 % GGML_CUDA_MAX_STREAMS]; - cudaStream_t cudaStream2 = g_cudaStreams2[i0 % GGML_CUDA_MAX_STREAMS]; - cudaEvent_t cudaEvent = g_cudaEvents[i0 % GGML_CUDA_MAX_EVENTS]; - - // copy src0 to device - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X2, src0, i03, i02, cudaStream2)); - CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2)); - - // wait for data - CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0)); - - for (int64_t i01 = 0; i01 < ne01; i01++) { - const int64_t i13 = i03%ne13; - const int64_t i12 = i02%ne12; - const int64_t i11 = i01%ne11; - const int i1 = i13*ne12*ne11 + i12*ne11 + i11; - - float * c_X1 = c_X2 + i01*ne00; - float * c_Y = d_Y + i1*ne10; - float * c_D1 = c_D2 + i01*ne00; - - // compute - mul_f32_cuda(c_X1, c_Y, c_D1, ne00, ne10, cudaStream); - CUDA_CHECK(cudaGetLastError()); - } - // copy dst to host - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - CUDA_CHECK(cudaMemcpyAsync(d, c_D2, sizeof(float)*ne00*ne01, cudaMemcpyDeviceToHost, cudaStream)); - } + for (int64_t i01 = i01_low; i01 < i01_high; i01++) { + const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0 + + float * src0_ddf_i01 = src0_ddf_i + i01*ne00; + float * src1_ddf_i01 = src1_ddf_i + i11*ne10; + float * dst_ddf_i01 = dst_ddf_i + i01*ne00; + + // compute + mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main); + CUDA_CHECK(cudaGetLastError()); } - CUDA_CHECK(cudaDeviceSynchronize()); - ggml_cuda_pool_free(d_X, x_size); - ggml_cuda_pool_free(d_D, d_size); + + (void) dst; + (void) src0_ddq_i; + (void) i02; } -static void ggml_cuda_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +inline void ggml_cuda_op_silu( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; + const int64_t i01_diff = i01_high - i01_low; + + // compute + silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main); + CUDA_CHECK(cudaGetLastError()); + + (void) src1; + (void) dst; + (void) src0_ddq_i; + (void) src1_ddf_i; + (void) i02; + (void) i1; +} - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; +inline void ggml_cuda_op_rms_norm( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 
* ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - const int n_mm = ne03 * ne02; - - size_t x_size, y_size, d_size; - float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size); - float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size); - float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - int i = i03*ne02 + i02; - cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS]; - - float * c_X = d_X + i * x_ne; - float * c_Y = d_Y + i * y_ne; - float * c_D = d_D + i * d_ne; - - // copy data to device - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream)); - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream)); - - // compute - CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream)); - CUBLAS_CHECK( - cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, - ne01, ne11, ne10, - &alpha, c_X, ne00, - c_Y, ne10, - &beta, c_D, ne01)); - - // copy dst to host - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream)); - } + const int64_t ne00 = src0->ne[0]; + const int64_t i01_diff = i01_high - i01_low; + + // compute + rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main); + CUDA_CHECK(cudaGetLastError()); + + (void) src1; + (void) dst; + (void) src0_ddq_i; + (void) src1_ddf_i; + (void) i02; + (void) i1; +} + +inline void ggml_cuda_op_dequantize_mul_mat_vec( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddq_i != nullptr); + GGML_ASSERT(src1_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows = i01_high - i01_low; + + switch (src0->type) { + case GGML_TYPE_Q4_0: + dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q4_1: + dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q5_0: + dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q5_1: + dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q8_0: + dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q2_K: + dequantize_mul_mat_vec_q2_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q3_K: + dequantize_mul_mat_vec_q3_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q4_K: + dequantize_mul_mat_vec_q4_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q5_K: + dequantize_mul_mat_vec_q5_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q6_K: + dequantize_mul_mat_vec_q6_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_F16: + convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + 
default: + GGML_ASSERT(false); + break; } + CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaDeviceSynchronize()); - ggml_cuda_pool_free(d_X, x_size); - ggml_cuda_pool_free(d_Y, y_size); - ggml_cuda_pool_free(d_D, d_size); + (void) src1; + (void) dst; + (void) src0_ddf_i; + (void) i02; + (void) i1; } -static void ggml_cuda_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) { +inline void ggml_cuda_op_mul_mat_cublas( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ + + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(src1_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); + + const float alpha = 1.0f; + const float beta = 0.0f; + const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - const int nb13 = src1->nb[3]; + const int64_t ne0 = dst->ne[0]; + const int64_t i01_diff = i01_high - i01_low; + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + + // the main device has a larger memory buffer to hold the results from all GPUs + // ldc == nrows of the matrix that cuBLAS writes into + int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff; + + CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], cudaStream_main)); + CUBLAS_CHECK( + cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + i01_diff, ne11, ne10, + &alpha, src0_ddf_i, ne00, + src1_ddf_i, ne10, + &beta, dst_ddf_i, ldc)); + + (void) dst; + (void) src0_ddq_i; + (void) i02; + (void) i1; +} - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; +inline void ggml_cuda_op_rope( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, + float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, + cudaStream_t & cudaStream_main){ - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - const int n_mm = ne03 * ne02; - - size_t x_size, y_size, d_size; - half * d_X = (half *) ggml_cuda_pool_malloc(n_mm * sizeof(half) * x_ne, &x_size); - half * d_Y = (half *) ggml_cuda_pool_malloc(n_mm * sizeof(half) * y_ne, &y_size); - float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size); - - bool src1_cont_rows = nb10 == sizeof(float); - bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - int i = i03*ne02 + i02; - cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS]; - - half * c_X = d_X + i * x_ne; - half * c_Y = d_Y + i * y_ne; - float * c_D = d_D + i * d_ne; - - // copy src0 to device - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream)); - - // convert src1 to fp16 - // TODO: use multiple threads - ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02); - char * src1i = (char *) src1->data + i03*nb13 + i02*nb12; - if (src1_cont_rows) { - if (src1_cont_cols) { - ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11); - } - else { - for 
(int64_t i01 = 0; i01 < ne11; i01++) { - ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10); - } - } - } - else { - for (int64_t i01 = 0; i01 < ne11; i01++) { - for (int64_t i00 = 0; i00 < ne10; i00++) { - // very slow due to no inlining - tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10)); - } - } - } + GGML_ASSERT(src0_ddf_i != nullptr); + GGML_ASSERT(dst_ddf_i != nullptr); - // copy src1 to device - CUDA_CHECK(cudaMemcpyAsync(c_Y, tmp, sizeof(half) * y_ne, cudaMemcpyHostToDevice, cudaStream)); - - // compute - CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream)); - CUBLAS_CHECK( - cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, - ne01, ne11, ne10, - &alpha, c_X, CUDA_R_16F, ne00, - c_Y, CUDA_R_16F, ne10, - &beta, c_D, CUDA_R_32F, ne01, - CUBLAS_COMPUTE_32F_FAST_16F, - CUBLAS_GEMM_DEFAULT)); - - // copy dst to host - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream)); - } - } + const int64_t ne00 = src0->ne[0]; + const int64_t i01_diff = i01_high - i01_low; + + const int n_past = ((int32_t *) src1->data)[0]; + const int n_dims = ((int32_t *) src1->data)[1]; + const int mode = ((int32_t *) src1->data)[2]; + GGML_ASSERT(mode == 0); + + const float theta_scale = powf(10000.0, -2.0f/n_dims); + const float p = ((mode & 1) == 0 ? n_past + i02 : i02); - CUDA_CHECK(cudaDeviceSynchronize()); - ggml_cuda_pool_free(d_X, x_size); - ggml_cuda_pool_free(d_Y, y_size); - ggml_cuda_pool_free(d_D, d_size); + // compute + rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main); + CUDA_CHECK(cudaGetLastError()); + + (void) dst; + (void) src0_ddq_i; + (void) src1_ddf_i; + (void) i1; } -static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + ggml_cuda_op_t op, bool src0_needs_f32) { const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; + const int64_t nrows0 = ggml_nrows(src0); - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; + const bool use_src1 = src1 != nullptr; + const int64_t ne10 = use_src1 ? src1->ne[0] : 1; + const int64_t ne11 = use_src1 ? src1->ne[1] : 1; + const int64_t ne12 = use_src1 ? src1->ne[2] : 1; + const int64_t ne13 = use_src1 ? 
src1->ne[3] : 1; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; - const ggml_type type = src0->type; - const bool mul_mat_vec = ne11 == 1; - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - const int n_mm = ne03 * ne02; - const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type); - - size_t x_size, y_size, d_size, q_size; - float * d_X = nullptr; - if (!mul_mat_vec) { - d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size); - } - float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size); - float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size); - char * d_Q = (char *) ggml_cuda_pool_malloc(n_mm * q_sz, &q_size); - - const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(type); - dequantize_mul_mat_vec_cuda_t dmmv = ggml_get_dequantize_mul_mat_vec_cuda(type); - GGML_ASSERT(to_fp32_cuda != nullptr); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - int i = i03*ne02 + i02; - cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS]; - cudaStream_t cudaStream2 = g_cudaStreams2[i % GGML_CUDA_MAX_STREAMS]; - cudaEvent_t cudaEvent = g_cudaEvents[i % GGML_CUDA_MAX_EVENTS]; - - float * c_Y = d_Y + i * y_ne; - float * c_D = d_D + i * d_ne; - char * c_Q = d_Q + i * q_sz; - - // copy src0 to device if necessary - if (src0->backend == GGML_BACKEND_CPU) { - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Q, src0, i03, i02, cudaStream2)); - } else if (src0->backend == GGML_BACKEND_CUDA) { - c_Q = ((char *) src0->data) + i * q_sz; + GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); + + // strides for iteration over dims 3 and 2 + const int64_t src0_stride = ne00 * ne01; + const int64_t src1_stride = ne10 * ne11; + const int64_t dst_stride = ne0 * ne1; + const int64_t num_iters = ne02 * ne03; + + const size_t src0_ts = ggml_type_size(src0->type); + const size_t src0_bs = ggml_blck_size(src0->type); + + struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? 
(ggml_tensor_extra_gpu *) src1->extra : nullptr; + struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + + const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool src0_is_f32 = src0->type == GGML_TYPE_F32; + + const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); + + // dd = data device + char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized + float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float + float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; + float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; + + // asq = actual size quantized, asf = actual size float + size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0}; + size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0}; + size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0}; + size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0}; + + for (int id = 0; id < g_device_count; ++id) { + if (!split && id != g_main_device) { + continue; + } + + const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU && id == g_main_device; + const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; + + int64_t row_low, row_high; + if (split) { + row_low = id == 0 ? 0 : nrows0*g_tensor_split[id]; + row_low -= row_low % GGML_CUDA_DMMV_Y; + row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1]; + row_high -= row_high % GGML_CUDA_DMMV_Y; + } else { + row_low = 0; + row_high = nrows0; + } + if (row_low == row_high) { + continue; + } + + int64_t row_diff = row_high - row_low; + + cudaSetDevice(id); + + if (src0_on_device) { + if (src0_is_f32) { + src0_ddf[id] = (float *) src0_extra->data_device[id]; + } else { + src0_ddq[id] = (char *) src0_extra->data_device[id]; + } + } else { + if (src0_is_f32) { + src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]); } else { - GGML_ASSERT(false); + src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]); } - if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel - CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2)); - - // copy src1 to device - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream)); - - // wait for data - CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0)); - - // compute - //printf("Calling dmmv\n"); - dmmv(c_Q, c_Y, c_D, ne00, ne01, cudaStream); - CUDA_CHECK(cudaGetLastError()); - - } else { // general dequantization kernel + cuBLAS matrix matrix multiplication - float * c_X = d_X + i * x_ne; - -//typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream); - // convert src0 to fp32 on device - to_fp32_cuda(c_Q, c_X, x_ne, cudaStream2); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2)); - - // copy src1 to device - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream)); - - // wait for conversion - CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0)); - - // compute - CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream)); - CUBLAS_CHECK( - cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, - ne01, ne11, ne10, - &alpha, c_X, ne00, - c_Y, ne10, - &beta, c_D, ne01)); + } + + if (src0_needs_f32 && !src0_is_f32) { + src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]); + } + + if (use_src1) { + if (src1_on_device) { + src1_ddf[id] = (float *) src1_extra->data_device[id]; 
+ } else { + src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]); } + } + if (dst_on_device) { + dst_ddf[id] = (float *) dst_extra->data_device[id]; + } else { + size_t size_dst_ddf = split ? row_diff*ne1 * sizeof(float) : num_iters*dst_stride * sizeof(float); + dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]); + } + + for (int64_t i03 = 0; i03 < ne03; i03++) { + const int64_t i13 = i03 % ne13; + for (int64_t i02 = 0; i02 < ne02; i02++) { + const int64_t i12 = i02 % ne12; + + const int64_t i0 = i03*ne02 + i02; + const int64_t i0_offset_low = row_low/ne01; + const int64_t i0_offset_high = row_high/ne01; + + int64_t i01_low = 0; + int64_t i01_high = ne01; + if (split) { + if (i0 < i0_offset_low || i0 > i0_offset_high) { + continue; + } + if (i0 == i0_offset_low) { + i01_low = row_low % ne01; + } + if (i0 == i0_offset_high) { + i01_high = row_high % ne01; + } + } + const int64_t i01_diff = i01_high - i01_low; + if (i01_diff == 0) { + continue; + } + const int64_t i11 = i13*ne12 + i12; + + cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS]; + cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS]; + cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS]; + + // for split tensors the data begins at i0 == i0_offset_low + char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs; + float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride; + float * src1_ddf_i = src1_ddf[id] + i11*src1_stride; + float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride; + + // for split tensors the data pointer needs to be rounded down + // to the bin edge for i03, i02 bins beyond the first + if (i0 - i0_offset_low > 0) { + src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs; + src0_ddf_i -= (row_low % ne01)*ne00; + } + if (i0 - i0_offset_low > 0) { + dst_ddf_i -= (row_low % ne0)*ne1; + } + + // the main device memory buffer can be on VRAM scratch, with space for all partial results + // in that case an offset on dst_ddf_i is needed + if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) { + dst_ddf_i += i01_low; // offset is 0 if no tensor split + } + + // copy src0, src1 to device if necessary + if (use_src1) { + if (src1->backend == GGML_BACKEND_CPU) { + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_memcpy_src1)); + } else if (src1->backend == GGML_BACKEND_GPU) { + if (id != g_main_device) { + float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device]; + src1_ddf_i_source += i11*src1_stride; + CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float), + cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1)); + } + } else { + GGML_ASSERT(false); + } + } + CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1)); + if (!src0_on_device) { + if (src0_is_f32) { + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main)); + } else { + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main)); + } + } + + // convert src0 to f32 if it's necessary for the ggml_cuda_op + if (src0_needs_f32 && !src0_is_f32) { + to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main); + CUDA_CHECK(cudaGetLastError()); + } - // copy dst to host - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - 
CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream)); + // wait with main stream until src1 memcpy is done + CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0)); + + // do the computation + op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main); + + // copy dst to host or other device if necessary + if (!dst_on_device) { + void * dst_off_device; + cudaMemcpyKind kind; + if (dst->backend == GGML_BACKEND_CPU) { + dst_off_device = dst->data; + kind = cudaMemcpyDeviceToHost; + } else if (dst->backend == GGML_BACKEND_GPU) { + dst_off_device = dst_extra->data_device[g_main_device]; + kind = cudaMemcpyDeviceToDevice; + } else { + GGML_ASSERT(false); + } + if (split) { + // src0 = weight matrix is saved as a transposed matrix for better memory layout. + // dst is NOT transposed. + // The outputs of cuBLAS matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU. + // Instead they need to be copied to the correct slice in ne0 = dst row index. + // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results. + for (int64_t j = 0; j < ne1; ++j) { + float * dhf_dst_i = (float *) ((char *) dst_off_device + (j*ne0 + i01_low)*sizeof(float) + i02*nb2 + i03*nb3); + CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i + j*i01_diff, i01_diff*sizeof(float), kind, cudaStream_main)); + } + } else { + float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); + CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main)); + } + } + } } } - CUDA_CHECK(cudaDeviceSynchronize()); - if (!mul_mat_vec) { - ggml_cuda_pool_free(d_X, x_size); + // wait until each device is finished, then free their buffers + for (int id = 0; id < g_device_count; ++id) { + CUDA_CHECK(cudaSetDevice(id)); + CUDA_CHECK(cudaDeviceSynchronize()); + if (src0_asq[id] > 0) { + ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]); + } + if (src0_asf[id] > 0) { + ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]); + } + if (src1_asf[id] > 0) { + ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]); + } + if (dst_asf[id] > 0) { + ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]); + } } - ggml_cuda_pool_free(d_Y, y_size); - ggml_cuda_pool_free(d_D, d_size); - ggml_cuda_pool_free(d_Q, q_size); } -void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { +void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cuda_mul_f32(src0, src1, dst); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true); +} + +void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true); +} + +void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true); +} + +void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true); } bool 
ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + GGML_ASSERT(src0->backend != GGML_BACKEND_GPU); const int64_t ne10 = src1->ne[0]; const int64_t ne0 = dst->ne[0]; const int64_t ne1 = dst->ne[1]; + // if (strcmp(dst->name, "KQ") == 0 || strcmp(dst->name, "KQV") == 0) { + // fprintf(stderr, "(%ld, %ld, %ld, %ld) + (%ld, %ld, %ld, %ld) -> (%ld, %ld, %ld, %ld)\n", + // src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], + // src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], + // dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); + // return false; + // } + // TODO: find the optimal values for these if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && - ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CUDA)) { + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { return true; } return false; } -bool ggml_cuda_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) { - size_t src0_sz = ggml_nbytes(src0); - size_t src1_sz = ggml_nbytes(src1); - - // mul_mat_q: src0 is converted to fp32 on device - size_t mul_mat_q_transfer = src0_sz + src1_sz; +void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + if (src0->type == GGML_TYPE_F32) { + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true); + } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { + if (src1->ne[1] == 1) { + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); + } else { + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true); + } + } else { + GGML_ASSERT(false); + } +} - // mul_mat_f16: src1 is converted to fp16 on cpu - size_t mul_mat_f16_transfer = src0_sz + sizeof(half) * ggml_nelements(src1); +void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true); +} - // choose the smaller one to transfer to the device - // TODO: this is not always the best choice due to the overhead of converting to fp16 - return mul_mat_f16_transfer < mul_mat_q_transfer; +void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + (void) src0; + (void) src1; + (void) dst; } -void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) { - GGML_ASSERT(ggml_cuda_can_mul_mat(src0, src1, dst)); +void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) { + FILE * fp = fopen(fname, "rb"); + int nrows = ggml_nrows(tensor); + const size_t nb1 = tensor->nb[1]; + ggml_backend backend = tensor->backend; + struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; - if (src0->type == GGML_TYPE_F32) { - ggml_cuda_mul_mat_f32(src0, src1, dst); - } - else if (src0->type == GGML_TYPE_F16) { - if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) { - ggml_cuda_mul_mat_f16(src0, src1, dst, wdata, wsize); + for (int id = 0; id < g_device_count; ++id) { + extra->data_device[id] = nullptr; + + if (backend == GGML_BACKEND_GPU && id != g_main_device) { + continue; } - else { - ggml_cuda_mul_mat_q_f32(src0, src1, dst); + + cudaSetDevice(id); + + int row_low, row_high; + if (backend == GGML_BACKEND_GPU) { + row_low = 0; + 
row_high = nrows; + } else if (backend == GGML_BACKEND_GPU_SPLIT) { + row_low = id == 0 ? 0 : nrows*g_tensor_split[id]; + row_low -= row_low % GGML_CUDA_DMMV_Y; + row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1]; + row_high -= row_high % GGML_CUDA_DMMV_Y; + } else { + GGML_ASSERT(false); } + if (row_low == row_high) { + continue; + } + + int64_t nrows_split = row_high - row_low; + + const size_t offset_split = offset + row_low*nb1; + const size_t size = ggml_nbytes_split(tensor, nrows_split); + + void * buf; + CUDA_CHECK(cudaMalloc(&buf, size)); + void * buf_host = malloc(size); + +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset_split, SEEK_SET); +#else + int ret = fseek(fp, (long) offset_split, SEEK_SET); +#endif + GGML_ASSERT(ret == 0); // same + + size_t ret2 = fread(buf_host, size, 1, fp); + if (ret2 != 1) { + fprintf(stderr, "unexpectedly reached end of file"); + exit(1); + } + + cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + + free(buf_host); + extra->data_device[id] = buf; } - else if (ggml_is_quantized(src0->type)) { - ggml_cuda_mul_mat_q_f32(src0, src1, dst); - } - else { - GGML_ASSERT(false); - } + + tensor->extra = extra; + fclose(fp); } -size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) { - return ggml_nelements(src1) * sizeof(ggml_fp16_t); +void ggml_cuda_free_data(struct ggml_tensor * tensor) { + if (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) { + return; } - else { - return 0; + + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + + for (int id = 0; id < g_device_count; ++id) { + if (extra->data_device[id] == nullptr) { + continue; + } + + CUDA_CHECK(cudaSetDevice(id)); + CUDA_CHECK(cudaFree(extra->data_device[id])); } + + delete extra; } -void ggml_cuda_transform_tensor(ggml_tensor * tensor) { - const int64_t ne0 = tensor->ne[0]; - const int64_t ne1 = tensor->ne[1]; - const int64_t ne2 = tensor->ne[2]; - const int64_t ne3 = tensor->ne[3]; +void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) { + if (tensor->src0 != nullptr && tensor->src0->op == GGML_OP_RESHAPE) { + ggml_cuda_assign_buffers(tensor); + } - const ggml_type type = tensor->type; - const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type); + const size_t size = ggml_nbytes(tensor); + GGML_ASSERT(size <= g_scratch_size); + if (g_scratch_offset + size > g_scratch_size) { + g_scratch_offset = 0; + } - size_t q_size; - char * dst = (char *) ggml_cuda_pool_malloc(q_sz, &q_size); + tensor->backend = GGML_BACKEND_GPU; + struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu; - cudaStream_t cudaStream2 = g_cudaStreams2[0]; + bool inplace = tensor->src0 != nullptr && tensor->src0->data == tensor->data; - // copy tensor to device - for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = 0; i2 < ne2; i2++) { - int i = i3*ne2 + i2; - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(dst + i*ne0*ne1, tensor, i3, i2, cudaStream2)); + CUDA_CHECK(cudaSetDevice(g_main_device)); + if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) { + struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra; + extra->data_device[g_main_device] = src0_extra->data_device; + GGML_ASSERT(false); + } else { + char * data = (char *) g_scratch_buffer; + if (data == nullptr) { + CUDA_CHECK(cudaMalloc(&data, 
g_scratch_size)); + g_scratch_buffer = data; } + extra->data_device[g_main_device] = data + g_scratch_offset; } - tensor->data = dst; - tensor->backend = GGML_BACKEND_CUDA; -} + // fprintf(stderr, "data=%p offset=%ld data_device=%p\n", data, g_scratch_offset, extra->data_device[0]); + g_scratch_offset += size; + // fprintf(stderr, "%s: scratch %d, %p - %p\n", + // tensor->name, g_scratch_index, data + g_scratch_offset, data + g_scratch_offset + size); -void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) { - FILE * fp = fopen(fname, "rb"); + GGML_ASSERT(g_scratch_offset <= g_scratch_size); + tensor->extra = extra; +} - const size_t size = ggml_nbytes(tensor); +void ggml_cuda_set_main_device(int main_device) { + if (main_device > g_device_count) { + fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n", + main_device, g_device_count, g_main_device); + return; + } + g_main_device = main_device; + if (g_device_count > 1) { + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device)); + fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name); + } +} - void * buf; - CUDA_CHECK(cudaMalloc(&buf, size)); - void * buf_host = malloc(size); +void ggml_cuda_set_scratch_size(size_t scratch_size) { + g_scratch_size = scratch_size; +} -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, SEEK_SET); -#else - int ret = fseek(fp, (long) offset, SEEK_SET); -#endif - GGML_ASSERT(ret == 0); // same +bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ + ggml_cuda_func_t func; + const bool any_on_device = tensor->backend == GGML_BACKEND_GPU + || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT + || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU); - size_t ret2 = fread(buf_host, size, 1, fp); - if (ret2 != 1) { - fprintf(stderr, "unexpectedly reached end of file"); - exit(1); + switch (tensor->op) { + case GGML_OP_ADD: + if (!any_on_device) { + return false; + } + func = ggml_cuda_add; + break; + case GGML_OP_MUL: + if (!any_on_device) { + return false; + } + func = ggml_cuda_mul; + break; + case GGML_OP_SILU: + if (!any_on_device) { + return false; + } + func = ggml_cuda_silu; + break; + case GGML_OP_RMS_NORM: + if (!any_on_device) { + return false; + } + func = ggml_cuda_rms_norm; + break; + case GGML_OP_MUL_MAT: + if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) { + return false; + } + func = ggml_cuda_mul_mat; + break; + case GGML_OP_RESHAPE: + if (!any_on_device) { + return false; + } + func = ggml_cuda_nop; + break; + case GGML_OP_ROPE: + if (!any_on_device) { + return false; + } + func = ggml_cuda_rope; + break; + default: + return false; } - cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice); - cudaDeviceSynchronize(); - - tensor->data = buf; - free(buf_host); - fclose(fp); + if (params->ith != 0) { + return true; + } + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return true; + } + func(tensor->src0, tensor->src1, tensor); + return true; } |
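As a footnote on the reduction used in rms_norm_f32 above: each of the 32 lanes accumulates a strided partial sum of squares and then folds it with __shfl_xor_sync, after which every lane holds the full warp sum. A self-contained toy kernel isolating just that pattern (illustrative only; warp_sum is not part of ggml):

#include <cstdio>
#include <cuda_runtime.h>

#define WARP_SIZE 32

// One warp sums n floats: strided per-lane accumulation, then a butterfly
// reduction over lane masks 16, 8, 4, 2, 1 so every lane ends with the total.
static __global__ void warp_sum(const float * x, float * out, const int n) {
    const int lane = threadIdx.x; // 0..31, single warp

    float tmp = 0.0f;
    for (int i = lane; i < n; i += WARP_SIZE) {
        tmp += x[i];
    }

#pragma unroll
    for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) {
        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, WARP_SIZE);
    }

    if (lane == 0) {
        *out = tmp;
    }
}

int main() {
    const int n = 1000;
    float * x = nullptr;
    float * out = nullptr;
    cudaMallocManaged(&x, n*sizeof(float));
    cudaMallocManaged(&out, sizeof(float));
    for (int i = 0; i < n; ++i) {
        x[i] = 1.0f;
    }

    warp_sum<<<1, WARP_SIZE>>>(x, out, n);
    cudaDeviceSynchronize();
    printf("sum = %.0f (expected %d)\n", *out, n);

    cudaFree(x);
    cudaFree(out);
    return 0;
}

Built with nvcc, this prints sum = 1000; rms_norm_f32 applies the same butterfly but on the sum of squares per row, then scales the row by 1/sqrt(mean + eps).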