Diffstat (limited to 'ggml-cuda.cu')
 ggml-cuda.cu | 1291 +++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 875 insertions(+), 416 deletions(-)
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 5385e01..c700890 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -24,19 +24,35 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
} \
} while (0)
+#if CUDART_VERSION >= 12000
#define CUBLAS_CHECK(err) \
do { \
cublasStatus_t err_ = (err); \
if (err_ != CUBLAS_STATUS_SUCCESS) { \
- fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
+ fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
+ err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
exit(1); \
} \
} while (0)
+#else
+#define CUBLAS_CHECK(err) \
+ do { \
+ cublasStatus_t err_ = (err); \
+ if (err_ != CUBLAS_STATUS_SUCCESS) { \
+ fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
+ exit(1); \
+ } \
+ } while (0)
+#endif // CUDART_VERSION >= 12000
typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
-typedef void (*dequantize_mul_mat_vec_cuda_t)(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream);
typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
+typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+typedef void (*ggml_cuda_op_t)(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i,
+ float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+ cudaStream_t & cudaStream_main);
// QK = number of values after dequantization
// QR = QK / number of values before dequantization
@@ -132,8 +148,10 @@ static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
#define WARP_SIZE 32
+#define CUDA_ADD_BLOCK_SIZE 256
#define CUDA_MUL_BLOCK_SIZE 256
-
+#define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_ROPE_BLOCK_SIZE 256
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
// dmmv = dequantize_mul_mat_vec
@@ -144,6 +162,15 @@ static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
#define GGML_CUDA_DMMV_Y 1
#endif
+static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+ dst[i] = x[i] + y[i];
+}
+
static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -153,6 +180,45 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co
dst[i] = x[i] * y[i%ky];
}
+static __global__ void silu_f32(const float * x, float * dst, const int k) {
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+ if (i >= k) {
+ return;
+ }
+ dst[i] = x[i] / (1.0f + expf(-x[i]));
+}
+
+static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
+ const int row = blockIdx.x*blockDim.y + threadIdx.y;
+ const int tid = threadIdx.x;
+
+ const float eps = 1e-6;
+
+ float tmp = 0.0f; // partial sum for thread in warp
+
+ for (int i = 0; i < ncols; i += WARP_SIZE) {
+ const int col = i + tid;
+ const float xi = x[row*ncols + col];
+ tmp += xi * xi;
+ }
+
+ // sum up partial sums
+ __syncthreads();
+#pragma unroll
+ for (int mask = 16; mask > 0; mask >>= 1) {
+ tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+ }
+
+ const float mean = tmp / ncols;
+ const float scale = 1.0f / sqrtf(mean + eps);
+
+ for (int i = 0; i < ncols; i += WARP_SIZE) {
+ const int col = i + tid;
+ dst[row*ncols + col] = scale * x[row*ncols + col];
+ }
+}
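
The rms_norm_f32 kernel above assigns one row per block of WARP_SIZE threads; each thread accumulates a strided partial sum of squares and the warp then combines the partials with a __shfl_xor_sync butterfly reduction before every thread writes its share of the scaled row. As a hedged illustration of the intended semantics only (not part of the patch), a host-side reference would look roughly like this:

    // Reference for rms_norm_f32 (illustration; requires <cmath>).
    // Each row is scaled by 1/sqrt(mean(x_i^2) + eps); no learned weight is applied here.
    static void rms_norm_f32_ref(const float * x, float * dst, int nrows, int ncols, float eps = 1e-6f) {
        for (int row = 0; row < nrows; ++row) {
            float sum = 0.0f;
            for (int col = 0; col < ncols; ++col) {
                const float xi = x[row*ncols + col];
                sum += xi*xi;
            }
            const float scale = 1.0f / sqrtf(sum/ncols + eps);
            for (int col = 0; col < ncols; ++col) {
                dst[row*ncols + col] = scale * x[row*ncols + col];
            }
        }
    }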
+
static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
const block_q4_0 * x = (const block_q4_0 *) vx;
@@ -565,8 +631,8 @@ static __device__ void vec_dot_q6_k(const void * vx, const int ib, const int iqs
static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){
const half * x = (const half *) vx;
- v0 = __half2float(x[ib + 0]);
- v1 = __half2float(x[ib + 1]);
+ v0 = __half2float(x[ib + iqs + 0]);
+ v1 = __half2float(x[ib + iqs + 1]);
}
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -599,7 +665,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
const int y_offset = qr == 1 ? 1 : qk/2;
- float tmp = 0; // partial sum for thread in warp
+ float tmp = 0.0f; // partial sum for thread in warp
for (int i = 0; i < ncols; i += iter_stride) {
const int col = i + vals_per_iter*tid;
@@ -671,11 +737,48 @@ static __global__ void dequantize_mul_mat_vec_k(const void * vx, const float * y
}
}
+static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p, const float theta_scale) {
+ const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
+
+ if (col >= ncols) {
+ return;
+ }
+
+ const int row = blockDim.y*blockIdx.y + threadIdx.y;
+ const int i = row*ncols + col;
+
+ const float theta = p*powf(theta_scale, col/2);
+ const float sin_theta = sinf(theta);
+ const float cos_theta = cosf(theta);
+
+ const float x0 = x[i + 0];
+ const float x1 = x[i + 1];
+
+ dst[i + 0] = x0*cos_theta - x1*sin_theta;
+ dst[i + 1] = x0*sin_theta + x1*cos_theta;
+}
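
rope_f32 applies rotary position embedding row by row: each thread owns one (even, odd) column pair, computes that pair's angle theta = p * theta_scale^(col/2), and rotates the pair by it. A minimal single-row reference, hedged as an illustration under the same layout assumptions as the kernel:

    // Reference for one row of rope_f32 (illustration; requires <cmath>).
    static void rope_f32_row_ref(const float * x, float * dst, int ncols, float p, float theta_scale) {
        for (int col = 0; col < ncols; col += 2) {
            const float theta     = p*powf(theta_scale, col/2); // integer division: pair index
            const float sin_theta = sinf(theta);
            const float cos_theta = cosf(theta);
            const float x0 = x[col + 0];
            const float x1 = x[col + 1];
            dst[col + 0] = x0*cos_theta - x1*sin_theta;
            dst[col + 1] = x0*sin_theta + x1*cos_theta;
        }
    }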
+
+static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
+ add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
+}
+
static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
}
+static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+ const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
+ silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+ GGML_ASSERT(ncols % WARP_SIZE == 0);
+ const dim3 block_dims(WARP_SIZE, 1, 1);
+ rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+}
+
static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -799,7 +902,7 @@ static void dequantize_mul_mat_vec_q6_k_cuda(const void * vx, const float * y, f
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
- dequantize_block<32, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+ dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
}
static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
@@ -839,33 +942,12 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
}
}
-static dequantize_mul_mat_vec_cuda_t ggml_get_dequantize_mul_mat_vec_cuda(ggml_type type) {
- switch (type) {
- case GGML_TYPE_Q4_0:
- return dequantize_mul_mat_vec_q4_0_cuda;
- case GGML_TYPE_Q4_1:
- return dequantize_mul_mat_vec_q4_1_cuda;
- case GGML_TYPE_Q5_0:
- return dequantize_mul_mat_vec_q5_0_cuda;
- case GGML_TYPE_Q5_1:
- return dequantize_mul_mat_vec_q5_1_cuda;
- case GGML_TYPE_Q8_0:
- return dequantize_mul_mat_vec_q8_0_cuda;
- case GGML_TYPE_Q2_K:
- return dequantize_mul_mat_vec_q2_k_cuda;
- case GGML_TYPE_Q3_K:
- return dequantize_mul_mat_vec_q3_k_cuda;
- case GGML_TYPE_Q4_K:
- return dequantize_mul_mat_vec_q4_k_cuda;
- case GGML_TYPE_Q5_K:
- return dequantize_mul_mat_vec_q5_k_cuda;
- case GGML_TYPE_Q6_K:
- return dequantize_mul_mat_vec_q6_k_cuda;
- case GGML_TYPE_F16:
- return convert_mul_mat_vec_f16_cuda;
- default:
- return nullptr;
- }
+static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float theta_scale, cudaStream_t stream) {
+ GGML_ASSERT(nrows % 2 == 0);
+ const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1);
+ const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+ const dim3 block_nums(num_blocks_x, nrows, 1);
+ rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
}
// buffer pool for cuda
@@ -890,14 +972,16 @@ struct cuda_buffer {
size_t size = 0;
};
-static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
+static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
scoped_spin_lock lock(g_cuda_pool_lock);
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
- cuda_buffer& b = g_cuda_buffer_pool[i];
+ cuda_buffer& b = g_cuda_buffer_pool[id][i];
if (b.size >= size && b.ptr != nullptr) {
void * ptr = b.ptr;
*actual_size = b.size;
@@ -914,9 +998,11 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
static void ggml_cuda_pool_free(void * ptr, size_t size) {
scoped_spin_lock lock(g_cuda_pool_lock);
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
- cuda_buffer& b = g_cuda_buffer_pool[i];
+ cuda_buffer& b = g_cuda_buffer_pool[id][i];
if (b.ptr == nullptr) {
b.ptr = ptr;
b.size = size;
@@ -927,31 +1013,87 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
CUDA_CHECK(cudaFree(ptr));
}
+
+static void * g_scratch_buffer = nullptr;
+static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
+static size_t g_scratch_offset = 0;
+
#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
#define GGML_CUDA_MAX_EVENTS 64
-static cublasHandle_t g_cublasH = nullptr;
-static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_STREAMS] = { nullptr };
-static cudaStream_t g_cudaStreams2[GGML_CUDA_MAX_STREAMS] = { nullptr };
-static cudaEvent_t g_cudaEvents[GGML_CUDA_MAX_EVENTS] = { nullptr };
+
+static int g_device_count = -1;
+static int g_main_device = 0;
+static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
+
+static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
+
+static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
+static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr };
void ggml_init_cublas() {
- if (g_cublasH == nullptr) {
- // create streams
- for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
- CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[i], cudaStreamNonBlocking));
- CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams2[i], cudaStreamNonBlocking));
+ static bool initialized = false;
+
+ if (!initialized) {
+ CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
+ GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
+ int64_t total_vram = 0;
+ fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
+ for (int id = 0; id < g_device_count; ++id) {
+ cudaDeviceProp prop;
+ CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
+ fprintf(stderr, " Device %d: %s\n", id, prop.name);
+ g_tensor_split[id] = total_vram;
+ total_vram += prop.totalGlobalMem;
}
- // create events
- for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
- CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents[i], cudaEventDisableTiming));
+ for (int id = 0; id < g_device_count; ++id) {
+ g_tensor_split[id] /= total_vram;
}
- // create cublas handle
- CUBLAS_CHECK(cublasCreate(&g_cublasH));
- CUBLAS_CHECK(cublasSetMathMode(g_cublasH, CUBLAS_TF32_TENSOR_OP_MATH));
+ for (int id = 0; id < g_device_count; ++id) {
+ CUDA_CHECK(cudaSetDevice(id));
+
+ // create streams
+ for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
+ CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking));
+ CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking));
+ }
+ // create events
+ for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
+ CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming));
+ }
+
+ // create cublas handle
+ CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
+ CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
+ }
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
+
+ initialized = true;
+ }
+}
+
+void ggml_cuda_set_tensor_split(const float * tensor_split) {
+ bool all_zero = true;
+ for (int i = 0; i < g_device_count; ++i) {
+ if (tensor_split[i] != 0.0f) {
+ all_zero = false;
+ break;
+ }
+ }
+ if (all_zero) {
+ return;
+ }
+ float split_sum = 0.0f;
+ for (int i = 0; i < g_device_count; ++i) {
+ g_tensor_split[i] = split_sum;
+ split_sum += tensor_split[i];
+ }
+ for (int i = 0; i < g_device_count; ++i) {
+ g_tensor_split[i] /= split_sum;
}
}
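
Note that g_tensor_split stores normalized row-boundary prefixes rather than per-device fractions. As a worked example (not from the patch), two devices with tensor_split = {3, 1} give:

    prefix sums {0, 3}, split_sum = 4  ->  g_tensor_split = {0.0, 0.75}

so device 0 is later assigned rows [0, 0.75*nrows) and device 1 the remainder. The default computed in ggml_init_cublas follows the same convention, with each device's share proportional to its total VRAM.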
@@ -975,26 +1117,29 @@ void ggml_cuda_host_free(void * ptr) {
CUDA_CHECK(cudaFreeHost(ptr));
}
-static cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) {
- const uint64_t ne0 = src->ne[0];
- const uint64_t ne1 = src->ne[1];
- const uint64_t nb0 = src->nb[0];
- const uint64_t nb1 = src->nb[1];
- const uint64_t nb2 = src->nb[2];
- const uint64_t nb3 = src->nb[3];
+static cudaError_t ggml_cuda_h2d_tensor_2d(
+ void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
+
+ char * dst_char = (char *) dst;
+ const int64_t ne0 = src->ne[0];
+ const int64_t nb0 = src->nb[0];
+ const int64_t nb1 = src->nb[1];
+ const int64_t nb2 = src->nb[2];
+ const int64_t nb3 = src->nb[3];
const enum ggml_type type = src->type;
- const size_t ts = ggml_type_size(type);
- const size_t bs = ggml_blck_size(type);
+ const int64_t ts = ggml_type_size(type);
+ const int64_t bs = ggml_blck_size(type);
+ int64_t i1_diff = i1_high - i1_low;
- const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
+ const void * x = (const void *) ((const char *) src->data + i1_low*nb1 + i2*nb2 + i3*nb3);
if (nb0 == ts && nb1 == ts*ne0/bs) {
- return cudaMemcpyAsync(dst, x, ne1*nb1, cudaMemcpyHostToDevice, stream);
+ return cudaMemcpyAsync(dst_char, x, i1_diff*nb1, cudaMemcpyHostToDevice, stream);
} else if (nb0 == ts) {
- return cudaMemcpy2DAsync(dst, ts*ne0/bs, x, nb1, ts*ne0/bs, ne1, cudaMemcpyHostToDevice, stream);
+ return cudaMemcpy2DAsync(dst_char, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, cudaMemcpyHostToDevice, stream);
} else {
- for (uint64_t i1 = 0; i1 < ne1; i1++) {
+ for (int64_t i1 = 0; i1 < i1_diff; i1++) {
const void * rx = (const void *) ((const char *) x + i1*nb1);
- void * rd = (void *) ((char *) dst + i1*ts*ne0/bs);
+ void * rd = (void *) (dst_char + i1*ts*ne0/bs);
// pretend the row is a matrix with cols=1
cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyHostToDevice, stream);
if (r != cudaSuccess) return r;
@@ -1003,446 +1148,760 @@ static cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor
}
}
-static void ggml_cuda_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- GGML_ASSERT(src1->backend == GGML_BACKEND_CUDA);
+inline void ggml_cuda_op_add(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+ cudaStream_t & cudaStream_main){
+
+ GGML_ASSERT(src0_ddf_i != nullptr);
+ GGML_ASSERT(src1_ddf_i != nullptr);
+ GGML_ASSERT(dst_ddf_i != nullptr);
+
+ const int64_t ne0 = src0->ne[0];
+ const int64_t i01_diff = i01_high - i01_low;
+
+ // compute
+ add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne0*i01_diff, cudaStream_main);
+ CUDA_CHECK(cudaGetLastError());
+
+ (void) src1;
+ (void) dst;
+ (void) src0_ddq_i;
+ (void) i02;
+ (void) i1;
+}
+
+inline void ggml_cuda_op_mul(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+ cudaStream_t & cudaStream_main){
+
+ GGML_ASSERT(src0_ddf_i != nullptr);
+ GGML_ASSERT(src1_ddf_i != nullptr);
+ GGML_ASSERT(dst_ddf_i != nullptr);
+
const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[2];
- const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
+
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
- const int64_t ne12 = src1->ne[2];
- const int64_t ne13 = src1->ne[3];
- const int nb2 = dst->nb[2];
- const int nb3 = dst->nb[3];
- size_t x_size, d_size;
-
- float * d_X = (float *) ggml_cuda_pool_malloc(ne0 * sizeof(float), &x_size); // src0
- float * d_Y = (float *) src1->data; // src1 is already on device, broadcasted.
- float * d_D = (float *) ggml_cuda_pool_malloc(ne0 * sizeof(float), &d_size); // dst
-
- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
- const int i0 = i03*ne02 + i02;
- float * c_X2 = d_X + i0*ne01*ne00;
- float * c_D2 = d_D + i0*ne01*ne00;
-
- cudaStream_t cudaStream = g_cudaStreams[i0 % GGML_CUDA_MAX_STREAMS];
- cudaStream_t cudaStream2 = g_cudaStreams2[i0 % GGML_CUDA_MAX_STREAMS];
- cudaEvent_t cudaEvent = g_cudaEvents[i0 % GGML_CUDA_MAX_EVENTS];
-
- // copy src0 to device
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X2, src0, i03, i02, cudaStream2));
- CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
-
- // wait for data
- CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
-
- for (int64_t i01 = 0; i01 < ne01; i01++) {
- const int64_t i13 = i03%ne13;
- const int64_t i12 = i02%ne12;
- const int64_t i11 = i01%ne11;
- const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
-
- float * c_X1 = c_X2 + i01*ne00;
- float * c_Y = d_Y + i1*ne10;
- float * c_D1 = c_D2 + i01*ne00;
-
- // compute
- mul_f32_cuda(c_X1, c_Y, c_D1, ne00, ne10, cudaStream);
- CUDA_CHECK(cudaGetLastError());
- }
- // copy dst to host
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
- CUDA_CHECK(cudaMemcpyAsync(d, c_D2, sizeof(float)*ne00*ne01, cudaMemcpyDeviceToHost, cudaStream));
- }
+ for (int64_t i01 = i01_low; i01 < i01_high; i01++) {
+ const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0
+
+ float * src0_ddf_i01 = src0_ddf_i + i01*ne00;
+ float * src1_ddf_i01 = src1_ddf_i + i11*ne10;
+ float * dst_ddf_i01 = dst_ddf_i + i01*ne00;
+
+ // compute
+ mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
+ CUDA_CHECK(cudaGetLastError());
}
- CUDA_CHECK(cudaDeviceSynchronize());
- ggml_cuda_pool_free(d_X, x_size);
- ggml_cuda_pool_free(d_D, d_size);
+
+ (void) dst;
+ (void) src0_ddq_i;
+ (void) i02;
}
-static void ggml_cuda_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+inline void ggml_cuda_op_silu(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+ cudaStream_t & cudaStream_main){
+
+ GGML_ASSERT(src0_ddf_i != nullptr);
+ GGML_ASSERT(dst_ddf_i != nullptr);
+
const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[3];
+ const int64_t i01_diff = i01_high - i01_low;
+
+ // compute
+ silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
+ CUDA_CHECK(cudaGetLastError());
+
+ (void) src1;
+ (void) dst;
+ (void) src0_ddq_i;
+ (void) src1_ddf_i;
+ (void) i02;
+ (void) i1;
+}
- const int64_t ne10 = src1->ne[0];
- const int64_t ne11 = src1->ne[1];
+inline void ggml_cuda_op_rms_norm(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+ cudaStream_t & cudaStream_main){
- const int nb2 = dst->nb[2];
- const int nb3 = dst->nb[3];
+ GGML_ASSERT(src0_ddf_i != nullptr);
+ GGML_ASSERT(dst_ddf_i != nullptr);
- const float alpha = 1.0f;
- const float beta = 0.0f;
- const int x_ne = ne01 * ne00;
- const int y_ne = ne11 * ne10;
- const int d_ne = ne11 * ne01;
- const int n_mm = ne03 * ne02;
-
- size_t x_size, y_size, d_size;
- float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
- float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
- float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
-
- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
- int i = i03*ne02 + i02;
- cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS];
-
- float * c_X = d_X + i * x_ne;
- float * c_Y = d_Y + i * y_ne;
- float * c_D = d_D + i * d_ne;
-
- // copy data to device
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream));
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
-
- // compute
- CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
- CUBLAS_CHECK(
- cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
- ne01, ne11, ne10,
- &alpha, c_X, ne00,
- c_Y, ne10,
- &beta, c_D, ne01));
-
- // copy dst to host
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
- CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
- }
+ const int64_t ne00 = src0->ne[0];
+ const int64_t i01_diff = i01_high - i01_low;
+
+ // compute
+ rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+ CUDA_CHECK(cudaGetLastError());
+
+ (void) src1;
+ (void) dst;
+ (void) src0_ddq_i;
+ (void) src1_ddf_i;
+ (void) i02;
+ (void) i1;
+}
+
+inline void ggml_cuda_op_dequantize_mul_mat_vec(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+ cudaStream_t & cudaStream_main){
+
+ GGML_ASSERT(src0_ddq_i != nullptr);
+ GGML_ASSERT(src1_ddf_i != nullptr);
+ GGML_ASSERT(dst_ddf_i != nullptr);
+
+ const int64_t ne00 = src0->ne[0];
+ const int64_t nrows = i01_high - i01_low;
+
+ switch (src0->type) {
+ case GGML_TYPE_Q4_0:
+ dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q4_1:
+ dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q5_0:
+ dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q5_1:
+ dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q8_0:
+ dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q2_K:
+ dequantize_mul_mat_vec_q2_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q3_K:
+ dequantize_mul_mat_vec_q3_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q4_K:
+ dequantize_mul_mat_vec_q4_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q5_K:
+ dequantize_mul_mat_vec_q5_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_Q6_K:
+ dequantize_mul_mat_vec_q6_k_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ case GGML_TYPE_F16:
+ convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
+ break;
+ default:
+ GGML_ASSERT(false);
+ break;
}
+ CUDA_CHECK(cudaGetLastError());
- CUDA_CHECK(cudaDeviceSynchronize());
- ggml_cuda_pool_free(d_X, x_size);
- ggml_cuda_pool_free(d_Y, y_size);
- ggml_cuda_pool_free(d_D, d_size);
+ (void) src1;
+ (void) dst;
+ (void) src0_ddf_i;
+ (void) i02;
+ (void) i1;
}
-static void ggml_cuda_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) {
+inline void ggml_cuda_op_mul_mat_cublas(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+ cudaStream_t & cudaStream_main){
+
+ GGML_ASSERT(src0_ddf_i != nullptr);
+ GGML_ASSERT(src1_ddf_i != nullptr);
+ GGML_ASSERT(dst_ddf_i != nullptr);
+
+ const float alpha = 1.0f;
+ const float beta = 0.0f;
+
const int64_t ne00 = src0->ne[0];
- const int64_t ne01 = src0->ne[1];
- const int64_t ne02 = src0->ne[2];
- const int64_t ne03 = src0->ne[3];
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
- const int nb10 = src1->nb[0];
- const int nb11 = src1->nb[1];
- const int nb12 = src1->nb[2];
- const int nb13 = src1->nb[3];
+ const int64_t ne0 = dst->ne[0];
+ const int64_t i01_diff = i01_high - i01_low;
+
+ int id;
+ CUDA_CHECK(cudaGetDevice(&id));
+
+ // the main device has a larger memory buffer to hold the results from all GPUs
+ // ldc == nrows of the matrix that cuBLAS writes into
+ int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
+
+ CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], cudaStream_main));
+ CUBLAS_CHECK(
+ cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+ i01_diff, ne11, ne10,
+ &alpha, src0_ddf_i, ne00,
+ src1_ddf_i, ne10,
+ &beta, dst_ddf_i, ldc));
+
+ (void) dst;
+ (void) src0_ddq_i;
+ (void) i02;
+ (void) i1;
+}
- const int nb2 = dst->nb[2];
- const int nb3 = dst->nb[3];
+inline void ggml_cuda_op_rope(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
+ float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
+ cudaStream_t & cudaStream_main){
- const float alpha = 1.0f;
- const float beta = 0.0f;
- const int x_ne = ne01 * ne00;
- const int y_ne = ne11 * ne10;
- const int d_ne = ne11 * ne01;
- const int n_mm = ne03 * ne02;
-
- size_t x_size, y_size, d_size;
- half * d_X = (half *) ggml_cuda_pool_malloc(n_mm * sizeof(half) * x_ne, &x_size);
- half * d_Y = (half *) ggml_cuda_pool_malloc(n_mm * sizeof(half) * y_ne, &y_size);
- float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
-
- bool src1_cont_rows = nb10 == sizeof(float);
- bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
-
- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
- int i = i03*ne02 + i02;
- cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS];
-
- half * c_X = d_X + i * x_ne;
- half * c_Y = d_Y + i * y_ne;
- float * c_D = d_D + i * d_ne;
-
- // copy src0 to device
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream));
-
- // convert src1 to fp16
- // TODO: use multiple threads
- ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
- char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
- if (src1_cont_rows) {
- if (src1_cont_cols) {
- ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
- }
- else {
- for (int64_t i01 = 0; i01 < ne11; i01++) {
- ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
- }
- }
- }
- else {
- for (int64_t i01 = 0; i01 < ne11; i01++) {
- for (int64_t i00 = 0; i00 < ne10; i00++) {
- // very slow due to no inlining
- tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
- }
- }
- }
+ GGML_ASSERT(src0_ddf_i != nullptr);
+ GGML_ASSERT(dst_ddf_i != nullptr);
- // copy src1 to device
- CUDA_CHECK(cudaMemcpyAsync(c_Y, tmp, sizeof(half) * y_ne, cudaMemcpyHostToDevice, cudaStream));
-
- // compute
- CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
- CUBLAS_CHECK(
- cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
- ne01, ne11, ne10,
- &alpha, c_X, CUDA_R_16F, ne00,
- c_Y, CUDA_R_16F, ne10,
- &beta, c_D, CUDA_R_32F, ne01,
- CUBLAS_COMPUTE_32F_FAST_16F,
- CUBLAS_GEMM_DEFAULT));
-
- // copy dst to host
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
- CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
- }
- }
+ const int64_t ne00 = src0->ne[0];
+ const int64_t i01_diff = i01_high - i01_low;
+
+ const int n_past = ((int32_t *) src1->data)[0];
+ const int n_dims = ((int32_t *) src1->data)[1];
+ const int mode = ((int32_t *) src1->data)[2];
+ GGML_ASSERT(mode == 0);
+
+ const float theta_scale = powf(10000.0, -2.0f/n_dims);
+ const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
- CUDA_CHECK(cudaDeviceSynchronize());
- ggml_cuda_pool_free(d_X, x_size);
- ggml_cuda_pool_free(d_Y, y_size);
- ggml_cuda_pool_free(d_D, d_size);
+ // compute
+ rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
+ CUDA_CHECK(cudaGetLastError());
+
+ (void) dst;
+ (void) src0_ddq_i;
+ (void) src1_ddf_i;
+ (void) i1;
}
-static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ ggml_cuda_op_t op, bool src0_needs_f32) {
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
+ const int64_t nrows0 = ggml_nrows(src0);
- const int64_t ne10 = src1->ne[0];
- const int64_t ne11 = src1->ne[1];
+ const bool use_src1 = src1 != nullptr;
+ const int64_t ne10 = use_src1 ? src1->ne[0] : 1;
+ const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
+ const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
+ const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
+
+ const int64_t ne0 = dst->ne[0];
+ const int64_t ne1 = dst->ne[1];
const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];
- const ggml_type type = src0->type;
- const bool mul_mat_vec = ne11 == 1;
- const float alpha = 1.0f;
- const float beta = 0.0f;
- const int x_ne = ne01 * ne00;
- const int y_ne = ne11 * ne10;
- const int d_ne = ne11 * ne01;
- const int n_mm = ne03 * ne02;
- const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
-
- size_t x_size, y_size, d_size, q_size;
- float * d_X = nullptr;
- if (!mul_mat_vec) {
- d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
- }
- float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
- float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
- char * d_Q = (char *) ggml_cuda_pool_malloc(n_mm * q_sz, &q_size);
-
- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(type);
- dequantize_mul_mat_vec_cuda_t dmmv = ggml_get_dequantize_mul_mat_vec_cuda(type);
- GGML_ASSERT(to_fp32_cuda != nullptr);
-
- for (int64_t i03 = 0; i03 < ne03; i03++) {
- for (int64_t i02 = 0; i02 < ne02; i02++) {
- int i = i03*ne02 + i02;
- cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS];
- cudaStream_t cudaStream2 = g_cudaStreams2[i % GGML_CUDA_MAX_STREAMS];
- cudaEvent_t cudaEvent = g_cudaEvents[i % GGML_CUDA_MAX_EVENTS];
-
- float * c_Y = d_Y + i * y_ne;
- float * c_D = d_D + i * d_ne;
- char * c_Q = d_Q + i * q_sz;
-
- // copy src0 to device if necessary
- if (src0->backend == GGML_BACKEND_CPU) {
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Q, src0, i03, i02, cudaStream2));
- } else if (src0->backend == GGML_BACKEND_CUDA) {
- c_Q = ((char *) src0->data) + i * q_sz;
+ GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
+ GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
+
+ // strides for iteration over dims 3 and 2
+ const int64_t src0_stride = ne00 * ne01;
+ const int64_t src1_stride = ne10 * ne11;
+ const int64_t dst_stride = ne0 * ne1;
+ const int64_t num_iters = ne02 * ne03;
+
+ const size_t src0_ts = ggml_type_size(src0->type);
+ const size_t src0_bs = ggml_blck_size(src0->type);
+
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+ struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
+ struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+
+ const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
+ const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
+
+ const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
+
+ // dd = data device
+ char * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
+ float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
+ float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
+ float * dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+ // asq = actual size quantized, asf = actual size float
+ size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
+ size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
+ size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
+ size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
+
+ for (int id = 0; id < g_device_count; ++id) {
+ if (!split && id != g_main_device) {
+ continue;
+ }
+
+ const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU && id == g_main_device;
+ const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
+
+ int64_t row_low, row_high;
+ if (split) {
+ row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
+ row_low -= row_low % GGML_CUDA_DMMV_Y;
+ row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
+ row_high -= row_high % GGML_CUDA_DMMV_Y;
+ } else {
+ row_low = 0;
+ row_high = nrows0;
+ }
+ if (row_low == row_high) {
+ continue;
+ }
+
+ int64_t row_diff = row_high - row_low;
+
+ cudaSetDevice(id);
+
+ if (src0_on_device) {
+ if (src0_is_f32) {
+ src0_ddf[id] = (float *) src0_extra->data_device[id];
+ } else {
+ src0_ddq[id] = (char *) src0_extra->data_device[id];
+ }
+ } else {
+ if (src0_is_f32) {
+ src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
} else {
- GGML_ASSERT(false);
+ src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]);
}
- if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
- CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
-
- // copy src1 to device
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
-
- // wait for data
- CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
-
- // compute
- //printf("Calling dmmv\n");
- dmmv(c_Q, c_Y, c_D, ne00, ne01, cudaStream);
- CUDA_CHECK(cudaGetLastError());
-
- } else { // general dequantization kernel + cuBLAS matrix matrix multiplication
- float * c_X = d_X + i * x_ne;
-
-//typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
- // convert src0 to fp32 on device
- to_fp32_cuda(c_Q, c_X, x_ne, cudaStream2);
- CUDA_CHECK(cudaGetLastError());
- CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
-
- // copy src1 to device
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
-
- // wait for conversion
- CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
-
- // compute
- CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
- CUBLAS_CHECK(
- cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
- ne01, ne11, ne10,
- &alpha, c_X, ne00,
- c_Y, ne10,
- &beta, c_D, ne01));
+ }
+
+ if (src0_needs_f32 && !src0_is_f32) {
+ src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
+ }
+
+ if (use_src1) {
+ if (src1_on_device) {
+ src1_ddf[id] = (float *) src1_extra->data_device[id];
+ } else {
+ src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
}
+ }
+ if (dst_on_device) {
+ dst_ddf[id] = (float *) dst_extra->data_device[id];
+ } else {
+ size_t size_dst_ddf = split ? row_diff*ne1 * sizeof(float) : num_iters*dst_stride * sizeof(float);
+ dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
+ }
+
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ const int64_t i13 = i03 % ne13;
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ const int64_t i12 = i02 % ne12;
+
+ const int64_t i0 = i03*ne02 + i02;
+ const int64_t i0_offset_low = row_low/ne01;
+ const int64_t i0_offset_high = row_high/ne01;
+
+ int64_t i01_low = 0;
+ int64_t i01_high = ne01;
+ if (split) {
+ if (i0 < i0_offset_low || i0 > i0_offset_high) {
+ continue;
+ }
+ if (i0 == i0_offset_low) {
+ i01_low = row_low % ne01;
+ }
+ if (i0 == i0_offset_high) {
+ i01_high = row_high % ne01;
+ }
+ }
+ const int64_t i01_diff = i01_high - i01_low;
+ if (i01_diff == 0) {
+ continue;
+ }
+ const int64_t i11 = i13*ne12 + i12;
+
+ cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS];
+ cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
+ cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
+
+ // for split tensors the data begins at i0 == i0_offset_low
+ char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
+ float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
+ float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
+ float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
+
+ // for split tensors the data pointer needs to be rounded down
+ // to the bin edge for i03, i02 bins beyond the first
+ if (i0 - i0_offset_low > 0) {
+ src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
+ src0_ddf_i -= (row_low % ne01)*ne00;
+ }
+ if (i0 - i0_offset_low > 0) {
+ dst_ddf_i -= (row_low % ne0)*ne1;
+ }
+
+ // the main device memory buffer can be on VRAM scratch, with space for all partial results
+ // in that case an offset on dst_ddf_i is needed
+ if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
+ dst_ddf_i += i01_low; // offset is 0 if no tensor split
+ }
+
+ // copy src0, src1 to device if necessary
+ if (use_src1) {
+ if (src1->backend == GGML_BACKEND_CPU) {
+ CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_memcpy_src1));
+ } else if (src1->backend == GGML_BACKEND_GPU) {
+ if (id != g_main_device) {
+ float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
+ src1_ddf_i_source += i11*src1_stride;
+ CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
+ cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1));
+ }
+ } else {
+ GGML_ASSERT(false);
+ }
+ }
+ CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
+ if (!src0_on_device) {
+ if (src0_is_f32) {
+ CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
+ } else {
+ CUDA_CHECK(ggml_cuda_h2d_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
+ }
+ }
+
+ // convert src0 to f32 if it's necessary for the ggml_cuda_op
+ if (src0_needs_f32 && !src0_is_f32) {
+ to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
+ CUDA_CHECK(cudaGetLastError());
+ }
- // copy dst to host
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
- CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
+ // wait with main stream until src1 memcpy is done
+ CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0));
+
+ // do the computation
+ op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
+
+ // copy dst to host or other device if necessary
+ if (!dst_on_device) {
+ void * dst_off_device;
+ cudaMemcpyKind kind;
+ if (dst->backend == GGML_BACKEND_CPU) {
+ dst_off_device = dst->data;
+ kind = cudaMemcpyDeviceToHost;
+ } else if (dst->backend == GGML_BACKEND_GPU) {
+ dst_off_device = dst_extra->data_device[g_main_device];
+ kind = cudaMemcpyDeviceToDevice;
+ } else {
+ GGML_ASSERT(false);
+ }
+ if (split) {
+ // src0 = weight matrix is saved as a transposed matrix for better memory layout.
+ // dst is NOT transposed.
+ // The outputs of cuBLAS matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
+ // Instead they need to be copied to the correct slice in ne0 = dst row index.
+ // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
+ for (int64_t j = 0; j < ne1; ++j) {
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + (j*ne0 + i01_low)*sizeof(float) + i02*nb2 + i03*nb3);
+ CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i + j*i01_diff, i01_diff*sizeof(float), kind, cudaStream_main));
+ }
+ } else {
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
+ CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
+ }
+ }
+ }
}
}
- CUDA_CHECK(cudaDeviceSynchronize());
- if (!mul_mat_vec) {
- ggml_cuda_pool_free(d_X, x_size);
+ // wait until each device is finished, then free their buffers
+ for (int id = 0; id < g_device_count; ++id) {
+ CUDA_CHECK(cudaSetDevice(id));
+ CUDA_CHECK(cudaDeviceSynchronize());
+ if (src0_asq[id] > 0) {
+ ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
+ }
+ if (src0_asf[id] > 0) {
+ ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]);
+ }
+ if (src1_asf[id] > 0) {
+ ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
+ }
+ if (dst_asf[id] > 0) {
+ ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
+ }
}
- ggml_cuda_pool_free(d_Y, y_size);
- ggml_cuda_pool_free(d_D, d_size);
- ggml_cuda_pool_free(d_Q, q_size);
}
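
The per-device row ranges used above come from the g_tensor_split prefixes, rounded down to a multiple of GGML_CUDA_DMMV_Y so the dequantize_mul_mat_vec launch stays aligned; ggml_cuda_load_data below repeats the same calculation. A standalone sketch with a hypothetical helper name, for illustration only:

    // Hypothetical helper mirroring the split logic in ggml_cuda_op / ggml_cuda_load_data.
    static void get_row_split(int64_t nrows, int id, int device_count, const float * tensor_split,
                              int64_t * row_low, int64_t * row_high) {
        *row_low  = id == 0                ? 0     : (int64_t) (nrows*tensor_split[id]);
        *row_high = id == device_count - 1 ? nrows : (int64_t) (nrows*tensor_split[id + 1]);
        *row_low  -= *row_low  % GGML_CUDA_DMMV_Y;
        *row_high -= *row_high % GGML_CUDA_DMMV_Y;
    }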
-void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
- ggml_cuda_mul_f32(src0, src1, dst);
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_add, true);
+}
+
+void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true);
+}
+
+void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true);
+}
+
+void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true);
}
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+ GGML_ASSERT(src0->backend != GGML_BACKEND_GPU);
const int64_t ne10 = src1->ne[0];
const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
+ // if (strcmp(dst->name, "KQ") == 0 || strcmp(dst->name, "KQV") == 0) {
+ // fprintf(stderr, "(%ld, %ld, %ld, %ld) + (%ld, %ld, %ld, %ld) -> (%ld, %ld, %ld, %ld)\n",
+ // src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+ // src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
+ // dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]);
+ // return false;
+ // }
+
// TODO: find the optimal values for these
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
src1->type == GGML_TYPE_F32 &&
dst->type == GGML_TYPE_F32 &&
- ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CUDA)) {
+ (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
return true;
}
return false;
}
-bool ggml_cuda_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
- size_t src0_sz = ggml_nbytes(src0);
- size_t src1_sz = ggml_nbytes(src1);
-
- // mul_mat_q: src0 is converted to fp32 on device
- size_t mul_mat_q_transfer = src0_sz + src1_sz;
+void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ if (src0->type == GGML_TYPE_F32) {
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
+ } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
+ if (src1->ne[1] == 1) {
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+ } else {
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
+ }
+ } else {
+ GGML_ASSERT(false);
+ }
+}
- // mul_mat_f16: src1 is converted to fp16 on cpu
- size_t mul_mat_f16_transfer = src0_sz + sizeof(half) * ggml_nelements(src1);
+void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+ ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true);
+}
- // choose the smaller one to transfer to the device
- // TODO: this is not always the best choice due to the overhead of converting to fp16
- return mul_mat_f16_transfer < mul_mat_q_transfer;
+void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ (void) src0;
+ (void) src1;
+ (void) dst;
}
-void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
- GGML_ASSERT(ggml_cuda_can_mul_mat(src0, src1, dst));
+void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
+ FILE * fp = fopen(fname, "rb");
+ int nrows = ggml_nrows(tensor);
+ const size_t nb1 = tensor->nb[1];
+ ggml_backend backend = tensor->backend;
+ struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
- if (src0->type == GGML_TYPE_F32) {
- ggml_cuda_mul_mat_f32(src0, src1, dst);
- }
- else if (src0->type == GGML_TYPE_F16) {
- if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) {
- ggml_cuda_mul_mat_f16(src0, src1, dst, wdata, wsize);
+ for (int id = 0; id < g_device_count; ++id) {
+ extra->data_device[id] = nullptr;
+
+ if (backend == GGML_BACKEND_GPU && id != g_main_device) {
+ continue;
}
- else {
- ggml_cuda_mul_mat_q_f32(src0, src1, dst);
+
+ cudaSetDevice(id);
+
+ int row_low, row_high;
+ if (backend == GGML_BACKEND_GPU) {
+ row_low = 0;
+ row_high = nrows;
+ } else if (backend == GGML_BACKEND_GPU_SPLIT) {
+ row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
+ row_low -= row_low % GGML_CUDA_DMMV_Y;
+ row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1];
+ row_high -= row_high % GGML_CUDA_DMMV_Y;
+ } else {
+ GGML_ASSERT(false);
}
+ if (row_low == row_high) {
+ continue;
+ }
+
+ int64_t nrows_split = row_high - row_low;
+
+ const size_t offset_split = offset + row_low*nb1;
+ const size_t size = ggml_nbytes_split(tensor, nrows_split);
+
+ void * buf;
+ CUDA_CHECK(cudaMalloc(&buf, size));
+ void * buf_host = malloc(size);
+
+#ifdef _WIN32
+ int ret = _fseeki64(fp, (__int64) offset_split, SEEK_SET);
+#else
+ int ret = fseek(fp, (long) offset_split, SEEK_SET);
+#endif
+ GGML_ASSERT(ret == 0); // same
+
+ size_t ret2 = fread(buf_host, size, 1, fp);
+ if (ret2 != 1) {
+ fprintf(stderr, "unexpectedly reached end of file\n");
+ exit(1);
+ }
+
+ cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
+ cudaDeviceSynchronize();
+
+ free(buf_host);
+ extra->data_device[id] = buf;
}
- else if (ggml_is_quantized(src0->type)) {
- ggml_cuda_mul_mat_q_f32(src0, src1, dst);
- }
- else {
- GGML_ASSERT(false);
- }
+
+ tensor->extra = extra;
+ fclose(fp);
}
-size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
- if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) {
- return ggml_nelements(src1) * sizeof(ggml_fp16_t);
+void ggml_cuda_free_data(struct ggml_tensor * tensor) {
+ if (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) {
+ return;
}
- else {
- return 0;
+
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+
+ for (int id = 0; id < g_device_count; ++id) {
+ if (extra->data_device[id] == nullptr) {
+ continue;
+ }
+
+ CUDA_CHECK(cudaSetDevice(id));
+ CUDA_CHECK(cudaFree(extra->data_device[id]));
}
+
+ delete extra;
}
-void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
- const int64_t ne0 = tensor->ne[0];
- const int64_t ne1 = tensor->ne[1];
- const int64_t ne2 = tensor->ne[2];
- const int64_t ne3 = tensor->ne[3];
+void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
+ if (tensor->src0 != nullptr && tensor->src0->op == GGML_OP_RESHAPE) {
+ ggml_cuda_assign_buffers(tensor->src0);
+ }
- const ggml_type type = tensor->type;
- const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
+ const size_t size = ggml_nbytes(tensor);
+ GGML_ASSERT(size <= g_scratch_size);
+ if (g_scratch_offset + size > g_scratch_size) {
+ g_scratch_offset = 0;
+ }
- size_t q_size;
- char * dst = (char *) ggml_cuda_pool_malloc(q_sz, &q_size);
+ tensor->backend = GGML_BACKEND_GPU;
+ struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu;
- cudaStream_t cudaStream2 = g_cudaStreams2[0];
+ bool inplace = tensor->src0 != nullptr && tensor->src0->data == tensor->data;
- // copy tensor to device
- for (int64_t i3 = 0; i3 < ne3; i3++) {
- for (int64_t i2 = 0; i2 < ne2; i2++) {
- int i = i3*ne2 + i2;
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(dst + i*ne0*ne1, tensor, i3, i2, cudaStream2));
+ CUDA_CHECK(cudaSetDevice(g_main_device));
+ if (inplace && tensor->src0->backend == GGML_BACKEND_GPU) {
+ struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src0->extra;
+ extra->data_device[g_main_device] = src0_extra->data_device[g_main_device];
+ GGML_ASSERT(false);
+ } else {
+ char * data = (char *) g_scratch_buffer;
+ if (data == nullptr) {
+ CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
+ g_scratch_buffer = data;
}
+ extra->data_device[g_main_device] = data + g_scratch_offset;
}
- tensor->data = dst;
- tensor->backend = GGML_BACKEND_CUDA;
-}
+ // fprintf(stderr, "data=%p offset=%ld data_device=%p\n", data, g_scratch_offset, extra->data_device[0]);
+ g_scratch_offset += size;
+ // fprintf(stderr, "%s: scratch %d, %p - %p\n",
+ // tensor->name, g_scratch_index, data + g_scratch_offset, data + g_scratch_offset + size);
-void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
- FILE * fp = fopen(fname, "rb");
+ GGML_ASSERT(g_scratch_offset <= g_scratch_size);
+ tensor->extra = extra;
+}
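
ggml_cuda_assign_buffers hands out sub-ranges of one lazily allocated scratch buffer: the offset advances by ggml_nbytes(tensor) and wraps to zero when the next reservation would overflow g_scratch_size, so scratch-backed tensors are only valid until the ring wraps past them. A condensed sketch of just the offset bookkeeping (names as in the code above; assumes g_scratch_buffer has already been allocated):

    // Scratch offset bookkeeping only (illustration); the real function also sets
    // tensor->backend and tensor->extra, and allocates g_scratch_buffer on first use.
    static void * scratch_reserve(size_t size) {
        GGML_ASSERT(size <= g_scratch_size);
        if (g_scratch_offset + size > g_scratch_size) {
            g_scratch_offset = 0; // wrap around; earlier scratch contents get overwritten
        }
        void * ptr = (char *) g_scratch_buffer + g_scratch_offset;
        g_scratch_offset += size;
        return ptr;
    }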
- const size_t size = ggml_nbytes(tensor);
+void ggml_cuda_set_main_device(int main_device) {
+ if (main_device >= g_device_count) {
+ fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
+ main_device, g_device_count, g_main_device);
+ return;
+ }
+ g_main_device = main_device;
+ if (g_device_count > 1) {
+ cudaDeviceProp prop;
+ CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
+ fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
+ }
+}
- void * buf;
- CUDA_CHECK(cudaMalloc(&buf, size));
- void * buf_host = malloc(size);
+void ggml_cuda_set_scratch_size(size_t scratch_size) {
+ g_scratch_size = scratch_size;
+}
-#ifdef _WIN32
- int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
-#else
- int ret = fseek(fp, (long) offset, SEEK_SET);
-#endif
- GGML_ASSERT(ret == 0); // same
+bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
+ ggml_cuda_func_t func;
+ const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
+ || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT
+ || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
- size_t ret2 = fread(buf_host, size, 1, fp);
- if (ret2 != 1) {
- fprintf(stderr, "unexpectedly reached end of file");
- exit(1);
+ switch (tensor->op) {
+ case GGML_OP_ADD:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_add;
+ break;
+ case GGML_OP_MUL:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_mul;
+ break;
+ case GGML_OP_SILU:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_silu;
+ break;
+ case GGML_OP_RMS_NORM:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_rms_norm;
+ break;
+ case GGML_OP_MUL_MAT:
+ if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) {
+ return false;
+ }
+ func = ggml_cuda_mul_mat;
+ break;
+ case GGML_OP_RESHAPE:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_nop;
+ break;
+ case GGML_OP_ROPE:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_rope;
+ break;
+ default:
+ return false;
}
- cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
- cudaDeviceSynchronize();
-
- tensor->data = buf;
- free(buf_host);
- fclose(fp);
+ if (params->ith != 0) {
+ return true;
+ }
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+ return true;
+ }
+ func(tensor->src0, tensor->src1, tensor);
+ return true;
}
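
For context, ggml_cuda_compute_forward is meant to be called from ggml's per-tensor compute path; the caller checks the return value to decide whether the CPU implementation still has to run. A hedged sketch of that call site (it lives in ggml.c, outside this diff):

    // Expected caller pattern (sketch, not part of this file):
    #ifdef GGML_USE_CUBLAS
        if (ggml_cuda_compute_forward(params, tensor)) {
            return; // op handled on the GPU (or intentionally skipped for this task phase)
        }
    #endif // GGML_USE_CUBLAS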