about summary refs log tree commit diff
path: root/ggml-cuda.cu
diff options
context:
space:
mode:
authorJohannes Gäßler <johannesg@5d6.de>2023-07-29 23:04:10 +0200
committerGitHub <noreply@github.com>2023-07-29 23:04:10 +0200
commit9baf9ef304f330009d5a93b7390280a0fd27c9a1 (patch)
tree7e7e51024868ad942742cf83675f3e02baa3846c /ggml-cuda.cu
parent8a88e5855c3b93024be0f93290b01a4206b65b38 (diff)
CUDA: faster multi GPU synchronization (#2448)
Diffstat (limited to 'ggml-cuda.cu')
-rw-r--r-- ggml-cuda.cu | 9
1 files changed, 4 insertions, 5 deletions
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index d31fc79..511f48c 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -3529,13 +3529,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
if (split) {
// src0 = weight matrix is saved as a transposed matrix for better memory layout.
// dst is NOT transposed.
- // The outputs of cuBLAS matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
+ // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
// Instead they need to be copied to the correct slice in ne0 = dst row index.
// If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
- for (int64_t j = 0; j < ne1; ++j) {
- float * dhf_dst_i = (float *) ((char *) dst_off_device + (j*ne0 + i01_low)*sizeof(float) + i02*nb2 + i03*nb3);
- CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i + j*i01_diff, i01_diff*sizeof(float), kind, cudaStream_main));
- }
+ float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
+ CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
+ i01_diff*sizeof(float), ne1, kind, cudaStream_main));
} else {
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));