Deep Learning Source Code Analysis: copy_gpu

Abstract: Where copy_gpu is called throughout the source tree, and how it is implemented in blas_kernels.cu as a thin wrapper over copy_gpu_offset, which launches copy_kernel to do the actual strided copy.
1) Call sites of copy_gpu across the source tree (grep output; a sketch of one call site in context follows the list):
activation_layer.c:54:    copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
activation_layer.c:61:    copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
batchnorm_layer.c:191:    if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
batchnorm_layer.c:192:    copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
batchnorm_layer.c:223:        copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
batchnorm_layer.c:225:        copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);
batchnorm_layer.c:266:    copy_gpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
batchnorm_layer.c:277:    if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
blas.h:53:void copy_gpu(int N, float * X, int INCX, float * Y, int INCY);
blas.h:54:void copy_gpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
blas_kernels.cu:602:extern "C" void copy_gpu(int N, float * X, int INCX, float * Y, int INCY)
blas_kernels.cu:604:    copy_gpu_offset(N, X, 0, INCX, Y, 0, INCY);
blas_kernels.cu:613:extern "C" void copy_gpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
crnn_layer.c:227:            copy_gpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1);
crnn_layer.c:257:        copy_gpu(l.hidden * l.batch, input_layer.output_gpu, 1, l.state_gpu, 1);
crnn_layer.c:271:        copy_gpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
detection_layer.c:259:        copy_gpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1);
detection_layer.c:272:    //copy_gpu(l.batch*l.inputs, l.delta_gpu, 1, net.delta_gpu, 1);
gru_layer.c:250:        copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.prev_state_gpu, 1);
gru_layer.c:263:        copy_gpu(l.outputs*l.batch, uz.output_gpu, 1, l.z_gpu, 1);
gru_layer.c:266:        copy_gpu(l.outputs*l.batch, ur.output_gpu, 1, l.r_gpu, 1);
gru_layer.c:272:        copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.forgot_state_gpu, 1);
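For context, the first call site above (activation_layer.c:54) sits in the activation layer's GPU forward pass: the incoming activations are first copied into the layer's own output buffer, and the nonlinearity is then applied in place. A rough sketch of that pattern, assuming darknet-style helpers (the activate_array_gpu call is reconstructed from memory, not from the grep output above):

void forward_activation_layer_gpu(layer l, network net)
{
    // stage the previous layer's output into this layer's output buffer
    copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
    // then apply the activation function in place on the GPU
    activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
}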

2) Implementation (blas_kernels.cu):
extern "C" void copy_gpu(int N, float * X, int INCX, float * Y, int INCY)
{
    copy_gpu_offset(N, X, 0, INCX, Y, 0, INCY);
}

extern "C" void copy_gpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
{
    copy_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, OFFX, INCX, Y, OFFY, INCY);
    check_error(cudaPeekAtLastError());
}

__global__ void copy_kernel(int N,  float *X, int OFFX, int INCX, float *Y, int OFFY, int INCY)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i < N) Y[i*INCY + OFFY] = X[i*INCX + OFFX];
}

So copy_gpu is just the zero-offset case of copy_gpu_offset, and the actual copy is a one-thread-per-element CUDA kernel that honors arbitrary offsets (OFFX/OFFY) and strides (INCX/INCY), much like the interface of BLAS scopy. Implementing an array copy by launching a kernel, with an offset variant on top, is a neat little detail.
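To make the stride/offset semantics concrete, here is a minimal, self-contained demo (my own illustration, not code from the source above; copy_kernel_demo and the fixed 1-D launch are assumptions made for the sake of a compilable example). It copies every second element of X into a contiguous Y:

#include <cuda_runtime.h>
#include <stdio.h>

#define BLOCK 512

__global__ void copy_kernel_demo(int N, float *X, int OFFX, int INCX,
                                 float *Y, int OFFY, int INCY)
{
    // one thread per logical element; a 1-D grid is enough for this demo
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) Y[i*INCY + OFFY] = X[i*INCX + OFFX];
}

int main()
{
    const int N = 8;
    float h_x[2*N], h_y[2*N] = {0};
    for (int i = 0; i < 2*N; ++i) h_x[i] = (float)i;

    float *d_x, *d_y;
    cudaMalloc(&d_x, 2*N*sizeof(float));
    cudaMalloc(&d_y, 2*N*sizeof(float));
    cudaMemcpy(d_x, h_x, 2*N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, h_y, 2*N*sizeof(float), cudaMemcpyHostToDevice);

    // stride 2 on the source, stride 1 on the destination, both offsets 0
    copy_kernel_demo<<<(N + BLOCK - 1) / BLOCK, BLOCK>>>(N, d_x, 0, 2, d_y, 0, 1);
    cudaDeviceSynchronize();

    cudaMemcpy(h_y, d_y, 2*N*sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; ++i) printf("%.0f ", h_y[i]);   // prints: 0 2 4 6 8 10 12 14
    printf("\n");

    cudaFree(d_x); cudaFree(d_y);
    return 0;
}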
