// 1) Call sites and declarations of copy_gpu / copy_gpu_offset (search results):
//
//   activation_layer.c:54:   copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
//   activation_layer.c:61:   copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
//   batchnorm_layer.c:191:   if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
//   batchnorm_layer.c:192:   copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
//   batchnorm_layer.c:223:   copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
//   batchnorm_layer.c:225:   copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);
//   batchnorm_layer.c:266:   copy_gpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
//   batchnorm_layer.c:277:   if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
//   blas.h:53:               void copy_gpu(int N, float * X, int INCX, float * Y, int INCY);
//   blas.h:54:               void copy_gpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
//   blas_kernels.cu:602:     extern "C" void copy_gpu(int N, float * X, int INCX, float * Y, int INCY)
//   blas_kernels.cu:604:     copy_gpu_offset(N, X, 0, INCX, Y, 0, INCY);
//   blas_kernels.cu:613:     extern "C" void copy_gpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
//   crnn_layer.c:227:        copy_gpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1);
//   crnn_layer.c:257:        copy_gpu(l.hidden * l.batch, input_layer.output_gpu, 1, l.state_gpu, 1);
//   crnn_layer.c:271:        copy_gpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
//   detection_layer.c:259:   copy_gpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1);
//   detection_layer.c:272:   //copy_gpu(l.batch*l.inputs, l.delta_gpu, 1, net.delta_gpu, 1);
//   gru_layer.c:250:         copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.prev_state_gpu, 1);
//   gru_layer.c:263:         copy_gpu(l.outputs*l.batch, uz.output_gpu, 1, l.z_gpu, 1);
//   gru_layer.c:266:         copy_gpu(l.outputs*l.batch, ur.output_gpu, 1, l.r_gpu, 1);
//   gru_layer.c:272:         copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.forgot_state_gpu, 1);
//
// 2) Implementation.
//
// Note: the array copy is implemented by launching a kernel, and there is also
// an offset variant -- an interesting little design.

/*
 * Strided element copy: for every i in [0, N), Y[i*INCY + OFFY] = X[i*INCX + OFFX].
 *
 * N          number of elements to copy
 * X, OFFX    source array and the element offset added to every source index
 * INCX       stride, in elements, between consecutive source elements
 * Y, OFFY    destination array and the element offset added to every destination index
 * INCY       stride, in elements, between consecutive destination elements
 *
 * The flat element index is built from blockIdx.x, blockIdx.y and threadIdx.x,
 * so the kernel works with both 1D and 2D grids of 1D blocks. Threads whose
 * index is >= N do nothing.
 *
 * Defined ahead of copy_gpu_offset so the launch below sees a declaration.
 */
__global__ void copy_kernel(int N, float *X, int OFFX, int INCX, float *Y, int OFFY, int INCY)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i < N) Y[i*INCY + OFFY] = X[i*INCX + OFFX];
}

/*
 * Launches copy_kernel to copy N elements from X (starting at OFFX, stride INCX)
 * into Y (starting at OFFY, stride INCY).
 *
 * The grid comes from cuda_gridsize(N) with BLOCK threads per block (both are
 * defined elsewhere in the project). The launch status is read with
 * cudaPeekAtLastError(), which does not clear the error state, and handed to
 * check_error; what check_error does on failure is not visible here.
 */
extern "C" void copy_gpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
{
    copy_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, OFFX, INCX, Y, OFFY, INCY);
    check_error(cudaPeekAtLastError());
}

/*
 * Copies N elements from X (stride INCX) into Y (stride INCY), starting at the
 * beginning of both arrays. Equivalent to copy_gpu_offset with both offsets 0.
 */
extern "C" void copy_gpu(int N, float * X, int INCX, float * Y, int INCY)
{
    copy_gpu_offset(N, X, 0, INCX, Y, 0, INCY);
}
说点什么...