我正在尝试使用CUDA对图像进行卷积,但无法获得结果。cuda-gdb在我的系统上不能正常工作,所以我无法判断CUDA内核内部发生了什么。我使用的CUDA内核如下:
__global__
void
convolve_component_EXTEND_kern(const JSAMPLE *data, // image data
ssize_t data_width, // image width
ssize_t data_height, // image height
const float *kern, // convolution kernel data
ssize_t kern_w_f, // convolution kernel has a width of 2 * kern_w_f + 1
ssize_t kern_h_f, // convolution_kernel has a height of 2 * kern_h_f + 1
JSAMPLE *res) // array to store the result
{
ssize_t i = ::blockIdx.x * ::blockDim.x + ::threadIdx.x;
ssize_t j = ::blockIdx.y * ::blockDim.y + ::threadIdx.y;
float value = 0;
for (ssize_t m = 0; m < 2 * kern_w_f + 1; m++) {
for (ssize_t n = 0; n < 2 * kern_h_f + 1; n++) {
ssize_t x = i + m - kern_w_f; // column index for this contribution to convolution sum for (i, j)
ssize_t y = j + n - kern_h_f; // row index for ...
x = x < 0 ? 0 : (x >= data_width ? data_width - 1 : x);
y = y < 0 ? 0 : (y >= data_height ? data_height - 1 : y);
value += ((float) data[data_width * y + x]) * kern[(2 * kern_w_f + 1) * n + m];
}
}
res[data_width * j + i] = (JSAMPLE) value;
}
并且我在这个函数中调用它
void
convolve_component_EXTEND_cuda(const JSAMPLE *data,
ssize_t data_width,
ssize_t data_height,
const float *kern,
ssize_t kern_w_f,
ssize_t kern_h_f,
JSAMPLE *res)
{
JSAMPLE *d_data;
cudaMallocManaged(&d_data,
data_width * data_height * sizeof(JSAMPLE));
cudaMemcpy(d_data,
data,
data_width * data_height * sizeof(JSAMPLE),
cudaMemcpyHostToDevice);
float *d_kern;
cudaMallocManaged(&d_kern,
(2 * kern_w_f + 1) * (2 * kern_h_f + 1) * sizeof(float));
cudaMemcpy(d_kern,
kern,
(2 * kern_w_f + 1) * (2 * kern_h_f + 1) * sizeof(float),
cudaMemcpyHostToDevice);
JSAMPLE *d_res;
cudaMallocManaged(&d_res,
data_width * data_height * sizeof(JSAMPLE));
dim3 threadsPerBlock(16, 16); // can be adjusted to 32, 32 (1024 threads per block is the maximum)
dim3 numBlocks(data_width / threadsPerBlock.x,
data_height / threadsPerBlock.y);
convolve_component_EXTEND_kern<<<numBlocks, threadsPerBlock>>>(d_data,
data_width,
data_height,
d_kern,
kern_w_f,
kern_h_f,
d_res);
cudaDeviceSynchronize();
cudaMemcpy(d_res,
res,
data_width * data_height * sizeof(JSAMPLE),
cudaMemcpyDeviceToHost);
cudaFree(d_data);
cudaFree(d_kern);
cudaFree(d_res);
}
在这种情况下,图像数据以这样一种方式包含在称为data的数组中,即通过索引到data_width *j+ i处的数组来访问(i,j)处的像素。内核数据位于称为kern的数组中,并且它具有2* kern_w_f + 1的宽度和2* kern_h_f +1的高度。与数据数组一样,通过索引到(2 * w_f + 1) *j+i处的kern数组来访问(i,j)处的元素。数组res用于存储卷积的结果,并在传递给函数之前使用calloc()进行分配。
当我对图像数据调用第二个函数时,图像的所有像素都被转换为0,而不是应用卷积。谁能指出问题所在?
发布于 2021-01-11 12:40:23
在调用内核并执行卷积之后,您尝试将数据复制回res数组。
cudaDeviceSynchronize();
cudaMemcpy(d_res,
res,
data_width * data_height * sizeof(JSAMPLE),
cudaMemcpyDeviceToHost);
这应该是
cudaDeviceSynchronize();
cudaMemcpy(res,
d_res,
data_width * data_height * sizeof(JSAMPLE),
cudaMemcpyDeviceToHost);
因为cudaMemcpy的第一个参数是目标指针。
cudaError_t cudaMemcpy ( void *dst, const void *src, size_t count, enum cudaMemcpyKind kind)
https://stackoverflow.com/questions/65666257
复制相似问题