I am running into a problem with sin and cos on CUDA compute capability 2.0. It does not occur when the same code is compiled for compute capability 1.x. I put together a simple test program and ran it on a GeForce GTX 550 Ti and a GeForce GTX 480; the result is the same on both. Here is the code:
#include <cufft.h>
#include <stdio.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#pragma once
#ifdef __INTELLISENSE__
void __syncthreads();
void atomicAdd(int*, int);
#endif
__global__ void cuftFrequency(float* in_data, float* out_data, int N, int M, int fromM = 1)
{
    cuComplex s;
    float t = 0;
    for (int I = threadIdx.x + blockIdx.x * blockDim.x + fromM; I <= M; I += blockDim.x * gridDim.x)
    {
        s.x = 0;
        s.y = 0;
        for (int J = 0; J < N; J++)
        {
            t = (6.0 * (J - N / 2)) / I;
            s.x += in_data[J] * cos(t);
            s.y += in_data[J] * sin(t);
        }
        /* if there is no problem, the array comes back filled with 500; otherwise it keeps its old garbage */
        out_data[I - fromM] = 500; // s.x * s.x + s.y * s.y;
    }
}
extern "C" __declspec(dllexport) void cuftColorQualifierExec(float* data, float *spm, int N, int M, int fromM)
{
float* in_data_dev;
float *furie_dev;
cudaDeviceProp prop;
int N_Dev;
memset(&prop, 0, sizeof(cudaDeviceProp));
prop.major = 2;
prop.minor = 0;
prop.maxThreadsPerBlock = M - fromM;
cudaChooseDevice(&N_Dev, &prop);
cudaSetDevice(N_Dev);
cudaGetDeviceProperties(&prop, N_Dev);
int N_thread = 576;
int N_block = 2;
int *Count_dev;
cudaError_t err = cudaMalloc((void**)&in_data_dev, sizeof(float) * N);
if (err != cudaSuccess)
fprintf(stderr, "ERROR \"%s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__);
err = cudaMemcpy(in_data_dev, data, sizeof(float) * N, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
fprintf(stderr, "ERROR \"%s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__);
err = cudaMalloc((void**)&furie_dev, sizeof(float) * (M - fromM + 1));
if (err != cudaSuccess)
fprintf(stderr, "ERROR \"%s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__);
cuftFrequency<<<N_block, N_thread>>>(in_data_dev, furie_dev, N, M, fromM);
err = cudaDeviceSynchronize();
if (err != cudaSuccess)
fprintf(stderr, "ERROR \"%s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__);
err = cudaMemcpy(spm, furie_dev, sizeof(float) * (M - fromM + 1), cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
fprintf(stderr, "ERROR \"%s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__);
err = cudaFree(furie_dev);
if (err != cudaSuccess)
fprintf(stderr, "ERROR \"%s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__);
}
int main()
{
    int M = 1024, fromM = 1, N = 4000;
    float* data = new float[4000];
    float* spm = new float[M - fromM + 1];
    for (int I = 0; I < N; I++)
        data[I] = cos(6.0 * I);
    for (int I = 0; I < M - fromM + 1; I++)
        spm[I] = 0;
    cuftColorQualifierExec(data, spm, N, M, fromM);
    for (int I = 0; I < M - fromM + 1; I++)
        fprintf(stdout, "%d: %f\n", I, spm[I]);
    return 0;
}
This code stops working once the thread count exceeds 576, and no error is returned. I deliberately set the array elements to the constant 500 so I could convince myself that the kernel actually reaches that statement; once the program returns the correct values I change it back and test again.
Why does this code work correctly when I compile it for compute capability 1.x, but fail when I compile it for compute capability 2.0?
Posted on 2013-09-10 08:58:42
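A minimal sketch of how such a silent failure can be surfaced, assuming the launch configuration is at fault: launch-configuration errors such as cudaErrorLaunchOutOfResources are reported by cudaGetLastError() immediately after the <<<...>>> call, whereas a later cudaDeviceSynchronize() may still return cudaSuccess because the kernel was never enqueued. The snippet below is the launch line from cuftColorQualifierExec above with that check added:

// Same launch as above, followed by an immediate check for launch errors.
// cudaErrorLaunchOutOfResources here would mean the requested block size
// cannot be satisfied by this kernel's per-thread resource usage on this device.
cuftFrequency<<<N_block, N_thread>>>(in_data_dev, furie_dev, N, M, fromM);
cudaError_t launchErr = cudaGetLastError();
if (launchErr != cudaSuccess)
    fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(launchErr));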
There is a hardware limit on how many threads can run in a single block. The limiting factor differs between GPU architectures and includes the number of available registers, the amount of available shared memory, and the maximum number of threads per block on each multiprocessor. You can determine the limiting factor for your GPU and your kernel with the CUDA Occupancy Calculator, which is included with CUDA.
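A small sketch of how those limits can be inspected for this particular kernel, assuming it is compiled in the same translation unit as the code above (the helper printKernelLimits and the choice of device 0 are illustrative, not part of the original answer): cudaFuncGetAttributes() reports the register count per thread and the largest block size the kernel can actually be launched with, which is often smaller than the device-wide maxThreadsPerBlock once register usage is taken into account.

#include <stdio.h>
#include <cuda_runtime.h>

// Kernel from the question; defined elsewhere in the same .cu file.
__global__ void cuftFrequency(float* in_data, float* out_data, int N, int M, int fromM);

void printKernelLimits()
{
    cudaFuncAttributes attr;
    cudaError_t err = cudaFuncGetAttributes(&attr, cuftFrequency);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaFuncGetAttributes: %s\n", cudaGetErrorString(err));
        return;
    }
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);   // device 0 chosen only for illustration
    printf("registers per thread (kernel) : %d\n", attr.numRegs);
    printf("max threads/block (kernel)    : %d\n", attr.maxThreadsPerBlock);
    printf("max threads/block (device)    : %d\n", prop.maxThreadsPerBlock);
    printf("registers per block (device)  : %d\n", prop.regsPerBlock);
}

Compiling with -Xptxas -v also prints the per-thread register count at build time, which together with regsPerBlock gives the same limit the Occupancy Calculator reports.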
https://stackoverflow.com/questions/18721672