Cuda 寻找相等的子串
Cuda Finding Equal Substrings
函数compute_kmers试图找出子字符串kmer在字符串reference_string中出现了多少次, k是kmer的长度。
这适用于非常小的输入,但在大的输入上它会发现不同的错误结果。不确定我做错了什么。
__device__ void compare_elements(char *ref, char *kmer, int k, int *hits, int current)
{
int equal = 1;
for(int i=0; i<k; i++)
{
if(kmer[i] != ref[i])
{
equal =0;
}
}
if(equal == 1)
{
hits[current]++;
}
}
__global__ void compute(char *d_reference_str, char *d_kmer, int reference_length, int k, int *hits)
{
int current = blockIdx.x * 1024 + threadIdx.x;
if(current+k<reference_length)
{
char *refsubstr = (char *) malloc(k * sizeof(char));
for(int i=0; i<k; i++)
{
refsubstr[i] = d_reference_str[current+i];
}
compare_elements(refsubstr, d_kmer, k, hits, current);
}
}
__host__ void compute_kmers(char* reference_str, char* kmer, int reference_length, int k, int *hits)
{
char *d_reference_str;
char *d_kmer;
int *d_hits;
cudaMalloc((void **) &d_reference_str, reference_length*sizeof(char));
cudaMalloc((void **) &d_kmer, k*sizeof(char));
cudaMalloc((void **) &d_hits, MAX_REF_LENGTH * sizeof(int));
cudaMemcpy(d_reference_str,
reference_str,reference_length*sizeof(char),cudaMemcpyHostToDevice);
cudaMemcpy(d_kmer, kmer,k*sizeof(char),cudaMemcpyHostToDevice);
int numOfBlocks=(reference_length/1024)+1;
dim3 dimGrid(numOfBlocks, 1);
dim3 dimBlock(1024, 1);
compute<<<dimGrid,dimBlock>>>(d_reference_str,d_kmer,reference_length, k, d_hits);
cudaMemcpy(hits, d_hits, reference_length*sizeof(int),cudaMemcpyDeviceToHost);
cudaFree(d_reference_str);
cudaFree(d_kmer);
cudaFree(d_hits);
}
无论何时在内核中使用动态内存分配,都应该验证返回的指针不为空。 malloc returns 如果无法满足请求,则为 null。
设备端堆的大小有限。可以通过 cudaDeviceSetLimit
增加
但是,您不需要复制引用子字符串。可以直接使用引用。
__global__ void compute(char *d_reference_str, char *d_kmer, int reference_length, int k, int *hits)
{
int current = blockIdx.x * 1024 + threadIdx.x;
if(current+k<reference_length)
{
compare_elements(d_reference_str + current, d_kmer, k, hits, current);
}
}
函数compute_kmers试图找出子字符串kmer在字符串reference_string中出现了多少次, k是kmer的长度。 这适用于非常小的输入,但在大的输入上它会发现不同的错误结果。不确定我做错了什么。
__device__ void compare_elements(char *ref, char *kmer, int k, int *hits, int current)
{
int equal = 1;
for(int i=0; i<k; i++)
{
if(kmer[i] != ref[i])
{
equal =0;
}
}
if(equal == 1)
{
hits[current]++;
}
}
__global__ void compute(char *d_reference_str, char *d_kmer, int reference_length, int k, int *hits)
{
int current = blockIdx.x * 1024 + threadIdx.x;
if(current+k<reference_length)
{
char *refsubstr = (char *) malloc(k * sizeof(char));
for(int i=0; i<k; i++)
{
refsubstr[i] = d_reference_str[current+i];
}
compare_elements(refsubstr, d_kmer, k, hits, current);
}
}
__host__ void compute_kmers(char* reference_str, char* kmer, int reference_length, int k, int *hits)
{
char *d_reference_str;
char *d_kmer;
int *d_hits;
cudaMalloc((void **) &d_reference_str, reference_length*sizeof(char));
cudaMalloc((void **) &d_kmer, k*sizeof(char));
cudaMalloc((void **) &d_hits, MAX_REF_LENGTH * sizeof(int));
cudaMemcpy(d_reference_str,
reference_str,reference_length*sizeof(char),cudaMemcpyHostToDevice);
cudaMemcpy(d_kmer, kmer,k*sizeof(char),cudaMemcpyHostToDevice);
int numOfBlocks=(reference_length/1024)+1;
dim3 dimGrid(numOfBlocks, 1);
dim3 dimBlock(1024, 1);
compute<<<dimGrid,dimBlock>>>(d_reference_str,d_kmer,reference_length, k, d_hits);
cudaMemcpy(hits, d_hits, reference_length*sizeof(int),cudaMemcpyDeviceToHost);
cudaFree(d_reference_str);
cudaFree(d_kmer);
cudaFree(d_hits);
}
无论何时在内核中使用动态内存分配,都应该验证返回的指针不为空。 malloc returns 如果无法满足请求,则为 null。
设备端堆的大小有限。可以通过 cudaDeviceSetLimit
但是,您不需要复制引用子字符串。可以直接使用引用。
__global__ void compute(char *d_reference_str, char *d_kmer, int reference_length, int k, int *hits)
{
int current = blockIdx.x * 1024 + threadIdx.x;
if(current+k<reference_length)
{
compare_elements(d_reference_str + current, d_kmer, k, hits, current);
}
}