|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <cstdio>
|
|
|
#include "cuda.h"
|
|
|
|
|
|
#include "knn_cuda_kernel.h"
|
|
|
|
|
|
|
|
|
#define BLOCK_DIM 16
|
|
|
#define DEBUG 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Computes the squared Euclidean distance between every reference point
 * and every query point using a tiled shared-memory scheme.
 *
 * Both matrices are row-major with one coordinate dimension per row:
 * A is dim x wA (reference points as columns), B is dim x wB (query
 * points as columns).  The output AB is wA x wB; AB[i*wB + j] receives
 * the squared distance between reference point i and query point j.
 *
 * Launch: 2-D grid of BLOCK_DIM x BLOCK_DIM blocks covering (wB, wA);
 * partial edge tiles are handled by the bounds checks below.
 *
 * @param A   reference points, dim x wA, row-major (device memory)
 * @param wA  number of reference points
 * @param B   query points, dim x wB, row-major (device memory)
 * @param wB  number of query points
 * @param dim point dimensionality
 * @param AB  output squared-distance matrix, wA x wB (device memory)
 */
__global__ void cuComputeDistanceGlobal( float* A, int wA,
                                         float* B, int wB, int dim, float* AB){

    // Tiles of A and B staged through shared memory.  The +1 padding on
    // the inner dimension removes shared-memory bank conflicts on the
    // column-wise reads (shared_A[k][ty], shared_B[k][tx]) in the
    // reduction loop below.
    __shared__ float shared_A[BLOCK_DIM][BLOCK_DIM + 1];
    __shared__ float shared_B[BLOCK_DIM][BLOCK_DIM + 1];

    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Fix: these were previously __shared__ ints written concurrently by
    // every thread of the block — a data race that only "worked" because
    // all threads stored identical values.  They depend only on block
    // constants, so plain per-thread locals are correct and cheaper.
    const int begin_A = BLOCK_DIM * blockIdx.y;
    const int begin_B = BLOCK_DIM * blockIdx.x;
    const int step_A  = BLOCK_DIM * wA;
    const int step_B  = BLOCK_DIM * wB;
    const int end_A   = begin_A + (dim-1) * wA;

    float tmp;
    float ssd = 0;   // running sum of squared differences for this pair

    // Validity of the columns this thread loads / the output it writes,
    // for partial tiles at the right/bottom edges.
    int cond0 = (begin_A + tx < wA);   // A column loaded by this thread is in range
    int cond1 = (begin_B + tx < wB);   // B column loaded by this thread is in range
    int cond2 = (begin_A + ty < wA);   // output row written by this thread is in range

    // Walk both matrices one BLOCK_DIM-row slab of dimensions at a time.
    for (int a = begin_A, b = begin_B; a <= end_A; a += step_A, b += step_B) {

        // Stage the current tiles; out-of-range entries are zero so they
        // contribute nothing to the accumulated squared distance.
        if (a/wA + ty < dim){
            shared_A[ty][tx] = (cond0)? A[a + wA * ty + tx] : 0;
            shared_B[ty][tx] = (cond1)? B[b + wB * ty + tx] : 0;
        }
        else{
            shared_A[ty][tx] = 0;
            shared_B[ty][tx] = 0;
        }

        // Tiles must be fully written before any thread reads them.
        __syncthreads();

        // Accumulate this slab's contribution.
        if (cond2 && cond1){
            for (int k = 0; k < BLOCK_DIM; ++k){
                tmp = shared_A[k][ty] - shared_B[k][tx];
                ssd += tmp*tmp;
            }
        }

        // Tiles must be fully read before the next iteration overwrites them.
        __syncthreads();
    }

    // Emit the squared distance for this (reference, query) pair.
    if (cond2 && cond1)
        AB[(begin_A + ty) * wB + begin_B + tx] = ssd;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * For each query point (one thread per column of `dist`), rearranges its
 * column so that the first k rows hold the k smallest distances sorted
 * ascending, and records the matching reference indices in `ind`.
 *
 * Indices stored in `ind` are 1-based (see `p_ind[0] = 1` and the `l+1`
 * stores below).
 *
 * Layout: `dist` and `ind` are height x width, row-major; column x holds
 * all candidate distances for query x, so consecutive threads touch
 * consecutive addresses (coalesced global accesses).
 *
 * Launch: 1-D grid of 1-D blocks covering at least `width` threads.
 *
 * @param dist   distance matrix, height x width; rows 0..k-1 of each
 *               column become the sorted k smallest on exit
 * @param ind    index matrix, same layout; only rows 0..k-1 are written
 * @param width  number of query points (columns)
 * @param height number of candidate reference points (rows)
 * @param k      number of neighbours to keep per query
 */
__global__ void cuInsertionSort(float *dist, long *ind, int width, int height, int k){

    int l, i, j;
    float *p_dist;                 // base of this thread's column in dist
    long *p_ind;                   // base of this thread's column in ind
    float curr_dist, max_dist;
    long curr_row, max_row;
    unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;   // query index

    if (xIndex<width){

        // Point at column xIndex; every row access below strides by `width`.
        p_dist = dist + xIndex;
        p_ind = ind + xIndex;

        // The 1-element prefix is trivially sorted; its max is row 0.
        // Reference indices are recorded 1-based.
        max_dist = p_dist[0];
        p_ind[0] = 1;

        // Phase 1: insertion-sort rows 0..k-1 in place, ascending.
        for (l=1; l<k; l++){
            curr_row = l * width;
            curr_dist = p_dist[curr_row];
            if (curr_dist<max_dist){
                // Find insertion slot i: first row of the sorted prefix
                // whose distance exceeds curr_dist (defaults to l-1).
                i=l-1;
                for (int a=0; a<l-1; a++){
                    if (p_dist[a*width]>curr_dist){
                        i=a;
                        break;
                    }
                }
                // Shift rows i..l-1 down one slot, then insert.
                for (j=l; j>i; j--){
                    p_dist[j*width] = p_dist[(j-1)*width];
                    p_ind[j*width] = p_ind[(j-1)*width];
                }
                p_dist[i*width] = curr_dist;
                p_ind[i*width] = l+1;      // 1-based reference index
            } else {
                // Already in place at the end of the sorted prefix.
                p_ind[l*width] = l+1;
            }
            // Row l now holds the largest value of the sorted prefix.
            max_dist = p_dist[curr_row];
        }

        // Phase 2: scan the remaining candidates; any one smaller than
        // the current k-th smallest is inserted, evicting the old max.
        max_row = (k-1)*width;
        for (l=k; l<height; l++){
            curr_dist = p_dist[l*width];
            if (curr_dist<max_dist){
                // Insertion slot within the k sorted rows (defaults to k-1).
                i=k-1;
                for (int a=0; a<k-1; a++){
                    if (p_dist[a*width]>curr_dist){
                        i=a;
                        break;
                    }
                }
                // Shift rows i..k-2 down, discarding row k-1 (old maximum).
                for (j=k-1; j>i; j--){
                    p_dist[j*width] = p_dist[(j-1)*width];
                    p_ind[j*width] = p_ind[(j-1)*width];
                }
                p_dist[i*width] = curr_dist;
                p_ind[i*width] = l+1;      // 1-based reference index
                max_dist = p_dist[max_row];
            }
        }
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Replaces the first k rows of the distance matrix with their square
 * roots, converting squared Euclidean distances into true distances.
 *
 * Launch: 2-D grid covering (width, k); out-of-range threads exit.
 *
 * @param dist  distance matrix, row-major, at least k rows of length width
 * @param width number of query points (row length)
 * @param k     number of rows (neighbours) to process
 */
__global__ void cuParallelSqrt(float *dist, int width, int k){
    unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
    if (xIndex<width && yIndex<k)
        // Fix: sqrtf keeps the computation in single precision; the
        // previous sqrt() promoted each element to double and back.
        dist[yIndex*width + xIndex] = sqrtf(dist[yIndex*width + xIndex]);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * K nearest neighbours, entirely on the device: for each query point,
 * finds the k closest reference points by Euclidean distance.
 *
 * On return (after the caller synchronizes `stream`), the first k rows
 * of dist_dev hold, per query column, the k smallest distances in
 * ascending order, and ind_dev the matching 1-based reference indices.
 *
 * @param ref_dev   reference points, dim x ref_nb, row-major (device)
 * @param ref_nb    number of reference points
 * @param query_dev query points, dim x query_nb, row-major (device)
 * @param query_nb  number of query points
 * @param dim       point dimensionality
 * @param k         number of neighbours to return (k <= ref_nb)
 * @param dist_dev  ref_nb x query_nb scratch/output distances (device)
 * @param ind_dev   output indices, at least k x query_nb (device)
 * @param stream    CUDA stream all kernels are launched on
 */
void knn_device(float* ref_dev, int ref_nb, float* query_dev, int query_nb,
                int dim, int k, float* dist_dev, long* ind_dev, cudaStream_t stream){

    // Distance kernel: one 16x16 tile per (query, reference) block,
    // rounded up so partial edge tiles are covered.
    dim3 t_16x16(16, 16, 1);
    dim3 g_16x16((query_nb + 15) / 16, (ref_nb + 15) / 16, 1);

    // Insertion sort: one thread per query point.
    dim3 t_256x1(256, 1, 1);
    dim3 g_256x1((query_nb + 255) / 256, 1, 1);

    // Square-root pass over the k result rows.
    dim3 t_k_16x16(16, 16, 1);
    dim3 g_k_16x16((query_nb + 15) / 16, (k + 15) / 16, 1);

    // 1) All-pairs squared distances.
    cuComputeDistanceGlobal<<<g_16x16, t_16x16, 0, stream>>>(ref_dev, ref_nb,
        query_dev, query_nb, dim, dist_dev);

    // 2) Keep the k smallest per query, sorted ascending, with indices.
    cuInsertionSort<<<g_256x1, t_256x1, 0, stream>>>(dist_dev, ind_dev,
        query_nb, ref_nb, k);

    // 3) Squared distances -> true Euclidean distances.
    //    Fix: g_k_16x16/t_k_16x16 were computed but this kernel was never
    //    launched, leaving squared distances in dist_dev.
    cuParallelSqrt<<<g_k_16x16, t_k_16x16, 0, stream>>>(dist_dev, query_nb, k);

    // Surface launch-configuration errors immediately; execution errors
    // still surface at the caller's next synchronizing call.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "knn_device: kernel launch failed: %s\n",
                cudaGetErrorString(err));

#if DEBUG
    unsigned int size_of_float = sizeof(float);
    unsigned long size_of_long = sizeof(long);

    float* dist_host = new float[query_nb * k];
    long* idx_host = new long[query_nb * k];

    // The kernels ran on `stream`; make sure they have finished before
    // copying results back for inspection.
    cudaStreamSynchronize(stream);

    cudaMemcpy(&dist_host[0], dist_dev,
               query_nb * k * size_of_float, cudaMemcpyDeviceToHost);

    cudaMemcpy(&idx_host[0], ind_dev,
               query_nb * k * size_of_long, cudaMemcpyDeviceToHost);

    for (int i = 0; i < 100; i++){
        printf("IDX[%d]: %d\n", i, (int)idx_host[i]);
    }

    // Fix: these buffers were previously leaked.
    delete[] dist_host;
    delete[] idx_host;
#endif
}
|
|
|
|