Host-side code, in main.cpp:
int *bgy; int *bgy_CU; BYTE *imY_CU; dev = findCudaDevice(argc, (const char **)argv); checkCudaErrors( cudaGetDeviceProperties(&deviceProp, dev) ); cvNamedWindow("video", 0); cvNamedWindow("BACKGROUND", 0); cvNamedWindow("BACKGROUND_GPU", 0); CvCapture *cap = cvCaptureFromCAM(0); IplImage* pFrame = 0; IplImage *pFrame2 = cvCreateImage(cvSize(320, 240), IPL_DEPTH_8U, 3); imageY = cvCreateImage(cvGetSize(pFrame2), 8, 1); imageBGY = cvCreateImage(cvGetSize(pFrame2), 8, 1); bgy = new int [IMAGE_SIZE]; checkCudaErrors( cudaMalloc((void **)&bgy_CU, IMAGE_SIZE*sizeof(int) ) ); checkCudaErrors( cudaMalloc((void **)&imY_CU, IMAGE_SIZE*sizeof(BYTE) ) ); checkCudaErrors( cudaMalloc((void **)&imBY_CU, IMAGE_SIZE*sizeof(BYTE) ) ); while(pFrame = cvQueryFrame( cap )) { cvResize(pFrame, pFrame2, 1); cvShowImage("video", pFrame2); cvCvtColor(pFrame2, imageY, CV_BGR2GRAY); for (int i = 0; i < IMAGE_SIZE; i++) { imageY->imageData[i] = (unsigned char)pFrame2->imageData[i*3+1]; } imY = (BYTE*)imageY->imageData; imBY = (BYTE*)imageBGY->imageData; checkCudaErrors( cudaMemcpy(imY_CU, imY, IMAGE_SIZE*sizeof(BYTE), cudaMemcpyHostToDevice) ); sdkCreateTimer(&hTimer); for(int iter = 0; iter < 10; iter++) { if(iter == 0){ checkCudaErrors( cudaDeviceSynchronize() ); sdkResetTimer(&hTimer); sdkStartTimer(&hTimer); } EstBackground_CUDA(imBY_CU, imY_CU, bgy_CU, IMAGE_SIZE); checkCudaErrors( cudaMemcpy(imBY, imBY_CU, IMAGE_SIZE * sizeof(BYTE), cudaMemcpyDeviceToHost) ); cvShowImage("BACKGROUND_GPU", imageBGY); } checkCudaErrors( cudaDeviceSynchronize() ); sdkStopTimer(&hTimer); double dAvgSecs = 1.0e-3 * (double)sdkGetTimerValue(&hTimer) / (double)10; shrLog("GPU time (average) : %.5f sec, \n\n", dAvgSecs * 1.0e6); if( cvWaitKey(1) >= 0 ) break; } delete []bgy; cudaFree(bgy_CU); cudaFree(imY_CU); cudaFree(imBY_CU); cvDestroyWindow("video"); cvDestroyWindow("BACKGROUND"); cvDestroyWindow("BACKGROUND_GPU"); cvReleaseImage(&pFrame2); cvReleaseCapture(&cap);
In foo.h:
// Host-callable entry point for the background-estimation kernel; the
// definition lives in foo.cu and launches EstBackground_CUDAKernel.
//
//   imBY_CU     output image buffer written by the kernel (device pointer)
//   imY_CU      input image buffer (device pointer)
//   bgy_CU      int buffer passed through to the kernel (device pointer)
//   IMAGE_SIZE  number of elements the kernel iterates over
//
// Declared with C linkage so the symbol name is not C++-mangled.
extern "C" void EstBackground_CUDA(unsigned char *imBY_CU,
                                   unsigned char *imY_CU,
                                   int *bgy_CU,
                                   int IMAGE_SIZE);
In foo.cu:
#include <cstdio>
#include <cstdlib>

// Per-pixel background-estimation kernel.
//
// Launch layout: 1-D grid of 1-D blocks. Each thread starts at its global
// index and advances by the total number of launched threads (grid-stride
// loop), so any grid/block size covers all IMAGE_SIZE elements.
// Shared memory: none.
//
//   imBY_CU     output buffer, one byte written per element
//   imY_CU      input buffer
//   bgy_CU      int buffer available to the per-pixel algorithm
//   IMAGE_SIZE  number of elements to process
__global__ void EstBackground_CUDAKernel(unsigned char *imBY_CU,
                                         unsigned char *imY_CU,
                                         int *bgy_CU,
                                         int IMAGE_SIZE)
{
    int dist, sdist;  // scratch values for the per-pixel algorithm below

    const int stride = blockDim.x * gridDim.x;
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < IMAGE_SIZE; i += stride)
    {
        // The per-pixel algorithm was elided in the original post; the
        // placeholder is kept verbatim and must be filled in before building.
        imBY_CU[i] = ... // run algorithms
    }
    // No __syncthreads() here: it is only a barrier within one block, and the
    // original author found it to be of no use in this kernel.
}

// Host wrapper: launches the kernel on the default stream with a fixed
// 256 x 256 configuration and verifies that the launch was accepted.
//
// The launch is asynchronous; results are only guaranteed to be visible to
// the host after a synchronizing call (e.g. a blocking cudaMemcpy or
// cudaDeviceSynchronize). All three pointers must be device pointers.
extern "C" void EstBackground_CUDA(unsigned char *imBY_CU,
                                   unsigned char *imY_CU,
                                   int *bgy_CU,
                                   int IMAGE_SIZE)
{
    const int kBlocks = 256;
    const int kThreadsPerBlock = 256;

    // Nothing to process: skip the launch entirely.
    if (IMAGE_SIZE <= 0)
        return;

    EstBackground_CUDAKernel<<<kBlocks, kThreadsPerBlock>>>(imBY_CU, imY_CU, bgy_CU, IMAGE_SIZE);

    // A kernel launch returns no status; a rejected launch is only reported
    // through cudaGetLastError(). Fail loudly rather than letting the caller
    // display a stale or garbage output buffer.
    const cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess)
    {
        fprintf(stderr, "EstBackground_CUDA: kernel launch failed: %s\n",
                cudaGetErrorString(launchStatus));
        exit(EXIT_FAILURE);
    }
}
No comments:
Post a Comment