Wednesday, July 18, 2012

An Image Processing Example Using CUDA

This example shows the steps of capturing video frames from a camera and building a background image. Several OpenCV functions are used.

In the host part, main.cpp:
// Host-side driver excerpt: grabs frames from camera 0, resizes them to
// 320x240, builds a single-channel image, uploads it to the GPU, runs the
// background-estimation kernel several times for timing, and displays the
// result. Names such as imageY, imageBGY, imY, imBY, imBY_CU, hTimer, dev,
// deviceProp and IMAGE_SIZE are declared outside this excerpt.
int *bgy;        // host buffer of IMAGE_SIZE ints (allocated/freed below; not otherwise used in this excerpt)
int *bgy_CU;     // device buffer of IMAGE_SIZE ints handed to the kernel
BYTE *imY_CU;    // device copy of the current single-channel frame

    dev = findCudaDevice(argc, (const char **)argv);
    checkCudaErrors( cudaGetDeviceProperties(&deviceProp, dev) );

    cvNamedWindow("video", 0);
    cvNamedWindow("BACKGROUND", 0);
    cvNamedWindow("BACKGROUND_GPU", 0);
    CvCapture *cap = cvCaptureFromCAM(0);
    IplImage *pFrame = 0;
    IplImage *pFrame2 = cvCreateImage(cvSize(320, 240), IPL_DEPTH_8U, 3);
    imageY   = cvCreateImage(cvGetSize(pFrame2), 8, 1);
    imageBGY = cvCreateImage(cvGetSize(pFrame2), 8, 1);

    bgy = new int [IMAGE_SIZE];
    checkCudaErrors( cudaMalloc((void **)&bgy_CU,  IMAGE_SIZE*sizeof(int)  ) );
    checkCudaErrors( cudaMalloc((void **)&imY_CU,  IMAGE_SIZE*sizeof(BYTE)  ) );
    checkCudaErrors( cudaMalloc((void **)&imBY_CU, IMAGE_SIZE*sizeof(BYTE)  ) );
    // cudaMalloc does not initialize memory; give the kernel's int buffer a
    // defined starting state instead of whatever was left on the device.
    checkCudaErrors( cudaMemset(bgy_CU, 0, IMAGE_SIZE*sizeof(int)) );

    // Number of kernel invocations averaged per frame for the timing report.
    const int kTimingIters = 10;

    // Create the timer once; creating it per frame (as before) leaked one
    // timer object on every loop iteration.
    sdkCreateTimer(&hTimer);

    // The frame returned by cvQueryFrame is owned by the capture and must
    // not be released by the caller.
    while ((pFrame = cvQueryFrame( cap )) != 0)
    {
        cvResize(pFrame, pFrame2, CV_INTER_LINEAR);
        cvShowImage("video", pFrame2);

        // NOTE(review): the grayscale conversion below is immediately
        // overwritten by the loop that copies channel 1 (green in BGR order)
        // into imageY. One of the two is presumably redundant — confirm which
        // is intended. The loop also assumes tightly packed rows
        // (widthStep == width * channels).
        cvCvtColor(pFrame2, imageY, CV_BGR2GRAY);
        for (int i = 0; i < IMAGE_SIZE; i++)
        {
            imageY->imageData[i] = (unsigned char)pFrame2->imageData[i*3+1];
        }
        imY  = (BYTE*)imageY->imageData;
        imBY = (BYTE*)imageBGY->imageData;

        checkCudaErrors( cudaMemcpy(imY_CU, imY, IMAGE_SIZE*sizeof(BYTE), cudaMemcpyHostToDevice) );

        // Drain any pending GPU work so the timer measures only the loop below.
        checkCudaErrors( cudaDeviceSynchronize() );
        sdkResetTimer(&hTimer);
        sdkStartTimer(&hTimer);

        // NOTE(review): the timed region includes the device-to-host copy and
        // cvShowImage, and the kernel is applied kTimingIters times to the
        // same frame — kept as in the original benchmark.
        for (int iter = 0; iter < kTimingIters; iter++)
        {
            EstBackground_CUDA(imBY_CU, imY_CU, bgy_CU, IMAGE_SIZE);
            checkCudaErrors( cudaMemcpy(imBY, imBY_CU, IMAGE_SIZE * sizeof(BYTE), cudaMemcpyDeviceToHost) );
            cvShowImage("BACKGROUND_GPU", imageBGY);
        }
        checkCudaErrors( cudaDeviceSynchronize() );
        sdkStopTimer(&hTimer);

        // sdkGetTimerValue reports milliseconds; convert to seconds per iteration.
        double dAvgSecs = 1.0e-3 * (double)sdkGetTimerValue(&hTimer) / (double)kTimingIters;
        // Print the value in the unit the label states (previously the value
        // was multiplied by 1.0e6 while still being labelled "sec").
        shrLog("GPU time (average) : %.5f sec, \n\n", dAvgSecs);

        if( cvWaitKey(1) >= 0 )
            break;
    }

    sdkDeleteTimer(&hTimer);

    delete []bgy;
    checkCudaErrors( cudaFree(bgy_CU) );
    checkCudaErrors( cudaFree(imY_CU) );
    checkCudaErrors( cudaFree(imBY_CU) );

    cvDestroyWindow("video");
    cvDestroyWindow("BACKGROUND");
    cvDestroyWindow("BACKGROUND_GPU");
    cvReleaseImage(&pFrame2);
    // These two images are created above and were previously never released.
    cvReleaseImage(&imageY);
    cvReleaseImage(&imageBGY);
    cvReleaseCapture(&cap);

In foo.h:
// Host-callable entry point (C linkage) implemented in foo.cu; it launches
// EstBackground_CUDAKernel with the same four arguments.
// The three pointers are forwarded unchanged to the kernel, so they must be
// device pointers (main.cpp obtains them from cudaMalloc); IMAGE_SIZE is the
// number of elements the kernel iterates over.
extern "C" void EstBackground_CUDA(unsigned char *imBY_CU, unsigned char *imY_CU, int *bgy_CU, int IMAGE_SIZE);

In foo.cu:
// Kernel: assigns imBY_CU[i] for every index i in [0, IMAGE_SIZE).
// Each thread starts at its flat 1D global index and then advances by
// blockDim.x * gridDim.x, so the whole range is covered regardless of how
// many threads are launched. The per-element computation is elided in this
// listing; imY_CU and bgy_CU are presumably its inputs/state — confirm
// against the full source.
__global__ void EstBackground_CUDAKernel(unsigned char *imBY_CU, unsigned char *imY_CU, int *bgy_CU, int IMAGE_SIZE)
{
    // Not referenced in the visible code; presumably used by the elided algorithm.
    int  dist, sdist;
    // Flat 1D global thread index: the first element this thread handles.
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    
    //cuPrintf("%d, %d, %d \n", threadIdx.x , blockIdx.x , blockDim.x);
    
    while (i < IMAGE_SIZE)
    {
        imBY_CU[i] = ...// run algorithms
        // Step by the total thread count of the 1D launch to reach this
        // thread's next element.
        i += blockDim.x * gridDim.x;
    }    
    //__syncthreads(); // only synchronizes the threads within one block, so it is of no use here.
}
// Host wrapper with C linkage: launches EstBackground_CUDAKernel on the
// default stream with a fixed 1D configuration of 256 blocks x 256 threads
// and forwards all four arguments unchanged.
// NOTE(review): the launch result is not checked here; consider calling
// cudaGetLastError() after the launch so a bad configuration is not missed.
extern "C" void EstBackground_CUDA(unsigned char *imBY_CU, unsigned char *imY_CU, int *bgy_CU, int IMAGE_SIZE)
{
    const dim3 numBlocks(256);
    const dim3 threadsPerBlock(256);

    // cuPrintf debugging hooks, left disabled:
    //   cudaPrintfInit();
    EstBackground_CUDAKernel<<<numBlocks, threadsPerBlock>>>(imBY_CU, imY_CU, bgy_CU, IMAGE_SIZE);
    //   cudaPrintfDisplay(stdout, true);
    //   cudaPrintfEnd();
}



No comments:

Post a Comment