Question

I see a large performance drop using the nppiCopyConstBorder_8u_C1R function on two different architectures (GTX480 and GTX TITAN), which also involve different CUDA versions (v5.0 and v5.5 respectively).

In the first case (GTX480 and CUDA 5.0) the execution time of the function is

T = 0.00005 seconds

In the second case (GTX TITAN and CUDA 5.5) the execution time is

T = 0.969831 seconds

I have reproduced this behaviour with the following code:

// GTX480 nvcc -lnpp -m64 -O3 --ptxas-options=-v -gencode arch=compute_20,code=sm_20 --compiler-options -use_fast_math
// GTXTITAN nvcc -lnppi -m64 -O3 --ptxas-options=-v -gencode arch=compute_35,code=sm_35 --compiler-options -use_fast_math
#include <stdlib.h>
#include <stdio.h>
// CUDA
#include <cuda.h>
#include <cuda_runtime_api.h>
// CUDA Nvidia Performance Primitives
#include <npp.h>

#include <assert.h>

#define w 256   // width
#define h 256   // height
#define b 16    // extra border

#define BORDER_TYPE 0

// simple CUDA error-checking macro
#define CUDA_CHECK_RETURN(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

int main(int argc, char *argv[])
{
    // input data
    Npp8u h_idata[w*h];
    // output data
    Npp8u h_odata[(w+b)*(h+b)];

    /* MEMORY ALLOCATION AND INITIAL COPY OF DATA FROM CPU TO GPU */

    Npp8u *i_devPtr, *i_devPtr_Border;

    // size of the input data
    int d_Size = w * h * sizeof(Npp8u);
    // allocate input data
    CUDA_CHECK_RETURN( cudaMalloc( (void**) &i_devPtr, d_Size ) );
    // copy initial data to GPU
    CUDA_CHECK_RETURN( cudaMemcpy(i_devPtr, h_idata, d_Size, cudaMemcpyHostToDevice) );

    // size of the output data
    int d_Size_o = (w+b) * (h+b) * sizeof(Npp8u);    
    // allocation for input data with extended border
    CUDA_CHECK_RETURN( cudaMalloc( (void**) &i_devPtr_Border, d_Size_o ) );

    // create struct with ROI size given the current mask
    NppiSize SizeROI = {w, h};

    NppiSize SizeROI_Border = { SizeROI.width + b, SizeROI.height + b };

    // create events
    cudaEvent_t start, stop;
    cudaEventCreate( &start );
    cudaEventCreate( &stop );

    // NPP Library Copy Constant Border
    cudaEventRecord( start, 0 );
    NppStatus eStatusNPP = nppiCopyConstBorder_8u_C1R(i_devPtr,SizeROI.width, SizeROI,
                    i_devPtr_Border, SizeROI_Border.width, SizeROI_Border,
                    b, b, BORDER_TYPE);

    cudaDeviceSynchronize();
    assert( NPP_NO_ERROR == eStatusNPP );
    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("T= %1.5f sg\n", milliseconds / 1000.0f);


    // copy output data from GPU
    CUDA_CHECK_RETURN( cudaMemcpy(h_odata, i_devPtr_Border, d_Size_o, cudaMemcpyDeviceToHost) );

    /* free resources */
    cudaFree(i_devPtr);
    cudaFree(i_devPtr_Border);

    CUDA_CHECK_RETURN(cudaDeviceReset());

    return 0;
}

Q: Is anyone aware of this issue?

This makes me ask the following question:

Q: How is nppiCopyConstBorder_8u_C1R implemented? Does the function copy the data from device to host, extend the border on the host, and copy the result back to the device?
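
For reference, the operation itself maps naturally onto a single device kernel, so I would not expect any host round-trip to be needed. Below is a minimal hypothetical sketch of a device-only equivalent (my own naming, using the Npp8u/NppiSize types from npp.h; this is not NPP's actual implementation, which is closed source):

__global__ void copyConstBorder_8u(const Npp8u *pSrc, int nSrcStep, NppiSize oSrcSize,
                                   Npp8u *pDst, int nDstStep, NppiSize oDstSize,
                                   int nTop, int nLeft, Npp8u nValue)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= oDstSize.width || y >= oDstSize.height) return;

    // map the destination pixel back onto the source; outside the source, write the constant
    int sx = x - nLeft;
    int sy = y - nTop;
    bool inside = (sx >= 0) && (sx < oSrcSize.width) && (sy >= 0) && (sy < oSrcSize.height);
    pDst[y * nDstStep + x] = inside ? pSrc[sy * nSrcStep + sx] : nValue;
}

// launched over the destination size, e.g.:
// dim3 block(16, 16);
// dim3 grid((SizeROI_Border.width + 15) / 16, (SizeROI_Border.height + 15) / 16);
// copyConstBorder_8u<<<grid, block>>>(i_devPtr, SizeROI.width, SizeROI,
//                 i_devPtr_Border, SizeROI_Border.width, SizeROI_Border, b, b, 0);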

PS: The machine with the TITAN has the GPU outside the case, on a separate board specially designed for multiple PCIe connections, attached via a PCIe cable. I have not seen any drawback from this configuration with any of the other kernels I have tested.


Solution

I think you will find that the only difference is when/where API latencies are accounted for during program execution, and that the underlying NPP function itself doesn't differ vastly in performance between the two CUDA versions and GPU architectures.

My evidence for this hypothesis is this version of the code you posted:

#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <npp.h>

#include <assert.h>

#define w 256   // width
#define h 256   // height
#define b 16    // extra border

#define BORDER_TYPE 0

#define CUDA_CHECK_RETURN(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

int main(int argc, char *argv[])
{
    Npp8u h_idata[w*h];
    Npp8u h_odata[(w+b)*(h+b)];
    Npp8u *i_devPtr, *i_devPtr_Border;

    int d_Size = w * h * sizeof(Npp8u);
    CUDA_CHECK_RETURN( cudaMalloc( (void**) &i_devPtr, d_Size ) );
    CUDA_CHECK_RETURN( cudaMemcpy(i_devPtr, h_idata, d_Size, cudaMemcpyHostToDevice) );

    int d_Size_o = (w+b) * (h+b) * sizeof(Npp8u);    
    CUDA_CHECK_RETURN( cudaMalloc( (void**) &i_devPtr_Border, d_Size_o ) );

    NppiSize SizeROI = {w, h};
    NppiSize SizeROI_Border = { SizeROI.width + b, SizeROI.height + b };
    NppStatus eStatusNPP;  

#ifdef __WARMUP_CALL__
    // Warm up call to nppi function
    eStatusNPP = nppiCopyConstBorder_8u_C1R(i_devPtr,SizeROI.width, SizeROI,
                    i_devPtr_Border, SizeROI_Border.width, SizeROI_Border,
                    b, b, BORDER_TYPE);

    assert( NPP_NO_ERROR == eStatusNPP );
    CUDA_CHECK_RETURN( cudaDeviceSynchronize() );
#endif

    // Call for timing
    cudaEvent_t start, stop;
    CUDA_CHECK_RETURN( cudaEventCreate( &start ) );
    CUDA_CHECK_RETURN( cudaEventCreate( &stop ) );

    CUDA_CHECK_RETURN( cudaEventRecord( start, 0 ) );
    eStatusNPP = nppiCopyConstBorder_8u_C1R(i_devPtr,SizeROI.width, SizeROI,
                    i_devPtr_Border, SizeROI_Border.width, SizeROI_Border,
                    b, b, BORDER_TYPE);

    assert( NPP_NO_ERROR == eStatusNPP );
    CUDA_CHECK_RETURN( cudaEventRecord( stop, 0 ) );
    CUDA_CHECK_RETURN( cudaEventSynchronize( stop ) );

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("T= %1.5f sg\n", milliseconds / 1000.0f);

    CUDA_CHECK_RETURN( cudaMemcpy(h_odata, i_devPtr_Border, d_Size_o, cudaMemcpyDeviceToHost) );

    cudaFree(i_devPtr);
    cudaFree(i_devPtr_Border);

    CUDA_CHECK_RETURN(cudaDeviceReset());

    return 0;
}

Note the warm-up call to nppiCopyConstBorder_8u_C1R before the timed call. When I run it (CUDA 5.5 on Linux with an sm_30 device), I see this:

~$ nvcc -lnppi -m64 -O3 --ptxas-options=-v -gencode arch=compute_30,code=sm_30 --compiler-options -use_fast_math pqb.cc 
~$ ./a.out 
T= 0.39670 sg

~$ nvcc -lnppi -m64 -O3 --ptxas-options=-v -gencode arch=compute_30,code=sm_30 --compiler-options -use_fast_math -D__WARMUP_CALL__ pqb.cc 
~$ ./a.out 
T= 0.00002 sg

i.e. adding a warm-up call totally changes the timed performance of the function. When I look at the API trace from nvprof, I see that both NPP function calls take about 6 microseconds. However, the CUDA launch for the first call takes hundreds of milliseconds, whereas the launch for the second call takes about 12 microseconds.
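
An API trace like that can be generated with, for example:

~$ nvprof --print-api-trace ./a.out

which prints the duration of each CUDA runtime API call alongside the program output.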

So, as I mentioned in an earlier comment, there is some lazy initialization process which is getting included in the timing in the CUDA 5.5/Titan case that probably isn't present in the CUDA 5.0/Fermi case. That isn't a feature of NPP itself, though; I would guess that the actual function is as fast or faster on the Titan than on the Fermi card.
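
As an aside, if you want a steady-state per-call figure rather than a single measurement, a simple option (sketched here by reusing the variables from the code above; nothing NPP-specific) is to time a loop of calls after the warm-up and divide by the iteration count:

    const int nIters = 100;
    CUDA_CHECK_RETURN( cudaEventRecord( start, 0 ) );
    for (int i = 0; i < nIters; ++i) {
        eStatusNPP = nppiCopyConstBorder_8u_C1R(i_devPtr, SizeROI.width, SizeROI,
                        i_devPtr_Border, SizeROI_Border.width, SizeROI_Border,
                        b, b, BORDER_TYPE);
        assert( NPP_NO_ERROR == eStatusNPP );
    }
    CUDA_CHECK_RETURN( cudaEventRecord( stop, 0 ) );
    CUDA_CHECK_RETURN( cudaEventSynchronize( stop ) );
    cudaEventElapsedTime( &milliseconds, start, stop );
    printf("T= %1.5f sg per call\n", (milliseconds / 1000.0f) / nIters);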

License: CC BY-SA with attribution