I think you will find that the only difference is when/where API latencies are being accounted for during program execution, and that the underlying npp function itself doesn't have a vast difference in performance between the two CUDA versions and GPU architectures.
My evidence for this hypothesis is this version of the code you posted:
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <npp.h>
#include <assert.h>
#define w 256 // width
#define h 256 // height
#define b 16 // extra border
#define BORDER_TYPE 0
#define CUDA_CHECK_RETURN(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
int main(int argc, char *argv[])
{
    // Host buffers for the source image and the bordered destination image
    Npp8u h_idata[w*h];
    Npp8u h_odata[(w+b)*(h+b)];
    Npp8u *i_devPtr, *i_devPtr_Border;

    // Allocate and populate the source image on the device
    int d_Size = w * h * sizeof(Npp8u);
    CUDA_CHECK_RETURN( cudaMalloc( (void**) &i_devPtr, d_Size ) );
    CUDA_CHECK_RETURN( cudaMemcpy(i_devPtr, h_idata, d_Size, cudaMemcpyHostToDevice) );

    // Allocate the destination image (source size plus border)
    int d_Size_o = (w+b) * (h+b) * sizeof(Npp8u);
    CUDA_CHECK_RETURN( cudaMalloc( (void**) &i_devPtr_Border, d_Size_o ) );

    NppiSize SizeROI = {w, h};
    NppiSize SizeROI_Border = { SizeROI.width + b, SizeROI.height + b };
    NppStatus eStatusNPP;

#ifdef __WARMUP_CALL__
    // Warm up call to nppi function
    eStatusNPP = nppiCopyConstBorder_8u_C1R(i_devPtr, SizeROI.width, SizeROI,
                                            i_devPtr_Border, SizeROI_Border.width, SizeROI_Border,
                                            b, b, BORDER_TYPE);
    assert( NPP_NO_ERROR == eStatusNPP );
    CUDA_CHECK_RETURN( cudaDeviceSynchronize() );
#endif

    // Call for timing
    cudaEvent_t start, stop;
    CUDA_CHECK_RETURN( cudaEventCreate( &start ) );
    CUDA_CHECK_RETURN( cudaEventCreate( &stop ) );
    CUDA_CHECK_RETURN( cudaEventRecord( start, 0 ) );

    eStatusNPP = nppiCopyConstBorder_8u_C1R(i_devPtr, SizeROI.width, SizeROI,
                                            i_devPtr_Border, SizeROI_Border.width, SizeROI_Border,
                                            b, b, BORDER_TYPE);
    assert( NPP_NO_ERROR == eStatusNPP );

    CUDA_CHECK_RETURN( cudaEventRecord( stop, 0 ) );
    CUDA_CHECK_RETURN( cudaEventSynchronize( stop ) );

    float milliseconds = 0;
    CUDA_CHECK_RETURN( cudaEventElapsedTime(&milliseconds, start, stop) );
    printf("T= %1.5f sg\n", milliseconds / 1000.0f);

    CUDA_CHECK_RETURN( cudaMemcpy(h_odata, i_devPtr_Border, d_Size_o, cudaMemcpyDeviceToHost) );

    cudaFree(i_devPtr);
    cudaFree(i_devPtr_Border);
    CUDA_CHECK_RETURN(cudaDeviceReset());
    return 0;
}
Note the warm-up call to nppiCopyConstBorder_8u_C1R before the timed call. When I run it (CUDA 5.5 on linux with an sm_30 device), I see this:
~$ nvcc -lnppi -m64 -O3 --ptxas-options=-v -gencode arch=compute_30,code=sm_30 --compiler-options -use_fast_math pqb.cc
~$ ./a.out
T= 0.39670 sg
~$ nvcc -lnppi -m64 -O3 --ptxas-options=-v -gencode arch=compute_30,code=sm_30 --compiler-options -use_fast_math -D__WARMUP_CALL__ pqb.cc
~$ ./a.out
T= 0.00002 sg
i.e. adding a warm-up call totally changes the timed performance of the function. When I look at the API trace from nvprof, I see that both npp function calls take about 6 microseconds, but the CUDA launch for the first call takes hundreds of milliseconds while the launch for the second call takes about 12 microseconds.
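For reference, the API trace came from something like this (the exact output columns vary by nvprof version):

~$ nvprof --print-api-trace ./a.out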
So, as I mentioned in an earlier comment, there is some lazy initialization process which is being included in the timing of the CUDA 5.5 / Titan case that probably isn't present in the CUDA 5.0 / Fermi case. That isn't a feature of npp itself, though; I would guess that the actual function is as fast or faster on the Titan than on the Fermi card.
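If you want to keep lazy setup out of timed regions without duplicating the call at every timing site, one option is to hoist the warm-up into a helper. This is just a sketch of the same idea, not part of the code above: warm_up is a hypothetical helper name, and cudaFree(0) is only a common idiom for forcing CUDA context creation, so the dummy npp call is still what actually triggers npp's own deferred setup:

// Hypothetical warm-up helper: run once before any timed region.
// cudaFree(0) forces CUDA context creation; the dummy npp call on the
// already-allocated device buffers triggers whatever setup npp defers.
static void warm_up(Npp8u *src, Npp8u *dst, NppiSize roi, NppiSize roiBorder)
{
    CUDA_CHECK_RETURN( cudaFree(0) );
    NppStatus s = nppiCopyConstBorder_8u_C1R(src, roi.width, roi,
                                             dst, roiBorder.width, roiBorder,
                                             b, b, BORDER_TYPE);
    assert( NPP_NO_ERROR == s );
    CUDA_CHECK_RETURN( cudaDeviceSynchronize() );
}

Calling warm_up(i_devPtr, i_devPtr_Border, SizeROI, SizeROI_Border) right after the allocations would leave the timed call measuring only the kernel launch and execution, on either CUDA version.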