Question

I was working on an algorithm in CUDA and wanted to understand the performance of my kernel so I could optimize it appropriately.

I need to determine whether my kernel is compute bound or memory bound using source-code modifications only. The NVIDIA docs suggest running the kernel without memory accesses to measure the compute time and, similarly, running the kernel without any computation to measure the memory time.

I do not know how to modify my source code appropriately to achieve the above. How can you perform computations without memory access (that is, how can you compute a result without reading the variables stored in memory)? Could you suggest an example of the memory-only and compute-only cases for the following code, so that I can work on modifying the rest of it myself?

// Polynomial approximation of the cumulative standard normal distribution.
// The polynomial is evaluated in k = 1 / (1 + 0.2316419 * |d|), so it only
// depends on the magnitude of d; a positive d returns the complement.
// The exponential uses the reduced-accuracy __expf intrinsic.
__device__ inline float cndGPU(float d)
{
    // Polynomial coefficients, lowest order first.
    const float c1 = 0.31938153f;
    const float c2 = -0.356563782f;
    const float c3 = 1.781477937f;
    const float c4 = -1.821255978f;
    const float c5 = 1.330274429f;
    // 1 / sqrt(2 * pi)
    const float invSqrt2Pi = 0.39894228040143267793994605993438f;

    const float k = 1.0f / (1.0f + 0.2316419f * fabsf(d));

    // Gaussian density factor at d.
    const float density = invSqrt2Pi * __expf(-0.5f * d * d);

    // Horner evaluation, innermost term first.
    float poly = c4 + k * c5;
    poly = c3 + k * poly;
    poly = c2 + k * poly;
    poly = c1 + k * poly;
    poly = k * poly;

    const float tail = density * poly;

    // tail was computed from |d|; take the complement for positive d.
    return (d > 0) ? 1.0f - tail : tail;
}

// Evaluates the Black-Scholes call and put values for one option and writes
// them through CallResult and PutResult. Both outputs are computed from the
// same d1/d2 terms and the same discount factor.
// Uses the reduced-accuracy __logf and __expf intrinsics.
__device__ inline void BlackScholesBodyGPU(
    float &CallResult,
    float &PutResult,
    float S, //Stock price
    float X, //Option strike
    float T, //Option years
    float R, //Riskless rate
    float V  //Volatility rate
)
{
    // Volatility scaled by sqrt(T); used as the divisor of d1 and the
    // offset between d1 and d2.
    const float rootT = sqrtf(T);
    const float volRootT = V * rootT;

    const float d1 = (__logf(S / X) + (R + 0.5f * V * V) * T) / volRootT;
    const float d2 = d1 - volRootT;

    // Strike multiplied by the discount factor exp(-R * T).
    const float discount = __expf(-R * T);
    const float discountedStrike = X * discount;

    const float nd1 = cndGPU(d1);
    const float nd2 = cndGPU(d2);

    // Call and put share every intermediate above.
    CallResult = S * nd1 - discountedStrike * nd2;
    PutResult = discountedStrike * (1.0f - nd2) - S * (1.0f - nd1);
}
Was it helpful?

Solution

Here is how I see it. Suppose you have:

// Toy example: adds two constants to one value loaded from arr.
// arr is assumed to be an array declared elsewhere - TODO confirm its type.
float cndGPU(float d) {
    const float a = 1.0f;
    const float b = 2.0f;

    // A float is not a valid array subscript, so d is truncated to an
    // integer index before the load.
    const float c = a + b + arr[static_cast<int>(d)];

    return c;
}
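  • Checking compute time without memory access - write all of your computing expressions as one expression and replace every value that would be read from memory with a constant (in real code, make sure the compiler cannot optimize the whole computation away):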

return 1 + 2 + 3; // 3 is a stand-in constant for the value arr[d] would have supplied

  • Checking the memory access time - do literally the opposite, keeping the memory access and dropping the computation:


// Memory-only variant: the arithmetic is removed and only the load remains.
const float a = 1;
const float b = 2;
float c;

// Here we have our memory access. d is a float, which is not a valid array
// subscript, so it is truncated to an integer index first.
c = arr[static_cast<int>(d)];

return c;
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top