CUDA's nvcc Does Not Properly Compile Ternary Operators/Conditional Shorthand?

https://stackoverflow.com/questions/9724829

24-05-2021
|

Question

EDIT
Here's a small program you compile to see these kind of errors for yourself...

//for printf
#include <stdio.h>

#include <cuda.h>

__inline __host__ void gpuAssert(cudaError_t code, char *file, int line, 
                 bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code),
          file, line);
      //if (abort) exit(code);
   }
}

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

__global__ void myKernel1(int *dev_idx, int *dev_tID, const int offset)
{
   int myElement = threadIdx.x + blockDim.x * blockIdx.x;
   //
   int temp;
   temp = myElement+
      offset +
      (offset==0)?0:(offset&0x01==0x0)?(offset-1)*(offset>>1):
      (offset)*(offset>>1);
   dev_idx[myElement+offset] = temp;
   dev_tID[myElement+offset] = myElement;

}

__global__ void myKernel2(int *dev_idx, int *dev_tID, const int offset)
{
   int myElement = threadIdx.x + blockDim.x * blockIdx.x;
   //
   int temp;
   temp = myElement+offset;
   if (offset != 0 && offset&0x01==0x0) temp+= (offset-1)*(offset>>1);
   if (offset != 0 && offset&0x01!=0x0) temp+= offset*( offset>>1);
   dev_idx[myElement+offset] = temp;
   dev_tID[myElement+offset] = myElement;

}

__host__ void PrintMethod1(int *h_idx, int * h_tID, const int offset, 
               const int total, const int h_set)
{
   for (int c=(h_set==0)?0:offset;
    c < (h_set==0)?offset:total;
    c++)
      printf("Element #%d --> idx: %d   tID: %d\n",
         c,h_idx[c],h_tID[c]);
}

__host__ void PrintMethod2(int *h_idx, int * h_tID, const int offset, 
               const int total, const int h_set)
{
   int loopStart = (h_set==0)?0:offset;
   int loopEnd = (h_set==0)?offset:total;
   printf("Loop Start: %d, Loop End: %d\n",
      loopStart, loopEnd);
   for (int c=loopStart; c < loopEnd; c++)
      printf("Element #%d --> idx: %d   tID: %d\n",
         c,h_idx[c],h_tID[c]);
}

//Checks if there is a compatible device
bool IsCompatibleDeviceRunning()
{
   int *dummy;
   return cudaGetDeviceCount(dummy) != cudaSuccess;
}

int main()
{
   //Check for compatible device
   if (!IsCompatibleDeviceRunning())
   {
      printf("ERROR: No compatible CUDA devices found!\n");
      exit(1);
   }
   const int total = 30;
   const int offset = total/2;

   int * h_tID, * dev_tID, * h_idx, * dev_idx, h_set;
   h_tID = (int *) malloc(total*sizeof(int));
   h_idx = (int *) malloc(total*sizeof(int));
   gpuErrchk(cudaMalloc((void **) &dev_tID,total*sizeof(int)));
   gpuErrchk(cudaMalloc((void **) &dev_idx,total*sizeof(int)));
   myKernel1<<<1,offset>>>(dev_idx, dev_tID, 0);
   //myKernel2<<<1,offset>>>(dev_idx, dev_tID, 0);
   gpuErrchk(cudaPeekAtLastError());
   gpuErrchk(cudaDeviceSynchronize());
   myKernel1<<<1,offset>>>(dev_idx, dev_tID, offset);
   //myKernel2<<<1,offset>>>(dev_idx, dev_tID, offset);
   gpuErrchk(cudaPeekAtLastError());
   gpuErrchk(cudaDeviceSynchronize());
   gpuErrchk(cudaMemcpy(h_tID, dev_tID, total*sizeof(int),
            cudaMemcpyDeviceToHost));
   gpuErrchk(cudaMemcpy(h_idx, dev_idx, total*sizeof(int),
            cudaMemcpyDeviceToHost));
   h_set = 0;
   //PrintMethod1(h_idx, h_tID, offset, total, h_set);
   PrintMethod2(h_idx, h_tID, offset, total, h_set);
   h_set = 1;
   //PrintMethod1(h_idx, h_tID, offset, total, h_set);
   PrintMethod2(h_idx, h_tID, offset, total, h_set);
   return 0;
}

When MyKernel2 is run, the correct output is written to the array:

Loop Start: 0, Loop End: 15
Element #0 --> idx: 0   tID: 0
Element #1 --> idx: 1   tID: 1
Element #2 --> idx: 2   tID: 2
Element #3 --> idx: 3   tID: 3
Element #4 --> idx: 4   tID: 4
Element #5 --> idx: 5   tID: 5
Element #6 --> idx: 6   tID: 6
Element #7 --> idx: 7   tID: 7
Element #8 --> idx: 8   tID: 8
Element #9 --> idx: 9   tID: 9
Element #10 --> idx: 10   tID: 10
Element #11 --> idx: 11   tID: 11
Element #12 --> idx: 12   tID: 12
Element #13 --> idx: 13   tID: 13
Element #14 --> idx: 14   tID: 14
Loop Start: 15, Loop End: 30
Element #15 --> idx: 120   tID: 0
Element #16 --> idx: 121   tID: 1
Element #17 --> idx: 122   tID: 2
Element #18 --> idx: 123   tID: 3
Element #19 --> idx: 124   tID: 4
Element #20 --> idx: 125   tID: 5
Element #21 --> idx: 126   tID: 6
Element #22 --> idx: 127   tID: 7
Element #23 --> idx: 128   tID: 8
Element #24 --> idx: 129   tID: 9
Element #25 --> idx: 130   tID: 10
Element #26 --> idx: 131   tID: 11
Element #27 --> idx: 132   tID: 12
Element #28 --> idx: 133   tID: 13
Element #29 --> idx: 134   tID: 14

When MyKernel1 is run, with an identical ternary-based idx assignment, it gets zero for all results:

Loop Start: 0, Loop End: 15
Element #0 --> idx: 0   tID: 0
Element #1 --> idx: 0   tID: 1
Element #2 --> idx: 0   tID: 2
Element #3 --> idx: 0   tID: 3
Element #4 --> idx: 0   tID: 4
Element #5 --> idx: 0   tID: 5
Element #6 --> idx: 0   tID: 6
Element #7 --> idx: 0   tID: 7
Element #8 --> idx: 0   tID: 8
Element #9 --> idx: 0   tID: 9
Element #10 --> idx: 0   tID: 10
Element #11 --> idx: 0   tID: 11
Element #12 --> idx: 0   tID: 12
Element #13 --> idx: 0   tID: 13
Element #14 --> idx: 0   tID: 14
Loop Start: 15, Loop End: 30
Element #15 --> idx: 0   tID: 0
Element #16 --> idx: 0   tID: 1
Element #17 --> idx: 0   tID: 2
Element #18 --> idx: 0   tID: 3
Element #19 --> idx: 0   tID: 4
Element #20 --> idx: 0   tID: 5
Element #21 --> idx: 0   tID: 6
Element #22 --> idx: 0   tID: 7
Element #23 --> idx: 0   tID: 8
Element #24 --> idx: 0   tID: 9
Element #25 --> idx: 0   tID: 10
Element #26 --> idx: 0   tID: 11
Element #27 --> idx: 0   tID: 12
Element #28 --> idx: 0   tID: 13
Element #29 --> idx: 0   tID: 14

When PrintMethod1 (with the ternary bounding) is run, it segfaults, essentially getting stuck in an infinite loop. Note, this is on the host side!!

When PrintMethod2 is run, the output prints normally is as expected above.

Here is my compile command:

nvcc --compiler-options -fno-strict-aliasing -DUNIX -m64 -O2 \
--compiler-bindir /usr/bin/g++ \
-gencode=arch=compute_20,code=\"sm_21,compute_20\" \
-I/usr/local/CUDA_SDK/C/common/inc -I/usr/local/CUDA_SDK/shared/inc \
-o TEST Test.cu

About the only clue I have is that it's complaining about both kernels have an improper parameter, although it looks correct and gets the correct results for MyKernel2.

I think the above example is pretty much what commenters could have tried on their own based on the below description, but it saves you the time and effort of writing code!

Let me know if there's anything else I can post, to help figure this out.

Original Question

Most C compilers, as defined by the lang. standard support ternary operators.

e.g.

int myVar;
myVar=(testFlg==true)?-1:1;

However, surprisingly CUDA's nvcc appears to strip away some ternary operators and replace them with zeroes when they are used within a kernel...

I discovered this by applying cuPrintf to check a problem block of code. For example, let's say I have two kernels sharing a global array for their output. The first kernel deals with the first chunk of elements. The second kernel gets an offset to indicate how far to jump in the array so as not to overwrite the first kernel's elements. The offset is different for even and odd.

So I could write:

if (krnl!=0 && offset&0x01==0x0)
   idx+=(offset-1)*(offset>>1);
if (krnl!=0 && offset&0x01!=0x0)
   idx+=offset*(offset>>1);

But it would be more compact and readable (in my opinion) to write the near-equivalent shorthand syntax.

idx += (krnl==0)?0:(offset&0x01==0)?
   (offset-1)*(offset>>1):
   offset*(offset>>1);

The latter code, though will always produce a zero, as CUDA's compiler snips out the shorthand conditionals.

I realize this feature code be abused and cause thread divergence, but in simple cases it does not seem like it would be any different from standard conditionals, if the compiler handled it properly.

Is this a bug in the compiler or is it intentionally not supported?

Does anyone know if this feature is coming to CUDA?

I was quite surprised to find out that was the source of my addressing failures and segfaults...

EDIT
This is a standard C feature, I misread and erroneously said it was non-standard.

EDIT 2
I had said "chokes and dies" for the compiler. "Dies" was definitely inappropriate terminology to use. Rather, nvcc completes the compilation, but apparently has stripped away the ternary operator-based assignment and replaced it with zero. This would later come back and bite me as stuff was not getting written to the proper spots, and those spots were in turn used as indices in a double-indexed scheme. The indices were used during the wrapup on the CPU side, hence the segfault occurred on the CPU side, but was driven by compiler snipping.

I'm using compiler v4.1 and have -O2 turned on. It appears that the optimizer may be optimizing out the variable that is used inside the ternary operation, which may be the source of this bug.

The error-prone ternary operation is near-identical to the example I gave above, but is involved in a large addition operation.

I plan on following the advice of the below commenter and filing a bug report with NVIDIA, but am leaving this post as a warning to others.

Edit 3

Here a slightly sanitized full statement that's always yielding zero:

__global__ void MyFunc
( const int offset
  const CustomType * dev_P,
  ...
  const int box)
{
   int tidx = blockIdx.x * blockDim.x + threadIdx.x;
   int idx=0;
   ...
   idx = tidx +
      dev_P->B +
      (box == 0)?0:(offset&0x01!=0x0):
      (offset-1)*(offset>>1):offset*(offset>>1);
   //NOTES:
   //I put the cuPrintf here.... from it I could see that tidx was diff. ints (as you 
   //would expect), but that when added together the sum was always "magically"
   //becoming zero.  The culprit was the nested ternary operator.
   //Once I replaced it with the equivalent conditional, the assignment worked as
   //expected.
   //"offset" is constant on the level of this kernel, but it is not always 0.
   //Outside the kernel "offset" varies greatly over the course of the simulation,
   //meaning that each time the kernel is called, it likely has a different value.
   //"tidx" obviously varies.
   //but somehow the above sum gave 0, likely due to an unreported compiler bug.
   //box is either 0 or 1.  For a certain type of op in my simulation I call this 
   //kernel twice, once for box value 0 and a second time for box value 1
   ...
}

La solution

I found the answer out... this is a general C issue, not CUDA-specific.

The ternary operator has a very low precedence, both on the LHS and RHS (strangely different precedence for each, though).

However, the precedence could be overriden via encapsulating the entire ternary in parentheses, e.g. ((...)?...:...).

I forked a general question about the common sense of adopting this approach for a language standard here:
Unexpected Result, Ternary Operator in Gnu C

Licencié sous: CC-BY-SA avec attribution

Non affilié à StackOverflow