Segmentation fault using the following code on more than one Nvidia GPU

https://stackoverflow.com/questions/17344270

01-06-2022
|

Question

The following code reliably leads to segmentation fault after a while when configured to run on more than one card (i have a 4 GPU system (NVIDIA Titan's)). It is a minimal example of the issue I tried to solve before (see here: Segmentation fault in __pthread_getspecific called from libcuda.so.1).

Description of what the code does:

First it allocates a huge amount of memory on each card (on first N cards, where N is 1-4), the idea is to stress the card as much as possible. Then it proceeds to launch 16 threads, each thread runs a couple of kernels on one random card. It repeats this a couple of times, each time picking a card randomly, then it exits. The master thread waits on all 16 threads to finish, then repeats the process indefinitely or until one of the threads fails for some reason.

After a couple (usually around 20 but might be more or less) iterations of the master thread it leads to a segmentation fault. Stack usually looks like this:

#0  0x00007f164a71f43c in ?? () from /usr/lib/libcuda.so
#1  0x00007f164a6bd1b5 in ?? () from /usr/lib/libcuda.so
#2  0x00007f164a5dff1a in ?? () from /usr/lib/libcuda.so
#3  0x00007f164a6c0b34 in ?? () from /usr/lib/libcuda.so
#4  0x00007f164a6c0c92 in ?? () from /usr/lib/libcuda.so
#5  0x00007f164a5e009f in ?? () from /usr/lib/libcuda.so
#6  0x00007f164a5d03c0 in ?? () from /usr/lib/libcuda.so
#7  0x00007f164a5c43bf in ?? () from /usr/lib/libcuda.so
#8  0x00007f164c131c39 in ?? () from /usr/local/cuda-5.5/lib64/libcudart.so.5.5
#9  0x00007f164c152879 in cudaDeviceSynchronize () from /usr/local/cuda-5.5/lib64/libcudart.so.5.5
#10 0x0000000000401911 in TestCUDA(int, unsigned int, unsigned int, unsigned int const*, unsigned short*) ()
#11 0x00000000004012b0 in main (argc=0, argv=0x100000200) at main.cpp:208

The full source code:

main.cpp:

#include <stdint.h>
#include <cstdlib>
#include <cstdio>
#include <pthread.h>
#include <string.h>
#include <math.h>
#include <assert.h>

#include <cuda_runtime.h>

class CriticalSection
{
    pthread_mutex_t cs;

public:

    CriticalSection();
    ~CriticalSection();

    void Lock( void );
    void Unlock( void );
};

CriticalSection::CriticalSection()
{
    assert( pthread_mutex_init( &cs, NULL ) == 0 );
}

CriticalSection::~CriticalSection()
{
    assert( pthread_mutex_destroy( &cs ) == 0 );
}

void CriticalSection::Lock( void )
{
    assert( pthread_mutex_lock( &cs ) == 0 );
}

void CriticalSection::Unlock( void )
{
    assert( pthread_mutex_unlock( &cs ) == 0 );
}

class DeviceWrapper
{
protected:

    CriticalSection m_cs;

public:

    int32_t     m_i32DeviceId;

    uint32_t*   m_pdu32Data;
    uint16_t*   m_pdu16Res;
    uint32_t    m_u32Count;

    DeviceWrapper();
    ~DeviceWrapper();

    void Lock( void );
    void Unlock( void );

    bool Init( const int32_t i32DevId, const uint32_t u32Count );
    bool Free();    
};

DeviceWrapper::DeviceWrapper()
{
    m_i32DeviceId = 0;
    m_pdu32Data = NULL;
    m_pdu16Res = NULL;
    m_u32Count = 0;
}

DeviceWrapper::~DeviceWrapper()
{

}

void DeviceWrapper::Lock( void )
{
    m_cs.Lock();
}

void DeviceWrapper::Unlock( void )
{
    m_cs.Unlock();
}

bool DeviceWrapper::Init( const int32_t i32DevId, const uint32_t u32Count )
{
    if ( cudaSetDevice( i32DevId ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Init: Failed to set device %d\n", i32DevId );
        return false;
    }

    if ( cudaMalloc( &m_pdu32Data, sizeof( uint32_t ) * u32Count ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Init: Failed to allocate %u unsigned int's on device %d\n", u32Count, i32DevId );
        return false;
    }

    if ( cudaMalloc( &m_pdu16Res, sizeof( uint16_t ) * u32Count ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Init: Failed to allocate %u unsigned short's on device %d\n", u32Count, i32DevId );
        return false;
    }

    m_u32Count = u32Count;
    m_i32DeviceId = i32DevId;   
    return true;
}

bool DeviceWrapper::Free()
{
    if ( cudaSetDevice( m_i32DeviceId ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Free: Failed to set device %d\n", m_i32DeviceId );
        return false;
    }

    if ( cudaFree( m_pdu32Data ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Free: Failed to free pdu32Mem on device %d\n", m_i32DeviceId );
        return false;
    }

    if ( cudaFree( m_pdu16Res ) != cudaSuccess )
    {
        printf( "DeviceWrapper::Free: Failed to free pdu16Mem on device %d\n", m_i32DeviceId );
        return false;
    }

    m_pdu32Data = NULL;
    m_pdu16Res = NULL;
    m_u32Count = 0;
    m_i32DeviceId = 0;
    return true;
}

bool TestCUDA( const int32_t i32DeviceId, const uint32_t u32Iterations, const uint32_t u32Count, const uint32_t* pdu32Data, uint16_t* pdu16Res );

void* DoWork( void* pArg );

static bool bRun = true;

static DeviceWrapper devices[4];

int main( int argc, char* argv[] )
{
    if ( argc != 2 )
    {
        printf( "Usage: %s <number of cards to use>\n", argv[0] );
        return 1;
    }

    uint32_t u32CardsToUse = strtoul( argv[1], NULL, 0 );

    if ( !u32CardsToUse || u32CardsToUse > 4 )
    {
        printf( "Invalid argument, must be in range 1-4\n" );
        return 2;
    }

    for ( int32_t i = 0; i < u32CardsToUse; i++ )
    {
        if ( !devices[i].Init( i, 0x20000000 ) )
        {
            for ( uint32_t j = 0; j < i; j++ )
            {
                devices[j].Free();
            }

            printf( "Failed to init device %d\n", i );
            return 3;
        }
    }

    uint32_t u32IterationsCompleted = 0;

    while ( bRun )
    {
        pthread_t pWorkers[ 16 ];
        memset( pWorkers, 0, 16 * sizeof( pthread_t ) );

        for ( uint32_t i = 0; i < 16; i++ )
        {       
            int iReturnValue = pthread_create( &pWorkers[i], NULL, &DoWork, (void*)u32CardsToUse );

            if ( iReturnValue != 0 )
            {
                printf( "Error calling pthread_create: %d\n", iReturnValue );
                return 4;
            }
        }

        for ( uint32_t i = 0; i < 16; i++ )
        {
            pthread_join( pWorkers[i], NULL );
        }

        printf( "Iterations completed: %u\n", ++u32IterationsCompleted );
    }

    printf( "Finished\n" );
    fflush( stdout );
    return 0;
}

void* DoWork( void* pArg )
{
    uint32_t u32CardsToUse = uint32_t( pArg );

    uint32_t u32TestCount = (rand() % 4) + 4;

    for ( uint32_t i = 0; i < u32TestCount; i++ )
    {
        int32_t i32DeviceId = int32_t( rand() % u32CardsToUse );

        devices[ i32DeviceId ].Lock();

        if ( !TestCUDA( i32DeviceId, 1, devices[i32DeviceId].m_u32Count, devices[i32DeviceId].m_pdu32Data, devices[i32DeviceId].m_pdu16Res ) )
        {
            printf( "DoWork: Failure in executing TestCUDA for device %d (test number %u)\n", i32DeviceId, i );
            bRun = false;
            devices[ i32DeviceId ].Unlock();
            return NULL;
        }

        devices[ i32DeviceId ].Unlock();
    }

    return NULL;
}

cuda_test.cu:

#include <stdint.h>
#include <cstdlib>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void HammingU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        pu16Results[ gidx ] += __popc( pu32Data[gidx] ^ gidx );
        gidx += blockDim.x * gridDim.x;
    }
}

__global__ void EqualU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ]++;

        gidx += blockDim.x * gridDim.x;
    }
}

__global__ void EqualByteU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ] += 4;

        gidx += blockDim.x * gridDim.x;
    }
}

__global__ void EqualBitU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ] += 32;

        gidx += blockDim.x * gridDim.x;
    }
}

__global__ void OrderU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        uint32_t u32File = pu32Data[gidx];                  // 32-bit value to find the log2 of 
        uint32_t u32FileLog = 0;                            // result of log2 will go here
        uint32_t u32Shift = 0;

        u32FileLog = (u32File > 0xFFFF) << 4;
        u32File >>= u32FileLog;
        u32Shift = (u32File > 0xFF) << 3;
        u32File >>= u32Shift;
        u32FileLog |= u32Shift;
        u32Shift = (u32File > 0xF) << 2;
        u32File >>= u32Shift;
        u32FileLog |= u32Shift;
        u32Shift = (u32File > 0x3) << 1;
        u32File >>= u32Shift;
        u32FileLog |= u32Shift;
        u32FileLog |= (u32File >> 1);

        uint32_t u32Other = gidx;           // 32-bit value to find the log2 of 
        uint32_t u32OtherLog = 0;                           // result of log2 will go here
        u32Shift = 0;

        u32OtherLog = (u32Other > 0xFFFF) << 4;
        u32Other >>= u32OtherLog;
        u32Shift = (u32Other > 0xFF) << 3;
        u32Other >>= u32Shift;
        u32OtherLog |= u32Shift;
        u32Shift = (u32Other > 0xF) << 2;
        u32Other >>= u32Shift;
        u32OtherLog |= u32Shift;
        u32Shift = (u32Other > 0x3) << 1;
        u32Other >>= u32Shift;
        u32OtherLog |= u32Shift;
        u32OtherLog |= (u32Other >> 1);

        if ( u32FileLog >= u32OtherLog )
        {
            pu16Results[ gidx ] += uint16_t( u32FileLog - u32OtherLog );
        }
        else
        {
            pu16Results[ gidx ] += uint16_t( u32OtherLog - u32FileLog );
        }

        gidx += blockDim.x * gridDim.x;
    }
}

__global__ void LogU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        uint32_t u32Value = 0;

        if ( pu32Data[gidx] >= gidx )
        {
            u32Value = pu32Data[gidx] - gidx;
        }
        else
        {
            u32Value = gidx - pu32Data[gidx];
        }

        uint32_t u32Log = 0;        // result of log2 will go here
        uint32_t u32Shift = 0;

        u32Log = (u32Value > 0xFFFF) << 4;
        u32Value >>= u32Log;
        u32Shift = (u32Value > 0xFF) << 3;
        u32Value >>= u32Shift;
        u32Log |= u32Shift;
        u32Shift = (u32Value > 0xF) << 2;
        u32Value >>= u32Shift;
        u32Log |= u32Shift;
        u32Shift = (u32Value > 0x3) << 1;
        u32Value >>= u32Shift;
        u32Log |= u32Shift;
        u32Log |= (u32Value >> 1);

        pu16Results[ gidx ] += (uint16_t)u32Log;

        gidx += blockDim.x * gridDim.x;
    }
}

__global__ void EqualRetU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        if ( pu32Data[gidx] != gidx ) pu16Results[ gidx ] += 32;

        gidx += blockDim.x * gridDim.x;
    }
}

__global__ void HammingMulU32( const uint32_t* pu32Data, const uint32_t u32Count, uint16_t* pu16Results )
{
    uint32_t gidx = blockDim.x * blockIdx.x + threadIdx.x;

    while ( gidx < u32Count )
    {
        pu16Results[ gidx ] += __popc( pu32Data[gidx] ^ gidx ) << 5;
        gidx += blockDim.x * gridDim.x;
    }
}

bool TestCUDA( const int32_t i32DeviceId, const uint32_t u32Iterations, const uint32_t u32Count, const uint32_t* pdu32Data, uint16_t* pdu16Res )
{   
    for ( uint32_t i = 0; i < u32Iterations; i++ )
    {
        if ( cudaSetDevice( i32DeviceId ) != cudaSuccess )
        {
            return false;
        }

        if ( cudaMemset( pdu16Res, 0, u32Count * sizeof( uint16_t ) ) != cudaSuccess )
        {
            return false;
        }

        for ( uint32_t j = 0; j < 3; j++ )
        {
            HammingU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            EqualU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            EqualByteU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            EqualBitU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            OrderU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            LogU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            EqualRetU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
            HammingMulU32<<< 512, 512 >>>( pdu32Data, u32Count, pdu16Res );
        }

        cudaDeviceSynchronize();
    }

    return true;
}

Makefile:

IDIR_CUDA = -I. -I/usr/local/cuda-5.5/include
CC_CUDA = g++
CFLAGS_CUDA = -g $(IDIR_CUDA)
LIBS_CUDA = -lz -lpthread -lrt -ldl -L/usr/local/cuda-5.5/lib64 -lcudart -lcuda

all:
    nvcc -I/usr/include -arch=compute_35 -code=sm_35 --machine 64 --compile cuda_test.cu
    $(CC_CUDA) -fpermissive *.cpp *.o -o test_cuda.out $(CFLAGS_CUDA) $(LIBS_CUDA)
    @echo DONE TEST_CUDA BUILD

clean:
    rm -f *.o test_cuda.out

I am using CUDA 5.5 with latest drivers (319.32), running Ubuntu Linux (64bit).

My questions are: Is there some bug in the code which could cause this behaviour? Why am i not seeing a crash when using just one card? Is it just that the crash is far less probable? (it seems that using more cards makes the crash appear sooner than with fewer cards) Bonus: Is anyone else seeing a crash using this code?

Solution

I was able to find a workaround which solves the crashes:

The idea is that there is exactly one thread per each GPU used and other worker threads submit work to these threads to run on the GPUs. I was able to run the test app on the machine for a day without any problems (over 6 thousand iterations done).

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow