Pergunta

I'm a novice in CUDA, and I am trying to implement numerical integration with Simpson's method in CUDA.

My code crashes with the error "unexpected launch failure". It looks like a segfault somewhere in GPU memory. But it is very strange, because the crash depends on the variable step, which controls the number of iterations, not on any memory operation. For example, when I run integrate_with_cuda with step = 0.00001 it works fine and the result is correct, but if I change step to 0.000001, my program crashes.

This is my code:

#include "device_launch_parameters.h"
#include "cuda_runtime_api.h"
#include "cuda.h"
#include "cuda_safe_call.h"
#include <cmath>
#include <iostream>

// Signature shared by all device-side binary operators: f(x, y) -> double.
typedef double(*cuda_func)(double, double);

// One node of the flattened expression tree. Evaluating a node computes
// vars[dest] = func(vars[op1], vars[op2]); nodes are stored in evaluation
// order so a simple forward pass over the array evaluates the expression.
struct cuda_expr {
    cuda_func func;  // device function pointer obtained via get_cuda_func()
    int dest;        // index in vars[] receiving the result
    int op1;         // index of the left operand in vars[]
    int op2;         // index of the right operand in vars[]
};

// Operations the host can request; mapped to device function pointers
// by get_cuda_func().
enum cuda_method {
    cm_Add,
    cm_Mult
};

// Device-side addition operator.
__device__ double add_func(double x, double y) {
    return x + y;
}
// __device__ variable holding add_func's device-code address. Host code
// cannot take the address of a __device__ function directly; it reads this
// symbol back with cudaMemcpyFromSymbol (see get_cuda_func).
__device__ cuda_func p_add_func = add_func;

// Device-side multiplication operator.
__device__ double mult_func(double x, double y) {
    return x*y;
}
// Device-code address of mult_func, read on the host via cudaMemcpyFromSymbol.
__device__ cuda_func p_mult_func = mult_func;

// Resolves a cuda_method to the corresponding __device__ function pointer by
// copying it out of the matching __device__ symbol (p_add_func / p_mult_func).
// Returns NULL — after printing a diagnostic — if the copy fails or the
// method is not recognized. Previously the cudaMemcpyFromSymbol status was
// ignored, so a failure left `result` NULL and the kernel later invoked a
// null device function pointer (itself a source of launch failures).
__host__ cuda_func get_cuda_func(cuda_method method) {
    cuda_func result = NULL;
    cudaError_t err = cudaSuccess;

    switch (method) {
    case cm_Add:
        err = cudaMemcpyFromSymbol(&result, p_add_func, sizeof(cuda_func));
        break;
    case cm_Mult:
        err = cudaMemcpyFromSymbol(&result, p_mult_func, sizeof(cuda_func));
        break;
    }
    if (err != cudaSuccess) {
        std::cerr << "cudaMemcpyFromSymbol failed: "
                  << cudaGetErrorString(err) << std::endl;
        result = NULL;
    }
    return result;
}

// Software atomicAdd for double implemented with a compare-and-swap loop.
// CUDA only provides a native double-precision atomicAdd on compute
// capability 6.0+, so this polyfill is needed for the sm_30 target — but it
// must be compiled out on SM60+ where it would clash with the built-in.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 600
__device__ double atomicAdd(double* address, double val)
{
    // Reinterpret the double's storage as a 64-bit integer so atomicCAS
    // (which has no double overload) can operate on it.
    unsigned long long int* address_as_ull =
        (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        // Try to publish (assumed + val); if another thread updated the
        // location since we read it, atomicCAS returns the new bits and
        // the loop retries with them.
        old = atomicCAS(address_as_ull, assumed,
            __double_as_longlong(val +
            __longlong_as_double(assumed)));
        // Comparing integer bit patterns (not doubles) guarantees the loop
        // terminates even for NaN values.
    } while (assumed != old);
    return __longlong_as_double(old);  // value seen before our update
}
#endif

// Evaluates the flattened expression program in order: each node computes
// vars[dest] = func(vars[op1], vars[op2]). After the pass, the expression's
// value sits in vars[vars_count - 1]; vars[0] holds the variable x.
__device__ void computate_expr(cuda_expr* expr, int expr_length, double* vars, int vars_count) {
    for (int i = 0; i < expr_length; ++i) {
        const cuda_expr& node = expr[i];
        vars[node.dest] = (*node.func)(vars[node.op1], vars[node.op2]);
    }
}

// One Simpson's-rule panel over [a, c] with midpoint b:
//   (c - a) / 6 * (f(a) + 4*f(b) + f(c))
// f is evaluated by writing the sample point into vars[0], running the
// expression program, and reading the result from vars[vars_count - 1].
__device__ double simpson_step(cuda_expr* expr, int expr_length, double* vars, int vars_count, double a, double b, double c) {
    double sample[3] = { a, b, c };
    double f[3];
    for (int i = 0; i < 3; ++i) {
        vars[0] = sample[i];
        computate_expr(expr, expr_length, vars, vars_count);
        f[i] = vars[vars_count - 1];
    }
    return (c - a) / 6 * (f[0] + 4 * f[1] + f[2]);
}

// Each thread integrates its own equal share of [from, to] with Simpson's
// rule and folds its partial sum into *res via atomicAdd. `vars` points to
// one private copy of the variable array per thread (allocated by the host
// wrapper), so expression evaluation never races between threads.
__global__ void integrate_kernel(cuda_expr* expr, int expr_length, double* vars, int vars_count, double from, double to, double step, double* res) {
    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
    const int total_threads = gridDim.x*blockDim.x;
    double* local_vars = vars + tid * vars_count;

    // This thread's sub-interval of the full integration range.
    const double range_begin = from + tid*(to - from) / total_threads;
    const double range_end = from + (tid + 1)*(to - from) / total_threads;

    double partial = 0;

    // a/b/c are the left end, midpoint, and right end of the current panel.
    // The step/10 slack on the bound absorbs floating-point drift from the
    // repeated `+= step` accumulation.
    for (double a = range_begin, b = range_begin + step / 2, c = range_begin + step;
         c < (range_end + step / 10);
         a += step, b += step, c += step) {
        partial += simpson_step(expr, expr_length, local_vars, vars_count, a, b, c);
    }
    atomicAdd(res, partial);
}

// Host wrapper: uploads the expression program and per-thread variable
// arrays, launches integrate_kernel over a fixed 2x32-thread grid, and
// returns the accumulated integral. `step` is the Simpson panel width, so
// smaller steps mean proportionally more kernel work.
// NOTE(review): with very small steps the kernel can exceed the Windows TDR
// watchdog limit (~2 s) and be killed by the driver; that reset surfaces as
// "unexpected launch failure" — the checks after the launch below make the
// failure point explicit instead of blaming the result cudaMemcpy.
__host__ double integrate_with_cuda(const cuda_expr* expr, int expr_length, double* vars, int vars_count, double from, double to, double step) {
    const int blockSize = 32;
    const int gridSize = 2;
    const int threadsCount = blockSize*gridSize;

    cuda_expr* d_expr;
    CudaSafeCall(cudaMalloc((void**)&d_expr, expr_length*sizeof(cuda_expr)));
    CudaSafeCall(cudaMemcpy(d_expr, expr, expr_length*sizeof(cuda_expr), cudaMemcpyHostToDevice));

    // Give every thread a private copy of vars so evaluation never races.
    double* d_vars;
    CudaSafeCall(cudaMalloc((void**)&d_vars, threadsCount*vars_count*sizeof(double)));
    for (int i = 0; i < threadsCount; ++i) {
        CudaSafeCall(cudaMemcpy(d_vars + i*vars_count, vars, vars_count*sizeof(double), cudaMemcpyHostToDevice));
    }

    double* d_res;
    double result = 0;
    CudaSafeCall(cudaMalloc((void**)&d_res, sizeof(double)));
    CudaSafeCall(cudaMemcpy(d_res, &result, sizeof(double), cudaMemcpyHostToDevice));

    integrate_kernel<<<gridSize, blockSize>>>(d_expr, expr_length, d_vars, vars_count, from, to, step, d_res);
    // Kernel launches are asynchronous and return no status themselves:
    // cudaGetLastError() catches launch-configuration errors immediately,
    // and the synchronize surfaces execution errors (illegal address, TDR
    // reset, ...) here rather than at the next memcpy.
    CudaSafeCall(cudaGetLastError());
    CudaSafeCall(cudaDeviceSynchronize());

    CudaSafeCall(cudaMemcpy(&result, d_res, sizeof(double), cudaMemcpyDeviceToHost));

    CudaSafeCall(cudaFree(d_expr));
    CudaSafeCall(cudaFree(d_vars));
    CudaSafeCall(cudaFree(d_res));
    return result;
}

// Demo: integrates (1 + x) * (x + 10) over [0, 10] with panel width 0.00001.
// The expected analytic result is 983.333...
int main() {
    // vars layout: [0] = x, [1] = 10, [2] = 1, [3..5] = intermediate /
    // final result slots written by the expression program.
    double vars[6] = { 0, 10, 1, 0, 0, 0 };

    cuda_expr expr[3];
    expr[0].func = get_cuda_func(cuda_method::cm_Add);   // vars[4] = vars[1] + vars[0]  -> 10 + x
    expr[0].dest = 4; expr[0].op1 = 1; expr[0].op2 = 0;
    expr[1].func = get_cuda_func(cuda_method::cm_Add);   // vars[3] = vars[0] + vars[2]  -> x + 1
    expr[1].dest = 3; expr[1].op1 = 0; expr[1].op2 = 2;
    expr[2].func = get_cuda_func(cuda_method::cm_Mult);  // vars[5] = vars[3] * vars[4]
    expr[2].dest = 5; expr[2].op1 = 3; expr[2].op2 = 4;

    double res = integrate_with_cuda(expr, 3, vars, 6, 0, 10, 0.00001);

    std::cout << res << std::endl;
    system("PAUSE");
}

I think I need to give some explanation of how it works. The function integrate_with_cuda takes as input an array of cuda_expr and an array of double as vars. The array of cuda_expr represents the syntax tree of a math expression, flattened into an array. cuda_expr::func is a pointer to a device function, which will be called with the arguments vars[cuda_expr::op1] and vars[cuda_expr::op2], and the result will be put in vars[cuda_expr::dest]. The first cell of the vars array is reserved for the x variable.
The test example in the main function represents the expression (1+x)*(x+10). Evaluating the first cuda_expr in the array takes the second and first (the first one is x) cells from vars, adds them, and puts the sum into vars[4]; the second cuda_expr takes the first and third cells from vars, adds them, and puts the sum into vars[3]; and the last cuda_expr takes the 4th and 5th cells (vars[3] and vars[4], where the first and second cuda_expr put their results), multiplies them, and puts the product into the last cell of vars. After evaluation, the last cell of vars holds the value of the expression.

I use MS Visual Studio 2013 (with v110 platform toolset), with regular flags (sm_30 arch and without cuda debug):

nvcc.exe -gencode=arch=compute_30,code=\"sm_30,compute_30\" --use-local-env --cl-version 2012 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin\x86_amd64"  -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\include"     --keep-dir x64\Tests -maxrregcount=0  --machine 64 --compile -cudart static     -DWIN32 -D_DEBUG -D_UNICODE -DUNICODE -Xcompiler "/EHsc /W3 /nologo /Od /Zi  /MDd  " -o x64\Tests\integration_on_cuda.cu.obj integration_on_cuda.cu

Thanks. And sorry for my english :)

Foi útil?

Solução

When I run your code on linux, I get 983.333 whether I specify step as 0.00001 or 0.000001. However with the smaller step, the code takes a lot longer to run.

Since you're running on windows, this may be nothing more than you hitting the windows TDR mechanism. Kernels that run longer than about 2 seconds on windows may trigger the TDR mechanism. Usually when this happens, you will see the screen flash to black and then repaint itself, as the GPU goes through the windows-triggered reset. You may also see a systray message. The exact behavior may also be different if you are running the code from within VS versus running from the command line.

Refer to the documentation on the Windows TDR (Timeout Detection and Recovery) mechanism, or search the CUDA tag, for how to modify the TDR timeout.

Licenciado em: CC-BY-SA com atribuição
Não afiliado a StackOverflow
scroll top