Question

I used the following code to measure the running time of my code:

#include <cuda.h>
#include <cuda_runtime_api.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/find.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <thrust/distance.h>
#include <thrust/functional.h>
#include <thrust/transform.h>
#include <thrust/pair.h>
#include <thrust/remove.h>

#include <math.h>
#include <fstream>
#include <string>
#include <cstdlib>
#include <iostream>
#include <stdlib.h>
using namespace std;

const int MINCOUNTS = 20;
const int h = 10;
const int dim = 2;
//const int h2 = pow(double(h),double(dim));



struct DataType
{
    float d[dim];
};

void loadData(thrust::host_vector<DataType>& D_, string dir_, DataType& gt)
{
    fstream in(dir_.c_str(),ios::in);
    string tline;
    string::size_type position;
    getline(in,tline);

    int flag = atoi(tline.c_str());
    if(flag != 1)
    {
        cout<<"there is problem in file : "<<dir_<<endl;
        exit(-1);
    }

    getline(in,tline);
    int tot = atoi(tline.c_str());

    getline(in,tline);

    for(int i = 0; i < dim - 1; i++)
    {
        position = tline.find(" ");
        gt.d[i] = atof(tline.substr(0,position).c_str());
        tline = tline.substr(position+1, tline.size() - position);
    }
    gt.d[dim-1] = atof(tline.c_str());

    DataType dt;
    for(int i = 0; i < tot-1; i++)
    {
        getline(in,tline);
        for(int j = 0; j < dim - 1; j++) // parse the first dim-1 space-separated coordinates
        {
            position = tline.find(" ");
            dt.d[j] = atof(tline.substr(0,position).c_str());
            tline = tline.substr(position+1, tline.size() - position);
        }
        dt.d[dim-1] = atof(tline.c_str());
        D_.push_back(dt);
    }
}

__global__ void initialSM(int *gpu_Mchanged1, int *gpu_Schanged1, int N)
{
    int index = blockIdx.x;
    if(index < N)
    {
        gpu_Mchanged1[index] = index;
        gpu_Schanged1[index] = index;
    }

}


//parallelCal<<<N,1>>>(gpu_Schanged1, gpu_input, gpu_msPoint, N);
__global__ void parallelCal(int* gpu_Schanged1, DataType *input, DataType *msPoint, int tot) // h (a global constant) is the bandwidth of the kernel function
{

    int index = blockIdx.x;
    float dis = 0; // squared distance; float, since the coordinates are floats

    int ii = 0;
    int i0 = 0;

    int inlierNum = 0;
    //  double h2 = 10000;

    if(index < tot)
    {
        dis = 0;
        i0 = gpu_Schanged1[index];

        for(unsigned int i = 0; i < dim; i++)
            msPoint[index].d[i] = 0;

        for(int i = 0 ;i < tot ;i++)
        {
            ii = gpu_Schanged1[i];


            dis = 0;
            for(unsigned int j = 0; j < dim; j++)
            {
                dis += (input[i0].d[j] - input[ii].d[j])*(input[i0].d[j] - input[ii].d[j]);
                if(dis > pow(double(h),2.0))
                    break;
            }

            if (dis < pow(double(h),2.0))
            {
                inlierNum++;
                for(unsigned int j = 0; j < dim; j++)
                    msPoint[index].d[j] += (input[ii].d[j] - input[i0].d[j]);
            }
        }


        //      msPoint[index].d[0] = inlierNum;
        for(unsigned int j = 0; j < dim; j++)
        {
            msPoint[index].d[j] /= inlierNum;
            msPoint[index].d[j] += input[i0].d[j];
        }

    }
}


//nearestSearch<<<N,1>>>(gpu_Schanged1,gpu_Mchanged1,gpu_msPoint,gpu_input, N, gpu_Sunchanged, gpu_Munchanged);
__global__ void nearestSearch(int *gpu_Schanged1,int *gpu_Mchanged1, DataType *msPoint, DataType *input, int tot, int *Sunchanged, int *Munchanged)
{
    int index = blockIdx.x;
    float dis = 0;
    float disMin = 1000000;
    int flag = -1;
    int i1;
    if(index < tot)
    {

        for(int i = 0; i < tot; i++)
        {
            i1 = gpu_Schanged1[i];

            dis = 0;
            for(int j = 0; j < dim; j++)
                dis += (msPoint[index].d[j] - input[i1].d[j])*(msPoint[index].d[j] - input[i1].d[j]);

            if(dis <= disMin)
            {
                disMin = dis;
                flag = i1;
            }
        }
        Sunchanged[gpu_Schanged1[index]] = index;
        Munchanged[gpu_Schanged1[index]] = flag;
        gpu_Mchanged1[index] = flag;
    }
}
//routineTransmission<<<N,1>>>(loop1st,gpu_Schanged1,gpu_Mchanged1,gpu_Sunchanged, gpu_Munchanged,N);
__global__ void routineTransmission(bool loop1st, int *gpu_Schanged1,int *gpu_Mchanged1, int *gpu_Sunchanged,int *gpu_Munchanged, const int tot)
{
    int index = blockIdx.x;
    bool find2 = false;

    if(index < tot)
    {
        int lastOne = -1;
        int thisOne = -1;
        int indexIter = index;
        while(1)
        {



            if(loop1st)
            {
                lastOne = gpu_Mchanged1[indexIter];
                thisOne = gpu_Mchanged1[lastOne];

                if(lastOne == thisOne)
                {
                    gpu_Munchanged[gpu_Schanged1[index]] = thisOne;
                    gpu_Mchanged1[index] = thisOne;
                    break;
                }
                indexIter = thisOne;
            }

            else
            {
                //              gpu_Mchanged1[index] = gpu_Schanged1[index];

                while(1)
                {
                    lastOne = gpu_Mchanged1[indexIter];
                    for(int i = 0; i < tot; i++)
                    {
                        if(i == indexIter)
                            continue;

                        if(lastOne == gpu_Schanged1[i])
                        {
                            thisOne = i;
                            find2 = true;
                            break;
                        }
                    }
                    if(find2 == false)
                        break;
                    indexIter = thisOne;
                    find2 = false;

                }
                if(thisOne != index && thisOne != -1)
                {
                    gpu_Munchanged[gpu_Schanged1[index]] = gpu_Schanged1[thisOne];
                    gpu_Mchanged1[index] = gpu_Schanged1[thisOne];
                }
                break;
            }
        }
    }

}
//

__global__ void deleteCircle(int *gpu_Mchanged1, int *gpu_Munchanged, const int N, bool loop1st)
{
    int index = blockIdx.x;
    int router0, router1;
    if(index < N)
    {
        if(loop1st)
        {
            router0 = gpu_Mchanged1[index];
            router1 = gpu_Mchanged1[router0];
            while(1)
            {

                if(index == router0 || index == router1)
                {
                    gpu_Munchanged[index] = index;
                    break;
                }
                if(router0 == router1)
                    break;
                router0 = gpu_Mchanged1[router1];
                router1 = gpu_Mchanged1[router0];
            }
        }

    }


}
__global__ void checkTheClusterSize(int *gpu_Mchanged1, int *gpu_Schanged1, int *gpu_Munchanged, int *gpu_clusterSize, int smallTot, int tot)
{
    int index = blockIdx.x;
    if(index < smallTot)
    {
        int count = 0;
        for(unsigned int i = 0; i < tot; i++)
        {
            if(gpu_Munchanged[i] == gpu_Mchanged1[index])
                count++;
        }
        gpu_clusterSize[index] = count;
        if(count <= MINCOUNTS)
            gpu_Schanged1[index] = -1;
    }

}
__global__ void checkTheCenterNum(int *gpu_Munchanged,int *gpu_Sunchanged, int *gpu_Kcounts ,int tot)
{
    int index = blockIdx.x;
    if(index < tot)
    {
        if (gpu_Kcounts[gpu_Munchanged[index]] < MINCOUNTS)
        {
            gpu_Sunchanged[index] = -1;
        }
    }


}

struct increaseOne: public thrust::unary_function<int, int>
{
    int operator()(int a_){return a_ + 1;} // return the input plus one
};
//
__global__ void mergeCentreSimple(int* gpu_Munchanged, int *gpu_clusterSize, DataType* gpu_input,int *gpu_Schanged1, int *gpu_Mchanged1, int tot)
{
    int index = blockIdx.x;
    float dis = 0;
    float disMin = pow(double(h/2),2.0);
    int disMinIndex = -1;
    bool flag = false;
    if(index < tot)
    {
        for(unsigned int i = 0; i < tot; i++)
        {
            if(index == i)
                continue;


            dis = 0;
            for(unsigned int j = 0; j < dim; j++)
                dis += (gpu_input[gpu_Mchanged1[index]].d[j] - gpu_input[gpu_Mchanged1[i]].d[j])*(gpu_input[gpu_Mchanged1[index]].d[j] - gpu_input[gpu_Mchanged1[i]].d[j]);
            //          dis = (gpu_input[gpu_Mchanged1[index]].d1 - gpu_input[gpu_Mchanged1[i]].d1)*(gpu_input[gpu_Mchanged1[index]].d1 - gpu_input[gpu_Mchanged1[i]].d1)+(gpu_input[gpu_Mchanged1[index]].d2 - gpu_input[gpu_Mchanged1[i]].d2)*(gpu_input[gpu_Mchanged1[index]].d2 - gpu_input[gpu_Mchanged1[i]].d2);

            if(dis < disMin)
            {
                flag = true;
                disMin = dis;
                disMinIndex = i;
            }
        }

        if(flag)
            if(gpu_clusterSize[index] < gpu_clusterSize[disMinIndex])
            {
                gpu_Munchanged[gpu_Schanged1[index]] = gpu_Mchanged1[disMinIndex];
                gpu_Mchanged1[index] = gpu_Mchanged1[disMinIndex];

            }
    }
}



struct is_minus_one
{
    __host__ __device__
    bool operator()(const int x)
    {
        return(x == -1);
    }
};

typedef thrust::device_vector<int>::iterator dintiter;

int main(int argc, char** argv)
{
    //  int h = 100;
    using namespace std;
    thrust::host_vector<DataType> host_input;
    //  string dir = "/home/gaoy/cuda-workspace/DATA/input/dataMS/data_1.txt";
    string dir = "/home/gaoy/workspace/DATA/dataInput/gaussianDistribution_2500.txt";
    DataType gt;
    loadData(host_input,dir, gt);
    cudaEvent_t start,stop;
    float time;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    int loopTime = 100;
    float timeSum = 0;

    thrust::device_vector<DataType> device_input = host_input;  // copy the host data to the device
    int N = device_input.size();
    int rN = N;
    int lastSize, thisSize;

    DataType *gpu_input;
    gpu_input = thrust::raw_pointer_cast(&device_input[0]);

    thrust::device_vector<DataType> device_msPoint;
    device_msPoint.resize(N);
    DataType *gpu_msPoint;

    thrust::device_vector<int> device_Sunchanged;
    device_Sunchanged.resize(N);
    int *gpu_Sunchanged;
    gpu_Sunchanged = thrust::raw_pointer_cast(&device_Sunchanged[0]);

    thrust::device_vector<int> device_Munchanged;
    device_Munchanged.resize(N);
    int *gpu_Munchanged;
    gpu_Munchanged = thrust::raw_pointer_cast(&device_Munchanged[0]);

    thrust::device_vector<int> device_Schanged1;
    device_Schanged1.resize(N);
    int *gpu_Schanged1;
    gpu_Schanged1 = thrust::raw_pointer_cast(&device_Schanged1[0]);

    thrust::device_vector<int> device_Mchanged1;
    device_Mchanged1.resize(N);
    int *gpu_Mchanged1;
    gpu_Mchanged1 = thrust::raw_pointer_cast(&device_Mchanged1[0]);

    thrust::pair<thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator> new_end;

    thrust::device_vector<int> device_clusterSize;

    initialSM<<<N,1>>>(gpu_Mchanged1, gpu_Schanged1,N);

    bool loop1st = true;
    dintiter Mend, Send, Cend;
    int *gpu_clusterSize;
    gpu_msPoint = thrust::raw_pointer_cast(&device_msPoint[0]);




    for(int i = 0; i < loopTime; i++)
    {

        cudaFree(0);
        cudaEventRecord(start,0);


        while(1)
        {
            lastSize = device_Schanged1.size();
            N = lastSize;
            device_msPoint.resize(N);

            parallelCal<<<N,1>>>(gpu_Schanged1,gpu_input, gpu_msPoint, N); // gpu_msPoint has the same size as gpu_Mchanged1, but gpu_input keeps the original data size
            device_Mchanged1.resize(N);
            nearestSearch<<<N,1>>>(gpu_Schanged1,gpu_Mchanged1,gpu_msPoint,gpu_input, N, gpu_Sunchanged, gpu_Munchanged);

            routineTransmission<<<N,1>>>(loop1st,gpu_Schanged1,gpu_Mchanged1,gpu_Sunchanged, gpu_Munchanged,N);


            thrust::sort_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
            //
            new_end = thrust::unique_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
            N = new_end.first - device_Mchanged1.begin();
            device_Mchanged1.resize(N);
            device_Schanged1.resize(N);

            device_clusterSize.clear();
            device_clusterSize.resize(N);

            gpu_clusterSize = thrust::raw_pointer_cast(&device_clusterSize[0]);
            checkTheClusterSize<<<N,1>>>(gpu_Mchanged1, gpu_Schanged1,gpu_Munchanged, gpu_clusterSize,N,rN);

            Mend = thrust::remove_if(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin(),is_minus_one());
            Cend = thrust::remove_if(device_clusterSize.begin(), device_clusterSize.end(), device_Schanged1.begin(), is_minus_one());
            Send = thrust::remove(device_Schanged1.begin(), device_Schanged1.end(), -1);

            N =  Send - device_Schanged1.begin();
            device_Schanged1.resize(N);
            device_Mchanged1.resize(N);
            device_clusterSize.resize(N);
            mergeCentreSimple<<<N,1>>>(gpu_Munchanged,gpu_clusterSize, gpu_input, gpu_Schanged1, gpu_Mchanged1, N);
            thrust::sort_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
            new_end = thrust::unique_by_key(device_Mchanged1.begin(), device_Mchanged1.end(), device_Schanged1.begin());
            N =  new_end.first - device_Mchanged1.begin();
            device_Mchanged1.resize(N);
            device_Schanged1.resize(N);


            thisSize = N;
            if(lastSize == thisSize)
                break;
            loop1st = false;

            thrust::copy(device_Mchanged1.begin(),device_Mchanged1.end(),device_Schanged1.begin());
            device_Mchanged1.clear();
            gpu_Schanged1 = thrust::raw_pointer_cast(&device_Schanged1[0]);
        }
        cudaEventRecord(stop,0);
        cudaEventSynchronize(stop);

        cudaEventElapsedTime(&time, start, stop);
        //      for(unsigned int ii = 0; ii < device_Mchanged1.size(); ii++)
        //          cout<<ii<<" "<<host_input[device_Schanged1[ii]].d[0]<<" "<<host_input[device_Schanged1[ii]].d[1]<<endl;

        timeSum += time;
        cout<<i<<" "<<time<<endl;
    }
    cout<<"elapsed: "<<timeSum/loopTime<<" ms"<<endl;









    return 0;


}

The output of the variable time is not the same in every loop iteration; this is the result I got:

0 385.722
1 3.67507
2 3.64183
3 2.40269

But the code does exactly the same thing every time I test it. Which result should I believe? I am really puzzled about this. Thanks.

Solution

I don't see any obvious problems with your timing methodology, now that you have posted your actual code. The statements about cudaFree(0); and start-up time are irrelevant, because your code will already have created a CUDA context well before your first timing sequence.

It's not possible for me to run your code, as it depends on data files which I do not have. However, the most likely explanation for the variation in timing is that there is an actual variation in the work being done. Even though you appear to be running the same code, it may take a different amount of time from run to run.

Some examples of how this might be (I'm not saying this is true of your code; I don't know):

  1. thrust::sort_by_key will take a different amount of time for a sequence that is already sorted than for an unsorted one. Since you are doing an in-place sort, the question arises as to whether you are sorting data that is already sorted. The first pass through your timing loop might sort the data, whereas subsequent passes might be sorting data that is already sorted, which can take less time (see the first sketch after this list).

  2. Another example would be all your .resize(N) operations. It seems to me that the first time through the loop these might do some actual resizing, whereas on subsequent passes, if N does not change, there is no actual resizing, so the operation takes less time (see the second sketch below).
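
To illustrate point 1, here is a minimal, self-contained sketch (my own illustration, not the code from the question) that times thrust::sort_by_key twice on the same device vector; the second call operates on keys that the first call has already sorted. The vector size and random seed are arbitrary choices.

#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/sequence.h>
#include <cuda_runtime_api.h>
#include <algorithm>
#include <iostream>
#include <random>
#include <vector>

// time one in-place sort_by_key with cudaEvent, in milliseconds
static float timeSort(thrust::device_vector<int>& keys,
                      thrust::device_vector<int>& vals)
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    thrust::sort_by_key(keys.begin(), keys.end(), vals.begin());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float ms = 0;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;
}

int main()
{
    const int n = 1 << 22;
    std::vector<int> hostKeys(n);
    for (int i = 0; i < n; i++) hostKeys[i] = i;
    std::shuffle(hostKeys.begin(), hostKeys.end(), std::mt19937(12345));

    thrust::device_vector<int> keys = hostKeys;   // shuffled keys
    thrust::device_vector<int> vals(n);
    thrust::sequence(vals.begin(), vals.end());

    std::cout << "unsorted input: " << timeSort(keys, vals) << " ms" << std::endl;
    // keys were sorted in place above, so this call sees sorted input
    std::cout << "sorted input:   " << timeSort(keys, vals) << " ms" << std::endl;
    return 0;
}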
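
A similar sketch for point 2: a resize() that grows a device_vector must allocate (and copy and fill), whereas a resize() to the current size is essentially a no-op. The sizes here are again arbitrary.

#include <thrust/device_vector.h>
#include <cuda_runtime_api.h>
#include <chrono>
#include <iostream>

// wall-clock time of one resize call, in milliseconds
static double timedResize(thrust::device_vector<float>& v, size_t n)
{
    cudaDeviceSynchronize();                  // drain any pending GPU work
    auto t0 = std::chrono::steady_clock::now();
    v.resize(n);
    cudaDeviceSynchronize();                  // include the fill kernel, if any
    auto t1 = std::chrono::steady_clock::now();
    return std::chrono::duration<double, std::milli>(t1 - t0).count();
}

int main()
{
    thrust::device_vector<float> v(1 << 20);
    std::cout << "growing resize:   " << timedResize(v, 1 << 24) << " ms" << std::endl;
    std::cout << "same-size resize: " << timedResize(v, 1 << 24) << " ms" << std::endl;
    return 0;
}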

Again, I don't know if any of these hypotheses are true of your code, I'm simply pointing out how it might be possible, in some cases, to run the same code sequence repeatedly, and observe variation in timing.

Obviously, since the code is identical, the question then becomes one of analyzing the data to see whether it is the same, or not, from run to run. An interesting test might be to keep track of the number of passes through your while(1) loop before the break statement is encountered, and to print that count alongside the time, as in the sketch below. This might also be instructive as to what is going on.
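
A sketch of what that instrumentation might look like, using the variable names from the question (this is a fragment of the timed loop, not a complete program; only passCount and the extra output are additions):

int passCount = 0;                    // counts passes through the while(1) loop
while(1)
{
    passCount++;
    // ... existing loop body, unchanged ...
    thisSize = N;
    if(lastSize == thisSize)
        break;
    // ... rest of the loop body ...
}
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cout<<i<<" "<<time<<" ms, "<<passCount<<" passes"<<endl;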

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow