Question

I wrote a program that performs matrix multiplication (without optimization).

for(i=0; i<a_r; i++)
 for(j=0;j<b_c; j++)
  for(k=0;k<a_c; k++)
   c[i][j]=c[i][j]+a[i][k]*b[k][j];

Depending on how I allocate the memory, the time of computation is different.

It is important to notice three points:

  • I record only the time of computation and not the time of allocation (plus the time of allocation is negligible compared with the time of computation).
  • I initialize the matrices with random numbers before computing. I use huge matrices (2500 int*2500 int). I choose this size to use the RAM memory but not the swap.
  • This phenomenon disappears if the program doesn't use most of the RAM.

I test this program with three different allocations:

Test 1: Global allocation

int a[2500][2500], b[2500][2500], c[2500][2500];
main {matrix multiplication a*b=c}

The computing time is quite constant (around 41s).

Test 2: dynamic allocation (arrays)

Main{
int **a,**b,**c;
a=(int **) malloc(sizeof(int *)*2500);
for( i=0;i<a_c; i++)
 a[i]=(int *) malloc(sizeof(int)*2500);
…
matrix multiplication a*b=c
}

When I execute the program several times in a row I obtain these running times: 260 seconds, 180, 110, 110… If I wait around 5 seconds and launch the program again I obtain the same results.

Test 3: dynamic allocation (lines)

Main{
int *a, *b, *c;
a=(int *) malloc(sizeof(int)*2500*2500);
… (same for b and c)
matrix multiplication a*b=c
}

The computing time is quite constant (around 44s).

I think test 2 is less efficient because of the way the data are stored in memory, as explained in this article (the website is down today) or in this question. Some ways of traversing memory are more efficient, and as a result some ways of allocating memory give you a more efficient program.

But (in test 2), I have no clue why the program gets faster over successive runs. Does anyone have an idea to explain this phenomenon? Thanks in advance.

P.S. I did these tests on an Intel Xeon E5-2625 with CentOS 6.3 and kernel Linux 3.11.1. EDIT : Frequency scaling is disabled on my computer. The frequency is constant.

Here is the code:

#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <float.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/resource.h>

/****************************************************
 * Return a pseudo-random int uniformly-ish distributed in
 * [_iMin, _iMax] (inclusive). Uses rand()%range, so the result
 * carries the usual modulo bias; acceptable for test data.
 * If the bounds are passed in the wrong order they are swapped,
 * avoiding the modulo-by-zero / negative-range UB the original
 * had for _iMax < _iMin.
 */
int Random(int _iMin,int _iMax)
{
    if (_iMax < _iMin)          /* defend against swapped bounds */
    {
        int tmp = _iMin;
        _iMin = _iMax;
        _iMax = tmp;
    }
    return (_iMin + (rand()%(_iMax-_iMin+1)));
}

/****************************************************
 * Snapshot the resource usage of the calling process
 * (RUSAGE_SELF) and return it by value.
 */
struct rusage give_ru()
{
    struct rusage usage;

    getrusage(RUSAGE_SELF, &usage);
    return usage;
}

/****************************************************
 * Do a matrix multiplication a*b=c
 * This program isn't optimized.
 */

/*
 * Allocate a rows x cols matrix as an array of row pointers.
 * Returns NULL if any allocation fails; rows already allocated
 * are released so nothing leaks on the failure path.
 */
static int **alloc_matrix(int rows, int cols)
{
    int **m = malloc(sizeof *m * rows);
    int i;

    if (m == NULL)
        return NULL;
    for (i = 0; i < rows; i++)
    {
        m[i] = malloc(sizeof *m[i] * cols);
        if (m[i] == NULL)
        {
            while (i-- > 0)
                free(m[i]);
            free(m);
            return NULL;
        }
    }
    return m;
}

/* Free a matrix built by alloc_matrix() (rows row pointers). */
static void free_matrix(int **m, int rows)
{
    int i;

    if (m == NULL)
        return;
    for (i = 0; i < rows; i++)
        free(m[i]);
    free(m);
}

/* Fill a rows x cols matrix with random ints in [0,100]. */
static void fill_random(int **m, int rows, int cols)
{
    int i, j;

    for (i = 0; i < rows; i++)
        for (j = 0; j < cols; j++)
            m[i][j] = Random(0, 100);
}

int main()
{
    /*
     * Initialize variables and array sizes.
     * a is a_r x a_c, b is b_r x b_c, the product c is a_r x b_c.
     */
    int **a, **b, **c;
    printf("\nenter Taille de la matrice : 2500");
    int a_r = 2500, a_c = 2500, b_r = 2500, b_c = 2500;
    clock_t start, end;
    struct rusage start1, end1;
    double dif; /* elapsed time of the section being measured */
    int i, j, k;

    /* Multiplication is defined only when a's columns match b's rows. */
    if (a_c != b_r)
    {
        printf("\ncan not multiply");
        /* The original did "goto again" to a label that does not
         * exist (a compile error); report failure instead. */
        return 1;
    }

    /*
     * Memory allocation.
     * NOTE: the original looped over COLUMN counts when allocating
     * row pointers (and freed with yet other counts) — harmless only
     * because every dimension is 2500. Row counts are used here.
     */
    start = clock();
    a = alloc_matrix(a_r, a_c);
    if (a == NULL)
    {
        printf("Allocation of matrix a failed \n");
        return 1; /* was exit(0): failure must not report success */
    }
    b = alloc_matrix(b_r, b_c);
    if (b == NULL)
    {
        printf("Allocation of matrix b failed \n");
        free_matrix(a, a_r);
        return 1;
    }
    c = alloc_matrix(a_r, b_c);
    if (c == NULL)
    {
        printf("Allocation of matrix c failed \n");
        free_matrix(b, b_r);
        free_matrix(a, a_r);
        return 1;
    }

    /* Lock every page in RAM so paging cannot disturb the timing. */
    printf("Starting mlockall.\n");
    if (mlockall(MCL_CURRENT | MCL_FUTURE) < 0)
    {
        perror("mlockall");
        return 1;
    }

    /*
     * Matrix initialization (with random integers).
     * c is deliberately filled too: this touches every page of all
     * three matrices before the timed multiplication starts.
     */
    printf("Initializing matrices...\n");
    srand(time(NULL));
    fill_random(a, a_r, a_c);
    srand(time(NULL));
    fill_random(b, b_r, b_c);
    srand(time(NULL));
    fill_random(c, a_r, b_c);

    end = clock(); /* end the allocation+initialization timer */
    dif = ((double) (end - start)) / CLOCKS_PER_SEC;
    printf("Malloc and initialization took %f sec. time.\n", dif);

    /*
     * Matrix Multiplication (naive triple loop, no blocking).
     * Bounds fixed: j runs over b's columns and k over a's columns;
     * the original swapped a_c and b_c, harmless only because the
     * matrices are square.
     */
    start = clock();      /* wall-clock-ish CPU timer */
    start1 = give_ru();   /* getrusage snapshot */
    for (i = 0; i < a_r; i++)
        for (j = 0; j < b_c; j++)
            for (k = 0; k < a_c; k++)
                c[i][j] = c[i][j] + a[i][k] * b[k][j];
    end1 = give_ru();
    end = clock();

    dif = ((double) (end - start)) / CLOCKS_PER_SEC;

    /* Convert the getrusage timevals to seconds. */
    double utimeStart = (double)start1.ru_utime.tv_sec + (double)start1.ru_utime.tv_usec / 1000000.0;
    double stimeStart = (double)start1.ru_stime.tv_sec + (double)start1.ru_stime.tv_usec / 1000000.0;
    double utimeEnd   = (double)end1.ru_utime.tv_sec + (double)end1.ru_utime.tv_usec / 1000000.0;
    double stimeEnd   = (double)end1.ru_stime.tv_sec + (double)end1.ru_stime.tv_usec / 1000000.0;

    printf("Parallelization took %f sec. time.\n", dif);
    printf("Time User : %f .\n", utimeEnd - utimeStart);
    printf("Time System : %f .\n", stimeEnd - stimeStart);
    /* Fault counts are process-lifetime totals, as in the original
     * (the start-snapshot deltas were commented out there too). */
    printf("Page reclaims : %ld .\n", end1.ru_minflt);
    printf("Page faults : %ld .\n", end1.ru_majflt);

    /*
     * free memory.
     */
    printf("Finished, cleaning up \n");
    munlockall();

    /* Free with the same row counts used for allocation. */
    free_matrix(a, a_r);
    a = NULL;
    free_matrix(b, b_r);
    b = NULL;
    free_matrix(c, a_r);
    c = NULL;

    return 0;
}
Was it helpful?

Solution 2

I figure out a part of the answer. "Recent Linux kernels have support for transparent hugepages, programs will make use of it without knowing they do." If I disable hugepages all the three tests take 260s to run.

For test 1 and 3, they always use hugepages. Here you can see the quantity of AnonHugePages in test 1 and 3: AnonHugePages in test 1 and 3

For test 2, it uses hugepages, but not in all cases and not from the beginning of the program: AnonHugePages in test 2

I still don't understand why test 2 is still this irregular. However test 3 uses system call mmap() to allocate memory and test 2 uses brk(). Maybe the answer is here, I will continue to investigate.

(I forgot to say it but the program is run three times.)

OTHER TIPS

An array of pointers requires an extra level of indirection, and thus will be slower. Moreover, it might cause more cache misses.

So I tried with 1000*1000 matrix :

Test 2 (int**)

$ perf stat ./test2
Performance counter stats for './test1':

   8561,688348 task-clock (msec)         #    1,000 CPUs utilized          
            13 context-switches          #    0,002 K/sec                  
             9 cpu-migrations            #    0,001 K/sec                  
         3 058 page-faults               #    0,357 K/sec                  
24 844 304 630 cycles                    #    2,902 GHz                     [83,32%]
21 302 837 742 stalled-cycles-frontend   #   85,75% frontend cycles idle    [83,32%]
 2 110 745 845 stalled-cycles-backend    #    8,50% backend  cycles idle    [66,68%]
 7 030 427 722 instructions              #    0,28  insns per cycle        
                                         #    3,03  stalled cycles per insn [83,36%]
 1 004 889 984 branches                  #  117,371 M/sec                   [83,37%]
     1 077 360 branch-misses             #    0,11% of all branches         [83,32%]

   8,561758966 seconds time elapsed

Test 3 (int*)

$ perf stat ./test3
 Performance counter stats for './test2':

   1367,856713 task-clock (msec)         #    0,995 CPUs utilized          
           195 context-switches          #    0,143 K/sec                  
             3 cpu-migrations            #    0,002 K/sec                  
         3 001 page-faults               #    0,002 M/sec                  
 3 977 759 335 cycles                    #    2,908 GHz                     [83,41%]
   975 477 913 stalled-cycles-frontend   #   24,52% frontend cycles idle    [83,41%]
   179 003 530 stalled-cycles-backend    #    4,50% backend  cycles idle    [66,81%]
 7 017 803 848 instructions              #    1,76  insns per cycle        
                                         #    0,14  stalled cycles per insn [83,41%]
 1 002 321 021 branches                  #  732,768 M/sec                   [83,42%]
     1 046 066 branch-misses             #    0,10% of all branches         [82,97%]

   1,374613079 seconds time elapsed

As we can see, there are a lot more cycles in test 2.

I also measured the cache misses:

Test 2 (int**)

$ perf stat -e 'L1-dcache-loads,L1-dcache-load-misses,L1-dcache-stores,L1-dcache-store-misses'  ./test2
Performance counter stats for './test1':

 3 009 028 415 L1-dcache-loads                                             
 1 259 622 058 L1-dcache-load-misses     #   41,86% of all L1-dcache hits  
     6 427 561 L1-dcache-stores                                            
     1 141 461 L1-dcache-store-misses                                      

   8,650905202 seconds time elapsed

Test 3 (int*)

$ perf stat -e 'L1-dcache-loads,L1-dcache-load-misses,L1-dcache-stores,L1-dcache-store-misses'  ./test3
Performance counter stats for './test2':

 2 004 807 223 L1-dcache-loads                                             
   596 388 192 L1-dcache-load-misses     #   29,75% of all L1-dcache hits  
     2 111 264 L1-dcache-stores                                            
       384 198 L1-dcache-store-misses                                      

   1,384352257 seconds time elapsed
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top