Vector-matrix & matrix-matrix multiplication using SSE for any size of input matrix and vector

https://stackoverflow.com/questions/23553309

18-07-2023
|

Question

I am trying to do Vector-matrix multiplication as well as matrix-matrix multiplication using SSE Intrinsic but I get an error saying "Segmentation Fault", if I try to do for anything except multiples of 4. not able to figure out why, it don't work for anything else.? Please suggest changes so that it works for anysize of input.?

Following is my implementation:

#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <xmmintrin.h>
#include <time.h>
#include <omp.h>  

/*****************************************************
the following function generates a "size"-element vector
and a "size x size" matrix
 ****************************************************/
void matrix_vector_gen(int size, float *matrix, float *vector){
    int i;
    for (i = 0; i < size*size; i++){
        vector[i] = i*1.2f + 1;//((float)rand())/65535.0f;
        printf("%f \n ", vector[i]);
    }
    for (i = 0; i < size*size; i++){
        matrix[i] = i*1.3f + 1;//((float)rand())/5307.0f;
        printf("%f \n ", matrix[i]);
    }
}

/****************************************************
the following function calculate the below equation
   vector_out = vector_in x matrix_in
 ***************************************************/
void matrix_mult_sq(int size, float *vector_in,
               float *matrix_in, float *vector_out){
    int i, j, k;
    for (i = 0; i < size; i++)
    {
        for (j = 0; j < size; j++)
        {
            vector_out[size*i + j] = 0.0;
            for (k = 0; k < size; k++)
                vector_out[size*i + j] += vector_in[size*i + k] * matrix_in[size*k + j];
        }
    }
}

void matrix_mult_sse(int size, float *vector_in,
    float *matrix_in, float *vector_out){
    __m128 a_line, b_line, r_line;
    int i, j, k, l;
    for (k = 0; k < size; k++)
    {

        for (i = 0; i < size; i += 4){
            j = 0;
            b_line = _mm_load_ps(&matrix_in[i]); // b_line = vec4(matrix[i][0])
            a_line = _mm_set1_ps(vector_in[j + k*size]);      // a_line = vec4(vector_in[0])
            r_line = _mm_mul_ps(a_line, b_line); // r_line = a_line * b_line
            for (j = 1; j < size; j++) {
                b_line = _mm_load_ps(&matrix_in[j*size + i]); // a_line = vec4(column(a, j))
                a_line = _mm_set1_ps(vector_in[j + k*size]);  // b_line = vec4(b[i][j])
                // r_line += a_line * b_line
                r_line = _mm_add_ps(_mm_mul_ps(a_line, b_line), r_line);
            }
            _mm_store_ps(&vector_out[i + k*size], r_line);     // r[i] = r_line
        }
    }
    for (l=0; l < size*size; l++)
    {
        printf("%f \n", vector_out[l]);
    }
}

int main(int argc, char *argv[]){
  if(argc < 2){
    printf("Usage: %s matrix/vector_size\n", argv[0]);
    return 0;
  }

  int size = atoi(argv[1]);
  if(size%4 != 0){
    printf("This version implements for ""size = 4*n"" only\n");
    return 0;
  }

  float *vector = (float *)memalign(sizeof(float)*4, sizeof(float)*size);//(float *)malloc(sizeof(float)*size);
  if(vector==NULL){
    printf("can't allocate the required memory for vector\n");
    return 0;
  }

  float *matrix = (float *)memalign(sizeof(float)*4, sizeof(float)*size*size);
  if(matrix==NULL){
    printf("can't allocate the required memory for matrix\n");
    free(vector);
    return 0;
  }

  float *result_sq = (float *)memalign(sizeof(float)*4, sizeof(float)*size);
  if(result_sq==NULL){
    printf("can't allocate the required memory for result_sq\n");
    free(vector);
    free(matrix);
    return 0;
  }

  float *result_pl = (float *)memalign(sizeof(float)*4, sizeof(float)*size);
  if(result_pl==NULL){
    printf("can't allocate the required memory for result_pl\n");
    free(vector);
    free(matrix);
    free(result_sq);
    return 0;
  }

  matrix_vector_gen(size, matrix, vector);

  double time_sq;
  double time_sse;

  time_sq = omp_get_wtime();
  matrix_mult_sq(size, vector, matrix, result_sq);
  time_sq = omp_get_wtime() - time_sq;

  time_sse = omp_get_wtime();
  matrix_mult_sse(size, vector, matrix, result_pl);
  time_sse = omp_get_wtime() - time_sse;

  printf("SEQUENTIAL EXECUTION: %f (sec)\n",time_sq);
  printf("PARALLEL EXECUTION: %f (sec)\n", time_sse);

  //check
  /*int i;
  for(i=0; i<size; i++)
    if((int)result_sq[i] != (int)result_pl[i]){
      printf("wrong at position %d\n", i);
      free(vector);
      free(matrix);
      free(result_sq);
      free(result_pl);
      return 0;
    }*/

  free(vector);
  free(matrix);
  free(result_sq);
  free(result_pl);
  return 1;
}

Solution

It seems you are loading and storing exclusively with mm_load_ps and mm_store_ps, which load and store 4 floats in a single instruction.

Since your containers (matrixes and vectors) do not have necessarily a size which is a multiple of 4 floats (16 bytes) this is incorrect.

memalign ensures that the pointer is aligned (here on 16 bytes) but does not reserve padding at the end so that the allocated block size is a multiple of 16 bytes.

For instance, when storing a 5-dimension vector, the vector has only 20 bytes allocated in memory, but you write 32 bytes (two mm_store_ps operations)

Additionally, it seems this is incorrect:

_mm_store_ps(&vector_out[i + k*size], r_line);

You want to store a single float here, if I'm right. Not four packed floats.

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow