I am trying to do Vector-matrix multiplication as well as matrix-matrix multiplication using SSE Intrinsic but I get an error saying "Segmentation Fault", if I try to do for anything except multiples of 4. not able to figure out why, it don't work for anything else.? Please suggest changes so that it works for anysize of input.?
Following is my implementation:
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <xmmintrin.h>
#include <time.h>
#include <omp.h>
/*****************************************************
the following function generates a "size"-element vector
and a "size x size" matrix
****************************************************/
void matrix_vector_gen(int size, float *matrix, float *vector){
int i;
for (i = 0; i < size*size; i++){
vector[i] = i*1.2f + 1;//((float)rand())/65535.0f;
printf("%f \n ", vector[i]);
}
for (i = 0; i < size*size; i++){
matrix[i] = i*1.3f + 1;//((float)rand())/5307.0f;
printf("%f \n ", matrix[i]);
}
}
/****************************************************
the following function calculate the below equation
vector_out = vector_in x matrix_in
***************************************************/
void matrix_mult_sq(int size, float *vector_in,
float *matrix_in, float *vector_out){
int i, j, k;
for (i = 0; i < size; i++)
{
for (j = 0; j < size; j++)
{
vector_out[size*i + j] = 0.0;
for (k = 0; k < size; k++)
vector_out[size*i + j] += vector_in[size*i + k] * matrix_in[size*k + j];
}
}
}
void matrix_mult_sse(int size, float *vector_in,
float *matrix_in, float *vector_out){
__m128 a_line, b_line, r_line;
int i, j, k, l;
for (k = 0; k < size; k++)
{
for (i = 0; i < size; i += 4){
j = 0;
b_line = _mm_load_ps(&matrix_in[i]); // b_line = vec4(matrix[i][0])
a_line = _mm_set1_ps(vector_in[j + k*size]); // a_line = vec4(vector_in[0])
r_line = _mm_mul_ps(a_line, b_line); // r_line = a_line * b_line
for (j = 1; j < size; j++) {
b_line = _mm_load_ps(&matrix_in[j*size + i]); // a_line = vec4(column(a, j))
a_line = _mm_set1_ps(vector_in[j + k*size]); // b_line = vec4(b[i][j])
// r_line += a_line * b_line
r_line = _mm_add_ps(_mm_mul_ps(a_line, b_line), r_line);
}
_mm_store_ps(&vector_out[i + k*size], r_line); // r[i] = r_line
}
}
for (l=0; l < size*size; l++)
{
printf("%f \n", vector_out[l]);
}
}
int main(int argc, char *argv[]){
if(argc < 2){
printf("Usage: %s matrix/vector_size\n", argv[0]);
return 0;
}
int size = atoi(argv[1]);
if(size%4 != 0){
printf("This version implements for ""size = 4*n"" only\n");
return 0;
}
float *vector = (float *)memalign(sizeof(float)*4, sizeof(float)*size);//(float *)malloc(sizeof(float)*size);
if(vector==NULL){
printf("can't allocate the required memory for vector\n");
return 0;
}
float *matrix = (float *)memalign(sizeof(float)*4, sizeof(float)*size*size);
if(matrix==NULL){
printf("can't allocate the required memory for matrix\n");
free(vector);
return 0;
}
float *result_sq = (float *)memalign(sizeof(float)*4, sizeof(float)*size);
if(result_sq==NULL){
printf("can't allocate the required memory for result_sq\n");
free(vector);
free(matrix);
return 0;
}
float *result_pl = (float *)memalign(sizeof(float)*4, sizeof(float)*size);
if(result_pl==NULL){
printf("can't allocate the required memory for result_pl\n");
free(vector);
free(matrix);
free(result_sq);
return 0;
}
matrix_vector_gen(size, matrix, vector);
double time_sq;
double time_sse;
time_sq = omp_get_wtime();
matrix_mult_sq(size, vector, matrix, result_sq);
time_sq = omp_get_wtime() - time_sq;
time_sse = omp_get_wtime();
matrix_mult_sse(size, vector, matrix, result_pl);
time_sse = omp_get_wtime() - time_sse;
printf("SEQUENTIAL EXECUTION: %f (sec)\n",time_sq);
printf("PARALLEL EXECUTION: %f (sec)\n", time_sse);
//check
/*int i;
for(i=0; i<size; i++)
if((int)result_sq[i] != (int)result_pl[i]){
printf("wrong at position %d\n", i);
free(vector);
free(matrix);
free(result_sq);
free(result_pl);
return 0;
}*/
free(vector);
free(matrix);
free(result_sq);
free(result_pl);
return 1;
}