Alright, I think I found a solution. It's not very elegant, but at least it works! There should be a better way — does anyone have any suggestions?
extern "C"{
int foobar(float * ndarray1,float * ndarray2,int path_cnt)
{
float * test = (float*)_mm_malloc(path_cnt*sizeof(float),32);
float * test2 = (float*)_mm_malloc(path_cnt*sizeof(float),32);
//copy to aligned memory(this part is kinda stupid)
for(int i=0;i<path_cnt;i++)
{
test[i] = stock[i];
test2[i] = max_vola[i];
}
for(int i=0;i<path_cnt;i=i+8)
{
__m256 arr1 = _mm256_load_ps(&test1[i]);
__m256 arr2 = _mm256_load_ps(&test2[i]);
__m256 add = _mm256_add_ps(arr1,arr2);
_mm256_store_ps(&test1[i],add);
}
//and copy everything back!
for(int i=0;i<path_cnt;i++)
{
stock[i] = test[i];
}
return 0;
}
}