SGEMM result inconsistency

Question

I've gotten interested in this and wrote a bit of code to test the hypothesis myself, and it does seem that SIMD gives different results compared to "standard mode".

The following snippet was compiled with ICC 13.0.2 on Mac OS X 10.8.3 using icpc -std=c++11 -O3 -ip -xAVX -fp-model source -fp-model precise -mkl=parallel -openmp.

#include <cmath>
#include <cstring>
#include <iostream>
#include <random>
#include <utility>

#include <immintrin.h>
#include <mkl.h>

template <typename type, size_t rows, size_t cols>
class matrix
{
private:
    void *_data;

public:
    matrix() :
        _data (_mm_malloc(sizeof(type) * rows * cols, 64))
    {
        if (_data == nullptr) throw std::bad_alloc();
        else memset(_data, 0, sizeof(type) * rows * cols);
    }

    matrix(matrix<type, rows, cols> const& other) :
        _data (_mm_malloc(sizeof(type) * rows * cols, 64))
    {
        if (_data == nullptr) throw std::bad_alloc();
        else memcpy(_data, other._data, sizeof(type) * rows * cols);
    }

    ~matrix()
    {
        if (_data != nullptr) _mm_free(_data);
    }

    typedef type array_type[cols];
    array_type& operator[](size_t i)
    {
        return static_cast<array_type*>(_data)[i];
    }

    typedef type const_array_type[cols];
    const_array_type& operator[](size_t i) const
    {
        return static_cast<const_array_type*>(_data)[i];
    }
};

template <typename type, size_t m, size_t n>
type max_diff(matrix<type, m, n> const& a, matrix<type, m, n> const& b)
{
    type value = static_cast<type>(0);
    for (size_t i = 0; i < m; ++i)
    {
        #pragma novector
        for (size_t j = 0; j < n; ++j)
        {
            const type diff = a[i][j] - b[i][j];
            if (std::abs(diff) > value) value = std::abs(diff);
        }
    }
    return value;
}

template <typename type, size_t m, size_t n, size_t k>
matrix<type, m, n> matmul_loop(matrix<type, m, k> const& a, matrix<type, n, k> const& b)
{
    matrix<type, m, n> out;

    #pragma omp parallel for
    for (size_t i = 0; i < m; ++i)
    {
        for (size_t j = 0; j < n; ++j)
        {
            for (size_t l = 0; l < k; ++l)
            {
                out[i][j] += a[i][l] * b[j][l];
            }
        }
    }

    return out;
}

template <typename type, size_t m, size_t n, size_t k>
matrix<type, m, n> matmul_simd(matrix<type, m, k> const& a, matrix<type, n, k> const& b)
{
    matrix<type, m, n> out;
    type *temp = static_cast<type*>(_mm_malloc(sizeof(type) * k, 64));

    #pragma omp parallel for
    for (size_t i = 0; i < m; ++i)
    {
        for (size_t j = 0; j < n; ++j)
        {
            type temp = 0.;

            #pragma vector aligned
            #pragma ivdep
            #pragma simd vectorlengthfor(type)
            for (size_t l = 0; l < k; ++l)
            {
                temp += a[i][l] * b[j][l];
            }

            out[i][j] = temp;
        }
    }

    return out;
}

template <size_t m, size_t n, size_t k>
matrix<float, m, n> matmul_sgemm(matrix<float, m, k> const& a, matrix<float, n, k> const& b)
{
    matrix<float, m, n> out;
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, 1., &a[0][0], m, &b[0][0], n, 0., &out[0][0], m);
    return out;
}

int main()
{
    std::mt19937_64 generator;
    std::uniform_real_distribution<float> rand_dist(-1000.0,1000.0);

    const size_t size = 4096;

    matrix<float, size, size> mat;
    for (size_t i = 0; i < size; ++i)
    {
        for (size_t j = 0; j < size; ++j)
        {
            mat[i][j] = rand_dist(generator);
        }
    }

    matrix<float, size, size> result_loop = matmul_loop(mat, mat);
    matrix<float, size, size> result_simd = matmul_simd(mat, mat);
    matrix<float, size, size> result_sgemm = matmul_sgemm(mat, mat);

    std::cout << "SIMD differs from LOOP by a maximum of " << max_diff(result_loop, result_simd) << std::endl;
    std::cout << "SGEMM differs from LOOP by a maximum of " << max_diff(result_loop, result_sgemm) << std::endl;
    std::cout << "SGEMM differs from SIMD by a maximum of " << max_diff(result_simd, result_sgemm) << std::endl;

    return 0;
}

Note that the "random" matrix was generated using a standard seed so the result should be completely reproducible. Basically, given a 4096x4096 matrix A, the code computes AA^T using three different methods and then compare the results, printing out the component which differs by the largest amount. On my machine, the output is as follows:

$ ./matmul
SIMD differs from LOOP by a maximum of 6016
SGEMM differs from LOOP by a maximum of 6016
SGEMM differs from SIMD by a maximum of 512

The compiler flag -fp-model source -fp-model precise prevents matmul_loop from being vectorised, but the loop in matmul_simd was obviously forced to be vectorised #pragma simd. The matrix transpose is there just to help simplify the SIMD code a little.