This is the code after implementing all the suggestions of @Paul R and @sh1, summarized as follows:
1) use only integer arithmetic (with precision to taste)
2) add the values of the pixels at the same distance from the mask center before applying the multiplications, to reduce the number of multiplications.
3) apply only horizontal filters to take advantage of the storage by rows of the matrices
4) separate cycles around the edges from those inside the image not to make unnecessary calls to reflection functions. I totally removed the functions of reflection, including them inside the loops along the edges.
5) In addition, as a personal observation, to improve rounding without calling a (slow) function "round" or "cvRound", I've added to both temporary and final pixel results 0.5f (= 32768 in integers precision) to reduce the error / difference compared to OpenCV.
Now the performance is much better from about 15 to about 6 times slower than OpenCV.
However, the resulting matrix is not perfectly identical to that obtained with the Gaussian Blur of OpenCV. This is not due to arithmetic length (sufficient) as well as removing the error remains. Note that this is a minimum difference, between 0 and 2 (in absolute value) of pixel intensity, between the matrices resulting from the two versions. Coefficient are the same used by OpenCV, obtained with getGaussianKernel with same size and sigma.
Mat myGaussianBlur(Mat src){
Mat dst(src.rows, src.cols, CV_8UC1);
Mat temp(src.rows, src.cols, CV_8UC1);
int sum;
int x1;
double coeffs[] = {0.070159, 0.131075, 0.190713, 0.216106, 0.190713, 0.131075, 0.070159};
int coeffs_i[7] = { 0 };
for (int i = 0; i < 7; i++){
coeffs_i[i] = (int)(coeffs[i] * 65536); //65536
}
// filter horizontally - inside the image
for(int y = 0; y < src.rows; y++){
uchar *ptr = src.ptr<uchar>(y);
for(int x = 3; x < (src.cols - 3); x++){
sum = ptr[x] * coeffs_i[3];
for(int i = -3; i < 0; i++){
int tmp = ptr[x+i] + ptr[x-i];
sum += coeffs_i[i + 3]*tmp;
}
temp.at<uchar>(y,x) = (sum + 32768) / 65536;
}
}
// filter horizontally - edges - needs reflect
for(int y = 0; y < src.rows; y++){
uchar *ptr = src.ptr<uchar>(y);
for(int x = 0; x <= 2; x++){
sum = 0;
for(int i = -3; i <= 3; i++){
x1 = x + i;
if(x1 < 0){
x1 = -x1;
}
sum += coeffs_i[i + 3]*ptr[x1];
}
temp.at<uchar>(y,x) = (sum + 32768) / 65536;
}
}
for(int y = 0; y < src.rows; y++){
uchar *ptr = src.ptr<uchar>(y);
for(int x = (src.cols - 3); x < src.cols; x++){
sum = 0;
for(int i = -3; i <= 3; i++){
x1 = x + i;
if(x1 >= src.cols){
x1 = 2*src.cols - x1 - 2;
}
sum += coeffs_i[i + 3]*ptr[x1];
}
temp.at<uchar>(y,x) = (sum + 32768) / 65536;
}
}
// transpose to apply again horizontal filter - better cache data locality
transpose(temp, temp);
// filter horizontally - inside the image
for(int y = 0; y < src.rows; y++){
uchar *ptr = temp.ptr<uchar>(y);
for(int x = 3; x < (src.cols - 3); x++){
sum = ptr[x] * coeffs_i[3];
for(int i = -3; i < 0; i++){
int tmp = ptr[x+i] + ptr[x-i];
sum += coeffs_i[i + 3]*tmp;
}
dst.at<uchar>(y,x) = (sum + 32768) / 65536;
}
}
// filter horizontally - edges - needs reflect
for(int y = 0; y < src.rows; y++){
uchar *ptr = temp.ptr<uchar>(y);
for(int x = 0; x <= 2; x++){
sum = 0;
for(int i = -3; i <= 3; i++){
x1 = x + i;
if(x1 < 0){
x1 = -x1;
}
sum += coeffs_i[i + 3]*ptr[x1];
}
dst.at<uchar>(y,x) = (sum + 32768) / 65536;
}
}
for(int y = 0; y < src.rows; y++){
uchar *ptr = temp.ptr<uchar>(y);
for(int x = (src.cols - 3); x < src.cols; x++){
sum = 0;
for(int i = -3; i <= 3; i++){
x1 = x + i;
if(x1 >= src.cols){
x1 = 2*src.cols - x1 - 2;
}
sum += coeffs_i[i + 3]*ptr[x1];
}
dst.at<uchar>(y,x) = (sum + 32768) / 65536;
}
}
transpose(dst, dst);
return dst;
}