If you can arrange your vectors in AoSoA form (xxyyzzxxyyzzxxyyzz... for SSE with doubles, or xxxxyyyyzzzz... for AVX), you can do this very efficiently with SSE or AVX. In the code below I assumed SSE2, which for double precision has vec_size=2, but it is easy to change this to AVX. However, your code is likely memory bound rather than compute bound, so this is only going to be useful for small loops which fit in the L1 cache. It will also be faster to use single-precision float, since each vector then holds twice as many elements (twice the number of flops per instruction), and sqrt is one of the few functions which is actually slower for double than for float.
resultv = _mm_setzero_pd(0);
for(int j = 0; j < BIGNUMBER; j+=vec_size) {
bx = _mm_load_pd(&B[3*j+0*vec_size]);
by = _mm_load_pd(&B[3*j+1*vec_size]);
bz = _mm_load_pd(&B[3*j+2*vec_size]);
for(int i = 0; i < SMALLNUMBER; i+=vec_size) {
ax = _mm_load_pd(&A[3*i+0*vec_size]);
ay = _mm_load_pd(&A[3*i+1*vec_size]);
az = _mm_load_pd(&A[3*i+2*vec_size]);
dx = _mm_sub_pd(ax,bx);
dy = _mm_sub_pd(ay,by);
dz = _mm_sub_pd(az,bz);
mag2 = _mm_add_pd(_mm_add_pd(_mm_mul_pd(dx,dx),_mm_mul_pd(dy,dy)), _mm_mul_pd(dz,dz));
varv = _mm_load_pd(&var[i]);
resultv = _mm_add_pd(_mm_div_pd(varv, _mm_sqrt_pd(mag2)), resultv);
//resultv = _mm_add_pd(_mm_mul_pd(varv, _mm_rsqrt_pd(mag2)), resultv);
}
}
result = _mm_cvtsd_f64(_mm_hadd_pd(resultv,resultv));