The answer posted by @njuffa is almost certainly the smartest way forward for this desire. I don't know how fast it would be, but since it taps into the highest throughput engine on the device (the single precision floating point engine) it would seem to be best of any competing alternative. Nevertheless you did ask for a fixed point representation so I thought I would go ahead and post this anyway, just using this question as a repository of ideas.
This code has many caveats:
- not extensively tested
- probably slower than most alternatives
- doesn't do signed arithmetic (you mentioned uint), so subtract is questionable
- didn't implement divide
- no overflow/underflow checking
- ignored request to absolutely not use any 64 bit representation or operation. I have not used any 64-bit (DP) floating point ops of course, but I do use 64 bit integer ops. My rationale is that (in some cases,) me breaking up the inherently 64 bit representation proposed into two 32 bit quantities, doing a bunch of 32 bit ops, and then re-assembling is senseless when the machine (compiler) will do exactly this anyway if I request 64 bit (integer) ops.
- not optimized. You could probably do a better job with PTX, but it might be in the category of "polishing a turd".
So I offer this up as a framework, for amusement, and perhaps so you can see how much slower doing it in a "true" fixed point format would be as compared to using e.g. the SP engines.
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#define N 10000
#define nTPB 512
#define SCALE 1
#define MUL_ERR_LIM (0.0000001 * SCALE * SCALE)
#define ADD_ERR_LIM (0.0000001 * SCALE)
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// fixed point number representation: 32 bit whole portion and 32 bit decimal
typedef union {
struct {
unsigned d; // decimal portion
unsigned w; // whole number portion
} part;
unsigned long val;
} fxp;
__device__ __host__ inline fxp fxp_add(fxp a, fxp b){
fxp temp;
temp.val = a.val + b.val;
return temp;
}
__device__ __host__ inline fxp fxp_sub(fxp a, fxp b){
fxp temp;
temp.val = a.val - b.val;
return temp;
}
__device__ __host__ inline unsigned fxp_whole(fxp a){
return a.part.w;
}
__device__ __host__ inline unsigned fxp_decimal(fxp a){
return a.part.d;
}
__device__ __host__ inline void fxp_set_whole(fxp *a, unsigned val){
a->part.w = val;
}
__device__ __host__ inline void fxp_set_decimal(fxp *a, unsigned val){
a->part.d = val;
}
__device__ __host__ inline fxp float_to_fxp(float val){
fxp temp;
temp.part.w = (unsigned) truncf(val);
temp.part.d = (unsigned) rintf((val - truncf(val)) * 0x0000000100000000ul);
return temp;
}
__device__ __host__ inline float fxp_to_float(fxp val){
return val.part.w + (val.part.d/(float)0x0000000100000000ul);
}
__device__ __host__ inline fxp double_to_fxp(double val){
fxp temp;
temp.part.w = (unsigned) trunc(val);
temp.part.d = (unsigned) rint((val - trunc(val)) * 0x0000000100000000ul);
return temp;
}
__device__ __host__ inline double fxp_to_double(fxp val){
return val.part.w + (val.part.d/(double)0x0000000100000000ul);
}
__device__ __host__ fxp fxp_mul(fxp a, fxp b){
fxp temp;
unsigned long ltemp = ((unsigned long)a.part.w * (unsigned long)b.part.d) + ((unsigned long)a.part.d * (unsigned long)b.part.w);
unsigned utemp = (unsigned) (ltemp & 0x0FFFFFFFFul);
temp.part.w = (unsigned) (ltemp >> 32);
temp.part.d = (unsigned) (((unsigned long)a.part.d * (unsigned long)b.part.d) >> 32) + utemp;
temp.part.w += (a.part.w * b.part.w) + ((temp.part.d < utemp) ? 1:0);
return temp;
}
__global__ void fxp_test(float *a, float *b, float *s, float *p, double *da, double *db, double *ds, double *dp, unsigned n){
unsigned idx = threadIdx.x + (blockDim.x * blockIdx.x);
if (idx < n){
float la = a[idx];
float lb = b[idx];
double lda = da[idx];
double ldb = db[idx];
s[idx] = fxp_to_float(fxp_add(float_to_fxp(la), float_to_fxp(lb)));
p[idx] = fxp_to_float(fxp_mul(float_to_fxp(la), float_to_fxp(lb)));
ds[idx] = fxp_to_double(fxp_add(double_to_fxp(lda), double_to_fxp(ldb)));
dp[idx] = fxp_to_double(fxp_mul(double_to_fxp(lda), double_to_fxp(ldb)));
}
}
int main(){
fxp a,b,c;
float x,y,z, xp, yp, zp;
double dx, dy, dz, dxp, dyp, dzp;
if (sizeof(unsigned) != 4) {printf("unsigned type size error: %d\n", sizeof(unsigned)); return 1;}
if (sizeof(unsigned long) != 8) {printf("unsigned long type error: %d\n", sizeof(unsigned long)); return 1;}
// test host side
x = 76.705116;
y = 1.891480;
a = float_to_fxp(x);
b = float_to_fxp(y);
// test conversions
xp = fxp_to_float(a);
yp = fxp_to_float(b);
printf("con: a = %f, a should be %f, b = %f, b should be %f\n", xp, x, yp, y);
// test multiply
c = fxp_mul(a, b);
z = x*y;
zp = fxp_to_float(c);
printf("mul: a = %f, b = %f, c = %f, c should be %f\n", x, y, zp, z);
//test add
c = fxp_add(a, b);
z = x+y;
zp = fxp_to_float(c);
printf("add: a = %f, b = %f, c = %f, c should be %f\n", x, y, zp, z);
//test subtract
c = fxp_sub(a, b);
z = x-y;
zp = fxp_to_float(c);
printf("sub: a = %f, b = %f, c = %f, c should be %f\n", x, y, zp, z);
// now test doubles
dx = 6.7877;
dy = 5.2444;
a = double_to_fxp(dx);
b = double_to_fxp(dy);
// test conversions
dxp = fxp_to_double(a);
dyp = fxp_to_double(b);
printf("dbl con: a = %f, a should be %f, b = %f, b should be %f\n", dxp, dx, dyp, dy);
// test multiply
c = fxp_mul(a, b);
dz = dx*dy;
dzp = fxp_to_double(c);
printf("double mul: a = %f, b = %f, c = %f, c should be %f\n", dx, dy, dzp, dz);
//test add
c = fxp_add(a, b);
dz = dx+dy;
dzp = fxp_to_double(c);
printf("double add: a = %f, b = %f, c = %f, c should be %f\n", dx, dy, dzp, dz);
//test subtract
c = fxp_sub(a, b);
dz = dx-dy;
dzp = fxp_to_double(c);
printf("double sub: a = %f, b = %f, c = %f, c should be %f\n", dx, dy, dzp, dz);
// test device side
float *h_a, *d_a, *h_b, *d_b, *h_s, *d_s, *h_p, *d_p;
double *h_da, *d_da, *h_db, *d_db, *h_ds, *d_ds, *h_dp, *d_dp;
if ((h_a=(float *)malloc(N*sizeof(float))) == 0) {printf("malloc fail\n"); return 1;}
if ((h_b=(float *)malloc(N*sizeof(float))) == 0) {printf("malloc fail\n"); return 1;}
if ((h_s=(float *)malloc(N*sizeof(float))) == 0) {printf("malloc fail\n"); return 1;}
if ((h_p=(float *)malloc(N*sizeof(float))) == 0) {printf("malloc fail\n"); return 1;}
if ((h_da=(double *)malloc(N*sizeof(double))) == 0) {printf("malloc fail\n"); return 1;}
if ((h_db=(double *)malloc(N*sizeof(double))) == 0) {printf("malloc fail\n"); return 1;}
if ((h_ds=(double *)malloc(N*sizeof(double))) == 0) {printf("malloc fail\n"); return 1;}
if ((h_dp=(double *)malloc(N*sizeof(double))) == 0) {printf("malloc fail\n"); return 1;}
cudaMalloc((void **)&d_a, N*sizeof(float));
cudaCheckErrors("cudamalloc fail");
cudaMalloc((void **)&d_b, N*sizeof(float));
cudaCheckErrors("cudamalloc fail");
cudaMalloc((void **)&d_s, N*sizeof(float));
cudaCheckErrors("cudamalloc fail");
cudaMalloc((void **)&d_p, N*sizeof(float));
cudaCheckErrors("cudamalloc fail");
cudaMalloc((void **)&d_da, N*sizeof(double));
cudaCheckErrors("cudamalloc fail");
cudaMalloc((void **)&d_db, N*sizeof(double));
cudaCheckErrors("cudamalloc fail");
cudaMalloc((void **)&d_ds, N*sizeof(double));
cudaCheckErrors("cudamalloc fail");
cudaMalloc((void **)&d_dp, N*sizeof(double));
cudaCheckErrors("cudamalloc fail");
for (unsigned i = 0; i < N; i++){
h_a[i] = (float)drand48() * SCALE;
h_b[i] = (float)drand48() * SCALE;
h_da[i] = drand48()*SCALE;
h_db[i] = drand48()*SCALE;
}
cudaMemcpy(d_a, h_a, N*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudamemcpy fail");
cudaMemcpy(d_b, h_b, N*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudamemcpy fail");
cudaMemcpy(d_da, h_da, N*sizeof(double), cudaMemcpyHostToDevice);
cudaCheckErrors("cudamemcpy fail");
cudaMemcpy(d_db, h_db, N*sizeof(double), cudaMemcpyHostToDevice);
cudaCheckErrors("cudamemcpy fail");
fxp_test<<<(N+nTPB-1)/nTPB, nTPB>>>(d_a, d_b, d_s, d_p, d_da, d_db, d_ds, d_dp, N);
cudaCheckErrors("kernel fail");
cudaMemcpy(h_s, d_s, N*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudamemcpy fail");
cudaMemcpy(h_p, d_p, N*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudamemcpy fail");
cudaMemcpy(h_ds, d_ds, N*sizeof(double), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudamemcpy fail");
cudaMemcpy(h_dp, d_dp, N*sizeof(double), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudamemcpy fail");
for (unsigned i=0; i<N; i++){
if (fabsf(h_s[i] - (h_a[i] + h_b[i])) > ADD_ERR_LIM) {printf("float add mismatch at %d, a: %f b: %f gpu: %f cpu: %f\n", i, h_a[i], h_b[i], h_s[i], (h_a[i] + h_b[i])); return 1;}
if (fabsf(h_p[i] - (h_a[i] * h_b[i])) > MUL_ERR_LIM) {printf("float mul mismatch at %d, a: %f b: %f gpu: %f cpu: %f\n", i, h_a[i], h_b[i], h_p[i], (h_a[i] * h_b[i])); return 1;}
if (fabs(h_ds[i] - (h_da[i] + h_db[i])) > ADD_ERR_LIM) {printf("double add mismatch at %d, a: %f b: %f gpu: %f cpu: %f\n", i, h_da[i], h_db[i], h_ds[i], (h_da[i] + h_db[i])); return 1;}
if (fabs(h_dp[i] - (h_da[i] * h_db[i])) > MUL_ERR_LIM) {printf("double mul mismatch at %d, a: %f b: %f gpu: %f cpu: %f\n", i, h_da[i], h_db[i], h_dp[i], (h_da[i] * h_db[i])); return 1;}
}
printf("GPU results match!\n");
return 0;
}
As an aside, the molecular dynamics research code AMBER runs fast on GPUs and implements several different arithmetic models to achieve it's speed, including a mixture of SP and DP calculations as well as an SPFP format, which uses a fixed-point representation for some work and SP for other work. I don't know the details of it however. But apparently it can be done to good effect.