32-bit to 16-bit Floating Point Conversion

https://stackoverflow.com/questions/1659440

11-09-2019
|

Question

I need a cross-platform library/algorithm that will convert between 32-bit and 16-bit floating point numbers. I don't need to perform math with the 16-bit numbers; I just need to decrease the size of the 32-bit floats so they can be sent over the network. I am working in C++.

I understand how much precision I would be losing, but that's OK for my application.

The IEEE 16-bit format would be great.

Solution

std::frexp extracts the significand and exponent from normal floats or doubles -- then you need to decide what to do with exponents that are too large to fit in a half-precision float (saturate...?), adjust accordingly, and put the half-precision number together. This article has C source code to show you how to perform the conversion.

OTHER TIPS

Complete conversion from single precision to half precision. This is a direct copy from my SSE version, so it's branch-less. It makes use of the fact that in GCC (-true == ~0), may be true for VisualStudio too but, I don't have a copy.

    class Float16Compressor
    {
        union Bits
        {
            float f;
            int32_t si;
            uint32_t ui;
        };

        static int const shift = 13;
        static int const shiftSign = 16;

        static int32_t const infN = 0x7F800000; // flt32 infinity
        static int32_t const maxN = 0x477FE000; // max flt16 normal as a flt32
        static int32_t const minN = 0x38800000; // min flt16 normal as a flt32
        static int32_t const signN = 0x80000000; // flt32 sign bit

        static int32_t const infC = infN >> shift;
        static int32_t const nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32
        static int32_t const maxC = maxN >> shift;
        static int32_t const minC = minN >> shift;
        static int32_t const signC = signN >> shiftSign; // flt16 sign bit

        static int32_t const mulN = 0x52000000; // (1 << 23) / minN
        static int32_t const mulC = 0x33800000; // minN / (1 << (23 - shift))

        static int32_t const subC = 0x003FF; // max flt32 subnormal down shifted
        static int32_t const norC = 0x00400; // min flt32 normal down shifted

        static int32_t const maxD = infC - maxC - 1;
        static int32_t const minD = minC - subC - 1;

    public:

        static uint16_t compress(float value)
        {
            Bits v, s;
            v.f = value;
            uint32_t sign = v.si & signN;
            v.si ^= sign;
            sign >>= shiftSign; // logical shift
            s.si = mulN;
            s.si = s.f * v.f; // correct subnormals
            v.si ^= (s.si ^ v.si) & -(minN > v.si);
            v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN));
            v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN));
            v.ui >>= shift; // logical shift
            v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC);
            v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC);
            return v.ui | sign;
        }

        static float decompress(uint16_t value)
        {
            Bits v;
            v.ui = value;
            int32_t sign = v.si & signC;
            v.si ^= sign;
            sign <<= shiftSign;
            v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
            v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
            Bits s;
            s.si = mulC;
            s.f *= v.si;
            int32_t mask = -(norC > v.si);
            v.si <<= shift;
            v.si ^= (s.si ^ v.si) & mask;
            v.si |= sign;
            return v.f;
        }
    };

So that's a lot to take in but, it handles all subnormal values, both infinities, quiet NaNs, signaling NaNs, and negative zero. Of course full IEEE support isn't always needed. So compressing generic floats:

    class FloatCompressor
    {
        union Bits
        {
            float f;
            int32_t si;
            uint32_t ui;
        };

        bool hasNegatives;
        bool noLoss;
        int32_t _maxF;
        int32_t _minF;
        int32_t _epsF;
        int32_t _maxC;
        int32_t _zeroC;
        int32_t _pDelta;
        int32_t _nDelta;
        int _shift;

        static int32_t const signF = 0x80000000;
        static int32_t const absF = ~signF;

    public:

        FloatCompressor(float min, float epsilon, float max, int precision)
        {
            // legal values
            // min <= 0 < epsilon < max
            // 0 <= precision <= 23
            _shift = 23 - precision;
            Bits v;
            v.f = min;
            _minF = v.si;
            v.f = epsilon;
            _epsF = v.si;
            v.f = max;
            _maxF = v.si;
            hasNegatives = _minF < 0;
            noLoss = _shift == 0;
            int32_t pepsU, nepsU;
            if(noLoss) {
                nepsU = _epsF;
                pepsU = _epsF ^ signF;
                _maxC = _maxF ^ signF;
                _zeroC = signF;
            } else {
                nepsU = uint32_t(_epsF ^ signF) >> _shift;
                pepsU = uint32_t(_epsF) >> _shift;
                _maxC = uint32_t(_maxF) >> _shift;
                _zeroC = 0;
            }
            _pDelta = pepsU - _zeroC - 1;
            _nDelta = nepsU - _maxC - 1;
        }

        float clamp(float value)
        {
            Bits v;
            v.f = value;
            int32_t max = _maxF;
            if(hasNegatives)
                max ^= (_minF ^ _maxF) & -(0 > v.si);
            v.si ^= (max ^ v.si) & -(v.si > max);
            v.si &= -(_epsF <= (v.si & absF));
            return v.f;
        }

        uint32_t compress(float value)
        {
            Bits v;
            v.f = clamp(value);
            if(noLoss)
                v.si ^= signF;
            else
                v.ui >>= _shift;
            if(hasNegatives)
                v.si ^= ((v.si - _nDelta) ^ v.si) & -(v.si > _maxC);
            v.si ^= ((v.si - _pDelta) ^ v.si) & -(v.si > _zeroC);
            if(noLoss)
                v.si ^= signF;
            return v.ui;
        }

        float decompress(uint32_t value)
        {
            Bits v;
            v.ui = value;
            if(noLoss)
                v.si ^= signF;
            v.si ^= ((v.si + _pDelta) ^ v.si) & -(v.si > _zeroC);
            if(hasNegatives)
                v.si ^= ((v.si + _nDelta) ^ v.si) & -(v.si > _maxC);
            if(noLoss)
                v.si ^= signF;
            else
                v.si <<= _shift;
            return v.f;
        }

    };

This forces all values into the accepted range, no support for NaNs, infinities or negative zero. Epsilon is the smallest allowable value in the range. Precision is how many bits of precision to retain from the float. While there are a lot of branches above, they are all static and will be cached by the branch predictor in the CPU.

Of course if your values don't require logarithmic resolution approaching zero. Then linearizing them to a fixed point format is much faster, as was already mentioned.

I use the FloatCompressor (SSE version) in graphics library for reducing the size of linear float color values in memory. Compressed floats have the advantage of creating small look-up tables for time consuming functions, like gamma correction or transcendentals. Compressing linear sRGB values reduces to a max of 12 bits or a max value of 3011, which is great for a look-up table size for to/from sRGB.

Given your needs (-1000, 1000), perhaps it would be better to use a fixed-point representation.

//change to 20000 to SHORT_MAX if you don't mind whole numbers
//being turned into fractional ones
const int compact_range = 20000;

short compactFloat(double input) {
    return round(input * compact_range / 1000);
}
double expandToFloat(short input) {
    return ((double)input) * 1000 / compact_range;
}

This will give you accuracy to the nearest 0.05. If you change 20000 to SHORT_MAX you'll get a bit more accuracy but some whole numbers will end up as decimals on the other end.

Half to float:
float f = ((h&0x8000)<<16) | (((h&0x7c00)+0x1C000)<<13) | ((h&0x03FF)<<13);

Float to half:
uint32_t x = *((uint32_t*)&f);
uint16_t h = ((x>>16)&0x8000)|((((x&0x7f800000)-0x38000000)>>13)&0x7c00)|((x>>13)&0x03ff);

If you're sending a stream of information across, you could probably do better than this, especially if everything is in a consistent range, as your application seems to have.

Send a small header, that just consists of a float32 minimum and maximum, then you can send across your information as a 16 bit interpolation value between the two. As you also say that precision isn't much of an issue, you could even send 8bits at a time.

Your value would be something like, at reconstruction time:

float t = _t / numeric_limits<unsigned short>::max();  // With casting, naturally ;)
float val = h.min + t * (h.max - h.min);

Hope that helps.

-Tom

This question is already a bit old, but for the sake of completeness, you might also take a look at this paper for half-to-float and float-to-half conversion.

They use a branchless table-driven approach with relatively small look-up tables. It is completely IEEE-conformant and even beats Phernost's IEEE-conformant branchless conversion routines in performance (at least on my machine). But of course his code is much better suited to SSE and is not that prone to memory latency effects.

Most of the approaches described in the other answers here either do not round correctly on conversion from float to half, throw away subnormals which is a problem since 2**-14 becomes your smallest non-zero number, or do unfortunate things with Inf / NaN. Inf is also a problem because the largest finite number in half is a bit less than 2^16. OpenEXR was unnecessarily slow and complicated, last I looked at it. A fast correct approach will use the FPU to do the conversion, either as a direct instruction, or using the FPU rounding hardware to make the right thing happen. Any half to float conversion should be no slower than a 2^16 element lookup table.

The following are hard to beat:

On OS X / iOS, you can use vImageConvert_PlanarFtoPlanar16F and vImageConvert_Planar16FtoPlanarF. See Accelerate.framework.

Intel ivybridge added SSE instructions for this. See f16cintrin.h. Similar instructions were added to the ARM ISA for Neon. See vcvt_f32_f16 and vcvt_f16_f32 in arm_neon.h. On iOS you will need to use the arm64 or armv7s arch to get access to them.

This code converts a 32-bit floating point number to 16-bits and back.

#include <x86intrin.h>
#include <iostream>

int main()
{
    float f32;
    unsigned short f16;
    f32 = 3.14159265358979323846;
    f16 = _cvtss_sh(f32, 0);
    std::cout << f32 << std::endl;
    f32 = _cvtsh_ss(f16);
    std::cout << f32 << std::endl;
    return 0;
}

I tested with the Intel icpc 16.0.2:

$ icpc a.cpp

g++ 7.3.0:

$ g++ -march=native a.cpp

and clang++ 6.0.0:

$ clang++ -march=native a.cpp

It prints:

$ ./a.out
3.14159
3.14062

Documentation about these intrinsics is available at:

https://software.intel.com/en-us/node/524287

https://clang.llvm.org/doxygen/f16cintrin_8h.html

This conversion for 16-to-32-bit floating point is quite fast for cases where you do not have to account for infinities or NaNs, and can accept denormals-as-zero (DAZ). I.e. it is suitable for performance-sensitive calculations, but you should beware of division by zero if you expect to encounter denormals.

Note that this is most suitable for x86 or other platforms that have conditional moves or "set if" equivalents.

Strip the sign bit off the input
Align the most significant bit of the mantissa to the 22nd bit
Adjust the exponent bias
Set bits to all-zero if the input exponent is zero
Re-insert sign bit

The reverse applies for single-to-half-precision, with some additions.

void float32(float* __restrict out, const uint16_t in) {
    uint32_t t1;
    uint32_t t2;
    uint32_t t3;

    t1 = in & 0x7fff;                       // Non-sign bits
    t2 = in & 0x8000;                       // Sign bit
    t3 = in & 0x7c00;                       // Exponent

    t1 <<= 13;                              // Align mantissa on MSB
    t2 <<= 16;                              // Shift sign bit into position

    t1 += 0x38000000;                       // Adjust bias

    t1 = (t3 == 0 ? 0 : t1);                // Denormals-as-zero

    t1 |= t2;                               // Re-insert sign bit

    *((uint32_t*)out) = t1;
};

void float16(uint16_t* __restrict out, const float in) {
    uint32_t inu = *((uint32_t*)&in);
    uint32_t t1;
    uint32_t t2;
    uint32_t t3;

    t1 = inu & 0x7fffffff;                 // Non-sign bits
    t2 = inu & 0x80000000;                 // Sign bit
    t3 = inu & 0x7f800000;                 // Exponent

    t1 >>= 13;                             // Align mantissa on MSB
    t2 >>= 16;                             // Shift sign bit into position

    t1 -= 0x1c000;                         // Adjust bias

    t1 = (t3 > 0x38800000) ? 0 : t1;       // Flush-to-zero
    t1 = (t3 < 0x8e000000) ? 0x7bff : t1;  // Clamp-to-max
    t1 = (t3 == 0 ? 0 : t1);               // Denormals-as-zero

    t1 |= t2;                              // Re-insert sign bit

    *((uint16_t*)out) = t1;
};

Note that you can change the constant 0x7bff to 0x7c00 for it to overflow to infinity.

See GitHub for source code.

I have found an implementation of conversion from half-float to single-float format and back with using of AVX2. There are much more faster than software implementation of these algorithms. I hope it will be useful.

32-bit float to 16-bit float conversion:

#include <immintrin.h"

inline void Float32ToFloat16(const float * src, uint16_t * dst)
{
    _mm_storeu_si128((__m128i*)dst, _mm256_cvtps_ph(_mm256_loadu_ps(src), 0));
}

void Float32ToFloat16(const float * src, size_t size, uint16_t * dst)
{
    assert(size >= 8);

    size_t fullAlignedSize = size&~(32-1);
    size_t partialAlignedSize = size&~(8-1);

    size_t i = 0;
    for (; i < fullAlignedSize; i += 32)
    {
        Float32ToFloat16(src + i + 0, dst + i + 0);
        Float32ToFloat16(src + i + 8, dst + i + 8);
        Float32ToFloat16(src + i + 16, dst + i + 16);
        Float32ToFloat16(src + i + 24, dst + i + 24);
    }
    for (; i < partialAlignedSize; i += 8)
        Float32ToFloat16(src + i, dst + i);
    if(partialAlignedSize != size)
        Float32ToFloat16(src + size - 8, dst + size - 8);
}

16-bit float to 32-bit float conversion:

#include <immintrin.h"

inline void Float16ToFloat32(const uint16_t * src, float * dst)
{
    _mm256_storeu_ps(dst, _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)src)));
}

void Float16ToFloat32(const uint16_t * src, size_t size, float * dst)
{
    assert(size >= 8);

    size_t fullAlignedSize = size&~(32-1);
    size_t partialAlignedSize = size&~(8-1);

    size_t i = 0;
    for (; i < fullAlignedSize; i += 32)
    {
        Float16ToFloat32<align>(src + i + 0, dst + i + 0);
        Float16ToFloat32<align>(src + i + 8, dst + i + 8);
        Float16ToFloat32<align>(src + i + 16, dst + i + 16);
        Float16ToFloat32<align>(src + i + 24, dst + i + 24);
    }
    for (; i < partialAlignedSize; i += 8)
        Float16ToFloat32<align>(src + i, dst + i);
    if (partialAlignedSize != size)
        Float16ToFloat32<false>(src + size - 8, dst + size - 8);
}

The question is old and has already been answered, but I figured it would be worth mentioning an open source C++ library that can create 16bit IEEE compliant half precision floats and has a class that acts pretty much identically to the built in float type, but with 16 bits instead of 32. It is the "half" class of the OpenEXR library. The code is under a permissive BSD style license. I don't believe it has any dependencies outside of the standard library.

I had this same exact problem, and found this link very helpful. Just import the file "ieeehalfprecision.c" into your project and use it like this :

float myFloat = 1.24;
uint16_t resultInHalf;
singles2halfp(&resultInHalf, &myFloat, 1); // it accepts a series of floats, so use 1 to input 1 float

// an example to revert the half float back
float resultInSingle;
halfp2singles(&resultInSingle, &resultInHalf, 1);

I also change some code (See the comment by the author (James Tursa) in the link) :

#define INT16_TYPE int16_t 
#define UINT16_TYPE uint16_t 
#define INT32_TYPE int32_t 
#define UINT32_TYPE uint32_t

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow