Question

I am currently researching strategies for implementing an efficient BitStream in pure C, which I need for various bit-based compression algorithms. However, I cannot find much literature on the topic, and there do not seem to be many good examples available.

Here is what I am looking for:

  • Mostly macro-based implementation to avoid function calls
  • Functions to read/write n bits to/from the BitStream.
  • Specialized functions to read/write a specific number of bits (e.g., 5 bits), optimized over the generic one.

I am wondering about the following:

  • Which variables should be maintained in the BitStream. There can be a BYTE pointer, a byte position, a bit index in the current byte, a number of bits left in the current byte, etc.
  • How to reduce the number of variables to maintain. The more variables we have, the more we need to update on every operation.
  • How to use as few intermediate/temporary variables as possible within a single read/write operation.
  • Whether operations should be done at the BYTE level, or at the UINT16 or UINT32 level. Accumulating bits into a UINT32 and writing out its bytes only when it is full (or when writing is done, via a flush operation) may be much faster than doing everything per byte.
  • How to avoid looping as much as possible. Ideally, looping over the number of bits to write into the BitStream should be avoided at all costs.
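To make the UINT32-accumulator idea concrete, this is roughly the kind of sketch I have in mind (all names here are mine and purely illustrative). The flush loop still iterates, but once per output byte rather than once per bit:

```c
#include <stdint.h>

typedef struct {
    uint8_t  *out;    /* next output byte */
    uint32_t  acc;    /* bit accumulator, newest bits in the low end */
    unsigned  nbits;  /* number of valid bits currently in acc */
} BitWriter;

/* Append the low 'count' bits of 'bits' (count <= 24, so the
 * accumulator cannot overflow before whole bytes are flushed). */
static void bw_put(BitWriter *bw, uint32_t bits, unsigned count)
{
    bw->acc = (bw->acc << count) | (bits & ((1u << count) - 1));
    bw->nbits += count;
    while (bw->nbits >= 8) {          /* flush whole bytes only */
        bw->nbits -= 8;
        *bw->out++ = (uint8_t)(bw->acc >> bw->nbits);
    }
}

/* Pad with zero bits and emit any final partial byte. */
static void bw_flush(BitWriter *bw)
{
    if (bw->nbits)
        *bw->out++ = (uint8_t)(bw->acc << (8 - bw->nbits));
    bw->nbits = 0;
}
```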

This may look like overkill, but when the rest of the compression code has been extremely optimized, the BitStream part ends up spoiling the whole thing. For instance, it is not rare to see assembly routines using SIMD CPU instructions in image compression code to optimize part of the encoding process, only for the last step to be writing to a BitStream.

Ideas, references, anyone? Thank you!

OTHER TIPS

For the record, here is the BitStream implementation I ended up with. It is mostly macro-based and uses a 32-bit accumulator. Bits are stored in the accumulator from the most significant bit to the least significant bit: for instance, to test whether the next bit is set, one would do (accumulator & 0x80000000). This makes it very practical to perform multiple tests without having to manipulate the BitStream much. For writing, I accumulate bits in the accumulator and automatically flush them to the output buffer when it is full; flushing can also be triggered manually once all bits have been written. Your mileage may vary, but I am quite satisfied with it. I have used it to implement Microsoft Point-to-Point Compression (MPPC), which uses Huffman coding, and performance is great.
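To illustrate the MSB-first convention before the actual code, here is a small self-contained reader sketch; the names are illustrative and deliberately distinct from the wBitStream API below:

```c
#include <stdint.h>

typedef struct {
    const uint8_t *in;   /* next input byte */
    const uint8_t *end;  /* one past the last byte */
    uint32_t acc;        /* accumulator, next bit is the MSB */
    unsigned nbits;      /* valid bits in acc, counted from the MSB */
} BitReader;

/* Top up the accumulator to 32 bits (past the end of the
 * input, zero bits are shifted in). */
static void br_refill(BitReader *br)
{
    while (br->nbits <= 24) {
        uint32_t byte = (br->in < br->end) ? *br->in++ : 0;
        br->acc |= byte << (24 - br->nbits);
        br->nbits += 8;
    }
}

/* Peek at the next bit without consuming it: the
 * (accumulator & 0x80000000) test described above. */
static int br_peek_bit(const BitReader *br)
{
    return (br->acc & 0x80000000u) != 0;
}

/* Consume 'count' bits (1 <= count <= 24) and return them. */
static uint32_t br_get(BitReader *br, unsigned count)
{
    uint32_t bits;
    br_refill(br);
    bits = br->acc >> (32 - count);
    br->acc <<= count;
    br->nbits -= count;
    return bits;
}
```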

struct _wBitStream
{
    BYTE* buffer;
    BYTE* pointer;
    DWORD position;
    DWORD length;
    DWORD capacity;
    UINT32 mask;
    UINT32 offset;
    UINT32 prefetch;
    UINT32 accumulator;
};
typedef struct _wBitStream wBitStream;

/* Note: every load/store below is bounds-checked against the end of the
 * buffer; accessing *(pointer + k) is only valid when
 * (pointer - buffer) + k < capacity. */
#define BitStream_Prefetch(_bs) do { \
    (_bs->prefetch) = 0; \
    if ((((UINT32) (_bs->pointer - _bs->buffer)) + 4) < (_bs->capacity)) \
        (_bs->prefetch) |= (*(_bs->pointer + 4) << 24); \
    if ((((UINT32) (_bs->pointer - _bs->buffer)) + 5) < (_bs->capacity)) \
        (_bs->prefetch) |= (*(_bs->pointer + 5) << 16); \
    if ((((UINT32) (_bs->pointer - _bs->buffer)) + 6) < (_bs->capacity)) \
        (_bs->prefetch) |= (*(_bs->pointer + 6) << 8); \
    if ((((UINT32) (_bs->pointer - _bs->buffer)) + 7) < (_bs->capacity)) \
        (_bs->prefetch) |= (*(_bs->pointer + 7) << 0); \
} while(0)

#define BitStream_Fetch(_bs) do { \
    (_bs->accumulator) = 0; \
    if ((((UINT32) (_bs->pointer - _bs->buffer)) + 0) < (_bs->capacity)) \
        (_bs->accumulator) |= (*(_bs->pointer + 0) << 24); \
    if ((((UINT32) (_bs->pointer - _bs->buffer)) + 1) < (_bs->capacity)) \
        (_bs->accumulator) |= (*(_bs->pointer + 1) << 16); \
    if ((((UINT32) (_bs->pointer - _bs->buffer)) + 2) < (_bs->capacity)) \
        (_bs->accumulator) |= (*(_bs->pointer + 2) << 8); \
    if ((((UINT32) (_bs->pointer - _bs->buffer)) + 3) < (_bs->capacity)) \
        (_bs->accumulator) |= (*(_bs->pointer + 3) << 0); \
    BitStream_Prefetch(_bs); \
} while(0)

#define BitStream_Flush(_bs) do { \
    if ((((UINT32) (_bs->pointer - _bs->buffer)) + 0) < (_bs->capacity)) \
        *(_bs->pointer + 0) = (_bs->accumulator >> 24); \
    if ((((UINT32) (_bs->pointer - _bs->buffer)) + 1) < (_bs->capacity)) \
        *(_bs->pointer + 1) = (_bs->accumulator >> 16); \
    if ((((UINT32) (_bs->pointer - _bs->buffer)) + 2) < (_bs->capacity)) \
        *(_bs->pointer + 2) = (_bs->accumulator >> 8); \
    if ((((UINT32) (_bs->pointer - _bs->buffer)) + 3) < (_bs->capacity)) \
        *(_bs->pointer + 3) = (_bs->accumulator >> 0); \
} while(0)

/* _nbits must be in the range 0..31 (a shift of 32 or more would be
 * undefined behavior on the 32-bit accumulator). */
#define BitStream_Shift(_bs, _nbits) do { \
    if ((_nbits) > 0) { \
        _bs->accumulator <<= _nbits; \
        _bs->position += _nbits; \
        _bs->offset += _nbits; \
        _bs->mask = ((1 << _nbits) - 1); \
        _bs->accumulator |= ((_bs->prefetch >> (32 - _nbits)) & _bs->mask); \
        _bs->prefetch <<= _nbits; \
        if (_bs->offset >= 32) { \
            _bs->offset -= 32; \
            _bs->pointer += 4; \
            BitStream_Prefetch(_bs); \
            if (_bs->offset) { \
                _bs->mask = ((1 << _bs->offset) - 1); \
                _bs->accumulator |= ((_bs->prefetch >> (32 - _bs->offset)) & _bs->mask); \
                _bs->prefetch <<= _bs->offset; \
            } \
        } \
    } \
} while(0)

#define BitStream_Write_Bits(_bs, _bits, _nbits) do { \
    _bs->position += _nbits; \
    _bs->offset += _nbits; \
    if (_bs->offset < 32) { \
        _bs->accumulator |= (_bits << (32 - _bs->offset)); \
    } else { \
        _bs->offset -= 32; \
        _bs->mask = ((1 << (_nbits - _bs->offset)) - 1); \
        _bs->accumulator |= ((_bits >> _bs->offset) & _bs->mask); \
        BitStream_Flush(_bs); \
        _bs->accumulator = 0; \
        _bs->pointer += 4; \
        if (_bs->offset) { \
            _bs->mask = ((1 << _bs->offset) - 1); \
            _bs->accumulator |= ((_bits & _bs->mask) << (32 - _bs->offset)); \
        } \
    } \
} while(0)

void BitStream_Attach(wBitStream* bs, BYTE* buffer, UINT32 capacity)
{
    bs->position = 0;
    bs->buffer = buffer;
    bs->offset = 0;
    bs->accumulator = 0;
    bs->pointer = bs->buffer;
    bs->capacity = capacity;
    bs->length = bs->capacity * 8;
}

wBitStream* BitStream_New(void)
{
    wBitStream* bs = NULL;

    bs = (wBitStream*) calloc(1, sizeof(wBitStream));

    return bs;
}

void BitStream_Free(wBitStream* bs)
{
    free(bs);
}

Hmm. Unfortunately, what I'm going to point you to is not necessarily an efficient implementation, and I hope that somebody posts a better answer (I haven't examined ffmpeg as Multimedia Mike suggested). But I can see enough holes in your understanding for me to think it's worthwhile to post this as an answer.

First off, macros aren't necessary for good performance! That is a highly antiquated approach unless you are coding for very old or immature compilers. An inline function is just as efficient as a macro (and, with some caveats, can be more efficient). With the judicious use of static functions, the compiler can decide what should be inlined and what shouldn't. While gcc may not be an excellent compiler, it excels at determining when a value is constant, even pointers! This also eliminates the need for constants to be put in macros. That is, this:

#define UINT_BIT_SIZE (sizeof(uint) * 8)

is exactly as efficient as

static const size_t UINT_BIT_SIZE = sizeof(uint) * 8;

except that the latter has a type. For recent versions of gcc (the last 4 years or so), you don't even need const for it to be treated as a compile-time constant: as long as the variable is static (or local) and its value isn't modified by any code in the compilation unit, gcc treats it as a compile-time constant.
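To make the comparison concrete, here is a toy bit-push helper written both ways; the names are purely illustrative, and any modern optimizer generates the same code for both:

```c
#include <stdint.h>

#define UINT_BIT_SIZE (sizeof(unsigned int) * 8)

/* Macro version: no type checking, and arguments may be
 * evaluated more than once. */
#define ACC_PUSH_BIT(_acc, _n, _bit) do { \
    (_acc) = ((_acc) << 1) | ((_bit) & 1u); \
    (_n)++; \
} while (0)

/* Inline version: same generated code under optimization,
 * but typed, debuggable, and safe with side-effecting arguments. */
static inline void acc_push_bit(uint32_t *acc, unsigned *n, uint32_t bit)
{
    *acc = (*acc << 1) | (bit & 1u);
    (*n)++;
}
```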

The advent of CPU cache has radically altered what makes a piece of code fast or (comparatively) slow. If the hot portion of your code does not fit into the upper cache (L1 / L2) then your inlining and/or macros end up slowing you down, especially if you have to hit main memory. Likewise, touching data in multiple places at once results in a lot of cache misses.

Having said that, I wrote "Bit Creek", a minor implementation for a non-performance-critical part of a Linux device driver ("creek" as in "not quite a stream"). The struct bit_creek represents a segment of memory that you treat as a stream of bits, and the functions creek_get_bits and creek_put_bits read from or write to it as a stream, returning -EOVERFLOW if you overrun your buffer.

I could probably squeeze out more performance if I stored skip_bits in the struct and, as you suggested, didn't write a byte to main memory until it was full, but performance was not at all critical here.
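The Bit Creek source itself is not reproduced here; the following is only a rough userspace sketch of the improved byte-buffered variant alluded to above, with illustrative names and layout (only the -EOVERFLOW convention is taken from the description):

```c
#include <errno.h>
#include <stddef.h>
#include <stdint.h>

typedef struct {
    uint8_t *buf;
    size_t   capacity;   /* buffer size in bytes */
    size_t   pos;        /* next byte index */
    uint8_t  byte;       /* partial output byte being assembled */
    unsigned skip_bits;  /* bits already filled in 'byte', MSB-first */
} BitCreek;

/* Append 'count' (<= 32) bits, MSB-first; the partial byte stays in
 * the struct until it is full, so main memory is touched only once
 * per byte. Returns 0, or -EOVERFLOW on buffer overrun. */
static int creek_put_bits(BitCreek *c, uint32_t bits, unsigned count)
{
    while (count--) {
        unsigned bit = (bits >> count) & 1u;
        c->byte = (uint8_t)((c->byte << 1) | bit);
        if (++c->skip_bits == 8) {
            if (c->pos >= c->capacity)
                return -EOVERFLOW;
            c->buf[c->pos++] = c->byte;
            c->byte = 0;
            c->skip_bits = 0;
        }
    }
    return 0;
}
```

This still loops per bit, matching the "not performance-critical" spirit of the original; the accumulator approaches above avoid that loop.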

Good luck!

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow