Implementation of Bi-Cubic resize

Question

Aside from mixing floating point and integer arithmetic, I suspect you're getting numeric overflow / underflow with some of your intermediate values.

One easy fix is to just be consistent and use floating point throughout. Right now you have:

unsigned char C[5] = { 0 };

for (unsigned i = 0; i < dest_height; ++i)
{
    for (unsigned j = 0; j < dest_width; ++j)
    {
        const int x = int(tx * j);
        const int y = int(ty * i);
        const double dx = tx * j - x;
        const double dy = ty * i - y;

        for (int k = 0; k < 3; ++k)
        {
            for (int jj = 0; jj < 4; ++jj)
            {
                const int idx = y - 1 + jj;
                unsigned char a0 = get_subpixel(bmap, idx, x, k);
                unsigned char d0 = get_subpixel(bmap, idx, x - 1, k) - a0;
                unsigned char d2 = get_subpixel(bmap, idx, x + 1, k) - a0;
                unsigned char d3 = get_subpixel(bmap, idx, x + 2, k) - a0;
                unsigned char a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
                unsigned char a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
                unsigned char a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
                C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;

                d0 = C[0] - C[1];
                d2 = C[2] - C[1];
                d3 = C[3] - C[1];
                a0 = C[1];
                a1 = -1.0 / 3 * d0 + d2 -1.0 / 6 * d3;
                a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
                a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
                out[i * row_stride + j * channels + k] = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
            }
        }
    }
}

You have a mixture of unsigned char, int and double. Each one of those 1.0 / 3 converts your 8-bit data to double-precision float, and then the assignment truncates it back.

Instead, why not just use float throughout?

float C[5] = { 0 };

for (unsigned i = 0; i < dest_height; ++i)
{
    for (unsigned j = 0; j < dest_width; ++j)
    {
        const float x = float(tx * j);
        const float y = float(ty * i);
        const float dx = tx * j - x;
        const float dy = ty * i - y;

        for (int k = 0; k < 3; ++k)
        {
            for (int jj = 0; jj < 4; ++jj)
            {
                const int idx = y - 1 + jj;
                float a0 = get_subpixel(bmap, idx, x, k);
                float d0 = get_subpixel(bmap, idx, x - 1, k) - a0;
                float d2 = get_subpixel(bmap, idx, x + 1, k) - a0;
                float d3 = get_subpixel(bmap, idx, x + 2, k) - a0;
                float a1 = -(1.0f / 3.0f) * d0 + d2 - (1.0f / 6.0f) * d3;
                float a2 =          0.5f  * d0 +              0.5f *  d2;
                float a3 = -(1.0f / 6.0f) * d0 - 0.5f * d2 + (1.0f / 6.0f) * d3;
                C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;

                d0 = C[0] - C[1];
                d2 = C[2] - C[1];
                d3 = C[3] - C[1];
                a0 = C[1];
                a1 = -(1.0f / 3.0f) * d0 + d2 -(1.0f / 6.0f) * d3;
                a2 =          0.5f  * d0 +             0.5f  * d2;
                a3 = -(1.0f / 6.0f) * d0 - 0.5f * d2 + (1.0f / 6.0f) * d3;
                out[i * row_stride + j * channels + k] = saturate( a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy );
            }
        }
    }
}

Then define a function saturate that does this:

inline unsigned char saturate( float x )
{
    return x > 255.0f ? 255
         : x < 0.0f   ? 0
         :              unsigned char(x);
}

That will fix your overflow issues, and give you better precision and likely better performance too.

If you need to further improve the performance, then you should look into fixed point arithmetic. But for now, I think the above implementation is better.

Also, one other thought: You should be able to get some further efficiencies by precomputing dx * dx, dx * dx * dx, and so on:

float C[5] = { 0 };

for (unsigned i = 0; i < dest_height; ++i)
{
    for (unsigned j = 0; j < dest_width; ++j)
    {
        const float x = float(tx * j);
        const float y = float(ty * i);
        const float dx = tx * j - x, dx2 = dx * dx, dx3 = dx2 * dx;
        const float dy = ty * i - y, dy2 = dy * dy, dy3 = dy2 * dy;

        for (int k = 0; k < 3; ++k)
        {
            for (int jj = 0; jj < 4; ++jj)
            {
                const int idx = y - 1 + jj;
                float a0 = get_subpixel(bmap, idx, x, k);
                float d0 = get_subpixel(bmap, idx, x - 1, k) - a0;
                float d2 = get_subpixel(bmap, idx, x + 1, k) - a0;
                float d3 = get_subpixel(bmap, idx, x + 2, k) - a0;
                float a1 = -(1.0f / 3.0f) * d0 + d2 - (1.0f / 6.0f) * d3;
                float a2 =          0.5f  * d0 +              0.5f *  d2;
                float a3 = -(1.0f / 6.0f) * d0 - 0.5f * d2 + (1.0f / 6.0f) * d3;
                C[jj] = a0 + a1 * dx + a2 * dx2 + a3 * dx3;

                d0 = C[0] - C[1];
                d2 = C[2] - C[1];
                d3 = C[3] - C[1];
                a0 = C[1];
                a1 = -(1.0f / 3.0f) * d0 + d2 -(1.0f / 6.0f) * d3;
                a2 =          0.5f  * d0 +             0.5f  * d2;
                a3 = -(1.0f / 6.0f) * d0 - 0.5f * d2 + (1.0f / 6.0f) * d3;
                out[i * row_stride + j * channels + k] = saturate( a0 + a1 * dy + a2 * dy2 + a3 * dy3 );
            }
        }
    }
}