Fast input/output in competitive programming

https://stackoverflow.com/questions/9749630

24-05-2021
|

Question

I have come across this particular snippet of code many times in solutions of competitive programming contests. I understand the basic use of this code to beat time limits but i want to understand it more deeply. I know that unistd.h gives access to system call wrapper functions such as fork, pipe and I/O primitives (read, write, ..).

It will also be great if anyone can explain or guide me to resources that can help me understand it further.

#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
class FastInput {
public:
    FastInput() {
        m_dataOffset = 0;
        m_dataSize = 0;
        m_v = 0x80000000;
    }
    uint32_t ReadNext() {
        if (m_dataOffset == m_dataSize) {
            int r = read(0, m_buffer, sizeof(m_buffer));
            if (r <= 0) return m_v;
            m_dataOffset = 0;
            m_dataSize = 0;
            int i = 0;
            if (m_buffer[0] < '0') {
                if (m_v != 0x80000000) {
                    m_data[m_dataSize++] = m_v;
                    m_v = 0x80000000;
                }
                for (; (i < r) && (m_buffer[i] < '0'); ++i);
            }
            for (; i < r;) {
                if (m_buffer[i] >= '0') {
                    m_v = m_v * 10 + m_buffer[i] - 48;
                    ++i;
                } else {
                    m_data[m_dataSize++] = m_v;
                    m_v = 0x80000000;
                    for (i = i + 1; (i < r) && (m_buffer[i] < '0'); ++i);
                }
            }
        }
        return m_data[m_dataOffset++];
    }
public:
    uint8_t m_buffer[32768];
    uint32_t m_data[16384];
    size_t m_dataOffset, m_dataSize;
    uint32_t m_v;
};
class FastOutput {
public:
    FastOutput() {
        m_dataOffset = 0;
    }
    ~FastOutput() {
    }
    void Flush() {
        if (m_dataOffset) {
            if (write(1, m_data, m_dataOffset));
            m_dataOffset = 0;
        }
    }
    void PrintUint(uint32_t v, char d) {
        if (m_dataOffset + 11 > sizeof(m_data)) Flush();
        if (v < 100000) {
            if (v < 1000) {
                if (v < 10) {
                    m_data[m_dataOffset + 0] = v + 48;
                    m_dataOffset += 1;
                } else if (v < 100) {
                    m_data[m_dataOffset + 1] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 0] = v + 48;
                    m_dataOffset += 2;
                } else {
                    m_data[m_dataOffset + 2] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 1] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 0] = v + 48;
                    m_dataOffset += 3;
                }
            } else {
                if (v < 10000) {
                    m_data[m_dataOffset + 3] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 2] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 1] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 0] = v + 48;
                    m_dataOffset += 4;
                } else {
                    m_data[m_dataOffset + 4] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 3] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 2] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 1] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 0] = v + 48;
                    m_dataOffset += 5;
                }
            }
        } else {
            if (v < 100000000) {
                if (v < 1000000) {
                    m_data[m_dataOffset + 5] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 4] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 3] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 2] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 1] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 0] = v + 48;
                    m_dataOffset += 6;
                } else if (v < 10000000) {
                    m_data[m_dataOffset + 6] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 5] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 4] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 3] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 2] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 1] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 0] = v + 48;
                    m_dataOffset += 7;
                } else {
                    m_data[m_dataOffset + 7] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 6] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 5] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 4] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 3] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 2] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 1] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 0] = v + 48;
                    m_dataOffset += 8;
                }
            } else {
                if (v < 1000000000) {
                    m_data[m_dataOffset + 8] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 7] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 6] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 5] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 4] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 3] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 2] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 1] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 0] = v + 48;
                    m_dataOffset += 9;
                } else {
                    m_data[m_dataOffset + 9] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 8] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 7] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 6] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 5] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 4] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 3] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 2] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 1] = v - v / 10 * 10 + 48;
                    v /= 10;
                    m_data[m_dataOffset + 0] = v + 48;
                    m_dataOffset += 10;
                }
            }
        }
        m_data[m_dataOffset++] = d;
    }
    void PrintChar(char d) {
        if (m_dataOffset + 1 > sizeof(m_data)) Flush();
        m_data[m_dataOffset++] = d;
    }
    void ReplaceChar(int offset, char d) {
        m_data[m_dataOffset + offset] = d;
    }
public:
    uint8_t m_data[32768];
    size_t m_dataOffset;
};

One more thing: Is it good practice to employ similar techniques in production level code?

Solution

Is it good practice to employ similar techniques in production level code?

No. Reimplementing the wheel leads to bugs. Bugs require extra development time and cost money.

can help me understand it further.

If you don't understand the code, the code is poorly written. Code is written by humans, and for humans. If another programmer doesn't understand code quickly, there may be a big problem. The rationale behind this thinking ("written for humans") is simple: development time costs a lot, and unreadable code increases development time.

The code fragment in question utilizes several bad coding practices:

Hungarian notation (there's no need for that in case-sensitive notation and especially in C++),
Short variable members (can you tell what m_v means without reading the rest of the program, for example?)
Hard-coded values (+ 48, + 11)
(subjective) Mixes signed/unsigned ints/chars (mingw/gcc will annoy the hell out of you while compiling).
Code copy-pasting (v /= 10 and similar - C++ has macros/templates, damn it, so if you want to unroll loop by hand, use them!).
Needlessly multi-level if/else.

Unless you want to become worse at programming, I'd advise to avoid trying to "understand" this code fragment. It is bad.

I seriously doubt that this particular design was a result of profiling. Most likely scenario some "genius" assumed that his code fragment will outperform built-in functions.

When you want performance, you follow this pattern:

Write initial version.
Repeat until performance gain is no longer worth it or until there's no solution:
1. Do not make many assumptions about what will improve performance. You're human, human's job is to make mistakes. By Murphy's law, your assumptions will be incorrect.
2. Consider algorithmic optimization first.
3. Run the code through profiler.
4. Locate bottlenecks.
5. Investigate total performance gain if total time spent in this particular routine will be reduced to zero.
6. If the gain is reasonable (time/cost), optimize the routine. Otherwise ignore.

OTHER TIPS

try this for faster I/O

ios_base::sync_with_stdio(false); cin.tie(NULL);

It sets whether the standard C++ streams are synchronized to the standard C streams after each input/output operation. By default, iostream objects and cstdio streams are synchronized.

In the PrintUint function, he's basically just unrolling a loop by hand. Unrolling loops are sometimes a good thing to do -- however the compiler already does it, and will do it better than you, most of the time.

To plug my favorite language feature, it would be better implemented using templates: a simple implementation (more clever probably exist) would look like:

// I'm sure the compiler can figure out the inline part, but I'll add it anyways
template<unsigned int N> 
inline void print_uint_inner(uint32_t v) {
    m_data[m_dataOffset + N] = v - v / 10 * 10 + 48;
    print_uint_inner<N-1>(v / 10);
}

// For situations just like this, there's a trick to avoid having to define the base case separately.
inline void print_uint_inner<0>(uint32_t v) {
    m_data[m_dataOffset] = v - v / 10 * 10 + 48;
}

template<unsigned int N>
inline void print_uint_helper(uint32_t v) {
    print_uint_inner<N-1>(v);
    m_dataOffset += N;
}

// We could generate the compile-time binary search with templates too, rather than by hand.
void PrintUint(uint32_t v, char d) {
    if (m_dataOffset + 11 > sizeof(m_data)) Flush();
    if (v < 100000) {
        if (v < 1000) {
            if (v < 10) {
                print_uint_helper<1>(v);
            } else if (v < 100) {
                print_uint_helper<2>(v);
            } else {
                print_uint_helper<3>(v);
            }
        } else {
            if (v < 10000) {
                print_uint_helper<4>(v);
            } else {
                print_uint_helper<5>(v);
            }
        }
    } else {
        if (v < 100000000) {
            if (v < 1000000) {
                print_uint_helper<6>(v);
            } else if (v < 10000000) {
                print_uint_helper<7>(v);
            } else {
                print_uint_helper<8>(v);
            }
        } else {
            if (v < 1000000000) {
                print_uint_helper<9>(v);
            } else {
                print_uint_helper<10>(v);
            }
        }
    }
    m_data[m_dataOffset++] = d;
}

Is doing things like this good coding practice in general? Yes, but only if all of the following criteria are satisfied:

You've already written the obvious, easy to understand, simple version.
You've profiled your program, so that you know this stretch of code is costing enough time to be worth the effort
You're willing to go through the extra work to ensure the more complex version is actually correct
You've profiled the revised program, so that you know the rewrite actually improved your run-time.

Also, you should probably retain the ability to switch back to the simple version, either using compile-time constants or pre-processor directives. This will be important for two reasons:

When you're debugging, the ability to switch back to the simple version will help to narrow down places where there could be problems
When you try running on a different computer (or even the same computer under different conditions), you may find the complicated version is no longer faster than the simple version.

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow