Question

(All snippets assume `using namespace std;`.)

Consider a traditional OOP approach to entity/object management:

struct Entity { bool alive{true}; };

struct Manager {        
    vector<unique_ptr<Entity>> entities; // Non cache-friendly

    void update() {
        // erase-remove_if idiom: remove all !alive entities
        entities.erase(remove_if(begin(entities), end(entities),
            [](const unique_ptr<Entity>& e){ return !e->alive; }),
            end(entities));
    }
};

struct UserObject {
    // Even if Manager::entities contents are re-ordered
    // this reference is still valid (if the entity was not deleted)
    Entity& entity;
};

However, I would like to try a data-oriented approach: not dynamically allocating Entity instances, but storing them in cache-friendly linear memory.

struct Manager {
    vector<Entity> entities; // Cache-friendly
    void update() { /* erase-remove_if !alive entities */ }
};

struct UserObject {
    // This reference may unexpectedly become invalid
    Entity& entity;
};

Seems nice. But... if std::vector needs to reallocate its internal array, all references to the entities will become invalid.

The solution is to use a handle class.

struct Entity { bool alive{true}; };
struct EntityHandle { int index; };

struct Manager {
    vector<Entity> entities; // Cache-friendly      
    void update() { /* erase-remove_if !alive entities */ }
    Entity& getEntity(EntityHandle h) { return entities[h.index]; }
};

struct UserObject { EntityHandle entity; };

If I'm only adding/removing entities at the back of the vector, it seems to work. I can use the getEntity method to retrieve the entity I want.

But what if I remove an Entity from the middle of the vector? All EntityHandle instances will now hold the incorrect index, since everything was shifted. Example:


The handle points to index: 2

Diagram 1


Entity A gets removed during update()

Now the handle points to the wrong entity.

Diagram 2


How is this problem usually dealt with?

Are the handle indices updated?

Is the dead entity replaced with a placeholder?


To clarify:

This and this are examples of what I mean by cache-friendly design.

Also, component systems such as Artemis claim to be in a linear cache-friendly design, and they use solutions similar to handles. How do they deal with the problem I describe in this question?


Solution

There's a great PowerPoint by Insomniac; their solution was something like this:

template<typename T, size_t SIZE>
class ResourceManager
{
    T data[SIZE];
    int indices[SIZE];
    size_t back;

public:
    ResourceManager() : back(0)
    {
        for(size_t i = 0; i < SIZE; i++)
            indices[i] = static_cast<int>(i);
    }

    int Reserve()
    { return indices[back++]; }

    void Release(int handle)
    {
        for(size_t i = 0; i < back; i++)
        {
            if(indices[i] == handle)
            {
                back--;
                std::swap(indices[i], indices[back]);
                return;
            }
        }
    }

    T& GetData(int handle)
    { return data[handle]; }
};

I hope this example demonstrates the idea plainly.
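As a self-contained variation of the same reserve/release idea (the names and the `std::array` storage are mine, not from the slides), with handles that index into storage that never moves:

```cpp
#include <array>
#include <cstddef>
#include <utility>

// Minimal index pool: hands out stable handles into a fixed array.
template <typename T, std::size_t SIZE>
class IndexPool
{
public:
    IndexPool()
    {
        for (std::size_t i = 0; i < SIZE; ++i)
            indices[i] = static_cast<int>(i);
    }

    // Returns a handle (an index into data) valid until released.
    int reserve() { return indices[back++]; }

    // Swaps the released handle to the back of the bookkeeping
    // array so it can be handed out again.
    void release(int handle)
    {
        for (std::size_t i = 0; i < back; ++i)
        {
            if (indices[i] == handle)
            {
                --back;
                std::swap(indices[i], indices[back]);
                return;
            }
        }
    }

    T& get(int handle) { return data[handle]; }
    std::size_t liveCount() const { return back; }

private:
    std::array<T, SIZE> data{};
    std::array<int, SIZE> indices{};
    std::size_t back = 0;
};
```

Releasing a handle only shuffles the `indices` bookkeeping array; no element of `data` moves, so outstanding handles stay valid.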

OTHER TIPS

If you need stable indices or pointers, then your data structure requirements start to resemble those of a memory allocator. A memory allocator is itself a particular kind of data structure, but one that cannot shuffle memory around or reallocate, since that would invalidate the pointers held by the client. So I recommend looking at memory allocator implementations, starting with the classic free list.

Free List

Here's a simple C implementation I wrote to illustrate the idea to colleagues (doesn't bother with thread syncs):

typedef struct FreeList FreeList;

struct FreeList
{
    /// Stores a pointer to the first block in the free list.
    struct FlBlock* first_block;

    /// Stores a pointer to the first free chunk.
    struct FlNode* first_node;

    /// Stores the size of a chunk.
    int type_size;

    /// Stores the number of elements in a block.
    int block_num;
};

/// @return A free list allocator using the specified type and block size, 
/// both specified in bytes.
FreeList fl_create(int type_size, int block_size);

/// Destroys the free list allocator.
void fl_destroy(FreeList* fl);

/// @return A pointer to a newly allocated chunk.
void* fl_malloc(FreeList* fl);

/// Frees the specified chunk.
void fl_free(FreeList* fl, void* mem);

// Implementation (uses malloc/free from <stdlib.h>):
#include <stdlib.h>
typedef struct FlNode FlNode;
typedef struct FlBlock FlBlock;
typedef long long FlAlignType;

struct FlNode
{
    // Stores a pointer to the next free chunk.
    FlNode* next;
};

struct FlBlock
{
    // Stores a pointer to the next block in the list.
    FlBlock* next;

    // Stores the memory for each chunk (variable-length struct).
    FlAlignType mem[1];
};

static void* mem_offset(void* ptr, int n)
{
    // Returns the memory address of the pointer offset by 'n' bytes.
    char* mem = ptr;
    return mem + n;
}

FreeList fl_create(int type_size, int block_size)
{
    // Initialize the free list.
    FreeList fl;
    fl.type_size = type_size >= (int)sizeof(FlNode) ? type_size : (int)sizeof(FlNode);
    fl.block_num = block_size / fl.type_size;
    fl.first_node = 0;
    fl.first_block = 0;
    if (fl.block_num == 0)
        fl.block_num = 1;
    return fl;
}

void fl_destroy(FreeList* fl)
{
    // Free each block in the list, popping a block until the stack is empty.
    while (fl->first_block)
    {
        FlBlock* block = fl->first_block;
        fl->first_block = block->next;
        free(block);
    }
    fl->first_node = 0;
}

void* fl_malloc(FreeList* fl)
{
    // Common case: just pop free element and return.
    FlNode* node = fl->first_node;
    if (node)
    {
        void* mem = node;
        fl->first_node = node->next;
        return mem;
    }
    else
    {
        // Rare case when we're out of free elements.
        // Try to allocate a new block.
        const int block_header_size = sizeof(FlBlock) - sizeof(FlAlignType);
        const int block_size = block_header_size + fl->type_size*fl->block_num;
        FlBlock* new_block = malloc(block_size);

        if (new_block)
        {
            // If the allocation succeeded, initialize the block.
            int j = 0;
            new_block->next = fl->first_block;
            fl->first_block = new_block;

            // Push all but the first chunk in the block to the free list.
            for (j=1; j < fl->block_num; ++j)
            {
                FlNode* node = mem_offset(new_block->mem, j * fl->type_size);
                node->next = fl->first_node;
                fl->first_node = node;
            }

            // Return a pointer to the first chunk in the block.
            return new_block->mem;
        }

        // If we failed to allocate the new block, return null to indicate failure.
        return 0;
    }
}

void fl_free(FreeList* fl, void* mem)
{
    // Just push a free element to the stack.
    FlNode* node = mem;
    node->next = fl->first_node;
    fl->first_node = node;
}
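To show the core invariant in a compact, self-contained form, here is a C++ sketch of the same principle (my own illustrative code, suitable for trivially-copyable types only): free chunks store the free-list link in-place, and freeing one chunk never moves any other.

```cpp
#include <cstddef>

// Fixed-capacity free list over an embedded buffer. Each free chunk
// reuses its own storage to hold the next-free pointer, so the
// bookkeeping costs no extra memory (the same trick as FlNode above).
template <typename T, std::size_t N>
class FixedFreeList
{
    union Chunk
    {
        T value;
        Chunk* next; // valid only while the chunk is free
    };
    Chunk chunks[N];
    Chunk* firstFree;

public:
    FixedFreeList() : firstFree(chunks)
    {
        // Thread every chunk onto the free stack.
        for (std::size_t i = 0; i + 1 < N; ++i)
            chunks[i].next = &chunks[i + 1];
        chunks[N - 1].next = nullptr;
    }

    T* allocate()
    {
        if (!firstFree) return nullptr; // out of chunks
        Chunk* c = firstFree;
        firstFree = c->next;            // pop the free stack
        return &c->value;
    }

    void release(T* p)
    {
        Chunk* c = reinterpret_cast<Chunk*>(p);
        c->next = firstFree;            // push back onto the free stack
        firstFree = c;
    }
};
```

Because `release` only relinks, every other live chunk keeps its address, which is exactly the stability guarantee the client needs.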

Random-Access Sequence, Nested Free Lists

With the free list idea understood, one possible solution is this:

*(diagram not preserved in this mirror)*

This type of data structure will give you stable pointers that don't invalidate, not just stable indices. However, it does increase the cost of random access, as well as of sequential access if you want to use an iterator for it. It can do sequential access on par with vector using something like a for_each method.

The idea is to use the free-list concept above, except each block stores a free list of its own, and the outer data structure aggregating the blocks stores a free list of blocks. A block is only popped off the free stack when it becomes completely full.

Parallel Occupancy Bits

Another option is to use a parallel array of bits to indicate which parts of an array are occupied/vacant. The benefit here is that, during sequential iteration, you can check whether many indices are occupied at once (64 bits at a time, at which point you can access all 64 contiguous elements in a loop without individually checking whether each is occupied). When not all 64 indices are occupied, you can use FFS (find-first-set) instructions to quickly determine which bits are set.
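As an illustration of that iteration pattern (my own sketch; `__builtin_ctzll` is the GCC/Clang find-first-set intrinsic):

```cpp
#include <cstdint>
#include <vector>

// Visit only the occupied slots of an array using a parallel bit mask.
// One 64-bit word covers 64 slots: a word of all ones lets us process
// the whole run without per-element checks; otherwise we peel off
// set bits one at a time with a bit-scan.
template <typename T, typename Fn>
void forEachOccupied(const std::vector<T>& data,
                     const std::vector<std::uint64_t>& occupied, Fn fn)
{
    for (std::size_t w = 0; w < occupied.size(); ++w)
    {
        std::uint64_t bits = occupied[w];
        if (bits == ~std::uint64_t(0))
        {
            // Fast path: all 64 slots occupied, no per-bit tests.
            for (std::size_t i = 0; i < 64; ++i)
                fn(data[w * 64 + i]);
        }
        else
        {
            while (bits)
            {
                std::size_t i = __builtin_ctzll(bits); // lowest set bit
                fn(data[w * 64 + i]);
                bits &= bits - 1;                      // clear that bit
            }
        }
    }
}
```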

You can combine this with the free list to then use the bits to quickly determine what indices are occupied during iteration while having rapid constant-time insertion and removal.

You can actually get faster sequential access this way than with a std::vector plus a side list of indices/pointers since, again, we can do things like checking 64 bits at once to see which elements to traverse inside the data structure, and because the access pattern is always sequential (similar to using a sorted list of indices into the array).

All of these concepts revolve around leaving vacant spaces in an array to reclaim upon subsequent insertions which becomes a practical requirement if you don't want indices or pointers to be invalidated to elements that haven't been removed from the container.

Singly-Linked Index List

Another solution is to use a singly-linked list which most people might think of as involving a separate heap allocation per node and cache misses galore on traversal, but that doesn't have to be the case. We can just store the nodes contiguously in an array and link them together. A world of optimization opportunities actually opens up if you don't think of a linked list as a container so much as a way to just link existing elements together stored in another container, like an array, to allow different traversal and search patterns. Example with everything just stored in a contiguous array with indices to link them together:

*(diagram not preserved in this mirror)*

With data stored like this:

template <class T>
struct Bucket
{
    struct Node
    {
         // Stores the element data.
         T some_data;

         // Points to either the next node in the bucket
         // or the next free node available if this node
         // has been removed.
         int next;
    };
    vector<Node> data;

    // Points to first node in the bucket.
    int head;

    // Points to first free node in the bucket.
    int free_head;
};

This does not allow random access and its spatial locality does degrade if you remove from the middle and insert often. But it's easy enough to restore it with a post-processing copy. It can be suitable if you only need sequential access and want constant-time removal and insertion. If you need stable pointers and not just indices, then you might use the above structure with the nested free list.
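A minimal self-contained sketch of such an index-linked list (my own illustrative code, using -1 as the null index):

```cpp
#include <vector>

// Singly-linked list whose nodes live contiguously in a vector and
// link by index; -1 plays the role of a null pointer. Removed nodes
// are pushed onto an internal free list of indices for reuse.
struct IndexList
{
    struct Node { int value; int next; };
    std::vector<Node> nodes;
    int head = -1;
    int freeHead = -1;

    // Push a value at the front, reusing a free slot if one exists.
    int pushFront(int value)
    {
        int idx;
        if (freeHead != -1)
        {
            idx = freeHead;
            freeHead = nodes[idx].next;
            nodes[idx] = {value, head};
        }
        else
        {
            idx = static_cast<int>(nodes.size());
            nodes.push_back({value, head});
        }
        head = idx;
        return idx;
    }

    // Unlink the node after 'prev' (or the head if prev == -1) and
    // donate its slot to the free list. Constant time.
    void removeAfter(int prev)
    {
        int victim = (prev == -1) ? head : nodes[prev].next;
        int after = nodes[victim].next;
        if (prev == -1) head = after;
        else            nodes[prev].next = after;
        nodes[victim].next = freeHead;
        freeHead = victim;
    }
};
```

Note that removal and insertion only rewrite a couple of integers; no element is moved, which is exactly the property exploited by the particle-grid example below.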

The indexed SLL tends to do quite well when you have a lot of small lists that are very dynamic (constant removals and insertions). Another example with particles stored contiguously but the 32-bit index links just being used to partition them into a grid for rapid collision detection while allowing the particles to move every single frame and only having to change a couple of integers to transfer a particle from one grid cell to another:

*(diagram not preserved in this mirror)*

In this case you can store a 1000x1000 grid in under 4 megabytes -- definitely beats storing a million instances of std::list or std::vector and having to constantly remove and insert from/to them as particles move around.

Occupancy Indices

Another simple solution, if you only need stable indices, is to just use, say, a std::vector with a std::stack<int> of free indices to reclaim/overwrite on insertions. That follows the free-list principle of constant-time removal, but is a tiny bit less efficient since it requires extra memory to store the stack of free indices; the intrusive free list makes that stack come free of charge.

However, unless you hand-roll it rather than just using std::vector<T>, you can't very effectively make it trigger the destructor of the stored element type on removal (I haven't been keeping up with C++, more of a C programmer these days, but there might be a way to do this nicely that still respects your element destructors without hand-rolling your own equivalent of std::vector; maybe a C++ expert could pitch in). That can be fine, though, if your types are trivial POD types.

template <class T>
class ArrayWithHoles
{
private:
    std::vector<T> elements;
    std::stack<size_t> free_stack;

public:
    ...

    size_t insert(const T& element)
    {
        if (free_stack.empty())
        {
            elements.push_back(element);
            return elements.size() - 1;
        }
        else
        {
            const size_t index = free_stack.top();
            free_stack.pop();
            elements[index] = element;
            return index;
        }
    }

    void erase(size_t n)
    {
        free_stack.push(n);
    }
};

Something to this effect. That leaves us with a dilemma though in that we can't tell what elements have been removed from the container to skip over during iteration. Here again you can use parallel bit arrays or you can also just store a list of valid indices on the side.

If you do that, the list of valid indices can degrade in terms of memory access patterns into the array as the indices become unsorted over time. A quick way to repair that is to radix sort the indices from time to time, at which point you've restored the sequential access pattern.

If you have really measured that cache locality provides benefits for you, then I'd suggest a memory-pooling approach: at the most basic level, if you know the maximum number of elements up front, you can simply create three vectors: one with the objects, one with pointers to active objects, and one with pointers to free objects. Initially the free list holds pointers to all the objects in the elements container; items are then moved to the active list as they become active, and back to the free list as they are deleted.

The objects never change location even as pointers are added/removed from the respective containers, so your references never become invalidated.

To change out the referenced vector entities on the fly, modify your design to store indices in the UserObject instead of direct pointers. This way you can swap out the referenced vector, copy the old values over, and everything will still work. Cache-wise, the cost of indexing off a single base pointer is negligible, and instruction-wise it's the same.

To deal with deletes, either ignore them (if you know there is a fixed amount of them) or maintain a free list of indices. Use this freelist when adding items, and then only increase the vector when the freelist is empty.
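A compressed sketch of that recipe (my own names; ints stand in for the real objects), close in spirit to the ArrayWithHoles example earlier on this page:

```cpp
#include <cstddef>
#include <vector>

// UserObjects hold indices; deletes go onto a free list; the vector
// only grows when the free list is empty. Surviving indices are
// therefore never invalidated by a removal.
struct Pool
{
    std::vector<int> slots;            // element storage
    std::vector<std::size_t> freeList; // indices available for reuse

    std::size_t add(int value)
    {
        if (!freeList.empty())
        {
            std::size_t idx = freeList.back();
            freeList.pop_back();
            slots[idx] = value;        // reuse a hole; nothing moves
            return idx;
        }
        slots.push_back(value);        // grow only when no holes remain
        return slots.size() - 1;
    }

    void remove(std::size_t idx) { freeList.push_back(idx); }
};
```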

I will focus on the case that you require variable size for your vector, e.g. the data is often inserted and sometimes cleaned up. In this case, using dummy data or holes in your vector is nearly as "bad" as using heap data like your first solution.

If you often iterate directly over all data, and use only a few random "UserObject" accesses, then the below might be a solution. It uses, as proposed by others and yourself, a level of indirection which needs to be updated on each delete/update step. This takes linear time and is definitely not cache optimal. Additionally, and in my opinion even worse, such a solution cannot be made thread-safe without locks.

#include <vector>
#include <map>
#include <algorithm>
#include <iostream>
#include <mutex>

using namespace std;

typedef long long EntityId; // __int64 is MSVC-only

template<class Entity>
struct Manager {
    vector<Entity>          m_entities; // Cache-friendly
    map<EntityId, size_t>   m_id_to_idx;
    size_t                  m_remove_counter;
    mutex                   g_pages_mutex;
public:
    Manager() :
        m_entities(),
        m_id_to_idx(),
        m_remove_counter(0),
        g_pages_mutex()
    {}
    void update()
    {
        lock_guard<mutex> lock(g_pages_mutex);
        m_remove_counter = 0;
        // erase-remove idiom: remove all !alive entities,
        // shifting surviving map entries down as we go

        for (typename vector<Entity>::iterator i = m_entities.begin(); i != m_entities.end(); )
        {
            Entity &e = (*i);
            if (!e.m_alive)
            {
                m_id_to_idx.erase(m_id_to_idx.find(e.m_id));
                i = m_entities.erase(i);
                m_remove_counter++;
            }
            else
            {
                m_id_to_idx[e.m_id] -= m_remove_counter;
                i++;
            }
        }
    }
    Entity& getEntity(EntityId h)
    {
        lock_guard<mutex> lock(g_pages_mutex);
        typename map<EntityId, size_t>::const_iterator it = m_id_to_idx.find(h);

        if (it != m_id_to_idx.end())
        {
            return m_entities[(*it).second];
        }
        else
        {
            throw std::exception();
        }
    }
    EntityId insertEntity(const Entity& entity)
    {
        lock_guard<mutex> lock(g_pages_mutex);
        size_t idx = m_entities.size();
        m_id_to_idx[entity.m_id] = idx;
        m_entities.push_back(entity);
        return entity.m_id;
    }
};

class Entity {
    static EntityId s_unique_entity_id;
public:
    Entity(bool alive) : m_id(s_unique_entity_id++), m_alive(alive) {}
    Entity() : m_id(s_unique_entity_id++), m_alive(true) {}
    Entity(const Entity& in) : m_id(in.m_id), m_alive(in.m_alive) {}
    EntityId m_id;
    bool m_alive;
};

EntityId Entity::s_unique_entity_id = 0;

struct UserObject
{
    UserObject(bool alive, Manager<Entity>& manager) :
        entity(manager.insertEntity(alive))
    {}
    EntityId entity;
};

int main(int argc, char* argv[])
{
    Manager<Entity> manager;
    UserObject obj1(true, manager);
    UserObject obj2(false, manager);
    UserObject obj3(true, manager);
    cout << obj1.entity << "," << obj2.entity << "," << obj3.entity;
    manager.update();
    manager.getEntity(obj1.entity);
    manager.getEntity(obj3.entity);
    try
    {
        manager.getEntity(obj2.entity);
        return -1;
    }
    catch (const std::exception&)
    {
        // obj2 should be invalid
    }
    return 0;
}

I am not sure you have specified enough side conditions, because you are trying to solve your problem under two contradictory requirements: a fast-to-iterate list and stable references to its elements. These sound to me like two use cases which should also be separated at the data level (e.g. copy on read, commit changes back).

Let's review your phrase

cache-friendly linear memory.

What is the requirement for 'linear'? If you really have such requirement then please refer to answers of @seano and @Mark B. If you don't care about linear memory then here we go.

std::map, std::set, and std::list provide iterators that are stable under container modification; this means that instead of keeping a reference you can keep an iterator:

struct UserObject {
    // This iterator stays valid even as other
    // elements are inserted or erased
    my_container_t::iterator entity;
};
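For instance (my own minimal example), an iterator into a std::list survives erasure and insertion of other elements:

```cpp
#include <iterator>
#include <list>

// Demonstrates the iterator-stability guarantee of std::list:
// erasing one element leaves iterators to all other elements
// valid (unlike std::vector, which may reallocate).
std::list<int>::iterator demoStableIterator(std::list<int>& entities)
{
    entities = {10, 20, 30};
    auto handle = std::next(entities.begin()); // "handle" to the 20
    entities.erase(entities.begin());          // remove the 10
    entities.push_back(40);                    // grow the list
    return handle;                             // still points at 20
}
```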

A special note on std::list: in a talk on http://isocpp.org/ Bjarne Stroustrup recommended against using linked lists, but in your case you can be sure the Entity inside the Manager will be safe from modification, so a reference is applicable there.

P.S. From a quick search: std::unordered_map does not belong on that list, since a rehash invalidates its iterators (although pointers and references to the elements remain valid).

P.P.S. After posting I recalled an interesting data structure: the chunked list. It is a linked list of linear arrays, so you keep fixed-size linear chunks in linked order.

I have two approaches in mind. The first is to update your handles when erasing an entity from the container, via the observer pattern (http://www.codeproject.com/Articles/328365/Understanding-and-Implementing-Observer-Pattern-in). The second is to use a key/value container such as a map or hash table, where your handle contains a key instead of an index.

edit:

first solution example

class Entity
{
public:
    bool alive{true};
    int index; // each entity knows its own index (kept up to date by Manager)
};

class Manager; // forward declaration

class EntityHandle
{
public:
    EntityHandle(Manager *manager);
    ~EntityHandle();

    void update(int removedIndex)
    {
        // Shift this handle down if an entity before it was removed.
        if(removedIndex < index)
        {
            --index;
        }
    }

    int index;

private:
    Manager *manager; // stored so the destructor can unsubscribe
};

class Manager {
    vector<Entity> entities; // Cache-friendly
    list<EntityHandle*> handles;

    bool needToRemove(const Entity& e)
    {
        bool result = !e.alive;

        if(result)
            for(auto handle : handles)
            {
                handle->update(e.index);
            }

        return result;
    }

public:
    void update()
    {
        entities.erase(remove_if(begin(entities), end(entities),
            [this](const Entity& e){ return needToRemove(e); }),
            end(entities));
    }

    Entity& getEntity(const EntityHandle& h) { return entities[h.index]; }

    void subscribe(EntityHandle *handle)
    {
        handles.push_back(handle);
    }

    void unsubscribe(EntityHandle *handle)
    {
        handles.remove(handle);
    }
};

EntityHandle::EntityHandle(Manager *m) : index(-1), manager(m)
{
    manager->subscribe(this);
    // need more code for assigning a valid index
}

EntityHandle::~EntityHandle()
{
    manager->unsubscribe(this);
}

I hope this is enough to get the idea

Licensed under: CC-BY-SA with attribution