Actually, you can store and retrieve this kind of data in an HDF5
file with just a little bit of custom logic:
import tables
import numpy as np
def store(filename, name, data):
    """Write a heterogeneous list (scalars and/or numpy arrays) to an HDF5 file.

    Each element of ``data`` is saved as a separate array node named
    ``item_<index>`` under the group ``/<name>``, so the original list
    ordering can be reconstructed by read().
    """
    # NOTE: PyTables removed the camelCase API (openFile, createGroup,
    # createArray) in 3.0; the snake_case names used here have existed
    # since PyTables 2.3, so this stays backward-compatible.
    with tables.open_file(filename, 'w') as h5:
        h5.create_group('/', name)
        for i, item in enumerate(data):
            # Encode the list index in the node name so read() can
            # restore the items in their original order.
            h5.create_array('/%s' % name, 'item_%s' % i, item)
def read(filename, name):
    """Read back the list written by store() from group ``/<name>``.

    Returns the items in their original order, recovered from the
    ``item_<index>`` node names.
    """
    # snake_case API: openFile/listNodes were removed in PyTables 3.0.
    with tables.open_file(filename, 'r') as h5:
        nodes = h5.list_nodes('/%s' % name)
        # Pre-size the result list; node iteration order is not guaranteed
        # to be numeric, so each item is placed by its encoded index.
        data = [None] * len(nodes)
        for node in nodes:
            pos = int(node.name.split('_')[-1])
            data[pos] = node.read()
        return data
Usage:
>>> a = [0, np.array([4,5,6])]
>>> store('my_data.h5', 'a', a)
>>> print read('my_data.h5', 'a')
[0, array([4, 5, 6])]
This is just the first thing that came to mind; I'm sure there is a more efficient pattern for storing a list in HDF5
files. But let's time it and see if even this naive implementation is faster than cPickle:
In [7]: a = []
for i in range(1, 500):
if i % 10 == 0:
a.append(i)
else:
a.append(np.random.randn(i, i))
In [8]: %%timeit
store('my_data.h5', 'a', a)
read_data = read('my_data.h5', 'a')
1 loops, best of 3: 1.32 s per loop
In [9]: %%timeit
with open('test.pickle', 'wb') as f:
cPickle.dump(a, f)
with open('test.pickle', 'rb') as f:
read_data = cPickle.load(f)
1 loops, best of 3: 1min 58s per loop
Depending on the data, the difference is even bigger or a little bit smaller. But even this naive implementation is at least 10x faster than cPickle
for any data that contains numpy
arrays.