I'm trying to do knn search on big data with limited memory.
I'm using HDF5 and python.
I tried bruteforce linear search(using pytables) and kd-tree search (using sklearn)
It's suprising but kd-tree method takes more time(maybe kd-tree will work better if we increase batch size? but I don't know optimal size also it limited by memory)
Now I'm looking for how to speed up calculations, I think HDF5 file can be tuned for individual PC, also norm calculation can be speeded maybe using nymexpr or some python tricks.
import numpy as np
import time
import tables
import cProfile
from sklearn.neighbors import NearestNeighbors
rows = 10000
cols = 1000
batches = 100
k= 10
#USING HDF5
vec= np.random.rand(1,cols)
data = np.random.rand(rows,cols)
fileName = 'C:\carray1.h5'
shape = (rows*batches, cols) # predefined size
atom = tables.UInt8Atom() #?
filters = tables.Filters(complevel=5, complib='zlib') #?
#create
# h5f = tables.open_file(fileName, 'w')
# ca = h5f.create_carray(h5f.root, 'carray', atom, shape, filters=filters)
# for i in range(batches):
# ca[i*rows:(i+1)*rows]= data[:]+i # +i to modify data
# h5f.close()
#can be parallel?
def test_bruteforce_knn():
h5f = tables.open_file(fileName)
t0= time.time()
d = np.empty((rows*batches,))
for i in range(batches):
d[i*rows:(i+1)*rows] = ((h5f.root.carray[i*rows:(i+1)*rows]-vec)**2).sum(axis=1)
print (time.time()-t0)
ndx = d.argsort()
print ndx[:k]
h5f.close()
def test_tree_knn():
h5f = tables.open_file(fileName)
# it will not work
# t0= time.time()
# nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(h5f.root.carray)
# distances, indices = nbrs.kneighbors(vec)
# print (time.time()-t0)
#need to concatenate distances, indices somehow
t0= time.time()
d = np.empty((rows*batches,))
for i in range(batches):
nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(h5f.root.carray[i*rows:(i+1)*rows])
distances, indices = nbrs.kneighbors(vec) # put in dict?
#d[i*rows:(i+1)*rows] =
print (time.time()-t0)
#ndx = d.argsort()
#print ndx[:k]
h5f.close()
cProfile.run('test_bruteforce_knn()')
cProfile.run('test_tree_knn()')