Regarding nogil:
Using with nogil
simply releases the global interpreter lock (GIL) for that block of code so that other threads can run while it executes -- it is still up to you to run code that is multithreaded in that block, and to ensure that you don't touch any Python objects while doing so. Typed memoryviews are not Python objects, so you can use / manipulate them in a nogil block with multiple threads. Cython has the prange()
function that automatically generates OpenMP directives inside a with nogil
block. You can get good speedups easily with prange
if your loop iterations are independent of each other. There are lots of details here -- please see the linked documentation.
Regarding your code:
Focus on optimizing the code in the inner loop.
Using cython -a
on your code reveals a few lines are likely dragging down your performance.
You can directly index into
n_k_w[new_k,t]
rather than what you have. You will get an improvement by converting the
k_m_n
list into a 2D numpy array, and using a typed memoryview for that internally. Ditto for
numbered_docs
. You also need to use the
arr[::1]
typed memoryview declarations whenever you know you have contiguous data; otherwise Cython treats the memoryview as strided, which will slow down access.
See the cython code below for some suggestions -- you might need to touch it up to get it to work for your stuff.
lda.pyx
import numpy as np
cimport numpy as np
cimport cython
# NumPy dtype for integer arrays.  ``np.int`` was only an alias for the builtin
# ``int``; it was deprecated in NumPy 1.20 and removed in NumPy 1.24, so the
# NumPy scalar type ``np.int_`` is used instead to keep the module importable.
DTYPE = np.int_
# C-level integer typedef for use in typed declarations; presumably intended
# as the compile-time counterpart of DTYPE (neither name is used in the code
# shown in this listing -- confirm before removing).
ctypedef np.int_t DTYPE_t
cdef class LDA:
    """Hold the count arrays and per-document assignments updated by the sampler.

    The numeric arrays are stored as C-contiguous typed memoryviews, so the
    instance operates on views of the buffers handed to ``__init__`` and
    updates them in place.

    NOTE(review): ``_sample`` runs with bounds checking and negative-index
    wraparound disabled.  Nothing here validates that ``M``, ``docSizes`` and
    the index values stored in ``numbered_docs`` / ``k_m_n`` are consistent
    with the array shapes; the caller must guarantee that.
    """
    cdef:
        # Remaining number of passes, and number of documents to visit.
        int iteration, M
        # docSizes[m] is the number of entries processed for document m.
        int[::1] docSizes
        # n_k_w is indexed [k, t]; n_m_k is indexed [m, k].
        double[:, ::1] n_k_w, n_m_k
        # n_k is indexed [k].
        double[::1] n_k
        # One int buffer per document: current assignment per position
        # (k_m_n) and the value ``t`` per position (numbered_docs).
        list k_m_n, numbered_docs

    def __init__(self, iteration, M, n_k_w, n_m_k, n_k, docSizes,
                 numbered_docs, k_m_n):
        """Store the sampler state.

        The array arguments must expose buffers compatible with the typed
        attribute declarations of this class (C-contiguous ``double`` or
        ``int`` data); ``numbered_docs`` and ``k_m_n`` must be lists.
        """
        self.iteration = iteration
        self.M = M
        self.n_k_w = n_k_w
        self.n_m_k = n_m_k
        self.n_k = n_k
        self.k_m_n = k_m_n
        self.numbered_docs = numbered_docs
        self.docSizes = docSizes

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cdef int _sample(self) except -1:
        """Run one pass over every position of the first ``M`` documents.

        Returns 1 on success.  The ``except -1`` clause makes Python
        exceptions raised in the body (for example a buffer mismatch when a
        list item is bound to ``int[::1]``) propagate to the caller instead
        of being printed and discarded.
        """
        cdef:
            # Local copies of the memoryviews avoid repeated attribute
            # lookups on ``self`` inside the inner loop.
            int[::1] docSizes = self.docSizes
            double[:, ::1] n_k_w = self.n_k_w
            double[:, ::1] n_m_k = self.n_m_k
            double[::1] n_k = self.n_k
            int[::1] k_n, doc
            int m, n, t, k, new_k

        for m in range(self.M):
            k_n = self.k_m_n[m]
            doc = self.numbered_docs[m]
            for n in range(docSizes[m]):
                t = doc[n]
                k = k_n[n]
                # Remove the current assignment from all three counters.
                n_m_k[m, k] -= 1
                n_k_w[k, t] -= 1
                n_k[k] -= 1
                # NOTE(review): the new topic is hard-coded to 1; this looks
                # like a placeholder for the real sampling step.
                new_k = 1
                # set z the new topic and increment counters
                k_n[n] = new_k
                n_m_k[m, new_k] += 1
                n_k_w[new_k, t] += 1
                n_k[new_k] += 1
        return 1

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cdef int _iterate(self) except -1:
        """Call ``_sample`` until ``self.iteration`` has counted down to zero.

        Returns 1 on success; an exception raised by ``_sample`` propagates.
        ``self.iteration`` is consumed, so a second call is a no-op.
        """
        while self.iteration > 0:
            self._sample()
            self.iteration -= 1
        return 1
def iterate(iteration, M, n_k_w, n_m_k, n_k, docSizes, numbered_docs, k_m_n):
    """Run ``iteration`` sampling passes and return the updated arrays.

    Every array argument is normalised with ``np.ascontiguousarray`` to the
    dtype the ``LDA`` class expects (``double`` for the count arrays,
    ``int32`` for ``docSizes`` and for each item of ``k_m_n`` and
    ``numbered_docs``).  An input that already has the right dtype and a
    contiguous layout is used as-is and is therefore modified in place;
    otherwise the sampler works on a converted copy.

    Returns the tuple ``(n_k_w, n_m_k, n_k, k_m_n)`` of the arrays the
    sampler actually operated on, with ``k_m_n`` as a list of arrays.
    """
    cdef LDA sampler

    to_contiguous = np.ascontiguousarray

    # Floating-point count arrays.
    n_k_w = to_contiguous(n_k_w, dtype=np.double)
    n_m_k = to_contiguous(n_m_k, dtype=np.double)
    n_k = to_contiguous(n_k, dtype=np.double)

    # Integer data: sizes, then one buffer per document.
    docSizes = to_contiguous(docSizes, dtype=np.int32)
    k_m_n = [to_contiguous(row, dtype=np.int32) for row in k_m_n]
    numbered_docs = [to_contiguous(row, dtype=np.int32)
                     for row in numbered_docs]

    sampler = LDA(iteration, M, n_k_w, n_m_k, n_k, docSizes,
                  numbered_docs, k_m_n)
    sampler._iterate()

    # The sampler only holds views of these arrays, so they now carry the
    # updated values and can be handed back directly.
    return n_k_w, n_m_k, n_k, k_m_n
setup.py
import numpy as np
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext
# Single extension module "lda", compiled from lda.pyx.  The NumPy header
# directory is added because the .pyx file does ``cimport numpy``.
lda_extension = Extension(
    "lda",
    sources=["lda.pyx"],
    include_dirs=[np.get_include()],
)
exts = [lda_extension]

# Cython's build_ext command translates the .pyx source before compiling.
setup(
    cmdclass={'build_ext': build_ext},
    ext_modules=exts,
)
test.py:
import numpy as np
# setup.py builds the extension under the module name "lda"
# (Extension("lda", ["lda.pyx"])), so import from there.
from lda import iterate

# Small smoke test: 10 documents of 10 entries each.
iteration = 10
M = 10
n_k_w = np.random.rand(10, 10)
n_m_k = np.random.rand(10, 10)
n_k = np.random.rand(10)
docSizes = np.zeros((10,), dtype=np.int32) + 10
# Every entry of every document is 3; every initial assignment is 7.
numbered_docs = np.zeros((10, 10), dtype=np.int32) + 3
k_m_n = np.zeros((10, 10), dtype=np.int32) + 7
# Keep a copy so the assignments before and after the run can be compared.
k_m_n_orig = k_m_n.copy()

iterate(iteration, M, n_k_w, n_m_k, n_k, docSizes, numbered_docs, k_m_n)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(k_m_n_orig[1])
print(k_m_n[1])