Bakeoff Part 1 Python vs Cython vs Cython Typed memory views: LDA by Gibbs Sampling

Question

Regarding nogil:

Using with nogil simply allows other threads to run that block of code without the global lock in place -- it is still up to you to run code that is multithreaded in that block, and to ensure that you don't touch any Python objects while doing so. Typed memoryviews are not Python Objects, so you can use / manipulate them in a nogil block with multiple threads. Cython has the prange() function that automatically generates OpenMP directives inside a with nogil block. You can get good speedups easily with prange if your loop iterations are independent of each other. There are lots of details here -- please see the linked documentation.

Regarding your code:

Focus on optimizing the code in the inner loop.

Using cython -a on your code reveals a few lines are likely dragging down your performance.

You can directly index into n_k_w[new_k,t] rather than what you have.
You will get an improvement by converting the k_m_n list into a 2D numpy array, and using a typed memoryview for that internally.
Ditto for numbered_docs.
You also need to use the arr[::1] typed memoryview declarations whenever you know you have contiguous data, otherwise Cython treats the memview as strided, which will slowdown access.

See the cython code below for some suggestions -- you might need to touch it up to get it to work for your stuff.

lda.pyx

import numpy as np
cimport numpy as np
cimport cython
DTYPE = np.int
ctypedef np.int_t DTYPE_t


cdef class LDA:

    cdef:
        int iteration, M
        int[::1] docSizes
        double[:, ::1] n_k_w ,n_m_k
        double[::1] n_k
        list k_m_n, numbered_docs

    def __init__(self, iteration, M, n_k_w , n_m_k, n_k, docSizes, numbered_docs, k_m_n):
        self.iteration = iteration
        self.M = M
        self.n_k_w = n_k_w
        self.n_m_k = n_m_k
        self.n_k = n_k
        self.k_m_n = k_m_n
        self.numbered_docs = numbered_docs
        self.docSizes = docSizes

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cdef int _sample(self) :

        cdef:
            int[::1] docSizes = self.docSizes
            double[:, ::1] n_k_w = self.n_k_w, n_m_k = self.n_m_k
            double[::1] n_k = self.n_k
            int[::1] k_n, doc
            int m, n, t, k, new_k

        for m in range(self.M):
            k_n = self.k_m_n[m]
            doc = self.numbered_docs[m]
            for n in range(docSizes[m]):

                t = doc[n]
                k = k_n[n]

                n_m_k[m,k] -= 1
                n_k_w[k,t] -= 1
                n_k[k] -= 1
                new_k = 1

                # set z the new topic and increment counters
                k_n[n] = new_k
                n_m_k[m, new_k] += 1
                n_k_w[new_k, t] += 1
                n_k[new_k] += 1

        return 1

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cdef int _iterate(self) :

        while self.iteration >0 :
            self._sample()
            self.iteration -= 1
        return 1

def iterate(iteration, M,  n_k_w , n_m_k, n_k, docSizes, numbered_docs, k_m_n ):

    # pass array / list arguments through np.ascontiguousarray(), will make
    # copy only if not contiguous buffer already.
    ascontig = np.ascontiguousarray
    n_k_w = ascontig(n_k_w, dtype=np.double)
    n_m_k = ascontig(n_m_k, dtype=np.double)
    n_k = ascontig(n_k, dtype=np.double)
    docSizes = ascontig(docSizes, dtype=np.int32)
    k_m_n = [ascontig(k_n, dtype=np.int32) for k_n in k_m_n]
    numbered_docs = [ascontig(n_d, dtype=np.int32) for n_d in numbered_docs]

    cdef LDA lda
    lda= LDA(iteration, M,  n_k_w , n_m_k, n_k, docSizes, numbered_docs, k_m_n)
    lda._iterate()
    # since the lda object just grabs views of the n_k_w, n_m_k etc. arrays,
    # these will be modified, so return them directly.
    return n_k_w, n_m_k, n_k, k_m_n

setup.py

import numpy as np
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext

exts = [Extension("lda", ["lda.pyx"],
                  include_dirs=[np.get_include()])
        ]

setup(
    cmdclass = {'build_ext': build_ext},
    ext_modules = exts,
)

test.py:

import numpy as np
from speedup import iterate

iteration = 10
M = 10
n_k_w = np.random.rand(10, 10)
n_m_k = np.random.rand(10, 10)
n_k = np.random.rand(10)
docSizes = np.zeros((10,), dtype=np.int32) + 10
numbered_docs = np.zeros((10, 10), dtype=np.int32) + 3
k_m_n = np.zeros((10, 10), dtype=np.int32) + 7
k_m_n_orig = k_m_n.copy()

iterate(iteration, M,  n_k_w, n_m_k, n_k, docSizes, numbered_docs, k_m_n)

print k_m_n_orig[1]
print k_m_n[1]