Question

Update: Memory Views wins. Cython using typed memoryviews : 0.0253449

Special thanks to lothario who pointed out several critical changes.

Ridiculous. Of course, now the issue is that I can't seem to do much arithmetic on them (sums and multiplications). Original Post: Inspired by Implementing Topic Model with Python (numpy), which is incredibly slow, I thought it would be a good idea to cythonize it. However, I could only figure out how to halve the time with Cython. There are clearly array operations here which are not being optimized — some thoughts and suggestions would be most welcome. I've always wanted to play with Cython and this appears to be a good opportunity!

for 15 documents with about 300 words each, python: 39.6903322834 cython: 19.2733114806 Cython using typed memoryviews : 0.547822975

I specifically want to use nogil, so this can get sped up further: 1) With memory views, does it help to add nogil to the loop? 2) I have a list of documents, each document represented by an array of numbers. What is the best C object for me to use? nogil does not work on Python objects. Currently I have this as a list of arrays.

I am not a C fiend, but would welcome any further optimization suggestions.

A Java implementation from a friend: for 1000 documents of 300 words each, 3 seconds.

lda_pyx Cython code

import numpy as np
cimport numpy as np
cimport cython

# BUG FIX: np.int was a deprecated alias for the builtin int and was removed
# in NumPy 1.24; np.int_ is the C-long scalar type matching np.int_t below.
DTYPE = np.int_
ctypedef np.int_t DTYPE_t




cdef class LDA:
    """Collapsed-Gibbs LDA sampler state backed by typed memoryviews.

    n_k_w[k, t]   -- count of word t assigned to topic k
    n_m_k[m, k]   -- count of words in document m assigned to topic k
    n_k[k]        -- total number of words assigned to topic k
    k_m_n[m][n]   -- current topic of word n in document m
    numbered_docs -- list of int arrays, one word-id array per document
    """
    cdef int iteration, M
    cdef int[:] docSizes
    cdef double[:, ::1] n_k_w, n_m_k
    cdef double[:] n_k
    cdef list k_m_n
    cdef list numbered_docs

    def __init__(self, int iteration, int M, double[:, ::1] n_k_w, double[:, ::1] n_m_k, double[:] n_k, int[:] docSizes, list numbered_docs, list k_m_n):
        self.iteration = iteration
        self.M = M
        self.n_k_w = n_k_w
        self.n_m_k = n_m_k
        self.n_k = n_k
        self.k_m_n = k_m_n
        self.numbered_docs = numbered_docs
        self.docSizes = docSizes

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cdef int _sample(self):
        """One Gibbs sweep over every word of every document; returns 1."""
        cdef double[:, ::1] n_k_w = self.n_k_w
        cdef double[:] n_k = self.n_k
        cdef double[:, ::1] n_m_k = self.n_m_k
        cdef int[:] docSizes = self.docSizes
        cdef int m, n, t, k, new_k
        cdef int[:] doc

        for m in xrange(self.M):
            doc = self.numbered_docs[m]
            for n in xrange(docSizes[m]):
                t = doc[n]

                # discount the counters for the n-th word's current topic
                k = self.k_m_n[m][n]
                n_m_k[m, k] -= 1
                n_k_w[k, t] -= 1
                n_k[k] -= 1

                # sampling of the replacement topic is stubbed out for now
                #p_k = n_k_w[:, t] * n_m_k[m][k] / n_k
                new_k = 1
                #np.random.multinomial(1, p_z / p_z.sum()).argmax()

                # record the new topic and re-increment the counters.
                # BUG FIX: n_k_w[new_k][t] built a temporary 1-D memoryview on
                # every access; n_k_w[new_k, t] indexes the element directly.
                self.k_m_n[m][n] = new_k
                n_m_k[m, new_k] += 1
                n_k_w[new_k, t] += 1
                n_k[new_k] += 1

        # No write-back needed: the locals above are views of the very same
        # buffers the attributes reference, so all updates happened in place.
        return 1

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cdef int _iterate(self):
        """Run the configured number of Gibbs sweeps; returns 1."""
        while self.iteration > 0:
            self._sample()
            self.iteration -= 1
        return 1

def iterate(iteration, M,  n_k_w , n_m_k, n_k, docSizes, numbered_docs, k_m_n ):
    """Build an LDA sampler, run it, and hand back the updated structures."""
    cdef LDA sampler = LDA(iteration, M, n_k_w, n_m_k, n_k, docSizes, numbered_docs, k_m_n)
    sampler._iterate()
    return sampler.n_k_w, sampler.n_m_k, sampler.n_k, sampler.k_m_n

pure python version

def gibbs_sample():
    """Pure-Python collapsed Gibbs sweep (reference implementation).

    Mutates the module-level count tables n_m_k, n_k_w, n_k and the topic
    assignments k_m_n in place; the real topic draw is stubbed to 1.
    """
    for _ in xrange(iteration):
        for m in xrange(M):
            words = numbered_docs[m]
            topics = k_m_n[m]
            doc_counts = n_m_k[m]
            for pos in xrange(docSizes[m]):
                word = words[pos]
                old_topic = topics[pos]

                # remove the current assignment from every counter
                doc_counts[old_topic] -= 1
                n_k_w[old_topic][word] -= 1
                n_k[old_topic] -= 1

                # sampling of the replacement topic is stubbed out
                #p_k = n_k_w[:, t] * n_m_k[m][k] / n_k
                new_topic = 1
                #np.random.multinomial(1, p_z / p_z.sum()).argmax()

                # record the new assignment and re-increment the counters
                topics[pos] = new_topic
                doc_counts[new_topic] += 1
                n_k_w[new_topic][word] += 1
                n_k[new_topic] += 1

cProfile

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.419    0.419 <string>:1(<module>)
        1    0.419    0.419    0.419    0.419 {lda_pyx.iterate}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
Was it helpful?

Solution

Regarding nogil:

Using with nogil simply allows other threads to run that block of code without the global lock in place -- it is still up to you to run code that is multithreaded in that block, and to ensure that you don't touch any Python objects while doing so. Typed memoryviews are not Python Objects, so you can use / manipulate them in a nogil block with multiple threads. Cython has the prange() function that automatically generates OpenMP directives inside a with nogil block. You can get good speedups easily with prange if your loop iterations are independent of each other. There are lots of details here -- please see the linked documentation.

Regarding your code:

Focus on optimizing the code in the inner loop.

Using cython -a on your code reveals a few lines are likely dragging down your performance.

  • You can directly index into n_k_w[new_k,t] rather than what you have.

  • You will get an improvement by converting the k_m_n list into a 2D numpy array, and using a typed memoryview for that internally.

  • Ditto for numbered_docs.

  • You also need to use the arr[::1] typed memoryview declarations whenever you know you have contiguous data, otherwise Cython treats the memview as strided, which will slow down access.

See the cython code below for some suggestions -- you might need to touch it up to get it to work for your stuff.

lda.pyx

import numpy as np
cimport numpy as np
cimport cython

# BUG FIX: np.int was a deprecated alias for the builtin int and was removed
# in NumPy 1.24; np.int_ is the C-long scalar type matching np.int_t below.
DTYPE = np.int_
ctypedef np.int_t DTYPE_t


cdef class LDA:
    """Gibbs-sampling state for LDA; every counter lives in a C-contiguous
    typed memoryview so the inner loop compiles to plain C array accesses."""

    cdef int iteration, M
    cdef int[::1] docSizes
    cdef double[:, ::1] n_k_w, n_m_k
    cdef double[::1] n_k
    cdef list k_m_n, numbered_docs

    def __init__(self, iteration, M, n_k_w , n_m_k, n_k, docSizes, numbered_docs, k_m_n):
        self.iteration = iteration
        self.M = M
        self.docSizes = docSizes
        self.n_k_w = n_k_w
        self.n_m_k = n_m_k
        self.n_k = n_k
        self.numbered_docs = numbered_docs
        self.k_m_n = k_m_n

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cdef int _sample(self):
        """One full Gibbs sweep over every word of every document."""
        cdef int[::1] sizes = self.docSizes
        cdef double[:, ::1] topic_word = self.n_k_w
        cdef double[:, ::1] doc_topic = self.n_m_k
        cdef double[::1] topic_total = self.n_k
        cdef int[::1] topics, words
        cdef int d, i, w, old_k, new_k

        for d in range(self.M):
            topics = self.k_m_n[d]
            words = self.numbered_docs[d]
            for i in range(sizes[d]):
                w = words[i]
                old_k = topics[i]

                # decrement the counters for the current assignment
                doc_topic[d, old_k] -= 1
                topic_word[old_k, w] -= 1
                topic_total[old_k] -= 1

                new_k = 1  # placeholder: the real topic draw goes here

                # record the new topic and re-increment the counters
                topics[i] = new_k
                doc_topic[d, new_k] += 1
                topic_word[new_k, w] += 1
                topic_total[new_k] += 1

        return 1

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cdef int _iterate(self):
        """Run the remaining Gibbs sweeps; returns 1 when done."""
        while self.iteration > 0:
            self._sample()
            self.iteration -= 1
        return 1

def iterate(iteration, M,  n_k_w , n_m_k, n_k, docSizes, numbered_docs, k_m_n ):
    """Coerce all inputs to contiguous buffers, run the sampler in place,
    and return the (mutated) arrays."""

    # np.ascontiguousarray copies only when the input is not already a
    # C-contiguous buffer of the requested dtype.
    n_k_w = np.ascontiguousarray(n_k_w, dtype=np.double)
    n_m_k = np.ascontiguousarray(n_m_k, dtype=np.double)
    n_k = np.ascontiguousarray(n_k, dtype=np.double)
    docSizes = np.ascontiguousarray(docSizes, dtype=np.int32)
    k_m_n = [np.ascontiguousarray(row, dtype=np.int32) for row in k_m_n]
    numbered_docs = [np.ascontiguousarray(row, dtype=np.int32) for row in numbered_docs]

    cdef LDA sampler = LDA(iteration, M, n_k_w, n_m_k, n_k, docSizes, numbered_docs, k_m_n)
    sampler._iterate()

    # The LDA object only grabbed views onto these arrays, so every in-place
    # update is already visible here -- return the arrays themselves.
    return n_k_w, n_m_k, n_k, k_m_n

setup.py

import numpy as np
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext

# Build the lda.pyx Cython module; NumPy headers are needed for `cimport numpy`.
ext_modules = [
    Extension(
        "lda",
        sources=["lda.pyx"],
        include_dirs=[np.get_include()],
    ),
]

setup(
    cmdclass={"build_ext": build_ext},
    ext_modules=ext_modules,
)

test.py:

import numpy as np
# BUG FIX: setup.py builds the extension under the name "lda"
# (Extension("lda", ["lda.pyx"])), so "speedup" could never be imported.
from lda import iterate

iteration = 10
M = 10
n_k_w = np.random.rand(10, 10)
n_m_k = np.random.rand(10, 10)
n_k = np.random.rand(10)
docSizes = np.zeros((10,), dtype=np.int32) + 10          # every doc has 10 words
numbered_docs = np.zeros((10, 10), dtype=np.int32) + 3   # every word has id 3
k_m_n = np.zeros((10, 10), dtype=np.int32) + 7           # every word starts on topic 7
k_m_n_orig = k_m_n.copy()

iterate(iteration, M,  n_k_w, n_m_k, n_k, docSizes, numbered_docs, k_m_n)

# k_m_n is updated in place; compare against the saved copy.
# (single-argument print(...) works on both Python 2 and 3)
print(k_m_n_orig[1])
print(k_m_n[1])
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top