Question

I have to filter a pandas data frame by matching a complex regular expression on a text index.

The data frame has a multi-level index and contains more than 2 million records.

The way I'm doing it is:

identifiers = self._data.index.get_level_values('identifier')
filt = ... # an array of np.bool_ with the same length as my data

pattern = ... # a complex regular expression, as a string
filt = filt & np.array(identifiers.str.contains(pat=pattern, case=False, regex=True), dtype=bool)

... # other filterings

Unfortunately, the line beginning with filt = filt & is very slow.

I'm wondering if you have any ideas to make it faster. I guess the slow part is the identifiers.str.contains call.

Thanks a lot!


EDIT:

Thanks @Emre

I'm not allowed to share that data, but the code below demonstrates the problem:

  • Step 0: 0:00:00.013527
  • Step 1: 0:00:00.010127
  • Step 2: 0:00:04.468114
  • Step 3: 0:00:02.109594
  • Step 4: 0:00:00.027437

In fact, my feeling is that the regular expression is applied to every single value of the identifiers, whereas I would expect the filter to be applied only to the distinct values of the index (lots of values are reused many times: the demo below has only 22 distinct identifiers for 2,000,000 rows).

import pandas as pd
import numpy as np
import datetime

N = 2000000
N_DVC = 10000

def getData():
    identifiers = np.random.choice(np.array([
        "need", "need: foo", "need: bar", "need: foo: bar", "foo: need", "bar: need",
        "not: need", "not: need: foo", "not: need: bar", "not: need: foo: bar", "foo: need: not", "bar: need: not",
        "need ign", "need: foo ign", "need: bar ign", "need: foo: bar ign", "foo: need ign", "bar: need ign",
        "ign need", "need: ign foo", "need: ign bar", "need: foo: ign bar",
    ]), N)
    devices = np.random.choice(np.arange(N_DVC), N)
    timestamps = np.random.choice(pd.date_range('1/1/2016 00:00:00', periods=60*60*24, freq='s'), N)
    x = np.random.rand(N)
    y = np.random.rand(N)
    data = pd.DataFrame({'identifier': identifiers, 'device': devices, 'timestamp': timestamps, 'x': x, 'y': y})
    data.set_index(['device', 'identifier', 'timestamp'], drop=True, inplace=True)
    return data

def filterData(data):
    # I know those regular expressions are not perfect for the example,
    # but they mimic the real expressions I have
    rexpPlus = r'^(?:[^\s]+:\s)*need(?:(?::\s[^\s]+)*:\s[^\s]+)?$'
    rexpMinus = r'(?::\s)(?:(?:not)|(?:ign))(?::\s)'

    tic = datetime.datetime.now()
    identifiers = data.index.get_level_values('identifier')
    print("- Step 0: %s" % str(datetime.datetime.now() - tic))

    tic = datetime.datetime.now()
    filt = np.repeat(np.False_, data.shape[0])
    print("- Step 1: %s" % str(datetime.datetime.now() - tic))

    tic = datetime.datetime.now()
    filt = filt | np.array(identifiers.str.contains(pat=rexpPlus, case=False, regex=True), dtype=bool)
    print("- Step 2: %s" % str(datetime.datetime.now() - tic))

    tic = datetime.datetime.now()
    filt = filt & (~np.array(identifiers.str.contains(pat=rexpMinus, case=False, regex=True), dtype=bool))
    print("- Step 3: %s" % str(datetime.datetime.now() - tic))

    tic = datetime.datetime.now()
    data = data.loc[filt, :]
    print("- Step 4: %s" % str(datetime.datetime.now() - tic))

    return data

if __name__ == "__main__":
    filterData(getData())

Solution

I've found a solution: instead of running the filter through all the values, I compute a hash table (in fact a DataFrame) of the distinct values and apply the filter to those:

  • Step 0: 0:00:00.014251
  • Step 1: 0:00:00.010097
  • Init fast filter: 0:00:00.353645
  • Step 2: 0:00:00.027462
  • Step 3: 0:00:00.002550
  • Step 4: 0:00:00.027452
  • Globally: 0:00:00.435839

It runs faster because there are lots of duplicated values.
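
The core idea, as a minimal standalone sketch (the variable names here are hypothetical, not from my real code): run the regular expression once per distinct value, then map the matches back to all the rows with isin:

import pandas as pd
import numpy as np

# Hypothetical data: few distinct values, many rows
s = pd.Series(np.random.choice(["need", "need: foo", "ign need"], 2000000))

# Run the regex only on the distinct values...
uniques = pd.Series(s.unique())
matching = uniques[uniques.str.contains(r'^need', case=False, regex=True)]

# ...then broadcast the result back to every row
filt = np.array(s.isin(matching))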

Here's a class that does this. It works for both index levels and regular columns:

import pandas as pd
import numpy as np


class DataFrameFastFilterText(object):
    """This class allows fast filtering of text column or index of a data frame
    Instead of filtering directy the values, it computes a hash of the values
    and applies the filtering to it.
    Then, it filters the values based on the hash mapping.
    This is faster than direct filtering if there are lots of duplicates values.
    Please note that missing values will be replaced by "" and every value will
    be converted to string.
    Input data won't be changed (working on a copy).
    """

    def __init__(self, data, column, isIndex):
        """Constructor
        - data: pandas Data Frame
        - column: name of the column - the column can be an index
        - isIndex: True if the column is an index
        """
        self._data = data.copy()  # working with a copy because we will make some changes
        self._column = column
        self._isIndex = isIndex
        self._hash = None
        self._initHashAndData()

    def _initHashAndData(self):
        """Initialize the hash and transforms the data
        1. data: a. adds an order column
                 b. creates the column from the index if needed,
                 c. rename the column as 'value' and
                 d. drops all the other columns
                 e. fill na with ""
                 f. convert to strings
        2. hash: data frame of 'hashId', 'hashValue'
        3. data: a. sort by initial order
                 b. delete order
        """
        self._data['order'] = np.arange(self._data.shape[0])
        if self._isIndex:
            self._data[self._column] = self._data.index.get_level_values(self._column)
        self._data = self._data.loc[:, [self._column, 'order']]
        self._data.rename(columns={self._column: 'value'}, inplace=True)
        self._data['value'] = self._data['value'].fillna("", inplace=False)
        self._data['value'] = self._data['value'].astype(str)
        self._hash = pd.DataFrame({'value': pd.Series(self._data['value'].unique())})
        self._hash['hashId'] = self._hash.index
        self._data = pd.merge(
            left=self._data,
            right=self._hash,
            on='value',
            how='inner',
        )
        self._data.sort_values(by='order', inplace=True)
        del self._data['order']

    def getFilter(self, *args, **kwargs):
        """Returns the filter as a boolean array. It doesn't applies it.
        - args: positional arguments to pass to pd.Series.str.contains
        - kwargs: named arguments to pass to pd.Series.str.contains
        """
        assert "na" not kwargs.keys()  # useless in this context because all the NaN values are already replaced by empty strings
        h = self._hash.copy()
        h = h.loc[h['value'].str.contains(*args, **kwargs), :].copy()
        d = self._data.copy()
        d['keep'] = d['hashId'].isin(h['hashId'])
        return np.array(
            d['keep'],
            dtype=bool,
        ).copy()
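
Note that the hash is built once in the constructor (the "Init fast filter" step, about 0.35 s here), while each getFilter call only runs the regular expression over the distinct values, so the cost is amortized when several filters are applied to the same data.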

And the example code is adapted as follows:

...
tic = datetime.datetime.now()
fastFilter = DataFrameFastFilterText(data, 'identifier', isIndex=True)
print("- Init fast filter: %s" % str(datetime.datetime.now() - tic))

tic = datetime.datetime.now()
filt = filt | fastFilter.getFilter(rexpPlus)
print("- Step 2: %s" % str(datetime.datetime.now() - tic))

tic = datetime.datetime.now()
filt = filt & (~fastFilter.getFilter(rexpMinus))
print("- Step 3: %s" % str(datetime.datetime.now() - tic))
...
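
For completeness, the class also works on a regular (non-index) column with isIndex=False. A minimal usage sketch on hypothetical data:

import pandas as pd
import numpy as np

df = pd.DataFrame({
    'identifier': ['need: foo', 'ign need', 'need'] * 3,
    'x': np.random.rand(9),
})
fastFilter = DataFrameFastFilterText(df, 'identifier', isIndex=False)
filt = fastFilter.getFilter(r'^need', case=False, regex=True)
print(df.loc[filt, :])  # keeps only the rows whose identifier starts with "need"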