python pandas optimization: filtering on text index values
-
16-10-2019 - |
Question
I have to filter a pandas data frame by matching a complex regular expression on a text index.
The data frame is multi level indexed, and contains more than 2 million records.
The way I'm doing is:
identifiers = self._data.index.get_level_values('identifier')
filt = ... # an_array_of_np.bool_with_the_same_length_as_my_data
pattern = ... # a complex regular expression, as a string
filt = filt & np.array(identifiers.str.contains(pat=pattern, case=False, regex=True), dtype=np.bool)
... # other filterings
Unfortunately, the line beginning with `filt = filt &` is very slow.
I'm wondering if you have some ideas to make it faster. I guess the cost comes from the `identifiers.str.contains` call.
Thanks a lot!
EDIT:
Thanks @Emre
I'm not allowed to share those data, but the code below demonstrates the problem:
- Step 0: 0:00:00.013527
- Step 1: 0:00:00.010127
- Step 2: 0:00:04.468114
- Step 3: 0:00:02.109594
- Step 4: 0:00:00.027437
In fact, my feeling is that we apply the regular expression to all the values of the identifiers, while I would expect the filter to be applied to the possible values of the index (lots of values are reused many times).
import pandas as pd
import numpy as np
import datetime
N = 2000000      # number of records in the demo data set
N_DVC = 10000    # number of distinct device ids

def getData(n=N, n_dvc=N_DVC):
    """Build a random demo DataFrame indexed by (device, identifier, timestamp).

    - n: number of rows to generate (defaults to the module-level N)
    - n_dvc: number of distinct device ids (defaults to N_DVC)
    Returns a DataFrame with float columns 'x' and 'y' and a 3-level index
    named ['device', 'identifier', 'timestamp'].
    """
    identifiers = np.random.choice(np.array([
        "need", "need: foo", "need: bar", "need: foo: bar", "foo: need", "bar: need",
        "not: need", "not: need: foo", "not: need: bar", "not: need: foo: bar", "foo: need: not", "bar: need: not",
        "need ign", "need: foo ign", "need: bar ign", "need: foo: bar ign", "foo: need ign", "bar: need ign",
        "ign need", "need: ign foo", "need: ign bar", "need: foo: ign bar",
    ]), n)
    # BUG FIX: the original called np.random.choice(np.arange(N_DVC)) without a
    # size argument, which returns a single scalar -- every row then shared the
    # same device id, defeating the multi-level index. Draw one id per row.
    devices = np.random.choice(np.arange(n_dvc), n)
    timestamps = np.random.choice(pd.date_range('1/1/2016 00:00:00', periods=60*60*24, freq='s'), n)
    x = np.random.rand(n)
    y = np.random.rand(n)
    data = pd.DataFrame({'identifier': identifiers, 'device': devices, 'timestamp': timestamps, 'x': x, 'y': y})
    data.set_index(['device', 'identifier', 'timestamp'], drop=True, inplace=True)
    return data
def filterData(data):
    """Keep rows whose 'identifier' index level matches rexpPlus but not
    rexpMinus, printing the elapsed time of each step.

    - data: DataFrame whose index has an 'identifier' level
    Returns the filtered DataFrame (a new object; *data* is not modified).
    """
    # I know those regular expressions are not perfect for the example,
    # but they mimic the real expressions I have.
    # Raw strings avoid invalid-escape warnings for \s.
    rexpPlus = r'^(?:[^\s]+:\s)*need(?:(?::\s[^\s]+)*:\s[^\s]+)?$'
    rexpMinus = r'(?::\s)(?:(?:not)|(?:ign))(?::\s)'
    tic = datetime.datetime.now()
    identifiers = data.index.get_level_values('identifier')
    print("- Step 0: %s" % str(datetime.datetime.now() - tic))
    tic = datetime.datetime.now()
    filt = np.zeros(data.shape[0], dtype=bool)
    print("- Step 1: %s" % str(datetime.datetime.now() - tic))
    tic = datetime.datetime.now()
    # np.bool was removed from NumPy (1.24+); the builtin bool is the right dtype.
    filt = filt | np.array(identifiers.str.contains(pat=rexpPlus, case=False, regex=True), dtype=bool)
    print("- Step 2: %s" % str(datetime.datetime.now() - tic))
    tic = datetime.datetime.now()
    filt = filt & (~np.array(identifiers.str.contains(pat=rexpMinus, case=False, regex=True), dtype=bool))
    print("- Step 3: %s" % str(datetime.datetime.now() - tic))
    tic = datetime.datetime.now()
    data = data.loc[filt, :]
    print("- Step 4: %s" % str(datetime.datetime.now() - tic))
    return data
if __name__ == "__main__":
    # Demo entry point: build the random data set, then time the filter steps.
    demo_data = getData()
    filterData(demo_data)
Solution
I've found a solution: instead of filtering through all the values, I compute a hash table (in fact a DataFrame) of the possible values and filter on those values:
- Step 0: 0:00:00.014251
- Step 1: 0:00:00.010097
- Init fast filter: 0:00:00.353645
- Step 2: 0:00:00.027462
- Step 3: 0:00:00.002550
- Step 4: 0:00:00.027452
- Globally: 0:00:00.435839
It runs faster because there are lots of duplicated values.
Here's a class which is doing that stuff. It works for indexes and normal columns:
import pandas as pd
import numpy as np
class DataFrameFastFilterText(object):
    """Fast text filtering of a DataFrame column or index level.

    Instead of running a (possibly expensive) regex on every row, the distinct
    values are hashed once; the regex is then applied only to that much smaller
    table of unique values, and the per-row mask is obtained by lookup.  This
    is faster than direct filtering when there are lots of duplicated values.

    Missing values are replaced by "" and every value is converted to a
    string.  The input DataFrame is not modified (the class works on a copy).
    """

    def __init__(self, data, column, isIndex=True):
        """Constructor.

        - data: pandas DataFrame
        - column: name of the column (or index level) to filter on
        - isIndex: True (default) if *column* is an index level,
          False if it is a regular column
        """
        self._data = data.copy()  # working on a copy because _initHashAndData mutates it
        self._column = column
        self._isIndex = isIndex
        self._hashedValues = None
        self._initHashAndData()

    def _initHashAndData(self):
        """Build the unique-value hash table and reduce the data.

        1. data: a. adds an order column
                 b. creates the column from the index if needed
                 c. renames the column as 'value'
                 d. drops all the other columns
                 e. fills NaN with ""
                 f. converts to strings
        2. hash: DataFrame of 'value', 'hashId'
        3. data: a. sorted back to the initial order
                 b. order column deleted
        """
        # remember the row order: pd.merge below does not preserve it
        self._data['order'] = np.arange(self._data.shape[0])
        if self._isIndex:
            self._data[self._column] = self._data.index.get_level_values(self._column)
        self._data = self._data.loc[:, [self._column, 'order']]
        self._data.rename(columns={self._column: 'value'}, inplace=True)
        self._data['value'] = self._data['value'].fillna("", inplace=False)
        self._data['value'] = self._data['value'].astype(str)
        # one hashId per distinct value (the row position in the hash table)
        self._hash = pd.DataFrame({'value': pd.Series(self._data['value'].unique())})
        self._hash['hashId'] = self._hash.index
        self._data = pd.merge(
            left=self._data,
            right=self._hash,
            on='value',
            how='inner',
        )
        self._data.sort_values(by='order', inplace=True)
        del self._data['order']

    def getFilter(self, *args, **kwargs):
        """Return the filter as a boolean numpy array (it is not applied).

        - args: positional arguments passed to pd.Series.str.contains
        - kwargs: named arguments passed to pd.Series.str.contains
        Raises ValueError if 'na' is passed: it is useless in this context
        because all NaN values have already been replaced by empty strings.
        """
        # The original line was a SyntaxError ("not" instead of "not in");
        # validate with an explicit exception rather than assert, which is
        # stripped under python -O.
        if "na" in kwargs:
            raise ValueError(
                "'na' is not supported: NaN values are already replaced by ''"
            )
        # run the regex on the unique values only ...
        matching = self._hash.loc[self._hash['value'].str.contains(*args, **kwargs), :]
        # ... then map the result back to every row via the hash ids
        keep = self._data['hashId'].isin(matching['hashId'])
        # np.bool was removed from NumPy (1.24+); builtin bool is the right dtype.
        return np.array(keep, dtype=bool).copy()
And the example code is adapted as follows:
...
tic = datetime.datetime.now()
# Build the hash-based filter once for the 'identifier' index level
# (isIndex=True -- the original call omitted this required argument);
# the regexes below then run on unique values only.
fastFilter = DataFrameFastFilterText(data, 'identifier', True)
print("- Init fast filter: %s" % str(datetime.datetime.now() - tic))
tic = datetime.datetime.now()
filt = filt | fastFilter.getFilter(rexpPlus)
print("- Step 2: %s" % str(datetime.datetime.now() - tic))
tic = datetime.datetime.now()
filt = filt & (~fastFilter.getFilter(rexpMinus))
print("- Step 3: %s" % str(datetime.datetime.now() - tic))
...