I made a test file and tested a few variations. The fastest way to search for a static string (as you appear to be doing) while iterating over the file is `string in line`.
However, if you'll be searching the loaded data more than once (more than about 30 times, according to the test numbers below), it's worth your (computational) time to build lookup tables for the PLUs and EANs in the form of `dict`s and use these for future searches.
loaded 120000 lines
question regex 0.114868402481
simpler regex 0.417045307159
other regex 0.386662817001
startswith 0.236350297928
string in 0.020356798172 <-- iteration winner
dict construction 0.611148500443
dict lookup 0.000002503395 <-- best if you are doing many lookups
Test code follows:
import re
import timeit
def timefunc(function, times, *args):
    """Return the average wall-clock seconds of one ``function(*args)`` call.

    The call is executed ``times`` times through ``timeit`` and the total
    elapsed time is divided by ``times``.
    """
    elapsed = timeit.timeit(lambda: function(*args), number=times)
    return elapsed / times
def question(lines):
    """Return ``[fields]`` for the first line whose 1st or 2nd field equals the code.

    ``fields`` is the matching line split on ';' (the last field keeps its
    line ending). Raises ``StopIteration`` when no line matches.
    """
    eanic = "D41RP9"
    code = eanic.strip()
    # Field 0 or field 1 must be exactly the code: followed by ';' or end of line.
    pattern = re.compile('^(?:' + code + '(?:;|$)|[^;]*;' + code + '(?:;|$))')
    hits = (row for row in lines if pattern.match(row))
    first = next(hits)
    return [first.split(';')]
def splitstart(lines):
    """Return the lines whose first or second ';'-separated field starts with the code.

    Lines with fewer than two fields are handled gracefully: a line without
    any ';' is tested on its single field only instead of raising
    ``IndexError``.
    """
    eanic = "D41RP9"
    ret = []
    for l in lines:
        # Only the first two fields are inspected, so stop splitting after
        # them; the slice drops the unsplit remainder and tolerates short lines.
        leading = l.split(';', 2)[:2]
        if any(field.startswith(eanic) for field in leading):
            ret.append(l)
    return ret
def simpler(lines):
    """Return the lines in which any ';'-separated field starts with the code.

    The pattern accepts the code at the start of the line or right after
    any ';', so it is not limited to the first two fields.
    """
    eanic = "D41RP9"
    find = re.compile('(^|;)' + eanic).search
    hits = []
    for row in lines:
        if find(row):
            hits.append(row)
    return hits
def better(lines):
    """Return the lines whose first or second ';'-separated field starts with the code.

    Uses one anchored regex: the code either opens the line or follows the
    first ';'.
    """
    eanic = "D41RP9"
    pattern = re.compile('^(?:' + eanic + '|[^;]*;' + eanic + ')')
    return list(filter(pattern.match, lines))
def strin(lines):
    """Return the lines that contain the code anywhere as a substring."""
    eanic = "D41RP9"
    found = []
    for row in lines:
        if eanic in row:
            found.append(row)
    return found
def mkdicts(lines):
    """Build two lookup tables from ';'-separated lines.

    Returns ``(ean, plu)``: ``ean`` maps each line's first field and ``plu``
    each line's second field to that line's full list of fields. The line is
    split as-is, so the last field keeps its line ending. When a key occurs
    more than once, the last line wins.

    Lines with fewer than two fields (for example a blank trailing line) are
    skipped rather than raising ``IndexError`` after ``ean`` was already
    updated.
    """
    ean = {}
    plu = {}
    for l in lines:
        s = l.split(';')
        if len(s) < 2:
            # No ';' on this line, so there is no second field to index.
            continue
        ean[s[0]] = s
        plu[s[1]] = s
    return (ean, plu)
def searchdicts(ean, plu):
    """Look the code up in both tables.

    Returns ``(ean_hit, plu_hit)``; each element is ``None`` when the code is
    not a key of the corresponding table.
    """
    eanic = "D41RP9"
    by_ean = ean.get(eanic)
    by_plu = plu.get(eanic)
    return (by_ean, by_plu)
# Benchmark driver: load the sample data once, then time each search strategy.
with open('test.txt', 'r') as f:
    lines = f.readlines()

# Single-argument print(...) with %-formatting runs on both Python 2 and 3.
print("loaded %s lines" % len(lines))
print("question regex\t%s" % timefunc(question, 10, lines))
print("simpler regex\t%s" % timefunc(simpler, 10, lines))
# The "other regex" row must time `better` (the anchored two-field pattern),
# not `simpler` a second time.
print("other regex\t%s" % timefunc(better, 10, lines))
print("startswith\t%s" % timefunc(splitstart, 10, lines))
print("string in\t%s" % timefunc(strin, 10, lines))
print("dict construction\t%s" % timefunc(mkdicts, 10, lines))
# Build the lookup tables once outside the timer so only the lookup is measured.
ean, plu = mkdicts(lines)
print("dict lookup\t%s" % timefunc(searchdicts, 10, ean, plu))