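The nested-loop approach makes the algorithm O(N^2), even if the inner loop's starting index is chosen more efficiently. Here is an example of an approach that runs in O(N) on average and does not use a nested loop: it makes a single pass over the log and keeps the currently logged-in users in a dictionary, whose lookups take constant time on average.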
It also tries to handle some cases of unmatched transactions, assuming that a log-on by a user must be followed by a log-off by that same user before they ever log on again.
# Sample log records as (timestamp, action, user, machine) tuples, in
# chronological order.
log_lines = [
    ('2014-01-28 16:54:58', 'LOGON', 'jane', 'machinename'),
    ('2014-01-28 17:50:18', 'LOGOFF', 'jane', 'machinename'),
    ('2014-01-28 19:53:02', 'LOGON', 'skip', 'machinename'),
    ('2014-01-28 19:54:12', 'LOGOFF', 'skip', 'machinename'),
    ('2014-01-29 09:41:52', 'LOGON', 'jim', 'machinename'),
    ('2014-01-29 09:42:45', 'LOGOFF', 'jim', 'machinename'),
    ('2014-01-29 11:59:20', 'LOGON', 'skip', 'machinename'),
    ('2014-01-29 12:00:52', 'LOGOFF', 'skip', 'machinename'),
    # Following are made up, weird logs
    ('2014-01-29 12:00:52', 'LOGOFF', 'dooz', 'machinename'),
    ('2014-01-29 12:00:52', 'LOGOFF', 'booz', 'machinename'),
    ('2014-01-29 12:00:52', 'LOGON', 'fooz', 'machinename'),
]

from pprint import pprint

# Maps each user to the LOGON record that is still waiting for its LOGOFF.
logged_in = {}
# (LOGON record, LOGOFF record) pairs, in the order the LOGOFFs were seen.
transactions_matched = []
# Records that could not be paired with a counterpart.
transactions_weird = []

# Single pass over the log: every record costs one dictionary lookup, so no
# nested loop over the remaining records is needed.
for line in log_lines:
    action = line[1]
    user = line[2]
    if action == 'LOGON':
        if user not in logged_in:
            logged_in[user] = line
        else:
            # Abnormal case 1: LOGON while the user is already logged on.
            # The earlier LOGON can never be matched, so it is reported as
            # weird and the newer LOGON takes its place.
            transactions_weird.append(logged_in.pop(user))
            logged_in[user] = line
    elif action == 'LOGOFF':
        if user in logged_in:
            transactions_matched.append((logged_in.pop(user), line))
        else:
            # Abnormal case 2: LOGOFF from a user who is not logged on.
            transactions_weird.append(line)

# Dangling LOGON records (never followed by a LOGOFF) are abnormal as well.
transactions_weird.extend(logged_in.values())

# Single-argument print(...) emits the same text on Python 2 and Python 3.
print('Matched:')
pprint(transactions_matched)
print('Weird:')
pprint(transactions_weird)
Output:
Matched:
[(('2014-01-28 16:54:58', 'LOGON', 'jane', 'machinename'),
('2014-01-28 17:50:18', 'LOGOFF', 'jane', 'machinename')),
(('2014-01-28 19:53:02', 'LOGON', 'skip', 'machinename'),
('2014-01-28 19:54:12', 'LOGOFF', 'skip', 'machinename')),
(('2014-01-29 09:41:52', 'LOGON', 'jim', 'machinename'),
('2014-01-29 09:42:45', 'LOGOFF', 'jim', 'machinename')),
(('2014-01-29 11:59:20', 'LOGON', 'skip', 'machinename'),
('2014-01-29 12:00:52', 'LOGOFF', 'skip', 'machinename'))]
Weird:
[('2014-01-29 12:00:52', 'LOGOFF', 'dooz', 'machinename'),
('2014-01-29 12:00:52', 'LOGOFF', 'booz', 'machinename'),
('2014-01-29 12:00:52', 'LOGON', 'fooz', 'machinename')]