This isn't very elegant, but it should get the job done. Based on your expected output, I have assumed that you meant for the key to be rg1 + rg3, not rg1 + rg2.
import re, collections
# Column headers to index. The list holds two consecutive header sets, each
# one starting with a 'Probes' entry.
mylist = ['Probes', 'Gene.symbol', 'Gene.Title', 'GO1', 'GO2', 'GO3', 'ADX_KD_06.ip', 'ADX_KD_24.ip', 'ADX_LG_06.ip', 'ADX_LG_24.ip', 'ADX_LV_06.ip', 'ADX_LV_24.ip', 'ADX_SP_06.ip', 'ADX_SP_24.ip', 'ADX_LN_06.id', 'ALM_LN_06.id', 'ALM_LV_06.ip', 'ALM_SP_06.ip', 'K3SPG_LV_06.ip', 'K3SPG_SP_06.ip', 'KKK_LN_06.id', 'KKK_LV_06.ip', 'KKK_SP_06.ip', 'ENDCN_LV_06.in', 'ENDCN_SP_06.in', 'bCD_LV_06.ip', 'bCD_SP_06.ip', 'ADX_LV_06.id', 'ADX_SP_06.id', 'ALM_LV_06.id', 'ALM_SP_06.id', 'D35_LN_06.id', 'K3SPG_LN_06.id', 'K3_LV_06.id', 'K3_SP_06.id', 'bCD_LN_06.id', 'D35_LV_06.id', 'D35_SP_06.id', 'K3SPG_LV_06.id', 'K3SPG_SP_06.id', 'bCD_LV_06.id', 'bCD_SP_06.id', 'ENDCN_KD_06.in', 'ENDCN_LG_06.in', 'Probes', 'Gene.symbol', 'ADX_KD_06.ip', 'ADX_KD_24.ip', 'ADX_LG_06.ip', 'ADX_LG_24.ip', 'ADX_LV_06.ip', 'ADX_LV_24.ip', 'ADX_SP_06.ip', 'ADX_SP_24.ip', 'ADX_LN_06.id', 'ALM_LN_06.id', 'ALM_LV_06.ip', 'ALM_SP_06.ip', 'K3SPG_LV_06.ip', 'K3SPG_SP_06.ip', 'KKK_LN_06.id', 'KKK_LV_06.ip', 'KKK_SP_06.ip', 'ENDCN_LV_06.in', 'ENDCN_SP_06.in', 'bCD_LV_06.ip', 'bCD_SP_06.ip', 'ADX_LV_06.id', 'ADX_SP_06.id', 'ALM_LV_06.id', 'ALM_SP_06.id', 'D35_LN_06.id', 'K3SPG_LN_06.id', 'K3_LV_06.id', 'K3_SP_06.id', 'bCD_LN_06.id', 'D35_LV_06.id', 'D35_SP_06.id', 'K3SPG_LV_06.id', 'K3SPG_SP_06.id', 'bCD_LV_06.id', 'bCD_SP_06.id', 'ENDCN_KD_06.in', 'ENDCN_LG_06.in']

# Matches headers shaped like 'ADX_KD_06.ip': a word, two word characters,
# two digits, a dot, then two word characters. Plain headers such as
# 'Probes' or 'GO1' do not match.
regex = re.compile(r'([\w\d]+)_(\w\w)_(\d\d)\.(\w\w)')

# key (group 1 + group 3, e.g. 'ADX06') -> list of positions in mylist,
# tracked separately for the first and the second header set.
first_part_dict = collections.defaultdict(list)
second_part_dict = collections.defaultdict(list)

# second instance of 'Probes', to separate the first and second parts
# (list.index raises ValueError if there is no second 'Probes' entry)
cutoff_index = mylist.index('Probes', 1)

for i, string in enumerate(mylist):
    matched = regex.match(string)
    if not matched:
        # Headers that do not follow the naming scheme are not indexed.
        continue
    rg1, rg2, rg3, rg4 = matched.groups()
    # Only the first and third groups form the key; rg2 and rg4 are ignored.
    key = rg1 + rg3
    if i < cutoff_index:
        first_part_dict[key].append(i)
    else:
        second_part_dict[key].append(i)
Result:
>>> first_part_dict
defaultdict(<class 'list'>, {'ALM06': [15, 16, 17, 29, 30], 'K3SPG06': [18, 19, 32, 38, 39], 'bCD06': [25, 26, 35, 40, 41], 'ADX24': [7, 9, 11, 13], 'ENDCN06': [23, 24, 42, 43], 'KKK06': [20, 21, 22], 'K306': [33, 34], 'ADX06': [6, 8, 10, 12, 14, 27, 28], 'D3506': [31, 36, 37]})
>>> second_part_dict
defaultdict(<class 'list'>, {'ALM06': [55, 56, 57, 69, 70], 'K3SPG06': [58, 59, 72, 78, 79], 'bCD06': [65, 66, 75, 80, 81], 'ADX24': [47, 49, 51, 53], 'ENDCN06': [63, 64, 82, 83], 'KKK06': [60, 61, 62], 'K306': [73, 74], 'ADX06': [46, 48, 50, 52, 54, 67, 68], 'D3506': [71, 76, 77]})