Opening a text file in Python as an array or a list of lists

Question 1

You can supply the column widths as the `delimiter` argument of `np.genfromtxt`. Example:

import numpy as np
import sys

# Read the fixed-width track file and let numpy infer each column's type.
# NOTE(review): the 'U' mode flag is a Python 2 idiom; confirm the target
# interpreter still accepts it.
with open('ebtrk_atlc.txt', 'rU') as f:
    # Character width of every column, left to right. Passing a list of
    # integers as `delimiter` makes genfromtxt cut each line at these widths
    # instead of splitting on a separator.
    widths = [7, 10, 7, 4, 5, 6, 4, 5, 4, 4, 5, 4, 4, 3, 3, 3]
    # dtype=None: the type of each column is determined from its contents.
    data = np.genfromtxt(f, dtype=None, delimiter=widths)
    # A single parenthesised argument prints identically with the Python 2
    # print statement and the Python 3 print function.
    print(data)

will give as output (omitting the first few lines)

('AL0188 ', 'ALBERTO   ', 80712, 1988, 41.5, 69.0, 35, 1002, -99, -99, 1012, 60, 100, 100, 50, 50)
('AL0188 ', 'ALBERTO   ', 80718, 1988, 43.0, 67.5, 35, 1002, -99, -99, 1008, 50, 100, 100, 50, 50)
('AL0188 ', 'ALBERTO   ', 80800, 1988, 45.0, 65.5, 35, 1004, -99, -99, 1008, 50, -99, -99, -99, -99)

As you can see, the fused `100100` field got separated. Of course, you still have to supply the correct field types and widths; this example just demonstrates that it is possible. For example, changing the code to

import numpy as np
import re
import sys

# Read the fixed-width track file with an explicit type for every column.
# NOTE(review): the 'U' mode flag is a Python 2 idiom; confirm the target
# interpreter still accepts it.
with open('ebtrk_atlc.txt', 'rU') as f:
    # One comma-separated entry per column: a type letter followed by a
    # number. The same number is reused below as the column's width.
    # NOTE(review): sizes such as i3, i5, f5 and f6 are not standard numpy
    # item sizes - verify this dtype string against the numpy version in use.
    dt = "a7,a10,a7,i4,f5,f6,i4,i5,i4,i4,i5,i4,i4,i3,i3,i3"
    # Drop the first type letter, then split on every ",<letter>" so that
    # only the numbers remain; those are the column widths.
    widths = [int(width) for width in re.split(",?[a-z]", dt[1:])]
    # autostrip=True removes the padding whitespace around each value.
    data = np.genfromtxt(f, dtype=dt, delimiter=widths, autostrip=True)

will change the result to

('AL0188', 'ALBERTO', '080712', 1988, 41.5, 69.0, 35, 1002, -99, -99, 1012, 60, 100, 100, 50, 50)
('AL0188', 'ALBERTO', '080718', 1988, 43.0, 67.5, 35, 1002, -99, -99, 1008, 50, 100, 100, 50, 50)
('AL0188', 'ALBERTO', '080800', 1988, 45.0, 65.5, 35, 1004, -99, -99, 1008, 50, -99, -99, -99, -99)

This strips the whitespace around the strings and explicitly sets some of the types to float. Further documentation can be found in the `genfromtxt` documentation; check the example at the bottom of that page.

Question 2

Old-fashioned parsing is possible since the structure is nicely ordered; it is a bit lengthy, but it seems to do the trick.

Before:

$ awk '{print NF}' ebtrk_atlc.txt  | sort | uniq -c
     79 17
     16 18
     92 19
    494 20
    308 21
    405 22
   1769 23
    897 24
   1329 25
   5444 26
     27 27

after:

$ awk '{print NF}' log  | sort | uniq -c
   8778 27
   2082 28

code:

#!/usr/bin/env python

def chunks(l, n):
    """Cut *l* into consecutive slices of length *n*.

    The last slice is shorter when ``len(l)`` is not a multiple of *n*.
    Works on anything that supports ``len()`` and slicing (lists, strings).
    Returns a list of the slices, in order.
    """
    pieces = []
    for start in range(0, len(l), n):
        pieces.append(l[start:start + n])
    return pieces

# A row with at least this many columns is treated as already complete.
TARGET_COLUMNS = 26
# Only this many trailing columns are examined for fused fields.
TAIL_COLUMNS = 13
# Fused digit runs are cut into fields of this many characters.
FIELD_WIDTH = 3


def _split_digits(token):
    """Split a run of digits made of fused, right-aligned 3-character fields.

    Only all-digit tokens of 5 to 12 characters are split; anything else
    (short values, decimals such as ``105.5``) is returned unchanged as a
    one-element list. The 1 or 2 characters left over at the front become
    their own field, e.g. ``50100100`` -> ``50, 100, 100`` and
    ``0285195`` -> ``0, 285, 195``.
    """
    if not token.isdigit() or not 5 <= len(token) <= 12:
        return [token]
    head = len(token) % FIELD_WIDTH
    parts = [token[:head]] if head else []
    parts.extend(token[start:start + FIELD_WIDTH]
                 for start in range(head, len(token), FIELD_WIDTH))
    return parts


def _split_token(token):
    """Return the list of fields hidden in one whitespace-delimited token."""
    if '-' in token:
        # Negative values fused together, e.g. "-99-99-99". Text before the
        # first '-' is a positive value and must not get a sign invented.
        head, _, tail = token.partition('-')
        pieces = _split_digits(head) if head else []
        pieces.extend('-' + part for part in tail.split('-') if part)
    else:
        pieces = _split_digits(token)

    # A lone 0 followed by a full-width field shows up as e.g. "0285";
    # separate the leading zero from the value after it.
    fields = []
    for piece in pieces:
        if piece.isdigit() and piece.startswith('0') and len(piece) > 2:
            fields.append(piece[0])
            fields.append(piece[1:])
        else:
            fields.append(piece)
    return fields


def repair_columns(cols):
    """Undo field fusion in the trailing columns of one split line.

    cols -- the whitespace-split columns of a line.

    Rows that already have TARGET_COLUMNS or more columns are returned
    unchanged. Otherwise the last TAIL_COLUMNS columns are re-split and the
    columns before them are kept as they are. An empty row yields an empty
    list.
    """
    if len(cols) >= TARGET_COLUMNS:
        return cols
    repaired = []
    for token in cols[-TAIL_COLUMNS:]:
        repaired.extend(_split_token(token))
    return cols[:-TAIL_COLUMNS] + repaired


def main():
    """Print the column count and repaired columns of every input line."""
    with open("ebtrk_atlc.txt") as fd:
        for line in fd:
            data = repair_columns(line.split())
            # Single-argument form: same output as `print len(data), data`
            # and valid under both Python 2 and Python 3.
            print("%d %s" % (len(data), data))


if __name__ == "__main__":
    main()