I have a text file of lines (several GB, ~12 million lines), where each line is a point x, y, z plus accessory info. I wish to read the file chunk-by-chunk, process the points, and split the result (according to a spatial index based on each point's position relative to a 0.25 m square grid) into several text files in a temporary folder.
# File: readline-example-3.py
# effbot.org pattern: read a huge file in bounded-memory chunks of whole lines.
# Fixes over the quoted snippet: a context manager guarantees the file is
# closed, "while True" replaces the dated "while 1", and the handle no longer
# shadows the builtin name "file".
with open("sample.txt") as infile:
    while True:
        # readlines(hint) returns as many complete lines as fit in ~100 kB.
        lines = infile.readlines(100000)
        if not lines:
            break
        for line in lines:
            pass  # do something
my code is the following:
from __future__ import division
import os
import glob
import tempfile
import sys
def print_flulsh(n, maxvalue=None):
    """Redraw the current console line with a progress message.

    n        -- number of laser points processed so far
    maxvalue -- total point count; when given, an "n of total" form is used
    """
    if maxvalue is None:
        message = "Laser points processed: %d" % n
    else:
        message = "%d of %d laser points processed" % (n, maxvalue)
    # \r moves the cursor back to column 0 so the line is overwritten in place.
    sys.stdout.write("\r" + message)
    sys.stdout.flush()
def point_grid_id(x, y, minx, maxy, size):
    """Return the (row, col) grid cell containing the point (x, y).

    The origin (minx, maxy) is the upper-left corner of the grid: rows
    grow downwards (decreasing y), columns grow rightwards (increasing x),
    and `size` is the edge length of a square cell.
    """
    return int((maxy - y) / size), int((x - minx) / size)
def tempfile_tile_name(line, temp_dir, minx, maxy, size, parse):
    """Build the path of the temporary tile file a data line belongs to.

    line     -- one text record whose first two fields are x and y
    temp_dir -- folder holding the per-tile temporary files
    minx, maxy -- grid origin (upper-left corner)
    size     -- grid cell size
    parse    -- field delimiter of the text file

    Returns a normalized path named tempfile_<row>_<col>.tmp after the cell.
    """
    x, y = line.split(parse)[:2]
    row, col = point_grid_id(float(x), float(y), minx, maxy, size)
    # os.path.join inserts the separator itself; the original's
    # "temp_dir + os.sep" was redundant and only worked because normpath
    # collapsed the doubled separator afterwards.
    return os.path.normpath(os.path.join(temp_dir, "tempfile_%s_%s.tmp" % (row, col)))
# split the text file in small text files following the ID value given by tempfile_tile_name
# where:
# filename : name+path of text file
# temp_dir: temporary folder
# minx, maxy: origin of the grid (left-up corner)
# size: size of the grid
# parse: delimiter of the text file
# num: number of lines (~ 12 millions)
def tempfile_split(filename, temp_dir, minx, maxy, size, parse, num):
    """Split a huge point file into per-tile temporary files.

    filename -- path of the input text file (~12 million lines)
    temp_dir -- temporary folder receiving the tempfile_<row>_<col>.tmp files
    minx, maxy -- grid origin (upper-left corner)
    size     -- grid cell size
    parse    -- field delimiter of the text file
    num      -- total line count, used only for the progress display

    The original version opened and closed an output file for every single
    input line, which is the real bottleneck once millions of tiny files
    exist.  Lines are now accumulated per output file and flushed in
    batches, so each tile file is opened once per batch instead of once
    per line.
    """
    buffers = {}          # tile file path -> list of pending lines
    pending = 0           # total lines currently buffered across all tiles
    max_pending = 100000  # flush threshold, ~the original readlines() hint
    index = 1
    with open(filename) as infile:
        # Iterating the file object streams one line at a time through
        # Python's buffered I/O, so the whole file never sits in memory.
        for line in infile:
            print_flulsh(index, num)
            index += 1
            name = tempfile_tile_name(line, temp_dir, minx, maxy, size, parse)
            buffers.setdefault(name, []).append(line)
            pending += 1
            if pending >= max_pending:
                _flush_buffers(buffers)
                pending = 0
    # Write out whatever is left after the last full batch.
    _flush_buffers(buffers)

def _flush_buffers(buffers):
    """Append each buffered batch of lines to its tile file, then empty the dict."""
    for name, lines in buffers.items():
        with open(name, 'a') as outfile:
            outfile.writelines(lines)
    buffers.clear()
The main problem with my code is a decrease in speed once ~2 million split text files have been saved in the temporary folder. With respect to the effbot.org solution above, I wish to know whether there is an optimized method to create a buffer?
Solution
The bottleneck in your code is not in reading, but in opening and closing an output file for every line read. In comments you mention your final objective: After the split i need to open again each file and select only one line random.
theodox mentions a possible approach: taking the first entry for each ID and then randomly overwriting it in memory. Note that the overwriting must take place with probability 1/n, where n is the number of lines so far seen with the same ID, to avoid a bias towards later samples.
EDIT. You can save memory by doing two passes over the file. The first pass builds a set of line numbers excluded by the random selection, the second pass processes the lines that are not excluded.
from random import random
def random_selection(filename, temp_dir, minx, maxy, size, parse, num):
    """Yield one uniformly random line per grid cell, using two passes.

    Pass 1 runs reservoir sampling (k=1) per cell: the n-th line seen for a
    cell replaces the current pick with probability 1/n, which makes every
    line of that cell equally likely.  Losing line numbers are collected in
    `excluded`.  Pass 2 re-reads the file and yields only the winners.

    temp_dir and num are unused here; they are kept so the signature
    matches tempfile_split for interface compatibility.
    """
    selection = {}   # (row, col) -> (lines seen so far, selected line number)
    excluded = set() # line numbers rejected by the sampling
    with open(filename) as infile:
        for i, line in enumerate(infile):
            x, y, _ = line.split(parse, 2)
            row_col = point_grid_id(float(x), float(y), minx, maxy, size)
            try:
                n, selected_i = selection[row_col]
            except KeyError:
                # First line of this cell: tentatively select it.
                selection[row_col] = 1, i
            else:
                n += 1
                if random() < 1.0 / n:
                    # New line wins: the previous pick is now excluded.
                    excluded.add(selected_i)
                    selected_i = i
                else:
                    # BUG FIX: a rejected candidate must also be excluded,
                    # otherwise the second pass would process it as well.
                    excluded.add(i)
                selection[row_col] = n, selected_i
    with open(filename) as infile:
        for i, line in enumerate(infile):
            if i not in excluded:
                yield line  # exactly one random line per grid cell