Obter últimas n linhas de um arquivo com Python, semelhante à cauda

https://stackoverflow.com/questions/136168

02-07-2019
|

Pergunta

Eu estou escrevendo um visualizador de arquivos de log de um aplicativo web e para que eu quiser paginar através das linhas do arquivo de log. Os itens no arquivo são linha de base com o mais novo item na parte inferior.

Então eu preciso de um método tail() que pode ler linhas n do fundo e suporta um deslocamento. O que eu vim com esta aparência:

def tail(f, n, offset=0):
    """Reads a n lines from f with an offset of offset lines."""
    avg_line_length = 74
    to_read = n + offset
    while 1:
        try:
            f.seek(-(avg_line_length * to_read), 2)
        except IOError:
            # woops.  apparently file is smaller than what we want
            # to step back, go to the beginning instead
            f.seek(0)
        pos = f.tell()
        lines = f.read().splitlines()
        if len(lines) >= to_read or pos == 0:
            return lines[-to_read:offset and -offset or None]
        avg_line_length *= 1.3

Esta é uma abordagem razoável? O que é a maneira recomendada para arquivos de log cauda com offsets?

Solução 6

O código acabei usando. Acho que este é o melhor até agora:

def tail(f, n, offset=None):
    """Reads a n lines from f with an offset of offset lines.  The return
    value is a tuple in the form ``(lines, has_more)`` where `has_more` is
    an indicator that is `True` if there are more lines in the file.
    """
    avg_line_length = 74
    to_read = n + (offset or 0)

    while 1:
        try:
            f.seek(-(avg_line_length * to_read), 2)
        except IOError:
            # woops.  apparently file is smaller than what we want
            # to step back, go to the beginning instead
            f.seek(0)
        pos = f.tell()
        lines = f.read().splitlines()
        if len(lines) >= to_read or pos == 0:
            return lines[-to_read:offset and -offset or None], \
                   len(lines) > to_read or pos > 0
        avg_line_length *= 1.3

Outras dicas

Isto pode ser mais rápido que o seu. Não faz suposições sobre o comprimento da linha. Costas através do arquivo de um bloco de cada vez até que seja encontrado o número correto de caracteres '\ n'.

def tail( f, lines=20 ):
    total_lines_wanted = lines

    BLOCK_SIZE = 1024
    f.seek(0, 2)
    block_end_byte = f.tell()
    lines_to_go = total_lines_wanted
    block_number = -1
    blocks = [] # blocks of size BLOCK_SIZE, in reverse order starting
                # from the end of the file
    while lines_to_go > 0 and block_end_byte > 0:
        if (block_end_byte - BLOCK_SIZE > 0):
            # read the last block we haven't yet read
            f.seek(block_number*BLOCK_SIZE, 2)
            blocks.append(f.read(BLOCK_SIZE))
        else:
            # file too small, start from begining
            f.seek(0,0)
            # only read what was not read
            blocks.append(f.read(block_end_byte))
        lines_found = blocks[-1].count('\n')
        lines_to_go -= lines_found
        block_end_byte -= BLOCK_SIZE
        block_number -= 1
    all_read_text = ''.join(reversed(blocks))
    return '\n'.join(all_read_text.splitlines()[-total_lines_wanted:])

Eu não gosto suposições difíceis sobre o comprimento da linha quando - como uma questão prática -. Você nunca pode saber essas coisas

De um modo geral, este irá localizar os últimos 20 linhas na primeira ou segunda passagem através do laço. Se a sua coisa caráter 74 é realmente preciso, você faz o tamanho do bloco 2048 e você vai cauda 20 linhas quase que imediatamente.

Além disso, eu não queimar muitas calorias cérebro tentando alinhamento finesse com blocos OS físicas. Usando esses pacotes de alto nível I / O, eu duvido que você vai ver consequência de qualquer tentativa de alinhar em limites de bloco OS performance. Se você usar de nível mais baixo I / O, então você pode ver um aumento de velocidade.

Assume um sistema unix-like em Python 2 você pode fazer:

import os
def tail(f, n, offset=0):
  stdin,stdout = os.popen2("tail -n "+n+offset+" "+f)
  stdin.close()
  lines = stdout.readlines(); stdout.close()
  return lines[:,-offset]

Para python 3 você pode fazer:

import subprocess
def tail(f, n, offset=0):
    proc = subprocess.Popen(['tail', '-n', n + offset, f], stdout=subprocess.PIPE)
    lines = proc.stdout.readlines()
    return lines[:, -offset]

Se ler o arquivo inteiro é aceitável, em seguida, usar um deque.

from collections import deque
deque(f, maxlen=n)

Antes de 2.6, deques não tem uma opção maxlen, mas é bastante fácil de implementar.

import itertools
def maxque(items, size):
    items = iter(items)
    q = deque(itertools.islice(items, size))
    for item in items:
        del q[0]
        q.append(item)
    return q

Se é um requisito para ler o arquivo a partir do final, em seguida, usar um galope (a.k.a exponencial) busca.

def tail(f, n):
    assert n >= 0
    pos, lines = n+1, []
    while len(lines) <= n:
        try:
            f.seek(-pos, 2)
        except IOError:
            f.seek(0)
            break
        finally:
            lines = list(f)
        pos *= 2
    return lines[-n:]

Aqui está a minha resposta. python puro. Usando timeit parece bastante rápido. Tailing 100 linhas de um arquivo de log que tem 100.000 linhas:

>>> timeit.timeit('tail.tail(f, 100, 4098)', 'import tail; f = open("log.txt", "r");', number=10)
0.0014600753784179688
>>> timeit.timeit('tail.tail(f, 100, 4098)', 'import tail; f = open("log.txt", "r");', number=100)
0.00899195671081543
>>> timeit.timeit('tail.tail(f, 100, 4098)', 'import tail; f = open("log.txt", "r");', number=1000)
0.05842900276184082
>>> timeit.timeit('tail.tail(f, 100, 4098)', 'import tail; f = open("log.txt", "r");', number=10000)
0.5394978523254395
>>> timeit.timeit('tail.tail(f, 100, 4098)', 'import tail; f = open("log.txt", "r");', number=100000)
5.377126932144165

Aqui está o código:

import os


def tail(f, lines=1, _buffer=4098):
    """Tail a file and get X lines from the end"""
    # place holder for the lines found
    lines_found = []

    # block counter will be multiplied by buffer
    # to get the block size from the end
    block_counter = -1

    # loop until we find X lines
    while len(lines_found) < lines:
        try:
            f.seek(block_counter * _buffer, os.SEEK_END)
        except IOError:  # either file is too small, or too many lines requested
            f.seek(0)
            lines_found = f.readlines()
            break

        lines_found = f.readlines()

        # we found enough lines, get out
        # Removed this line because it was redundant the while will catch
        # it, I left it for history
        # if len(lines_found) > lines:
        #    break

        # decrement the block counter to get the
        # next X bytes
        block_counter -= 1

    return lines_found[-lines:]

A resposta de S. Lott acima quase funciona para mim, mas acaba dando me linhas parciais. Acontece que ele corrompe dados sobre limites dos blocos, pois os dados detém os blocos de leitura em ordem inversa. Quando '' .join (dados) é chamado, os blocos estão na ordem errada. Isso corrige isso.

def tail(f, window=20):
    """
    Returns the last `window` lines of file `f` as a list.
    f - a byte file-like object
    """
    if window == 0:
        return []
    BUFSIZ = 1024
    f.seek(0, 2)
    bytes = f.tell()
    size = window + 1
    block = -1
    data = []
    while size > 0 and bytes > 0:
        if bytes - BUFSIZ > 0:
            # Seek back one whole BUFSIZ
            f.seek(block * BUFSIZ, 2)
            # read BUFFER
            data.insert(0, f.read(BUFSIZ))
        else:
            # file too small, start from begining
            f.seek(0,0)
            # only read what was not read
            data.insert(0, f.read(bytes))
        linesFound = data[0].count('\n')
        size -= linesFound
        bytes -= BUFSIZ
        block -= 1
    return ''.join(data).splitlines()[-window:]

Uma solução simples e rápida com mmap:

import mmap
import os

def tail(filename, n):
    """Returns last n lines from the filename. No exception handling"""
    size = os.path.getsize(filename)
    with open(filename, "rb") as f:
        # for Windows the mmap parameters are different
        fm = mmap.mmap(f.fileno(), 0, mmap.MAP_SHARED, mmap.PROT_READ)
        try:
            for i in xrange(size - 1, -1, -1):
                if fm[i] == '\n':
                    n -= 1
                    if n == -1:
                        break
            return fm[i + 1 if i else 0:].splitlines()
        finally:
            fm.close()

Uma ainda mais limpa python3 versão compatível que não inserir mas Anexa e reveses:

def tail(f, window=1):
    """
    Returns the last `window` lines of file `f` as a list of bytes.
    """
    if window == 0:
        return b''
    BUFSIZE = 1024
    f.seek(0, 2)
    end = f.tell()
    nlines = window + 1
    data = []
    while nlines > 0 and end > 0:
        i = max(0, end - BUFSIZE)
        nread = min(end, BUFSIZE)

        f.seek(i)
        chunk = f.read(nread)
        data.append(chunk)
        nlines -= chunk.count(b'\n')
        end -= nread
    return b'\n'.join(b''.join(reversed(data)).splitlines()[-window:])

usá-lo como este:

with open(path, 'rb') as f:
    last_lines = tail(f, 3).decode('utf-8')

Eu encontrei o Popen acima para ser a melhor solução. É rápido e sujo e ele funciona Para python 2.6 na máquina Unix eu usei o seguinte

    def GetLastNLines(self, n, fileName):
    """
    Name:           Get LastNLines
    Description:        Gets last n lines using Unix tail
    Output:         returns last n lines of a file
    Keyword argument:
    n -- number of last lines to return
    filename -- Name of the file you need to tail into
    """
    p=subprocess.Popen(['tail','-n',str(n),self.__fileName], stdout=subprocess.PIPE)
    soutput,sinput=p.communicate()
    return soutput

soutput terá conterá últimos n linhas do código. para percorrer soutput linha por linha fazer:

for line in GetLastNLines(50,'myfile.log').split('\n'):
    print line

solução Atualização @papercrane para python3. Abra o arquivo com open(filename, 'rb') e:

def tail(f, window=20):
    """Returns the last `window` lines of file `f` as a list.
    """
    if window == 0:
        return []

    BUFSIZ = 1024
    f.seek(0, 2)
    remaining_bytes = f.tell()
    size = window + 1
    block = -1
    data = []

    while size > 0 and remaining_bytes > 0:
        if remaining_bytes - BUFSIZ > 0:
            # Seek back one whole BUFSIZ
            f.seek(block * BUFSIZ, 2)
            # read BUFFER
            bunch = f.read(BUFSIZ)
        else:
            # file too small, start from beginning
            f.seek(0, 0)
            # only read what was not read
            bunch = f.read(remaining_bytes)

        bunch = bunch.decode('utf-8')
        data.insert(0, bunch)
        size -= bunch.count('\n')
        remaining_bytes -= BUFSIZ
        block -= 1

    return ''.join(data).splitlines()[-window:]

Publicação de uma resposta a mando de comentadores em minha resposta a uma pergunta semelhante onde a mesma técnica foi utilizada para transformar a última linha de um arquivo, não apenas obtê-lo.

Para um arquivo de tamanho significativo, mmap é a melhor maneira de faça isso. Para melhorar a resposta mmap existente, esta versão é portátil entre Windows e Linux, e deve correr mais rápido (embora ele não vai trabalhar sem algumas modificações no 32 bit Python com arquivos na faixa GB, consulte o outra resposta para dicas sobre manuseio deste, e para modificar a trabalhar em Python 2 ).

import io  # Gets consistent version of open for both Py2.7 and Py3.x
import itertools
import mmap

def skip_back_lines(mm, numlines, startidx):
    '''Factored out to simplify handling of n and offset'''
    for _ in itertools.repeat(None, numlines):
        startidx = mm.rfind(b'\n', 0, startidx)
        if startidx < 0:
            break
    return startidx

def tail(f, n, offset=0):
    # Reopen file in binary mode
    with io.open(f.name, 'rb') as binf, mmap.mmap(binf.fileno(), 0, access=mmap.ACCESS_READ) as mm:
        # len(mm) - 1 handles files ending w/newline by getting the prior line
        startofline = skip_back_lines(mm, offset, len(mm) - 1)
        if startofline < 0:
            return []  # Offset lines consumed whole file, nothing to return
            # If using a generator function (yield-ing, see below),
            # this should be a plain return, no empty list

        endoflines = startofline + 1  # Slice end to omit offset lines

        # Find start of lines to capture (add 1 to move from newline to beginning of following line)
        startofline = skip_back_lines(mm, n, startofline) + 1

        # Passing True to splitlines makes it return the list of lines without
        # removing the trailing newline (if any), so list mimics f.readlines()
        return mm[startofline:endoflines].splitlines(True)
        # If Windows style \r\n newlines need to be normalized to \n, and input
        # is ASCII compatible, can normalize newlines with:
        # return mm[startofline:endoflines].replace(os.linesep.encode('ascii'), b'\n').splitlines(True)

Isso pressupõe o número de linhas atado é pequeno o suficiente, você pode lê-los todos com segurança para a memória de uma só vez; Você também pode fazer isso uma função de gerador e ler manualmente uma linha de cada vez, substituindo a linha final com:

        mm.seek(startofline)
        # Call mm.readline n times, or until EOF, whichever comes first
        # Python 3.2 and earlier:
        for line in itertools.islice(iter(mm.readline, b''), n):
            yield line

        # 3.3+:
        yield from itertools.islice(iter(mm.readline, b''), n)

Por fim, esta leitura em modo binário (necessário uso mmap) para que ele dá linhas str (PY2) e linhas bytes (PY3); se você quiser unicode (Py2) ou str (PY3), a abordagem iterativa pode ser ajustado para decodificar para você e / ou corrigir novas linhas:

        lines = itertools.islice(iter(mm.readline, b''), n)
        if f.encoding:  # Decode if the passed file was opened with a specific encoding
            lines = (line.decode(f.encoding) for line in lines)
        if 'b' not in f.mode:  # Fix line breaks if passed file opened in text mode
            lines = (line.replace(os.linesep, '\n') for line in lines)
        # Python 3.2 and earlier:
        for line in lines:
            yield line
        # 3.3+:
        yield from lines

Nota: eu digitei isso tudo em uma máquina onde eu não têm acesso à Python para teste. Por favor, deixe-me saber se eu typoed nada; este foi o suficiente semelhante ao minha outra resposta que eu pensar ele deve funcionar, mas os ajustes (por exemplo, lidar com um offset) poderia levar a erros sutis. Por favor, deixe-me saber nos comentários se existem erros.

com base no topo do S.Lott votou resposta (25 de setembro '08 em 21:43), mas fixo para arquivos pequenos.

def tail(the_file, lines_2find=20):  
    the_file.seek(0, 2)                         #go to end of file
    bytes_in_file = the_file.tell()             
    lines_found, total_bytes_scanned = 0, 0
    while lines_2find+1 > lines_found and bytes_in_file > total_bytes_scanned: 
        byte_block = min(1024, bytes_in_file-total_bytes_scanned)
        the_file.seek(-(byte_block+total_bytes_scanned), 2)
        total_bytes_scanned += byte_block
        lines_found += the_file.read(1024).count('\n')
    the_file.seek(-total_bytes_scanned, 2)
    line_list = list(the_file.readlines())
    return line_list[-lines_2find:]

    #we read at least 21 line breaks from the bottom, block by block for speed
    #21 to ensure we don't get a half line

Hope isso é útil.

Existem algumas implementações existentes de cauda no pypi que você pode instalar usando pip:

mtFileUtil
multitail
log4tailer
...

Dependendo da situação, pode haver vantagens em usar uma dessas ferramentas existentes.

Aqui é uma implementação muito simples:

with open('/etc/passwd', 'r') as f:
  try:
    f.seek(0,2)
    s = ''
    while s.count('\n') < 11:
      cur = f.tell()
      f.seek((cur - 10))
      s = f.read(10) + s
      f.seek((cur - 10))
    print s
  except Exception as e:
    f.readlines()

Simples:

with open("test.txt") as f:
data = f.readlines()
tail = data[-2:]
print(''.join(tail)

Para a eficiência com arquivos muito grandes (comuns em situações de arquivo de log onde você pode querer cauda uso), você geralmente querem evitar lendo o arquivo inteiro (mesmo se você fazê-lo sem ler o arquivo inteiro na memória de uma vez) Entretanto , você precisa de alguma forma trabalhar para fora o deslocamento em linhas em vez de caracteres. Uma possibilidade é a leitura para trás com seek () caractere por caractere, mas isso é muito lento. Em vez disso, o seu melhor para processo em blocos maiores.

Eu tenho uma função de utilidade que eu escrevi há um tempo atrás para ler arquivos trás que podem ser usados ??aqui.

import os, itertools

def rblocks(f, blocksize=4096):
    """Read file as series of blocks from end of file to start.

    The data itself is in normal order, only the order of the blocks is reversed.
    ie. "hello world" -> ["ld","wor", "lo ", "hel"]
    Note that the file must be opened in binary mode.
    """
    if 'b' not in f.mode.lower():
        raise Exception("File must be opened using binary mode.")
    size = os.stat(f.name).st_size
    fullblocks, lastblock = divmod(size, blocksize)

    # The first(end of file) block will be short, since this leaves 
    # the rest aligned on a blocksize boundary.  This may be more 
    # efficient than having the last (first in file) block be short
    f.seek(-lastblock,2)
    yield f.read(lastblock)

    for i in range(fullblocks-1,-1, -1):
        f.seek(i * blocksize)
        yield f.read(blocksize)

def tail(f, nlines):
    buf = ''
    result = []
    for block in rblocks(f):
        buf = block + buf
        lines = buf.splitlines()

        # Return all lines except the first (since may be partial)
        if lines:
            result.extend(lines[1:]) # First line may not be complete
            if(len(result) >= nlines):
                return result[-nlines:]

            buf = lines[0]

    return ([buf]+result)[-nlines:]


f=open('file_to_tail.txt','rb')
for line in tail(f, 20):
    print line

[Edit] versão mais específica Acrescentado (evita precisar reverter duas vezes)

Você pode ir para o final do seu arquivo com f.seek (0, 2) e, em seguida, ler fora das linhas de um por um com a seguinte substituição para readline ():

def readline_backwards(self, f):
    backline = ''
    last = ''
    while not last == '\n':
        backline = last + backline
        if f.tell() <= 0:
            return backline
        f.seek(-1, 1)
        last = f.read(1)
        f.seek(-1, 1)
    backline = last
    last = ''
    while not last == '\n':
        backline = last + backline
        if f.tell() <= 0:
            return backline
        f.seek(-1, 1)
        last = f.read(1)
        f.seek(-1, 1)
    f.seek(1, 1)
    return backline

Com base na resposta Eyecue (Jun 10 '10 em 21:28.): Esta cabeça classe add () eo método de cauda () para objeto de arquivo

class File(file):
    def head(self, lines_2find=1):
        self.seek(0)                            #Rewind file
        return [self.next() for x in xrange(lines_2find)]

    def tail(self, lines_2find=1):  
        self.seek(0, 2)                         #go to end of file
        bytes_in_file = self.tell()             
        lines_found, total_bytes_scanned = 0, 0
        while (lines_2find+1 > lines_found and
               bytes_in_file > total_bytes_scanned): 
            byte_block = min(1024, bytes_in_file-total_bytes_scanned)
            self.seek(-(byte_block+total_bytes_scanned), 2)
            total_bytes_scanned += byte_block
            lines_found += self.read(1024).count('\n')
        self.seek(-total_bytes_scanned, 2)
        line_list = list(self.readlines())
        return line_list[-lines_2find:]

Uso:

f = File('path/to/file', 'r')
f.head(3)
f.tail(3)

Várias dessas soluções têm problemas se o arquivo não termina em \ n ou para garantir a primeira linha completa é lido.

def tail(file, n=1, bs=1024):
    f = open(file)
    f.seek(-1,2)
    l = 1-f.read(1).count('\n') # If file doesn't end in \n, count it anyway.
    B = f.tell()
    while n >= l and B > 0:
            block = min(bs, B)
            B -= block
            f.seek(B, 0)
            l += f.read(block).count('\n')
    f.seek(B, 0)
    l = min(l,n) # discard first (incomplete) line if l > n
    lines = f.readlines()[-l:]
    f.close()
    return lines

Eu tive que ler um valor específico a partir da última linha de um arquivo, e deparei com esta discussão. Em vez de reinventar a roda em Python, eu acabei com um shell script minúsculo, salvo como / Usr / local / bin / get_last_netp:

#! /bin/bash
tail -n1 /home/leif/projects/transfer/export.log | awk {'print $14'}

E no programa Python:

from subprocess import check_output

last_netp = int(check_output("/usr/local/bin/get_last_netp"))

Não é o primeiro exemplo usando um deque, mas uma mais simples. Este é geral:. Ele funciona em qualquer objeto iterável, e não apenas um arquivo

#!/usr/bin/env python
import sys
import collections
def tail(iterable, N):
    deq = collections.deque()
    for thing in iterable:
        if len(deq) >= N:
            deq.popleft()
        deq.append(thing)
    for thing in deq:
        yield thing
if __name__ == '__main__':
    for line in tail(sys.stdin,10):
        sys.stdout.write(line)

This is my version of tailf

import sys, time, os

filename = 'path to file'

try:
    with open(filename) as f:
        size = os.path.getsize(filename)
        if size < 1024:
            s = size
        else:
            s = 999
        f.seek(-s, 2)
        l = f.read()
        print l
        while True:
            line = f.readline()
            if not line:
                time.sleep(1)
                continue
            print line
except IOError:
    pass

import time

attemps = 600
wait_sec = 5
fname = "YOUR_PATH"

with open(fname, "r") as f:
    where = f.tell()
    for i in range(attemps):
        line = f.readline()
        if not line:
            time.sleep(wait_sec)
            f.seek(where)
        else:
            print line, # already has newline

import itertools
fname = 'log.txt'
offset = 5
n = 10
with open(fname) as f:
    n_last_lines = list(reversed([x for x in itertools.islice(f, None)][-(offset+1):-(offset+n+1):-1]))

abc = "2018-06-16 04:45:18.68"
filename = "abc.txt"
with open(filename) as myFile:
    for num, line in enumerate(myFile, 1):
        if abc in line:
            lastline = num
print "last occurance of work at file is in "+str(lastline)

Não é muito útil módulo que pode fazer isso:

from file_read_backwards import FileReadBackwards

with FileReadBackwards("/tmp/file", encoding="utf-8") as frb:

# getting lines by lines starting from the last line up
for l in frb:
    print(l)

Pensando bem, esta é provavelmente tão rápido quanto qualquer coisa aqui.

def tail( f, window=20 ):
    lines= ['']*window
    count= 0
    for l in f:
        lines[count%window]= l
        count += 1
    print lines[count%window:], lines[:count%window]

É muito mais simples. E parece rasgar junto a um bom ritmo.

Eu encontrei um a maneira provavelmente mais fácil para encontrar as primeiras ou últimas N linhas de um arquivo

Última N linhas de um arquivo (ex: N = 10)

file=open("xyz.txt",'r")
liner=file.readlines()
for ran in range((len(liner)-N),len(liner)):
    print liner[ran]

Primeiro N linhas de um ficheiro (por ex: N = 10)

file=open("xyz.txt",'r")
liner=file.readlines()
for ran in range(0,N+1):
    print liner[ran]

é tão simples:

def tail(fname,nl):
with open(fname) as f:
    data=f.readlines() #readlines return a list
    print(''.join(data[-nl:]))

Embora este não é realmente no lado eficiente com arquivos grandes, este código é bem simples:

Ele lê o objeto de arquivo, f.
Ele divide a string retornada usando novas linhas, \n.
Ela recebe as listas de matriz últimos índices, usando o sinal negativo a repousar durante os últimos índices, ea : para obter um subarray.
```
def tail(f,n):
    return "\n".join(f.read().split("\n")[-n:])
```

Licenciado em: CC-BY-SA com atribuição

Não afiliado a StackOverflow