Estrarre informazioni da grandi file di testo strutturati

https://stackoverflow.com/questions/481862

20-08-2019
|

Domanda

Devo leggere alcuni file di grandi dimensioni (da 50k a 100k righe), strutturati in gruppi separati da righe vuote. Ogni gruppo inizia con lo stesso schema "No.999999999 gg/mm/aaaa ZZZ". Ecco alcuni dati di esempio.

No.813829461  16/09/1987  270
  Tit.SUZANO PAPEL E CELULOSE S.A. (BR / BA)
  C.N.P.J./C.I.C./N INPI: 16404287000155
  Procurador: MARCELLO DO NASCIMENTO

No.815326777  28/12/1989  351
  Tit.SIGLA SISTEMA GLOBO DE GRAVACOES AUDIO VISUAIS LTDA (BR / RJ)
  C.N.P.J./C.I.C./NºINPI: 34162651000108
  Apres.: Nominativa ; Nat.: De Produto
  Marca: TRIO TROPICAL
  Clas.Prod/Serv: 09.40
  * DEFERIDO CONFORME RESOLUÇÃO 123 DE 06/01/2006, PUBLICADA NA RPI 1829, DE 24/01/2006.
  Procurador: WALDEMAR RODRIGUES PEDRA

No.900148764  11/01/2007  LD3
  Tit.TIARA BOLSAS E CALÇADOS LTDA
  Procurador: Marcia Ferreira Gomes
  * Escritório: Marcas Marcantes e Patentes Ltda
  * Exigência Formal não respondida Satisfatoriamente, Pedido de Registro de Marca considerado inexistente, de acordo com Art. 157 da LPI
  * Protocolo da Petição de cumprimento de Exigência Formal: 810080140197

Ho scritto del codice che li analizza di conseguenza. C'è qualcosa che posso migliorare, in termini di leggibilità o di prestazioni? Ecco quello che ho ottenuto finora:

import re, pprint

class Despacho(object):
    """
    One record of the file, filled in one line at a time.

    Each line passed to ``read`` is tried against the patterns in
    ``regexp``; the handler paired with the first matching pattern stores
    the captured groups as attributes on the instance.
    """
    # Maps a compiled pattern to a callable that, given the instance,
    # returns the bound handler for that kind of line.  Literal dots are
    # escaped so that, for example, 'Tit.' no longer matches 'Title'.
    regexp = {
        re.compile(r'No\.([\d]{9})  ([\d]{2}/[\d]{2}/[\d]{4})  (.*)'): lambda self: self._processo,
        re.compile(r'Tit\.(.*)'): lambda self: self._titular,
        re.compile(r'Procurador: (.*)'): lambda self: self._procurador,
        re.compile(r'C\.N\.P\.J\./C\.I\.C\./N INPI :(.*)'): lambda self: self._documento,
        re.compile(r'Apres\.: (.*) ; Nat\.: (.*)'): lambda self: self._apresentacao,
        re.compile(r'Marca: (.*)'): lambda self: self._marca,
        re.compile(r'Clas\.Prod/Serv: (.*)'): lambda self: self._classe,
        re.compile(r'\*(.*)'): lambda self: self._complemento,
    }

    def __init__(self):
        """
        'complemento' is the only field that can be multiple in a single registry
        """
        self.complemento = []

    def _processo(self, matches):
        # Header line: record number, date and dispatch code.
        self.processo, self.data, self.despacho = matches.groups()

    def _titular(self, matches):
        self.titular = matches.group(1)

    def _procurador(self, matches):
        self.procurador = matches.group(1)

    def _documento(self, matches):
        self.documento = matches.group(1)

    def _apresentacao(self, matches):
        # One line carries two fields.
        self.apresentacao, self.natureza = matches.groups()

    def _marca(self, matches):
        self.marca = matches.group(1)

    def _classe(self, matches):
        self.classe = matches.group(1)

    def _complemento(self, matches):
        # Accumulates: a record may contain several '*' lines.
        self.complemento.append(matches.group(1))

    def read(self, line):
        """
        Store the data found in *line*, if it matches a known pattern.

        Lines that match no pattern are ignored.  The patterns start with
        distinct literal prefixes, so at most one can match and the search
        stops at the first hit.
        """
        for pattern, handler in Despacho.regexp.items():
            m = pattern.match(line)
            if m:
                handler(self)(m)
                return


def process(rpi):
    """
    Yield one Despacho per group found in *rpi*.

    *rpi* is any iterable of lines (an open file works).  A group starts
    at a line beginning with 'No.' and ends at the next blank line, at the
    next 'No.' line, or at the end of the input, so the last group is
    yielded even without a trailing blank line.  Lines outside any group
    are ignored.
    """
    d = None  # record being filled, or None while between groups

    for line in rpi:
        if line.startswith('No.'):
            if d is not None:
                # The previous group was not closed by a blank line.
                yield d
            d = Despacho()
        elif not line.strip():  # empty line - end of block
            if d is not None:
                yield d
                d = None
            continue

        if d is not None:
            d.read(line)

    if d is not None:
        yield d


# Print every parsed record; the 'with' block closes the file even if
# parsing raises.
with open('rm1972.txt') as arquivo:  # file to process
    for desp in process(arquivo):
        pprint.pprint(desp.__dict__)
        print('--------------')

Soluzione

Questo è abbastanza buono. Di seguito alcuni suggerimenti, fammi sapere se ti piacciono:

import re
import pprint
import sys

class Despacho(object):
    """
    One record of the file, filled in one line at a time.

    Each line passed to ``read`` is tried against the patterns in
    ``regexp``; the captured groups of the first matching pattern are
    stored under the attribute names paired with it.
    """
    # Maps a tuple of attribute names to the pattern whose groups fill
    # them, in order.  Literal dots are escaped so that, for example,
    # 'Tit.' no longer matches 'Title'.
    regexp = {
        ('processo',
         'data',
         'despacho'): re.compile(r'No\.([\d]{9})  ([\d]{2}/[\d]{2}/[\d]{4})  (.*)'),
        ('titular',): re.compile(r'Tit\.(.*)'),
        ('procurador',): re.compile(r'Procurador: (.*)'),
        ('documento',): re.compile(r'C\.N\.P\.J\./C\.I\.C\./N INPI :(.*)'),
        ('apresentacao',
         'natureza'): re.compile(r'Apres\.: (.*) ; Nat\.: (.*)'),
        ('marca',): re.compile(r'Marca: (.*)'),
        ('classe',): re.compile(r'Clas\.Prod/Serv: (.*)'),
        ('complemento',): re.compile(r'\*(.*)'),
    }

    def __init__(self):
        """
        'complemento' is the only field that can be multiple in a single registry
        """
        self.complemento = []

    def read(self, line):
        """
        Store the data found in *line*, if it matches a known pattern.

        Lines that match no pattern are ignored.  The patterns start with
        distinct literal prefixes, so at most one can match and the search
        stops at the first hit.
        """
        # .items() rather than .iteritems(): the latter exists only on
        # Python 2 dicts.
        for attrs, pattern in Despacho.regexp.items():
            m = pattern.match(line)
            if not m:
                continue
            # Each pattern has exactly one group per attribute name.
            for attr, value in zip(attrs, m.groups()):
                if attr == 'complemento':
                    # special case: may occur several times per record
                    self.complemento.append(value)
                else:
                    setattr(self, attr, value)
            return

    def __repr__(self):
        """Return a pretty-printed dict of every known field (None if unset)."""
        d = {}
        for attrs in self.regexp:
            for attr in attrs:
                d[attr] = getattr(self, attr, None)
        return pprint.pformat(d)

def process(rpi):
    """
    Yield one Despacho per group found in *rpi*.

    *rpi* is any iterable of lines (an open file works) and is iterated
    directly.  A group starts at a line beginning with 'No.' and ends at
    the next blank line, at the next 'No.' line, or at the end of the
    input, so the last group is yielded even without a trailing blank
    line.  Lines outside any group are ignored.
    """
    d = None  # record being filled, or None while between groups

    for line in rpi:
        if line.startswith('No.'):
            if d is not None:
                # The previous group was not closed by a blank line.
                yield d
            d = Despacho()
        elif not line.strip():  # empty line - end of block
            if d is not None:
                yield d
                d = None
            continue

        if d is not None:
            d.read(line)

    if d is not None:
        yield d

def main():
    """
    Parse 'rm1972.txt' and print every record followed by a separator.

    Returns 0, which the entry-point guard passes to sys.exit as the
    process exit status.
    """
    # The 'with' block closes the file even if parsing raises.
    with open('rm1972.txt') as arquivo:  # file to process
        for desp in process(arquivo):
            # Parenthesised form behaves the same on Python 2 and 3.
            print(desp)
            print('-' * 20)
    return 0


if __name__ == '__main__':
    sys.exit(main())

Altri suggerimenti

Sarebbe più semplice aiutarti se avessi un problema specifico. Le prestazioni dipenderanno notevolmente dall'efficienza del particolare motore regex che stai utilizzando. 100K righe in un singolo file non sembrano così grandi, ma di nuovo tutto dipende dal tuo ambiente.

Uso Expresso nel mio sviluppo .NET per verificare la correttezza e le prestazioni delle espressioni regolari. Una ricerca su Google ha rivelato Kodos, uno strumento con interfaccia grafica per scrivere regex in Python.

Nel complesso sembra buono, ma perché hai la riga:

rpi = (line for line in rpi)

Puoi già iterare sull'oggetto file senza questo passaggio intermedio.

Non userei regex qui. Se sai che le tue righe inizieranno con stringhe fisse, perché non controllare quelle stringhe e scrivere la logica attorno ad esse?

for line in open(file):
    if line[0:3]=='No.':
        currIndex='No'
        map['No']=line[4:]
   ....
   ...
   else if line.strip()=='':
       //store the record in the map and clear the map
   else:
      //append line to the last index in map.. this is when the record overflows to the next line.
      Map[currIndex]=Map[currIndex]+"\n"+line

Considera il codice sopra solo come pseudocodice.

Un'altra versione con una sola espressione regolare combinata:

#!/usr/bin/python

import re
import pprint
import sys

class Despacho(object):
    """
    One record of the file, filled in one line at a time.

    A single combined regular expression recognises every kind of line;
    the named groups that captured something become attributes of the
    instance.
    """
    # One alternative per kind of line, one named group per field.
    # Literal dots are escaped so that, for example, 'Tit.' no longer
    # matches 'Title'.
    regexp = re.compile(
        r'No\.(?P<processo>[\d]{9})  (?P<data>[\d]{2}/[\d]{2}/[\d]{4})  (?P<despacho>.*)'
        r'|Tit\.(?P<titular>.*)'
        r'|Procurador: (?P<procurador>.*)'
        r'|C\.N\.P\.J\./C\.I\.C\./N INPI :(?P<documento>.*)'
        r'|Apres\.: (?P<apresentacao>.*) ; Nat\.: (?P<natureza>.*)'
        r'|Marca: (?P<marca>.*)'
        r'|Clas\.Prod/Serv: (?P<classe>.*)'
        r'|\*(?P<complemento>.*)')

    # Fields that hold a single value; all start out as None.
    simplefields = ('processo', 'data', 'despacho', 'titular', 'procurador',
                    'documento', 'apresentacao', 'natureza', 'marca', 'classe')

    def __init__(self):
        """
        'complemento' is the only field that can be multiple in a single
        registry
        """
        self.__dict__ = dict.fromkeys(self.simplefields)
        self.complemento = []

    def parse(self, line):
        """
        Store the data found in *line*, if it matches the combined pattern.

        Lines that do not match are ignored, and so are groups that
        captured nothing or an empty string.
        """
        m = self.regexp.match(line)
        if not m:
            return
        # Only the groups of the alternative that matched are non-None.
        gd = dict((k, v) for k, v in m.groupdict().items() if v)
        if 'complemento' in gd:
            self.complemento.append(gd['complemento'])
        else:
            self.__dict__.update(gd)

    def __repr__(self):
        """Return the instance attributes as a pretty-printed dict."""
        return pprint.pformat(self.__dict__)

def process(rpi):
    """
    Yield one Despacho per group found in *rpi*.

    *rpi* is any iterable of lines (an open file works).  A group starts
    at a line beginning with 'No.' and runs until the next such line or
    the end of the input.  Lines before the first 'No.' line are ignored,
    and nothing is yielded when the input contains no group at all.
    """
    d = None  # record being filled; None until the first header is seen

    for line in rpi:
        if line.startswith('No.'):
            if d is not None:
                yield d
            d = Despacho()
        if d is not None:
            d.parse(line)

    if d is not None:
        yield d

def main():
    """Parse 'rm1972.txt' and print every record followed by a separator."""
    # open() replaces the Python-2-only file() builtin; the 'with' block
    # closes the file even if parsing raises.
    with open('rm1972.txt') as arquivo:  # file to process
        for desp in process(arquivo):
            # Parenthesised form behaves the same on Python 2 and 3.
            print(desp)
            print('-' * 20)


if __name__ == '__main__':
    main()

Licenza: CC-BY-SA con attribuzione

Non affiliato a StackOverflow