تحويل قائمة من الكائنات إلى قائمة من الأعداد الصحيحة وجدول بحث

https://stackoverflow.com/questions/1401721

05-07-2019
|

سؤال

لتوضيح ما أعنيه، إليك مثالاً:

messages = [
  ('Ricky',  'Steve',  'SMS'),
  ('Steve',  'Karl',   'SMS'),
  ('Karl',   'Nora',   'Email')
]

أريد تحويل هذه القائمة، مع تعريف للمجموعات، إلى قائمة من الأعداد الصحيحة وقاموس بحث، بحيث يحصل كل عنصر في المجموعة على معرّف فريد. ويجب أن يشير هذا المعرّف إلى العنصر في جدول البحث على النحو التالي:

messages_int, lookup_table = create_lookup_list(
              messages, ('person', 'person', 'medium'))

print messages_int
[ (0, 1, 0),
  (1, 2, 0),
  (2, 3, 1) ]

print lookup_table
{ 'person': ['Ricky', 'Steve', 'Karl', 'Nora'],
  'medium': ['SMS', 'Email']
}

وأنا أتساءل عما إذا كان هناك حل أنيق وpythonic لهذه المشكلة.

وأنا أيضًا منفتح على تسميات أفضل من create_lookup_list وما إلى ذلك.

الحل

يُعدّ defaultdict مع التابع itertools.count().next وسيلة جيدة لتعيين معرّفات للعناصر الفريدة. وإليك مثالاً على كيفية تطبيق ذلك في حالتك:

from itertools import count
from collections import defaultdict

def create_lookup_list(data, domains):
    """Encode each row of *data* as a tuple of integer ids, per domain.

    Args:
        data: iterable of rows; each row is a sequence of hashable values.
        domains: sequence of domain names, positionally aligned with the
            columns of every row.  Columns that share a domain name share
            one id space.

    Returns:
        A pair ``(out, lookup_table)``.  ``out`` is a list of tuples of ids;
        ``lookup_table`` maps each domain name that was actually seen to the
        list of its values, ordered so that ``lookup_table[dom][id]`` is the
        value that was assigned ``id``.
    """
    def new_id_map():
        # One counter per domain, so ids restart at 0 for every domain.
        # The builtin next() is used instead of the Python-2-only
        # ``count().next`` bound method, which does not exist on Python 3.
        counter = count()
        return defaultdict(lambda: next(counter))

    domain_keys = defaultdict(new_id_map)
    out = [
        # Looking a value up in the inner defaultdict assigns the next
        # free id the first time that value is seen.
        tuple(domain_keys[dom][val] for val, dom in zip(row, domains))
        for row in data
    ]
    # Sorting the values by their id turns each value->id map into an
    # id-indexed list.
    lookup_table = dict((k, sorted(d, key=d.get)) for k, d in domain_keys.items())
    return out, lookup_table

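تعديل: لاحظ أن count().next يصبح count().__next__ في بيثون 3. أما الصيغة lambda: next(count()) فلا تصلح بديلاً، لأنها تنشئ عدّادًا جديدًا عند كل استدعاء فتُرجع 0 دائمًا؛ استخدم بدلاً منها functools.partial(next, count()).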

نصائح أخرى

وحلّي تقريبًا بنفس الطول والتعقيد:

import collections

def create_lookup_list(messages, labels):
    """Replace every value in *messages* by its index in a per-label table.

    Ids are handed out in first-seen order, so the result is reproducible
    from run to run (iterating a ``set`` of strings is not), and each id is
    found with a dict lookup instead of a linear ``list.index`` scan.

    Args:
        messages: iterable of rows; each row is a sequence of hashable
            values.  It is traversed only once.
        labels: sequence of label names, positionally aligned with the
            columns of every row.  Columns with the same label share ids.

    Returns:
        A pair ``(lookup_list, lookup)``.  ``lookup_list`` is a list of
        lists of ids; ``lookup`` is a plain dict mapping each label that
        was seen to the list of its distinct values, such that
        ``lookup[label][id]`` is the original value.
    """
    ids_by_label = {}   # label -> {value: id}, for O(1) id lookup
    lookup = {}         # label -> [value, ...], indexed by id
    lookup_list = []

    for msg in messages:
        encoded = []
        for l, v in zip(labels, msg):
            ids = ids_by_label.setdefault(l, {})
            if v not in ids:
                # First occurrence: the next free id equals the number of
                # values already registered under this label.
                ids[v] = len(ids)
                lookup.setdefault(l, []).append(v)
            encoded.append(ids[v])
        lookup_list.append(encoded)

    return lookup_list, lookup

في إجابة أوتو (أو إجابة أي شخص آخر يستخدم قواميس string->id)، كنت سأستبدل (إن كان الهوس بالسرعة من اهتماماتك):

# create the lookup table
# NOTE: this is the excerpt being replaced; `indices` is not defined in this
# snippet -- in the function it is quoted from it maps
# group -> {element: id}.
# NOTE(review): the second positional argument to sorted() is a Python 2
# comparison function; Python 3's sorted() only accepts key=/reverse= as
# keyword arguments, so this call is Python 2 only.
lookup_dict = {}
for group in indices:
    lookup_dict[group] = sorted(indices[group].keys(),
            lambda e1, e2: indices[group][e1]-indices[group][e2])

بما يلي:

# k2i must map keys to the consecutive ints 0 .. len(k2i)-1, each used once
def inverse_indices(k2i):
    """Invert a key -> index mapping into an index-ordered list of keys.

    Args:
        k2i: dict whose values are exactly the integers
            ``0 .. len(k2i) - 1``, each appearing once.

    Returns:
        A list ``inv`` of length ``len(k2i)`` with ``inv[i] == k`` for
        every ``k: i`` pair in *k2i*.

    Raises:
        IndexError: if an index lies outside ``-len(k2i) .. len(k2i) - 1``.
    """
    inv = [0] * len(k2i)
    # .items() exists on both Python 2 and 3; .iteritems() is Python 2 only.
    for k, i in k2i.items():
        inv[i] = k
    return inv

# Build group -> id-ordered list of elements by inverting each group's
# element -> id mapping.  .items() replaces the Python-2-only .iteritems().
lookup_table = dict((g, inverse_indices(gi)) for g, gi in indices.items())

وهذا أفضل لأن الإسناد المباشر إلى كل عنصر في المصفوفة العكسية أسرع من الفرز.

وهذا هو حلّي - وأشك في أنه الأفضل

def create_lookup_list(input_list, groups):
    """Encode the rows of *input_list* as lists of per-group integer ids.

    Args:
        input_list: iterable of rows; each row is a sequence of hashable
            elements.
        groups: sequence of group names, positionally aligned with the
            columns of every row.  Columns in the same group share ids.

    Returns:
        A pair ``(output, lookup_dict)``.  ``output`` is a list of lists of
        ids, assigned in first-seen order starting at 0 within each group.
        ``lookup_dict`` maps every name in *groups* (even one with no
        elements) to the list of its elements ordered by id.
    """
    # use a dictionary for the indices so that the index lookup
    # is fast (not necessarily a requirement)
    indices = dict((group, {}) for group in groups)
    output = []

    # assign indices by iterating through the list
    for row in input_list:
        newrow = []
        for group, element in zip(groups, row):
            group_indices = indices[group]
            # setdefault stores len(group_indices) -- the next free id --
            # only when the element has not been seen before.
            newrow.append(group_indices.setdefault(element, len(group_indices)))
        output.append(newrow)

    # create the lookup table: order each group's elements by their id.
    # key= replaces the positional comparison function, which sorted()
    # no longer accepts on Python 3.
    lookup_dict = dict(
        (group, sorted(group_indices, key=group_indices.get))
        for group, group_indices in indices.items()
    )

    return output, lookup_dict

وهذا هو أبسط قليلا، وأكثر مباشرة.

from collections import defaultdict

def create_lookup_list( messages, schema ):
    """Translate each row of *messages* into a list of integer codes.

    *schema* names the lookup column for every position in a row; positions
    sharing a name share one code table.  A value's code is its position in
    that column's table, and unseen values are appended to the table.

    Returns ``(rows, lookups)``: a list of code lists, and a plain dict
    mapping each column name that was encountered to its list of values.
    """
    lookups = defaultdict(list)

    def encode(row):
        codes = []
        for col, value in zip(schema, row):
            known = lookups[col]
            try:
                code = known.index(value)
            except ValueError:
                # Not seen yet: it will sit at the end of the table.
                code = len(known)
                known.append(value)
            codes.append(code)
        return codes

    encoded_rows = [encode(row) for row in messages]
    return encoded_rows, dict(lookups)

إذا كانت جداول البحث قواميس فعلية، لا قوائم، فيمكن تبسيط هذا أكثر.
اجعل «جدول البحث» لديك بالبنية التالية:

{ 'person': {'Ricky':0, 'Steve':1, 'Karl':2, 'Nora':3},
  'medium': {'SMS':0, 'Email':1}
}

ويمكن أن يشهد مزيدا من الانخفاض في التعقيد.

ويمكنك تحويل نسخة العمل هذه من جداول البحث إلى معكوسها كما يلي:

>>> lookups = { 'person': {'Ricky':0, 'Steve':1, 'Karl':2, 'Nora':3},
      'medium': {'SMS':0, 'Email':1}
    }
>>> dict( ( d, dict( (v,k) for k,v in lookups[d].items() ) ) for d in lookups )
{'person': {0: 'Ricky', 1: 'Steve', 2: 'Karl', 3: 'Nora'}, 'medium': {0: 'SMS', 1: 'Email'}}

وهذا هو حلّي، وهو ليس أفضل - إنه مختلف فحسب :)

def create_lookup_list(data, keys):
    """Encode every record in *data* as a tuple of per-key integer ids.

    *keys* names the table used for each position of a record.  Every key
    gets a table (an initially empty list) up front; a value's id is its
    position in that list, and new values are appended as they appear.

    Returns ``(encoded, table)``: the list of id tuples and the dict of
    key -> list of values.
    """
    table = {key: [] for key in keys}

    def encode(record):
        ids = []
        for key, value in zip(keys, record):
            known = table[key]
            if value not in known:
                known.append(value)
            ids.append(known.index(value))
        return tuple(ids)

    encoded = [encode(record) for record in data]
    return encoded, table

وهذا حلّي؛ الدالة الداخلية تتيح لي كتابة صف الفهارس (tuple) كمولّد.

def create_lookup_list( data, format):
    """Encode each row of *data* as a tuple of indices into per-form lists.

    *format* gives, for each position in a row, the name of the list that
    position's items are stored in.  An item's index is its position in
    that list; items not yet present are appended.

    Returns ``(table, indices)`` -- note the order: the dict of
    form -> list of items comes first, then the list of index tuples.
    """
    table = {}

    def get_index( item, form ):
        known = table.setdefault( form, [] )
        try:
            position = known.index( item )
        except ValueError:
            # First occurrence: the item goes at the end of the list.
            position = len( known )
            known.append( item )
        return position

    indices = [
        tuple( get_index( item, form ) for item, form in zip( record, format ) )
        for record in data
    ]
    return table, indices

مرخصة بموجب: CC-BY-SA مع الإسناد

لا تنتمي إلى StackOverflow