Try running this code and see what it reports. It requires Python 2.7 or newer for collections.Counter, but you could easily write your own counting code, or copy my example code from another answer: "Python: List of dict — if a key exists, increment its value; if not, append a new dict".
from collections import Counter


def load_sorted_rows(path):
    """Read a text file and return its stripped lines as a sorted list."""
    with open(path, "rt") as f:
        return sorted(line.strip() for line in f)


def find_duplicates(counts):
    """Summarize duplicate rows from a Counter of rows.

    Returns a tuple ``(dups, dup_count)`` where ``dups`` is a list of
    ``(count, row)`` tuples for rows appearing more than once, sorted
    from most- to least-duplicated, and ``dup_count`` is the number of
    surplus copies (i.e. how many rows de-duplication should remove).
    """
    dups = sorted(
        ((count, row) for row, count in counts.items() if count > 1),
        reverse=True,
    )
    dup_count = sum(count - 1 for count in counts.values() if count > 1)
    return dups, dup_count


def report_differences(unique_rows, de_duped):
    """Print which rows differ between the computed uniques and the de-duped file."""
    uniques_set = set(unique_rows)
    deduped_set = set(de_duped)
    # rows present in both lists
    common = uniques_set.intersection(deduped_set)
    if common != uniques_set:
        print("Rows in original that are not in deduped:\n{}".format(
            sorted(uniques_set - common)))
    if common != deduped_set:
        print("Rows in deduped that are not in original:\n{}".format(
            sorted(deduped_set - common)))


def main():
    """Compare the original account list against a de-duplicated copy."""
    # read in original records and count how many times each row appears
    rows = load_sorted_rows("account_names.csv")
    counts = Counter(rows)
    dups, dup_count = find_duplicates(counts)
    # print a report showing how many dups, largest count first
    for count, row in dups:
        print("{}\t{}".format(count, row))
    # get de-duped list from the counts, and read in the externally de-duped file
    unique_rows = sorted(counts)
    de_duped = load_sorted_rows("account_de_ex.csv")
    print("List lengths: rows {}, uniques {}/de_duped {}, result {}".format(
        len(rows), len(unique_rows), len(de_duped), len(de_duped) + dup_count))
    # lists should match exactly since both are sorted
    if unique_rows == de_duped:
        print("perfect match!")
    else:
        # if lists don't match, find out what is going on
        report_differences(unique_rows, de_duped)


if __name__ == "__main__":
    main()