"""
SAMPLE OUTPUT
-------------
In-Cell Duplicate: Row #167, Col: CV, Val: D04C
In-Cell Duplicate: Row #988, Col: PR, Val: P190
Duplicate in Other Row #1384: Row #176, Col: CV, Val: D20C
Duplicate in Other Row #571: Row #177, Col: CV, Val: T910
1. Cells contain lists (separated with whitespace)
2. Find duplicates within each cell for a series of columns.
3. Find duplicates between the list in the cell and all other rows.
(concatenated lists from a series of columns)
"""
import csv
from collections import Counter
from toolz.curried import *
# CSV file that is scanned for duplicated values.
FILENAME = 'cv_fam.csv'

# Slice bounds applied to the parsed CSV rows; a stop of None means
# "through the last row".
START_ROW, STOP_ROW = 2, None

# Half-open range of column indexes whose cells are checked.
START_COL, STOP_COL = 2, 6

# Labels used in the report, one per checked column, in column order.
COLUMN_NAMES = (
    'CV',
    'PR',
    'FL',
    'CR',
)

# Common tail of every report line: row number, column label, value.
DUPLICATE_MESG = ': Row #{}, Col: {}, Val: {}'

# Full report templates built on top of the common tail.
DUPLICATE_IN_CELL_MESG = (
    'In-Cell Duplicate'
    + DUPLICATE_MESG
)
DUPLICATE_IN_OTHER_ROW_MESG = (
    'Duplicate in Other Row #{}'
    + DUPLICATE_MESG
)
def duplicates(seq1, seq2=None):
    """Return the duplicated values of one sequence, or those shared by two.

    Args:
        seq1: Iterable of hashable values.
        seq2: Optional second iterable of hashable values.

    Returns:
        With only ``seq1``: a generator over the values that occur more
        than once in ``seq1``, each yielded once.
        With ``seq2``: a set of the values present in both sequences
        (iteration order of the set is unspecified).
    """
    if seq2 is None:
        counts = Counter(seq1)
        return (value for value, count in counts.items() if count > 1)
    # intersection() accepts any iterable, so seq2 needs no set of its own.
    return set(seq1).intersection(seq2)
def duplicates_in_cell(cols):
    """Yield a report line for every value repeated inside a single cell.

    Args:
        cols: Sequence of columns. Each column is a sequence of cells and
            each cell is a list of values. Columns are labelled by
            position using COLUMN_NAMES.

    Yields:
        DUPLICATE_IN_CELL_MESG formatted with the row number, the column
        label and the repeated value.

    Raises:
        IndexError: If there are more columns than COLUMN_NAMES entries.
    """
    for col_num, col in enumerate(cols):
        col_name = COLUMN_NAMES[col_num]
        for row_num, cell in enumerate(col):
            for value in duplicates(cell):
                # Undo the START_ROW slice offset and switch to 1-based
                # numbering so the number matches the row in the CSV file.
                yield DUPLICATE_IN_CELL_MESG.format(
                    row_num + START_ROW + 1, col_name, value)
def duplicates_in_other_rows(rows, cols):
    """Yield a report line for every cell value that also occurs in another row.

    Args:
        rows: Iterable of rows, where each row is the combined list of
            values from all checked cells of that row. Must be in the same
            order as the cells inside each column of ``cols``.
        cols: Sequence of columns. Each column is a sequence of cells and
            each cell is a list of values. Columns are labelled by
            position using COLUMN_NAMES.

    Yields:
        DUPLICATE_IN_OTHER_ROW_MESG formatted with the other row's number,
        the cell's own row number, the column label and the shared value.
        A value shared by two rows is reported once from each row's side.
        The order of several shared values for the same pair of rows is
        unspecified.

    Raises:
        IndexError: If there are more columns than COLUMN_NAMES entries.
    """
    # Build each row's value set once instead of once per cell comparison.
    row_sets = [set(row) for row in rows]
    for col_num, col in enumerate(cols):
        col_name = COLUMN_NAMES[col_num]
        for row_num, cell in enumerate(col):
            cell_values = set(cell)
            if not cell_values:
                # An empty cell cannot share a value with any row.
                continue
            for other_row_num, other_values in enumerate(row_sets):
                if other_row_num == row_num:
                    continue
                for value in cell_values & other_values:
                    # Undo the START_ROW slice offset and switch to 1-based
                    # numbering so the numbers match rows in the CSV file.
                    yield DUPLICATE_IN_OTHER_ROW_MESG.format(
                        other_row_num + START_ROW + 1,
                        row_num + START_ROW + 1,
                        col_name, value)
def main():
    """Read FILENAME and print every duplicate found in the checked columns.

    In-cell duplicates are printed first, followed by values that a cell
    shares with any other row.
    """
    # newline='' lets the csv module do its own newline handling, and the
    # with-block guarantees the file is closed once the rows are read.
    with open(FILENAME, newline='') as csv_file:
        rows = list(csv.reader(csv_file))[START_ROW:STOP_ROW]

    # Column-major view: test_cols[c][r] is the whitespace-separated value
    # list of checked column c in row r.
    test_cols = [[row[col].split() for row in rows]
                 for col in range(START_COL, STOP_COL)]
    # Row-major view: all checked cells of a row merged into one flat list.
    test_rows = [list(concat(cells)) for cells in zip(*test_cols)]

    for duplicate in duplicates_in_cell(test_cols):
        print(duplicate)
    for duplicate in duplicates_in_other_rows(test_rows, test_cols):
        print(duplicate)


if __name__ == '__main__':
    main()