Skip to content

Instantly share code, notes, and snippets.

@indera
Created April 23, 2018 15:28
Show Gist options
  • Save indera/b5a2aaca703d2a9b76bc87829f3f9703 to your computer and use it in GitHub Desktop.
Save indera/b5a2aaca703d2a9b76bc87829f3f9703 to your computer and use it in GitHub Desktop.
"""
Detect groups of rows (patients) that share a common hash value
Example: unpack.csv
PATID, HASH_1, HASH_2
pat_1, a, b
pat_2, x
pat_3, , y
pat_4, a, c
pat_5, d, b
pat_6, e, b
pat_7, e, f
"""
import pandas as pd
import numpy as np
filename = 'unpack.csv'


def load_hashes(path):
    """Read the patient/hash CSV at *path* into a DataFrame.

    Every column is read as a string (``dtype=object``), whitespace that
    follows a delimiter is skipped, and missing cells are replaced by the
    empty string.
    """
    frame = pd.read_csv(path, sep=",", skipinitialspace=True, dtype=object)
    return frame.fillna('')


def count_by_hash(frame, column):
    """Count the rows of *frame* carrying each non-empty value of *column*.

    Returns a DataFrame with two columns, *column* and ``COUNT``, holding
    one row per distinct value, ordered by value.

    Rows whose *column* is the empty string are ignored: an empty cell
    means "no hash available", so such rows must not be reported as
    sharing a common value with each other.
    """
    present = frame[frame[column] != '']
    return present.groupby(column).size().reset_index(name='COUNT')


def shared_values(counts):
    """Return the rows of *counts* whose ``COUNT`` is greater than one."""
    return counts.loc[counts['COUNT'] > 1]


def main(path=filename):
    """Print the input table, the per-hash counts and the shared hashes.

    Only the ``HASH_1`` column is analysed; ``count_by_hash`` accepts any
    other column name if more columns need to be checked.
    """
    df = load_hashes(path)
    print(df)

    counts = count_by_hash(df, 'HASH_1')
    print(counts)
    print("Found: \n{}".format(shared_values(counts)))

    # One line pair per distinct hash: its value, then how many rows use it.
    for name, size in zip(counts['HASH_1'], counts['COUNT']):
        print(name)
        print("Group size: {}".format(size))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment