Skip to content

Commit 76814be

Browse files
committed
Add utility functionality for checking for duplicates
1 parent a9e867d commit 76814be

1 file changed

Lines changed: 46 additions & 0 deletions

File tree

scripts/utils.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
#import cPickle as pickle
77
import pickle
8+
from openeye.oechem import *
89

910
def read_database():
1011
"""Read the database from a pickle file and return it"""
@@ -23,3 +24,48 @@ def convert_to_json( database_pickle, database_json):
2324

2425
with open(database_json,"w", encoding='utf-8') as fs:
2526
json.dump(freeSolv,fs)
27+
28+
def check_for_duplicates( database_contents ):
    """Take contents of database and re-generate all SMILES, checking for duplicates.

    Each entry's SMILES string is parsed into a fresh OEMol and written back
    out with OEMolToSmiles, so that entries are compared on the regenerated
    (standardized) SMILES rather than on the string stored in the database.

    Parameters
    ----------
    database_contents : dict
        dictionary of FreeSolv database, keyed by compound ID; each value
        must provide a 'smiles' entry

    Returns
    -------
    num_dupes : int
        Number of duplicated compound pairs found
    keypairs : list
        List containing tuples of pairs corresponding to the compound IDs of
        the duplicates; each tuple is (first compound ID seen with that
        SMILES, later compound ID with the same SMILES)

    Raises
    ------
    ValueError
        If the SMILES string of a compound cannot be parsed. Without this
        check a malformed entry would be compared on the SMILES of an
        unparsed (or partially parsed) molecule, which can report
        duplicates that are not real.
    """

    # Regenerated SMILES -> compound ID that first produced it. A dict keeps
    # the membership test and the "who was first" lookup O(1) per compound.
    first_seen = {}
    keypairs = []

    for cid in database_contents:
        # Generate a new OEMol from the stored SMILES
        mol = OEMol()
        if not OEParseSmiles(mol, database_contents[cid]['smiles']):
            raise ValueError(
                "Could not parse SMILES %r for compound %r"
                % (database_contents[cid]['smiles'], cid)
            )

        # Generate new SMILES from the OEMol, thereby standardizing
        smi = OEMolToSmiles(mol)

        if smi in first_seen:
            # Pair the duplicate with the first compound that had this SMILES
            keypairs.append( (first_seen[smi], cid) )
        else:
            first_seen[smi] = cid

    return len(keypairs), keypairs
71+

0 commit comments

Comments
 (0)