55
66#import cPickle as pickle
77import pickle
8+ from openeye .oechem import *
89
910def read_database ():
1011 """Read the database from a pickle file and return it"""
@@ -23,3 +24,48 @@ def convert_to_json( database_pickle, database_json):
2324
2425 with open (database_json ,"w" , encoding = 'utf-8' ) as fs :
2526 json .dump (freeSolv ,fs )
27+
def check_for_duplicates(database_contents):
    """Take contents of database and re-generate all SMILES, checking for duplicates.

    Each entry's ``'smiles'`` string is parsed into a fresh OEMol and written
    back out with ``OEMolToSmiles``, thereby standardizing it, so that
    duplicates can be detected by plain string comparison.

    Parameters:
    ----------
    database_contents : dict
        dictionary of FreeSolv database, keyed by compound ID; each value
        must provide a ``'smiles'`` entry

    Returns:
    ----------
    num_dupes : int
        Number of duplicated compound pairs found
    keypairs : list
        List containing tuples of pairs corresponding to the compound IDs of
        the duplicates, ordered as ``(first_seen_id, duplicate_id)``

    Raises:
    ----------
    ValueError
        If the SMILES string of a compound cannot be parsed
    """

    # Maps each standardized SMILES to the first compound ID that produced it.
    # A dict keeps the lookup O(1) and remembers which compound to pair with.
    first_seen = {}
    keypairs = []

    for cid in database_contents:
        raw_smiles = database_contents[cid]['smiles']

        # Generate a new OEMol from the stored SMILES
        mol = OEMol()
        if not OEParseSmiles(mol, raw_smiles):
            # A failed parse must not be standardized: the resulting molecule
            # is not meaningful and would be reported as a bogus duplicate of
            # any other unparsable entry.
            raise ValueError(
                "Could not parse SMILES {!r} for compound {}".format(raw_smiles, cid)
            )

        # Generate new SMILES from the OEMol, thereby standardizing
        smi = OEMolToSmiles(mol)

        if smi in first_seen:
            keypairs.append((first_seen[smi], cid))
        else:
            first_seen[smi] = cid

    return len(keypairs), keypairs
71+
0 commit comments