Changes: Find the 10 nearest neighbors in a data set

Latest revision as of 10:42, 10 October 2013

This task uses the toolkit appropriate way of computing molecular similarities to find the 10 most similar structures in a data set.

A common task in cheminformatics is to find target structures in a data set which are similar to a query structure. The word "similar" is ill-defined. What we have instead are well-defined measures which are hopefully well correlated to what a chemist would call similar. One common measure is to generate a hash fingerprint then use the Tanimoto similarity as a proxy for molecular similarity. Other solutions work directly on the lexical token of the SMILES string, or on molecular conformations, or on the electrostatic surface.

Implementation[]

The data will come from the gzip'ed SD file of the benzodiazepine data set. Use the first structure as the query structure, and use the rest of the file as the targets to find the 10 most similar structures. The output is sorted by similarity, from most similar to least. Each target match is on its own line, and the line contains the similarity score in the first column in the range 0.00 to 1.00 (preferably to 2 decimal places), then a space, then the target ID, which is the title line from the SD file.

Indigo/C++[]

'Title' line is not stored in Indigo's Molecule class, that is because the molecule numbers in this example are taken from SDF field 'pubchem_compound_cid'.

#include "base_cpp/scanner.h"
#include "molecule/molecule.h"
#include "molecule/sdf_loader.h"
#include "molecule/molfile_loader.h"
#include "molecule/molecule_arom.h"
#include "molecule/molecule_fingerprint.h"
#include "base_c/bitarray.h"

typedef struct
{
   int idx;
   float sim;
} IdxSim;

static int _compareSim (const IdxSim &a, const IdxSim &b, const void *context)
{
   return a.sim < b.sim ? 1 : -1;
}

int main (void)
{
   StringPool ids;
   Array<IdxSim> arr;

   MoleculeFingerprintParameters parameters;

   memset(&parameters, 0, sizeof(parameters));
   parameters.sim_qwords = 8; // 64 bytes 

   int fpsize = parameters.fingerprintSizeSim();
   Array<byte> queryfp;

   try
   {
      FileScanner scanner("benzodiazepine.sdf");
      const char *idfield  = "pubchem_compound_cid";
      SdfLoader loader(scanner);

      loader.initProperties(idfield);
      int cnt = 0;

      while (!loader.isEOF())
      {
         loader.readNext();

         BufferScanner molscan(loader.data);
         MolfileLoader molload(molscan);
         Molecule mol;
         
         molload.loadMolecule(mol, false);
         mol.calcImplicitHydrogens(true);
         MoleculeAromatizer::aromatizeBonds(mol);

         MoleculeFingerprintBuilder builder(mol, parameters);
          
         builder.process();

         if (cnt++ == 0)
         {
            queryfp.copy(builder.getSim(), fpsize);
            continue;
         }
 
         int ones1 = bitGetOnesCount(builder.getSim(), fpsize);
         int ones2 = bitGetOnesCount(queryfp.ptr(), fpsize);
         int common_ones = bitCommonOnes(builder.getSim(), queryfp.ptr(), fpsize);
         
         float similarity = 0.f;
         
         if (common_ones > 0)
            similarity = (float)common_ones / (ones1 + ones2 - common_ones);

         IdxSim &elem = arr.push();

         elem.idx = ids.add(loader.properties.at(idfield).ptr());
         elem.sim = similarity;
      }

      arr.qsort(_compareSim, 0);

      for (int i = 0; i < __min(arr.size(), 10); i++)
         printf("%f %s\n", arr[i].sim, ids.at(arr[i].idx));
   }
   catch (Exception &e)
   {
      fprintf(stderr, "error: %s\n", e.message());
      return -1;
   }

   return 0;
}

1.000000 19758676
1.000000 450830
1.000000 11033472
1.000000 44353759
1.000000 3016
1.000000 20560526
1.000000 19002495
1.000000 19892573
1.000000 450820
1.000000 6452650

Instructions:

Unpack 'graph' and 'molecule' projects into some folder
Create 'utils' folder nearby
Paste the above code into utils/nearest_neighbors.cpp file
Compile the file using the following commands:
$ cd graph; make CONF=Release32; cd ..
$ cd molecule; make CONF=Release32; cd ..
$ cd utils
$ gcc nearest_neighbors.cpp -o nearest_neighbors -O3 -m32 -I.. -I../common ../molecule/dist/Release32/GNU-Linux-x86/libmolecule.a ../graph/dist/Release32/GNU-Linux-x86/libgraph.a -lpthread -lstdc++
Run the program like that:
$ ./nearest_neighbors

OpenBabel/Rubabel[]

require 'rubabel'
prev = nil
comp = []
Rubabel.foreach("benzodiazepine.sdf.gz").map do |mol|
  prev = mol if prev.nil?
  comp << [prev.tanimoto(mol), mol.title]
end
puts comp.sort.reverse[1,11].map {|v| "#{v[0].round(3)} #{v[1]}" }

OpenBabel/Pybel[]

# This solution uses OpenBabel's default path-based fingerprint (FP2)

import pybel

inputfile = pybel.readfile("sdf", "benzodiazepine.sdf")
query = inputfile.next()
queryfp = query.calcfp()

scores = sorted((-(mol.calcfp() | queryfp), mol.title) for mol in inputfile)
print "\n".join("%.2f %s" % (-sim, title) for sim, title in scores[:10])

1.00 450820
1.00 623918
0.99 20351792
0.99 9862446
0.98 398657
0.98 398658
0.98 11033472
0.98 19002495
0.98 19758676
0.98 20512505

OpenEye/Python[]

# This solution uses OpenEye's default path-based fingerprints

from openeye.oechem import *
from openeye.oegraphsim import *

ifs = oemolistream()
ifs.open("benzodiazepine.sdf.gz")
mol_iter = ifs.GetOEGraphMols()

# Get the first structure and compute its fingerprint
mol = next(mol_iter)
query_fp = OEFingerPrint()
OEMakePathFP(query_fp, mol)

# Will put each (similarity score, title) into this list.
scores = []

# Create this once and reuse
target_fp = OEFingerPrint()

for mol in mol_iter:
    OEMakePathFP(target_fp, mol)
    similarity = OETanimoto(query_fp, target_fp)
    scores.append( (similarity, mol.GetTitle()) )

# Use a special sort key, to order so most similar
# comes first. Use the title to break ties.
scores.sort(key=lambda (similarity, title): (-similarity, title))

for similarity, title in scores[:10]:
    # Report the similaity to two decimal places
    print "%.2f" % similarity, title

Here is the output

1.00 10913700
1.00 14221920
1.00 14221922
1.00 450820
1.00 9904514
0.92 20528394
0.92 3085818
0.91 19002495
0.91 19758676
0.91 3016

Different version[]

from openeye.oechem import *
from openeye.oegraphsim import *

def main():
    OEThrow.SetLevel(OEErrorLevel_Error)
    
    ifs = oemolistream()
    ifs.open('benzodiazepine.sdf.gz')

    # CREATE FINGERPRINT DATABASE
    molecules = []
    fpdb = OEFPDatabase(OEFPType_Path)
    
    for mol in ifs.GetOEGraphMols():
        fpdb.AddFP(mol)
        molecules.append(OEGraphMol(mol))
    
    for score in fpdb.GetSortedScores(molecules[0], 10):
        print 'Tanimoto score %.2f for %s' % (
                 score.GetScore(), molecules[score.GetIdx()].GetTitle())

main()

Tanimoto score 1.00 for 9904514
Tanimoto score 1.00 for 14221922
Tanimoto score 1.00 for 10913700
Tanimoto score 1.00 for 1688
Tanimoto score 1.00 for 14221920
Tanimoto score 1.00 for 450820
Tanimoto score 0.92 for 20528394
Tanimoto score 0.92 for 3085818
Tanimoto score 0.91 for 19758676
Tanimoto score 0.91 for 450830

RDKit/Python[]

from rdkit import Chem
from rdkit import DataStructs

# the RDKit has a builtin datastructure for "Top N" type tasks:
from rdkit.DataStructs.TopNContainer import TopNContainer
top10 = TopNContainer(10)

refFp = None
for mol in Chem.SDMolSupplier('benzodiazepine.sdf'):
    if not mol: continue
    fp = Chem.RDKFingerprint(mol)    
    if refFp is None:
        refFp = fp
    else:
        tani = DataStructs.TanimotoSimilarity(refFp,fp)
        top10.Insert((tani,mol.GetProp('_Name')))
pts = top10.GetPts()
pts.reverse()
for sim,nm in pts:
    print '% .2f %s'%(sim,nm)

1.00 450820
0.99 6452650
0.99 450830
0.99 44353759
0.99 3016
0.99 20560526
0.99 19892573
0.99 19758676
0.99 19002495
0.98 19002643

Cactvs/Tcl[]

prop setparam E_SCREEN extended 2
set fh [molfile open benzodiazepine.sdf.gz r hydrogens add]
set ehcmp [molfile read $fh]
set th [molfile scan $fh "structure ~>= $ehcmp 0" {table score E_NAME}]
table sort $th {scores descending} {E_NAME ascending}
table loop $th row 10 {
        puts [format "%.2f %s" [expr [lindex $row 0]/100.0] [lindex $row 1]]
}

This script only stores the score and name in the table object, so this works with almost arbitrarily large files. It can be further optimized by using a value larger than 0 as the Tanimoto similarity threshold in the file scan.

The result is

1.00 450820
1.00 9862446
0.99 9928121
0.98 23332591
0.98 44353759
0.98 6452650
0.97 11033472
0.97 3016
0.97 450830
0.97 623112

Cactvs/Python[]

In Python, this is even terser:

Prop.Setparam('E_SCREEN',{'extended':2})
f=Molfile('benzodiazepine.sdf.gz','r',{'hydrogens':'add'})
ecmp=f.read()
t=f.scan('structure ~>= {} 0'.format(ecmp),('table','score','E_NAME'))
t.sort(('score','descending'),('E_NAME','ascending'))
t.loop(lambda row:print('{:.2f} {}'.format(row[0]/100.0,row[1])),maxrows=10)

The output is the same as for the Tcl version. If ultracompact lambda functions make you nervous, you can also use a more verbose loop construct similar to the Tcl statement.