Chemistry Toolkit Rosetta Wiki
(problem description)
 
(23 intermediate revisions by 12 users not shown)
Line 1: Line 1:
  +
This task uses the toolkit appropriate way of computing molecular similarities to find the 10 most similar structures in a data set.
Given a query compounds (SMILES ==, and corresponding SD structure ==), search the benzodiazepine data set and find the 10 compounds which are the most similar, using the toolkits preferred definition of similar. The output is a sorted list of structures, one per line, with the structure ID followed by a space followed by the similarity value as a float between 0.0 and 1.0.
 
   
  +
A common task in cheminformatics is to find target structures in a data set which are similar to a query structure. The word "similar" is ill-defined. What we have instead are well-defined measures which are hopefully well correlated to what a chemist would call similar. One common measure is to generate a hash fingerprint then use the Tanimoto similarity as a proxy for molecular similarity. Other solutions work directly on the lexical token of the SMILES string, or on molecular conformations, or on the electrostatic surface.
The query structure is compound ### from the benzodiazepine data set, which means it must be found with a similarity of 1.0.
 
  +
  +
==Implementation==
  +
  +
The data will come from the [http://dalkescientific.com/writings/benzodiazepine.sdf.gz gzip'ed SD file] of the [[benzodiazepine]] data set. Use the first structure as the query structure, and use the rest of the file as the targets to find the 10 most similar structures. The output is sorted by similarity, from most similar to least. Each target match is on its own line, and the line contains the similarity score in the first column in the range 0.00 to 1.00 (preferably to 2 decimal places), then a space, then the target ID, which is the title line from the SD file.
  +
  +
==Indigo/C++==
  +
'Title' line is not stored in Indigo's Molecule class, that is because the molecule numbers in this example are taken from SDF field 'pubchem_compound_cid'.
  +
{|border="0" width="100%"
  +
|
  +
<source lang="cpp">
  +
#include "base_cpp/scanner.h"
  +
#include "molecule/molecule.h"
  +
#include "molecule/sdf_loader.h"
  +
#include "molecule/molfile_loader.h"
  +
#include "molecule/molecule_arom.h"
  +
#include "molecule/molecule_fingerprint.h"
  +
#include "base_c/bitarray.h"
  +
  +
typedef struct
  +
{
  +
int idx;
  +
float sim;
  +
} IdxSim;
  +
  +
static int _compareSim (const IdxSim &a, const IdxSim &b, const void *context)
  +
{
  +
return a.sim < b.sim ? 1 : -1;
  +
}
  +
  +
int main (void)
  +
{
  +
StringPool ids;
  +
Array<IdxSim> arr;
  +
  +
MoleculeFingerprintParameters parameters;
  +
  +
memset(&parameters, 0, sizeof(parameters));
  +
parameters.sim_qwords = 8; // 64 bytes
  +
  +
int fpsize = parameters.fingerprintSizeSim();
  +
Array<byte> queryfp;
  +
  +
try
  +
{
  +
FileScanner scanner("benzodiazepine.sdf");
  +
const char *idfield = "pubchem_compound_cid";
  +
SdfLoader loader(scanner);
  +
  +
loader.initProperties(idfield);
  +
int cnt = 0;
  +
  +
while (!loader.isEOF())
  +
{
  +
loader.readNext();
  +
  +
BufferScanner molscan(loader.data);
  +
MolfileLoader molload(molscan);
  +
Molecule mol;
  +
  +
molload.loadMolecule(mol, false);
  +
mol.calcImplicitHydrogens(true);
  +
MoleculeAromatizer::aromatizeBonds(mol);
  +
  +
MoleculeFingerprintBuilder builder(mol, parameters);
  +
  +
builder.process();
  +
  +
if (cnt++ == 0)
  +
{
  +
queryfp.copy(builder.getSim(), fpsize);
  +
continue;
  +
}
  +
  +
int ones1 = bitGetOnesCount(builder.getSim(), fpsize);
  +
int ones2 = bitGetOnesCount(queryfp.ptr(), fpsize);
  +
int common_ones = bitCommonOnes(builder.getSim(), queryfp.ptr(), fpsize);
  +
  +
float similarity = 0.f;
  +
  +
if (common_ones > 0)
  +
similarity = (float)common_ones / (ones1 + ones2 - common_ones);
  +
  +
IdxSim &elem = arr.push();
  +
  +
elem.idx = ids.add(loader.properties.at(idfield).ptr());
  +
elem.sim = similarity;
  +
}
  +
  +
arr.qsort(_compareSim, 0);
  +
  +
for (int i = 0; i < __min(arr.size(), 10); i++)
  +
printf("%f %s\n", arr[i].sim, ids.at(arr[i].idx));
  +
}
  +
catch (Exception &e)
  +
{
  +
fprintf(stderr, "error: %s\n", e.message());
  +
return -1;
  +
}
  +
  +
return 0;
  +
}</source>
  +
|
  +
<pre>
  +
1.000000 19758676
  +
1.000000 450830
  +
1.000000 11033472
  +
1.000000 44353759
  +
1.000000 3016
  +
1.000000 20560526
  +
1.000000 19002495
  +
1.000000 19892573
  +
1.000000 450820
  +
1.000000 6452650
  +
</pre>
  +
|}
  +
  +
Instructions:
  +
#Unpack 'graph' and 'molecule' projects into some folder
  +
#Create 'utils' folder nearby
  +
#Paste the above code into utils/nearest_neighbors.cpp file
  +
#Compile the file using the following commands:<br />$ cd graph; make CONF=Release32; cd ..<br />$ cd molecule; make CONF=Release32; cd ..<br />$ cd utils<br />$ gcc nearest_neighbors.cpp -o nearest_neighbors -O3 -m32 -I.. -I../common ../molecule/dist/Release32/GNU-Linux-x86/libmolecule.a ../graph/dist/Release32/GNU-Linux-x86/libgraph.a -lpthread -lstdc++
  +
#Run the program like that:<br />$ ./nearest_neighbors
  +
  +
==OpenBabel/Rubabel==
  +
<source lang="ruby">
  +
require 'rubabel'
  +
prev = nil
  +
comp = []
  +
Rubabel.foreach("benzodiazepine.sdf.gz").map do |mol|
  +
prev = mol if prev.nil?
  +
comp << [prev.tanimoto(mol), mol.title]
  +
end
  +
puts comp.sort.reverse[1,11].map {|v| "#{v[0].round(3)} #{v[1]}" }
  +
</source>
  +
  +
==OpenBabel/Pybel==
  +
  +
{|border="0" width="100%"
  +
|
  +
<source lang="python">
  +
# This solution uses OpenBabel's default path-based fingerprint (FP2)
  +
  +
import pybel
  +
  +
inputfile = pybel.readfile("sdf", "benzodiazepine.sdf")
  +
query = inputfile.next()
  +
queryfp = query.calcfp()
  +
  +
scores = sorted((-(mol.calcfp() | queryfp), mol.title) for mol in inputfile)
  +
print "\n".join("%.2f %s" % (-sim, title) for sim, title in scores[:10])
  +
</source>
  +
|
  +
<pre>
  +
1.00 450820
  +
1.00 623918
  +
0.99 20351792
  +
0.99 9862446
  +
0.98 398657
  +
0.98 398658
  +
0.98 11033472
  +
0.98 19002495
  +
0.98 19758676
  +
0.98 20512505
  +
</pre>
  +
|}
  +
  +
==OpenEye/Python==
  +
{|border="0" width="100%"
  +
|
  +
<source lang="python">
  +
# This solution uses OpenEye's default path-based fingerprints
  +
  +
from openeye.oechem import *
  +
from openeye.oegraphsim import *
  +
  +
ifs = oemolistream()
  +
ifs.open("benzodiazepine.sdf.gz")
  +
mol_iter = ifs.GetOEGraphMols()
  +
  +
# Get the first structure and compute its fingerprint
  +
mol = next(mol_iter)
  +
query_fp = OEFingerPrint()
  +
OEMakePathFP(query_fp, mol)
  +
  +
# Will put each (similarity score, title) into this list.
  +
scores = []
  +
  +
# Create this once and reuse
  +
target_fp = OEFingerPrint()
  +
  +
for mol in mol_iter:
  +
OEMakePathFP(target_fp, mol)
  +
similarity = OETanimoto(query_fp, target_fp)
  +
scores.append( (similarity, mol.GetTitle()) )
  +
  +
# Use a special sort key, to order so most similar
  +
# comes first. Use the title to break ties.
  +
scores.sort(key=lambda (similarity, title): (-similarity, title))
  +
  +
for similarity, title in scores[:10]:
  +
# Report the similaity to two decimal places
  +
print "%.2f" % similarity, title
  +
  +
</source>
  +
|
  +
Here is the output
  +
  +
<pre>
  +
1.00 10913700
  +
1.00 14221920
  +
1.00 14221922
  +
1.00 450820
  +
1.00 9904514
  +
0.92 20528394
  +
0.92 3085818
  +
0.91 19002495
  +
0.91 19758676
  +
0.91 3016
  +
</pre>
  +
|}
  +
===Different version===
  +
{|border="0" width="100%"
  +
|
  +
<source lang='python'>
  +
from openeye.oechem import *
  +
from openeye.oegraphsim import *
  +
  +
def main():
  +
OEThrow.SetLevel(OEErrorLevel_Error)
  +
  +
ifs = oemolistream()
  +
ifs.open('benzodiazepine.sdf.gz')
  +
  +
# CREATE FINGERPRINT DATABASE
  +
molecules = []
  +
fpdb = OEFPDatabase(OEFPType_Path)
  +
  +
for mol in ifs.GetOEGraphMols():
  +
fpdb.AddFP(mol)
  +
molecules.append(OEGraphMol(mol))
  +
  +
for score in fpdb.GetSortedScores(molecules[0], 10):
  +
print 'Tanimoto score %.2f for %s' % (
  +
score.GetScore(), molecules[score.GetIdx()].GetTitle())
  +
  +
main()
  +
</source>
  +
|
  +
<pre>
  +
Tanimoto score 1.00 for 9904514
  +
Tanimoto score 1.00 for 14221922
  +
Tanimoto score 1.00 for 10913700
  +
Tanimoto score 1.00 for 1688
  +
Tanimoto score 1.00 for 14221920
  +
Tanimoto score 1.00 for 450820
  +
Tanimoto score 0.92 for 20528394
  +
Tanimoto score 0.92 for 3085818
  +
Tanimoto score 0.91 for 19758676
  +
Tanimoto score 0.91 for 450830
  +
</pre>
  +
|}
  +
  +
==RDKit/Python==
  +
  +
{|border="0" width="100%"
  +
|
  +
<source lang="python">
  +
from rdkit import Chem
  +
from rdkit import DataStructs
  +
  +
# the RDKit has a builtin datastructure for "Top N" type tasks:
  +
from rdkit.DataStructs.TopNContainer import TopNContainer
  +
top10 = TopNContainer(10)
  +
  +
refFp = None
  +
for mol in Chem.SDMolSupplier('benzodiazepine.sdf'):
  +
if not mol: continue
  +
fp = Chem.RDKFingerprint(mol)
  +
if refFp is None:
  +
refFp = fp
  +
else:
  +
tani = DataStructs.TanimotoSimilarity(refFp,fp)
  +
top10.Insert((tani,mol.GetProp('_Name')))
  +
pts = top10.GetPts()
  +
pts.reverse()
  +
for sim,nm in pts:
  +
print '% .2f %s'%(sim,nm)
  +
</source>
  +
|
  +
<pre>
  +
1.00 450820
  +
0.99 6452650
  +
0.99 450830
  +
0.99 44353759
  +
0.99 3016
  +
0.99 20560526
  +
0.99 19892573
  +
0.99 19758676
  +
0.99 19002495
  +
0.98 19002643
  +
</pre>
  +
|}
  +
  +
==Cactvs/Tcl==
  +
  +
<pre lang="tcl">
  +
prop setparam E_SCREEN extended 2
  +
set fh [molfile open benzodiazepine.sdf.gz r hydrogens add]
  +
set ehcmp [molfile read $fh]
  +
set th [molfile scan $fh "structure ~>= $ehcmp 0" {table score E_NAME}]
  +
table sort $th {scores descending} {E_NAME ascending}
  +
table loop $th row 10 {
  +
puts [format "%.2f %s" [expr [lindex $row 0]/100.0] [lindex $row 1]]
  +
}
  +
</pre>
  +
  +
This script only stores the score and name in the table object, so this works with almost arbitrarily large files. It can be further optimized by using a value larger than 0 as the Tanimoto similarity threshold in the file scan.
  +
  +
The result is
  +
  +
<pre>
  +
1.00 450820
  +
1.00 9862446
  +
0.99 9928121
  +
0.98 23332591
  +
0.98 44353759
  +
0.98 6452650
  +
0.97 11033472
  +
0.97 3016
  +
0.97 450830
  +
0.97 623112
  +
</pre>
  +
  +
==Cactvs/Python==
  +
  +
In Python, this is even terser:
  +
  +
<pre lang="python">
  +
Prop.Setparam('E_SCREEN',{'extended':2})
  +
f=Molfile('benzodiazepine.sdf.gz','r',{'hydrogens':'add'})
  +
ecmp=f.read()
  +
t=f.scan('structure ~>= {} 0'.format(ecmp),('table','score','E_NAME'))
  +
t.sort(('score','descending'),('E_NAME','ascending'))
  +
t.loop(lambda row:print('{:.2f} {}'.format(row[0]/100.0,row[1])),maxrows=10)
  +
</pre>
  +
  +
The output is the same as for the Tcl version. If ultracompact lambda functions make you nervous, you can also use a more verbose loop construct similar to the Tcl statement.
  +
[[Category:similarity]]
  +
[[Category:nearest neighbors]]
  +
[[Category:OpenEye/Python]]
  +
[[Category:OpenBabel/Pybel]]
  +
[[Category:RDKit/Python]]
  +
[[Category:Indigo/C++]]
  +
[[Category:Cactvs/Tcl]]
  +
[[Category:Cactvs/Python]]

Latest revision as of 10:42, 10 October 2013

This task uses the toolkit appropriate way of computing molecular similarities to find the 10 most similar structures in a data set.

A common task in cheminformatics is to find target structures in a data set which are similar to a query structure. The word "similar" is ill-defined. What we have instead are well-defined measures which are hopefully well correlated to what a chemist would call similar. One common measure is to generate a hash fingerprint then use the Tanimoto similarity as a proxy for molecular similarity. Other solutions work directly on the lexical token of the SMILES string, or on molecular conformations, or on the electrostatic surface.

Implementation[]

The data will come from the gzip'ed SD file of the benzodiazepine data set. Use the first structure as the query structure, and use the rest of the file as the targets to find the 10 most similar structures. The output is sorted by similarity, from most similar to least. Each target match is on its own line, and the line contains the similarity score in the first column in the range 0.00 to 1.00 (preferably to 2 decimal places), then a space, then the target ID, which is the title line from the SD file.

Indigo/C++[]

'Title' line is not stored in Indigo's Molecule class, that is because the molecule numbers in this example are taken from SDF field 'pubchem_compound_cid'.

#include "base_cpp/scanner.h"
#include "molecule/molecule.h"
#include "molecule/sdf_loader.h"
#include "molecule/molfile_loader.h"
#include "molecule/molecule_arom.h"
#include "molecule/molecule_fingerprint.h"
#include "base_c/bitarray.h"

typedef struct
{
   int idx;
   float sim;
} IdxSim;

static int _compareSim (const IdxSim &a, const IdxSim &b, const void *context)
{
   return a.sim < b.sim ? 1 : -1;
}

int main (void)
{
   StringPool ids;
   Array<IdxSim> arr;

   MoleculeFingerprintParameters parameters;

   memset(&parameters, 0, sizeof(parameters));
   parameters.sim_qwords = 8; // 64 bytes 

   int fpsize = parameters.fingerprintSizeSim();
   Array<byte> queryfp;

   try
   {
      FileScanner scanner("benzodiazepine.sdf");
      const char *idfield  = "pubchem_compound_cid";
      SdfLoader loader(scanner);

      loader.initProperties(idfield);
      int cnt = 0;

      while (!loader.isEOF())
      {
         loader.readNext();

         BufferScanner molscan(loader.data);
         MolfileLoader molload(molscan);
         Molecule mol;
         
         molload.loadMolecule(mol, false);
         mol.calcImplicitHydrogens(true);
         MoleculeAromatizer::aromatizeBonds(mol);

         MoleculeFingerprintBuilder builder(mol, parameters);
          
         builder.process();

         if (cnt++ == 0)
         {
            queryfp.copy(builder.getSim(), fpsize);
            continue;
         }
 
         int ones1 = bitGetOnesCount(builder.getSim(), fpsize);
         int ones2 = bitGetOnesCount(queryfp.ptr(), fpsize);
         int common_ones = bitCommonOnes(builder.getSim(), queryfp.ptr(), fpsize);
         
         float similarity = 0.f;
         
         if (common_ones > 0)
            similarity = (float)common_ones / (ones1 + ones2 - common_ones);

         IdxSim &elem = arr.push();

         elem.idx = ids.add(loader.properties.at(idfield).ptr());
         elem.sim = similarity;
      }

      arr.qsort(_compareSim, 0);

      for (int i = 0; i < __min(arr.size(), 10); i++)
         printf("%f %s\n", arr[i].sim, ids.at(arr[i].idx));
   }
   catch (Exception &e)
   {
      fprintf(stderr, "error: %s\n", e.message());
      return -1;
   }

   return 0;
}
1.000000 19758676
1.000000 450830
1.000000 11033472
1.000000 44353759
1.000000 3016
1.000000 20560526
1.000000 19002495
1.000000 19892573
1.000000 450820
1.000000 6452650

Instructions:

  1. Unpack 'graph' and 'molecule' projects into some folder
  2. Create 'utils' folder nearby
  3. Paste the above code into utils/nearest_neighbors.cpp file
  4. Compile the file using the following commands:
    $ cd graph; make CONF=Release32; cd ..
    $ cd molecule; make CONF=Release32; cd ..
    $ cd utils
    $ gcc nearest_neighbors.cpp -o nearest_neighbors -O3 -m32 -I.. -I../common ../molecule/dist/Release32/GNU-Linux-x86/libmolecule.a ../graph/dist/Release32/GNU-Linux-x86/libgraph.a -lpthread -lstdc++
  5. Run the program like that:
    $ ./nearest_neighbors

OpenBabel/Rubabel[]

require 'rubabel'
prev = nil
comp = []
Rubabel.foreach("benzodiazepine.sdf.gz").map do |mol|
  prev = mol if prev.nil?
  comp << [prev.tanimoto(mol), mol.title]
end
puts comp.sort.reverse[1,11].map {|v| "#{v[0].round(3)} #{v[1]}" }

OpenBabel/Pybel[]

# This solution uses OpenBabel's default path-based fingerprint (FP2)

import pybel

inputfile = pybel.readfile("sdf", "benzodiazepine.sdf")
query = inputfile.next()
queryfp = query.calcfp()

scores = sorted((-(mol.calcfp() | queryfp), mol.title) for mol in inputfile)
print "\n".join("%.2f %s" % (-sim, title) for sim, title in scores[:10])
1.00 450820
1.00 623918
0.99 20351792
0.99 9862446
0.98 398657
0.98 398658
0.98 11033472
0.98 19002495
0.98 19758676
0.98 20512505

OpenEye/Python[]

# This solution uses OpenEye's default path-based fingerprints

from openeye.oechem import *
from openeye.oegraphsim import *

ifs = oemolistream()
ifs.open("benzodiazepine.sdf.gz")
mol_iter = ifs.GetOEGraphMols()

# Get the first structure and compute its fingerprint
mol = next(mol_iter)
query_fp = OEFingerPrint()
OEMakePathFP(query_fp, mol)

# Will put each (similarity score, title) into this list.
scores = []

# Create this once and reuse
target_fp = OEFingerPrint()

for mol in mol_iter:
    OEMakePathFP(target_fp, mol)
    similarity = OETanimoto(query_fp, target_fp)
    scores.append( (similarity, mol.GetTitle()) )

# Use a special sort key, to order so most similar
# comes first. Use the title to break ties.
scores.sort(key=lambda (similarity, title): (-similarity, title))

for similarity, title in scores[:10]:
    # Report the similaity to two decimal places
    print "%.2f" % similarity, title

Here is the output

1.00 10913700
1.00 14221920
1.00 14221922
1.00 450820
1.00 9904514
0.92 20528394
0.92 3085818
0.91 19002495
0.91 19758676
0.91 3016

Different version[]

from openeye.oechem import *
from openeye.oegraphsim import *

def main():
    OEThrow.SetLevel(OEErrorLevel_Error)
    
    ifs = oemolistream()
    ifs.open('benzodiazepine.sdf.gz')

    # CREATE FINGERPRINT DATABASE
    molecules = []
    fpdb = OEFPDatabase(OEFPType_Path)
    
    for mol in ifs.GetOEGraphMols():
        fpdb.AddFP(mol)
        molecules.append(OEGraphMol(mol))
    
    for score in fpdb.GetSortedScores(molecules[0], 10):
        print 'Tanimoto score %.2f for %s' % (
                 score.GetScore(), molecules[score.GetIdx()].GetTitle())

main()
Tanimoto score 1.00 for 9904514
Tanimoto score 1.00 for 14221922
Tanimoto score 1.00 for 10913700
Tanimoto score 1.00 for 1688
Tanimoto score 1.00 for 14221920
Tanimoto score 1.00 for 450820
Tanimoto score 0.92 for 20528394
Tanimoto score 0.92 for 3085818
Tanimoto score 0.91 for 19758676
Tanimoto score 0.91 for 450830

RDKit/Python[]

from rdkit import Chem
from rdkit import DataStructs

# the RDKit has a builtin datastructure for "Top N" type tasks:
from rdkit.DataStructs.TopNContainer import TopNContainer
top10 = TopNContainer(10)

refFp = None
for mol in Chem.SDMolSupplier('benzodiazepine.sdf'):
    if not mol: continue
    fp = Chem.RDKFingerprint(mol)    
    if refFp is None:
        refFp = fp
    else:
        tani = DataStructs.TanimotoSimilarity(refFp,fp)
        top10.Insert((tani,mol.GetProp('_Name')))
pts = top10.GetPts()
pts.reverse()
for sim,nm in pts:
    print '% .2f %s'%(sim,nm)
1.00 450820
0.99 6452650
0.99 450830
0.99 44353759
0.99 3016
0.99 20560526
0.99 19892573
0.99 19758676
0.99 19002495
0.98 19002643

Cactvs/Tcl[]

prop setparam E_SCREEN extended 2
set fh [molfile open benzodiazepine.sdf.gz r hydrogens add]
set ehcmp [molfile read $fh]
set th [molfile scan $fh "structure ~>= $ehcmp 0" {table score E_NAME}]
table sort $th {scores descending} {E_NAME ascending}
table loop $th row 10 {
        puts [format "%.2f %s" [expr [lindex $row 0]/100.0] [lindex $row 1]]
}

This script only stores the score and name in the table object, so this works with almost arbitrarily large files. It can be further optimized by using a value larger than 0 as the Tanimoto similarity threshold in the file scan.

The result is

1.00 450820
1.00 9862446
0.99 9928121
0.98 23332591
0.98 44353759
0.98 6452650
0.97 11033472
0.97 3016
0.97 450830
0.97 623112

Cactvs/Python[]

In Python, this is even terser:

Prop.Setparam('E_SCREEN',{'extended':2})
f=Molfile('benzodiazepine.sdf.gz','r',{'hydrogens':'add'})
ecmp=f.read()
t=f.scan('structure ~>= {} 0'.format(ecmp),('table','score','E_NAME'))
t.sort(('score','descending'),('E_NAME','ascending'))
t.loop(lambda row:print('{:.2f} {}'.format(row[0]/100.0,row[1])),maxrows=10)

The output is the same as for the Tcl version. If ultracompact lambda functions make you nervous, you can also use a more verbose loop construct similar to the Tcl statement.