Run a SMILES file containing multiple structures. Perform a substructure match of a query SMILES structure against each input structure. In every found substructure, invert the stereo configuration of selected stereocenters of the query, then save the resulting SMILES.
This task originated from a topic on BlueObelisk.
Implementation[]
Take the following query substructure: O[C@H]1[C@@H]([C@H]([C@@H]([C@@H](O1)CO)O)O)O
Run the query substructure against pubchem_sugars.smi.gz (file was constructed by selecting the all molecules from PUBCHEM database that contains this substructure query), and in every matched structure, invert stereo configurations of atoms that correspond to atoms #3, 4, 5 of the query substructure. Write the resulting SMILES to standard output.
The first 5 lines of the target file are:
O1[C@@H](C(=O)O)[C@H]([C@@H]([C@H]([C@@H]1OC1C=CC2C(C)=CC(=O)OC=2C=1)O)O)O 128746 O1[C@H]([C@@H]([C@H]([C@@H]([C@@H]1CO)O)O)O)OC1C(C2=CC=CC=C2OC=1C1C=CC(=C(C=1)O)O)=O 10320573 S(=O)(=O)(O)O[C@@H]1[C@H](C(=O)OS(=O)(=O)O)O[C@H]([C@@H]([C@H]1O)O)O[C@H]1[C@@H](CO)O[C@@H]([C@@H]([C@H]1O)NC(C)=O)O 23654298 O1[C@H]([C@@H]([C@H]([C@@H]([C@@H]1C(=O)OC)O)OCC1C=CC=CC=1)OC(C)=O)OC 10338204 O([C@H]1[C@@H]([C@H]([C@@H]([C@H](C(=O)O)O1)O)O)O)[C@@H]1CCC2[C@]1(C)CCC1[C@@]3(C)CCC(CC3CCC12)=O 162498
And the right output of the program would be:
O1[C@@H](OC2=CC3=C(C(=CC(O3)=O)C)C=C2)[C@H](O)[C@H](O)[C@@H](O)[C@H]1C(O)=O 128746 O1[C@H](CO)[C@H](O)[C@@H](O)[C@@H](O)[C@@H]1OC1=C(C2=CC(O)=C(O)C=C2)OC2C(=CC=CC=2)C1=O 10320573 S(O[C@@H]1[C@@H](O)[C@@H](O)[C@H](O[C@@H]2[C@H](O)[C@@H](NC(=O)C)[C@@H](O)O[C@@H]2CO)O[C@@H]1C(OS(O)(=O)=O)=O)(O)(=O)=O 23654298 O1[C@H](C(OC)=O)[C@H](O)[C@@H](OCC2=CC=CC=C2)[C@@H](OC(=O)C)[C@@H]1OC 10338204 O([C@H]1[C@]2(CCC3C(C2CC1)CCC1[C@@]3(CCC(=O)C1)C)C)[C@@H]1O[C@H](C(O)=O)[C@H](O)[C@@H](O)[C@H]1O 162498
There are total 822 structures in the target file, and the example query matches every of them, and so the program should output 822 modified SMILES total.
Indigo/Python[]
from indigo import *
indigo = Indigo()
q = indigo.loadQueryMolecule('O[C@H]1[C@@H]([C@H]([C@@H]([C@@H](O1)CO)O)O)O')
saver = indigo.createFileSaver('pubchem_sugars_transformed.smi', 'smi')
indigo.setOption("smiles-saving-write-name", "1")
for target in indigo.iterateSmilesFile ('pubchem_sugars.smi'):
print(target.smiles())
matcher = indigo.substructureMatcher(target)
match = matcher.match(q)
match.mapAtom(q.getAtom(3)).invertStereo()
match.mapAtom(q.getAtom(4)).invertStereo()
match.mapAtom(q.getAtom(5)).invertStereo()
saver.append(target)
Indigo/C++[]
#include "base_cpp/scanner.h"
#include "base_cpp/output.h"
#include "molecule/smiles_saver.h"
#include "gzip/gzip_scanner.h"
#include "molecule/molecule.h"
#include "molecule/smiles_loader.h"
#include "molecule/molecule_substructure_matcher.h"
int main (void)
{
const char *query = "O[C@H]1[C@@H]([C@H]([C@@H]([C@@H](O1)CO)O)O)O";
int atoms_to_invert[] = {3, 4, 5};
Molecule qmol;
BufferScanner qs(query);
SmilesLoader ql(qs);
ql.loadMolecule(qmol, true);
try
{
FileScanner scanner("sugars.smi.gz");
GZipScanner gzscanner(scanner);
while (!gzscanner.isEOF())
{
Molecule mol;
Array<char> str;
gzscanner.readString(str, false);
BufferScanner strscanner(str);
SmilesLoader loader(strscanner);
loader.loadMolecule(mol, false);
mol.calcImplicitHydrogens(true);
MoleculeSubstructureMatcher matcher(mol);
matcher.setQuery(qmol);
if (!matcher.find())
{
printf("%.*s\n", str.size(), str.ptr());
continue;
}
const int *mapping = matcher.getQueryMapping();
for (int i = 0; i < NELEM(atoms_to_invert); i++)
{
int *pyramid = mol.getStereocenters().getPyramid(mapping[atoms_to_invert[i]]);
int tmp;
__swap(pyramid[0], pyramid[1], tmp);
}
StandardOutput output;
SmilesSaver saver(output);
saver.saveMolecule(mol);
output.writeCR();
}
}
catch (Exception &e)
{
fprintf(stderr, "error: %s\n", e.message());
return -1;
}
return 0;
}
Instructions:
- Unpack 'graph' and 'molecule' projects into some folder
- Create 'utils' folder nearby
- Paste the above code into utils/stereo-invert.cpp file
- Compile the file using the following commands:
$ cd graph; make CONF=Release32; cd ..
$ cd molecule; make CONF=Release32; cd ..
$ cd utils
$ gcc stereo-invert.cpp -o stereo-invert -O3 -m32 -I.. -I../common ../molecule/dist/Release32/GNU-Linux-x86/libmolecule.a ../graph/dist/Release32/GNU-Linux-x86/libgraph.a -lpthread -lstdc++ - Run the program like that:
$ ./stereo-invert
Cactvs/Tcl[]
set ss [ens create {O[C@H]1[C@@H]([C@H]([C@@H]([C@@H](O1)CO)O)O)O} smarts]
molfile loop sugars.smi.gz eh {
if {[match ss -stereo 1 $ss $eh amap]} {
atom invert $eh [lindex [lindex $amap 3] 1]
atom invert $eh [lindex [lindex $amap 5] 1]
atom invert $eh [lindex [lindex $amap 7] 1]
puts [ens new $eh E_SMILES]
} else {
error "not matched"
}
}
In Cactvs, the stereo hydrogen atoms after the @ or @@ are explicitly stored. This is the reason why (zero-based) atom map indices 3, 5, and 7 are used and not simply 3, 4 and 5.
Cactvs/Python[]
ss=Ens('O[C@H]1[C@@H]([C@H]([C@@H]([C@@H](O1)CO)O)O)O','smarts')
Molfile.Loop('sugars.smi.gz',variable='e',function='''
if (match('ss',ss,e,stereo=1,atommapvariable='amap')):
amap[3][1].invert()
amap[5][1].invert()
amap[7][1].invert()
print(e.new('E_SMILES'))
else:
raise RuntimeError('not matched')
''')