(→Implementation: added numbers) |
|||
(6 intermediate revisions by 3 users not shown) | |||
Line 6: | Line 6: | ||
Take the following query substructure: O[C@H]1[C@@H]([C@H]([C@@H]([C@@H](O1)CO)O)O)O |
Take the following query substructure: O[C@H]1[C@@H]([C@H]([C@@H]([C@@H](O1)CO)O)O)O |
||
− | Run the query substructure against [http:// |
+ | Run the query substructure against [http://ggasoftware.com/downloads/ctr.wikia.com/pubchem_sugars.smi.gz pubchem_sugars.smi.gz] (file was constructed by selecting the all molecules from PUBCHEM database that contains this substructure query), and in every matched structure, invert stereo configurations of atoms that correspond to atoms #3, 4, 5 of the query substructure. Write the resulting SMILES to standard output. |
The first 5 lines of the target file are: |
The first 5 lines of the target file are: |
||
− | + | O1[C@@H](C(=O)O)[C@H]([C@@H]([C@H]([C@@H]1OC1C=CC2C(C)=CC(=O)OC=2C=1)O)O)O 128746 |
|
− | + | O1[C@H]([C@@H]([C@H]([C@@H]([C@@H]1CO)O)O)O)OC1C(C2=CC=CC=C2OC=1C1C=CC(=C(C=1)O)O)=O 10320573 |
|
− | + | S(=O)(=O)(O)O[C@@H]1[C@H](C(=O)OS(=O)(=O)O)O[C@H]([C@@H]([C@H]1O)O)O[C@H]1[C@@H](CO)O[C@@H]([C@@H]([C@H]1O)NC(C)=O)O 23654298 |
|
+ | O1[C@H]([C@@H]([C@H]([C@@H]([C@@H]1C(=O)OC)O)OCC1C=CC=CC=1)OC(C)=O)OC 10338204 |
||
− | S(N[C@H]1[C@@H](O[C@@H]2[C@H](C(=O)[O-])O[C@@H](O[C@@H]3[C@@H](COS(=O)(=O)[O-])O[C@H](OC)[C@H](NS(=O)(=O)[O-])[C@H]3O)[C@H](OS(=O)(=O)[O-])[C@H]2O)O[C@H](COS(=O)(=O)[O-])C(O[C@H]2[C@H](O)[C@@H](O)[C@H](O[C@@H]3[C@H](NS(=O)(=O)[O-])[C@@H](O)[C@H](O)[C@@H](COS(=O)(=O)[O-])O3)[C@@H](C(=O)[O-])O2)[C@@H]1OS(=O)(=O)[O-])(=O)(=O)[O-].[Na+].[Na+].[Na+].[Na+].[Na+].[Na+].[Na+].[Na+].[Na+].[Na+] |
||
− | + | O([C@H]1[C@@H]([C@H]([C@@H]([C@H](C(=O)O)O1)O)O)O)[C@@H]1CCC2[C@]1(C)CCC1[C@@]3(C)CCC(CC3CCC12)=O 162498 |
|
And the right output of the program would be: |
And the right output of the program would be: |
||
− | + | O1[C@@H](OC2=CC3=C(C(=CC(O3)=O)C)C=C2)[C@H](O)[C@H](O)[C@@H](O)[C@H]1C(O)=O 128746 |
|
− | + | O1[C@H](CO)[C@H](O)[C@@H](O)[C@@H](O)[C@@H]1OC1=C(C2=CC(O)=C(O)C=C2)OC2C(=CC=CC=2)C1=O 10320573 |
|
− | + | S(O[C@@H]1[C@@H](O)[C@@H](O)[C@H](O[C@@H]2[C@H](O)[C@@H](NC(=O)C)[C@@H](O)O[C@@H]2CO)O[C@@H]1C(OS(O)(=O)=O)=O)(O)(=O)=O 23654298 |
|
+ | O1[C@H](C(OC)=O)[C@H](O)[C@@H](OCC2=CC=CC=C2)[C@@H](OC(=O)C)[C@@H]1OC 10338204 |
||
− | S([O-])(=O)(=O)N[C@@H]1[C@@H](OS([O-])(=O)=O)C(O[C@@H]2O[C@H](C([O-])=O)[C@@H](O[C@H]3O[C@H](COS([O-])(=O)=O)[C@@H](O)[C@H](O)[C@H]3NS([O-])(=O)=O)[C@H](O)[C@H]2O)[C@@H](COS([O-])(=O)=O)O[C@@H]1O[C@@H]1[C@@H](O)[C@@H](OS([O-])(=O)=O)[C@H](O[C@H]2[C@H](O)[C@@H](NS([O-])(=O)=O)[C@@H](OC)O[C@@H]2COS([O-])(=O)=O)O[C@@H]1C([O-])=O.[Na+].[Na+].[Na+].[Na+].[Na+].[Na+].[Na+].[Na+].[Na+].[Na+] |
||
− | + | O([C@H]1[C@]2(CCC3C(C2CC1)CCC1[C@@]3(CCC(=O)C1)C)C)[C@@H]1O[C@H](C(O)=O)[C@H](O)[C@@H](O)[C@H]1O 162498 |
|
− | There are total |
+ | There are total 822 structures in the target file, and the example query matches every of them, and so the program should output 822 modified SMILES total. |
+ | |||
+ | ==Indigo/Python== |
||
+ | <source lang="python"> |
||
+ | from indigo import * |
||
+ | |||
+ | indigo = Indigo() |
||
+ | q = indigo.loadQueryMolecule('O[C@H]1[C@@H]([C@H]([C@@H]([C@@H](O1)CO)O)O)O') |
||
+ | |||
+ | saver = indigo.createFileSaver('pubchem_sugars_transformed.smi', 'smi') |
||
+ | indigo.setOption("smiles-saving-write-name", "1") |
||
+ | |||
+ | for target in indigo.iterateSmilesFile ('pubchem_sugars.smi'): |
||
+ | print(target.smiles()) |
||
+ | matcher = indigo.substructureMatcher(target) |
||
+ | match = matcher.match(q) |
||
+ | match.mapAtom(q.getAtom(3)).invertStereo() |
||
+ | match.mapAtom(q.getAtom(4)).invertStereo() |
||
+ | match.mapAtom(q.getAtom(5)).invertStereo() |
||
+ | saver.append(target) |
||
+ | </source> |
||
==Indigo/C++== |
==Indigo/C++== |
||
Line 93: | Line 113: | ||
#Compile the file using the following commands:<br />$ cd graph; make CONF=Release32; cd ..<br />$ cd molecule; make CONF=Release32; cd ..<br />$ cd utils<br />$ gcc stereo-invert.cpp -o stereo-invert -O3 -m32 -I.. -I../common ../molecule/dist/Release32/GNU-Linux-x86/libmolecule.a ../graph/dist/Release32/GNU-Linux-x86/libgraph.a -lpthread -lstdc++ |
#Compile the file using the following commands:<br />$ cd graph; make CONF=Release32; cd ..<br />$ cd molecule; make CONF=Release32; cd ..<br />$ cd utils<br />$ gcc stereo-invert.cpp -o stereo-invert -O3 -m32 -I.. -I../common ../molecule/dist/Release32/GNU-Linux-x86/libmolecule.a ../graph/dist/Release32/GNU-Linux-x86/libgraph.a -lpthread -lstdc++ |
||
#Run the program like that:<br />$ ./stereo-invert<br /> |
#Run the program like that:<br />$ ./stereo-invert<br /> |
||
+ | |||
+ | ==Cactvs/Tcl== |
||
+ | <pre lang="tcl"> |
||
+ | set ss [ens create {O[C@H]1[C@@H]([C@H]([C@@H]([C@@H](O1)CO)O)O)O} smarts] |
||
+ | molfile loop sugars.smi.gz eh { |
||
+ | if {[match ss -stereo 1 $ss $eh amap]} { |
||
+ | atom invert $eh [lindex [lindex $amap 3] 1] |
||
+ | atom invert $eh [lindex [lindex $amap 5] 1] |
||
+ | atom invert $eh [lindex [lindex $amap 7] 1] |
||
+ | puts [ens new $eh E_SMILES] |
||
+ | } else { |
||
+ | error "not matched" |
||
+ | } |
||
+ | } |
||
+ | </pre> |
||
+ | |||
+ | In Cactvs, the stereo hydrogen atoms after the @ or @@ are explicitly stored. This is the reason why (zero-based) atom map indices 3, 5, and 7 are used and not simply 3, 4 and 5. |
||
+ | |||
+ | ==Cactvs/Python== |
||
+ | |||
+ | <pre lang="python"> |
||
+ | ss=Ens('O[C@H]1[C@@H]([C@H]([C@@H]([C@@H](O1)CO)O)O)O','smarts') |
||
+ | Molfile.Loop('sugars.smi.gz',variable='e',function=''' |
||
+ | if (match('ss',ss,e,stereo=1,atommapvariable='amap')): |
||
+ | amap[3][1].invert() |
||
+ | amap[5][1].invert() |
||
+ | amap[7][1].invert() |
||
+ | print(e.new('E_SMILES')) |
||
+ | else: |
||
+ | raise RuntimeError('not matched') |
||
+ | ''') |
||
+ | </pre> |
||
[[Category:Indigo/C++]] |
[[Category:Indigo/C++]] |
||
+ | [[Category:Cactvs/Tcl]] |
||
+ | [[Category:Cactvs/Python]] |
Latest revision as of 18:43, 11 October 2013
Run a SMILES file containing multiple structures. Perform a substructure match of a query SMILES structure against each input structure. In every found substructure, invert the stereo configuration of selected stereocenters of the query, then save the resulting SMILES.
This task originated from a topic on BlueObelisk.
Implementation[]
Take the following query substructure: O[C@H]1[C@@H]([C@H]([C@@H]([C@@H](O1)CO)O)O)O
Run the query substructure against pubchem_sugars.smi.gz (file was constructed by selecting the all molecules from PUBCHEM database that contains this substructure query), and in every matched structure, invert stereo configurations of atoms that correspond to atoms #3, 4, 5 of the query substructure. Write the resulting SMILES to standard output.
The first 5 lines of the target file are:
O1[C@@H](C(=O)O)[C@H]([C@@H]([C@H]([C@@H]1OC1C=CC2C(C)=CC(=O)OC=2C=1)O)O)O 128746 O1[C@H]([C@@H]([C@H]([C@@H]([C@@H]1CO)O)O)O)OC1C(C2=CC=CC=C2OC=1C1C=CC(=C(C=1)O)O)=O 10320573 S(=O)(=O)(O)O[C@@H]1[C@H](C(=O)OS(=O)(=O)O)O[C@H]([C@@H]([C@H]1O)O)O[C@H]1[C@@H](CO)O[C@@H]([C@@H]([C@H]1O)NC(C)=O)O 23654298 O1[C@H]([C@@H]([C@H]([C@@H]([C@@H]1C(=O)OC)O)OCC1C=CC=CC=1)OC(C)=O)OC 10338204 O([C@H]1[C@@H]([C@H]([C@@H]([C@H](C(=O)O)O1)O)O)O)[C@@H]1CCC2[C@]1(C)CCC1[C@@]3(C)CCC(CC3CCC12)=O 162498
And the right output of the program would be:
O1[C@@H](OC2=CC3=C(C(=CC(O3)=O)C)C=C2)[C@H](O)[C@H](O)[C@@H](O)[C@H]1C(O)=O 128746 O1[C@H](CO)[C@H](O)[C@@H](O)[C@@H](O)[C@@H]1OC1=C(C2=CC(O)=C(O)C=C2)OC2C(=CC=CC=2)C1=O 10320573 S(O[C@@H]1[C@@H](O)[C@@H](O)[C@H](O[C@@H]2[C@H](O)[C@@H](NC(=O)C)[C@@H](O)O[C@@H]2CO)O[C@@H]1C(OS(O)(=O)=O)=O)(O)(=O)=O 23654298 O1[C@H](C(OC)=O)[C@H](O)[C@@H](OCC2=CC=CC=C2)[C@@H](OC(=O)C)[C@@H]1OC 10338204 O([C@H]1[C@]2(CCC3C(C2CC1)CCC1[C@@]3(CCC(=O)C1)C)C)[C@@H]1O[C@H](C(O)=O)[C@H](O)[C@@H](O)[C@H]1O 162498
There are total 822 structures in the target file, and the example query matches every of them, and so the program should output 822 modified SMILES total.
Indigo/Python[]
from indigo import *
indigo = Indigo()
q = indigo.loadQueryMolecule('O[C@H]1[C@@H]([C@H]([C@@H]([C@@H](O1)CO)O)O)O')
saver = indigo.createFileSaver('pubchem_sugars_transformed.smi', 'smi')
indigo.setOption("smiles-saving-write-name", "1")
for target in indigo.iterateSmilesFile ('pubchem_sugars.smi'):
print(target.smiles())
matcher = indigo.substructureMatcher(target)
match = matcher.match(q)
match.mapAtom(q.getAtom(3)).invertStereo()
match.mapAtom(q.getAtom(4)).invertStereo()
match.mapAtom(q.getAtom(5)).invertStereo()
saver.append(target)
Indigo/C++[]
#include "base_cpp/scanner.h"
#include "base_cpp/output.h"
#include "molecule/smiles_saver.h"
#include "gzip/gzip_scanner.h"
#include "molecule/molecule.h"
#include "molecule/smiles_loader.h"
#include "molecule/molecule_substructure_matcher.h"
int main (void)
{
const char *query = "O[C@H]1[C@@H]([C@H]([C@@H]([C@@H](O1)CO)O)O)O";
int atoms_to_invert[] = {3, 4, 5};
Molecule qmol;
BufferScanner qs(query);
SmilesLoader ql(qs);
ql.loadMolecule(qmol, true);
try
{
FileScanner scanner("sugars.smi.gz");
GZipScanner gzscanner(scanner);
while (!gzscanner.isEOF())
{
Molecule mol;
Array<char> str;
gzscanner.readString(str, false);
BufferScanner strscanner(str);
SmilesLoader loader(strscanner);
loader.loadMolecule(mol, false);
mol.calcImplicitHydrogens(true);
MoleculeSubstructureMatcher matcher(mol);
matcher.setQuery(qmol);
if (!matcher.find())
{
printf("%.*s\n", str.size(), str.ptr());
continue;
}
const int *mapping = matcher.getQueryMapping();
for (int i = 0; i < NELEM(atoms_to_invert); i++)
{
int *pyramid = mol.getStereocenters().getPyramid(mapping[atoms_to_invert[i]]);
int tmp;
__swap(pyramid[0], pyramid[1], tmp);
}
StandardOutput output;
SmilesSaver saver(output);
saver.saveMolecule(mol);
output.writeCR();
}
}
catch (Exception &e)
{
fprintf(stderr, "error: %s\n", e.message());
return -1;
}
return 0;
}
Instructions:
- Unpack 'graph' and 'molecule' projects into some folder
- Create 'utils' folder nearby
- Paste the above code into utils/stereo-invert.cpp file
- Compile the file using the following commands:
$ cd graph; make CONF=Release32; cd ..
$ cd molecule; make CONF=Release32; cd ..
$ cd utils
$ gcc stereo-invert.cpp -o stereo-invert -O3 -m32 -I.. -I../common ../molecule/dist/Release32/GNU-Linux-x86/libmolecule.a ../graph/dist/Release32/GNU-Linux-x86/libgraph.a -lpthread -lstdc++ - Run the program like that:
$ ./stereo-invert
Cactvs/Tcl[]
set ss [ens create {O[C@H]1[C@@H]([C@H]([C@@H]([C@@H](O1)CO)O)O)O} smarts]
molfile loop sugars.smi.gz eh {
if {[match ss -stereo 1 $ss $eh amap]} {
atom invert $eh [lindex [lindex $amap 3] 1]
atom invert $eh [lindex [lindex $amap 5] 1]
atom invert $eh [lindex [lindex $amap 7] 1]
puts [ens new $eh E_SMILES]
} else {
error "not matched"
}
}
In Cactvs, the stereo hydrogen atoms after the @ or @@ are explicitly stored. This is the reason why (zero-based) atom map indices 3, 5, and 7 are used and not simply 3, 4 and 5.
Cactvs/Python[]
ss=Ens('O[C@H]1[C@@H]([C@H]([C@@H]([C@@H](O1)CO)O)O)O','smarts')
Molfile.Loop('sugars.smi.gz',variable='e',function='''
if (match('ss',ss,e,stereo=1,atommapvariable='amap')):
amap[3][1].invert()
amap[5][1].invert()
amap[7][1].invert()
print(e.new('E_SMILES'))
else:
raise RuntimeError('not matched')
''')