# HG changeset patch # User bgruening # Date 1376551825 14400 # Node ID 34ae5f2ae4508f80f14ffca1ebeae1f8fbbd06bb Uploaded diff -r 000000000000 -r 34ae5f2ae450 osra.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/osra.py Thu Aug 15 03:30:25 2013 -0400 @@ -0,0 +1,27 @@ +#!usr/bin/env python + +import os, sys +import subprocess + +""" + OSRA_DATA_FILES is set during the toolshed Installation + If it is not set, use the standard configuration of OSRA. + That means we need to delete arguments 4-7. + This script is a hack, because we do not know the content of OSRA_DATA_FILES at xml evaluation time. + + osra -f $oformat $infile + -l \$OSRA_DATA_FILES/spelling.txt -a \$OSRA_DATA_FILES/superatom.txt + > $outfile +""" + +if not os.path.exists(sys.argv[7]): + # OSRA_DATA_FILES path is not set or the spelling file is not existent + sys.argv.pop(7) # superatom.txt path + sys.argv.pop(6) # -a + sys.argv.pop(5) # spelling.txt path + sys.argv.pop(4) # -l + +sys.argv[0] = 'osra' +subprocess.call(sys.argv, stdout=sys.stdout) + + diff -r 000000000000 -r 34ae5f2ae450 osra.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/osra.xml Thu Aug 15 03:30:25 2013 -0400 @@ -0,0 +1,74 @@ + + in Images or PDF documents (OSRA) + + osra + openbabel + graphicsmagick + + + ## OSRA_DATA_FILES is set during the toolshed Installation + ## if it is not set, use the standard configuration and hope for the best + osra.py -f $oformat $infile + -l \$OSRA_DATA_FILES/spelling.txt -a \$OSRA_DATA_FILES/superatom.txt + + ## further additions of OSRA parameter should go after -l and -a + ## because -l and -a can be removed by the python wrapper + + $confidence + $adaptive + $thinning + + > $outfile + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +**What this tool does** + +OSRA_ (Optical Structure Recognition Application) is a utility designed to convert graphical representations of chemical structures into SMILES or SDF.
It generates the SMILES or SDF representation of any molecular structure image within a document which is parseable by GraphicsMagick. + +.. _OSRA: http://cactus.nci.nih.gov/osra/ + +----- + +.. class:: infomark + +**Cite** + +Igor V Filippov and Marc C Nicklaus - `Optical Structure Recognition Software To Recover Chemical Information: OSRA, An Open Source Solution`_ + +.. _`Optical Structure Recognition Software To Recover Chemical Information: OSRA, An Open Source Solution`: http://pubs.acs.org/doi/abs/10.1021/ci800067r + + diff -r 000000000000 -r 34ae5f2ae450 readme --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/readme Thu Aug 15 03:30:25 2013 -0400 @@ -0,0 +1,20 @@ +OSRA: Optical Structure Recognition Application + +OSRA is a utility designed to convert graphical representations of chemical +structures, as they appear in journal articles, patent documents, textbooks, +trade magazines etc., into SMILES (Simplified Molecular Input Line Entry +Specification - see http://en.wikipedia.org/wiki/SMILES) or +SD files - a computer recognizable molecular structure format. +OSRA can read a document in any of the over 90 graphical formats parseable by +ImageMagick - including GIF, JPEG, PNG, TIFF, PDF, PS etc., and generate +the SMILES or SDF representation of the molecular structure images encountered +within that document. + +Note that any software designed for optical recognition is unlikely to be +perfect, and the output produced might, and probably will, contain errors, +so curation by a human knowledgeable in chemical structures is highly recommended. + +http://cactus.nci.nih.gov/osra/ + +The wrapper comes with an automatic installation of all dependencies through the +galaxy toolshed.
diff -r 000000000000 -r 34ae5f2ae450 repository_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/repository_dependencies.xml Thu Aug 15 03:30:25 2013 -0400 @@ -0,0 +1,4 @@ + + + + diff -r 000000000000 -r 34ae5f2ae450 test_data/2008001635_153_chem.png Binary file test_data/2008001635_153_chem.png has changed diff -r 000000000000 -r 34ae5f2ae450 test_data/2008001635_153_chem.smi --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test_data/2008001635_153_chem.smi Thu Aug 15 03:30:25 2013 -0400 @@ -0,0 +1,1 @@ +CCC(c1ccc(cc1)Br)OCCCO diff -r 000000000000 -r 34ae5f2ae450 test_data/CID_2244.png Binary file test_data/CID_2244.png has changed diff -r 000000000000 -r 34ae5f2ae450 test_data/CID_2244.sdf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test_data/CID_2244.sdf Thu Aug 15 03:30:25 2013 -0400 @@ -0,0 +1,155 @@ +2244 + -OEChem-05151212332D + + 21 21 0 0 0 0 0 0 0999 V2000 + 3.7320 -0.0600 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 + 6.3301 1.4400 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 + 4.5981 1.4400 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 + 2.8660 -1.5600 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 + 4.5981 -0.5600 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 5.4641 -0.0600 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 4.5981 -1.5600 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 6.3301 -0.5600 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 5.4641 -2.0600 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 6.3301 -1.5600 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 5.4641 0.9400 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 2.8660 -0.5600 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 2.0000 -0.0600 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 4.0611 -1.8700 0.0000 H 0 0 0 0 0 0 0 0 0 0 0 0 + 6.8671 -0.2500 0.0000 H 0 0 0 0 0 0 0 0 0 0 0 0 + 5.4641 -2.6800 0.0000 H 0 0 0 0 0 0 0 0 0 0 0 0 + 6.8671 -1.8700 0.0000 H 0 0 0 0 0 0 0 0 0 0 0 0 + 2.3100 0.4769 0.0000 H 0 0 0 0 0 0 0 0 0 0 0 0 + 1.4631 0.2500 0.0000 H 0 0 0 0 0 0 0 0 0 0 0 0 + 1.6900 -0.5969 0.0000 H 0 0 0 0 0 0 0 0 0 0 0 0 + 6.3301 2.0600 0.0000 H 0 0 0 0 0 0 0 0 0 0 0 0 + 1 5 1 0 0 0 0 + 1 12 1 0 0 0 0 + 2 11 1 0 0 0 
0 + 2 21 1 0 0 0 0 + 3 11 2 0 0 0 0 + 4 12 2 0 0 0 0 + 5 6 1 0 0 0 0 + 5 7 2 0 0 0 0 + 6 8 2 0 0 0 0 + 6 11 1 0 0 0 0 + 7 9 1 0 0 0 0 + 7 14 1 0 0 0 0 + 8 10 1 0 0 0 0 + 8 15 1 0 0 0 0 + 9 10 2 0 0 0 0 + 9 16 1 0 0 0 0 + 10 17 1 0 0 0 0 + 12 13 1 0 0 0 0 + 13 18 1 0 0 0 0 + 13 19 1 0 0 0 0 + 13 20 1 0 0 0 0 +M END +> +2244 + +> +1 + +> +212 + +> +4 + +> +1 + +> +3 + +> +AAADccBwOAAAAAAAAAAAAAAAAAAAAAAAAAAwAAAAAAAAAAABAAAAGgAACAAADASAmAAyDoAABgCIAiDSCAACCAAkIAAIiAEGCMgMJzaENRqCe2Cl4BEIuYeIyCCOAAAAAAAIAAAAAAAAABAAAAAAAAAAAA== + +> +2-acetoxybenzoic acid + +> +2-acetyloxybenzoic acid + +> +2-acetyloxybenzoic acid + +> +2-acetyloxybenzoic acid + +> +2-acetoxybenzoic acid + +> +InChI=1S/C9H8O4/c1-6(10)13-8-5-3-2-4-7(8)9(11)12/h2-5H,1H3,(H,11,12) + +> +BSYNRYMUTXBXSQ-UHFFFAOYSA-N + +> +1.2 + +> +180.042259 + +> +C9H8O4 + +> +180.15742 + +> +CC(=O)OC1=CC=CC=C1C(=O)O + +> +CC(=O)OC1=CC=CC=C1C(=O)O + +> +63.6 + +> +180.042259 + +> +0 + +> +13 + +> +0 + +> +0 + +> +0 + +> +0 + +> +0 + +> +1 + +> +1 + +> +1 +5 +255 + +> +5 6 8 +5 7 8 +6 8 8 +7 9 8 +8 10 8 +9 10 8 + +$$$$ + diff -r 000000000000 -r 34ae5f2ae450 tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Thu Aug 15 03:30:25 2013 -0400 @@ -0,0 +1,74 @@ + + + + + + + + + + + + + http://downloads.sourceforge.net/project/osra/osra/2.0.0/osra-2.0.0.tgz + + + + + + + + + + + + + wget http://potrace.sourceforge.net/download/potrace-1.11.tar.gz + tar xfvz potrace-1.11.tar.gz && cd potrace-1.11 && ./configure --with-libpotrace --prefix=$INSTALL_DIR/potrace/build && make && make install + + + + wget http://downloads.sourceforge.net/project/osra/gocr-patched/gocr-0.50pre-patched.tgz + tar xfvz gocr-0.50pre-patched.tgz && cd gocr-0.50pre-patched && ./configure --prefix=$INSTALL_DIR/gocr/build && make libs && make all install + + + + wget http://downloads.sourceforge.net/project/tclap/tclap-1.2.1.tar.gz + tar xfvz tclap-1.2.1.tar.gz && cd tclap-1.2.1 && ./configure 
--prefix=$INSTALL_DIR/tclap/build && make && make install + + + wget http://mirror.checkdomain.de/gnu/ocrad/ocrad-0.21.tar.gz + tar xfvz ocrad-0.21.tar.gz && cd ocrad-0.21 && ./configure --prefix=$INSTALL_DIR/ocrad/build && make && make install + + + wget https://launchpad.net/cuneiform-linux/1.1/1.1/+download/cuneiform-linux-1.1.0.tar.bz2 + tar xfvj cuneiform-linux-1.1.0.tar.bz2 && cd cuneiform-linux-1.1.0 && mkdir build && cd build && cmake .. -DCMAKE_INSTALL_PREFIX=$INSTALL_DIR/cuneiform/build/ && make && make install + + + + export PATH=$PATH:$GRAPHICSMAGICK_ROOT_DIR/bin/ && + ./configure --with-tclap-include=$INSTALL_DIR/tclap/build/include/ --with-potrace-include=$INSTALL_DIR/potrace/build/include/ --with-potrace-lib=$INSTALL_DIR/potrace/build/lib/ --with-gocr-include=$INSTALL_DIR/gocr/build/include/gocr/ --with-gocr-lib=$INSTALL_DIR/gocr/build/lib/ --with-ocrad-include=$INSTALL_DIR/ocrad/build/include/ --with-ocrad-lib=$INSTALL_DIR/ocrad/build/lib/ --with-cuneiform-include=$INSTALL_DIR/cuneiform/build/install/include/ --with-cuneiform --with-cuneiform-lib=$INSTALL_DIR/cuneiform/build/install/lib/ --with-openbabel-include=$OPENBABEL_INCLUDE_DIR/openbabel-2.0/ --with-openbabel-lib=$OPENBABEL_LIB_DIR --with-graphicsmagick-lib=$GRAPHICSMAGICK_ROOT_DIR/lib/ --with-graphicsmagick-include=$GRAPHICSMAGICK_ROOT_DIR/include/GraphicsMagick/ --prefix=$INSTALL_DIR + make + make install + + rm $INSTALL_DIR/tclap/ -r + + rm $INSTALL_DIR/gocr/ -r + rm $INSTALL_DIR/ocrad/ -r + rm $INSTALL_DIR/cuneiform/ -r + + + $ENV[GRAPHICSMAGICK_ROOT_DIR]/lib/ + $INSTALL_DIR/potrace/build/lib/ + $INSTALL_DIR/bin + + $INSTALL_DIR/share + + + + We still have a handful of requirements + +