Mercurial > repos > bgruening > get_pubchem

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_pubchem_as_smiles.py	Wed May 22 07:44:03 2019 -0400
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+
+__author__ = 'Bjoern Gruening'
+__version__ = '0.1'
+__date__ = '2012'
+__license__ = 'GLP3+'
+
+import ftplib
+import os, sys
+import argparse
+import subprocess
+from multiprocessing import Pool
+import tempfile
+import shutil
+
+
+def main(output, processors = 4):
+    output_handle = open(output,'w+')
+
+    td = tempfile.mkdtemp()
+    ftp = ftplib.FTP('ftp.ncbi.nih.gov')
+    ftp.login()
+    ftp.cwd('/pubchem/Compound/CURRENT-Full/SDF/')
+    filelist = ftp.nlst()
+
+    pool = Pool(processes = processors)
+    filenames = zip(filelist, [td]*len(filelist))
+    result = pool.map_async(fetch_convert, filenames)
+    result.get()
+
+    for filename in os.listdir(td):
+        path = os.path.join(td, filename)
+        shutil.copyfileobj(open(path, 'rb'), output_handle)
+
+    output_handle.close()
+    shutil.rmtree(td)
+
+def fetch_convert(args):
+    (filename, td) = args
+    tmp_name = os.path.join( td, filename)
+    subprocess.call( ['wget', '-O', tmp_name, os.path.join('ftp://ftp.ncbi.nih.gov/pubchem/Compound/CURRENT-Full/SDF/', filename)] )
+    output = os.path.join(td, filename) + '.smi'
+    subprocess.call(["obabel", "-isdf", tmp_name, "-ocan", '-O', output])
+    os.remove(tmp_name)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Download the whole PubChem and converts it to canonical SMILES on the fly.')
+    parser.add_argument("-o", "--output", dest="output",
+                    required=True,
+                    help="Path to the output file.")
+    parser.add_argument("-p", "--processors", dest="processors",
+                    type=int, default=10,
+                    help="How many processors you want to use.")
+
+    options = parser.parse_args()
+    main( options.output, options.processors )
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_pubchem_as_smiles.xml	Wed May 22 07:44:03 2019 -0400
@@ -0,0 +1,58 @@
+<tool id="ctb_pubchem_download_as_smiles" name="PubChem Download" version="0.2" >
+    <description>as canonical SMILES</description>
+    <requirements>
+        <requirement type="package" version="2.4.1">openbabel</requirement>
+        <requirement type="package" version="2.7">python</requirement>
+    </requirements>
+    <stdio>
+        <regex match="obError"
+               source="both"
+               level="fatal"
+               description="Critical Open Babel error" />
+        <regex match="obWarning"
+               source="both"
+               level="warning"
+               description="Non-critical Open Babel warning" />
+        <regex match="obInfo"
+               source="both"
+               level="log"
+               description="Open Babel Information" />
+    </stdio>
+    <command>
+<![CDATA[
+        ## Download PubChem and write it as canonical SMILES to the output dataset.
+        python '$__tool_directory__/get_pubchem_as_smiles.py'
+            -o '$pubchem_smi'
+            -p \${GALAXY_SLOTS:-4}
+            ## NOTE: stdout and stderr are discarded below, so the stdio patterns
+            ## of this tool cannot match any Open Babel message.
+            ## temporary hack until my Galaxy patch is committed
+            > /dev/null 2>&1
+]]>
+    </command>
+    <inputs>
+    </inputs>
+    <outputs>
+        <data format="smi" name="pubchem_smi" />
+    </outputs>
+    <tests>
+    </tests>
+    <help>
+<![CDATA[
+
+.. class:: infomark
+
+**What this tool does**
+
+This tool will fetch one PubChem_ file after another and convert them to canonical SMILES.
+
+.. _PubChem: http://pubchem.ncbi.nlm.nih.gov/
+
+-----
+
+.. class:: infomark
+
+**Output**
+
+The output will be one large SMILES file.
+
+]]>
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_pubchem_assays.py	Wed May 22 07:44:03 2019 -0400
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+
+__author__ = 'Bjoern Gruening'
+__version__ = '0.1'
+__date__ = '2014'
+__license__ = 'GLP3+'
+
+import ftplib
+import os, sys
+import argparse
+import subprocess
+from multiprocessing import Pool
+import tempfile
+import shutil
+import urllib
+import zipfile
+import gzip
+
+
+# FTP location of the PubChem BioAssay CSV data; fetch_convert() joins the
+# archive names onto it.
+PUBCHEM_URL = "ftp://ftp.ncbi.nlm.nih.gov/pubchem/Bioassay/CSV/Data/"
+
+def main(output, processors = 4, white_list = ['Active','Inconclusive', 'Inactive']):
+    """
+        Starting multiple processes to download and extract PubChem Assay data.
+    """
+    td = tempfile.mkdtemp()
+    ftp = ftplib.FTP('ftp.ncbi.nih.gov')
+    ftp.login()
+    ftp.cwd( PUBCHEM_URL )
+    filelist = ftp.nlst()
+
+    pool = Pool(processes = processors)
+    triplestore = zip(filelist, [td]*len(filelist), [white_list]*len(filelist))
+
+    result = pool.map_async(fetch_convert, triplestore)
+    result.get()
+
+    with open(output,'w+') as output_handle:
+        for filename in os.listdir( td ):
+            path = os.path.join( td, filename )
+            shutil.copyfileobj(open(path, 'rb'), output_handle)
+
+    shutil.rmtree( td )
+
+def fetch_convert(args):
+    (filename, td, white_list) = args
+    tmp_name = os.path.join( td, filename)
+    urllib.urlretrieve(os.path.join(PUBCHEM_URL, filename), tmp_name)
+
+    temp_dir = tempfile.mkdtemp()
+    with zipfile.ZipFile(tmp_name, "r") as z:
+        z.extractall(temp_dir)
+
+    output = os.path.join(td, filename) + '.tsv'
+    with open(output, 'w+') as out_handle:
+        for root, dirs, files in os.walk( temp_dir ):
+            for filename in files:
+                # filename encodes the assay_id, it looks like 1.csv.gz
+                # extract the assay id and insert it as column one
+                assay_id = filename.split('.', 1)
+                gzfile_path = os.path.join( root, filename )
+                with gzip.open(gzfile_path, 'rb') as gzfile:
+                    gzfile.readline() # skip first line
+                    for line in gzfile:
+                        cols = line.split(',')
+                        PUBCHEM_ACTIVITY_OUTCOME = cols[2]
+                        cols = line.pop(4) # removing the URL column
+                        cols.insert(0, assay_id) # insert assay_id as first column
+                        if PUBCHEM_ACTIVITY_OUTCOME in white_list:
+                            out_handle.write( '%s' % line.replace(',', '\t') )
+    os.remove(tmp_name)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Download the whole PubChem and converts it to canonical SMILES on the fly.')
+    parser.add_argument("-o", "--output", dest="output",
+                    required=True,
+                    help="Path to the output file.")
+    parser.add_argument("-p", "--processors", dest="processors",
+                    type=int, default=10,
+                    help="How many processors you want to use.")
+    parser.add_argument("-w", "--white-list", dest="white_list",
+                    default="Active,Inconclusive,Inactive",
+                    help="List of comma separated PUBCHEM_ACTIVITY_OUTCOME values that should be fetched.")
+
+    options = parser.parse_args()
+    main( options.output, options.processors, options.white_list.split(',') )
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_pubchem_assays.xml	Wed May 22 07:44:03 2019 -0400
@@ -0,0 +1,67 @@
+<tool id="ctb_pubchem_download_assays" name="PubChem Assay Downloader" version="0.2" >
+    <description>as table</description>
+    <requirements>
+        <requirement type="package" version="2.7">python</requirement>
+    </requirements>
+    <command detect_errors="aggressive">
+<![CDATA[
+        ## Download the PubChem assay tables and keep the rows with the selected outcomes.
+        python '$__tool_directory__/get_pubchem_assays.py'
+            -o '$pubchem_assay_tsv'
+            ## double quotes, so that the shell still expands GALAXY_SLOTS
+            -p "\${GALAXY_SLOTS:-4}"
+            --white-list '$white_list'
+]]>
+    </command>
+    <inputs>
+        <!-- The selected values are handed to the script as its white list. -->
+        <param name="white_list" type="select" multiple="true" label="Activity outcomes to keep">
+            <option value="Active" selected="true">Active</option>
+            <option value="Inconclusive" selected="true">Inconclusive</option>
+            <option value="Inactive">Inactive</option>
+            <option value="Unspecified">Unspecified</option>
+            <option value="Probe">Probe</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="pubchem_assay_tsv" />
+    </outputs>
+    <tests>
+    </tests>
+    <help>
+<![CDATA[
+
+.. class:: infomark
+
+**What this tool does**
+
+This tool will fetch one PubChem_ Assay file after another and concatenate them.
+It is possible to optionally filter by PUBCHEM_ACTIVITY_OUTCOME.
+
+Columns in the result file:
+
+ - column 1: PubChem AID (assay id)
+ - column 2: PubChem SID (substance id)
+ - column 3: PubChem CID (compound id)
+ - column 4: PubChem Activity Outcome
+            1-Inactive
+            2-Active
+            3-Inconclusive
+            4-Unspecified
+            5-Probe
+ - column 5: PubChem activity score, the higher value, the more active
+ - column 6: Test result specific comment
+ - column 7 and beyond: All remaining columns starting from the 7th column are the TID "names" defined in the associated assay description given by the XML file under the corresponding Description/ directory. These "names" can also be found in the "Result Definitions" section of the assay summary page: e.g. http://pubchem.ncbi.nlm.nih.gov/assay/assay.cgi?aid=2244#aDefinitions
+
+
+
+.. _PubChem: http://pubchem.ncbi.nlm.nih.gov/
+
+-----
+
+.. class:: infomark
+
+**Output**
+
+The output will be one large tabular file.
+
+]]>
+    </help>
+</tool>