5
|
1 #!/usr/bin/env python
|
|
2
|
|
3 __author__ = 'Bjoern Gruening'
|
|
4 __version__ = '0.1'
|
|
5 __date__ = '2014'
|
|
6 __license__ = 'GLP3+'
|
|
7
|
|
8 import ftplib
|
|
9 import os, sys
|
|
10 import argparse
|
|
11 import subprocess
|
|
12 from multiprocessing import Pool
|
|
13 import tempfile
|
|
14 import shutil
|
|
15 import urllib
|
|
16 import zipfile
|
|
17 import gzip
|
|
18
|
|
19
|
|
20 PUBCHEM_URL = "ftp://ftp.ncbi.nlm.nih.gov/pubchem/Bioassay/CSV/Data/"
|
|
21
|
|
def main(output, processors=4, white_list=['Active', 'Inconclusive', 'Inactive']):
    """
    Download all PubChem BioAssay CSV archives in parallel and merge the
    converted per-archive TSV files into a single output file.

    output     -- path of the merged output file
    processors -- number of worker processes used for download/extraction
    white_list -- PUBCHEM_ACTIVITY_OUTCOME values that are kept
                  (read-only; the default list is never mutated)
    """
    td = tempfile.mkdtemp()
    ftp = ftplib.FTP('ftp.ncbi.nih.gov')
    ftp.login()
    # CWD takes a server-side directory path, not a full ftp:// URL.
    ftp.cwd('/pubchem/Bioassay/CSV/Data/')
    filelist = ftp.nlst()
    # Listing is all we need from this connection; close it politely.
    ftp.quit()

    pool = Pool(processes=processors)
    triplestore = zip(filelist, [td] * len(filelist), [white_list] * len(filelist))

    result = pool.map_async(fetch_convert, triplestore)
    # get() re-raises any exception from the workers.
    result.get()
    pool.close()
    pool.join()

    # Concatenate every per-archive TSV produced by the workers.
    with open(output, 'w+') as output_handle:
        for filename in os.listdir(td):
            path = os.path.join(td, filename)
            # Use a context manager so part-file handles are not leaked.
            with open(path, 'rb') as part_handle:
                shutil.copyfileobj(part_handle, output_handle)

    shutil.rmtree(td)


def fetch_convert(args):
    """
    Worker: download one PubChem assay ZIP archive and convert it to TSV.

    args is a (filename, td, white_list) triple:
      filename   -- remote ZIP archive name on the PubChem FTP server
      td         -- shared directory; the result is written to '<filename>.tsv'
      white_list -- PUBCHEM_ACTIVITY_OUTCOME values to keep

    Each archive member is a gzipped CSV named like '1.csv.gz', where the
    leading number is the assay id; that id is inserted as the first column
    of every emitted row, and the URL column (index 4) is dropped.
    """
    (filename, td, white_list) = args
    tmp_name = os.path.join(td, filename)
    urllib.urlretrieve(os.path.join(PUBCHEM_URL, filename), tmp_name)

    temp_dir = tempfile.mkdtemp()
    try:
        with zipfile.ZipFile(tmp_name, "r") as z:
            z.extractall(temp_dir)

        output = os.path.join(td, filename) + '.tsv'
        with open(output, 'w+') as out_handle:
            for root, dirs, files in os.walk(temp_dir):
                # 'member' (not 'filename') so the outer parameter is not shadowed
                for member in files:
                    # member encodes the assay id: '1.csv.gz' -> '1'
                    assay_id = member.split('.', 1)[0]
                    gzfile_path = os.path.join(root, member)
                    with gzip.open(gzfile_path, 'rb') as gzfile:
                        gzfile.readline()  # skip the CSV header line
                        for line in gzfile:
                            cols = line.rstrip('\n').split(',')
                            # Outcome sits at index 2 of the raw row; grab it
                            # before the row is reshaped below.
                            PUBCHEM_ACTIVITY_OUTCOME = cols[2]
                            cols.pop(4)  # removing the URL column
                            cols.insert(0, assay_id)  # insert assay_id as first column
                            if PUBCHEM_ACTIVITY_OUTCOME in white_list:
                                out_handle.write('%s\n' % '\t'.join(cols))
    finally:
        # Always reclaim the scratch space and the downloaded archive.
        shutil.rmtree(temp_dir)
        os.remove(tmp_name)


|
if __name__ == '__main__':
    # Command-line entry point: build the argument parser, then hand the
    # parsed values straight to main().
    parser = argparse.ArgumentParser(
        description='Download the whole PubChem and converts it to canonical SMILES on the fly.'
    )
    parser.add_argument(
        "-o", "--output",
        dest="output",
        required=True,
        help="Path to the output file.",
    )
    parser.add_argument(
        "-p", "--processors",
        dest="processors",
        type=int,
        default=10,
        help="How many processors you want to use.",
    )
    parser.add_argument(
        "-w", "--white-list",
        dest="white_list",
        default="Active,Inconclusive,Inactive",
        help="List of comma separated PUBCHEM_ACTIVITY_OUTCOME values that should be fetched.",
    )

    args = parser.parse_args()
    main(args.output, args.processors, args.white_list.split(','))