comparison get_pubchem/get_pubchem_assays.py @ 5:c2055dd1927b draft default tip

Uploaded
author bgruening
date Thu, 24 Apr 2014 13:19:33 -0400
parents
children
comparison
equal deleted inserted replaced
4:7c1f9962ac07 5:c2055dd1927b
1 #!/usr/bin/env python
2
# Module metadata.
__author__ = 'Bjoern Gruening'
__version__ = '0.1'
__date__ = '2014'
__license__ = 'GPL3+'
7
8 import ftplib
9 import os, sys
10 import argparse
11 import subprocess
12 from multiprocessing import Pool
13 import tempfile
14 import shutil
15 import urllib
16 import zipfile
17 import gzip
18
19
# Location of the PubChem BioAssay CSV data directory on the NCBI FTP server.
# main() uses it to list the available archives and fetch_convert() uses it
# to download each archive.
PUBCHEM_URL = "ftp://ftp.ncbi.nlm.nih.gov/pubchem/Bioassay/CSV/Data/"
21
def main(output, processors = 4, white_list = ('Active', 'Inconclusive', 'Inactive')):
    """
    Starting multiple processes to download and extract PubChem Assay data.

    The archive names are listed over FTP from the directory PUBCHEM_URL
    points to, every archive is handed to fetch_convert() in a worker
    process, and the per-archive results are concatenated into one file.

    output     -- path of the file that receives the concatenated result.
    processors -- number of worker processes used for download/conversion.
    white_list -- PUBCHEM_ACTIVITY_OUTCOME values that should be kept.
    """
    # Function-scope import so the module-level import block stays untouched;
    # urlparse lives in urllib.parse on Python 3 and in urlparse on Python 2.
    try:
        from urllib.parse import urlparse
    except ImportError:
        from urlparse import urlparse

    # ftplib expects a host name and a server-side path, not a full URL,
    # so split PUBCHEM_URL into its two parts.
    location = urlparse(PUBCHEM_URL)
    ftp = ftplib.FTP(location.netloc)
    try:
        ftp.login()
        ftp.cwd(location.path)
        # fetch_convert() opens every entry with zipfile, so anything that
        # is not a zip archive is left out.
        filelist = [name for name in ftp.nlst() if name.endswith('.zip')]
    finally:
        ftp.close()

    td = tempfile.mkdtemp()
    try:
        pool = Pool(processes = processors)
        try:
            triplestore = [(filename, td, list(white_list)) for filename in filelist]
            pool.map_async(fetch_convert, triplestore).get()
        finally:
            # After a successful get() all tasks are done; after a failure
            # the remaining workers are stopped instead of being waited for.
            pool.terminate()
            pool.join()

        # Binary mode on both sides: the parts are copied byte for byte.
        with open(output, 'wb') as output_handle:
            for filename in sorted(os.listdir(td)):
                with open(os.path.join(td, filename), 'rb') as part_handle:
                    shutil.copyfileobj(part_handle, output_handle)
    finally:
        # Remove the working directory even when a download or conversion failed.
        shutil.rmtree(td, ignore_errors=True)
44
def fetch_convert(args):
    """
    Download one PubChem BioAssay archive and convert it to a TSV file.

    args is a (filename, td, white_list) tuple, packed that way so the
    function can be used with Pool.map_async():

    filename   -- name of the zip archive below PUBCHEM_URL.
    td         -- directory that receives the result, '<filename>.tsv'.
    white_list -- PUBCHEM_ACTIVITY_OUTCOME values that should be kept.

    The downloaded archive and the extracted files are removed again, only
    the TSV file stays in td.
    """
    # Function-scope import so the module-level import block stays untouched;
    # urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    (filename, td, white_list) = args
    tmp_name = os.path.join(td, filename)
    output = tmp_name + '.tsv'
    # The gzip members are read as bytes, so compare against bytes as well.
    wanted = set(_to_bytes(outcome) for outcome in white_list)

    temp_dir = tempfile.mkdtemp()
    try:
        # URLs always use '/', so do not build them with os.path.join().
        urlretrieve(PUBCHEM_URL.rstrip('/') + '/' + filename, tmp_name)
        with zipfile.ZipFile(tmp_name, "r") as z:
            z.extractall(temp_dir)

        with open(output, 'wb') as out_handle:
            for root, dirs, files in os.walk(temp_dir):
                for member in files:
                    # the member name encodes the assay_id, it looks like 1.csv.gz
                    assay_id = _to_bytes(member.split('.', 1)[0])
                    _convert_assay_file(os.path.join(root, member), assay_id, wanted, out_handle)
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)
        if os.path.exists(tmp_name):
            os.remove(tmp_name)


def _to_bytes(value):
    """Return value as a byte string; text is encoded as UTF-8."""
    if isinstance(value, bytes):
        return value
    return value.encode('utf-8')


def _convert_assay_file(gzfile_path, assay_id, wanted, out_handle):
    """Append the white-listed rows of one gzipped assay CSV to out_handle as TSV."""
    with gzip.open(gzfile_path, 'rb') as gzfile:
        gzfile.readline() # skip first line
        for line in gzfile:
            # NOTE(review): a plain split does not honour quoted fields that
            # contain commas; the columns used here come before any such field.
            cols = line.split(b',')
            if len(cols) < 5:
                # blank or truncated line without outcome and URL columns
                continue
            if cols[2] not in wanted:
                continue
            cols.pop(4) # removing the URL column
            cols.insert(0, assay_id) # insert assay_id as first column
            row = b'\t'.join(cols)
            # The line break travels with the last column; restore it if
            # that column was the one removed or the file lacks a final one.
            if not row.endswith(b'\n'):
                row += b'\n'
            out_handle.write(row)
72
73
if __name__ == '__main__':
    # Command-line entry point: parse the options and hand them to main().
    parser = argparse.ArgumentParser(description='Download the PubChem BioAssay data and convert it to one tab-separated file on the fly.')
    parser.add_argument("-o", "--output", dest="output",
                    required=True,
                    help="Path to the output file.")
    parser.add_argument("-p", "--processors", dest="processors",
                    type=int, default=10,
                    help="How many processors you want to use.")
    parser.add_argument("-w", "--white-list", dest="white_list",
                    default="Active,Inconclusive,Inactive",
                    help="List of comma separated PUBCHEM_ACTIVITY_OUTCOME values that should be fetched.")

    options = parser.parse_args()
    # Tolerate blanks around the commas and ignore empty items ("Active, Inactive,").
    outcomes = [item.strip() for item in options.white_list.split(',') if item.strip()]
    main( options.output, options.processors, outcomes )
88