Mercurial > repos > bgruening > get_pubchem
comparison get_pubchem_assays.py @ 0:cd19c3fab3a6 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_pubchem commit aed18d7d09e332efe57d00b33c2b8249abefaedb
author | bgruening |
---|---|
date | Wed, 22 May 2019 07:44:03 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:cd19c3fab3a6 |
---|---|
#!/usr/bin/env python
"""Download PubChem BioAssay CSV archives and merge them into one TSV file."""

__author__ = 'Bjoern Gruening'
__version__ = '0.1'
__date__ = '2014'
# NOTE(review): fixed typo — original said 'GLP3+'.
__license__ = 'GPL3+'

import argparse
import ftplib
import gzip
import os
import shutil
import subprocess
import sys
import tempfile
import urllib
import zipfile
from multiprocessing import Pool

# Base FTP directory that holds one zip archive per assay-id range
# (e.g. 0000001_0001000.zip). Used both to list and to fetch archives.
PUBCHEM_URL = "ftp://ftp.ncbi.nlm.nih.gov/pubchem/Bioassay/CSV/Data/"
def main(output, processors=4, white_list=('Active', 'Inconclusive', 'Inactive')):
    """
    Download and extract all PubChem BioAssay archives in parallel and
    concatenate the per-archive TSV results into ``output``.

    Parameters
    ----------
    output : str
        Path of the merged TSV file to write.
    processors : int
        Number of worker processes used for download/extraction.
    white_list : sequence of str
        PUBCHEM_ACTIVITY_OUTCOME values to keep (membership test only,
        so a tuple default avoids the shared-mutable-default pitfall).
    """
    td = tempfile.mkdtemp()
    ftp = ftplib.FTP('ftp.ncbi.nih.gov')
    ftp.login()
    # cwd() expects a server-side path; the original passed the full
    # ftp:// URL, which the server rejects.
    ftp.cwd('/pubchem/Bioassay/CSV/Data/')
    filelist = ftp.nlst()
    # Close the listing connection; each worker opens its own transfer.
    ftp.quit()

    pool = Pool(processes=processors)
    try:
        triplestore = [(name, td, white_list) for name in filelist]
        # .get() re-raises any worker exception instead of hiding it.
        pool.map_async(fetch_convert, triplestore).get()
    finally:
        pool.close()
        pool.join()

    # Concatenate every per-archive TSV into the final output.
    # Binary mode on both sides keeps copyfileobj byte-exact and avoids
    # the original text/binary mismatch; the with-blocks close every
    # source handle (the original leaked one per file).
    with open(output, 'wb') as output_handle:
        for filename in os.listdir(td):
            if not filename.endswith('.tsv'):
                continue  # skip anything that is not a worker result
            path = os.path.join(td, filename)
            with open(path, 'rb') as part:
                shutil.copyfileobj(part, output_handle)

    shutil.rmtree(td)
44 | |
def fetch_convert(args):
    """
    Worker: download one BioAssay zip archive, extract its gzipped CSV
    members, and write the whitelisted rows to ``<archive>.tsv`` in ``td``.

    ``args`` is a ``(filename, td, white_list)`` tuple so the function can
    be dispatched through ``Pool.map_async``.
    """
    (filename, td, white_list) = args
    tmp_name = os.path.join(td, filename)

    # urllib.urlretrieve is Python-2-only; resolve the right name at call
    # time so the worker runs on both interpreters.
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve          # Python 2
    # PUBCHEM_URL already ends with '/', so plain concatenation is correct;
    # os.path.join must not be used on URLs (wrong separator on Windows).
    urlretrieve(PUBCHEM_URL + filename, tmp_name)

    temp_dir = tempfile.mkdtemp()
    with zipfile.ZipFile(tmp_name, "r") as z:
        z.extractall(temp_dir)

    output = os.path.join(td, filename) + '.tsv'
    with open(output, 'w') as out_handle:
        for root, dirs, files in os.walk(temp_dir):
            # 'member' instead of reusing 'filename': the original shadowed
            # the outer archive name here.
            for member in files:
                # The member name encodes the assay id, e.g. '1.csv.gz';
                # take the part before the first dot (the original kept
                # the whole split() list instead of the id).
                assay_id = member.split('.', 1)[0]
                gzfile_path = os.path.join(root, member)
                with gzip.open(gzfile_path, 'rb') as gzfile:
                    gzfile.readline()  # skip the CSV header line
                    for line in gzfile:
                        # Decode so this also works on Python 3, where
                        # gzip yields bytes.
                        cols = line.decode('utf-8').rstrip('\r\n').split(',')
                        # Outcome is column 2 of the raw row (checked
                        # before any column shifting).
                        if cols[2] not in white_list:
                            continue
                        del cols[4]            # drop the URL column
                        cols.insert(0, assay_id)
                        # The original built 'cols' but then wrote the raw
                        # line; write the transformed row instead.
                        out_handle.write('\t'.join(cols) + '\n')
    # The original leaked the extraction directory.
    shutil.rmtree(temp_dir)
    os.remove(tmp_name)
72 | |
73 | |
if __name__ == '__main__':
    # Original description ("whole PubChem ... canonical SMILES") described
    # a different tool; this script fetches BioAssay activity data as TSV.
    parser = argparse.ArgumentParser(
        description='Download the PubChem BioAssay data and convert it to one TSV file.')
    parser.add_argument("-o", "--output", dest="output",
                        required=True,
                        help="Path to the output file.")
    parser.add_argument("-p", "--processors", dest="processors",
                        type=int, default=10,
                        help="How many processors you want to use.")
    parser.add_argument("-w", "--white-list", dest="white_list",
                        default="Active,Inconclusive,Inactive",
                        help="List of comma separated PUBCHEM_ACTIVITY_OUTCOME "
                             "values that should be fetched.")

    options = parser.parse_args()
    main(options.output, options.processors, options.white_list.split(','))