# HG changeset patch
# User bgruening
# Date 1376551397 14400
# Node ID f653fd06f055f7a3a85a92a0e5a30b22398e078b
Uploaded

diff -r 000000000000 -r f653fd06f055 get_online_data/get_online_data.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_online_data/get_online_data.py	Thu Aug 15 03:23:17 2013 -0400
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+
+__author__ = 'Bjoern Gruening'
+__version__ = '0.1'
+__date__ = '2012'
+__license__ = 'GPL3+'
+
+import os, sys
+import urllib2
+import gzip, tempfile
+import zipfile
+import subprocess
+import shutil
+
+def unescape(cond_text):
+    # Unescape if the input has been escaped by Galaxy
+    mapped_chars = { '>'  : '__gt__',
+                     '<'  : '__lt__',
+                     "'"  : '__sq__',
+                     '"'  : '__dq__',
+                     '['  : '__ob__',
+                     ']'  : '__cb__',
+                     '{'  : '__oc__',
+                     '}'  : '__cc__',
+                     '@'  : '__at__',
+                     '\n' : '__cn__',
+                     '\r' : '__cr__',
+                     '\t' : '__tc__'
+                     }
+    for key, value in mapped_chars.items():
+        cond_text = cond_text.replace( value, key )
+    return cond_text
+
+urls = unescape(sys.argv[1])
+out = open(sys.argv[2], 'wb')
+
+if len(sys.argv) > 3:
+    allowed_extensions = [ ext.strip() for ext in unescape(sys.argv[3]).split('\n') ]
+else:
+    allowed_extensions = ['.sdf', '.smi', '.inchi']
+
+for url in urls.split('\n'):
+    url = url.strip()
+    request = urllib2.Request( url )
+    request.add_header('Accept-encoding', 'gzip')
+    request.add_header('Accept-encoding', 'gz')
+    response = urllib2.urlopen( request )
+
+    if response.info().get('Content-Encoding') in ['gz', 'gzip'] or os.path.splitext(url)[-1] in ['.gz', '.gzip']:
+        temp = tempfile.NamedTemporaryFile( delete=False )
+        temp.write( response.read() )
+        temp.close()
+        # local name must not shadow the zipfile module imported above
+        gz_file = gzip.open(temp.name, 'rb')
+        out.write( gz_file.read() )
+        os.remove(temp.name)
+    elif response.info().get('Content-Encoding') in ['zip'] or os.path.splitext(url)[-1] in ['.zip']:
+        temp = tempfile.NamedTemporaryFile(delete=False)
+        temp.close()
+        with open(temp.name, 'wb') as fp:
+            shutil.copyfileobj(response, fp)
+
+        zf = zipfile.ZipFile(temp.name, allowZip64=True)
+        tmpdir = tempfile.mkdtemp()
+
+        for filename in zf.namelist():
+            zf.extract( filename, tmpdir )
+
+        os.remove( temp.name )
+        molfiles = []
+        for root, dirs, files in os.walk(tmpdir):
+            for filename in files:
+                if os.path.splitext(filename)[-1].lower() in allowed_extensions or allowed_extensions == []:
+                    mfile = os.path.join( root, filename )
+                    molfiles.append( mfile )
+
+        for filename in molfiles:
+            shutil.copyfileobj(open(filename, 'rb'), out)
+        shutil.rmtree( tmpdir )
+        zf.close()
+    elif response.info().get('Content-Encoding') == 'rar' or os.path.splitext(url)[-1] in ['.rar']:
+        temp = tempfile.NamedTemporaryFile(delete=False)
+        temp.close()
+        with open(temp.name, 'wb') as fp:
+            shutil.copyfileobj(response, fp)
+        subprocess.call('unrar p -inul %s' % temp.name, stdout=out, shell=True)  # block until unrar has finished
+        os.remove( temp.name )
+    else:
+        out.write( response.read() )
+out.close()
diff -r 000000000000 -r f653fd06f055 get_online_data/get_online_data.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_online_data/get_online_data.xml	Thu Aug 15 03:23:17 2013 -0400
@@ -0,0 +1,37 @@
+fetching ...
+
+get_online_data.py "$url_paste" $output $whitelist
+
+.. class:: infomark
+
+**What this tool does**
+
+Fetch data via FTP or HTTP and store it in your history.
+
+-----
+
+.. class:: infomark
+
+**Input**
+
+Supported filetypes are:
+  - gz/gzip
+  - rar
+
+ZIP is supported with recursive extraction of specific filetypes.
+
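For illustration, here is a minimal stand-alone sketch of the gzip code path of get_online_data.py above: fetch a compressed file, buffer it in a temporary file, and write the decompressed payload to an output file. It uses the same Python 2 urllib2/gzip idioms as the script; the URL and output filename are made-up placeholders, not values used by the tool.

#!/usr/bin/env python
# Sketch only: mirrors the gzip branch of get_online_data.py.
# The URL and the output filename below are placeholders for illustration.
import gzip
import os
import shutil
import tempfile
import urllib2

url = 'http://example.org/compounds.sdf.gz'      # placeholder download location
out_path = 'compounds.sdf'                       # placeholder output file

request = urllib2.Request(url)
request.add_header('Accept-encoding', 'gzip')    # ask for gzip, as the tool does
response = urllib2.urlopen(request)

# Buffer the compressed response in a named temporary file ...
temp = tempfile.NamedTemporaryFile(delete=False)
temp.write(response.read())
temp.close()

# ... then decompress it into the output file and clean up.
with open(out_path, 'wb') as out:
    gz = gzip.open(temp.name, 'rb')
    shutil.copyfileobj(gz, out)
    gz.close()
os.remove(temp.name)

The tool itself additionally handles zip and rar archives and filters extracted files by extension, as shown in the diff above.
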
diff -r 000000000000 -r f653fd06f055 get_pubchem/get_pubchem_as_smiles.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_pubchem/get_pubchem_as_smiles.py	Thu Aug 15 03:23:17 2013 -0400
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+__author__ = 'Bjoern Gruening'
+__version__ = '0.1'
+__date__ = '2012'
+__license__ = 'GPL3+'
+
+import ftplib
+import os, sys
+import argparse
+import subprocess
+from multiprocessing import Pool
+import tempfile
+import shutil
+
+def main(output, processors = 10):
+    output_handle = open(output, 'w+')
+
+    td = tempfile.mkdtemp()
+    ftp = ftplib.FTP('ftp.ncbi.nih.gov')
+    ftp.login()
+    ftp.cwd('/pubchem/Compound/CURRENT-Full/SDF/')
+    filelist = ftp.nlst()
+
+    pool = Pool(processes = processors)
+    filenames = zip(filelist, [td]*len(filelist))
+
+    result = pool.map_async(fetch_convert, filenames)
+    result.get()
+
+    for filename in os.listdir(td):
+        path = os.path.join(td, filename)
+        shutil.copyfileobj(open(path, 'rb'), output_handle)
+
+    output_handle.close()
+    shutil.rmtree( td )
+
+def fetch_convert(args):
+    (filename, td) = args
+
+    tmp_name = os.path.join( tempfile.gettempdir(), filename )
+    subprocess.call( ['wget', '-O', tmp_name, os.path.join('ftp://ftp.ncbi.nih.gov/pubchem/Compound/CURRENT-Full/SDF/', filename)] )
+    output = os.path.join(td, filename)
+    subprocess.call(["obabel", "-isdf", tmp_name, "-ocan", '-O', output])
+    os.remove(tmp_name)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Download the whole of PubChem and convert it to canonical SMILES on the fly.')
+    parser.add_argument("-o", "--output", dest="output",
+                        required=True,
+                        help="Path to the output file.")
+    parser.add_argument("-p", "--processors", dest="processors",
+                        type=int, default=10,
+                        help="How many processors you want to use.")
+
+    options = parser.parse_args()
+    main( options.output, options.processors )
+
diff -r 000000000000 -r f653fd06f055 get_pubchem/get_pubchem_as_smiles.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_pubchem/get_pubchem_as_smiles.xml	Thu Aug 15 03:23:17 2013 -0400
@@ -0,0 +1,36 @@
+as canonical SMILES
+
+get_pubchem_as_smiles.py
+    -o $pubchem_smi
+    -p 10
+    2>&1
+
+.. class:: infomark
+
+**What this tool does**
+
+This tool will fetch one PubChem_ file after another and convert them to canonical SMILES.
+
+.. _PubChem: http://pubchem.ncbi.nlm.nih.gov/
+
+-----
+
+.. class:: infomark
+
+**Output**
+
+The output will be one large SMILES file.
+
diff -r 000000000000 -r f653fd06f055 repository_dependencies.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/repository_dependencies.xml	Thu Aug 15 03:23:17 2013 -0400
@@ -0,0 +1,4 @@
+
+
+
+
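For orientation, a minimal stand-alone sketch of the first step get_pubchem_as_smiles.py performs: listing the SDF archives in PubChem's CURRENT-Full directory, which fetch_convert() then downloads with wget and converts with obabel. It assumes network access to ftp.ncbi.nih.gov and only prints a few names.

#!/usr/bin/env python
# Sketch only: the FTP directory listing that main() in get_pubchem_as_smiles.py iterates over.
# Requires network access to ftp.ncbi.nih.gov; prints the first few archive names.
import ftplib

ftp = ftplib.FTP('ftp.ncbi.nih.gov')
ftp.login()                                    # anonymous login, as in the tool
ftp.cwd('/pubchem/Compound/CURRENT-Full/SDF/')
for name in ftp.nlst()[:5]:
    print(name)                                # SDF archives later handed to fetch_convert()
ftp.quit()

On the command line the full tool is invoked as, for example, python get_pubchem_as_smiles.py -o pubchem.smi -p 10; the Galaxy wrapper above passes $pubchem_smi as the output path.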