Mercurial > repos > bgruening > chemical_data_sources
diff get_online_data/get_online_data.py @ 0:f653fd06f055 draft
Uploaded
author | bgruening |
---|---|
date | Thu, 15 Aug 2013 03:23:17 -0400 |
parents | |
children | c2055dd1927b |
line wrap: on
line diff
#!/usr/bin/env python
"""Download data files from a list of URLs and concatenate them into one file.

Usage::

    get_online_data.py URLS OUTPUT [ALLOWED_EXTENSIONS]

``URLS`` and ``ALLOWED_EXTENSIONS`` are newline-separated lists in which
special characters may be escaped as ``__xx__`` tokens (see ``unescape``).
Each URL is fetched in order and its payload is appended to ``OUTPUT``:

* gzip payloads (``Content-Encoding`` gz/gzip or a ``.gz``/``.gzip`` URL)
  are decompressed;
* zip archives are unpacked and every member whose extension is listed in
  ``ALLOWED_EXTENSIONS`` is appended;
* rar archives are streamed through the external ``unrar`` program;
* anything else is appended unchanged.
"""

__author__ = 'Bjoern Gruening'
__version__ = '0.1'
__date__ = '2012'
__license__ = 'GLP3+'

import gzip
import os
import shutil
import subprocess
import sys
import tempfile
import zipfile

try:
    import urllib2
except ImportError:
    # Python 3: urllib.request offers the same Request/urlopen API.
    import urllib.request as urllib2

# Extensions extracted from zip archives when none are given on the
# command line.
DEFAULT_EXTENSIONS = ['.sdf', '.smi', '.inchi']

# (character, escape token) pairs.  Kept as an ordered sequence so that the
# replacement order is the same on every interpreter.
_ESCAPED_CHARS = (
    ('>', '__gt__'),
    ('<', '__lt__'),
    ("'", '__sq__'),
    ('"', '__dq__'),
    ('[', '__ob__'),
    (']', '__cb__'),
    ('{', '__oc__'),
    ('}', '__cc__'),
    ('@', '__at__'),
    ('\n', '__cn__'),
    ('\r', '__cr__'),
    ('\t', '__tc__'),
)


def unescape(cond_text):
    """Return *cond_text* with every ``__xx__`` escape token replaced by
    the character it stands for (for example ``__gt__`` becomes ``>`` and
    ``__cn__`` becomes a newline)."""
    for char, token in _ESCAPED_CHARS:
        cond_text = cond_text.replace(token, char)
    return cond_text


def _download_to_tempfile(response):
    """Spool *response* into a named temporary file and return its path.

    The caller is responsible for removing the file.
    """
    temp = tempfile.NamedTemporaryFile(delete=False)
    try:
        with temp:
            shutil.copyfileobj(response, temp)
    except Exception:
        # Do not leave a partial download behind.
        os.remove(temp.name)
        raise
    return temp.name


def _copy_gzip(path, out):
    """Append the decompressed content of the gzip file *path* to *out*."""
    archive = gzip.open(path, 'rb')
    try:
        shutil.copyfileobj(archive, out)
    finally:
        archive.close()


def _copy_zip(path, out, allowed_extensions):
    """Append the members of the zip file *path* to *out*.

    Only members whose lower-cased extension is in *allowed_extensions*
    are copied; an empty *allowed_extensions* copies every member.
    """
    tmpdir = tempfile.mkdtemp()
    try:
        archive = zipfile.ZipFile(path, allowZip64=True)
        try:
            # One call unpacks the whole archive.
            archive.extractall(tmpdir)
        finally:
            archive.close()
        for root, dirs, files in os.walk(tmpdir):
            for filename in files:
                extension = os.path.splitext(filename)[-1].lower()
                if allowed_extensions and extension not in allowed_extensions:
                    continue
                with open(os.path.join(root, filename), 'rb') as member:
                    shutil.copyfileobj(member, out)
    finally:
        shutil.rmtree(tmpdir)


def _copy_rar(path, out):
    """Append the content of the rar file *path* to *out* using ``unrar``.

    Raises ``OSError`` if ``unrar`` cannot be started and
    ``subprocess.CalledProcessError`` if it exits with an error.
    """
    # The child writes straight to the file descriptor, so anything still
    # sitting in Python's buffer has to reach the file first.
    out.flush()
    # Argument list instead of a shell string: the path is never parsed by
    # a shell.  check_call waits for unrar, so the archive is not removed
    # while it is still being read.
    subprocess.check_call(['unrar', 'p', '-inul', path], stdout=out)


def fetch_urls(urls, out, allowed_extensions=None):
    """Fetch every URL in *urls* and append its payload to *out*.

    urls               -- newline-separated URLs; blank lines are skipped.
    out                -- binary file object opened for writing.
    allowed_extensions -- extensions (with leading dot, lower case) to take
                          from zip archives; ``None`` selects
                          ``DEFAULT_EXTENSIONS``, an empty list selects
                          every member.
    """
    if allowed_extensions is None:
        allowed_extensions = DEFAULT_EXTENSIONS

    for url in urls.split('\n'):
        url = url.strip()
        if not url:
            continue
        request = urllib2.Request(url)
        # A second add_header call with the same name replaces the first,
        # so only one value can be sent; 'gzip' is the registered coding.
        request.add_header('Accept-encoding', 'gzip')
        response = urllib2.urlopen(request)
        try:
            encoding = response.info().get('Content-Encoding')
            extension = os.path.splitext(url)[-1].lower()
            is_gzip = encoding in ('gz', 'gzip') or extension in ('.gz', '.gzip')
            is_zip = encoding == 'zip' or extension == '.zip'
            is_rar = encoding == 'rar' or extension == '.rar'

            if not (is_gzip or is_zip or is_rar):
                shutil.copyfileobj(response, out)
                continue

            path = _download_to_tempfile(response)
            try:
                # Same precedence as before: gzip, then zip, then rar.
                if is_gzip:
                    _copy_gzip(path, out)
                elif is_zip:
                    _copy_zip(path, out, allowed_extensions)
                else:
                    _copy_rar(path, out)
            finally:
                os.remove(path)
        finally:
            response.close()


def main(argv=None):
    """Command-line entry point; *argv* defaults to ``sys.argv``."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.exit('Usage: %s URLS OUTPUT [ALLOWED_EXTENSIONS]' % argv[0])

    urls = unescape(argv[1])
    if len(argv) > 3:
        candidates = (ext.strip().lower()
                      for ext in unescape(argv[3]).split('\n'))
        # Dropping blank entries lets an empty argument mean "no filter".
        allowed_extensions = [ext for ext in candidates if ext]
    else:
        allowed_extensions = list(DEFAULT_EXTENSIONS)

    with open(argv[2], 'wb') as out:
        fetch_urls(urls, out, allowed_extensions)


if __name__ == '__main__':
    main()