diff get_online_data/get_online_data.py @ 0:f653fd06f055 draft

Uploaded
author bgruening
date Thu, 15 Aug 2013 03:23:17 -0400
parents
children c2055dd1927b
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_online_data/get_online_data.py	Thu Aug 15 03:23:17 2013 -0400
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+
+__author__ = 'Bjoern Gruening'
+__version__ = '0.1'
+__date__ = '2012'
+__license__ = 'GLP3+'
+
+import os, sys
+import urllib2
+import gzip, tempfile
+import zipfile
+import subprocess
+import shutil
+
+def unescape(cond_text):
+    # Unescape if input has been escaped
+    mapped_chars = { '>' :'__gt__', 
+                 '<' :'__lt__', 
+                 "'" :'__sq__',
+                 '"' :'__dq__',
+                 '[' :'__ob__',
+                 ']' :'__cb__',
+                 '{' :'__oc__',
+                 '}' :'__cc__',
+                 '@' : '__at__',
+                 '\n' : '__cn__',
+                 '\r' : '__cr__',
+                 '\t' : '__tc__'
+                 }
+    for key, value in mapped_chars.items():
+        cond_text = cond_text.replace( value, key )
+    return cond_text
+
+urls = unescape(sys.argv[1])
+out = open(sys.argv[2], 'wb')
+
+if len(sys.argv) > 3:
+    allowed_extensions = [ ext.strip() for ext in unescape(sys.argv[3]).split('\n') ]
+else:
+    allowed_extensions = ['.sdf', '.smi', '.inchi']
+
+for url in urls.split('\n'):
+    url = url.strip()
+    request = urllib2.Request( url )
+    request.add_header('Accept-encoding', 'gzip')
+    request.add_header('Accept-encoding', 'gz')
+    response = urllib2.urlopen( request )
+
+    if response.info().get('Content-Encoding') in ['gz','gzip'] or os.path.splitext(url)[-1] in ['.gz','.gzip']:
+        temp = tempfile.NamedTemporaryFile( delete=False )
+        temp.write( response.read() )
+        temp.close()
+        zipfile = gzip.open(temp.name, 'rb')
+        out.write( zipfile.read() )
+        os.remove(temp.name)
+    elif response.info().get('Content-Encoding') in ['zip'] or os.path.splitext(url)[-1] in ['.zip']:
+        temp = tempfile.NamedTemporaryFile(delete=False)
+        temp.close()
+        with open(temp.name, 'wb') as fp:
+            shutil.copyfileobj(response, fp)
+
+        zf = zipfile.ZipFile(temp.name, allowZip64=True)
+        tmpdir = tempfile.mkdtemp( )
+
+        for filename in zf.namelist():
+            zf.extractall( tmpdir )
+
+        os.remove( temp.name )
+        molfiles = []
+        for root, dirs, files in os.walk(tmpdir):
+            for filename in files:
+                if os.path.splitext(filename)[-1].lower() in allowed_extensions or allowed_extensions == []:
+                    mfile = os.path.join( root, filename)
+                    molfiles.append( mfile )
+
+        for filename in molfiles:
+            shutil.copyfileobj(open(filename, 'rb'), out)
+        shutil.rmtree( tmpdir )
+        zf.close()
+    elif response.info().get('Content-Encoding') == 'rar' or os.path.splitext(url)[-1] in ['.rar']:
+        temp = tempfile.NamedTemporaryFile(delete=False)
+        temp.close()
+        with open(temp.name, 'wb') as fp:
+            shutil.copyfileobj(response, fp)
+        cmd = subprocess.Popen('unrar p -inul %s' % temp.name, stdout=out, shell=True)
+        os.remove( temp.name )
+    else:
+        out.write( response.read() )
+out.close()