Mercurial > repos > scottx611x > data_manager_fetch_gene_annotation

--- a/data_manager/data_manager.py	Tue Apr 04 18:07:39 2017 -0400
+++ b/data_manager/data_manager.py	Sun Nov 22 12:48:13 2020 +0000
@@ -1,12 +1,84 @@
+# -*- coding: utf-8 -*-
 import argparse
+import bz2
 import datetime
+import gzip
 import json
 import os
+import shutil
 import sys
 import uuid
+import zipfile

-import requests
-from requests.exceptions import ContentDecodingError
+
+# Nice solution to opening compressed files (zip/bz2/gz) transparently
+# https://stackoverflow.com/a/13045892/638445
+
+class CompressedFile(object):
+    magic = None
+    file_type = None
+    mime_type = None
+    proper_extension = None
+
+    def __init__(self, f):
+        # f is an open file or file like object
+        self.f = f
+        self.accessor = self.open()
+
+    @classmethod
+    def is_magic(self, data):
+        return data.startswith(self.magic)
+
+    def open(self):
+        return None
+
+
+class ZIPFile(CompressedFile):
+    magic = '\x50\x4b\x03\x04'
+    file_type = 'zip'
+    mime_type = 'compressed/zip'
+
+    def open(self):
+        return zipfile.ZipFile(self.f)
+
+
+class BZ2File(CompressedFile):
+    magic = '\x42\x5a\x68'
+    file_type = 'bz2'
+    mime_type = 'compressed/bz2'
+
+    def open(self):
+        return bz2.BZ2File(self.f)
+
+
+class GZFile(CompressedFile):
+    magic = '\x1f\x8b\x08'
+    file_type = 'gz'
+    mime_type = 'compressed/gz'
+
+    def open(self):
+        return gzip.GzipFile(self.f)
+
+
+# factory function to create a suitable instance for accessing files
+def get_compressed_file(filename):
+    with open(filename, 'rb') as f:
+        start_of_file = f.read(1024)
+        f.seek(0)
+        for cls in (ZIPFile, BZ2File, GZFile):
+            if cls.is_magic(start_of_file):
+                f.close()
+                return cls(filename)
+
+        return None
+
+
+try:
+    # For Python 3.0 and later
+    from urllib.request import urlretrieve
+except ImportError:
+    # Fall back to Python 2's urllib2
+    from urllib import urlretrieve


 def url_download(url):
@@ -16,36 +88,33 @@
     :returns: name of downloaded gene annotation file
     :raises: ContentDecodingError, IOError
     """
-    response = requests.get(url=url, stream=True)

     # Generate file_name
-    file_name = response.url.split("/")[-1]
+    file_name = url.split('/')[-1]
+
+    try:
+        # download URL (FTP and HTTP work, probably local and data too)
+        urlretrieve(url, file_name)

-    block_size = 10 * 1024 * 1024  # 10MB chunked download
-    with open(file_name, 'w+') as f:
-        try:
-            # Good to note here that requests' iter_content() will
-            # automatically handle decoding "gzip" and "deflate" encoding
-            # formats
-            for buf in response.iter_content(block_size):
-                f.write(buf)
-        except (ContentDecodingError, IOError) as e:
-            sys.stderr.write("Error occured downloading reference file: %s"
-                             % e)
+        # uncompress file if needed
+        cf = get_compressed_file(file_name)
+        if cf is not None:
+            uncompressed_file_name = os.path.splitext(file_name)[0]
+            with open(uncompressed_file_name, 'w+') as uncompressed_file:
+                shutil.copyfileobj(cf.accessor, uncompressed_file)
             os.remove(file_name)
-
+            file_name = uncompressed_file_name
+    except IOError as e:
+        sys.stderr.write('Error occured downloading reference file: %s' % e)
+        os.remove(file_name)
     return file_name


 def main():
     parser = argparse.ArgumentParser(description='Create data manager JSON.')
-    parser.add_argument('--out', dest='output', action='store',
-                        help='JSON filename')
-    parser.add_argument('--name', dest='name', action='store',
-                        default=uuid.uuid4(), help='Data table entry unique ID'
-                        )
-    parser.add_argument('--url', dest='url', action='store',
-                        help='Url to download gtf file from')
+    parser.add_argument('--out', dest='output', action='store', help='JSON filename')
+    parser.add_argument('--name', dest='name', action='store', default=uuid.uuid4(), help='Data table entry unique ID')
+    parser.add_argument('--url', dest='url', action='store', help='Url to download gtf file from')

     args = parser.parse_args()

@@ -66,8 +135,8 @@
         }
     }

-    with open(os.path.join(args.output), "w+") as f:
-        f.write(json.dumps(data_manager_entry))
+    with open(os.path.join(args.output), 'w+') as fh:
+        json.dump(data_manager_entry, fh, sort_keys=True)


 if __name__ == '__main__':
--- a/data_manager/gene_annotation_fetcher.xml	Tue Apr 04 18:07:39 2017 -0400
+++ b/data_manager/gene_annotation_fetcher.xml	Sun Nov 22 12:48:13 2020 +0000
@@ -1,9 +1,6 @@
 <?xml version="1.0"?>
 <tool id="gene_annotation_fetcher_data_manager" name="Gene Annotation Fetch" tool_type="manage_data" version="1.0.1">
     <description>gene annotation fetcher</description>
-    <requirements>
-        <requirement type="package" version="2.13.0">requests</requirement>
-    </requirements>
     <command detect_errors="exit_code">
     <![CDATA[
         python '$__tool_directory__/data_manager.py' --out '${out_file}'