repository: cstrittmatter/test_eurl_vtec_wgs_pt
diff scripts/ReMatCh/modules/download.py @ 3:0cbed1c0a762 draft default tip
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8

author   | cstrittmatter
date     | Tue, 28 Jan 2020 10:42:31 -0500
parents  | 965517909457
children |
--- a/scripts/ReMatCh/modules/download.py	Wed Jan 22 09:10:12 2020 -0500
+++ b/scripts/ReMatCh/modules/download.py	Tue Jan 28 10:42:31 2020 -0500
@@ -1,112 +1,178 @@
-import utils
 import os.path
 import multiprocessing
 import sys
 import functools
 import time
+import subprocess
+
+try:
+    import modules.utils as utils
+except ImportError:
+    from ReMatCh.modules import utils as utils
 
 
-def getReadRunInfo(ena_id):
-    import urllib
+def get_read_run_info(ena_id):
+    import urllib.request
 
     url = 'http://www.ebi.ac.uk/ena/data/warehouse/filereport?accession=' + ena_id + '&result=read_run'
 
-    readRunInfo = None
+    read_run_info = None
     try:
-        url = urllib.urlopen(url)
-        readRunInfo = url.read().splitlines()
-        if len(readRunInfo) <= 1:
-            readRunInfo = None
+        url = urllib.request.urlopen(url)
+        read_run_info = url.read().decode("utf8").splitlines()
+        if len(read_run_info) <= 1:
+            read_run_info = None
    except Exception as error:
-        print error
+        print(error)
 
-    return readRunInfo
+    return read_run_info
 
 
-def getDownloadInformation(readRunInfo):
-    header_line = readRunInfo[0].split('\t')
-    info_line = readRunInfo[1].split('\t')
+def get_download_information(read_run_info):
+    header_line = read_run_info[0].split('\t')
+    info_line = read_run_info[1].split('\t')
 
-    downloadInformation = {'fastq': None, 'submitted': None, 'cram_index': None}
+    download_information = {'fastq': None, 'submitted': None, 'cram_index': None}
     download_types = ['aspera', 'ftp']
 
     for i in range(0, len(header_line)):
         header = header_line[i].lower().rsplit('_', 1)
-        if header[0] in downloadInformation.keys():
+        if header[0] in list(download_information.keys()):
            if header[1] in download_types:
                 if len(info_line[i]) > 0:
                     files_path = info_line[i].split(';')
                     if len(files_path) > 2:
-                        print 'WARNING: Were found more files than expected in ' + header[1] + ' download links!'
-                    if downloadInformation[header[0]] is None:
-                        downloadInformation[header[0]] = {}
-                    downloadInformation[header[0]][header[1]] = files_path
+                        print('WARNING: Were found more files than expected in'
+                              ' {download_information}-{download_types} download'
+                              ' links!'.format(download_information=header[0], download_types=header[1]))
+                    if download_information[header[0]] is None:
+                        download_information[header[0]] = {}
+                    download_information[header[0]][header[1]] = files_path
 
-    return downloadInformation
+    return download_information
 
 
-def getSequencingInformation(readRunInfo):
-    header_line = readRunInfo[0].split('\t')
-    info_line = readRunInfo[1].split('\t')
+def get_sequencing_information(read_run_info):
+    header_line = read_run_info[0].split('\t')
+    info_line = read_run_info[1].split('\t')
 
-    sequencingInformation = {'run_accession': None, 'instrument_platform': None, 'instrument_model': None, 'library_layout': None, 'library_source': None, 'extra_run_accession': None, 'nominal_length': None, 'read_count': None, 'base_count': None}
+    sequencing_information = {'run_accession': None, 'instrument_platform': None, 'instrument_model': None,
+                              'library_layout': None, 'library_source': None, 'extra_run_accession': None,
+                              'nominal_length': None, 'read_count': None, 'base_count': None,
+                              'date_download': time.strftime("%Y-%m-%d")}
 
     for i in range(0, len(header_line)):
         header = header_line[i].lower()
-        if header in sequencingInformation.keys():
+        if header in list(sequencing_information.keys()):
             if len(info_line[i]) > 0:
-                sequencingInformation[header] = info_line[i]
+                sequencing_information[header] = info_line[i]
 
-    if len(readRunInfo) > 2:
+    if len(read_run_info) > 2:
         extra_run_accession = []
-        for i in range(2, len(readRunInfo)):
-            info = readRunInfo[i].split('\t')
+        for i in range(2, len(read_run_info)):
+            info = read_run_info[i].split('\t')
             for j in range(0, len(header_line)):
                 header = header_line[j].lower()
                 if header == 'run_accession':
                     if len(info[j]) > 0:
                         extra_run_accession.append(info[j])
         if len(extra_run_accession) >= 1:
-            sequencingInformation['extra_run_accession'] = ','.join(extra_run_accession)
+            sequencing_information['extra_run_accession'] = ','.join(extra_run_accession)
 
-    return sequencingInformation
+    return sequencing_information
 
 
 @utils.trace_unhandled_exceptions
-def downloadWithAspera(aspera_file_path, asperaKey, outdir, pickle_prefix):
-    command = ['ascp', '-QT', '-l', '300m', '-P33001', '-i', asperaKey, str('era-fasp@' + aspera_file_path), outdir]
-    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, 3600, True)
+def download_with_aspera(aspera_file_path, aspera_key, outdir, pickle_prefix, sra, ena_id):
+    command = ['ascp', '-QT', '-l', '300m', '', '-i', aspera_key, '', outdir]
+    if not sra:
+        command[4] = '-P33001'
+        command[7] = str('era-fasp@' + aspera_file_path)
+        pickle = pickle_prefix + '.' + aspera_file_path.rsplit('/', 1)[1]
+    else:
+        command[7] = 'anonftp@ftp.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/{a}/{b}/{c}/{c}.sra'.format(
+            a=ena_id[:3], b=ena_id[:6], c=ena_id)
+        pickle = pickle_prefix + '.' + ena_id
+
+    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, 3600, True)
+
+    utils.save_variable_to_pickle(run_successfully, outdir, pickle)
+
 
-    utils.saveVariableToPickle(run_successfully, outdir, str(pickle_prefix + '.' + aspera_file_path.rsplit('/', 1)[1]))
+@utils.trace_unhandled_exceptions
+def download_with_wget(ftp_file_path, outdir, pickle_prefix, sra, ena_id):
+    command = ['wget', '--tries=1', '', '-O', '']
+    if not sra:
+        command[2] = ftp_file_path
+        file_download = ftp_file_path.rsplit('/', 1)[1]
+        command[4] = os.path.join(outdir, file_download)
+        pickle = pickle_prefix + '.' + file_download
+    else:
+        command[2] = 'ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/{a}/{b}/{c}/{c}.sra'.format(
+            a=ena_id[:3], b=ena_id[:6], c=ena_id)
+        command[4] = os.path.join(outdir, ena_id + '.sra')
+        pickle = pickle_prefix + '.' + ena_id
+    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, 3600, True)
+
+    utils.save_variable_to_pickle(run_successfully, outdir, pickle)
 
 
 @utils.trace_unhandled_exceptions
-def downloadWithWget(ftp_file_path, outdir, pickle_prefix):
-    file_download = ftp_file_path.rsplit('/', 1)[1]
-    command = ['wget', '--tries=2', ftp_file_path, '-O', os.path.join(outdir, file_download)]
-    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, 3600, True)
+def download_with_sra_prefetch(aspera_key, outdir, pickle_prefix, ena_id):
+    command = ['prefetch', '', ena_id]
+
+    if aspera_key is not None:
+        _, ascp, _ = utils.run_command_popen_communicate(['which', 'ascp'], False, None, False)
+        command[1] = '-a {ascp}|{aspera_key}'.format(ascp=ascp.splitlines()[0], aspera_key=aspera_key)
+
+    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, 3600, True)
+    if run_successfully:
+        _, prefetch_outdir, _ = utils.run_command_popen_communicate(['echo', '$HOME/ncbi/public/sra'], True, None,
+                                                                    False)
 
-    utils.saveVariableToPickle(run_successfully, outdir, str(pickle_prefix + '.' + file_download))
+        try:
+            os.rename(os.path.join(prefetch_outdir.splitlines()[0], ena_id + '.sra'),
+                      os.path.join(outdir, ena_id + '.sra'))
+        except OSError as e:
+            print('Found the following error:'
+                  '{}'.format(e))
+
+            from shutil import copy as shutil_copy
+
+            shutil_copy(os.path.join(prefetch_outdir.splitlines()[0], ena_id + '.sra'),
+                        os.path.join(outdir, ena_id + '.sra'))
+            os.remove(os.path.join(prefetch_outdir.splitlines()[0], ena_id + '.sra'))
+
+    utils.save_variable_to_pickle(run_successfully, outdir, pickle_prefix + '.' + ena_id)
 
 
 @utils.trace_unhandled_exceptions
-def downloadWithCurl(ftp_file_path, outdir, pickle_prefix):
-    file_download = ftp_file_path.rsplit('/', 1)[1]
-    command = ['curl', '--retry', '2', ftp_file_path, '-O', os.path.join(outdir, file_download)]
-    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, 3600, True)
+def download_with_curl(ftp_file_path, outdir, pickle_prefix, sra, ena_id):
+    command = ['curl', '--retry', '1', '', '-o', '']
+    if not sra:
+        command[3] = ftp_file_path
+        file_download = ftp_file_path.rsplit('/', 1)[1]
+        command[5] = os.path.join(outdir, file_download)
+        pickle = pickle_prefix + '.' + file_download
+    else:
+        command[3] = 'ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/{a}/{b}/{c}/{c}.sra'.format(
+            a=ena_id[:3], b=ena_id[:6], c=ena_id)
+        command[5] = os.path.join(outdir, ena_id + '.sra')
+        pickle = pickle_prefix + '.' + ena_id
+    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, 3600, True)
 
-    utils.saveVariableToPickle(run_successfully, outdir, str(pickle_prefix + '.' + file_download))
+    utils.save_variable_to_pickle(run_successfully, outdir, pickle)
 
 
-def getPickleRunSuccessfully(directory, pickle_prefix):
+def get_pickle_run_successfully(directory, pickle_prefix):
     run_successfully = True
     read_pickle = False
 
-    files = findFiles(directory, pickle_prefix, '.pkl')
+    files = find_files(directory, pickle_prefix, '.pkl')
     if files is not None:
         for file_found in files:
             if run_successfully:
-                run_successfully = utils.extractVariableFromPickle(file_found)
+                run_successfully = utils.extract_variable_from_pickle(file_found)
                 read_pickle = True
 
             os.remove(file_found)
@@ -119,75 +185,103 @@
 
 def curl_installed():
     command = ['which', 'curl']
-    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, False)
+    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, False)
     return run_successfully
 
 
-def download(downloadInformation_type, asperaKey, outdir):
+def download(download_information_type, aspera_key, outdir, sra, sra_opt, ena_id):
     pickle_prefix = 'download'
     run_successfully = False
-
-    if asperaKey is not None and downloadInformation_type['aspera'] is not None:
-        pool = multiprocessing.Pool(processes=2)
-        for file_download in downloadInformation_type['aspera']:
-            pool.apply_async(downloadWithAspera, args=(file_download, asperaKey, outdir, pickle_prefix,))
-        pool.close()
-        pool.join()
+    download_sra = False
 
-        run_successfully = getPickleRunSuccessfully(outdir, pickle_prefix)
-
-    if downloadInformation_type['ftp'] is not None and not run_successfully:
-        if curl_installed():
+    if not sra:
+        if aspera_key is not None and download_information_type['aspera'] is not None:
             pool = multiprocessing.Pool(processes=2)
-            for file_download in downloadInformation_type['ftp']:
-                pool.apply_async(downloadWithCurl, args=(file_download, outdir, pickle_prefix,))
+            for file_download in download_information_type['aspera']:
+                pool.apply_async(download_with_aspera, args=(file_download, aspera_key, outdir, pickle_prefix, sra,
+                                                             ena_id,))
             pool.close()
             pool.join()
-            run_successfully = getPickleRunSuccessfully(outdir, pickle_prefix)
+            run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
+        if not run_successfully and download_information_type['ftp'] is not None:
+            if curl_installed():
+                pool = multiprocessing.Pool(processes=2)
+                for file_download in download_information_type['ftp']:
+                    pool.apply_async(download_with_curl, args=(file_download, outdir, pickle_prefix, sra, ena_id,))
+                pool.close()
+                pool.join()
+                run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
+            if not run_successfully:
+                pool = multiprocessing.Pool(processes=2)
+                for file_download in download_information_type['ftp']:
+                    pool.apply_async(download_with_wget, args=(file_download, outdir, pickle_prefix, sra, ena_id,))
+                pool.close()
+                pool.join()
+                run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
+
+    if not run_successfully and (sra or sra_opt):
+        if aspera_key is not None:
+            download_with_aspera(None, aspera_key, outdir, pickle_prefix, sra or sra_opt, ena_id)
+            run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
         if not run_successfully:
-            pool = multiprocessing.Pool(processes=2)
-            for file_download in downloadInformation_type['ftp']:
-                pool.apply_async(downloadWithWget, args=(file_download, outdir, pickle_prefix,))
-            pool.close()
-            pool.join()
-            run_successfully = getPickleRunSuccessfully(outdir, pickle_prefix)
+            download_with_sra_prefetch(aspera_key, outdir, pickle_prefix, ena_id)
+            run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
+        if not run_successfully:
+            if curl_installed():
+                download_with_curl(None, outdir, pickle_prefix, sra or sra_opt, ena_id)
+                run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
+            if not run_successfully:
+                download_with_wget(None, outdir, pickle_prefix, sra or sra_opt, ena_id)
+                run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
 
-    return run_successfully
+    if run_successfully:
+        download_sra = True
+
+    return run_successfully, download_sra
 
 
-def downloadFiles(downloadInformation, asperaKey, outdir, download_cram_bam_True):
+def download_files(download_information, aspera_key, outdir, download_cram_bam_true, sra, sra_opt, ena_id):
     run_successfully = False
     cram_index_run_successfully = False
+    download_sra = False
 
-    if downloadInformation['fastq'] is not None:
-        run_successfully = download(downloadInformation['fastq'], asperaKey, outdir)
+    if download_information['fastq'] is not None:
+        run_successfully, download_sra = download(download_information['fastq'], aspera_key, outdir, sra, sra_opt,
+                                                  ena_id)
 
     if not run_successfully:
-        if downloadInformation['submitted'] is not None:
-            if not download_cram_bam_True:
+        if download_information['submitted'] is not None:
+            if not download_cram_bam_true:
                 cram_bam = False
-                for i in downloadInformation['submitted']:
-                    if downloadInformation['submitted'][i][0].endswith(('.cram', '.bam')):
+                for i in download_information['submitted']:
+                    if download_information['submitted'][i][0].endswith(('.cram', '.bam')):
                         cram_bam = True
                         break
                 if not cram_bam:
-                    run_successfully = download(downloadInformation['submitted'], asperaKey, outdir)
+                    run_successfully, download_sra = download(download_information['submitted'], aspera_key, outdir,
+                                                              False, False, ena_id)
 
-            elif download_cram_bam_True:
-                run_successfully = download(downloadInformation['submitted'], asperaKey, outdir)
-                if run_successfully and downloadInformation['cram_index'] is not None:
-                    cram_index_run_successfully = download(downloadInformation['cram_index'], asperaKey, outdir)
+            elif download_cram_bam_true:
+                run_successfully, download_sra = download(download_information['submitted'], aspera_key, outdir, False,
+                                                          False, ena_id)
+                if run_successfully and download_information['cram_index'] is not None:
+                    cram_index_run_successfully = download(download_information['cram_index'], aspera_key, outdir,
+                                                           False, False, ena_id)
 
-    return run_successfully, cram_index_run_successfully
+    if not run_successfully and (sra or sra_opt):
+        run_successfully, download_sra = download(download_information['fastq'], aspera_key, outdir, True, sra_opt,
+                                                  ena_id)
+
+    return run_successfully, cram_index_run_successfully, download_sra
 
 
-def sortAlignment(alignment_file, output_file, sortByName_True, threads):
-    outFormat_string = os.path.splitext(output_file)[1][1:].lower()
-    command = ['samtools', 'sort', '-o', output_file, '-O', outFormat_string, '', '-@', str(threads), alignment_file]
-    if sortByName_True:
+def sort_alignment(alignment_file, output_file, sort_by_name_true, threads):
+    out_format_string = os.path.splitext(output_file)[1][1:].lower()
+    command = ['samtools', 'sort', '-o', output_file, '-O', out_format_string, '', '-@', str(threads), alignment_file]
+    if sort_by_name_true:
         command[6] = '-n'
-    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
+    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, True)
     if not run_successfully:
         output_file = None
@@ -195,34 +289,33 @@
     return run_successfully, output_file
 
 
-def alignmentToFastq(alignment_file, outdir, threads, pair_end_type):
+def alignment_to_fastq(alignment_file, threads, pair_end_type):
     fastq_basename = os.path.splitext(alignment_file)[0]
     outfiles = None
-    bamFile = fastq_basename + '.temp.bam'
+    bam_file = fastq_basename + '.temp.bam'
     # sort cram
-    run_successfully, bamFile = sortAlignment(alignment_file, bamFile, True, threads)
+    run_successfully, bam_file = sort_alignment(alignment_file, bam_file, True, threads)
     if run_successfully:
-        command = ['samtools', 'fastq', '', bamFile]
+        command = ['samtools', 'fastq', '', bam_file]
         if pair_end_type.lower() == 'paired':
             command[2] = '-1 ' + str(fastq_basename + '_1.fq') + ' -2 ' + str(fastq_basename + '_2.fq')
         elif pair_end_type == 'single':
             command[2] = '-0 ' + str(fastq_basename + '.fq')
 
-        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
+        run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, True)
         if run_successfully:
             if pair_end_type.lower() == 'paired':
                 outfiles = [str(fastq_basename + '_1.fq'), str(fastq_basename + '_2.fq')]
             elif pair_end_type.lower() == 'single':
                 outfiles = [str(fastq_basename + '.fq')]
 
-    if os.path.isfile(bamFile):
-        os.remove(bamFile)
+    if bam_file is not None and os.path.isfile(bam_file):
+        os.remove(bam_file)
 
     return run_successfully, outfiles
 
 
-def formartFastqHeaders(in_fastq_1, in_fastq_2):
-    import itertools
+def formart_fastq_headers(in_fastq_1, in_fastq_2):
 
     out_fastq_1 = in_fastq_1 + '.temp'
     out_fastq_2 = in_fastq_2 + '.temp'
@@ -233,7 +326,7 @@
             plus_line = True
             quality_line = True
             number_reads = 0
-            for in_1, in_2 in itertools.izip(reader_in_fastq_1, reader_in_fastq_2):
+            for in_1, in_2 in zip(reader_in_fastq_1, reader_in_fastq_2):
                 if len(in_1) > 0:
                     in_1 = in_1.splitlines()[0]
                     in_2 = in_2.splitlines()[0]
@@ -269,22 +362,22 @@
 
 
 @utils.trace_unhandled_exceptions
-def gzipFiles(file_2_compress, pickle_prefix, outdir):
-    out_file = None
+def gzip_files(file_2_compress, pickle_prefix, outdir):
     if file_2_compress.endswith('.temp'):
         out_file = os.path.splitext(file_2_compress)[0]
     else:
         out_file = file_2_compress
 
     command = ['gzip', '--stdout', '--best', file_2_compress, '>', str(out_file + '.gz')]
-    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None, True)
+    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, True, None, True)
     if run_successfully:
         os.remove(file_2_compress)
 
-    utils.saveVariableToPickle(run_successfully, outdir, str(pickle_prefix + '.' + os.path.basename(file_2_compress)))
+    utils.save_variable_to_pickle(run_successfully, outdir,
+                                  str(pickle_prefix + '.' + os.path.basename(file_2_compress)))
 
 
-def findFiles(directory, prefix, suffix):
+def find_files(directory, prefix, suffix):
     list_files_found = []
     files = [f for f in os.listdir(directory) if not f.startswith('.') and os.path.isfile(os.path.join(directory, f))]
     for file_found in files:
@@ -298,59 +391,60 @@
     return list_files_found
 
 
-def compressFiles(fastq_files, outdir, threads):
+def compress_files(fastq_files, outdir, threads):
     pickle_prefix = 'compress'
     compressed_fastq_files = None
 
     pool = multiprocessing.Pool(processes=threads)
     for fastq in fastq_files:
-        pool.apply_async(gzipFiles, args=(fastq, pickle_prefix, outdir,))
+        pool.apply_async(gzip_files, args=(fastq, pickle_prefix, outdir,))
     pool.close()
     pool.join()
 
-    run_successfully = getPickleRunSuccessfully(outdir, pickle_prefix)
+    run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
 
     if run_successfully:
-        compressed_fastq_files = findFiles(outdir, '', '.gz')
+        compressed_fastq_files = find_files(outdir, '', '.gz')
 
     return run_successfully, compressed_fastq_files
 
 
-def bamCram_2_fastq(alignment_file, outdir, threads, pair_end_type):
-    run_successfully, fastq_files = alignmentToFastq(alignment_file, outdir, threads, pair_end_type)
+def bam_cram_2_fastq(alignment_file, outdir, threads, pair_end_type):
+    run_successfully, fastq_files = alignment_to_fastq(alignment_file, threads, pair_end_type)
     if run_successfully:
         if pair_end_type.lower() == 'paired':
-            number_reads, fastq_files = formartFastqHeaders(fastq_files[0], fastq_files[1])
+            number_reads, fastq_files = formart_fastq_headers(fastq_files[0], fastq_files[1])
 
-        run_successfully, fastq_files = compressFiles(fastq_files, outdir, threads)
+        run_successfully, fastq_files = compress_files(fastq_files, outdir, threads)
 
     return run_successfully, fastq_files
 
 
-def check_correct_links(downloadInformation):
-    for i in downloadInformation:
-        if downloadInformation[i] is not None:
-            if downloadInformation[i]['aspera'] is not None:
-                for j in range(0, len(downloadInformation[i]['aspera'])):
-                    if downloadInformation[i]['aspera'][j].startswith('fasp.sra.ebi.ac.uk/'):
-                        downloadInformation[i]['aspera'][j] = downloadInformation[i]['aspera'][j].replace('fasp.sra.ebi.ac.uk/', 'fasp.sra.ebi.ac.uk:/', 1)
-            if downloadInformation[i]['ftp'] is not None:
-                for j in range(0, len(downloadInformation[i]['ftp'])):
-                    if '#' in downloadInformation[i]['ftp'][j]:
-                        downloadInformation[i]['ftp'][j] = downloadInformation[i]['ftp'][j].replace('#', '%23')
-    return downloadInformation
+def check_correct_links(download_information):
+    for i in download_information:
+        if download_information[i] is not None:
+            if download_information[i]['aspera'] is not None:
+                for j in range(0, len(download_information[i]['aspera'])):
+                    if download_information[i]['aspera'][j].startswith('fasp.sra.ebi.ac.uk/'):
+                        download_information[i]['aspera'][j] = download_information[i]['aspera'][j].replace(
+                            'fasp.sra.ebi.ac.uk/', 'fasp.sra.ebi.ac.uk:/', 1)
+            if download_information[i]['ftp'] is not None:
+                for j in range(0, len(download_information[i]['ftp'])):
+                    if '#' in download_information[i]['ftp'][j]:
+                        download_information[i]['ftp'][j] = download_information[i]['ftp'][j].replace('#', '%23')
+    return download_information
 
 
 def get_fastq_files(download_dir, cram_index_run_successfully, threads, download_paired_type):
     run_successfully = False
-    downloaded_files = findFiles(download_dir, '', '')
+    downloaded_files = find_files(download_dir, '', '')
 
     if cram_index_run_successfully:
         cram_file = None
         for i in downloaded_files:
             if i.endswith('.cram'):
                 cram_file = i
-        run_successfully, downloaded_files = bamCram_2_fastq(cram_file, download_dir, threads, download_paired_type)
+        run_successfully, downloaded_files = bam_cram_2_fastq(cram_file, download_dir, threads, download_paired_type)
     else:
-        if len(downloaded_files) > 0:
+        if downloaded_files is not None and len(downloaded_files) > 0:
            run_successfully = True
 
     return run_successfully, downloaded_files
@@ -373,7 +467,7 @@
             if not temp_name.endswith(('_R1_001.f', '_R2_001.f')):
                 list_new_files[i] = os.path.join(outdir, new_name + '.fq.gz')
                 if temp_name.endswith(('_1.f', '_2.f')):
-                    print 'WARNING: possible single-end file conflict with pair-end (' + list_files[i] + ')!'
+                    print('WARNING: possible single-end file conflict with pair-end (' + list_files[i] + ')!')
 
     if len(list_new_files) == 2 and download_paired_type.lower() == 'paired':
         run_successfully = True
@@ -388,9 +482,9 @@
                     os.remove(list_files[i])
                 else:
                     os.rename(list_files[i], list_new_files[i])
-            list_new_files = list_new_files.values()
+            list_new_files = list(list_new_files.values())
         except Exception as e:
-            print e
+            print(e)
             run_successfully = False
 
     if not run_successfully:
@@ -399,34 +493,107 @@
 
     return run_successfully, list_new_files
 
 
+# @utils.trace_unhandled_exceptions
+def rename_header_sra(fastq):
+    run_successfully = False
+    try:
+        command = ['gawk', '\'{if(NR%4==1) $0=gensub(/\./, \"/\", 2); print}\'', fastq, '|', 'gzip', '-1', '>',
+                   str(fastq + '.gz')]
+        print('Running: ' + str(' '.join(command)))
+        return_code = subprocess.call(' '.join(command), shell=True)
+        if return_code == 0:
+            run_successfully = True
+        else:
+            print('Something went wrong with command: {command}'.format(command=' '.join(command)))
+    except Exception as e:
+        print(e)
+
+    return run_successfully
+
+
+def sra_2_fastq(download_dir, ena_id):
+    command = ['fastq-dump', '-I', '-O', download_dir, '--split-files', '{download_dir}{ena_id}.sra'.format(
+        download_dir=download_dir, ena_id=ena_id)]
+    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, 3600, True)
+    if run_successfully:
+        files = [os.path.join(download_dir, f) for f in os.listdir(download_dir)
+                 if not f.startswith('.') and os.path.isfile(os.path.join(download_dir, f)) and f.endswith('.fastq')]
+
+        pool = multiprocessing.Pool(processes=2)
+        results = []
+        p = pool.map_async(rename_header_sra, files, callback=results.extend)
+        p.wait()
+
+        run_successfully = all(results)
+
+    return run_successfully
+
 
 download_timer = functools.partial(utils.timer, name='Download module')
 
 
 @download_timer
-def runDownload(ena_id, download_paired_type, asperaKey, outdir, download_cram_bam_True, threads, instrument_platform):
+def run_download(ena_id, download_paired_type, aspera_key, outdir, download_cram_bam_true, threads, instrument_platform,
+                 sra, sra_opt):
     download_dir = os.path.join(outdir, 'download', '')
-    utils.removeDirectory(download_dir)
+    utils.remove_directory(download_dir)
     os.mkdir(download_dir)
 
     run_successfully = False
     downloaded_files = None
-    sequencingInformation = {'run_accession': None, 'instrument_platform': None, 'instrument_model': None, 'library_layout': None, 'library_source': None, 'extra_run_accession': None, 'nominal_length': None, 'read_count': None, 'base_count': None, 'date_download': None}
+    sequencing_information = {'run_accession': None, 'instrument_platform': None, 'instrument_model': None,
+                              'library_layout': None, 'library_source': None, 'extra_run_accession': None,
+                              'nominal_length': None, 'read_count': None, 'base_count': None,
+                              'date_download': time.strftime("%Y-%m-%d")}
 
-    readRunInfo = getReadRunInfo(ena_id)
-    if readRunInfo is not None:
-        downloadInformation = getDownloadInformation(readRunInfo)
-        downloadInformation = check_correct_links(downloadInformation)
-        sequencingInformation = getSequencingInformation(readRunInfo)
-        sequencingInformation['date_download'] = time.strftime("%Y-%m-%d")
+    read_run_info = get_read_run_info(ena_id)
+    if read_run_info is not None:
+        download_information = get_download_information(read_run_info)
+        download_information = check_correct_links(download_information)
+        sequencing_information = get_sequencing_information(read_run_info)
 
-        if instrument_platform.lower() == 'all' or (sequencingInformation['instrument_platform'] is not None and sequencingInformation['instrument_platform'].lower() == instrument_platform.lower()):
-            if download_paired_type.lower() == 'both' or (sequencingInformation['library_layout'] is not None and sequencingInformation['library_layout'].lower() == download_paired_type.lower()):
-                run_successfully, cram_index_run_successfully = downloadFiles(downloadInformation, asperaKey, download_dir, download_cram_bam_True)
+        if instrument_platform.lower() == 'all' or \
+                (sequencing_information['instrument_platform'] is not None and
+                 sequencing_information['instrument_platform'].lower() == instrument_platform.lower()):
+            if download_paired_type.lower() == 'both' or \
+                    (sequencing_information['library_layout'] is not None and
+                     sequencing_information['library_layout'].lower() == download_paired_type.lower()):
+                run_successfully, cram_index_run_successfully, download_sra = download_files(download_information,
+                                                                                             aspera_key, download_dir,
+                                                                                             download_cram_bam_true,
+                                                                                             sra, sra_opt, ena_id)
+                if download_sra:
+                    run_successfully = sra_2_fastq(download_dir, ena_id)
                 if run_successfully:
-                    run_successfully, downloaded_files = get_fastq_files(download_dir, cram_index_run_successfully, threads, sequencingInformation['library_layout'])
+                    run_successfully, downloaded_files = get_fastq_files(download_dir, cram_index_run_successfully,
+                                                                         threads,
+                                                                         sequencing_information['library_layout'])
                 if run_successfully and downloaded_files is not None:
-                    run_successfully, downloaded_files = rename_move_files(downloaded_files, sequencingInformation['run_accession'], outdir, sequencingInformation['library_layout'])
+                    run_successfully, downloaded_files = rename_move_files(downloaded_files,
+                                                                           sequencing_information['run_accession'],
+                                                                           outdir,
+                                                                           sequencing_information['library_layout'])
+    else:
+        if sra or sra_opt:
+            run_successfully, cram_index_run_successfully, download_sra = download_files({'fastq': None,
+                                                                                          'submitted': None,
+                                                                                          'cram_index': None},
+                                                                                         aspera_key, download_dir,
+                                                                                         download_cram_bam_true, sra,
+                                                                                         sra_opt, ena_id)
+            if download_sra:
+                run_successfully = sra_2_fastq(download_dir, ena_id)
+            if run_successfully:
+                run_successfully, downloaded_files = get_fastq_files(download_dir, cram_index_run_successfully, threads,
+                                                                     'paired')
+                if not run_successfully:
+                    run_successfully, downloaded_files = get_fastq_files(download_dir, cram_index_run_successfully,
+                                                                         threads, 'single')
+                if run_successfully and downloaded_files is not None:
+                    run_successfully, downloaded_files = rename_move_files(downloaded_files, ena_id, outdir, 'paired')
+                    if not run_successfully:
+                        run_successfully, downloaded_files = rename_move_files(downloaded_files, ena_id, outdir,
+                                                                               'single')
 
-    utils.removeDirectory(download_dir)
+    utils.remove_directory(download_dir)
 
-    return run_successfully, downloaded_files, sequencingInformation
+    return run_successfully, downloaded_files, sequencing_information
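
The rewritten get_read_run_info() queries ENA's filereport service and treats a header-only response as "no runs found". That query can be exercised on its own; a minimal sketch, assuming network access and that the legacy http://www.ebi.ac.uk/ena/data/warehouse/filereport endpoint used by this module still answers (ENA has since moved to https://www.ebi.ac.uk/ena/portal/api/filereport, and SRR1234567 below is only a placeholder accession):

import urllib.request

def fetch_read_run_report(ena_id):
    # Same query the module issues: a tab-separated report with one header
    # line plus one line per run for the given accession.
    url = ('http://www.ebi.ac.uk/ena/data/warehouse/filereport?accession=' +
           ena_id + '&result=read_run')
    with urllib.request.urlopen(url) as response:
        lines = response.read().decode('utf8').splitlines()
    # A header-only answer means the accession has no runs.
    return lines if len(lines) > 1 else None

report = fetch_read_run_report('SRR1234567')  # placeholder accession
if report is not None:
    print(dict(zip(report[0].split('\t'), report[1].split('\t'))))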
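The download workers (download_with_aspera, download_with_curl, download_with_wget, gzip_files) never return values through the multiprocessing pool; each writes a boolean flag to a .pkl file via utils.save_variable_to_pickle, and get_pickle_run_successfully later ANDs the flags together and deletes the files. A self-contained sketch of that pattern, with save_flag and collect_flags standing in for the utils helpers (the names are illustrative, not the module's):

import glob
import multiprocessing
import os
import pickle

def save_flag(value, outdir, name):
    # Counterpart of utils.save_variable_to_pickle: one .pkl file per task.
    with open(os.path.join(outdir, name + '.pkl'), 'wb') as handle:
        pickle.dump(value, handle)

def worker(task_id, outdir):
    success = task_id % 2 == 0  # pretend the odd-numbered tasks fail
    save_flag(success, outdir, 'download.' + str(task_id))

def collect_flags(outdir, prefix):
    # Counterpart of get_pickle_run_successfully: AND all flags, then
    # remove the pickle files so the next stage starts clean.
    run_successfully = True
    for path in glob.glob(os.path.join(outdir, prefix + '.*.pkl')):
        with open(path, 'rb') as handle:
            run_successfully = run_successfully and pickle.load(handle)
        os.remove(path)
    return run_successfully

if __name__ == '__main__':
    outdir = '.'
    pool = multiprocessing.Pool(processes=2)
    for i in range(4):
        pool.apply_async(worker, args=(i, outdir))
    pool.close()
    pool.join()
    print(collect_flags(outdir, 'download'))  # False: tasks 1 and 3 "failed"

Writing results to disk keeps the workers decoupled from the parent at the cost of temporary files; the module pays that cost so the same collection logic serves every downloader and the compression step alike.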
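rename_header_sra() shells out to gawk so that headers produced by fastq-dump -I (e.g. @SRR1234567.1.1) regain the @SRR1234567.1/1 mate notation: gensub(/\./, "/", 2) rewrites the second dot of every fourth line starting at line one. A rough pure-Python equivalent of that one-liner, assuming the same header layout (it gzips at the default level rather than the shell pipeline's gzip -1):

import gzip

def rename_sra_headers(in_fastq, out_fastq_gz):
    # Mirror of the gawk one-liner: on every 4th line starting at line 1,
    # replace the second '.' with '/', then write the result gzipped.
    with open(in_fastq) as fin, gzip.open(out_fastq_gz, 'wt') as fout:
        for line_number, line in enumerate(fin, start=1):
            if line_number % 4 == 1:
                first = line.find('.')
                second = line.find('.', first + 1)
                if second != -1:
                    line = line[:second] + '/' + line[second + 1:]
            fout.write(line)

# e.g. rename_sra_headers('SRR1234567_1.fastq', 'SRR1234567_1.fastq.gz')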