repository: cstrittmatter/test_eurl_vtec_wgs_pt
diff scripts/ReMatCh/modules/download.py @ 3:0cbed1c0a762 draft default tip
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8

author   | cstrittmatter
date     | Tue, 28 Jan 2020 10:42:31 -0500
parents  | 965517909457
children |
--- a/scripts/ReMatCh/modules/download.py	Wed Jan 22 09:10:12 2020 -0500
+++ b/scripts/ReMatCh/modules/download.py	Tue Jan 28 10:42:31 2020 -0500
@@ -1,112 +1,178 @@
-import utils
 import os.path
 import multiprocessing
 import sys
 import functools
 import time
+import subprocess
+
+try:
+    import modules.utils as utils
+except ImportError:
+    from ReMatCh.modules import utils as utils
 
 
-def getReadRunInfo(ena_id):
-    import urllib
+def get_read_run_info(ena_id):
+    import urllib.request
 
     url = 'http://www.ebi.ac.uk/ena/data/warehouse/filereport?accession=' + ena_id + '&result=read_run'
 
-    readRunInfo = None
+    read_run_info = None
     try:
-        url = urllib.urlopen(url)
-        readRunInfo = url.read().splitlines()
-        if len(readRunInfo) <= 1:
-            readRunInfo = None
+        url = urllib.request.urlopen(url)
+        read_run_info = url.read().decode("utf8").splitlines()
+        if len(read_run_info) <= 1:
+            read_run_info = None
    except Exception as error:
-        print error
+        print(error)
 
-    return readRunInfo
+    return read_run_info
 
 
-def getDownloadInformation(readRunInfo):
-    header_line = readRunInfo[0].split('\t')
-    info_line = readRunInfo[1].split('\t')
+def get_download_information(read_run_info):
+    header_line = read_run_info[0].split('\t')
+    info_line = read_run_info[1].split('\t')
 
-    downloadInformation = {'fastq': None, 'submitted': None, 'cram_index': None}
+    download_information = {'fastq': None, 'submitted': None, 'cram_index': None}
     download_types = ['aspera', 'ftp']
 
     for i in range(0, len(header_line)):
         header = header_line[i].lower().rsplit('_', 1)
-        if header[0] in downloadInformation.keys():
+        if header[0] in list(download_information.keys()):
            if header[1] in download_types:
                 if len(info_line[i]) > 0:
                     files_path = info_line[i].split(';')
                     if len(files_path) > 2:
-                        print 'WARNING: Were found more files than expected in ' + header[1] + ' download links!'
-                    if downloadInformation[header[0]] is None:
-                        downloadInformation[header[0]] = {}
-                    downloadInformation[header[0]][header[1]] = files_path
+                        print('WARNING: Were found more files than expected in'
+                              ' {download_information}-{download_types} download'
+                              ' links!'.format(download_information=header[0], download_types=header[1]))
+                    if download_information[header[0]] is None:
+                        download_information[header[0]] = {}
+                    download_information[header[0]][header[1]] = files_path
 
-    return downloadInformation
+    return download_information
 
 
-def getSequencingInformation(readRunInfo):
-    header_line = readRunInfo[0].split('\t')
-    info_line = readRunInfo[1].split('\t')
+def get_sequencing_information(read_run_info):
+    header_line = read_run_info[0].split('\t')
+    info_line = read_run_info[1].split('\t')
 
-    sequencingInformation = {'run_accession': None, 'instrument_platform': None, 'instrument_model': None, 'library_layout': None, 'library_source': None, 'extra_run_accession': None, 'nominal_length': None, 'read_count': None, 'base_count': None}
+    sequencing_information = {'run_accession': None, 'instrument_platform': None, 'instrument_model': None,
+                              'library_layout': None, 'library_source': None, 'extra_run_accession': None,
+                              'nominal_length': None, 'read_count': None, 'base_count': None,
+                              'date_download': time.strftime("%Y-%m-%d")}
 
     for i in range(0, len(header_line)):
         header = header_line[i].lower()
-        if header in sequencingInformation.keys():
+        if header in list(sequencing_information.keys()):
             if len(info_line[i]) > 0:
-                sequencingInformation[header] = info_line[i]
+                sequencing_information[header] = info_line[i]
 
-    if len(readRunInfo) > 2:
+    if len(read_run_info) > 2:
         extra_run_accession = []
-        for i in range(2, len(readRunInfo)):
-            info = readRunInfo[i].split('\t')
+        for i in range(2, len(read_run_info)):
+            info = read_run_info[i].split('\t')
             for j in range(0, len(header_line)):
                 header = header_line[j].lower()
                 if header == 'run_accession':
                     if len(info[j]) > 0:
                         extra_run_accession.append(info[j])
         if len(extra_run_accession) >= 1:
-            sequencingInformation['extra_run_accession'] = ','.join(extra_run_accession)
+            sequencing_information['extra_run_accession'] = ','.join(extra_run_accession)
 
-    return sequencingInformation
+    return sequencing_information
 
 
 @utils.trace_unhandled_exceptions
-def downloadWithAspera(aspera_file_path, asperaKey, outdir, pickle_prefix):
-    command = ['ascp', '-QT', '-l', '300m', '-P33001', '-i', asperaKey, str('era-fasp@' + aspera_file_path), outdir]
-    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, 3600, True)
+def download_with_aspera(aspera_file_path, aspera_key, outdir, pickle_prefix, sra, ena_id):
+    command = ['ascp', '-QT', '-l', '300m', '', '-i', aspera_key, '', outdir]
+    if not sra:
+        command[4] = '-P33001'
+        command[7] = str('era-fasp@' + aspera_file_path)
+        pickle = pickle_prefix + '.' + aspera_file_path.rsplit('/', 1)[1]
+    else:
+        command[7] = 'anonftp@ftp.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/{a}/{b}/{c}/{c}.sra'.format(
+            a=ena_id[:3], b=ena_id[:6], c=ena_id)
+        pickle = pickle_prefix + '.' + ena_id
+
+    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, 3600, True)
+
+    utils.save_variable_to_pickle(run_successfully, outdir, pickle)
+
 
-    utils.saveVariableToPickle(run_successfully, outdir, str(pickle_prefix + '.' + aspera_file_path.rsplit('/', 1)[1]))
+@utils.trace_unhandled_exceptions
+def download_with_wget(ftp_file_path, outdir, pickle_prefix, sra, ena_id):
+    command = ['wget', '--tries=1', '', '-O', '']
+    if not sra:
+        command[2] = ftp_file_path
+        file_download = ftp_file_path.rsplit('/', 1)[1]
+        command[4] = os.path.join(outdir, file_download)
+        pickle = pickle_prefix + '.' + file_download
+    else:
+        command[2] = 'ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/{a}/{b}/{c}/{c}.sra'.format(
+            a=ena_id[:3], b=ena_id[:6], c=ena_id)
+        command[4] = os.path.join(outdir, ena_id + '.sra')
+        pickle = pickle_prefix + '.' + ena_id
+    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, 3600, True)
+
+    utils.save_variable_to_pickle(run_successfully, outdir, pickle)
 
 
 @utils.trace_unhandled_exceptions
-def downloadWithWget(ftp_file_path, outdir, pickle_prefix):
-    file_download = ftp_file_path.rsplit('/', 1)[1]
-    command = ['wget', '--tries=2', ftp_file_path, '-O', os.path.join(outdir, file_download)]
-    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, 3600, True)
+def download_with_sra_prefetch(aspera_key, outdir, pickle_prefix, ena_id):
+    command = ['prefetch', '', ena_id]
+
+    if aspera_key is not None:
+        _, ascp, _ = utils.run_command_popen_communicate(['which', 'ascp'], False, None, False)
+        command[1] = '-a {ascp}|{aspera_key}'.format(ascp=ascp.splitlines()[0], aspera_key=aspera_key)
+
+    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, 3600, True)
+    if run_successfully:
+        _, prefetch_outdir, _ = utils.run_command_popen_communicate(['echo', '$HOME/ncbi/public/sra'], True, None,
+                                                                    False)
 
-    utils.saveVariableToPickle(run_successfully, outdir, str(pickle_prefix + '.' + file_download))
+        try:
+            os.rename(os.path.join(prefetch_outdir.splitlines()[0], ena_id + '.sra'),
+                      os.path.join(outdir, ena_id + '.sra'))
+        except OSError as e:
+            print('Found the following error:'
+                  '{}'.format(e))
+
+            from shutil import copy as shutil_copy
+
+            shutil_copy(os.path.join(prefetch_outdir.splitlines()[0], ena_id + '.sra'),
+                        os.path.join(outdir, ena_id + '.sra'))
+            os.remove(os.path.join(prefetch_outdir.splitlines()[0], ena_id + '.sra'))
+
+    utils.save_variable_to_pickle(run_successfully, outdir, pickle_prefix + '.' + ena_id)
 
 
 @utils.trace_unhandled_exceptions
-def downloadWithCurl(ftp_file_path, outdir, pickle_prefix):
-    file_download = ftp_file_path.rsplit('/', 1)[1]
-    command = ['curl', '--retry', '2', ftp_file_path, '-O', os.path.join(outdir, file_download)]
-    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, 3600, True)
+def download_with_curl(ftp_file_path, outdir, pickle_prefix, sra, ena_id):
+    command = ['curl', '--retry', '1', '', '-o', '']
+    if not sra:
+        command[3] = ftp_file_path
+        file_download = ftp_file_path.rsplit('/', 1)[1]
+        command[5] = os.path.join(outdir, file_download)
+        pickle = pickle_prefix + '.' + file_download
+    else:
+        command[3] = 'ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/{a}/{b}/{c}/{c}.sra'.format(
+            a=ena_id[:3], b=ena_id[:6], c=ena_id)
+        command[5] = os.path.join(outdir, ena_id + '.sra')
+        pickle = pickle_prefix + '.' + ena_id
+    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, 3600, True)
 
-    utils.saveVariableToPickle(run_successfully, outdir, str(pickle_prefix + '.' + file_download))
+    utils.save_variable_to_pickle(run_successfully, outdir, pickle)
 
 
-def getPickleRunSuccessfully(directory, pickle_prefix):
+def get_pickle_run_successfully(directory, pickle_prefix):
     run_successfully = True
     read_pickle = False
 
-    files = findFiles(directory, pickle_prefix, '.pkl')
+    files = find_files(directory, pickle_prefix, '.pkl')
     if files is not None:
         for file_found in files:
             if run_successfully:
-                run_successfully = utils.extractVariableFromPickle(file_found)
+                run_successfully = utils.extract_variable_from_pickle(file_found)
                 read_pickle = True
 
             os.remove(file_found)
@@ -119,75 +185,103 @@
 
 def curl_installed():
     command = ['which', 'curl']
-    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, False)
+    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, False)
     return run_successfully
 
 
-def download(downloadInformation_type, asperaKey, outdir):
+def download(download_information_type, aspera_key, outdir, sra, sra_opt, ena_id):
     pickle_prefix = 'download'
     run_successfully = False
-
-    if asperaKey is not None and downloadInformation_type['aspera'] is not None:
-        pool = multiprocessing.Pool(processes=2)
-        for file_download in downloadInformation_type['aspera']:
-            pool.apply_async(downloadWithAspera, args=(file_download, asperaKey, outdir, pickle_prefix,))
-        pool.close()
-        pool.join()
+    download_sra = False
 
-        run_successfully = getPickleRunSuccessfully(outdir, pickle_prefix)
-
-    if downloadInformation_type['ftp'] is not None and not run_successfully:
-        if curl_installed():
+    if not sra:
+        if aspera_key is not None and download_information_type['aspera'] is not None:
             pool = multiprocessing.Pool(processes=2)
-            for file_download in downloadInformation_type['ftp']:
-                pool.apply_async(downloadWithCurl, args=(file_download, outdir, pickle_prefix,))
+            for file_download in download_information_type['aspera']:
+                pool.apply_async(download_with_aspera, args=(file_download, aspera_key, outdir, pickle_prefix, sra,
+                                                             ena_id,))
             pool.close()
             pool.join()
-            run_successfully = getPickleRunSuccessfully(outdir, pickle_prefix)
+            run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
+        if not run_successfully and download_information_type['ftp'] is not None:
+            if curl_installed():
+                pool = multiprocessing.Pool(processes=2)
+                for file_download in download_information_type['ftp']:
+                    pool.apply_async(download_with_curl, args=(file_download, outdir, pickle_prefix, sra, ena_id,))
+                pool.close()
+                pool.join()
+                run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
+            if not run_successfully:
+                pool = multiprocessing.Pool(processes=2)
+                for file_download in download_information_type['ftp']:
+                    pool.apply_async(download_with_wget, args=(file_download, outdir, pickle_prefix, sra, ena_id,))
+                pool.close()
+                pool.join()
+                run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
+
+    if not run_successfully and (sra or sra_opt):
+        if aspera_key is not None:
+            download_with_aspera(None, aspera_key, outdir, pickle_prefix, sra or sra_opt, ena_id)
+            run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
         if not run_successfully:
-            pool = multiprocessing.Pool(processes=2)
-            for file_download in downloadInformation_type['ftp']:
-                pool.apply_async(downloadWithWget, args=(file_download, outdir, pickle_prefix,))
-            pool.close()
-            pool.join()
-            run_successfully = getPickleRunSuccessfully(outdir, pickle_prefix)
+            download_with_sra_prefetch(aspera_key, outdir, pickle_prefix, ena_id)
+            run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
+        if not run_successfully:
+            if curl_installed():
+                download_with_curl(None, outdir, pickle_prefix, sra or sra_opt, ena_id)
+                run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
+            if not run_successfully:
+                download_with_wget(None, outdir, pickle_prefix, sra or sra_opt, ena_id)
+                run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
 
-    return run_successfully
+    if run_successfully:
+        download_sra = True
+
+    return run_successfully, download_sra
 
 
-def downloadFiles(downloadInformation, asperaKey, outdir, download_cram_bam_True):
+def download_files(download_information, aspera_key, outdir, download_cram_bam_true, sra, sra_opt, ena_id):
     run_successfully = False
     cram_index_run_successfully = False
+    download_sra = False
 
-    if downloadInformation['fastq'] is not None:
-        run_successfully = download(downloadInformation['fastq'], asperaKey, outdir)
+    if download_information['fastq'] is not None:
+        run_successfully, download_sra = download(download_information['fastq'], aspera_key, outdir, sra, sra_opt,
+                                                  ena_id)
 
     if not run_successfully:
-        if downloadInformation['submitted'] is not None:
-            if not download_cram_bam_True:
+        if download_information['submitted'] is not None:
+            if not download_cram_bam_true:
                 cram_bam = False
-                for i in downloadInformation['submitted']:
-                    if downloadInformation['submitted'][i][0].endswith(('.cram', '.bam')):
+                for i in download_information['submitted']:
+                    if download_information['submitted'][i][0].endswith(('.cram', '.bam')):
                         cram_bam = True
                         break
                 if not cram_bam:
-                    run_successfully = download(downloadInformation['submitted'], asperaKey, outdir)
+                    run_successfully, download_sra = download(download_information['submitted'], aspera_key, outdir,
+                                                              False, False, ena_id)
 
-            elif download_cram_bam_True:
-                run_successfully = download(downloadInformation['submitted'], asperaKey, outdir)
-                if run_successfully and downloadInformation['cram_index'] is not None:
-                    cram_index_run_successfully = download(downloadInformation['cram_index'], asperaKey, outdir)
+            elif download_cram_bam_true:
+                run_successfully, download_sra = download(download_information['submitted'], aspera_key, outdir, False,
+                                                          False, ena_id)
+                if run_successfully and download_information['cram_index'] is not None:
+                    cram_index_run_successfully = download(download_information['cram_index'], aspera_key, outdir,
+                                                           False, False, ena_id)
 
-    return run_successfully, cram_index_run_successfully
+    if not run_successfully and (sra or sra_opt):
+        run_successfully, download_sra = download(download_information['fastq'], aspera_key, outdir, True, sra_opt,
+                                                  ena_id)
+
+    return run_successfully, cram_index_run_successfully, download_sra
 
 
-def sortAlignment(alignment_file, output_file, sortByName_True, threads):
-    outFormat_string = os.path.splitext(output_file)[1][1:].lower()
-    command = ['samtools', 'sort', '-o', output_file, '-O', outFormat_string, '', '-@', str(threads), alignment_file]
-    if sortByName_True:
+def sort_alignment(alignment_file, output_file, sort_by_name_true, threads):
+    out_format_string = os.path.splitext(output_file)[1][1:].lower()
+    command = ['samtools', 'sort', '-o', output_file, '-O', out_format_string, '', '-@', str(threads), alignment_file]
+    if sort_by_name_true:
         command[6] = '-n'
-    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
+    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, True)
     if not run_successfully:
         output_file = None
@@ -195,34 +289,33 @@
     return run_successfully, output_file
 
 
-def alignmentToFastq(alignment_file, outdir, threads, pair_end_type):
+def alignment_to_fastq(alignment_file, threads, pair_end_type):
     fastq_basename = os.path.splitext(alignment_file)[0]
     outfiles = None
-    bamFile = fastq_basename + '.temp.bam'
+    bam_file = fastq_basename + '.temp.bam'
     # sort cram
-    run_successfully, bamFile = sortAlignment(alignment_file, bamFile, True, threads)
+    run_successfully, bam_file = sort_alignment(alignment_file, bam_file, True, threads)
     if run_successfully:
-        command = ['samtools', 'fastq', '', bamFile]
+        command = ['samtools', 'fastq', '', bam_file]
         if pair_end_type.lower() == 'paired':
             command[2] = '-1 ' + str(fastq_basename + '_1.fq') + ' -2 ' + str(fastq_basename + '_2.fq')
         elif pair_end_type == 'single':
             command[2] = '-0 ' + str(fastq_basename + '.fq')
 
-        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
+        run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, None, True)
         if run_successfully:
             if pair_end_type.lower() == 'paired':
                 outfiles = [str(fastq_basename + '_1.fq'), str(fastq_basename + '_2.fq')]
             elif pair_end_type.lower() == 'single':
                 outfiles = [str(fastq_basename + '.fq')]
 
-    if os.path.isfile(bamFile):
-        os.remove(bamFile)
+    if bam_file is not None and os.path.isfile(bam_file):
+        os.remove(bam_file)
 
     return run_successfully, outfiles
 
 
-def formartFastqHeaders(in_fastq_1, in_fastq_2):
-    import itertools
+def formart_fastq_headers(in_fastq_1, in_fastq_2):
 
     out_fastq_1 = in_fastq_1 + '.temp'
     out_fastq_2 = in_fastq_2 + '.temp'
@@ -233,7 +326,7 @@
             plus_line = True
             quality_line = True
             number_reads = 0
-            for in_1, in_2 in itertools.izip(reader_in_fastq_1, reader_in_fastq_2):
+            for in_1, in_2 in zip(reader_in_fastq_1, reader_in_fastq_2):
                 if len(in_1) > 0:
                     in_1 = in_1.splitlines()[0]
                     in_2 = in_2.splitlines()[0]
@@ -269,22 +362,22 @@
 
 
 @utils.trace_unhandled_exceptions
-def gzipFiles(file_2_compress, pickle_prefix, outdir):
-    out_file = None
+def gzip_files(file_2_compress, pickle_prefix, outdir):
     if file_2_compress.endswith('.temp'):
         out_file = os.path.splitext(file_2_compress)[0]
     else:
         out_file = file_2_compress
 
     command = ['gzip', '--stdout', '--best', file_2_compress, '>', str(out_file + '.gz')]
-    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, True, None, True)
+    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, True, None, True)
     if run_successfully:
         os.remove(file_2_compress)
 
-    utils.saveVariableToPickle(run_successfully, outdir, str(pickle_prefix + '.' + os.path.basename(file_2_compress)))
+    utils.save_variable_to_pickle(run_successfully, outdir,
+                                  str(pickle_prefix + '.' + os.path.basename(file_2_compress)))
 
 
-def findFiles(directory, prefix, suffix):
+def find_files(directory, prefix, suffix):
     list_files_found = []
     files = [f for f in os.listdir(directory) if not f.startswith('.') and os.path.isfile(os.path.join(directory, f))]
     for file_found in files:
@@ -298,59 +391,60 @@
     return list_files_found
 
 
-def compressFiles(fastq_files, outdir, threads):
+def compress_files(fastq_files, outdir, threads):
     pickle_prefix = 'compress'
     compressed_fastq_files = None
 
     pool = multiprocessing.Pool(processes=threads)
     for fastq in fastq_files:
-        pool.apply_async(gzipFiles, args=(fastq, pickle_prefix, outdir,))
+        pool.apply_async(gzip_files, args=(fastq, pickle_prefix, outdir,))
     pool.close()
     pool.join()
 
-    run_successfully = getPickleRunSuccessfully(outdir, pickle_prefix)
+    run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
 
     if run_successfully:
-        compressed_fastq_files = findFiles(outdir, '', '.gz')
+        compressed_fastq_files = find_files(outdir, '', '.gz')
 
     return run_successfully, compressed_fastq_files
 
 
-def bamCram_2_fastq(alignment_file, outdir, threads, pair_end_type):
-    run_successfully, fastq_files = alignmentToFastq(alignment_file, outdir, threads, pair_end_type)
+def bam_cram_2_fastq(alignment_file, outdir, threads, pair_end_type):
+    run_successfully, fastq_files = alignment_to_fastq(alignment_file, threads, pair_end_type)
     if run_successfully:
         if pair_end_type.lower() == 'paired':
-            number_reads, fastq_files = formartFastqHeaders(fastq_files[0], fastq_files[1])
+            number_reads, fastq_files = formart_fastq_headers(fastq_files[0], fastq_files[1])
 
-        run_successfully, fastq_files = compressFiles(fastq_files, outdir, threads)
+        run_successfully, fastq_files = compress_files(fastq_files, outdir, threads)
 
     return run_successfully, fastq_files
 
 
-def check_correct_links(downloadInformation):
-    for i in downloadInformation:
-        if downloadInformation[i] is not None:
-            if downloadInformation[i]['aspera'] is not None:
-                for j in range(0, len(downloadInformation[i]['aspera'])):
-                    if downloadInformation[i]['aspera'][j].startswith('fasp.sra.ebi.ac.uk/'):
-                        downloadInformation[i]['aspera'][j] = downloadInformation[i]['aspera'][j].replace('fasp.sra.ebi.ac.uk/', 'fasp.sra.ebi.ac.uk:/', 1)
-            if downloadInformation[i]['ftp'] is not None:
-                for j in range(0, len(downloadInformation[i]['ftp'])):
-                    if '#' in downloadInformation[i]['ftp'][j]:
-                        downloadInformation[i]['ftp'][j] = downloadInformation[i]['ftp'][j].replace('#', '%23')
-    return downloadInformation
+def check_correct_links(download_information):
+    for i in download_information:
+        if download_information[i] is not None:
+            if download_information[i]['aspera'] is not None:
+                for j in range(0, len(download_information[i]['aspera'])):
+                    if download_information[i]['aspera'][j].startswith('fasp.sra.ebi.ac.uk/'):
+                        download_information[i]['aspera'][j] = download_information[i]['aspera'][j].replace(
+                            'fasp.sra.ebi.ac.uk/', 'fasp.sra.ebi.ac.uk:/', 1)
+            if download_information[i]['ftp'] is not None:
+                for j in range(0, len(download_information[i]['ftp'])):
+                    if '#' in download_information[i]['ftp'][j]:
+                        download_information[i]['ftp'][j] = download_information[i]['ftp'][j].replace('#', '%23')
+    return download_information
 
 
 def get_fastq_files(download_dir, cram_index_run_successfully, threads, download_paired_type):
     run_successfully = False
-    downloaded_files = findFiles(download_dir, '', '')
+    downloaded_files = find_files(download_dir, '', '')
 
     if cram_index_run_successfully:
         cram_file = None
         for i in downloaded_files:
             if i.endswith('.cram'):
                 cram_file = i
-        run_successfully, downloaded_files = bamCram_2_fastq(cram_file, download_dir, threads, download_paired_type)
+        run_successfully, downloaded_files = bam_cram_2_fastq(cram_file, download_dir, threads, download_paired_type)
     else:
-        if len(downloaded_files) > 0:
+        if downloaded_files is not None and len(downloaded_files) > 0:
            run_successfully = True
 
     return run_successfully, downloaded_files
@@ -373,7 +467,7 @@
             if not temp_name.endswith(('_R1_001.f', '_R2_001.f')):
                 list_new_files[i] = os.path.join(outdir, new_name + '.fq.gz')
                 if temp_name.endswith(('_1.f', '_2.f')):
-                    print 'WARNING: possible single-end file conflict with pair-end (' + list_files[i] + ')!'
+                    print('WARNING: possible single-end file conflict with pair-end (' + list_files[i] + ')!')
 
     if len(list_new_files) == 2 and download_paired_type.lower() == 'paired':
         run_successfully = True
@@ -388,9 +482,9 @@
                     os.remove(list_files[i])
                 else:
                     os.rename(list_files[i], list_new_files[i])
-            list_new_files = list_new_files.values()
+            list_new_files = list(list_new_files.values())
         except Exception as e:
-            print e
+            print(e)
             run_successfully = False
 
     if not run_successfully:
@@ -399,34 +493,107 @@
 
     return run_successfully, list_new_files
 
 
+# @utils.trace_unhandled_exceptions
+def rename_header_sra(fastq):
+    run_successfully = False
+    try:
+        command = ['gawk', '\'{if(NR%4==1) $0=gensub(/\./, \"/\", 2); print}\'', fastq, '|', 'gzip', '-1', '>',
+                   str(fastq + '.gz')]
+        print('Running: ' + str(' '.join(command)))
+        return_code = subprocess.call(' '.join(command), shell=True)
+        if return_code == 0:
+            run_successfully = True
+        else:
+            print('Something went wrong with command: {command}'.format(command=' '.join(command)))
+    except Exception as e:
+        print(e)
+
+    return run_successfully
+
+
+def sra_2_fastq(download_dir, ena_id):
+    command = ['fastq-dump', '-I', '-O', download_dir, '--split-files', '{download_dir}{ena_id}.sra'.format(
+        download_dir=download_dir, ena_id=ena_id)]
+    run_successfully, stdout, stderr = utils.run_command_popen_communicate(command, False, 3600, True)
+    if run_successfully:
+        files = [os.path.join(download_dir, f) for f in os.listdir(download_dir)
+                 if not f.startswith('.') and os.path.isfile(os.path.join(download_dir, f)) and f.endswith('.fastq')]
+
+        pool = multiprocessing.Pool(processes=2)
+        results = []
+        p = pool.map_async(rename_header_sra, files, callback=results.extend)
+        p.wait()
+
+        run_successfully = all(results)
+
+    return run_successfully
+
 
 download_timer = functools.partial(utils.timer, name='Download module')
 
 
 @download_timer
-def runDownload(ena_id, download_paired_type, asperaKey, outdir, download_cram_bam_True, threads, instrument_platform):
+def run_download(ena_id, download_paired_type, aspera_key, outdir, download_cram_bam_true, threads, instrument_platform,
+                 sra, sra_opt):
     download_dir = os.path.join(outdir, 'download', '')
-    utils.removeDirectory(download_dir)
+    utils.remove_directory(download_dir)
     os.mkdir(download_dir)
 
     run_successfully = False
     downloaded_files = None
-    sequencingInformation = {'run_accession': None, 'instrument_platform': None, 'instrument_model': None, 'library_layout': None, 'library_source': None, 'extra_run_accession': None, 'nominal_length': None, 'read_count': None, 'base_count': None, 'date_download': None}
+    sequencing_information = {'run_accession': None, 'instrument_platform': None, 'instrument_model': None,
+                              'library_layout': None, 'library_source': None, 'extra_run_accession': None,
+                              'nominal_length': None, 'read_count': None, 'base_count': None,
+                              'date_download': time.strftime("%Y-%m-%d")}
 
-    readRunInfo = getReadRunInfo(ena_id)
-    if readRunInfo is not None:
-        downloadInformation = getDownloadInformation(readRunInfo)
-        downloadInformation = check_correct_links(downloadInformation)
-        sequencingInformation = getSequencingInformation(readRunInfo)
-        sequencingInformation['date_download'] = time.strftime("%Y-%m-%d")
+    read_run_info = get_read_run_info(ena_id)
+    if read_run_info is not None:
+        download_information = get_download_information(read_run_info)
+        download_information = check_correct_links(download_information)
+        sequencing_information = get_sequencing_information(read_run_info)
 
-        if instrument_platform.lower() == 'all' or (sequencingInformation['instrument_platform'] is not None and sequencingInformation['instrument_platform'].lower() == instrument_platform.lower()):
-            if download_paired_type.lower() == 'both' or (sequencingInformation['library_layout'] is not None and sequencingInformation['library_layout'].lower() == download_paired_type.lower()):
-                run_successfully, cram_index_run_successfully = downloadFiles(downloadInformation, asperaKey, download_dir, download_cram_bam_True)
+        if instrument_platform.lower() == 'all' or \
+                (sequencing_information['instrument_platform'] is not None and
+                 sequencing_information['instrument_platform'].lower() == instrument_platform.lower()):
+            if download_paired_type.lower() == 'both' or \
+                    (sequencing_information['library_layout'] is not None and
+                     sequencing_information['library_layout'].lower() == download_paired_type.lower()):
+                run_successfully, cram_index_run_successfully, download_sra = download_files(download_information,
+                                                                                             aspera_key, download_dir,
+                                                                                             download_cram_bam_true,
+                                                                                             sra, sra_opt, ena_id)
+                if download_sra:
+                    run_successfully = sra_2_fastq(download_dir, ena_id)
                 if run_successfully:
-                    run_successfully, downloaded_files = get_fastq_files(download_dir, cram_index_run_successfully, threads, sequencingInformation['library_layout'])
+                    run_successfully, downloaded_files = get_fastq_files(download_dir, cram_index_run_successfully,
+                                                                         threads,
+                                                                         sequencing_information['library_layout'])
                 if run_successfully and downloaded_files is not None:
-                    run_successfully, downloaded_files = rename_move_files(downloaded_files, sequencingInformation['run_accession'], outdir, sequencingInformation['library_layout'])
+                    run_successfully, downloaded_files = rename_move_files(downloaded_files,
+                                                                           sequencing_information['run_accession'],
+                                                                           outdir,
+                                                                           sequencing_information['library_layout'])
+    else:
+        if sra or sra_opt:
+            run_successfully, cram_index_run_successfully, download_sra = download_files({'fastq': None,
+                                                                                          'submitted': None,
+                                                                                          'cram_index': None},
+                                                                                         aspera_key, download_dir,
+                                                                                         download_cram_bam_true, sra,
+                                                                                         sra_opt, ena_id)
+            if download_sra:
+                run_successfully = sra_2_fastq(download_dir, ena_id)
+            if run_successfully:
+                run_successfully, downloaded_files = get_fastq_files(download_dir, cram_index_run_successfully, threads,
+                                                                     'paired')
+                if not run_successfully:
+                    run_successfully, downloaded_files = get_fastq_files(download_dir, cram_index_run_successfully,
+                                                                         threads, 'single')
+                if run_successfully and downloaded_files is not None:
+                    run_successfully, downloaded_files = rename_move_files(downloaded_files, ena_id, outdir, 'paired')
+                    if not run_successfully:
+                        run_successfully, downloaded_files = rename_move_files(downloaded_files, ena_id, outdir,
+                                                                               'single')
 
-    utils.removeDirectory(download_dir)
+    utils.remove_directory(download_dir)
 
-    return run_successfully, downloaded_files, sequencingInformation
+    return run_successfully, downloaded_files, sequencing_information
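
The rewritten get_read_run_info() queries ENA's filereport service and treats a header-only response as "no runs found". That query can be exercised on its own; a minimal sketch, assuming network access and that the legacy http://www.ebi.ac.uk/ena/data/warehouse/filereport endpoint used by this module still answers (ENA has since moved to https://www.ebi.ac.uk/ena/portal/api/filereport, and SRR1234567 below is only a placeholder accession):

import urllib.request

def fetch_read_run_report(ena_id):
    # Same query the module issues: a tab-separated report with one header
    # line plus one line per run for the given accession.
    url = ('http://www.ebi.ac.uk/ena/data/warehouse/filereport?accession=' +
           ena_id + '&result=read_run')
    with urllib.request.urlopen(url) as response:
        lines = response.read().decode('utf8').splitlines()
    # A header-only answer means the accession has no runs.
    return lines if len(lines) > 1 else None

report = fetch_read_run_report('SRR1234567')  # placeholder accession
if report is not None:
    print(dict(zip(report[0].split('\t'), report[1].split('\t'))))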
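The download workers (download_with_aspera, download_with_curl, download_with_wget, gzip_files) never return values through the multiprocessing pool; each writes a boolean flag to a .pkl file via utils.save_variable_to_pickle, and get_pickle_run_successfully later ANDs the flags together and deletes the files. A self-contained sketch of that pattern, with save_flag and collect_flags standing in for the utils helpers (the names are illustrative, not the module's):

import glob
import multiprocessing
import os
import pickle

def save_flag(value, outdir, name):
    # Counterpart of utils.save_variable_to_pickle: one .pkl file per task.
    with open(os.path.join(outdir, name + '.pkl'), 'wb') as handle:
        pickle.dump(value, handle)

def worker(task_id, outdir):
    success = task_id % 2 == 0  # pretend the odd-numbered tasks fail
    save_flag(success, outdir, 'download.' + str(task_id))

def collect_flags(outdir, prefix):
    # Counterpart of get_pickle_run_successfully: AND all flags, then
    # remove the pickle files so the next stage starts clean.
    run_successfully = True
    for path in glob.glob(os.path.join(outdir, prefix + '.*.pkl')):
        with open(path, 'rb') as handle:
            run_successfully = run_successfully and pickle.load(handle)
        os.remove(path)
    return run_successfully

if __name__ == '__main__':
    outdir = '.'
    pool = multiprocessing.Pool(processes=2)
    for i in range(4):
        pool.apply_async(worker, args=(i, outdir))
    pool.close()
    pool.join()
    print(collect_flags(outdir, 'download'))  # False: tasks 1 and 3 "failed"

Writing results to disk keeps the workers decoupled from the parent at the cost of temporary files; the module pays that cost so the same collection logic serves every downloader and the compression step alike.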
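rename_header_sra() shells out to gawk so that headers produced by fastq-dump -I (e.g. @SRR1234567.1.1) regain the @SRR1234567.1/1 mate notation: gensub(/\./, "/", 2) rewrites the second dot of every fourth line starting at line one. A rough pure-Python equivalent of that one-liner, assuming the same header layout (it gzips at the default level rather than the shell pipeline's gzip -1):

import gzip

def rename_sra_headers(in_fastq, out_fastq_gz):
    # Mirror of the gawk one-liner: on every 4th line starting at line 1,
    # replace the second '.' with '/', then write the result gzipped.
    with open(in_fastq) as fin, gzip.open(out_fastq_gz, 'wt') as fout:
        for line_number, line in enumerate(fin, start=1):
            if line_number % 4 == 1:
                first = line.find('.')
                second = line.find('.', first + 1)
                if second != -1:
                    line = line[:second] + '/' + line[second + 1:]
            fout.write(line)

# e.g. rename_sra_headers('SRR1234567_1.fastq', 'SRR1234567_1.fastq.gz')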