annotate scripts/ReMatCh/modules/download.py @ 3:0cbed1c0a762 draft default tip

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Tue, 28 Jan 2020 10:42:31 -0500
parents 965517909457
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
1 import os.path
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
2 import multiprocessing
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
3 import sys
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
4 import functools
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
5 import time
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
6 import subprocess
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
7
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
8 try:
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
9 import modules.utils as utils
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
10 except ImportError:
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
11 from ReMatCh.modules import utils as utils
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
12
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
13
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
def get_read_run_info(ena_id):
    """
    Fetch the ENA "read_run" file report for an accession.

    Parameters
    ----------
    ena_id : str
        Accession that is inserted verbatim into the file report query URL.

    Returns
    -------
    read_run_info : list or None
        The report split into lines (the first line is used as the header by
        the parsing functions of this module). None when the request or the
        decoding fails (the error is printed) or when the report has at most
        one line, i.e. no data line.
    """
    import urllib.request

    url = 'http://www.ebi.ac.uk/ena/data/warehouse/filereport?accession=' + ena_id + '&result=read_run'

    read_run_info = None
    try:
        # The context manager closes the HTTP response even when reading or
        # decoding raises, so the connection is never leaked.
        with urllib.request.urlopen(url) as response:
            report_lines = response.read().decode("utf8").splitlines()
        # A single line means there is nothing after the first (header) line.
        if len(report_lines) > 1:
            read_run_info = report_lines
    except Exception as error:
        # Best effort: any failure is reported and signalled to the caller
        # through the None return value.
        print(error)

    return read_run_info
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
29
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
30
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
def get_download_information(read_run_info):
    """
    Extract the Aspera/FTP download links from an ENA read_run report.

    Parameters
    ----------
    read_run_info : list
        Report lines: index 0 is the tab separated header, index 1 the tab
        separated values of the run to download.

    Returns
    -------
    download_information : dict
        Keys 'fastq', 'submitted' and 'cram_index'. Each value is None when no
        link was found for that file type, otherwise a dict mapping the
        download type ('aspera' and/or 'ftp') to the list of file paths
        obtained by splitting the field on ';'.
    """
    header_line = read_run_info[0].split('\t')
    info_line = read_run_info[1].split('\t')

    download_information = {'fastq': None, 'submitted': None, 'cram_index': None}
    download_types = ('aspera', 'ftp')

    # zip() stops at the shorter row, so a data line with fewer fields than
    # the header no longer raises IndexError.
    for column_name, field in zip(header_line, info_line):
        # Split "<file type>_<download type>" on the LAST underscore so that
        # "cram_index_ftp" yields ("cram_index", "ftp"). A column without any
        # underscore yields an empty file type and is simply skipped.
        file_type, _, download_type = column_name.lower().rpartition('_')
        if file_type not in download_information or download_type not in download_types:
            continue
        if not field:
            continue

        files_path = field.split(';')
        if len(files_path) > 2:
            print('WARNING: Were found more files than expected in'
                  ' {download_information}-{download_types} download'
                  ' links!'.format(download_information=file_type, download_types=download_type))
        if download_information[file_type] is None:
            download_information[file_type] = {}
        download_information[file_type][download_type] = files_path

    return download_information
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
53
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
54
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
def get_sequencing_information(read_run_info):
    """
    Extract the sequencing metadata from an ENA read_run report.

    Parameters
    ----------
    read_run_info : list
        Report lines: index 0 is the tab separated header, index 1 the tab
        separated values of the main run, any further line an extra run.

    Returns
    -------
    sequencing_information : dict
        One entry per tracked field (None when the report has no value for
        it), plus 'extra_run_accession' (comma separated run accessions found
        in the lines after the second one, or None) and 'date_download' (the
        current local date formatted as YYYY-MM-DD).
    """
    # Lower-case the header once; it is reused for every extra run line.
    header_line = [header.lower() for header in read_run_info[0].split('\t')]
    info_line = read_run_info[1].split('\t')

    sequencing_information = {'run_accession': None, 'instrument_platform': None, 'instrument_model': None,
                              'library_layout': None, 'library_source': None, 'extra_run_accession': None,
                              'nominal_length': None, 'read_count': None, 'base_count': None,
                              'date_download': time.strftime("%Y-%m-%d")}

    # zip() stops at the shorter row, so a data line with fewer fields than
    # the header no longer raises IndexError.
    for header, field in zip(header_line, info_line):
        if header in sequencing_information and field:
            sequencing_information[header] = field

    # Every line after the second one describes an additional run of the same
    # accession; only its run accession is recorded.
    extra_run_accession = []
    for extra_line in read_run_info[2:]:
        for header, field in zip(header_line, extra_line.split('\t')):
            if header == 'run_accession' and field:
                extra_run_accession.append(field)
    if extra_run_accession:
        sequencing_information['extra_run_accession'] = ','.join(extra_run_accession)

    return sequencing_information
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
83
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
84
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
@utils.trace_unhandled_exceptions
def download_with_aspera(aspera_file_path, aspera_key, outdir, pickle_prefix, sra, ena_id):
    """
    Download one file with Aspera (ascp) and pickle whether it succeeded.

    Parameters
    ----------
    aspera_file_path : str
        Remote path appended to 'era-fasp@'; only used when sra is false.
    aspera_key : str
        Value passed to the ascp '-i' option.
    outdir : str
        Download destination; the result pickle is saved there too.
    pickle_prefix : str
        Prefix of the result pickle name.
    sra : bool
        When true the NCBI .sra file of ena_id is requested instead of
        aspera_file_path.
    ena_id : str
        Run accession used to build the NCBI path and the pickle name.

    Returns
    -------
    None
    """
    if sra:
        # The port slot stays an empty placeholder for NCBI downloads.
        port_option = ''
        remote_source = 'anonftp@ftp.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/{a}/{b}/{c}/{c}.sra'.format(
            a=ena_id[:3], b=ena_id[:6], c=ena_id)
        pickle_name = pickle_prefix + '.' + ena_id
    else:
        port_option = '-P33001'
        remote_source = str('era-fasp@' + aspera_file_path)
        pickle_name = pickle_prefix + '.' + aspera_file_path.rsplit('/', 1)[1]

    command = ['ascp', '-QT', '-l', '300m', port_option, '-i', aspera_key, remote_source, outdir]

    # NOTE(review): the positional arguments are presumably shell flag,
    # timeout in seconds and command echo - confirm against utils.
    run_successfully, _, _ = utils.run_command_popen_communicate(command, False, 3600, True)

    utils.save_variable_to_pickle(run_successfully, outdir, pickle_name)
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
100
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
101
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
@utils.trace_unhandled_exceptions
def download_with_wget(ftp_file_path, outdir, pickle_prefix, sra, ena_id):
    """
    Download one file with wget (single try) and pickle whether it succeeded.

    Parameters
    ----------
    ftp_file_path : str
        Address of the file to download; only used when sra is false.
    outdir : str
        Directory receiving the downloaded file and the result pickle.
    pickle_prefix : str
        Prefix of the result pickle name.
    sra : bool
        When true the NCBI .sra file of ena_id is requested instead of
        ftp_file_path.
    ena_id : str
        Run accession used to build the NCBI address, the output file name
        and the pickle name.

    Returns
    -------
    None
    """
    if sra:
        remote_source = 'ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/{a}/{b}/{c}/{c}.sra'.format(
            a=ena_id[:3], b=ena_id[:6], c=ena_id)
        local_target = os.path.join(outdir, ena_id + '.sra')
        pickle_name = pickle_prefix + '.' + ena_id
    else:
        # The local file keeps the remote base name.
        file_name = ftp_file_path.rsplit('/', 1)[1]
        remote_source = ftp_file_path
        local_target = os.path.join(outdir, file_name)
        pickle_name = pickle_prefix + '.' + file_name

    command = ['wget', '--tries=1', remote_source, '-O', local_target]

    # NOTE(review): the positional arguments are presumably shell flag,
    # timeout in seconds and command echo - confirm against utils.
    run_successfully, _, _ = utils.run_command_popen_communicate(command, False, 3600, True)

    utils.save_variable_to_pickle(run_successfully, outdir, pickle_name)
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
118
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
119
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
@utils.trace_unhandled_exceptions
def download_with_sra_prefetch(aspera_key, outdir, pickle_prefix, ena_id):
    """
    Download a run with 'prefetch', move the .sra file into outdir and pickle
    whether the prefetch command succeeded.

    Parameters
    ----------
    aspera_key : str or None
        When not None, prefetch receives a '-a <ascp>|<aspera_key>' option
        where <ascp> is the first line printed by 'which ascp'.
    outdir : str
        Directory receiving the .sra file and the result pickle.
    pickle_prefix : str
        Prefix of the result pickle name.
    ena_id : str
        Run accession to prefetch; also names the .sra file and the pickle.

    Returns
    -------
    None
    """
    ascp_option = ''
    if aspera_key is not None:
        _, which_stdout, _ = utils.run_command_popen_communicate(['which', 'ascp'], False, None, False)
        ascp_option = '-a {ascp}|{aspera_key}'.format(ascp=which_stdout.splitlines()[0], aspera_key=aspera_key)

    command = ['prefetch', ascp_option, ena_id]

    # NOTE(review): the positional arguments are presumably shell flag,
    # timeout in seconds and command echo - confirm against utils.
    run_successfully, _, _ = utils.run_command_popen_communicate(command, False, 3600, True)

    if run_successfully:
        # The shell expands $HOME to locate the directory the file is taken from.
        _, echo_stdout, _ = utils.run_command_popen_communicate(['echo', '$HOME/ncbi/public/sra'], True, None,
                                                                False)
        prefetch_dir = echo_stdout.splitlines()[0]
        sra_name = ena_id + '.sra'
        cached_sra = os.path.join(prefetch_dir, sra_name)
        final_sra = os.path.join(outdir, sra_name)

        try:
            os.rename(cached_sra, final_sra)
        except OSError as e:
            print('Found the following error:'
                  '{}'.format(e))

            # The rename failed: copy the file instead and delete the source.
            from shutil import copy as shutil_copy

            shutil_copy(cached_sra, final_sra)
            os.remove(cached_sra)

    utils.save_variable_to_pickle(run_successfully, outdir, pickle_prefix + '.' + ena_id)
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
147
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
148
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
@utils.trace_unhandled_exceptions
def download_with_curl(ftp_file_path, outdir, pickle_prefix, sra, ena_id):
    """
    Download one file with curl (one retry) and pickle whether it succeeded.

    Parameters
    ----------
    ftp_file_path : str
        Address of the file to download; only used when sra is false.
    outdir : str
        Directory receiving the downloaded file and the result pickle.
    pickle_prefix : str
        Prefix of the result pickle name.
    sra : bool
        When true the NCBI .sra file of ena_id is requested instead of
        ftp_file_path.
    ena_id : str
        Run accession used to build the NCBI address, the output file name
        and the pickle name.

    Returns
    -------
    None
    """
    if sra:
        remote_source = 'ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/{a}/{b}/{c}/{c}.sra'.format(
            a=ena_id[:3], b=ena_id[:6], c=ena_id)
        local_target = os.path.join(outdir, ena_id + '.sra')
        pickle_name = pickle_prefix + '.' + ena_id
    else:
        # The local file keeps the remote base name.
        file_name = ftp_file_path.rsplit('/', 1)[1]
        remote_source = ftp_file_path
        local_target = os.path.join(outdir, file_name)
        pickle_name = pickle_prefix + '.' + file_name

    command = ['curl', '--retry', '1', remote_source, '-o', local_target]

    # NOTE(review): the positional arguments are presumably shell flag,
    # timeout in seconds and command echo - confirm against utils.
    run_successfully, _, _ = utils.run_command_popen_communicate(command, False, 3600, True)

    utils.save_variable_to_pickle(run_successfully, outdir, pickle_name)
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
165
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
166
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
def get_pickle_run_successfully(directory, pickle_prefix):
    """
    Combine the results stored in the download pickles and delete them.

    Parameters
    ----------
    directory : str
        Directory searched for the pickle files.
    pickle_prefix : str
        Prefix handed to find_files() together with the '.pkl' suffix.

    Returns
    -------
    run_successfully : bool
        False when no pickle was read. Otherwise the value extracted from the
        last pickle that was read; reading stops at the first falsy value.
    """
    pickle_files = find_files(directory, pickle_prefix, '.pkl')
    if pickle_files is None:
        # Nothing to read means nothing succeeded.
        return False

    outcome = True
    pickle_was_read = False
    for pickle_file in pickle_files:
        # Once a failure has been seen the remaining pickles are not read,
        # but every pickle found is still removed.
        if outcome:
            outcome = utils.extract_variable_from_pickle(pickle_file)
            pickle_was_read = True

        os.remove(pickle_file)

    return outcome if pickle_was_read else False
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
184
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
185
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
def curl_installed():
    """
    Tell whether the 'which curl' command ran successfully.

    Returns
    -------
    run_successfully : bool
        Success flag reported by utils.run_command_popen_communicate() for
        the 'which curl' command.
    """
    curl_found, _, _ = utils.run_command_popen_communicate(['which', 'curl'], False, None, False)
    return curl_found
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
190
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
191
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
192 def download(download_information_type, aspera_key, outdir, sra, sra_opt, ena_id):
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
193 pickle_prefix = 'download'
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
194
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
195 run_successfully = False
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
196 download_sra = False
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
197
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
198 if not sra:
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
199 if aspera_key is not None and download_information_type['aspera'] is not None:
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
200 pool = multiprocessing.Pool(processes=2)
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
201 for file_download in download_information_type['aspera']:
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
202 pool.apply_async(download_with_aspera, args=(file_download, aspera_key, outdir, pickle_prefix, sra,
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
203 ena_id,))
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
204 pool.close()
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
205 pool.join()
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
206 run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
207 if not run_successfully and download_information_type['ftp'] is not None:
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
208 if curl_installed():
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
209 pool = multiprocessing.Pool(processes=2)
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
210 for file_download in download_information_type['ftp']:
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
211 pool.apply_async(download_with_curl, args=(file_download, outdir, pickle_prefix, sra, ena_id,))
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
212 pool.close()
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
213 pool.join()
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
214 run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
215 if not run_successfully:
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
216 pool = multiprocessing.Pool(processes=2)
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
217 for file_download in download_information_type['ftp']:
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
218 pool.apply_async(download_with_wget, args=(file_download, outdir, pickle_prefix, sra, ena_id,))
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
219 pool.close()
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
220 pool.join()
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
221 run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
222
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
223 if not run_successfully and (sra or sra_opt):
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
224 if aspera_key is not None:
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
225 download_with_aspera(None, aspera_key, outdir, pickle_prefix, sra or sra_opt, ena_id)
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
226 run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
227 if not run_successfully:
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
228 download_with_sra_prefetch(aspera_key, outdir, pickle_prefix, ena_id)
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
229 run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
230 if not run_successfully:
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
231 if curl_installed():
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
232 download_with_curl(None, outdir, pickle_prefix, sra or sra_opt, ena_id)
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
233 run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
234 if not run_successfully:
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
235 download_with_wget(None, outdir, pickle_prefix, sra or sra_opt, ena_id)
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
236 run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
237
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
238 if run_successfully:
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
239 download_sra = True
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
240
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
241 return run_successfully, download_sra
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
242
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
243
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
def download_files(download_information, aspera_key, outdir, download_cram_bam_true, sra, sra_opt, ena_id):
    """
    Download the read files of one run, trying the available sources in turn.

    The 'fastq' entry of download_information is tried first. If that does not
    succeed, the 'submitted' entry is used: when download_cram_bam_true is
    false it is only downloaded if none of its links is a CRAM/BAM file; when
    it is true the submitted files are downloaded and, on success, the
    'cram_index' entry as well. As a last resort, when sra or sra_opt is set,
    download() is called again with its SRA flag forced to True.

    Parameters
    ----------
    download_information : dict
        Must provide the keys 'fastq', 'submitted' and 'cram_index'; each value
        is either None or a mapping of transfer type to a list of links
        (presumably 'aspera' / 'ftp', as read by download() - confirm against
        the code that builds it).
    aspera_key, outdir, ena_id
        Passed through unchanged to download().
    download_cram_bam_true : bool
        Whether submitted CRAM/BAM files should be downloaded.
    sra, sra_opt : bool
        SRA-related switches forwarded to download().

    Returns
    -------
    run_successfully : bool
    cram_index_run_successfully : bool
    download_sra : bool
    """
    run_successfully = False
    cram_index_run_successfully = False
    download_sra = False

    if download_information['fastq'] is not None:
        run_successfully, download_sra = download(download_information['fastq'], aspera_key, outdir, sra, sra_opt,
                                                  ena_id)

    if not run_successfully:
        submitted = download_information['submitted']
        if submitted is not None:
            if download_cram_bam_true:
                run_successfully, download_sra = download(submitted, aspera_key, outdir, False, False, ena_id)
                if run_successfully and download_information['cram_index'] is not None:
                    # download() returns a (run_successfully, download_sra) pair; keep only the success flag,
                    # otherwise the non-empty tuple would always evaluate as True.
                    cram_index_run_successfully, _ = download(download_information['cram_index'], aspera_key,
                                                              outdir, False, False, ena_id)
            else:
                # A transfer type may have no links (None or empty list); skip those instead of indexing them.
                cram_bam = any(links and links[0].endswith(('.cram', '.bam')) for links in submitted.values())
                if not cram_bam:
                    run_successfully, download_sra = download(submitted, aspera_key, outdir, False, False, ena_id)

    if not run_successfully and (sra or sra_opt):
        run_successfully, download_sra = download(download_information['fastq'], aspera_key, outdir, True, sra_opt,
                                                  ena_id)

    return run_successfully, cram_index_run_successfully, download_sra
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
277
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
278
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
def sort_alignment(alignment_file, output_file, sort_by_name_true, threads):
    """
    Sort an alignment file with ``samtools sort``.

    The output format passed to ``-O`` is the lower-cased extension of
    output_file without the leading dot.

    Parameters
    ----------
    alignment_file : str
        Alignment file to sort.
    output_file : str
        Path of the sorted file to create.
    sort_by_name_true : bool
        When true, ``-n`` is added to the samtools command.
    threads : int
        Value given to the samtools ``-@`` option.

    Returns
    -------
    run_successfully : bool
        Success flag reported by utils.run_command_popen_communicate.
    output_file : str or None
        The output path, or None when the command did not succeed.
    """
    out_format_string = os.path.splitext(output_file)[1][1:].lower()

    # Build the argument list without an empty-string placeholder, so no blank argument can reach samtools.
    command = ['samtools', 'sort', '-o', output_file, '-O', out_format_string]
    if sort_by_name_true:
        command.append('-n')
    command.extend(['-@', str(threads), alignment_file])

    run_successfully, _stdout, _stderr = utils.run_command_popen_communicate(command, False, None, True)

    if not run_successfully:
        output_file = None

    return run_successfully, output_file
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
290
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
291
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
def alignment_to_fastq(alignment_file, threads, pair_end_type):
    """
    Convert an alignment file into FASTQ file(s) with samtools.

    The alignment is first sorted with sort_alignment() into a temporary
    ``<basename>.temp.bam`` and then converted with ``samtools fastq``. The
    temporary BAM is removed afterwards, whether or not the steps succeeded.

    Parameters
    ----------
    alignment_file : str
        Alignment file to convert; output names are derived from it by
        dropping its extension.
    threads : int
        Forwarded to sort_alignment().
    pair_end_type : str
        'paired' (writes ``<basename>_1.fq`` / ``<basename>_2.fq`` through
        ``-1`` / ``-2``) or 'single' (writes ``<basename>.fq`` through ``-0``),
        compared case-insensitively. For any other value no output option is
        passed and no output files are reported.

    Returns
    -------
    run_successfully : bool
    outfiles : list of str or None
        FASTQ paths on success for a recognised pair_end_type, otherwise None.
    """
    fastq_basename = os.path.splitext(alignment_file)[0]
    temp_bam = fastq_basename + '.temp.bam'

    # Normalise once so the command and the reported outputs always agree.
    layout = pair_end_type.lower()
    if layout == 'paired':
        expected_files = [fastq_basename + '_1.fq', fastq_basename + '_2.fq']
        output_options = ['-1', expected_files[0], '-2', expected_files[1]]
    elif layout == 'single':
        expected_files = [fastq_basename + '.fq']
        output_options = ['-0', expected_files[0]]
    else:
        expected_files = None
        output_options = []

    outfiles = None
    try:
        # Sorted by name (third argument True) before the conversion.
        run_successfully, _ = sort_alignment(alignment_file, temp_bam, True, threads)
        if run_successfully:
            command = ['samtools', 'fastq'] + output_options + [temp_bam]
            run_successfully, _stdout, _stderr = utils.run_command_popen_communicate(command, False, None, True)
            if run_successfully:
                outfiles = expected_files
    finally:
        # sort_alignment() returns None as path on failure, so track the temporary file by its own name
        # to make sure a partially written BAM is cleaned up too.
        if os.path.isfile(temp_bam):
            os.remove(temp_bam)

    return run_successfully, outfiles
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
316
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
317
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
def formart_fastq_headers(in_fastq_1, in_fastq_2):
    """
    Rewrite a pair of FASTQ files so that read headers end with /1 and /2.

    Both inputs are read in lockstep and written to ``<input>.temp`` files.
    Header lines must be identical in both files; they get '/1' (first file)
    or '/2' (second file) appended. The '+' separator line of the first file is
    written to both outputs. All other lines are copied unchanged.

    Parameters
    ----------
    in_fastq_1, in_fastq_2 : str
        Paths of the two uncompressed FASTQ files.

    Returns
    -------
    number_reads : int
        Number of quality lines written, i.e. of complete records.
    outfiles : list of str
        The two ``.temp`` output paths, first-file output first.

    Raises
    ------
    SystemExit
        If a header line differs between the two files.
    """
    out_fastq_1 = in_fastq_1 + '.temp'
    out_fastq_2 = in_fastq_2 + '.temp'
    outfiles = [out_fastq_1, out_fastq_2]

    number_reads = 0
    # 'rt' instead of 'rtU': the 'U' flag is rejected by recent Python versions and text mode already
    # translates newlines. Writers are context-managed so they are always flushed and closed.
    with open(in_fastq_1, 'rt') as reader_in_fastq_1, open(in_fastq_2, 'rt') as reader_in_fastq_2, \
            open(out_fastq_1, 'wt') as writer_in_fastq_1, open(out_fastq_2, 'wt') as writer_in_fastq_2:
        # State of the current record: both flags True means "a header is expected next".
        plus_line = True
        quality_line = True
        # NOTE: zip() stops at the end of the shorter file.
        for in_1, in_2 in zip(reader_in_fastq_1, reader_in_fastq_2):
            in_1 = in_1.splitlines()[0]
            in_2 = in_2.splitlines()[0]
            if in_1.startswith('@') and plus_line and quality_line:
                # Header line
                if in_1 != in_2:
                    sys.exit('The PE fastq files are not aligned properly!')
                writer_in_fastq_1.write(in_1 + '/1' + '\n')
                writer_in_fastq_2.write(in_2 + '/2' + '\n')
                plus_line = False
                quality_line = False
            elif in_1.startswith('+') and not plus_line:
                # Separator line: the one from the first file is used for both outputs
                writer_in_fastq_1.write(in_1 + '\n')
                writer_in_fastq_2.write(in_1 + '\n')
                plus_line = True
            elif plus_line and not quality_line:
                # Quality line: closes the record (it may itself start with '@' or '+')
                writer_in_fastq_1.write(in_1 + '\n')
                writer_in_fastq_2.write(in_2 + '\n')
                number_reads += 1
                quality_line = True
            else:
                # Sequence line
                writer_in_fastq_1.write(in_1 + '\n')
                writer_in_fastq_2.write(in_2 + '\n')
    return number_reads, outfiles
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
362
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
363
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
@utils.trace_unhandled_exceptions
def gzip_files(file_2_compress, pickle_prefix, outdir):
    """
    Gzip one file and record the outcome in a pickle inside outdir.

    The archive is written next to the input as ``<name>.gz``; a trailing
    '.temp' extension of the input is dropped from the archive name. The input
    file is deleted once the compression succeeded. The success flag is saved
    with utils.save_variable_to_pickle under the name
    ``<pickle_prefix>.<basename of file_2_compress>``.

    Parameters
    ----------
    file_2_compress : str
        Path of the file to compress.
    pickle_prefix : str
        Prefix of the pickle name holding the success flag.
    outdir : str
        Directory handed to utils.save_variable_to_pickle.
    """
    is_temp = file_2_compress.endswith('.temp')
    archive_base = os.path.splitext(file_2_compress)[0] if is_temp else file_2_compress

    # The '>' redirection needs a shell - presumably what the second argument (True) requests; confirm in utils.
    gzip_command = ['gzip', '--stdout', '--best', file_2_compress, '>', str(archive_base + '.gz')]
    compressed, _out, _err = utils.run_command_popen_communicate(gzip_command, True, None, True)

    if compressed:
        os.remove(file_2_compress)

    pickle_name = str(pickle_prefix + '.' + os.path.basename(file_2_compress))
    utils.save_variable_to_pickle(compressed, outdir, pickle_name)
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
378
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
379
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
def find_files(directory, prefix, suffix):
    """
    List the regular, non-hidden files of a directory matching a prefix and suffix.

    Parameters
    ----------
    directory : str
        Directory to scan (not recursive).
    prefix, suffix : str
        A file is kept when its name starts with prefix and ends with suffix.
        Names starting with '.' are always ignored.

    Returns
    -------
    list of str or None
        Paths (directory joined with the file name) in sorted order, or None
        when nothing matched.
    """
    # os.listdir() gives names in arbitrary order; sort so that callers get a stable result
    # (e.g. the '_1' file before the '_2' file of a pair).
    list_files_found = sorted(
        os.path.join(directory, file_found)
        for file_found in os.listdir(directory)
        if not file_found.startswith('.')
        and file_found.startswith(prefix)
        and file_found.endswith(suffix)
        and os.path.isfile(os.path.join(directory, file_found))
    )

    if not list_files_found:
        return None

    return list_files_found
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
392
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
393
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
def compress_files(fastq_files, outdir, threads):
    """
    Gzip several files in parallel and report the resulting archives.

    One gzip_files() task is queued per input file on a process pool of size
    threads. Once all tasks are done, the overall outcome is read back with
    get_pickle_run_successfully() using the 'compress' pickle prefix.

    Parameters
    ----------
    fastq_files : iterable of str
        Files to compress.
    outdir : str
        Directory holding the outcome pickles and searched for '.gz' files.
    threads : int
        Number of worker processes.

    Returns
    -------
    run_successfully
        Value returned by get_pickle_run_successfully().
    compressed_fastq_files : list of str or None
        Every '.gz' file found in outdir on success, otherwise None.
    """
    pickle_prefix = 'compress'

    workers = multiprocessing.Pool(processes=threads)
    for fastq_path in fastq_files:
        workers.apply_async(gzip_files, args=(fastq_path, pickle_prefix, outdir,))
    # No more tasks will be queued; wait for the running ones to finish.
    workers.close()
    workers.join()

    run_successfully = get_pickle_run_successfully(outdir, pickle_prefix)
    if not run_successfully:
        return run_successfully, None

    return run_successfully, find_files(outdir, '', '.gz')
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
409
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
410
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
def bam_cram_2_fastq(alignment_file, outdir, threads, pair_end_type):
    """
    Turn an alignment file into compressed FASTQ file(s).

    Steps: alignment_to_fastq(), then - for paired data only -
    formart_fastq_headers() on the two FASTQ files, then compress_files().
    The later steps are skipped when the conversion did not succeed.

    Parameters
    ----------
    alignment_file : str
        Alignment file to convert.
    outdir : str
        Forwarded to compress_files().
    threads : int
        Forwarded to alignment_to_fastq() and compress_files().
    pair_end_type : str
        Compared case-insensitively with 'paired' to decide whether the
        headers are reformatted.

    Returns
    -------
    run_successfully, fastq_files
        Result of compress_files(), or of alignment_to_fastq() when that step
        did not succeed.
    """
    converted, fastq_files = alignment_to_fastq(alignment_file, threads, pair_end_type)
    if not converted:
        return converted, fastq_files

    if pair_end_type.lower() == 'paired':
        # Only the rewritten file names are needed here, not the read count.
        _, fastq_files = formart_fastq_headers(fastq_files[0], fastq_files[1])

    compressed, fastq_files = compress_files(fastq_files, outdir, threads)
    return compressed, fastq_files
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
420
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
421
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
def check_correct_links(download_information):
    """
    Repair known problems in the download links, in place.

    Two corrections are applied to every non-None entry of download_information:
      * Aspera links starting with 'fasp.sra.ebi.ac.uk/' get the missing ':' after the host name
        (only the first occurrence is changed);
      * every '#' in an FTP link is percent-encoded as '%23'.

    Parameters
    ----------
    download_information : dict
        Maps a file category to None or to a dictionary holding the keys 'aspera' and 'ftp', each being
        None or a list of link strings

    Returns
    -------
    download_information : dict
        The same object that was received, with the link lists modified in place
    """
    aspera_prefix = 'fasp.sra.ebi.ac.uk/'

    for links in download_information.values():
        if links is None:
            continue

        aspera_links = links['aspera']
        if aspera_links is not None:
            for position, link in enumerate(aspera_links):
                if link.startswith(aspera_prefix):
                    aspera_links[position] = link.replace(aspera_prefix, 'fasp.sra.ebi.ac.uk:/', 1)

        ftp_links = links['ftp']
        if ftp_links is not None:
            for position, link in enumerate(ftp_links):
                if '#' in link:
                    ftp_links[position] = link.replace('#', '%23')

    return download_information
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
435
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
436
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
def get_fastq_files(download_dir, cram_index_run_successfully, threads, download_paired_type):
    """
    Collect the downloaded files, converting a downloaded CRAM file to fastq when required.

    Parameters
    ----------
    download_dir : str
        Directory searched with find_files() and used as output directory for the CRAM conversion
    cram_index_run_successfully : bool
        True when a CRAM file (and its index) was downloaded and therefore needs to be converted
    threads : int
        Number of threads handed to bam_cram_2_fastq()
    download_paired_type : str
        Library layout handed to bam_cram_2_fastq()

    Returns
    -------
    run_successfully : bool
        False when no file was found, when a CRAM conversion was requested but no '.cram' file exists, or
        when the conversion itself failed
    downloaded_files : list or None
        Files found in download_dir, or the files produced by the CRAM conversion
    """
    downloaded_files = find_files(download_dir, '', '')

    # The search may come back empty or as None (NOTE(review): the original non-CRAM branch guarded against
    # None, the CRAM branch did not and would fail while iterating)
    if not downloaded_files:
        return False, downloaded_files

    if not cram_index_run_successfully:
        return True, downloaded_files

    # When several CRAM files exist the last one listed is used
    cram_file = None
    for file_found in downloaded_files:
        if file_found.endswith('.cram'):
            cram_file = file_found

    if cram_file is None:
        # Nothing to convert: report the failure instead of handing None to the conversion
        print('WARNING: no CRAM file found in ' + str(download_dir) + '!')
        return False, downloaded_files

    run_successfully, downloaded_files = bam_cram_2_fastq(cram_file, download_dir, threads, download_paired_type)
    return run_successfully, downloaded_files
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
451
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
452
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
def rename_move_files(list_files, new_name, outdir, download_paired_type):
    """
    Move the downloaded fastq files to outdir under standardised names and delete the remaining files.

    For each file the trailing 'astq.gz' (or, when that changes nothing, 'q.gz') is chopped from the base name
    with utils.rchop() and the remaining ending decides the destination:
      * 'paired' layout: '..._R1_001.f' / '..._1.f' -> '<new_name>_1.fq.gz',
        '..._R2_001.f' / '..._2.f' -> '<new_name>_2.fq.gz';
      * any other layout: every file not ending in '_R1_001.f' / '_R2_001.f' -> '<new_name>.fq.gz'
        (a warning is printed when it ends in '_1.f' / '_2.f').
    Files are only touched when exactly two ('paired') or exactly one ('single') destination was assigned.

    Parameters
    ----------
    list_files : list
        Paths of the files to evaluate
    new_name : str
        Prefix of the new file names
    outdir : str
        Directory receiving the renamed files
    download_paired_type : str
        Library layout, compared case-insensitively with 'paired' and 'single'

    Returns
    -------
    run_successfully : bool
        True when the expected number of files was found and all of them were renamed
    list_new_files : list or None
        New file paths, None when run_successfully is False
    """
    destinations = {}

    for position, file_path in enumerate(list_files):
        file_name = os.path.basename(file_path)
        stem = utils.rchop(file_name, 'astq.gz')
        if len(stem) == len(file_name):
            # Nothing was chopped, so try the shorter '.fq.gz' style ending
            stem = utils.rchop(file_name, 'q.gz')

        if download_paired_type.lower() == 'paired':
            if stem.endswith(('_R1_001.f', '_1.f')):
                destinations[position] = os.path.join(outdir, new_name + '_1.fq.gz')
            elif stem.endswith(('_R2_001.f', '_2.f')):
                destinations[position] = os.path.join(outdir, new_name + '_2.fq.gz')
        elif not stem.endswith(('_R1_001.f', '_R2_001.f')):
            destinations[position] = os.path.join(outdir, new_name + '.fq.gz')
            if stem.endswith(('_1.f', '_2.f')):
                print('WARNING: possible single-end file conflict with pair-end (' + file_path + ')!')

    run_successfully = (
        (len(destinations) == 2 and download_paired_type.lower() == 'paired') or
        (len(destinations) == 1 and download_paired_type.lower() == 'single'))

    if run_successfully:
        try:
            for position, file_path in enumerate(list_files):
                if position in destinations:
                    os.rename(file_path, destinations[position])
                elif os.path.isfile(file_path):
                    # Files without a destination are not wanted anymore
                    os.remove(file_path)
            destinations = list(destinations.values())
        except Exception as e:
            print(e)
            run_successfully = False

    return run_successfully, (destinations if run_successfully else None)
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
494
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
495
3
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
496 # @utils.trace_unhandled_exceptions
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
def rename_header_sra(fastq):
    """
    Write a gzip-compressed copy of a fastq file with reformatted read headers.

    On the first line of every four-line record the second '.' is replaced by '/' using gawk, and the stream
    is compressed with 'gzip -1' into '<fastq>.gz'. The input file itself is not modified.

    The two programs are started directly (no shell), so paths containing spaces or shell metacharacters are
    handled correctly, and the exit codes of BOTH programs are checked: a shell pipeline would only report the
    status of gzip and hide a gawk failure.

    Parameters
    ----------
    fastq : str
        Path to the uncompressed fastq file

    Returns
    -------
    run_successfully : bool
        True only when gawk and gzip both exited with code 0; False otherwise, including when an exception
        (missing program, unwritable output, ...) was raised, which is printed
    """
    run_successfully = False

    gawk_program = r'{if(NR%4==1) $0=gensub(/\./, "/", 2); print}'
    gawk_command = ['gawk', gawk_program, fastq]
    gzip_command = ['gzip', '-1']
    compressed_fastq = fastq + '.gz'

    # Shell-like rendering of the pipeline, used for messages only
    command = "gawk '{program}' {fastq} | gzip -1 > {compressed}".format(program=gawk_program, fastq=fastq,
                                                                         compressed=compressed_fastq)
    try:
        print('Running: ' + command)
        with open(compressed_fastq, 'wb') as writer:
            gawk = subprocess.Popen(gawk_command, stdout=subprocess.PIPE)
            try:
                gzip_code = subprocess.call(gzip_command, stdin=gawk.stdout, stdout=writer)
            finally:
                # Closing our copy of the pipe lets gawk terminate (SIGPIPE) if gzip stopped reading early
                gawk.stdout.close()
                gawk_code = gawk.wait()

        if gawk_code == 0 and gzip_code == 0:
            run_successfully = True
        else:
            print('Something went wrong with command: {command}'.format(command=command))
    except Exception as e:
        print(e)

    return run_successfully
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
512
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
513
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
def sra_2_fastq(download_dir, ena_id):
    """
    Convert '<ena_id>.sra' found in download_dir to fastq files and compress them with reformatted headers.

    fastq-dump is run with '--split-files' and every resulting '*.fastq' file in download_dir is then
    processed by rename_header_sra() using two worker processes.

    Parameters
    ----------
    download_dir : str
        Directory containing the SRA file; also receives the fastq files
    ena_id : str
        Run accession, used as the SRA file name without extension

    Returns
    -------
    run_successfully : bool
        True when fastq-dump succeeded, at least one fastq file was produced and every file was processed
        successfully
    """
    sra_file = os.path.join(download_dir, '{ena_id}.sra'.format(ena_id=ena_id))
    command = ['fastq-dump', '-I', '-O', download_dir, '--split-files', sra_file]
    run_successfully, _, _ = utils.run_command_popen_communicate(command, False, 3600, True)
    if not run_successfully:
        return run_successfully

    files = [os.path.join(download_dir, f) for f in os.listdir(download_dir)
             if not f.startswith('.') and os.path.isfile(os.path.join(download_dir, f)) and f.endswith('.fastq')]
    if not files:
        # all() of an empty result list would be True and wrongly report success
        print('No fastq files were produced from ' + sra_file)
        return False

    pool = multiprocessing.Pool(processes=2)
    try:
        results = pool.map(rename_header_sra, files)
    except Exception as e:
        print(e)
        results = [False]
    finally:
        # Always release the worker processes
        pool.close()
        pool.join()

    return all(results)
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
530
0cbed1c0a762 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents: 0
diff changeset
531
0
965517909457 planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff changeset
# Decorator built from utils.timer with the name fixed to 'Download module'
# (NOTE(review): presumably reports the running time of the decorated function - confirm in utils)
download_timer = functools.partial(utils.timer, name='Download module')


@download_timer
def run_download(ena_id, download_paired_type, aspera_key, outdir, download_cram_bam_true, threads, instrument_platform,
                 sra, sra_opt):
    """
    Download the sequencing data of one run accession and store the fastq files in outdir.

    Files are downloaded into a temporary '<outdir>/download/' directory (recreated at the start and removed
    at the end), converted to fastq when needed (SRA or CRAM) and finally renamed and moved to outdir.
    When run information is available for ena_id, the download only happens if the instrument platform and
    the library layout match the requested ones. Without run information, a download is only attempted when
    sra or sra_opt is set, trying a paired-end layout first and a single-end layout afterwards.

    Parameters
    ----------
    ena_id : str
        Run accession to download
    download_paired_type : str
        Requested library layout, or 'both' to accept any layout
    aspera_key : str or None
        Handed to download_files()
    outdir : str
        Directory receiving the final fastq files; must already exist
    download_cram_bam_true : bool
        Handed to download_files()
    threads : int
        Number of threads handed to get_fastq_files()
    instrument_platform : str
        Requested instrument platform, or 'all' to accept any platform
    sra : bool
        Handed to download_files(); also enables the download when no run information exists
    sra_opt : bool
        Handed to download_files(); also enables the download when no run information exists

    Returns
    -------
    run_successfully : bool
        True when the files were downloaded, converted and renamed
    downloaded_files : list or None
        Paths of the final fastq files, None on failure
    sequencing_information : dict
        Run metadata; holds only None values (plus the download date) when no run information was found
    """
    download_dir = os.path.join(outdir, 'download', '')
    utils.remove_directory(download_dir)
    os.mkdir(download_dir)

    run_successfully = False
    downloaded_files = None
    sequencing_information = {'run_accession': None, 'instrument_platform': None, 'instrument_model': None,
                              'library_layout': None, 'library_source': None, 'extra_run_accession': None,
                              'nominal_length': None, 'read_count': None, 'base_count': None,
                              'date_download': time.strftime("%Y-%m-%d")}

    read_run_info = get_read_run_info(ena_id)
    if read_run_info is not None:
        download_information = get_download_information(read_run_info)
        download_information = check_correct_links(download_information)
        sequencing_information = get_sequencing_information(read_run_info)

        if instrument_platform.lower() == 'all' or \
                (sequencing_information['instrument_platform'] is not None and
                 sequencing_information['instrument_platform'].lower() == instrument_platform.lower()):
            if download_paired_type.lower() == 'both' or \
                    (sequencing_information['library_layout'] is not None and
                     sequencing_information['library_layout'].lower() == download_paired_type.lower()):
                run_successfully, cram_index_run_successfully, download_sra = download_files(download_information,
                                                                                             aspera_key, download_dir,
                                                                                             download_cram_bam_true,
                                                                                             sra, sra_opt, ena_id)
                if download_sra:
                    run_successfully = sra_2_fastq(download_dir, ena_id)
                if run_successfully:
                    run_successfully, downloaded_files = get_fastq_files(download_dir, cram_index_run_successfully,
                                                                         threads,
                                                                         sequencing_information['library_layout'])
                if run_successfully and downloaded_files is not None:
                    run_successfully, downloaded_files = rename_move_files(downloaded_files,
                                                                           sequencing_information['run_accession'],
                                                                           outdir,
                                                                           sequencing_information['library_layout'])
    else:
        if sra or sra_opt:
            # No metadata available: only the SRA file can be requested and the layout has to be guessed
            run_successfully, cram_index_run_successfully, download_sra = download_files({'fastq': None,
                                                                                          'submitted': None,
                                                                                          'cram_index': None},
                                                                                         aspera_key, download_dir,
                                                                                         download_cram_bam_true, sra,
                                                                                         sra_opt, ena_id)
            if download_sra:
                run_successfully = sra_2_fastq(download_dir, ena_id)
            if run_successfully:
                run_successfully, downloaded_files = get_fastq_files(download_dir, cram_index_run_successfully, threads,
                                                                     'paired')
                if not run_successfully:
                    run_successfully, downloaded_files = get_fastq_files(download_dir, cram_index_run_successfully,
                                                                         threads, 'single')
            if run_successfully and downloaded_files is not None:
                # rename_move_files() returns None instead of a file list when it fails, so the original list
                # is kept for the single-end retry (passing None would raise a TypeError)
                files_to_rename = downloaded_files
                run_successfully, downloaded_files = rename_move_files(files_to_rename, ena_id, outdir, 'paired')
                if not run_successfully:
                    run_successfully, downloaded_files = rename_move_files(files_to_rename, ena_id, outdir, 'single')

    utils.remove_directory(download_dir)

    return run_successfully, downloaded_files, sequencing_information