Mercurial > repos > cstrittmatter > test_eurl_vtec_wgs_pt
comparison scripts/ReMatCh/utils/restart_rematch.py @ 3:0cbed1c0a762 draft default tip
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author | cstrittmatter |
---|---|
date | Tue, 28 Jan 2020 10:42:31 -0500 |
parents | 965517909457 |
children |
comparison
equal
deleted
inserted
replaced
2:6837f733b4aa | 3:0cbed1c0a762 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python3 |
2 | 2 |
3 # -*- coding: utf-8 -*- | 3 # -*- coding: utf-8 -*- |
4 | 4 |
5 """ | 5 """ |
6 restart_rematch.py - Restarts a ReMatCh run abruptly terminated | 6 restart_rematch.py - Restarts a ReMatCh run abruptly terminated |
7 <https://github.com/B-UMMI/ReMatCh/> | 7 <https://github.com/B-UMMI/ReMatCh/> |
8 | 8 |
9 Copyright (C) 2017 Miguel Machado <mpmachado@medicina.ulisboa.pt> | 9 Copyright (C) 2018 Miguel Machado <mpmachado@medicina.ulisboa.pt> |
10 | 10 |
11 Last modified: February 09, 2017 | 11 Last modified: October 15, 2018 |
12 | 12 |
13 This program is free software: you can redistribute it and/or modify | 13 This program is free software: you can redistribute it and/or modify |
14 it under the terms of the GNU General Public License as published by | 14 it under the terms of the GNU General Public License as published by |
15 the Free Software Foundation, either version 3 of the License, or | 15 the Free Software Foundation, either version 3 of the License, or |
16 (at your option) any later version. | 16 (at your option) any later version. |
31 | 31 |
32 | 32 |
33 version = '0.1' | 33 version = '0.1' |
34 | 34 |
35 | 35 |
36 def runRematch(args): | 36 def run_rematch(args): |
37 print '\n' + '==========> Restarting ReMatCh <==========' + '\n' | 37 print('\n' + '==========> Restarting ReMatCh <==========' + '\n') |
38 | 38 |
39 workdir = os.path.abspath(args.workdir) | 39 workdir = os.path.abspath(args.workdir) |
40 if not os.path.isdir(workdir): | 40 if not os.path.isdir(workdir): |
41 os.makedirs(workdir) | 41 os.makedirs(workdir) |
42 | 42 |
43 initialWorkdir = os.path.abspath(args.initialWorkdir) | 43 initial_workdir = os.path.abspath(args.initialWorkdir) |
44 | 44 |
45 files_required = get_files_required(initialWorkdir) | 45 files_required = get_files_required(initial_workdir) |
46 | 46 |
47 samples_run = get_samples_run(files_required['sample_report']['file']) | 47 samples_run = get_samples_run(files_required['sample_report']['file']) |
48 | 48 |
49 command, list_ids, taxon, threads, initial_present_directory = get_rematch_command(files_required['run']['file']) | 49 command, list_ids, taxon, threads, initial_present_directory = get_rematch_command(files_required['run']['file']) |
50 | 50 |
51 if list_ids is not None: | 51 samples_fastq = {} |
52 total_samples = getListIDs_fromFile(list_ids) | 52 |
53 elif taxon: | 53 if list_ids is not None: |
54 total_samples = getTaxonRunIDs(files_required['IDs_list.seqFromWebTaxon']['file']) | 54 total_samples = get_list_ids_from_file(list_ids) |
55 else: | 55 elif taxon: |
56 samples_fastq = searchFastqFiles(initialWorkdir) | 56 total_samples = get_taxon_run_ids(files_required['IDs_list.seqFromWebTaxon']['file']) |
57 total_samples = samples_fastq.keys() | 57 else: |
58 | 58 samples_fastq = search_fastq_files(initial_workdir) |
59 samples_to_run = list(set(total_samples).symmetric_difference(set(sum(samples_run.values(), []) if not args.runFailedSamples else samples_run['True'] if 'True' in samples_run else ['']))) | 59 total_samples = list(samples_fastq.keys()) |
60 | 60 |
61 print str(len(samples_to_run)) + ' samples out of ' + str(len(total_samples)) + ' will be analysed by ReMatCh' + '\n' | 61 samples_to_run = list(set(total_samples).symmetric_difference(set(sum(list(samples_run.values()), []) if |
62 | 62 not args.runFailedSamples else |
63 if list_ids is not None or taxon: | 63 samples_run['True'] if |
64 samples_to_run_file = write_samples_to_run(samples_to_run, workdir) | 64 'True' in samples_run else ['']))) |
65 else: | 65 |
66 setSamples_fromFolders(samples_to_run, samples_fastq, workdir) | 66 print(str(len(samples_to_run)) + ' samples out of ' + str(len(total_samples)) + ' will be analysed by' |
67 | 67 ' ReMatCh' + '\n') |
68 command.extend(['-w', workdir]) | 68 |
69 command.extend(['-j', str(threads) if args.threads is None else str(args.threads)]) | 69 if list_ids is not None or taxon: |
70 if list_ids is not None or taxon: | 70 samples_to_run_file = write_samples_to_run(samples_to_run, workdir) |
71 command.extend(['-l', samples_to_run_file]) | 71 else: |
72 | 72 set_samples_from_folders(samples_to_run, samples_fastq, workdir) |
73 print 'ReMatCh will start in 5 seconds...' | 73 |
74 time.sleep(5) | 74 command.extend(['-w', workdir]) |
75 | 75 command.extend(['-j', str(threads) if args.threads is None else str(args.threads)]) |
76 os.chdir(initial_present_directory) | 76 if list_ids is not None or taxon: |
77 subprocess.call(command) | 77 command.extend(['-l', samples_to_run_file]) |
78 | |
79 print('ReMatCh will start in 5 seconds...') | |
80 time.sleep(5) | |
81 | |
82 os.chdir(initial_present_directory) | |
83 subprocess.call(command) | |
78 | 84 |
79 | 85 |
80 def write_samples_to_run(samples_to_run, workdir): | 86 def write_samples_to_run(samples_to_run, workdir): |
81 samples_to_run_file = os.path.join(workdir, 'restart_rematch.samples_to_run.txt') | 87 samples_to_run_file = os.path.join(workdir, 'restart_rematch.samples_to_run.txt') |
82 with open(samples_to_run_file, 'wt') as writer: | 88 with open(samples_to_run_file, 'wt') as writer: |
83 for sample in samples_to_run: | 89 for sample in samples_to_run: |
84 writer.write(sample + '\n') | 90 writer.write(sample + '\n') |
85 return samples_to_run_file | 91 return samples_to_run_file |
86 | 92 |
87 | 93 |
88 def get_files_required(initialWorkdir): | 94 def get_files_required(initial_workdir): |
89 files_required = {'sample_report': {'extension': 'tab'}, 'run': {'extension': 'log'}, 'IDs_list.seqFromWebTaxon': {'extension': 'tab'}} | 95 files_required = {'sample_report': {'extension': 'tab'}, |
90 files = sorted([f for f in os.listdir(initialWorkdir) if not f.startswith('.') and os.path.isfile(os.path.join(initialWorkdir, f))]) | 96 'run': {'extension': 'log'}, |
91 for file_found in files: | 97 'IDs_list.seqFromWebTaxon': {'extension': 'tab'}} |
92 file_path = os.path.join(initialWorkdir, file_found) | 98 files = sorted([f for f in os.listdir(initial_workdir) if |
93 file_modification = os.path.getmtime(file_path) | 99 not f.startswith('.') and |
94 for prefix, values in files_required.items(): | 100 os.path.isfile(os.path.join(initial_workdir, f))]) |
95 if file_found.startswith(prefix + '.') and file_found.endswith('.' + values['extension']): | 101 for file_found in files: |
96 if 'file' not in values: | 102 file_path = os.path.join(initial_workdir, file_found) |
97 files_required[prefix]['file'] = file_path | 103 file_modification = os.path.getmtime(file_path) |
98 files_required[prefix]['modification'] = file_modification | 104 for prefix, values in list(files_required.items()): |
99 else: | 105 if file_found.startswith(prefix + '.') and file_found.endswith('.' + values['extension']): |
100 if file_modification > files_required[prefix]['modification']: | 106 if 'file' not in values: |
101 files_required[prefix]['file'] = file_path | 107 files_required[prefix]['file'] = file_path |
102 files_required[prefix]['modification'] = file_modification | 108 files_required[prefix]['modification'] = file_modification |
103 return files_required | 109 else: |
110 if file_modification > files_required[prefix]['modification']: | |
111 files_required[prefix]['file'] = file_path | |
112 files_required[prefix]['modification'] = file_modification | |
113 return files_required | |
104 | 114 |
105 | 115 |
106 def get_samples_run(sample_report_file): | 116 def get_samples_run(sample_report_file): |
107 samples_run = {} | 117 samples_run = {} |
108 with open(sample_report_file, 'rtU') as reader: | 118 with open(sample_report_file, 'rtU') as reader: |
109 for line in reader: | 119 for line in reader: |
110 line = line.splitlines()[0] | 120 line = line.splitlines()[0] |
111 if len(line) > 0: | 121 if len(line) > 0: |
112 if not line.startswith('#'): | 122 if not line.startswith('#'): |
113 sample_info = line.split('\t') | 123 sample_info = line.split('\t') |
114 if sample_info[1] not in samples_run: | 124 if sample_info[1] not in samples_run: |
115 samples_run[sample_info[1]] = [] | 125 samples_run[sample_info[1]] = [] |
116 samples_run[sample_info[1]].append(sample_info[0]) | 126 samples_run[sample_info[1]].append(sample_info[0]) |
117 return samples_run | 127 return samples_run |
118 | 128 |
119 | 129 |
120 def get_rematch_command(log_file): | 130 def get_rematch_command(log_file): |
121 variables = {'command': False, 'directory': False} | 131 variables = {'command': False, 'directory': False} |
122 with open(log_file, 'rtU') as reader: | 132 with open(log_file, 'rtU') as reader: |
123 for line in reader: | 133 for line in reader: |
124 if any([isinstance(value, bool) for value in variables.values()]): | 134 if any([isinstance(value, bool) for value in list(variables.values())]): |
125 line = line.splitlines()[0] | 135 line = line.splitlines()[0] |
126 if len(line) > 0: | 136 if len(line) > 0: |
127 if line == 'COMMAND:': | 137 if line == 'COMMAND:': |
128 variables['command'] = True | 138 variables['command'] = True |
129 elif line == 'PRESENT DIRECTORY:': | 139 elif line == 'PRESENT DIRECTORY:': |
130 variables['directory'] = True | 140 variables['directory'] = True |
131 else: | 141 else: |
132 if variables['command'] is True: | 142 if variables['command'] is True: |
133 variables['command'] = line.split(' ') | 143 variables['command'] = line.split(' ') |
134 elif variables['directory'] is True: | 144 elif variables['directory'] is True: |
135 variables['directory'] = line | 145 variables['directory'] = line |
136 else: | 146 else: |
137 break | 147 break |
138 command = {'command': [], 'listIDs': None, 'taxon': False, 'threads': None} | 148 command = {'command': [], 'listIDs': None, 'taxon': False, 'threads': None} |
139 if all([not isinstance(value, bool) for value in variables.values()]): | 149 if all([not isinstance(value, bool) for value in list(variables.values())]): |
140 counter = 0 | 150 counter = 0 |
141 while counter < len(variables['command']): | 151 while counter < len(variables['command']): |
142 if variables['command'][counter].startswith('-'): | 152 if variables['command'][counter].startswith('-'): |
143 if variables['command'][counter] not in ('-t', '--taxon'): | 153 if variables['command'][counter] not in ('-t', '--taxon'): |
144 if variables['command'][counter] in ('-l', '--listIDs'): | 154 if variables['command'][counter] in ('-l', '--listIDs'): |
145 command['listIDs'] = variables['command'][counter + 1] | 155 command['listIDs'] = variables['command'][counter + 1] |
146 counter += 1 | 156 counter += 1 |
147 elif variables['command'][counter] in ('-w', '--workdir'): | 157 elif variables['command'][counter] in ('-w', '--workdir'): |
148 counter += 1 | 158 counter += 1 |
149 elif variables['command'][counter] in ('-j', '--threads'): | 159 elif variables['command'][counter] in ('-j', '--threads'): |
150 command['threads'] = int(variables['command'][counter + 1]) | 160 command['threads'] = int(variables['command'][counter + 1]) |
151 counter += 1 | 161 counter += 1 |
152 elif variables['command'][counter] == '--mlst': | 162 elif variables['command'][counter] == '--mlst': |
153 species = [] | 163 species = [] |
154 counter += 1 | 164 counter += 1 |
155 while counter < len(variables['command']) and not variables['command'][counter].startswith('-'): | 165 while counter < len(variables['command']) and not variables['command'][counter].startswith('-'): |
156 if len(variables['command'][counter]) > 0: | 166 if len(variables['command'][counter]) > 0: |
157 species.append(variables['command'][counter]) | 167 species.append(variables['command'][counter]) |
158 counter += 1 | 168 counter += 1 |
159 command['command'].extend(['--mlst', ' '.join(species)]) | 169 command['command'].extend(['--mlst', ' '.join(species)]) |
160 else: | 170 else: |
161 command['command'].append(variables['command'][counter]) | 171 command['command'].append(variables['command'][counter]) |
162 if counter + 1 < len(variables['command']) and not variables['command'][counter + 1].startswith('-'): | 172 if counter + 1 < len(variables['command']) and \ |
163 command['command'].append(variables['command'][counter + 1]) | 173 not variables['command'][counter + 1].startswith('-'): |
164 counter += 1 | 174 command['command'].append(variables['command'][counter + 1]) |
165 else: | 175 counter += 1 |
166 command['taxon'] = True | 176 else: |
167 for i in range(counter, len(variables['command'])): | 177 command['taxon'] = True |
168 if i + 1 < len(variables['command']): | 178 for i in range(counter, len(variables['command'])): |
169 if variables['command'][i + 1].startswith('-'): | 179 if i + 1 < len(variables['command']): |
170 counter = i | 180 if variables['command'][i + 1].startswith('-'): |
171 break | 181 counter = i |
172 else: | 182 break |
173 counter = i | 183 else: |
174 else: | 184 counter = i |
175 command['command'].append(variables['command'][counter]) | 185 else: |
176 counter += 1 | 186 command['command'].append(variables['command'][counter]) |
177 return command['command'], command['listIDs'], command['taxon'], command['threads'], variables['directory'] | 187 counter += 1 |
178 | 188 return command['command'], command['listIDs'], command['taxon'], command['threads'], variables['directory'] |
179 | 189 |
180 def getTaxonRunIDs(IDs_list_seqFromWebTaxon_file): | 190 |
181 list_ids = [] | 191 def get_taxon_run_ids(ids_list_seq_from_web_taxon_file): |
182 with open(IDs_list_seqFromWebTaxon_file, 'rtU') as reader: | 192 list_ids = [] |
183 for line in reader: | 193 with open(ids_list_seq_from_web_taxon_file, 'rtU') as reader: |
184 line = line.splitlines()[0] | 194 for line in reader: |
185 if len(line) > 0: | 195 line = line.splitlines()[0] |
186 if not line.startswith('#'): | 196 if len(line) > 0: |
187 line = line.split('\t') | 197 if not line.startswith('#'): |
188 list_ids.append(line[0]) | 198 line = line.split('\t') |
189 return list_ids | 199 list_ids.append(line[0]) |
190 | 200 return list_ids |
191 | 201 |
192 def getListIDs_fromFile(listIDs_file): | 202 |
193 list_ids = [] | 203 def get_list_ids_from_file(list_ids_file): |
194 with open(listIDs_file, 'rtU') as lines: | 204 list_ids = [] |
195 for line in lines: | 205 with open(list_ids_file, 'rtU') as lines: |
196 line = line.splitlines()[0] | 206 for line in lines: |
197 if len(line) > 0: | 207 line = line.splitlines()[0] |
198 list_ids.append(line) | 208 if len(line) > 0: |
199 return list_ids | 209 list_ids.append(line) |
200 | 210 return list_ids |
201 | 211 |
202 def searchFastqFiles(initialWorkdir): | 212 |
203 filesExtensions = ['.fastq.gz', '.fq.gz'] | 213 def search_fastq_files(initial_workdir): |
204 pairEnd_filesSeparation = [['_R1_001.f', '_R2_001.f'], ['_1.f', '_2.f']] | 214 files_extensions = ['.fastq.gz', '.fq.gz'] |
205 | 215 pair_end_files_separation = [['_R1_001.f', '_R2_001.f'], ['_1.f', '_2.f']] |
206 list_ids = {} | 216 |
207 directories = [d for d in os.listdir(initialWorkdir) if not d.startswith('.') and os.path.isdir(os.path.join(initialWorkdir, d, ''))] | 217 list_ids = {} |
208 for directory_found in directories: | 218 directories = [d for d in os.listdir(initial_workdir) if |
209 directory_path = os.path.join(initialWorkdir, directory_found, '') | 219 not d.startswith('.') and |
210 | 220 os.path.isdir(os.path.join(initial_workdir, d, ''))] |
211 fastqFound = [] | 221 for directory_found in directories: |
212 files = [f for f in os.listdir(directory_path) if not f.startswith('.') and os.path.isfile(os.path.join(directory_path, f))] | 222 directory_path = os.path.join(initial_workdir, directory_found, '') |
213 for file_found in files: | 223 |
214 if file_found.endswith(tuple(filesExtensions)): | 224 fastq_found = [] |
215 fastqFound.append(file_found) | 225 files = [f for f in os.listdir(directory_path) if |
216 | 226 not f.startswith('.') and |
217 if len(fastqFound) == 1: | 227 os.path.isfile(os.path.join(directory_path, f))] |
218 list_ids[directory_found] = [os.path.join(directory_path, f) for f in fastqFound] | 228 for file_found in files: |
219 elif len(fastqFound) >= 2: | 229 if file_found.endswith(tuple(files_extensions)): |
220 file_pair = [] | 230 fastq_found.append(file_found) |
221 | 231 |
222 # Search pairs | 232 if len(fastq_found) == 1: |
223 for PE_separation in pairEnd_filesSeparation: | 233 list_ids[directory_found] = [os.path.join(directory_path, f) for f in fastq_found] |
224 for fastq in fastqFound: | 234 elif len(fastq_found) >= 2: |
225 if PE_separation[0] in fastq or PE_separation[1] in fastq: | 235 file_pair = [] |
226 file_pair.append(fastq) | 236 |
227 | 237 # Search pairs |
228 if len(file_pair) == 2: | 238 for pe_separation in pair_end_files_separation: |
229 break | 239 for fastq in fastq_found: |
230 else: | 240 if pe_separation[0] in fastq or pe_separation[1] in fastq: |
231 file_pair = [] | 241 file_pair.append(fastq) |
232 | 242 |
233 # Search single | 243 if len(file_pair) == 2: |
234 if len(file_pair) == 0: | 244 break |
235 for PE_separation in pairEnd_filesSeparation: | 245 else: |
236 for fastq in fastqFound: | 246 file_pair = [] |
237 if PE_separation[0] not in fastq or PE_separation[1] not in fastq: | 247 |
238 file_pair.append(fastq) | 248 # Search single |
239 | 249 if len(file_pair) == 0: |
240 if len(file_pair) >= 1: | 250 for pe_separation in pair_end_files_separation: |
241 file_pair = file_pair[0] | 251 for fastq in fastq_found: |
242 | 252 if pe_separation[0] not in fastq or pe_separation[1] not in fastq: |
243 if len(file_pair) >= 1: | 253 file_pair.append(fastq) |
244 list_ids[directory_found] = [os.path.join(directory_path, f) for f in file_pair] | 254 |
245 | 255 if len(file_pair) >= 1: |
246 return list_ids | 256 file_pair = file_pair[0] |
247 | 257 |
248 | 258 if len(file_pair) >= 1: |
249 def setSamples_fromFolders(samples_to_run, samples_fastq, workdir): | 259 list_ids[directory_found] = [os.path.join(directory_path, f) for f in file_pair] |
250 for sample in samples_to_run: | 260 |
251 sample_dir = os.path.join(workdir, sample, '') | 261 return list_ids |
252 if not os.path.isdir(sample_dir): | 262 |
253 os.mkdir(sample_dir) | 263 |
254 for file_found in samples_fastq[sample]: | 264 def set_samples_from_folders(samples_to_run, samples_fastq, workdir): |
255 link_path = os.path.join(sample_dir, os.path.basename(file_found)) | 265 for sample in samples_to_run: |
256 if os.path.islink(link_path): | 266 sample_dir = os.path.join(workdir, sample, '') |
257 os.remove(link_path) | 267 if not os.path.isdir(sample_dir): |
258 if not os.path.isfile(link_path): | 268 os.mkdir(sample_dir) |
259 os.symlink(file_found, link_path) | 269 for file_found in samples_fastq[sample]: |
270 link_path = os.path.join(sample_dir, os.path.basename(file_found)) | |
271 if os.path.islink(link_path): | |
272 os.remove(link_path) | |
273 if not os.path.isfile(link_path): | |
274 os.symlink(file_found, link_path) | |
260 | 275 |
261 | 276 |
262 def main(): | 277 def main(): |
263 parser = argparse.ArgumentParser(prog='restart_rematch.py', description='Restart a ReMatCh run abruptly terminated', formatter_class=argparse.ArgumentDefaultsHelpFormatter) | 278 parser = argparse.ArgumentParser(prog='restart_rematch.py', description='Restart a ReMatCh run abruptly terminated', |
264 parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version)) | 279 formatter_class=argparse.ArgumentDefaultsHelpFormatter) |
265 | 280 parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version)) |
266 parser_required = parser.add_argument_group('Required options') | 281 |
267 parser_required.add_argument('-i', '--initialWorkdir', type=str, metavar='/path/to/initial/workdir/directory/', help='Path to the directory where ReMatCh was running', required=True) | 282 parser_required = parser.add_argument_group('Required options') |
268 | 283 parser_required.add_argument('-i', '--initialWorkdir', type=str, metavar='/path/to/initial/workdir/directory/', |
269 parser_optional_general = parser.add_argument_group('General facultative options') | 284 help='Path to the directory where ReMatCh was running', required=True) |
270 parser_optional_general.add_argument('-w', '--workdir', type=str, metavar='/path/to/workdir/directory/', help='Path to the directory where ReMatCh will run again', required=False, default='.') | 285 |
271 parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N', help='Number of threads to use instead of the ones set in initial ReMatCh run', required=False) | 286 parser_optional_general = parser.add_argument_group('General facultative options') |
272 parser_optional_general.add_argument('--runFailedSamples', action='store_true', help='Will run ReMatCh for those samples missing, as well as for samples that did not run successfully in initial ReMatCh run') | 287 parser_optional_general.add_argument('-w', '--workdir', type=str, metavar='/path/to/workdir/directory/', |
273 | 288 help='Path to the directory where ReMatCh will run again', required=False, |
274 args = parser.parse_args() | 289 default='.') |
275 | 290 parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N', |
276 runRematch(args) | 291 help='Number of threads to use instead of the ones set in initial ReMatCh run', |
292 required=False) | |
293 parser_optional_general.add_argument('--runFailedSamples', action='store_true', | |
294 help='Will run ReMatCh for those samples missing, as well as for samples that' | |
295 ' did not run successfully in initial ReMatCh run') | |
296 | |
297 args = parser.parse_args() | |
298 | |
299 run_rematch(args) | |
277 | 300 |
278 | 301 |
279 if __name__ == "__main__": | 302 if __name__ == "__main__": |
280 main() | 303 main() |