comparison scripts/ReMatCh/utils/restart_rematch.py @ 3:0cbed1c0a762 draft default tip

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Tue, 28 Jan 2020 10:42:31 -0500
parents 965517909457
children
comparison
equal deleted inserted replaced
2:6837f733b4aa 3:0cbed1c0a762
1 #!/usr/bin/env python 1 #!/usr/bin/env python3
2 2
3 # -*- coding: utf-8 -*- 3 # -*- coding: utf-8 -*-
4 4
5 """ 5 """
6 restart_rematch.py - Restarts a ReMatCh run abruptly terminated 6 restart_rematch.py - Restarts a ReMatCh run abruptly terminated
7 <https://github.com/B-UMMI/ReMatCh/> 7 <https://github.com/B-UMMI/ReMatCh/>
8 8
9 Copyright (C) 2017 Miguel Machado <mpmachado@medicina.ulisboa.pt> 9 Copyright (C) 2018 Miguel Machado <mpmachado@medicina.ulisboa.pt>
10 10
11 Last modified: February 09, 2017 11 Last modified: October 15, 2018
12 12
13 This program is free software: you can redistribute it and/or modify 13 This program is free software: you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by 14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or 15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version. 16 (at your option) any later version.
31 31
32 32
33 version = '0.1' 33 version = '0.1'
34 34
35 35
def run_rematch(args):
    """
    Restart a ReMatCh run, analysing only the samples that still need to run.

    The initial working directory is inspected for the newest sample report
    and run log. The original command line is recovered from the log, the
    working directory / threads / sample list options are replaced, and the
    command is executed from the directory where it was first launched.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed options; the attributes read here are ``workdir``,
        ``initialWorkdir``, ``threads`` and ``runFailedSamples``.
    """
    print('\n' + '==========> Restarting ReMatCh <==========' + '\n')

    workdir = os.path.abspath(args.workdir)
    if not os.path.isdir(workdir):
        os.makedirs(workdir)

    initial_workdir = os.path.abspath(args.initialWorkdir)

    files_required = get_files_required(initial_workdir)

    samples_run = get_samples_run(files_required['sample_report']['file'])

    command, list_ids, taxon, threads, initial_present_directory = \
        get_rematch_command(files_required['run']['file'])

    samples_fastq = {}

    if list_ids is not None:
        # The command is re-executed from the logged present directory (see
        # os.chdir below), so a relative list path is resolved against it.
        if not os.path.isabs(list_ids):
            list_ids = os.path.join(initial_present_directory, list_ids)
        total_samples = get_list_ids_from_file(list_ids)
    elif taxon:
        total_samples = get_taxon_run_ids(files_required['IDs_list.seqFromWebTaxon']['file'])
    else:
        samples_fastq = search_fastq_files(initial_workdir)
        total_samples = list(samples_fastq.keys())

    # Samples that must NOT be analysed again: every sample already in the
    # report, or only those stored under the 'True' key when failed samples
    # are to be repeated.
    if args.runFailedSamples:
        samples_done = set(samples_run.get('True', []))
    else:
        samples_done = {sample for samples in samples_run.values() for sample in samples}

    # Plain difference (not symmetric difference): only samples that belong to
    # the requested set may be scheduled. dict.fromkeys removes duplicates
    # while keeping the input order.
    samples_to_run = list(dict.fromkeys(sample for sample in total_samples if sample not in samples_done))

    print(str(len(samples_to_run)) + ' samples out of ' + str(len(total_samples)) + ' will be analysed by'
                                                                                    ' ReMatCh' + '\n')

    samples_to_run_file = None
    if list_ids is not None or taxon:
        samples_to_run_file = write_samples_to_run(samples_to_run, workdir)
    else:
        set_samples_from_folders(samples_to_run, samples_fastq, workdir)

    command.extend(['-w', workdir])
    command.extend(['-j', str(threads) if args.threads is None else str(args.threads)])
    if samples_to_run_file is not None:
        command.extend(['-l', samples_to_run_file])

    print('ReMatCh will start in 5 seconds...')
    time.sleep(5)

    os.chdir(initial_present_directory)
    subprocess.call(command)
78 84
79 85
def write_samples_to_run(samples_to_run, workdir):
    """
    Write the sample IDs to run, one per line, into a file inside workdir.

    Parameters
    ----------
    samples_to_run : iterable of str
        Sample identifiers to be written.
    workdir : str
        Existing directory where the file is created.

    Returns
    -------
    str
        Path to ``restart_rematch.samples_to_run.txt`` inside ``workdir``.
    """
    samples_to_run_file = os.path.join(workdir, 'restart_rematch.samples_to_run.txt')
    with open(samples_to_run_file, 'wt') as writer:
        for sample in samples_to_run:
            writer.write(sample + '\n')
    return samples_to_run_file
86 92
87 93
def get_files_required(initial_workdir):
    """
    Locate the most recently modified file of each required kind.

    Non-hidden regular files directly inside ``initial_workdir`` are matched
    by name: the name must start with ``<prefix>.`` and end with
    ``.<extension>``. When several files match the same prefix, the one with
    the newest modification time is kept.

    Parameters
    ----------
    initial_workdir : str
        Directory to search.

    Returns
    -------
    dict
        Keys are 'sample_report', 'run' and 'IDs_list.seqFromWebTaxon'. Each
        value holds 'extension' and, only if a matching file was found,
        'file' (its path) and 'modification' (its mtime).
    """
    files_required = {'sample_report': {'extension': 'tab'},
                      'run': {'extension': 'log'},
                      'IDs_list.seqFromWebTaxon': {'extension': 'tab'}}

    for file_found in sorted(os.listdir(initial_workdir)):
        if file_found.startswith('.'):
            continue
        file_path = os.path.join(initial_workdir, file_found)
        if not os.path.isfile(file_path):
            continue

        file_modification = os.path.getmtime(file_path)
        for prefix, values in files_required.items():
            if not (file_found.startswith(prefix + '.') and file_found.endswith('.' + values['extension'])):
                continue
            # Keep the first match, then replace it only by a strictly newer file
            if 'file' not in values or file_modification > values['modification']:
                values['file'] = file_path
                values['modification'] = file_modification
    return files_required
104 114
105 115
def get_samples_run(sample_report_file):
    """
    Group the sample IDs found in a sample report by their second column.

    Empty lines and lines starting with '#' are ignored. Remaining lines are
    split on tabs: the first field is the sample ID and the second field is
    used as the grouping key.

    Parameters
    ----------
    sample_report_file : str
        Path to the tab-separated sample report.

    Returns
    -------
    dict
        Maps each distinct second-column value to the list of sample IDs
        (first column) that carry it, in file order.
    """
    samples_run = {}
    # Plain text mode already translates newlines; the 'U' flag is rejected by
    # open() since Python 3.11.
    with open(sample_report_file, 'rt') as reader:
        for line in reader:
            line = line.splitlines()[0]
            if len(line) == 0 or line.startswith('#'):
                continue
            sample_info = line.split('\t')
            if len(sample_info) < 2:
                # Row without a second column (e.g. a truncated line): it
                # cannot be assigned to a group, so it is not recorded.
                continue
            samples_run.setdefault(sample_info[1], []).append(sample_info[0])
    return samples_run
118 128
119 129
def get_rematch_command(log_file):
    """
    Recover the original command line and launch directory from a run log.

    The log is scanned for a line equal to 'COMMAND:' and a line equal to
    'PRESENT DIRECTORY:'; the first non-empty line after each marker is taken
    as the value. Reading stops once both values were captured.

    The command is then filtered so that it can be reused for a restart:
      * '-l'/'--listIDs': removed, value returned separately;
      * '-t'/'--taxon': removed together with the following non-option
        words, and reported through the returned flag;
      * '-w'/'--workdir': removed together with its value;
      * '-j'/'--threads': removed, value returned as int;
      * '--mlst': following non-option words are joined with single spaces
        into one argument;
      * any other option is kept, along with its value when the next token
        does not start with '-'.

    Parameters
    ----------
    log_file : str
        Path to the run log.

    Returns
    -------
    tuple
        (command, list_ids, taxon, threads, present_directory) where command
        is a list of str, list_ids is str or None, taxon is bool, threads is
        int or None and present_directory is the logged directory. If either
        marker value is missing from the log, command is empty and
        present_directory is still a bool placeholder.
    """
    variables = {'command': False, 'directory': False}
    # Plain text mode already translates newlines; the 'U' flag is rejected by
    # open() since Python 3.11.
    with open(log_file, 'rt') as reader:
        for line in reader:
            # A bool value means "not captured yet"; stop when none is left
            if not any(isinstance(value, bool) for value in variables.values()):
                break
            line = line.splitlines()[0]
            if len(line) == 0:
                continue
            if line == 'COMMAND:':
                variables['command'] = True
            elif line == 'PRESENT DIRECTORY:':
                variables['directory'] = True
            elif variables['command'] is True:
                variables['command'] = line.split(' ')
            elif variables['directory'] is True:
                variables['directory'] = line

    command = {'command': [], 'listIDs': None, 'taxon': False, 'threads': None}
    if all(not isinstance(value, bool) for value in variables.values()):
        tokens = variables['command']
        counter = 0
        # Each branch leaves counter on the LAST token it consumed; the single
        # increment at the end of the loop then moves to the next unread token,
        # so the option following a multi-word value is never skipped.
        while counter < len(tokens):
            token = tokens[counter]
            if not token.startswith('-'):
                command['command'].append(token)
            elif token in ('-t', '--taxon'):
                command['taxon'] = True
                while counter + 1 < len(tokens) and not tokens[counter + 1].startswith('-'):
                    counter += 1
            elif token in ('-l', '--listIDs'):
                command['listIDs'] = tokens[counter + 1]
                counter += 1
            elif token in ('-w', '--workdir'):
                counter += 1
            elif token in ('-j', '--threads'):
                command['threads'] = int(tokens[counter + 1])
                counter += 1
            elif token == '--mlst':
                species = []
                while counter + 1 < len(tokens) and not tokens[counter + 1].startswith('-'):
                    counter += 1
                    if len(tokens[counter]) > 0:
                        species.append(tokens[counter])
                command['command'].extend(['--mlst', ' '.join(species)])
            else:
                command['command'].append(token)
                if counter + 1 < len(tokens) and not tokens[counter + 1].startswith('-'):
                    command['command'].append(tokens[counter + 1])
                    counter += 1
            counter += 1
    return command['command'], command['listIDs'], command['taxon'], command['threads'], variables['directory']
179 189
def get_taxon_run_ids(ids_list_seq_from_web_taxon_file):
    """
    Read the IDs stored in the first column of a tab-separated file.

    Empty lines and lines starting with '#' are ignored.

    Parameters
    ----------
    ids_list_seq_from_web_taxon_file : str
        Path to the tab-separated file.

    Returns
    -------
    list of str
        First field of every remaining line, in file order.
    """
    list_ids = []
    # Plain text mode already translates newlines; the 'U' flag is rejected by
    # open() since Python 3.11.
    with open(ids_list_seq_from_web_taxon_file, 'rt') as reader:
        for line in reader:
            line = line.splitlines()[0]
            if len(line) > 0 and not line.startswith('#'):
                list_ids.append(line.split('\t')[0])
    return list_ids
191 201
def get_list_ids_from_file(list_ids_file):
    """
    Read a list of IDs, one per line, skipping empty lines.

    Parameters
    ----------
    list_ids_file : str
        Path to the text file.

    Returns
    -------
    list of str
        Every non-empty line (without its line terminator), in file order.
    """
    list_ids = []
    # Plain text mode already translates newlines; the 'U' flag is rejected by
    # open() since Python 3.11.
    with open(list_ids_file, 'rt') as lines:
        for line in lines:
            line = line.splitlines()[0]
            if len(line) > 0:
                list_ids.append(line)
    return list_ids
201 211
def search_fastq_files(initial_workdir):
    """
    Find the fastq files of every sample folder inside initial_workdir.

    Each non-hidden sub-directory is treated as one sample, named after the
    directory. Only non-hidden regular files ending in '.fastq.gz' or
    '.fq.gz' are considered:
      * one file: it is used as is;
      * two or more files: the first naming convention ('_R1_001.f' /
        '_R2_001.f', then '_1.f' / '_2.f') matched by exactly two files
        selects that pair;
      * otherwise a single file is selected, preferring one that carries no
        paired-end marker.
    Directories without fastq files are left out of the result.

    Parameters
    ----------
    initial_workdir : str
        Directory that contains the sample folders.

    Returns
    -------
    dict
        Maps sample name to a list with the path(s) of its fastq file(s).
    """
    files_extensions = ('.fastq.gz', '.fq.gz')
    pair_end_files_separation = [['_R1_001.f', '_R2_001.f'], ['_1.f', '_2.f']]

    list_ids = {}
    # Sorted listings make the selection independent of the file system order
    for directory_found in sorted(os.listdir(initial_workdir)):
        directory_path = os.path.join(initial_workdir, directory_found, '')
        if directory_found.startswith('.') or not os.path.isdir(directory_path):
            continue

        fastq_found = sorted(f for f in os.listdir(directory_path) if
                             not f.startswith('.') and
                             f.endswith(files_extensions) and
                             os.path.isfile(os.path.join(directory_path, f)))
        if len(fastq_found) == 0:
            continue

        if len(fastq_found) == 1:
            file_pair = fastq_found
        else:
            file_pair = []

            # Search pairs
            for pe_separation in pair_end_files_separation:
                candidates = [fastq for fastq in fastq_found if
                              pe_separation[0] in fastq or pe_separation[1] in fastq]
                if len(candidates) == 2:
                    file_pair = candidates
                    break

            # Search single
            if len(file_pair) == 0:
                markers = [marker for pe_separation in pair_end_files_separation for marker in pe_separation]
                unpaired = [fastq for fastq in fastq_found if not any(marker in fastq for marker in markers)]
                # Always keep a list: a bare string here would later be
                # iterated character by character.
                file_pair = [unpaired[0] if len(unpaired) > 0 else fastq_found[0]]

        list_ids[directory_found] = [os.path.join(directory_path, f) for f in file_pair]

    return list_ids
def set_samples_from_folders(samples_to_run, samples_fastq, workdir):
    """
    Create one folder per sample in workdir and symlink its fastq files there.

    An existing symbolic link with the same name is replaced. If a regular
    file with that name already exists, it is left in place and no link is
    created.

    Parameters
    ----------
    samples_to_run : iterable of str
        Names of the samples to prepare.
    samples_fastq : dict
        Maps every sample name in ``samples_to_run`` to the list of paths of
        its fastq files.
    workdir : str
        Existing directory where the sample folders are created.
    """
    for sample in samples_to_run:
        sample_dir = os.path.join(workdir, sample, '')
        if not os.path.isdir(sample_dir):
            os.mkdir(sample_dir)
        for file_found in samples_fastq[sample]:
            link_path = os.path.join(sample_dir, os.path.basename(file_found))
            # Drop a stale link first so that it can be recreated below
            if os.path.islink(link_path):
                os.remove(link_path)
            if not os.path.isfile(link_path):
                os.symlink(file_found, link_path)
260 275
261 276
def main():
    """Parse the command line options and restart the ReMatCh run."""
    parser = argparse.ArgumentParser(prog='restart_rematch.py', description='Restart a ReMatCh run abruptly terminated',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', help='Version information', action='version', version='%(prog)s v' + version)

    parser_required = parser.add_argument_group('Required options')
    parser_required.add_argument('-i', '--initialWorkdir', type=str, metavar='/path/to/initial/workdir/directory/',
                                 help='Path to the directory where ReMatCh was running', required=True)

    parser_optional_general = parser.add_argument_group('General facultative options')
    parser_optional_general.add_argument('-w', '--workdir', type=str, metavar='/path/to/workdir/directory/',
                                         help='Path to the directory where ReMatCh will run again', required=False,
                                         default='.')
    parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N',
                                         help='Number of threads to use instead of the ones set in initial ReMatCh run',
                                         required=False)
    parser_optional_general.add_argument('--runFailedSamples', action='store_true',
                                         help='Will run ReMatCh for those samples missing, as well as for samples that'
                                              ' did not run successfully in initial ReMatCh run')

    args = parser.parse_args()

    run_rematch(args)
277 300
278 301
# Run the command line interface only when executed as a script
if __name__ == "__main__":
    main()