diff scripts/ReMatCh/utils/restart_rematch.py @ 3:0cbed1c0a762 draft default tip

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Tue, 28 Jan 2020 10:42:31 -0500
parents 965517909457
children
line wrap: on
line diff
--- a/scripts/ReMatCh/utils/restart_rematch.py	Wed Jan 22 09:10:12 2020 -0500
+++ b/scripts/ReMatCh/utils/restart_rematch.py	Tue Jan 28 10:42:31 2020 -0500
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 # -*- coding: utf-8 -*-
 
@@ -6,9 +6,9 @@
 restart_rematch.py - Restarts a ReMatCh run abruptly terminated
 <https://github.com/B-UMMI/ReMatCh/>
 
-Copyright (C) 2017 Miguel Machado <mpmachado@medicina.ulisboa.pt>
+Copyright (C) 2018 Miguel Machado <mpmachado@medicina.ulisboa.pt>
 
-Last modified: February 09, 2017
+Last modified: October 15, 2018
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -33,248 +33,271 @@
 version = '0.1'
 
 
-def runRematch(args):
-	print '\n' + '==========> Restarting ReMatCh <==========' + '\n'
+def run_rematch(args):
+    print('\n' + '==========> Restarting ReMatCh <==========' + '\n')
 
-	workdir = os.path.abspath(args.workdir)
-	if not os.path.isdir(workdir):
-		os.makedirs(workdir)
+    workdir = os.path.abspath(args.workdir)
+    if not os.path.isdir(workdir):
+        os.makedirs(workdir)
 
-	initialWorkdir = os.path.abspath(args.initialWorkdir)
+    initial_workdir = os.path.abspath(args.initialWorkdir)
 
-	files_required = get_files_required(initialWorkdir)
+    files_required = get_files_required(initial_workdir)
 
-	samples_run = get_samples_run(files_required['sample_report']['file'])
+    samples_run = get_samples_run(files_required['sample_report']['file'])
 
-	command, list_ids, taxon, threads, initial_present_directory = get_rematch_command(files_required['run']['file'])
+    command, list_ids, taxon, threads, initial_present_directory = get_rematch_command(files_required['run']['file'])
+
+    samples_fastq = {}
 
-	if list_ids is not None:
-		total_samples = getListIDs_fromFile(list_ids)
-	elif taxon:
-		total_samples = getTaxonRunIDs(files_required['IDs_list.seqFromWebTaxon']['file'])
-	else:
-		samples_fastq = searchFastqFiles(initialWorkdir)
-		total_samples = samples_fastq.keys()
+    if list_ids is not None:
+        total_samples = get_list_ids_from_file(list_ids)
+    elif taxon:
+        total_samples = get_taxon_run_ids(files_required['IDs_list.seqFromWebTaxon']['file'])
+    else:
+        samples_fastq = search_fastq_files(initial_workdir)
+        total_samples = list(samples_fastq.keys())
 
-	samples_to_run = list(set(total_samples).symmetric_difference(set(sum(samples_run.values(), []) if not args.runFailedSamples else samples_run['True'] if 'True' in samples_run else [''])))
+    samples_to_run = list(set(total_samples).symmetric_difference(set(sum(list(samples_run.values()), []) if
+                                                                      not args.runFailedSamples else
+                                                                      samples_run['True'] if
+                                                                      'True' in samples_run else [''])))
 
-	print str(len(samples_to_run)) + ' samples out of ' + str(len(total_samples)) + ' will be analysed by ReMatCh' + '\n'
+    print(str(len(samples_to_run)) + ' samples out of ' + str(len(total_samples)) + ' will be analysed by'
+                                                                                    ' ReMatCh' + '\n')
 
-	if list_ids is not None or taxon:
-		samples_to_run_file = write_samples_to_run(samples_to_run, workdir)
-	else:
-		setSamples_fromFolders(samples_to_run, samples_fastq, workdir)
+    if list_ids is not None or taxon:
+        samples_to_run_file = write_samples_to_run(samples_to_run, workdir)
+    else:
+        set_samples_from_folders(samples_to_run, samples_fastq, workdir)
 
-	command.extend(['-w', workdir])
-	command.extend(['-j', str(threads) if args.threads is None else str(args.threads)])
-	if list_ids is not None or taxon:
-		command.extend(['-l', samples_to_run_file])
+    command.extend(['-w', workdir])
+    command.extend(['-j', str(threads) if args.threads is None else str(args.threads)])
+    if list_ids is not None or taxon:
+        command.extend(['-l', samples_to_run_file])
 
-	print 'ReMatCh will start in 5 seconds...'
-	time.sleep(5)
+    print('ReMatCh will start in 5 seconds...')
+    time.sleep(5)
 
-	os.chdir(initial_present_directory)
-	subprocess.call(command)
+    os.chdir(initial_present_directory)
+    subprocess.call(command)
 
 
 def write_samples_to_run(samples_to_run, workdir):
-	samples_to_run_file = os.path.join(workdir, 'restart_rematch.samples_to_run.txt')
-	with open(samples_to_run_file, 'wt') as writer:
-		for sample in samples_to_run:
-			writer.write(sample + '\n')
-	return samples_to_run_file
+    samples_to_run_file = os.path.join(workdir, 'restart_rematch.samples_to_run.txt')
+    with open(samples_to_run_file, 'wt') as writer:
+        for sample in samples_to_run:
+            writer.write(sample + '\n')
+    return samples_to_run_file
 
 
-def get_files_required(initialWorkdir):
-	files_required = {'sample_report': {'extension': 'tab'}, 'run': {'extension': 'log'}, 'IDs_list.seqFromWebTaxon': {'extension': 'tab'}}
-	files = sorted([f for f in os.listdir(initialWorkdir) if not f.startswith('.') and os.path.isfile(os.path.join(initialWorkdir, f))])
-	for file_found in files:
-		file_path = os.path.join(initialWorkdir, file_found)
-		file_modification = os.path.getmtime(file_path)
-		for prefix, values in files_required.items():
-			if file_found.startswith(prefix + '.') and file_found.endswith('.' + values['extension']):
-				if 'file' not in values:
-					files_required[prefix]['file'] = file_path
-					files_required[prefix]['modification'] = file_modification
-				else:
-					if file_modification > files_required[prefix]['modification']:
-						files_required[prefix]['file'] = file_path
-						files_required[prefix]['modification'] = file_modification
-	return files_required
+def get_files_required(initial_workdir):
+    files_required = {'sample_report': {'extension': 'tab'},
+                      'run': {'extension': 'log'},
+                      'IDs_list.seqFromWebTaxon': {'extension': 'tab'}}
+    files = sorted([f for f in os.listdir(initial_workdir) if
+                    not f.startswith('.') and
+                    os.path.isfile(os.path.join(initial_workdir, f))])
+    for file_found in files:
+        file_path = os.path.join(initial_workdir, file_found)
+        file_modification = os.path.getmtime(file_path)
+        for prefix, values in list(files_required.items()):
+            if file_found.startswith(prefix + '.') and file_found.endswith('.' + values['extension']):
+                if 'file' not in values:
+                    files_required[prefix]['file'] = file_path
+                    files_required[prefix]['modification'] = file_modification
+                else:
+                    if file_modification > files_required[prefix]['modification']:
+                        files_required[prefix]['file'] = file_path
+                        files_required[prefix]['modification'] = file_modification
+    return files_required
 
 
 def get_samples_run(sample_report_file):
-	samples_run = {}
-	with open(sample_report_file, 'rtU') as reader:
-		for line in reader:
-			line = line.splitlines()[0]
-			if len(line) > 0:
-				if not line.startswith('#'):
-					sample_info = line.split('\t')
-					if sample_info[1] not in samples_run:
-						samples_run[sample_info[1]] = []
-					samples_run[sample_info[1]].append(sample_info[0])
-	return samples_run
+    samples_run = {}
+    with open(sample_report_file, 'rtU') as reader:
+        for line in reader:
+            line = line.splitlines()[0]
+            if len(line) > 0:
+                if not line.startswith('#'):
+                    sample_info = line.split('\t')
+                    if sample_info[1] not in samples_run:
+                        samples_run[sample_info[1]] = []
+                    samples_run[sample_info[1]].append(sample_info[0])
+    return samples_run
 
 
 def get_rematch_command(log_file):
-	variables = {'command': False, 'directory': False}
-	with open(log_file, 'rtU') as reader:
-		for line in reader:
-			if any([isinstance(value, bool) for value in variables.values()]):
-				line = line.splitlines()[0]
-				if len(line) > 0:
-					if line == 'COMMAND:':
-						variables['command'] = True
-					elif line == 'PRESENT DIRECTORY:':
-						variables['directory'] = True
-					else:
-						if variables['command'] is True:
-							variables['command'] = line.split(' ')
-						elif variables['directory'] is True:
-							variables['directory'] = line
-			else:
-				break
-	command = {'command': [], 'listIDs': None, 'taxon': False, 'threads': None}
-	if all([not isinstance(value, bool) for value in variables.values()]):
-		counter = 0
-		while counter < len(variables['command']):
-			if variables['command'][counter].startswith('-'):
-				if variables['command'][counter] not in ('-t', '--taxon'):
-					if variables['command'][counter] in ('-l', '--listIDs'):
-						command['listIDs'] = variables['command'][counter + 1]
-						counter += 1
-					elif variables['command'][counter] in ('-w', '--workdir'):
-						counter += 1
-					elif variables['command'][counter] in ('-j', '--threads'):
-						command['threads'] = int(variables['command'][counter + 1])
-						counter += 1
-					elif variables['command'][counter] == '--mlst':
-						species = []
-						counter += 1
-						while counter < len(variables['command']) and not variables['command'][counter].startswith('-'):
-							if len(variables['command'][counter]) > 0:
-								species.append(variables['command'][counter])
-							counter += 1
-						command['command'].extend(['--mlst', ' '.join(species)])
-					else:
-						command['command'].append(variables['command'][counter])
-						if counter + 1 < len(variables['command']) and not variables['command'][counter + 1].startswith('-'):
-							command['command'].append(variables['command'][counter + 1])
-							counter += 1
-				else:
-					command['taxon'] = True
-					for i in range(counter, len(variables['command'])):
-						if i + 1 < len(variables['command']):
-							if variables['command'][i + 1].startswith('-'):
-								counter = i
-								break
-						else:
-							counter = i
-			else:
-				command['command'].append(variables['command'][counter])
-			counter += 1
-	return command['command'], command['listIDs'], command['taxon'], command['threads'], variables['directory']
+    variables = {'command': False, 'directory': False}
+    with open(log_file, 'rtU') as reader:
+        for line in reader:
+            if any([isinstance(value, bool) for value in list(variables.values())]):
+                line = line.splitlines()[0]
+                if len(line) > 0:
+                    if line == 'COMMAND:':
+                        variables['command'] = True
+                    elif line == 'PRESENT DIRECTORY:':
+                        variables['directory'] = True
+                    else:
+                        if variables['command'] is True:
+                            variables['command'] = line.split(' ')
+                        elif variables['directory'] is True:
+                            variables['directory'] = line
+            else:
+                break
+    command = {'command': [], 'listIDs': None, 'taxon': False, 'threads': None}
+    if all([not isinstance(value, bool) for value in list(variables.values())]):
+        counter = 0
+        while counter < len(variables['command']):
+            if variables['command'][counter].startswith('-'):
+                if variables['command'][counter] not in ('-t', '--taxon'):
+                    if variables['command'][counter] in ('-l', '--listIDs'):
+                        command['listIDs'] = variables['command'][counter + 1]
+                        counter += 1
+                    elif variables['command'][counter] in ('-w', '--workdir'):
+                        counter += 1
+                    elif variables['command'][counter] in ('-j', '--threads'):
+                        command['threads'] = int(variables['command'][counter + 1])
+                        counter += 1
+                    elif variables['command'][counter] == '--mlst':
+                        species = []
+                        counter += 1
+                        while counter < len(variables['command']) and not variables['command'][counter].startswith('-'):
+                            if len(variables['command'][counter]) > 0:
+                                species.append(variables['command'][counter])
+                            counter += 1
+                        command['command'].extend(['--mlst', ' '.join(species)])
+                    else:
+                        command['command'].append(variables['command'][counter])
+                        if counter + 1 < len(variables['command']) and \
+                                not variables['command'][counter + 1].startswith('-'):
+                            command['command'].append(variables['command'][counter + 1])
+                            counter += 1
+                else:
+                    command['taxon'] = True
+                    for i in range(counter, len(variables['command'])):
+                        if i + 1 < len(variables['command']):
+                            if variables['command'][i + 1].startswith('-'):
+                                counter = i
+                                break
+                        else:
+                            counter = i
+            else:
+                command['command'].append(variables['command'][counter])
+            counter += 1
+    return command['command'], command['listIDs'], command['taxon'], command['threads'], variables['directory']
+
+
+def get_taxon_run_ids(ids_list_seq_from_web_taxon_file):
+    list_ids = []
+    with open(ids_list_seq_from_web_taxon_file, 'rtU') as reader:
+        for line in reader:
+            line = line.splitlines()[0]
+            if len(line) > 0:
+                if not line.startswith('#'):
+                    line = line.split('\t')
+                    list_ids.append(line[0])
+    return list_ids
 
 
-def getTaxonRunIDs(IDs_list_seqFromWebTaxon_file):
-	list_ids = []
-	with open(IDs_list_seqFromWebTaxon_file, 'rtU') as reader:
-		for line in reader:
-			line = line.splitlines()[0]
-			if len(line) > 0:
-				if not line.startswith('#'):
-					line = line.split('\t')
-					list_ids.append(line[0])
-	return list_ids
-
-
-def getListIDs_fromFile(listIDs_file):
-	list_ids = []
-	with open(listIDs_file, 'rtU') as lines:
-		for line in lines:
-			line = line.splitlines()[0]
-			if len(line) > 0:
-				list_ids.append(line)
-	return list_ids
+def get_list_ids_from_file(list_ids_file):
+    list_ids = []
+    with open(list_ids_file, 'rtU') as lines:
+        for line in lines:
+            line = line.splitlines()[0]
+            if len(line) > 0:
+                list_ids.append(line)
+    return list_ids
 
 
-def searchFastqFiles(initialWorkdir):
-	filesExtensions = ['.fastq.gz', '.fq.gz']
-	pairEnd_filesSeparation = [['_R1_001.f', '_R2_001.f'], ['_1.f', '_2.f']]
+def search_fastq_files(initial_workdir):
+    files_extensions = ['.fastq.gz', '.fq.gz']
+    pair_end_files_separation = [['_R1_001.f', '_R2_001.f'], ['_1.f', '_2.f']]
 
-	list_ids = {}
-	directories = [d for d in os.listdir(initialWorkdir) if not d.startswith('.') and os.path.isdir(os.path.join(initialWorkdir, d, ''))]
-	for directory_found in directories:
-		directory_path = os.path.join(initialWorkdir, directory_found, '')
+    list_ids = {}
+    directories = [d for d in os.listdir(initial_workdir) if
+                   not d.startswith('.') and
+                   os.path.isdir(os.path.join(initial_workdir, d, ''))]
+    for directory_found in directories:
+        directory_path = os.path.join(initial_workdir, directory_found, '')
 
-		fastqFound = []
-		files = [f for f in os.listdir(directory_path) if not f.startswith('.') and os.path.isfile(os.path.join(directory_path, f))]
-		for file_found in files:
-			if file_found.endswith(tuple(filesExtensions)):
-				fastqFound.append(file_found)
+        fastq_found = []
+        files = [f for f in os.listdir(directory_path) if
+                 not f.startswith('.') and
+                 os.path.isfile(os.path.join(directory_path, f))]
+        for file_found in files:
+            if file_found.endswith(tuple(files_extensions)):
+                fastq_found.append(file_found)
 
-		if len(fastqFound) == 1:
-			list_ids[directory_found] = [os.path.join(directory_path, f) for f in fastqFound]
-		elif len(fastqFound) >= 2:
-			file_pair = []
+        if len(fastq_found) == 1:
+            list_ids[directory_found] = [os.path.join(directory_path, f) for f in fastq_found]
+        elif len(fastq_found) >= 2:
+            file_pair = []
 
-			# Search pairs
-			for PE_separation in pairEnd_filesSeparation:
-				for fastq in fastqFound:
-					if PE_separation[0] in fastq or PE_separation[1] in fastq:
-						file_pair.append(fastq)
+            # Search pairs
+            for pe_separation in pair_end_files_separation:
+                for fastq in fastq_found:
+                    if pe_separation[0] in fastq or pe_separation[1] in fastq:
+                        file_pair.append(fastq)
 
-				if len(file_pair) == 2:
-					break
-				else:
-					file_pair = []
+                if len(file_pair) == 2:
+                    break
+                else:
+                    file_pair = []
 
-			# Search single
-			if len(file_pair) == 0:
-				for PE_separation in pairEnd_filesSeparation:
-					for fastq in fastqFound:
-						if PE_separation[0] not in fastq or PE_separation[1] not in fastq:
-							file_pair.append(fastq)
+            # Search single
+            if len(file_pair) == 0:
+                for pe_separation in pair_end_files_separation:
+                    for fastq in fastq_found:
+                        if pe_separation[0] not in fastq or pe_separation[1] not in fastq:
+                            file_pair.append(fastq)
 
-				if len(file_pair) >= 1:
-					file_pair = file_pair[0]
+                if len(file_pair) >= 1:
+                    file_pair = file_pair[0]
 
-			if len(file_pair) >= 1:
-				list_ids[directory_found] = [os.path.join(directory_path, f) for f in file_pair]
+            if len(file_pair) >= 1:
+                list_ids[directory_found] = [os.path.join(directory_path, f) for f in file_pair]
 
-	return list_ids
+    return list_ids
 
 
-def setSamples_fromFolders(samples_to_run, samples_fastq, workdir):
-	for sample in samples_to_run:
-		sample_dir = os.path.join(workdir, sample, '')
-		if not os.path.isdir(sample_dir):
-			os.mkdir(sample_dir)
-		for file_found in samples_fastq[sample]:
-			link_path = os.path.join(sample_dir, os.path.basename(file_found))
-			if os.path.islink(link_path):
-				os.remove(link_path)
-			if not os.path.isfile(link_path):
-				os.symlink(file_found, link_path)
+def set_samples_from_folders(samples_to_run, samples_fastq, workdir):
+    for sample in samples_to_run:
+        sample_dir = os.path.join(workdir, sample, '')
+        if not os.path.isdir(sample_dir):
+            os.mkdir(sample_dir)
+        for file_found in samples_fastq[sample]:
+            link_path = os.path.join(sample_dir, os.path.basename(file_found))
+            if os.path.islink(link_path):
+                os.remove(link_path)
+            if not os.path.isfile(link_path):
+                os.symlink(file_found, link_path)
 
 
 def main():
-	parser = argparse.ArgumentParser(prog='restart_rematch.py', description='Restart a ReMatCh run abruptly terminated', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-	parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version))
+    parser = argparse.ArgumentParser(prog='restart_rematch.py', description='Restart a ReMatCh run abruptly terminated',
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version))
 
-	parser_required = parser.add_argument_group('Required options')
-	parser_required.add_argument('-i', '--initialWorkdir', type=str, metavar='/path/to/initial/workdir/directory/', help='Path to the directory where ReMatCh was running', required=True)
+    parser_required = parser.add_argument_group('Required options')
+    parser_required.add_argument('-i', '--initialWorkdir', type=str, metavar='/path/to/initial/workdir/directory/',
+                                 help='Path to the directory where ReMatCh was running', required=True)
 
-	parser_optional_general = parser.add_argument_group('General facultative options')
-	parser_optional_general.add_argument('-w', '--workdir', type=str, metavar='/path/to/workdir/directory/', help='Path to the directory where ReMatCh will run again', required=False, default='.')
-	parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N', help='Number of threads to use instead of the ones set in initial ReMatCh run', required=False)
-	parser_optional_general.add_argument('--runFailedSamples', action='store_true', help='Will run ReMatCh for those samples missing, as well as for samples that did not run successfully in initial ReMatCh run')
+    parser_optional_general = parser.add_argument_group('General facultative options')
+    parser_optional_general.add_argument('-w', '--workdir', type=str, metavar='/path/to/workdir/directory/',
+                                         help='Path to the directory where ReMatCh will run again', required=False,
+                                         default='.')
+    parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N',
+                                         help='Number of threads to use instead of the ones set in initial ReMatCh run',
+                                         required=False)
+    parser_optional_general.add_argument('--runFailedSamples', action='store_true',
+                                         help='Will run ReMatCh for those samples missing, as well as for samples that'
+                                              ' did not run successfully in initial ReMatCh run')
 
-	args = parser.parse_args()
+    args = parser.parse_args()
 
-	runRematch(args)
+    run_rematch(args)
 
 
 if __name__ == "__main__":
-	main()
+    main()