diff detect_putative_ltr_wrapper.py @ 12:ff01d4263391 draft
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
| author | petr-novak |
|---|---|
| date | Thu, 21 Jul 2022 08:23:15 +0000 |
| parents | |
| children | 559940c04c44 |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/detect_putative_ltr_wrapper.py	Thu Jul 21 08:23:15 2022 +0000
@@ -0,0 +1,224 @@
+#!/usr/bin/env python
+"""This wrapper is intended to be used on large genomes and large DANTE input to
+minimize memory usage. It splits the input files into pieces and analyzes them one
+by one with detect_putative_ltr.R.
+If the input does not exceed the memory limit, detect_putative_ltr.R is run directly.
+"""
+
+import argparse
+import os
+import sys
+import tempfile
+from itertools import cycle
+import subprocess
+
+
+def get_arguments():
+    """
+    Get arguments from command line
+    :return:
+        args
+    """
+    parser = argparse.ArgumentParser(
+        description="""detect_putative_ltr_wrapper.py is a wrapper for
+        detect_putative_ltr.R""", formatter_class=argparse.RawTextHelpFormatter
+    )
+    parser.add_argument(
+        '-g', '--gff3', default=None, required=True, help="gff3 file", type=str,
+        action='store'
+    )
+    parser.add_argument(
+        '-s', '--reference_sequence', default=None, required=True,
+        help="reference sequence as fasta file", type=str, action='store'
+    )
+    parser.add_argument(
+        '-o', '--output', default=None, required=True,
+        help="output file path and prefix", type=str, action='store'
+    )
+    parser.add_argument(
+        '-c', '--cpu', default=1, required=False, help="number of CPUs", type=int,
+        action='store'
+    )
+    parser.add_argument(
+        '-M', '--max_missing_domains', default=0, required=False, type=int
+    )
+    parser.add_argument(
+        '-L', '--min_relative_length', default=0.6, required=False, type=float,
+        help="Minimum relative length of protein domain to be "
+             "considered for retrotransposon detection"
+    )
+    parser.add_argument(
+        '-S', '--max_chunk_size', default=100000000, required=False, type=int,
+        help='If the size of the reference sequence is greater than this value, the '
+             'reference is analyzed in chunks of approximately this size. This is '
+             'only an approximate value - sequences which are longer are not split, '
+             'default is %(default)s'
+    )
+    args = parser.parse_args()
+    return args
+
+
+def read_fasta_sequence_size(fasta_file):
+    """Read the size of each sequence into a dictionary"""
+    fasta_dict = {}
+    with open(fasta_file, 'r') as f:
+        for line in f:
+            if line[0] == '>':
+                header = line.strip().split(' ')[0][1:]  # remove part of name after space
+                fasta_dict[header] = 0
+            else:
+                fasta_dict[header] += len(line.strip())
+    return fasta_dict
+
+
+def make_temp_files(number_of_files):
+    """
+    Make named temporary files; the files will not be deleted upon exit!
+    :param number_of_files:
+    :return:
+        filepaths
+    """
+    temp_files = []
+    for i in range(number_of_files):
+        temp_files.append(tempfile.NamedTemporaryFile(delete=False).name)
+    return temp_files
+
+
+def sum_up_stats_files(files):
+    """
+    Sum up statistics files
+    :return:
+    """
+    new_statistics = {}
+    for file in files:
+        with open(file, 'r') as fh:
+            for line in fh:
+                items = line.strip().split('\t')
+                if items[0] == 'Classification':
+                    header = items
+                    continue
+                else:
+                    counts = [int(item) for item in items[1:]]
+                    if items[0] in new_statistics:
+                        new_statistics[items[0]] = [
+                            sum(x) for x in zip(new_statistics[items[0]], counts)
+                        ]
+                    else:
+                        new_statistics[items[0]] = counts
+    # convert to strings; the first line is the header
+    statistics_str = []
+    for classification, counts in new_statistics.items():
+        statistics_str.append(classification + '\t' + '\t'.join([str(x) for x in counts]))
+    sorted_stat_with_header = ['\t'.join(header)] + sorted(statistics_str)
+    return sorted_stat_with_header
+
+
+def main():
+    """
+    Main function
+    """
+    args = get_arguments()
+    # locate directory of current script
+    tool_path = os.path.dirname(os.path.realpath(__file__))
+    fasta_seq_size = read_fasta_sequence_size(args.reference_sequence)
+    total_size = sum(fasta_seq_size.values())
+    number_of_sequences = len(fasta_seq_size)
+    if total_size > args.max_chunk_size and number_of_sequences > 1:
+        # sort sequence IDs by size, largest first
+        seq_id_size_sorted = [i[0] for i in sorted(
+            fasta_seq_size.items(), key=lambda x: int(x[1]), reverse=True
+        )]
+        number_of_temp_files = int(total_size / args.max_chunk_size) + 1
+        if number_of_temp_files > number_of_sequences:
+            number_of_temp_files = number_of_sequences
+
+        temp_files_fasta = make_temp_files(number_of_temp_files)
+        file_handles = [open(temp_file, 'w') for temp_file in temp_files_fasta]
+        # make dictionary with sorted sequence IDs as keys and file handles as values
+        seq_id_file_handle_dict = dict(zip(seq_id_size_sorted, cycle(file_handles)))
+
+        # write sequences to temporary files
+        with open(args.reference_sequence, 'r') as f:
+            for line in f:
+                if line[0] == '>':
+                    header = line.strip().split(' ')[0][1:]
+                    print(header)
+                    seq_id_file_handle_dict[header].write(line)
+                else:
+                    seq_id_file_handle_dict[header].write(line)
+        # close file handles
+        for file_handle in file_handles:
+            file_handle.close()
+
+        # split gff3 file into temporary files -
+        # each temporary file will contain the gff lines matching its fasta chunk
+        temp_files_gff = make_temp_files(number_of_temp_files)
+        file_handles = [open(temp_file, 'w') for temp_file in temp_files_gff]
+        # make dictionary with sorted sequence IDs as keys and file handles as values
+        seq_id_file_handle_dict = dict(zip(seq_id_size_sorted, cycle(file_handles)))
+        # write gff lines to chunks
+        with open(args.gff3, 'r') as f:
+            for line in f:
+                if line[0] == '#':
+                    continue
+                else:
+                    header = line.strip().split('\t')[0]
+                    seq_id_file_handle_dict[header].write(line)
+        # close file handles
+        for file_handle in file_handles:
+            file_handle.close()
+
+        # run retrotransposon detection on each temporary file
+        output_files = make_temp_files(number_of_temp_files)
+        for i in range(number_of_temp_files):
+            print('Running retrotransposon detection on file ' + str(i))
+            subprocess.check_call(
+                [f'{tool_path}/detect_putative_ltr.R', '-s', temp_files_fasta[i], '-g',
+                 temp_files_gff[i], '-o', output_files[i], '-c', str(args.cpu), '-M',
+                 str(args.max_missing_domains), '-L', str(args.min_relative_length)]
+            )
+
+        # remove all temporary input files
+        for temp_file in temp_files_fasta + temp_files_gff:
+            os.remove(temp_file)
+
+        # concatenate output files
+        output_file_suffixes = ['_D.fasta', '_DL.fasta', '_DLT.fasta', '_DLTP.fasta',
+                                '_DLP.fasta', '.gff3', '_statistics.csv']
+
+        for suffix in output_file_suffixes:
+            if suffix == '_statistics.csv':
+                # sum up lines with the same classification in the first column
+                stat_files = [output_file + suffix for output_file in output_files]
+                new_statistics = sum_up_stats_files(stat_files)
+                with open(args.output + suffix, 'w') as f:
+                    f.write("\n".join(new_statistics))
+                # remove parsed temporary statistics files
+                for file in stat_files:
+                    os.remove(file)
+            else:
+                with open(args.output + suffix, 'w') as f:
+                    for i in range(number_of_temp_files):
+                        # some files may not exist, so we need to check
+                        try:
+                            with open(output_files[i] + suffix, 'r') as g:
+                                for line in g:
+                                    f.write(line)
+                            # remove the parsed temporary output file
+                            os.remove(output_files[i] + suffix)
+                        except FileNotFoundError:
+                            pass
+    else:
+        # no need to split sequences into chunks
+        subprocess.check_call(
+            [f'{tool_path}/detect_putative_ltr.R', '-s', args.reference_sequence, '-g',
+             args.gff3, '-o', args.output, '-c', str(args.cpu), '-M',
+             str(args.max_missing_domains), '-L', str(args.min_relative_length)]
+        )
+
+
+if __name__ == '__main__':
+    # check Python version - must be 3.6 or greater
+    if sys.version_info < (3, 6):
+        print('Python version must be 3.6 or greater')
+        sys.exit(1)
+    main()
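The chunking strategy in `main()` sorts sequences by length (longest first) and deals them out round-robin over the chunk files with `itertools.cycle`, so the total size per chunk stays roughly balanced around `--max_chunk_size`. The following is a minimal standalone sketch of that assignment logic (not part of the commit; the sequence names and sizes are made up), which can be useful for checking how many chunks a given genome would produce:

```python
# Standalone sketch mirroring the chunk-assignment logic in main().
# Sequence names and sizes below are hypothetical.
from itertools import cycle

fasta_seq_size = {'chr1': 90_000_000, 'chr2': 60_000_000,
                  'chr3': 40_000_000, 'scaffold_1': 5_000_000}
max_chunk_size = 100_000_000  # default of -S/--max_chunk_size

total_size = sum(fasta_seq_size.values())
number_of_sequences = len(fasta_seq_size)

# number of chunk files, capped at the number of sequences
number_of_temp_files = min(int(total_size / max_chunk_size) + 1, number_of_sequences)

# longest sequences first, then dealt out round-robin over the chunks
seq_id_size_sorted = sorted(fasta_seq_size, key=fasta_seq_size.get, reverse=True)
chunk_of = dict(zip(seq_id_size_sorted, cycle(range(number_of_temp_files))))

for chunk in range(number_of_temp_files):
    members = [s for s, c in chunk_of.items() if c == chunk]
    print(chunk, members, sum(fasta_seq_size[s] for s in members))
```

For this 195 Mbp toy genome and the default 100 Mbp chunk size, the sketch yields two chunks of roughly 130 Mbp (chr1 + chr3) and 65 Mbp (chr2 + scaffold_1); a single sequence larger than `--max_chunk_size` is never split, exactly as the `-S` help text states.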