annotate detect_putative_ltr_wrapper.py @ 12:ff01d4263391 draft

"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
author petr-novak
date Thu, 21 Jul 2022 08:23:15 +0000
parents
children 559940c04c44
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
12
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
2 """This wrapper is intended to be used on large genomes and large DANTE input to
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
3 minimize memory usage. It splits the input files into pieces and analyzes them one by one with
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
4 detect_putative_ltr.R
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
5 If the input does not exceed the memory limit, detect_putative_ltr.R is run directly.
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
6 """
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
7
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
8 import argparse
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
9 import os
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
10 import sys
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
11 import tempfile
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
12 from itertools import cycle
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
13 import subprocess
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
14
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
15
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
def get_arguments(argv=None):
    """
    Get arguments from command line

    :param argv:
        optional list of argument strings to parse; when None (default)
        argparse reads the arguments from sys.argv
    :return:
        args - argparse namespace with attributes gff3, reference_sequence,
        output, cpu, max_missing_domains, min_relative_length and
        max_chunk_size
    """
    parser = argparse.ArgumentParser(
        description="detect_putative_ltr_wrapper.py is a wrapper for "
                    "detect_putative_ltr.R",
        formatter_class=argparse.RawTextHelpFormatter
        )
    parser.add_argument(
        '-g', '--gff3', default=None, required=True, help="gff3 file", type=str,
        action='store'
        )
    parser.add_argument(
        '-s', '--reference_sequence', default=None, required=True,
        help="reference sequence as fasta file", type=str, action='store'
        )
    parser.add_argument(
        '-o', '--output', default=None, required=True,
        help="output file path and prefix", type=str, action='store'
        )
    parser.add_argument(
        '-c', '--cpu', default=1, required=False, help="number of CPUs", type=int,
        action='store'
        )
    parser.add_argument(
        '-M', '--max_missing_domains', default=0, required=False, type=int,
        help="value passed to detect_putative_ltr.R as -M, "
             "default is %(default)s"
        )
    parser.add_argument(
        '-L', '--min_relative_length', default=0.6, required=False, type=float,
        help="Minimum relative length of protein domain to be "
             "considered for retrotransposon detection"
        )
    parser.add_argument(
        '-S', '--max_chunk_size', default=100000000, required=False, type=int,
        help='If size of reference sequence is greater than this value, reference is '
             'analyzed in chunks of this size. This is just approximate value - '
             'sequences '
             'which are longer are not split, default is %(default)s'
        )
    args = parser.parse_args(argv)
    # the chunk size is used as a divisor when the number of chunks is
    # calculated, so zero or negative values can not be accepted
    if args.max_chunk_size <= 0:
        parser.error('--max_chunk_size must be a positive integer')
    return args
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
59
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
60
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
def read_fasta_sequence_size(fasta_file):
    """
    Read size of sequences into dictionary

    :param fasta_file: path to fasta file
    :return:
        dictionary with sequence id as key and sequence length as value;
        sequence id is the header text between '>' and the first space
    :raises ValueError: if sequence data are found before the first header
    """
    fasta_dict = {}
    header = None
    with open(fasta_file, 'r') as f:
        for line in f:
            if line.startswith('>'):
                # remove part of name after space
                header = line.strip().split(' ')[0][1:]
                fasta_dict[header] = 0
                continue
            residues = line.strip()
            if not residues:
                # blank lines do not contribute to the length and are
                # tolerated even before the first header
                continue
            if header is None:
                raise ValueError(
                    'Sequence data found before the first fasta header in '
                    + str(fasta_file)
                )
            fasta_dict[header] += len(residues)
    return fasta_dict
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
72
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
73
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
def make_temp_files(number_of_files):
    """
    Make named temporary files, file will not be deleted upon exit!

    The files are created empty and closed again, only their paths are kept.
    Caller is responsible for removing them.

    :param number_of_files: number of temporary files to create
    :return:
        filepaths - list of paths of the created files
    """
    temp_files = []
    for _ in range(number_of_files):
        # close the handle right away so that no open file descriptor is
        # left behind; delete=False keeps the file on disk after closing
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            temp_files.append(temp_file.name)
    return temp_files
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
85
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
86
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
def sum_up_stats_files(files):
    """
    Sum up statistics files

    Each file is a tab separated table. The line starting with
    'Classification' is taken as header, on all other lines the first column
    is a row name and the remaining columns are integer counts. Counts of rows
    with the same name are summed column by column over all files.

    :param files: list of paths to statistics files
    :return:
        list of tab separated lines - header first (if any was found),
        followed by the summed rows sorted alphabetically
    """
    header = None
    new_statistics = {}
    for stats_file in files:
        with open(stats_file, 'r') as fh:
            for line in fh:
                stripped = line.strip()
                if not stripped:
                    # blank line - nothing to sum up
                    continue
                items = stripped.split('\t')
                if items[0] == 'Classification':
                    header = items
                    continue
                counts = [int(item) for item in items[1:]]
                if items[0] in new_statistics:
                    new_statistics[items[0]] = [
                        sum(x) for x in zip(new_statistics[items[0]], counts)
                    ]
                else:
                    new_statistics[items[0]] = counts
    # convert to string, first line is header
    statistics_str = [
        classification + '\t' + '\t'.join([str(x) for x in counts])
        for classification, counts in new_statistics.items()
    ]
    sorted_stat = sorted(statistics_str)
    if header is None:
        # no header line in any of the files (or no files at all)
        return sorted_stat
    return ['\t'.join(header)] + sorted_stat
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
113
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
114
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
# suffixes of the files which detect_putative_ltr.R is expected to write
# next to the output prefix
_OUTPUT_FILE_SUFFIXES = ['_D.fasta', '_DL.fasta', '_DLT.fasta', '_DLTP.fasta',
                         '_DLP.fasta', '.gff3', '_statistics.csv']


def _iter_fasta_lines(fasta_path):
    """Yield (sequence id, line) for every line of a fasta file."""
    seq_id = None
    with open(fasta_path, 'r') as f:
        for line in f:
            if line.startswith('>'):
                seq_id = line.strip().split(' ')[0][1:]
            # lines before the first header belong to no sequence
            if seq_id is not None:
                yield seq_id, line


def _iter_gff3_lines(gff3_path):
    """Yield (sequence id, line) for every feature line of a gff3 file."""
    with open(gff3_path, 'r') as f:
        for line in f:
            # comment lines and blank lines are not copied to the chunks
            if line.startswith('#') or not line.strip():
                continue
            yield line.strip().split('\t')[0], line


def _write_chunks(records, chunk_paths, seq_ids):
    """Write every line to the chunk file assigned to its sequence id."""
    handles = []
    unknown_ids = set()
    try:
        for path in chunk_paths:
            handles.append(open(path, 'w'))
        # sequences are assigned to the chunks in round-robin manner, in the
        # order given by seq_ids
        handle_of = dict(zip(seq_ids, cycle(handles)))
        for seq_id, line in records:
            handle = handle_of.get(seq_id)
            if handle is None:
                # sequence id is not present in the reference - there is no
                # chunk to put the line into; warn once per id and skip it
                if seq_id not in unknown_ids:
                    unknown_ids.add(seq_id)
                    print('Warning: sequence ' + repr(seq_id)
                          + ' is not in the reference sequence, skipping',
                          file=sys.stderr)
                continue
            handle.write(line)
    finally:
        for handle in handles:
            handle.close()


def _remove_files(paths):
    """Remove files, paths which do not exist are silently ignored."""
    for path in paths:
        try:
            os.remove(path)
        except FileNotFoundError:
            pass


def _run_detection(tool_path, fasta, gff3, output, args):
    """Run detect_putative_ltr.R on a single fasta/gff3 pair."""
    subprocess.check_call(
        [f'{tool_path}/detect_putative_ltr.R', '-s', fasta, '-g',
         gff3, '-o', output, '-c', str(args.cpu), '-M',
         str(args.max_missing_domains), '-L', str(args.min_relative_length)]
        )


def _merge_outputs(chunk_outputs, output_prefix):
    """Combine the output files of all chunks into the final output files."""
    for suffix in _OUTPUT_FILE_SUFFIXES:
        parts = [chunk_output + suffix for chunk_output in chunk_outputs]
        if suffix == '_statistics.csv':
            # sum up line with same word in first column
            new_statistics = sum_up_stats_files(parts)
            with open(output_prefix + suffix, 'w') as f:
                f.write("\n".join(new_statistics))
        else:
            # concatenate output files
            with open(output_prefix + suffix, 'w') as f:
                for part in parts:
                    # some file may not exist, so we need to check
                    try:
                        with open(part, 'r') as g:
                            f.writelines(g)
                    except FileNotFoundError:
                        pass


def main():
    """
    Main function

    If the total length of the reference exceeds max_chunk_size and the
    reference contains more than one sequence, the sequences (and matching
    gff3 lines) are distributed to temporary chunk files - round-robin in order
    of decreasing sequence length - detect_putative_ltr.R is run on each chunk
    and the results are merged. Otherwise detect_putative_ltr.R is run directly
    on the input. All temporary files are removed, also when the run fails.
    """
    args = get_arguments()
    # locate directory of current script
    tool_path = os.path.dirname(os.path.realpath(__file__))
    fasta_seq_size = read_fasta_sequence_size(args.reference_sequence)
    total_size = sum(fasta_seq_size.values())
    number_of_sequences = len(fasta_seq_size)

    if total_size <= args.max_chunk_size or number_of_sequences <= 1:
        # no need to split sequences into chunks
        _run_detection(tool_path, args.reference_sequence, args.gff3,
                       args.output, args)
        return

    # sequence ids sorted by sequence length, longest first
    seq_id_size_sorted = sorted(
        fasta_seq_size, key=fasta_seq_size.get, reverse=True
    )
    number_of_temp_files = min(total_size // args.max_chunk_size + 1,
                               number_of_sequences)

    temp_files_fasta = []
    temp_files_gff = []
    output_files = []
    try:
        temp_files_fasta = make_temp_files(number_of_temp_files)
        temp_files_gff = make_temp_files(number_of_temp_files)
        output_files = make_temp_files(number_of_temp_files)

        # write sequences to temporary files
        _write_chunks(_iter_fasta_lines(args.reference_sequence),
                      temp_files_fasta, seq_id_size_sorted)
        # split gff3 file to temporary files -
        # each temporary file will contain gff lines matching fasta
        _write_chunks(_iter_gff3_lines(args.gff3),
                      temp_files_gff, seq_id_size_sorted)

        # run retrotransposon detection on each temporary file
        for i in range(number_of_temp_files):
            # flush so that the message precedes the output of the subprocess
            print('Running retrotransposon detection on file ' + str(i),
                  flush=True)
            _run_detection(tool_path, temp_files_fasta[i], temp_files_gff[i],
                           output_files[i], args)

        # remove all temporary input files before the outputs are merged
        _remove_files(temp_files_fasta + temp_files_gff)
        _merge_outputs(output_files, args.output)
    finally:
        # remove the temporary inputs, the placeholder files created for the
        # output prefixes and every per-chunk output file
        chunk_results = [output_file + suffix for output_file in output_files
                         for suffix in _OUTPUT_FILE_SUFFIXES]
        _remove_files(temp_files_fasta + temp_files_gff + output_files
                      + chunk_results)
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
217
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
218
ff01d4263391 "planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff changeset
if __name__ == '__main__':
    # check version of python must be 3.6 or greater
    if sys.version_info < (3, 6):
        # sys.exit with a string argument prints the message to stderr and
        # exits with status 1
        sys.exit('Python version must be 3.6 or greater')
    main()