Mercurial > repos > petr-novak > dante_ltr
annotate detect_putative_ltr_wrapper.py @ 12:ff01d4263391 draft
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
author | petr-novak |
---|---|
date | Thu, 21 Jul 2022 08:23:15 +0000 |
parents | |
children | 559940c04c44 |
rev | line source |
---|---|
12
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
1 #!/usr/bin/env python |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
2 """This wrapper is intended to be used on large genomes and large DANTE input to |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
3 minimize memory usage. It splits input files into pieces and analyzes them one by one with |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
4 detect_putative_ltr.R |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
5 If the total size of the input sequences does not exceed --max_chunk_size (or there is only one sequence), it runs detect_putative_ltr.R directly |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
6 """ |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
7 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
8 import argparse |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
9 import os |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
10 import sys |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
11 import tempfile |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
12 from itertools import cycle |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
13 import subprocess |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
14 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
15 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
def get_arguments(argv=None):
    """
    Get arguments from command line

    :param argv:
        optional list of argument strings to parse; the default ``None``
        makes argparse read ``sys.argv[1:]`` exactly as before
    :return:
        args - argparse.Namespace with attributes gff3, reference_sequence,
        output, cpu, max_missing_domains, min_relative_length and
        max_chunk_size
    """
    parser = argparse.ArgumentParser(
        description="""detect_putative_ltr_wrapper.py is a wrapper for
    detect_putative_ltr.R""", formatter_class=argparse.RawTextHelpFormatter
    )
    # required inputs / outputs
    parser.add_argument(
        '-g', '--gff3', default=None, required=True, help="gff3 file", type=str,
        action='store'
    )
    parser.add_argument(
        '-s', '--reference_sequence', default=None, required=True,
        help="reference sequence as fasta file", type=str, action='store'
    )
    parser.add_argument(
        '-o', '--output', default=None, required=True,
        help="output file path and prefix", type=str, action='store'
    )
    # optional tuning parameters, passed through to detect_putative_ltr.R
    parser.add_argument(
        '-c', '--cpu', default=1, required=False, help="number of CPUs", type=int,
        action='store'
    )
    parser.add_argument(
        '-M', '--max_missing_domains', default=0, required=False, type=int
    )
    parser.add_argument(
        '-L', '--min_relative_length', default=0.6, required=False, type=float,
        help="Minimum relative length of protein domain to be "
             "considered for retrostransposon detection"
    )
    # used only by this wrapper to decide whether and how to split the input
    parser.add_argument(
        '-S', '--max_chunk_size', default=100000000, required=False, type=int,
        help='If size of reference sequence is greater than this value, reference is '
             'analyzed in chunks of this size. This is just approximate value - '
             'sequences '
             'which are longer are are not split, default is %(default)s'
    )
    return parser.parse_args(argv)
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
59 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
60 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
def read_fasta_sequence_size(fasta_file):
    """Read size of each sequence in a fasta file into a dictionary

    :param fasta_file: path to the fasta file
    :return:
        dict mapping sequence id (header text between '>' and the first
        space) to the summed length of its whitespace-stripped sequence lines
    :raises ValueError: if sequence data appear before the first header line
    """
    fasta_dict = {}
    header = None
    with open(fasta_file, 'r') as f:
        for line in f:
            if line[0] == '>':
                header = line.strip().split(' ')[0][1:]  # remove part of name after space
                # a repeated header restarts the count for that id
                fasta_dict[header] = 0
                continue
            residues = len(line.strip())
            if not residues:
                # blank lines carry no sequence; skipping them also avoids
                # touching `header` before any record has started
                continue
            if header is None:
                raise ValueError(
                    'sequence data found before the first fasta header in '
                    + str(fasta_file)
                )
            fasta_dict[header] += residues
    return fasta_dict
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
72 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
73 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
def make_temp_files(number_of_files):
    """
    Make named temporary files, file will not be deleted upon exit!

    Each file is created empty and its handle is closed before returning,
    so callers can reopen or remove the paths freely.

    :param number_of_files: how many temporary files to create
    :return:
        filepaths - list of paths of the created files
    """
    temp_files = []
    for _ in range(number_of_files):
        # the context manager closes the handle right away; delete=False
        # keeps the file on disk for the caller
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            temp_files.append(tmp.name)
    return temp_files
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
85 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
86 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
def sum_up_stats_files(files):
    """
    Sum up statistics files

    Each file is read as tab-separated text. A line whose first column is
    'Classification' is taken as the header (the last one seen wins); every
    other non-blank line is a classification name followed by integer counts.
    Counts of lines sharing the same first column are added position by
    position.

    :param files: iterable of paths to statistics files
    :return:
        list of tab-joined strings: the header line first (omitted when no
        header line was found in any file), then the summed rows sorted
        alphabetically
    """
    header = None
    new_statistics = {}
    for file in files:
        with open(file, 'r') as fh:
            for line in fh:
                items = line.strip().split('\t')
                if items == ['']:
                    # blank line - would otherwise create a bogus '' row
                    continue
                if items[0] == 'Classification':
                    header = items
                    continue
                counts = [int(item) for item in items[1:]]
                if items[0] in new_statistics:
                    new_statistics[items[0]] = [
                        sum(x) for x in zip(new_statistics[items[0]], counts)
                    ]
                else:
                    new_statistics[items[0]] = counts
    # convert to string, first line is header
    statistics_str = sorted(
        classification + '\t' + '\t'.join([str(x) for x in counts])
        for classification, counts in new_statistics.items()
    )
    if header is None:
        # no header line in any input (or no inputs at all): return rows only
        return statistics_str
    return ['\t'.join(header)] + statistics_str
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
113 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
114 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
def _run_detect_putative_ltr(tool_path, fasta, gff3, output, args):
    """Run detect_putative_ltr.R on one fasta/gff3 pair; raises CalledProcessError on failure"""
    subprocess.check_call(
        [f'{tool_path}/detect_putative_ltr.R', '-s', fasta, '-g', gff3,
         '-o', output, '-c', str(args.cpu), '-M',
         str(args.max_missing_domains), '-L', str(args.min_relative_length)]
    )


def _remove_if_exists(path):
    """Delete path; a file that is already gone is not an error"""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def main():
    """
    Main function

    Reads the sequence sizes of the reference fasta. If the total size is at
    most --max_chunk_size, or there is only one sequence, detect_putative_ltr.R
    is run directly on the inputs. Otherwise the fasta and gff3 are split into
    matching temporary chunks, the R script is run on every chunk, and the
    per-chunk results are merged into files named <output><suffix>.
    All temporary files are removed, also when a step fails.
    """
    # local import: only this function needs it
    from contextlib import ExitStack

    args = get_arguments()
    # locate directory of current script
    tool_path = os.path.dirname(os.path.realpath(__file__))
    fasta_seq_size = read_fasta_sequence_size(args.reference_sequence)
    total_size = sum(fasta_seq_size.values())
    number_of_sequences = len(fasta_seq_size)

    if total_size <= args.max_chunk_size or number_of_sequences <= 1:
        # no need to split sequences into chunks
        _run_detect_putative_ltr(
            tool_path, args.reference_sequence, args.gff3, args.output, args
        )
        return

    # sequence ids ordered by decreasing size are dealt round-robin to chunks
    seq_id_size_sorted = sorted(
        fasta_seq_size, key=fasta_seq_size.get, reverse=True
    )
    number_of_temp_files = min(
        total_size // args.max_chunk_size + 1, number_of_sequences
    )
    chunk_of_seq = {
        seq_id: i % number_of_temp_files
        for i, seq_id in enumerate(seq_id_size_sorted)
    }

    output_file_suffixes = ['_D.fasta', '_DL.fasta', '_DLT.fasta', '_DLTP.fasta',
                            '_DLP.fasta', '.gff3', '_statistics.csv']
    temp_files_fasta = make_temp_files(number_of_temp_files)
    temp_files_gff = make_temp_files(number_of_temp_files)
    output_files = make_temp_files(number_of_temp_files)

    try:
        # write sequences to temporary files
        with ExitStack() as stack:
            file_handles = [
                stack.enter_context(open(temp_file, 'w'))
                for temp_file in temp_files_fasta
            ]
            with open(args.reference_sequence, 'r') as f:
                current = None
                for line in f:
                    if line[0] == '>':
                        header = line.strip().split(' ')[0][1:]
                        print(header)
                        current = file_handles[chunk_of_seq[header]]
                    # lines before the first header belong to no record
                    if current is not None:
                        current.write(line)

        # split gff3 file to temporary files -
        # each temporary file will contain gff lines matching fasta
        with ExitStack() as stack:
            file_handles = [
                stack.enter_context(open(temp_file, 'w'))
                for temp_file in temp_files_gff
            ]
            with open(args.gff3, 'r') as f:
                for line in f:
                    # comments/directives and blank lines carry no feature
                    if line[0] == '#' or not line.strip():
                        continue
                    header = line.strip().split('\t')[0]
                    file_handles[chunk_of_seq[header]].write(line)

        # run retrotransposon detection on each temporary file
        for i in range(number_of_temp_files):
            print('Running retrotransposon detection on file ' + str(i))
            _run_detect_putative_ltr(
                tool_path, temp_files_fasta[i], temp_files_gff[i],
                output_files[i], args
            )

        # remove all temporary input files before merging the results
        for temp_file in temp_files_fasta + temp_files_gff:
            _remove_if_exists(temp_file)

        # concatenate output files
        for suffix in output_file_suffixes:
            if suffix == '_statistics.csv':
                # sum up line with same word in first column
                stat_files = [output_file + suffix for output_file in output_files]
                new_statistics = sum_up_stats_files(stat_files)
                with open(args.output + suffix, 'w') as f:
                    f.write("\n".join(new_statistics))
                # remove parsed temporary statistics files
                for file in stat_files:
                    _remove_if_exists(file)
            else:
                with open(args.output + suffix, 'w') as f:
                    for output_file in output_files:
                        # some file may not exist, so we need to check
                        try:
                            with open(output_file + suffix, 'r') as g:
                                for line in g:
                                    f.write(line)
                        except FileNotFoundError:
                            continue
                        # remove the parsed per-chunk file itself (not the
                        # placeholder path without suffix)
                        os.remove(output_file + suffix)
    finally:
        # clean up whatever is left: inputs, placeholder output paths and
        # any per-chunk result not merged because of an earlier failure
        leftovers = temp_files_fasta + temp_files_gff + output_files + [
            output_file + suffix
            for output_file in output_files
            for suffix in output_file_suffixes
        ]
        for path in leftovers:
            _remove_if_exists(path)
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
217 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
218 |
ff01d4263391
"planemo upload commit 414119ad7c44562d2e956b765e97ca113bc35b2b-dirty"
petr-novak
parents:
diff
changeset
|
if __name__ == '__main__':
    # check version of python must be 3.6 or greater
    # NOTE: this file uses f-strings, so an interpreter older than 3.6 fails
    # with a SyntaxError while compiling the file, before this check can run
    if sys.version_info < (3, 6):
        # sys.exit with a string writes it to stderr and exits with status 1
        sys.exit('Python version must be 3.6 or greater')
    main()