annotate dante.py @ 20:31449f1183d8 draft

Uploaded
author petr-novak
date Tue, 24 Sep 2019 08:04:09 -0400
parents 039c45c01b47
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python3
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
2
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
3 import numpy as np
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
4 import subprocess
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
5 import math
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
6 import time
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
7 from operator import itemgetter
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
8 from collections import Counter
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
9 from itertools import groupby
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
10 import os
10
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
11 import re
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
12 import configuration
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
13 from tempfile import NamedTemporaryFile
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
14 import sys
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
15 import warnings
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
16 import shutil
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
17 from collections import defaultdict
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
18
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
19 np.set_printoptions(threshold=sys.maxsize)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
20
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
21 def alignment_scoring():
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
22 ''' Create hash table for alignment similarity counting: for every
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
23 combination of aminoacids in alignment assign score from protein
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
24 scoring matrix defined in configuration file '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
25 score_dict = {}
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
26 with open(configuration.SC_MATRIX) as smatrix:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
27 count = 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
28 for line in smatrix:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
29 if not line.startswith("#"):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
30 if count == 1:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
31 aa_all = line.rstrip().replace(" ", "")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
32 else:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
33 count_aa = 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
34 line = list(filter(None, line.rstrip().split(" ")))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
35 for aa in aa_all:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
36 score_dict["{}{}".format(line[0], aa)] = line[count_aa]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
37 count_aa += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
38 count += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
39 return score_dict
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
40
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
41
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
42 def characterize_fasta(QUERY, WIN_DOM):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
43 ''' Find the sequences, their lengths, starts, ends and if
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
44 they exceed the window '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
45 with open(QUERY) as query:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
46 headers = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
47 fasta_lengths = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
48 seq_starts = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
49 seq_ends = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
50 fasta_chunk_len = 0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
51 count_line = 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
52 for line in query:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
53 line = line.rstrip()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
54 if line.startswith(">"):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
55 headers.append(line.rstrip())
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
56 fasta_lengths.append(fasta_chunk_len)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
57 fasta_chunk_len = 0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
58 seq_starts.append(count_line + 1)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
59 seq_ends.append(count_line - 1)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
60 else:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
61 fasta_chunk_len += len(line)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
62 count_line += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
63 seq_ends.append(count_line)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
64 seq_ends = seq_ends[1:]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
65 fasta_lengths.append(fasta_chunk_len)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
66 fasta_lengths = fasta_lengths[1:]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
67 # control if there are correct (unique) names for individual seqs:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
68 # LASTAL takes seqs IDs till the first space which can then create problems with ambiguous records
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
69 if len(headers) > len(set([header.split(" ")[0] for header in headers
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
70 ])):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
71 raise NameError(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
72 '''Sequences in multifasta format are not named correctly:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
73 seq IDs (before the first space) are the same''')
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
74
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
75 above_win = [idx
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
76 for idx, value in enumerate(fasta_lengths) if value > WIN_DOM]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
77 below_win = [idx
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
78 for idx, value in enumerate(fasta_lengths)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
79 if value <= WIN_DOM]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
80 lens_above_win = np.array(fasta_lengths)[above_win]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
81 return headers, above_win, below_win, lens_above_win, seq_starts, seq_ends
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
82
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
83
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
84 def split_fasta(QUERY, WIN_DOM, step, headers, above_win, below_win,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
85 lens_above_win, seq_starts, seq_ends):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
86 ''' Create temporary file containing all sequences - the ones that exceed
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
87 the window are cut with a set overlap (greater than domain size with a reserve) '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
88 with open(QUERY, "r") as query:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
89 count_fasta_divided = 0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
90 count_fasta_not_divided = 0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
91 ntf = NamedTemporaryFile(delete=False)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
92 divided = np.array(headers)[above_win]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
93 row_length = configuration.FASTA_LINE
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
94 for line in query:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
95 line = line.rstrip()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
96 if line.startswith(">") and line in divided:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
97 stop_line = seq_ends[above_win[
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
98 count_fasta_divided]] - seq_starts[above_win[
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
99 count_fasta_divided]] + 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
100 count_line = 0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
101 whole_seq = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
102 for line2 in query:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
103 whole_seq.append(line2.rstrip())
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
104 count_line += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
105 if count_line == stop_line:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
106 break
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
107 whole_seq = "".join(whole_seq)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
108 ## create list of starting positions for individual parts of a seq with a step given by a window and overlap
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
109 windows_starts = list(range(0, lens_above_win[
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
110 count_fasta_divided], step))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
111 ## create list of ending positions (starting pos + window), the last element is the whole seq length
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
112 windows_ends = [
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
113 x + WIN_DOM
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
114 if x + WIN_DOM < lens_above_win[count_fasta_divided] else
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
115 lens_above_win[count_fasta_divided] for x in windows_starts
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
116 ]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
117 count_part = 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
118 for start_part, end_part in zip(windows_starts, windows_ends):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
119 seq_part = whole_seq[start_part:end_part]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
120 if count_part == len(windows_starts):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
121 ntf.write("{}_DANTE_PART{}_LAST:{}-{}\n{}\n".format(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
122 line.split(" ")[0], count_part, start_part + 1,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
123 end_part, "\n".join([seq_part[i:i + row_length]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
124 for i in range(0, len(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
125 seq_part), row_length)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
126 ])).encode("utf-8"))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
127 else:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
128 ntf.write("{}_DANTE_PART{}:{}-{}\n{}\n".format(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
129 line.split(" ")[0], count_part, start_part + 1,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
130 end_part, "\n".join([seq_part[i:i + row_length]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
131 for i in range(0, len(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
132 seq_part), row_length)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
133 ])).encode("utf-8"))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
134 count_part += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
135 count_fasta_divided += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
136 elif line.startswith(">") and line not in divided:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
137 length_seq = seq_ends[below_win[
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
138 count_fasta_not_divided]] - seq_starts[below_win[
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
139 count_fasta_not_divided]] + 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
140 ntf.write("{}\n{}".format(line, "".join([query.readline(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
141 ) for x in range(length_seq)])).encode("utf-8"))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
142 count_fasta_not_divided += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
143 query_temp = ntf.name
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
144 ntf.close()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
145 return query_temp
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
146
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
147
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
148 def domain_annotation(elements, CLASSIFICATION):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
149 ''' Assign protein domain to each hit from protein database '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
150 domains = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
151 annotations = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
152 with open(CLASSIFICATION, "r") as cl_tbl:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
153 annotation = {}
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
154 for line in cl_tbl:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
155 record = line.rstrip().split("\t")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
156 annotation[record[0]] = record[1:]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
157 for i in range(len(elements)):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
158 domains.append(elements[i].split("__")[0].split("-")[1])
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
159 element_name = "__".join(elements[i].split("__")[1:])
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
160 if element_name in annotation.keys():
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
161 annotations.append("|".join([elements[i].split("__")[0].split("-")[
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
162 1], ("|".join(annotation[element_name]))]))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
163 else:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
164 annotations.append("unknown|unknown")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
165 return annotations
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
166
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
167
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
168 def hits_processing(seq_len, start, end, strand):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
169 ''' Gain hits intervals separately for forward and reverse strand '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
170 reverse_strand_idx = np.where(strand == "-")[0]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
171 if not reverse_strand_idx.any():
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
172 start_pos_plus = start + 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
173 end_pos_plus = end
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
174 regions_plus = list(zip(start_pos_plus, end_pos_plus))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
175 regions_minus = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
176 else:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
177 reverse_strand_idx = reverse_strand_idx[0]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
178 start_pos_plus = start[0:reverse_strand_idx] + 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
179 end_pos_plus = end[0:reverse_strand_idx]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
180 start_pos_minus = seq_len[0] - end[reverse_strand_idx:] + 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
181 end_pos_minus = seq_len[0] - start[reverse_strand_idx:]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
182 regions_plus = list(zip(start_pos_plus, end_pos_plus))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
183 regions_minus = list(zip(start_pos_minus, end_pos_minus))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
184 return reverse_strand_idx, regions_plus, regions_minus
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
185
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
186
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
187 def overlapping_regions(input_data):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
188 ''' Join all overlapping intervals(hits) to clusters (potential domains),
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
189 get list of start-end positions of individual hits within the interval,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
190 list of minimus and maximums as well as the indices in the original
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
191 sequence_hits structure for the hits belonging to the same clusters '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
192 if input_data:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
193 sorted_idx, sorted_data = zip(*sorted(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
194 [(index, data) for index, data in enumerate(input_data)],
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
195 key=itemgetter(1)))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
196 merged_ends = input_data[sorted_idx[0]][1]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
197 intervals = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
198 data = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
199 output_intervals = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
200 output_data = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
201 for i, j in zip(sorted_idx, sorted_data):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
202 if input_data[i][0] < merged_ends:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
203 merged_ends = max(input_data[i][1], merged_ends)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
204 intervals.append(i)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
205 data.append(j)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
206 else:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
207 output_intervals.append(intervals)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
208 output_data.append(data)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
209 intervals = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
210 data = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
211 intervals.append(i)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
212 data.append(j)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
213 merged_ends = input_data[i][1]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
214 output_intervals.append(intervals)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
215 output_data.append(data)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
216 mins = [x[0][0] for x in output_data]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
217 maxs = [max(x, key=itemgetter(1))[1] for x in output_data]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
218 else:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
219 mins = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
220 maxs = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
221 output_intervals = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
222 output_data = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
223 return mins, maxs, output_data, output_intervals
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
224
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
225
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
226 def annotations_dict(annotations):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
227 ''' Hash table where annotations of the hits within a clusters are the keys.
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
228 Each annotation has serial number assigned which indexes the row in the score_table '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
229 classes_dict = {classes: idx
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
230 for idx, classes in enumerate(set(annotations))}
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
231 return classes_dict
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
232
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
233
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
234 def score_table(mins, maxs, data, annotations, scores, CLASSIFICATION):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
235 ''' Score table is created based on the annotations occurance in the cluster.
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
236 Matrix axis y corresponds to individual annotations (indexed according to classes_dict),
10
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
237 axis x represents positions of analyzed seq in a given cluster.
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
238 For every hit within cluster, array of scores on the corresponding position
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
239 is recorded to the table in case if the score on certain position is so far the highest
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
240 for the certain position and certain annotation '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
241 classes_dict = annotations_dict(annotations)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
242 score_matrix = np.zeros((len(classes_dict), maxs - mins + 1), dtype=int)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
243 count = 0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
244 for item in annotations:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
245 saved_scores = score_matrix[classes_dict[item], data[count][0] - mins:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
246 data[count][1] - mins + 1]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
247 new_scores = [scores[count]] * len(saved_scores)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
248 score_matrix[classes_dict[item], data[count][0] - mins:data[count][
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
249 1] - mins + 1] = [max(*pos_score)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
250 for pos_score in zip(saved_scores, new_scores)]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
251 count += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
252 return score_matrix, classes_dict
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
253
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
254
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
255 def score_matrix_evaluation(score_matrix, classes_dict, THRESHOLD_SCORE):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
256 ''' Score matrix is evaluated based on each position.
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
257 For every position the list of annotations with a score which reaches
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
258 certain percentage of the overal best score of the cluster are stored '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
259 ann_per_reg = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
260 overal_best_score_reg = max((score_matrix.max(axis=1)))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
261 for position in score_matrix.T:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
262 ## score threshold calculated as a percentage of the OVERALL best score in the cluster
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
263 threshold = overal_best_score_reg * THRESHOLD_SCORE / 100
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
264 above_th = [idx
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
265 for idx, score in enumerate(position)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
266 if position[idx] >= threshold]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
267 ## select unique annotations in one position that are above threshold
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
268 ann_per_pos = list(set(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
269 [key for key, value in classes_dict.items() if value in above_th]))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
270 ann_per_reg.append(ann_per_pos)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
271 return ann_per_reg
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
272
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
273
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
274 def group_annot_regs(ann_per_reg):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
275 ''' Get list of domains, annotations, longest common annotations and
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
276 counts of positions with certain annotation per regions '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
277 ## tranform list of lists (potential multiple annotations for every position ) to flat list of all annotations
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
278 all_annotations = [item for sublist in ann_per_reg for item in sublist]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
279 unique_annotations = list(set(all_annotations))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
280 ann_pos_counts = [all_annotations.count(x) for x in unique_annotations]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
281 unique_annotations = list(set(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
282 [item for sublist in ann_per_reg for item in sublist]))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
283 domain_type = list(set([annotation.split("|")[0]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
284 for annotation in unique_annotations]))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
285 classification_list = [annotation.split("|")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
286 for annotation in unique_annotations]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
287 ann_substring = "|".join(os.path.commonprefix(classification_list))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
288 domain_type = "/".join(domain_type)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
289 return domain_type, ann_substring, unique_annotations, ann_pos_counts
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
290
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
291
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
292 def best_score(scores, region):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
293 ''' From overlapping intervals take the one with the highest score '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
294 ## if more hits have the same best score take only the first one
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
295 best_idx = region[np.where(scores == max(scores))[0][0]]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
296 best_idx_reg = np.where(scores == max(scores))[0][0]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
297 return best_idx, best_idx_reg
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
298
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
299
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
300 def create_gff3(domain_type, ann_substring, unique_annotations, ann_pos_counts,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
301 dom_start, dom_end, step, best_idx, annotation_best,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
302 db_name_best, db_starts_best, db_ends_best, strand, score,
10
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
303 seq_id, db_seq, query_seq, domain_size, positions, gff, consensus):
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
304 ''' Record obtained information about domain corresponding to individual cluster to common gff file '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
305 best_start = positions[best_idx][0]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
306 best_end = positions[best_idx][1]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
307 best_score = score[best_idx]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
308 ## proportion of length of the best hit to the whole region length found by base
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
309 length_proportion = int((best_end - best_start + 1) /
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
310 (dom_end - dom_start + 1) * 100)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
311 db_seq_best = db_seq[best_idx]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
312 query_seq_best = query_seq[best_idx]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
313 domain_size_best = domain_size[best_idx]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
314 [percent_ident, align_similarity, relat_align_len, relat_interrupt,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
315 db_len_proportion
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
316 ] = filter_params(db_seq_best, query_seq_best, domain_size_best)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
317 ann_substring = "|".join(ann_substring.split("|")[1:])
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
318 annotation_best = "|".join([db_name_best] + annotation_best.split("|")[1:])
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
319 if "DANTE_PART" in seq_id:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
320 part = int(seq_id.split("DANTE_PART")[1].split(":")[0].split("_")[0])
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
321 dom_start = dom_start + (part - 1) * step
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
322 dom_end = dom_end + (part - 1) * step
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
323 best_start = best_start + (part - 1) * step
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
324 best_end = best_end + (part - 1) * step
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
325 if ann_substring is '':
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
326 ann_substring = "NONE(Annotations from different classes)"
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
327 if len(unique_annotations) > 1:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
328 unique_annotations = ",".join(["{}[{}bp]".format(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
329 ann, pos) for ann, pos in zip(unique_annotations, ann_pos_counts)])
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
330 else:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
331 unique_annotations = unique_annotations[0]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
332 if __name__ == '__main__':
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
333 SOURCE = configuration.SOURCE_DANTE
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
334 else:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
335 SOURCE = configuration.SOURCE_PROFREP
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
336 if "/" in domain_type:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
337 gff.write(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
338 "{}\t{}\t{}\t{}\t{}\t.\t{}\t{}\tName={};Final_Classification=Ambiguous_domain;Region_Hits_Classifications_={}\n".format(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
339 seq_id, SOURCE, configuration.DOMAINS_FEATURE, dom_start,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
340 dom_end, strand, configuration.PHASE, domain_type,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
341 unique_annotations))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
342 else:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
343 gff.write(
10
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
344 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Final_Classification={};Region_Hits_Classifications={};Best_Hit={}:{}-{}[{}percent];Best_Hit_DB_Pos={}:{}of{};DB_Seq={};Region_Seq={};Query_Seq={};Identity={};Similarity={};Relat_Length={};Relat_Interruptions={};Hit_to_DB_Length={}\n".format(
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
345 seq_id, SOURCE, configuration.DOMAINS_FEATURE, dom_start,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
346 dom_end, best_score, strand, configuration.PHASE, domain_type,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
347 ann_substring, unique_annotations, annotation_best, best_start,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
348 best_end, length_proportion, db_starts_best, db_ends_best,
10
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
349 domain_size_best, db_seq_best, consensus, query_seq_best, percent_ident,
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
350 align_similarity, relat_align_len, relat_interrupt,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
351 db_len_proportion))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
352
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
353
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
354 def filter_params(db, query, protein_len):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
355 ''' Calculate basic statistics of the quality of the alignment '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
356 score_dict = alignment_scoring()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
357 num_ident = 0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
358 count_interrupt = 0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
359 count_similarity = 0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
360 alignment_len = 0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
361 for i, j in zip(db.upper(), query.upper()):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
362 if i == j and i != "X":
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
363 num_ident += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
364 if j == "/" or j == "\\" or j == "*":
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
365 count_interrupt += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
366 if (i.isalpha() or i == "*") and (j.isalpha() or j == "*"):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
367 if int(score_dict["{}{}".format(i, j)]) > 0:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
368 count_similarity += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
369 ## gapless alignment length proportional to the domain protein length
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
370 relat_align_len = round((len(db) - db.count("-")) / protein_len, 3)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
371 ## proportional identical bases (except of X) to al.length
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
372 align_identity = round(num_ident / len(db), 2)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
373 ## proportional count of positive scores from scoring matrix to al. length
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
374 align_similarity = round(count_similarity / len(db), 2)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
375 ## number of interruptions per 100 bp
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
376 relat_interrupt = round(count_interrupt / math.ceil((len(query) / 100)), 2)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
377 ## Proportion of alignment to the original length of protein domain from database (indels included)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
378 db_len_proportion = round(len(db) / protein_len, 2)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
379 return align_identity, align_similarity, relat_align_len, relat_interrupt, db_len_proportion
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
380
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
381
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
382 def line_generator(tab_pipe, maf_pipe, start):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
383 ''' Yield individual lines of LASTAL stdout for single sequence '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
384 if hasattr(line_generator, "dom"):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
385 seq_id = line_generator.dom.split("\t")[6]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
386 yield line_generator.dom.encode("utf-8")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
387 del line_generator.dom
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
388 line_tab = ""
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
389 for line_tab in tab_pipe:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
390 line_tab = line_tab.decode("utf-8")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
391 if not line_tab.startswith('#'):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
392 if start:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
393 if not ('seq_id' in locals() and
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
394 seq_id != line_tab.split("\t")[6]):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
395 seq_id = line_tab.split("\t")[6]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
396 start = False
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
397 line_maf = [maf_pipe.readline() for line_count in range(4)]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
398 db_seq = line_maf[1].decode("utf-8").rstrip().split(" ")[-1]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
399 alignment_seq = line_maf[2].decode("utf-8").rstrip().split(" ")[-1]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
400 line = "{}\t{}\t{}".format(line_tab, db_seq, alignment_seq)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
401 line_id = line.split("\t")[6]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
402 if seq_id != line_id:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
403 line_generator.dom = line
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
404 return
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
405 else:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
406 yield line.encode("utf-8")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
407 else:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
408 maf_pipe.readline()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
409 if line_tab == "":
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
410 raise RuntimeError
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
411 else:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
412 return
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
413
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
414
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
415 def get_version(path, LAST_DB):
5
a38efa4937d7 Uploaded
petr-novak
parents: 0
diff changeset
416 '''Return version is run from git repository '''
18
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
417 version_string = (
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
418 "##-----------------------------------------------\n"
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
419 "##PROTEIN DATABASE VERSION : {PD}\n"
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
420 "##-----------------------------------------------\n").format(
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
421 PD=os.path.basename(LAST_DB)
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
422 )
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
423 if os.path.exists(".git"):
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
424 try:
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
425 branch = subprocess.check_output("git rev-parse --abbrev-ref HEAD",
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
426 shell=True,
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
427 cwd=path).decode('ascii').strip()
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
428 shorthash = subprocess.check_output("git log --pretty=format:'%h' -n 1 ",
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
429 shell=True,
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
430 cwd=path).decode('ascii').strip()
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
431 revcount = len(subprocess.check_output("git log --oneline",
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
432 shell=True,
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
433 cwd=path).decode('ascii').split())
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
434 version_string = (
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
435 "##-----------------------------------------------\n"
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
436 "##PIPELINE VERSION : "
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
437 "{branch}-rv-{revcount}({shorthash})\n"
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
438 "##PROTEIN DATABASE VERSION : {PD}\n"
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
439 "##-----------------------------------------------\n").format(
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
440 branch=branch,
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
441 shorthash=shorthash,
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
442 revcount=revcount,
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
443 PD=os.path.basename(LAST_DB))
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
444 except:
039c45c01b47 Uploaded
petr-novak
parents: 15
diff changeset
445 pass
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
446 return version_string
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
447
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
448
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
449 def write_info(dom_gff_tmp, version_string):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
450 dom_gff_tmp.write("{}\n".format(configuration.HEADER_GFF))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
451 dom_gff_tmp.write(version_string)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
452
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
453
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
454 def domain_search(QUERY, LAST_DB, CLASSIFICATION, OUTPUT_DOMAIN,
10
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
455 THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM, SCORING_MATRIX):
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
456 ''' Search for protein domains using our protein database and external tool LAST,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
457 stdout is parsed in real time and hits for a single sequence undergo further processing
10
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
458 - tabular format(TAB) to get info about position, score, orientation
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
459 - MAF format to gain alignment and original sequence
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
460 '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
461
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
462 step = WIN_DOM - OVERLAP_DOM
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
463 [headers, above_win, below_win, lens_above_win, seq_starts, seq_ends
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
464 ] = characterize_fasta(QUERY, WIN_DOM)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
465 query_temp = split_fasta(QUERY, WIN_DOM, step, headers, above_win,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
466 below_win, lens_above_win, seq_starts, seq_ends)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
467
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
468 ## TAB output contains all the alignment scores, positions, strands...
10
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
469 lastal_columns = {
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
470 "BL80" : ("score, name_db, start_db, al_size_db, strand_db,"
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
471 " seq_size_db, name_q, start_q, al_size_q, strand_q, seq_size_q,"
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
472 " block1, block2, block3, db_seq, q_seq"),
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
473 "BL62" : ("score, name_db, start_db, al_size_db, strand_db,"
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
474 " seq_size_db, name_q, start_q, al_size_q, strand_q,"
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
475 " seq_size_q, block1, block2, block3, db_seq, q_seq"),
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
476 "MIQS" : ("score, name_db, start_db, al_size_db, strand_db,"
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
477 " seq_size_db, name_q, start_q, al_size_q, strand_q,"
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
478 " seq_size_q, block1, db_seq, q_seq"),
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
479 }
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
480 tab = subprocess.Popen(
10
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
481 "lastal -F15 {} {} -L 10 -m 70 -p {} -e 80 -f TAB".format(LAST_DB,
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
482 query_temp,
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
483 SCORING_MATRIX),
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
484 stdout=subprocess.PIPE,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
485 shell=True)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
486 ## MAF output contains alignment sequences
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
487 maf = subprocess.Popen(
10
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
488 "lastal -F15 {} {} -L 10 -m 70 -p {} -e 80 -f MAF".format(LAST_DB,
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
489 query_temp,
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
490 SCORING_MATRIX),
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
491 stdout=subprocess.PIPE,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
492 shell=True)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
493
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
494 tab_pipe = tab.stdout
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
495 maf_pipe = maf.stdout
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
496 maf_pipe.readline()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
497
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
498 seq_ids = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
499 dom_tmp = NamedTemporaryFile(delete=False)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
500 with open(dom_tmp.name, "w") as dom_gff_tmp:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
501 path = os.path.dirname(os.path.realpath(__file__))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
502 version_string = get_version(path, LAST_DB)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
503 write_info(dom_gff_tmp, version_string)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
504 gff = open(dom_tmp.name, "a")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
505 start = True
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
506 while True:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
507 try:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
508 with warnings.catch_warnings():
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
509 warnings.simplefilter("ignore")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
510 sequence_hits = np.genfromtxt(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
511 line_generator(tab_pipe, maf_pipe, start),
10
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
512 names=lastal_columns[SCORING_MATRIX],
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
513 usecols=("score, name_q, start_q, al_size_q,"
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
514 " strand_q, seq_size_q, name_db, db_seq,"
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
515 " q_seq, seq_size_db, start_db, al_size_db"),
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
516 dtype=None,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
517 comments=None)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
518 except RuntimeError:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
519 break
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
520 ## if there are no domains found
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
521 if sequence_hits.size is 0:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
522 gff.write("##NO DOMAINS")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
523 return [], [], [], []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
524
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
525 ############# PARSING LASTAL OUTPUT ############################
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
526 sequence_hits = np.atleast_1d(sequence_hits)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
527 score = sequence_hits['score'].astype("int")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
528 seq_id = sequence_hits['name_q'][0].astype("str")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
529 start_hit = sequence_hits['start_q'].astype("int")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
530 end_hit = start_hit + sequence_hits['al_size_q'].astype("int")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
531 strand = sequence_hits['strand_q'].astype("str")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
532 seq_len = sequence_hits['seq_size_q'].astype("int")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
533 domain_db = sequence_hits['name_db'].astype("str")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
534 db_seq = sequence_hits['db_seq'].astype("str")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
535 query_seq = sequence_hits['q_seq'].astype("str")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
536 domain_size = sequence_hits['seq_size_db'].astype("int")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
537 db_start = sequence_hits['start_db'].astype("int") + 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
538 db_end = sequence_hits['start_db'].astype("int") + sequence_hits[
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
539 'al_size_db'].astype("int")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
540
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
541 [reverse_strand_idx, positions_plus, positions_minus
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
542 ] = hits_processing(seq_len, start_hit, end_hit, strand)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
543 strand_gff = "+"
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
544 [mins_plus, maxs_plus, data_plus, indices_plus
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
545 ] = overlapping_regions(positions_plus)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
546 [mins_minus, maxs_minus, data_minus, indices_minus
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
547 ] = overlapping_regions(positions_minus)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
548 positions = positions_plus + positions_minus
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
549 indices_overal = indices_plus + [x + reverse_strand_idx
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
550 for x in indices_minus]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
551 mins = mins_plus + mins_minus
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
552 maxs = maxs_plus + maxs_minus
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
553 data = data_plus + data_minus
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
554 ## process every region (cluster) of overlapping hits sequentially
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
555 count_region = 0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
556 for region in indices_overal:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
557 db_names = domain_db[np.array(region)]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
558 db_starts = db_start[np.array(region)]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
559 db_ends = db_end[np.array(region)]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
560 scores = score[np.array(region)]
10
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
561 regions_above_threshold = [
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
562 region[i]
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
563 for i, _ in enumerate(region)
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
564 if max(scores) / 100 * THRESHOLD_SCORE < scores[i]
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
565 ]
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
566 ## sort by score first:
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
567 consensus = get_full_translation(
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
568 translation_alignments(
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
569 query_seq=sortby(query_seq[regions_above_threshold], score[regions_above_threshold], True),
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
570 start_hit=sortby(start_hit[regions_above_threshold], score[regions_above_threshold], True),
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
571 end_hit=sortby(end_hit[regions_above_threshold], score[regions_above_threshold], True))
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
572 )
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
573
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
574 annotations = domain_annotation(db_names, CLASSIFICATION)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
575 [score_matrix, classes_dict] = score_table(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
576 mins[count_region], maxs[count_region], data[count_region],
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
577 annotations, scores, CLASSIFICATION)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
578 ann_per_reg = score_matrix_evaluation(score_matrix, classes_dict,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
579 THRESHOLD_SCORE)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
580 [domain_type, ann_substring, unique_annotations, ann_pos_counts
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
581 ] = group_annot_regs(ann_per_reg)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
582 [best_idx, best_idx_reg] = best_score(scores, region)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
583 annotation_best = annotations[best_idx_reg]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
584 db_name_best = db_names[best_idx_reg]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
585 db_starts_best = db_starts[best_idx_reg]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
586 db_ends_best = db_ends[best_idx_reg]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
587 if count_region == len(indices_plus):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
588 strand_gff = "-"
14
a6c55d1bdb6c Uploaded
petr-novak
parents: 10
diff changeset
589 if strand_gff == "+":
15
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
590 feature_start = min(start_hit[regions_above_threshold]) + 1
14
a6c55d1bdb6c Uploaded
petr-novak
parents: 10
diff changeset
591 feature_end = max(end_hit[regions_above_threshold])
a6c55d1bdb6c Uploaded
petr-novak
parents: 10
diff changeset
592 else:
15
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
593 feature_end = seq_len[region][0] - min(start_hit[regions_above_threshold])
14
a6c55d1bdb6c Uploaded
petr-novak
parents: 10
diff changeset
594 feature_start = seq_len[region][0] - max(end_hit[regions_above_threshold]) + 1
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
595 create_gff3(domain_type, ann_substring, unique_annotations,
14
a6c55d1bdb6c Uploaded
petr-novak
parents: 10
diff changeset
596 ann_pos_counts, feature_start,feature_end,
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
597 step, best_idx, annotation_best, db_name_best,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
598 db_starts_best, db_ends_best, strand_gff, score,
10
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
599 seq_id, db_seq, query_seq, domain_size, positions, gff, consensus)
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
600 count_region += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
601 seq_ids.append(seq_id)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
602 os.unlink(query_temp)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
603 gff.close()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
604 dom_tmp.close()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
605 ## if any sequence from input data was split into windows, merge and adjust the data from individual windows
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
606 if any("DANTE_PART" in x for x in seq_ids):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
607 adjust_gff(OUTPUT_DOMAIN, dom_tmp.name, WIN_DOM, OVERLAP_DOM, step)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
608 ## otherwise use the temporary output as the final domains gff
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
609 else:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
610 shutil.copy2(dom_tmp.name, OUTPUT_DOMAIN)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
611 os.unlink(dom_tmp.name)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
612
10
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
613 def sortby(a, by, reverse=False):
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
614 ''' sort according values in the by list '''
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
615 a_sorted = [i[0] for i in
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
616 sorted(
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
617 zip(a, by),
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
618 key=lambda i: i[1],
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
619 reverse=reverse
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
620 )]
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
621 return a_sorted
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
622
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
623
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
624 def a2nnn(s):
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
625 s1 = "".join([c if c in ['/', '\\'] else c + c + c
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
626 for c in s.replace("-", "")])
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
627 # collapse frameshifts (/)
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
628 s2 = re.sub("[a-zA-Z*]{2}//[a-zA-Z*]{2}", "//", s1)
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
629 s3 = re.sub("[a-zA-Z*]/[a-zA-Z*]", "/", s2)
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
630 return (s3)
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
631
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
632
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
633
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
634 def rle(s):
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
635 '''run length encoding but max is set to 3 (codon)'''
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
636 prev = ""
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
637 count = 1
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
638 char = []
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
639 length = []
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
640 L = 0
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
641 for n in s:
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
642 if n == prev and count < (3 - L):
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
643 count += 1
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
644 else:
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
645 char.append(prev)
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
646 length.append(count)
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
647 L = 1 if prev == "/" else 0
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
648 prev = n
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
649 count = 1
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
650 char.append(prev)
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
651 length.append(count)
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
652 sequence = char[1:]
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
653 return sequence, length[1:]
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
654
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
655 def get_full_translation(translations):
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
656 '''get one full length translation from multiple partial
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
657 aligned translation '''
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
658 # find minimal set of alignements
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
659 minimal_set = []
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
660 not_filled_prev = len(translations[0])
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
661 for s in translations:
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
662 minimal_set.append(s)
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
663 # check defined position - is there only '-' character?
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
664 not_filled = sum([set(i) == {"-"} for i in zip(*minimal_set)])
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
665 if not_filled == 0:
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
666 break
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
667 if not_filled == not_filled_prev:
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
668 # last added sequence did not improve coverage - remove it.
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
669 minimal_set.pop()
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
670 not_filled_prev = not_filled
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
671 # merge translations
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
672 final_translation = minimal_set[0]
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
673 # record positions of joins to correct frameshifts reportings
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
674 position_of_joins = set()
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
675 position_of_joins_rle = set()
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
676 if len(minimal_set) > 1: # translation must be merged
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
677 for s in minimal_set[1:]:
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
678 s1 = re.search(r"-\w", final_translation)
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
679 s2 = re.search(r"\w-", final_translation)
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
680 if s1:
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
681 position_of_joins.add(s1.start())
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
682 if s2:
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
683 position_of_joins.add((s2.end() - 1))
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
684 final_translation = "".join(
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
685 [b if a == "-" else a for a, b in zip(final_translation, s)])
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
686 translation_rle = rle(final_translation)
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
687 cumsumed_positions = np.cumsum(translation_rle[1])
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
688 for p in position_of_joins:
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
689 position_of_joins_rle.add(sum(cumsumed_positions <= p))
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
690 # insert /\ when necessary
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
691 for p in position_of_joins_rle:
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
692 if translation_rle[0][p] not in ['/',"//","\\", "\\\\"]:
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
693 if translation_rle[1][p] == 2:
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
694 translation_rle[0][p] = translation_rle[0][p] + "/"
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
695 if translation_rle[1][p] == 1:
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
696 translation_rle[0][p] = "\\"
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
697 consensus = "".join(translation_rle[0])
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
698 return consensus
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
699
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
700
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
701 # helper function for debugging
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
702 def translation_alignments(query_seq, start_hit, end_hit):
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
703 pstart = min(start_hit)
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
704 pend = max(end_hit)
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
705 nnn = list()
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
706 for s, start, end in zip(query_seq, start_hit, end_hit):
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
707 nnn.append("-" * (start - pstart) + a2nnn(s) + "-" * (pend - end))
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
708 return (nnn)
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
709
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
710
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
711
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
712 def adjust_gff(OUTPUT_DOMAIN, gff, WIN_DOM, OVERLAP_DOM, step):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
713 ''' Original gff file is adjusted in case of containing cut parts
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
714 - for consecutive sequences overlap is divided to half with first half
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
715 of records(domains) belonging to the first sequence and second to the following one.
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
716 Duplicate domains going through the middle of the overlap are removed.
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
717 First and the last part (marked as LAST) of a certain sequence are
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
718 handled separately as the are overlapped from one side only '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
719
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
720 seq_id_all = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
721 class_dict = defaultdict(int)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
722 seen = set()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
723 with open(OUTPUT_DOMAIN, "w") as adjusted_gff:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
724 with open(gff, "r") as primary_gff:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
725 start = True
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
726 for line in primary_gff:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
727 if line.startswith("#"):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
728 adjusted_gff.write(line)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
729 else:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
730 split_line = line.split("\t")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
731 classification = split_line[-1].split(";")[1].split("=")[1]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
732 if start:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
733 seq_id_all.append(split_line[0].split("_DANTE_PART")[
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
734 0])
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
735 start = False
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
736 seq_id = split_line[0].split("_DANTE_PART")[0]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
737 if "DANTE_PART" in line:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
738 line_without_id = "\t".join(split_line[1:])
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
739 part = int(split_line[0].split("_DANTE_PART")[1].split(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
740 ":")[0].split("_")[0])
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
741 if seq_id != seq_id_all[-1]:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
742 seq_id_all.append(seq_id)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
743
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
744 ## first part of the sequence
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
745 if part == 1:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
746 cut_end = WIN_DOM - OVERLAP_DOM / 2
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
747 if int(split_line[3]) <= cut_end <= int(split_line[
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
748 4]):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
749 if line_without_id not in seen:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
750 adjusted_gff.write("{}\t{}".format(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
751 seq_id, line_without_id))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
752 class_dict[classification] += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
753 seen.add(line_without_id)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
754 elif int(split_line[4]) < cut_end:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
755 adjusted_gff.write("{}\t{}".format(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
756 seq_id, line_without_id))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
757 class_dict[classification] += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
758
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
759 ## last part of the sequence
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
760 elif "LAST" in split_line[0]:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
761 cut_start = OVERLAP_DOM / 2 + (part - 1) * step
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
762 if int(split_line[3]) <= cut_start <= int(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
763 split_line[4]):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
764 if line_without_id not in seen:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
765 adjusted_gff.write("{}\t{}".format(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
766 seq_id, line_without_id))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
767 class_dict[classification] += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
768 seen.add(line_without_id)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
769 elif int(split_line[3]) > cut_start:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
770 adjusted_gff.write("{}\t{}".format(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
771 seq_id, line_without_id))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
772 class_dict[classification] += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
773
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
774 ## middle part of the sequence
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
775 else:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
776 cut_start = OVERLAP_DOM / 2 + (part - 1) * step
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
777 cut_end = WIN_DOM - OVERLAP_DOM / 2 + (part -
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
778 1) * step
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
779 if int(split_line[3]) <= cut_start <= int(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
780 split_line[4]) or int(split_line[
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
781 3]) <= cut_end <= int(split_line[4]):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
782 if line_without_id not in seen:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
783 adjusted_gff.write("{}\t{}".format(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
784 seq_id, line_without_id))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
785 class_dict[classification] += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
786 seen.add(line_without_id)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
787 elif int(split_line[3]) > cut_start and int(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
788 split_line[4]) < cut_end:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
789 adjusted_gff.write("{}\t{}".format(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
790 seq_id, line_without_id))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
791 class_dict[classification] += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
792 ## not divived
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
793 else:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
794 if seq_id != seq_id_all[-1]:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
795 seq_id_all.append(seq_id)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
796 adjusted_gff.write(line)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
797 class_dict[classification] += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
798
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
799
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
800 def main(args):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
801
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
802 t = time.time()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
803
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
804 QUERY = args.query
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
805 LAST_DB = args.protein_database
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
806 CLASSIFICATION = args.classification
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
807 OUTPUT_DOMAIN = args.domain_gff
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
808 NEW_LDB = args.new_ldb
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
809 OUTPUT_DIR = args.output_dir
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
810 THRESHOLD_SCORE = args.threshold_score
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
811 WIN_DOM = args.win_dom
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
812 OVERLAP_DOM = args.overlap_dom
10
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
813 SCORING_MATRIX = args.scoring_matrix
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
814 configuration.SC_MATRIX = configuration.SC_MATRIX_SKELETON.format(SCORING_MATRIX)
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
815
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
816 if OUTPUT_DOMAIN is None:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
817 OUTPUT_DOMAIN = configuration.DOMAINS_GFF
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
818 if os.path.isdir(LAST_DB):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
819 LAST_DB = os.path.join(LAST_DB, configuration.LAST_DB_FILE)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
820 if os.path.isdir(CLASSIFICATION):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
821 CLASSIFICATION = os.path.join(CLASSIFICATION, configuration.CLASS_FILE)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
822
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
823 if NEW_LDB:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
824 subprocess.call("lastdb -p -cR01 {} {}".format(LAST_DB, LAST_DB),
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
825 shell=True)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
826
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
827 if OUTPUT_DIR and not os.path.exists(OUTPUT_DIR):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
828 os.makedirs(OUTPUT_DIR)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
829
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
830 if not os.path.isabs(OUTPUT_DOMAIN):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
831 if OUTPUT_DIR is None:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
832 OUTPUT_DIR = configuration.TMP
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
833 if not os.path.exists(OUTPUT_DIR):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
834 os.makedirs(OUTPUT_DIR)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
835 OUTPUT_DOMAIN = os.path.join(OUTPUT_DIR,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
836 os.path.basename(OUTPUT_DOMAIN))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
837 domain_search(QUERY, LAST_DB, CLASSIFICATION, OUTPUT_DOMAIN,
10
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
838 THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM, SCORING_MATRIX)
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
839
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
840 print("ELAPSED_TIME_DOMAINS = {} s".format(time.time() - t))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
841
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
842
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
843 if __name__ == "__main__":
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
844 import argparse
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
845 from argparse import RawDescriptionHelpFormatter
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
846
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
847 class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
848 argparse.RawDescriptionHelpFormatter):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
849 pass
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
850
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
851 parser = argparse.ArgumentParser(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
852 description=
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
853 '''Script performs similarity search on given DNA sequence(s) in (multi)fasta against our protein domains database of all Transposable element for certain group of organisms (Viridiplantae or Metazoans). Domains are subsequently annotated and classified - in case certain domain has multiple annotations assigned, classifation is derived from the common classification level of all of them. Domains search is accomplished engaging LASTAL alignment tool.
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
854
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
855 DEPENDENCIES:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
856 - python 3.4 or higher with packages:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
857 -numpy
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
858 - lastal 744 or higher [http://last.cbrc.jp/]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
859 - configuration.py module
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
860
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
861 EXAMPLE OF USAGE:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
862
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
863 ./protein_domains_pd.py -q PATH_TO_INPUT_SEQ -pdb PATH_TO_PROTEIN_DB -cs PATH_TO_CLASSIFICATION_FILE
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
864
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
865 When running for the first time with a new database use -nld option allowing lastal to create indexed database files:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
866
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
867 -nld True
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
868
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
869 ''',
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
870 epilog="""""",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
871 formatter_class=CustomFormatter)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
872
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
873 requiredNamed = parser.add_argument_group('required named arguments')
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
874 requiredNamed.add_argument(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
875 "-q",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
876 "--query",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
877 type=str,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
878 required=True,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
879 help=
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
880 'input DNA sequence to search for protein domains in a fasta format. Multifasta format allowed.')
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
881 requiredNamed.add_argument('-pdb',
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
882 "--protein_database",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
883 type=str,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
884 required=True,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
885 help='protein domains database file')
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
886 requiredNamed.add_argument('-cs',
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
887 '--classification',
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
888 type=str,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
889 required=True,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
890 help='protein domains classification file')
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
891 parser.add_argument("-oug",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
892 "--domain_gff",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
893 type=str,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
894 help="output domains gff format")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
895 parser.add_argument(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
896 "-nld",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
897 "--new_ldb",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
898 type=str,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
899 default=False,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
900 help=
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
901 "create indexed database files for lastal in case of working with new protein db")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
902 parser.add_argument(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
903 "-dir",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
904 "--output_dir",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
905 type=str,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
906 help="specify if you want to change the output directory")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
907 parser.add_argument(
10
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
908 "-M",
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
909 "--scoring_matrix",
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
910 type=str,
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
911 default="BL80",
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
912 choices=['BL80', 'BL62', 'MIQS'],
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
913 help="specify scoring matrix to use for similarity search (BL80, BL62, MIQS)")
d0431a839606 Uploaded
petr-novak
parents: 5
diff changeset
914 parser.add_argument(
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
915 "-thsc",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
916 "--threshold_score",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
917 type=int,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
918 default=80,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
919 help=
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
920 "percentage of the best score in the cluster to be tolerated when assigning annotations per base")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
921 parser.add_argument(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
922 "-wd",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
923 "--win_dom",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
924 type=int,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
925 default=10000000,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
926 help="window to process large input sequences sequentially")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
927 parser.add_argument("-od",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
928 "--overlap_dom",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
929 type=int,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
930 default=10000,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
931 help="overlap of sequences in two consecutive windows")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
932
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
933 args = parser.parse_args()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
934 main(args)