annotate dante.py @ 4:e27e86406f56 draft

Uploaded
author petr-novak
date Wed, 26 Jun 2019 10:23:50 -0400
parents a5f1638b73be
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python3
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
2
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
3 import numpy as np
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
4 import subprocess
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
5 import math
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
6 import time
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
7 from operator import itemgetter
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
8 from collections import Counter
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
9 from itertools import groupby
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
10 import os
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
11 import configuration
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
12 from tempfile import NamedTemporaryFile
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
13 import sys
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
14 import warnings
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
15 import shutil
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
16 from collections import defaultdict
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
17
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
18 np.set_printoptions(threshold=sys.maxsize)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
19
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
20
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
21 def alignment_scoring():
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
22 ''' Create hash table for alignment similarity counting: for every
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
23 combination of aminoacids in alignment assign score from protein
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
24 scoring matrix defined in configuration file '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
25 score_dict = {}
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
26 with open(configuration.SC_MATRIX) as smatrix:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
27 count = 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
28 for line in smatrix:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
29 if not line.startswith("#"):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
30 if count == 1:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
31 aa_all = line.rstrip().replace(" ", "")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
32 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
33 count_aa = 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
34 line = list(filter(None, line.rstrip().split(" ")))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
35 for aa in aa_all:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
36 score_dict["{}{}".format(line[0], aa)] = line[count_aa]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
37 count_aa += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
38 count += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
39 return score_dict
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
40
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
41
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
42 def characterize_fasta(QUERY, WIN_DOM):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
43 ''' Find the sequences, their lengths, starts, ends and if
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
44 they exceed the window '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
45 with open(QUERY) as query:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
46 headers = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
47 fasta_lengths = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
48 seq_starts = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
49 seq_ends = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
50 fasta_chunk_len = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
51 count_line = 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
52 for line in query:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
53 line = line.rstrip()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
54 if line.startswith(">"):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
55 headers.append(line.rstrip())
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
56 fasta_lengths.append(fasta_chunk_len)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
57 fasta_chunk_len = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
58 seq_starts.append(count_line + 1)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
59 seq_ends.append(count_line - 1)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
60 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
61 fasta_chunk_len += len(line)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
62 count_line += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
63 seq_ends.append(count_line)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
64 seq_ends = seq_ends[1:]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
65 fasta_lengths.append(fasta_chunk_len)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
66 fasta_lengths = fasta_lengths[1:]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
67 # control if there are correct (unique) names for individual seqs:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
68 # LASTAL takes seqs IDs till the first space which can then create problems with ambiguous records
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
69 if len(headers) > len(set([header.split(" ")[0] for header in headers
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
70 ])):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
71 raise NameError(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
72 '''Sequences in multifasta format are not named correctly:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
73 seq IDs (before the first space) are the same''')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
74
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
75 above_win = [idx
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
76 for idx, value in enumerate(fasta_lengths) if value > WIN_DOM]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
77 below_win = [idx
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
78 for idx, value in enumerate(fasta_lengths)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
79 if value <= WIN_DOM]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
80 lens_above_win = np.array(fasta_lengths)[above_win]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
81 return headers, above_win, below_win, lens_above_win, seq_starts, seq_ends
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
82
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
83
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
84 def split_fasta(QUERY, WIN_DOM, step, headers, above_win, below_win,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
85 lens_above_win, seq_starts, seq_ends):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
86 ''' Create temporary file containing all sequences - the ones that exceed
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
87 the window are cut with a set overlap (greater than domain size with a reserve) '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
88 with open(QUERY, "r") as query:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
89 count_fasta_divided = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
90 count_fasta_not_divided = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
91 ntf = NamedTemporaryFile(delete=False)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
92 divided = np.array(headers)[above_win]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
93 row_length = configuration.FASTA_LINE
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
94 for line in query:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
95 line = line.rstrip()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
96 if line.startswith(">") and line in divided:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
97 stop_line = seq_ends[above_win[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
98 count_fasta_divided]] - seq_starts[above_win[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
99 count_fasta_divided]] + 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
100 count_line = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
101 whole_seq = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
102 for line2 in query:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
103 whole_seq.append(line2.rstrip())
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
104 count_line += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
105 if count_line == stop_line:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
106 break
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
107 whole_seq = "".join(whole_seq)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
108 ## create list of starting positions for individual parts of a seq with a step given by a window and overlap
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
109 windows_starts = list(range(0, lens_above_win[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
110 count_fasta_divided], step))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
111 ## create list of ending positions (starting pos + window), the last element is the whole seq length
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
112 windows_ends = [
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
113 x + WIN_DOM
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
114 if x + WIN_DOM < lens_above_win[count_fasta_divided] else
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
115 lens_above_win[count_fasta_divided] for x in windows_starts
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
116 ]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
117 count_part = 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
118 for start_part, end_part in zip(windows_starts, windows_ends):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
119 seq_part = whole_seq[start_part:end_part]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
120 if count_part == len(windows_starts):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
121 ntf.write("{}_DANTE_PART{}_LAST:{}-{}\n{}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
122 line.split(" ")[0], count_part, start_part + 1,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
123 end_part, "\n".join([seq_part[i:i + row_length]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
124 for i in range(0, len(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
125 seq_part), row_length)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
126 ])).encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
127 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
128 ntf.write("{}_DANTE_PART{}:{}-{}\n{}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
129 line.split(" ")[0], count_part, start_part + 1,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
130 end_part, "\n".join([seq_part[i:i + row_length]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
131 for i in range(0, len(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
132 seq_part), row_length)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
133 ])).encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
134 count_part += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
135 count_fasta_divided += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
136 elif line.startswith(">") and line not in divided:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
137 length_seq = seq_ends[below_win[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
138 count_fasta_not_divided]] - seq_starts[below_win[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
139 count_fasta_not_divided]] + 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
140 ntf.write("{}\n{}".format(line, "".join([query.readline(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
141 ) for x in range(length_seq)])).encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
142 count_fasta_not_divided += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
143 query_temp = ntf.name
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
144 ntf.close()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
145 return query_temp
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
146
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
147
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
148 def domain_annotation(elements, CLASSIFICATION):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
149 ''' Assign protein domain to each hit from protein database '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
150 domains = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
151 annotations = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
152 with open(CLASSIFICATION, "r") as cl_tbl:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
153 annotation = {}
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
154 for line in cl_tbl:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
155 record = line.rstrip().split("\t")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
156 annotation[record[0]] = record[1:]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
157 for i in range(len(elements)):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
158 domains.append(elements[i].split("__")[0].split("-")[1])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
159 element_name = "__".join(elements[i].split("__")[1:])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
160 if element_name in annotation.keys():
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
161 annotations.append("|".join([elements[i].split("__")[0].split("-")[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
162 1], ("|".join(annotation[element_name]))]))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
163 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
164 annotations.append("unknown|unknown")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
165 return annotations
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
166
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
167
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
168 def hits_processing(seq_len, start, end, strand):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
169 ''' Gain hits intervals separately for forward and reverse strand '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
170 reverse_strand_idx = np.where(strand == "-")[0]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
171 if not reverse_strand_idx.any():
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
172 start_pos_plus = start + 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
173 end_pos_plus = end
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
174 regions_plus = list(zip(start_pos_plus, end_pos_plus))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
175 regions_minus = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
176 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
177 reverse_strand_idx = reverse_strand_idx[0]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
178 start_pos_plus = start[0:reverse_strand_idx] + 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
179 end_pos_plus = end[0:reverse_strand_idx]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
180 start_pos_minus = seq_len[0] - end[reverse_strand_idx:] + 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
181 end_pos_minus = seq_len[0] - start[reverse_strand_idx:]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
182 regions_plus = list(zip(start_pos_plus, end_pos_plus))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
183 regions_minus = list(zip(start_pos_minus, end_pos_minus))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
184 return reverse_strand_idx, regions_plus, regions_minus
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
185
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
186
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
187 def overlapping_regions(input_data):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
188 ''' Join all overlapping intervals(hits) to clusters (potential domains),
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
189 get list of start-end positions of individual hits within the interval,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
190 list of minimus and maximums as well as the indices in the original
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
191 sequence_hits structure for the hits belonging to the same clusters '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
192 if input_data:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
193 sorted_idx, sorted_data = zip(*sorted(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
194 [(index, data) for index, data in enumerate(input_data)],
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
195 key=itemgetter(1)))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
196 merged_ends = input_data[sorted_idx[0]][1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
197 intervals = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
198 data = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
199 output_intervals = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
200 output_data = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
201 for i, j in zip(sorted_idx, sorted_data):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
202 if input_data[i][0] < merged_ends:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
203 merged_ends = max(input_data[i][1], merged_ends)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
204 intervals.append(i)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
205 data.append(j)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
206 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
207 output_intervals.append(intervals)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
208 output_data.append(data)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
209 intervals = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
210 data = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
211 intervals.append(i)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
212 data.append(j)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
213 merged_ends = input_data[i][1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
214 output_intervals.append(intervals)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
215 output_data.append(data)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
216 mins = [x[0][0] for x in output_data]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
217 maxs = [max(x, key=itemgetter(1))[1] for x in output_data]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
218 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
219 mins = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
220 maxs = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
221 output_intervals = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
222 output_data = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
223 return mins, maxs, output_data, output_intervals
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
224
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
225
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
226 def annotations_dict(annotations):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
227 ''' Hash table where annotations of the hits within a clusters are the keys.
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
228 Each annotation has serial number assigned which indexes the row in the score_table '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
229 classes_dict = {classes: idx
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
230 for idx, classes in enumerate(set(annotations))}
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
231 return classes_dict
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
232
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
233
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
234 def score_table(mins, maxs, data, annotations, scores, CLASSIFICATION):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
235 ''' Score table is created based on the annotations occurance in the cluster.
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
236 Matrix axis y corresponds to individual annotations (indexed according to classes_dict),
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
237 axis x represents positions of analyzed seq in a given cluster.
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
238 For every hit within cluster, array of scores on the corresponding position
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
239 is recorded to the table in case if the score on certain position is so far the highest
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
240 for the certain position and certain annotation '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
241 classes_dict = annotations_dict(annotations)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
242 score_matrix = np.zeros((len(classes_dict), maxs - mins + 1), dtype=int)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
243 count = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
244 for item in annotations:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
245 saved_scores = score_matrix[classes_dict[item], data[count][0] - mins:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
246 data[count][1] - mins + 1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
247 new_scores = [scores[count]] * len(saved_scores)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
248 score_matrix[classes_dict[item], data[count][0] - mins:data[count][
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
249 1] - mins + 1] = [max(*pos_score)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
250 for pos_score in zip(saved_scores, new_scores)]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
251 count += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
252 return score_matrix, classes_dict
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
253
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
254
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
255 def score_matrix_evaluation(score_matrix, classes_dict, THRESHOLD_SCORE):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
256 ''' Score matrix is evaluated based on each position.
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
257 For every position the list of annotations with a score which reaches
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
258 certain percentage of the overal best score of the cluster are stored '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
259 ann_per_reg = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
260 overal_best_score_reg = max((score_matrix.max(axis=1)))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
261 for position in score_matrix.T:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
262 ## score threshold calculated as a percentage of the OVERALL best score in the cluster
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
263 threshold = overal_best_score_reg * THRESHOLD_SCORE / 100
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
264 above_th = [idx
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
265 for idx, score in enumerate(position)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
266 if position[idx] >= threshold]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
267 ## select unique annotations in one position that are above threshold
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
268 ann_per_pos = list(set(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
269 [key for key, value in classes_dict.items() if value in above_th]))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
270 ann_per_reg.append(ann_per_pos)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
271 return ann_per_reg
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
272
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
273
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
274 def group_annot_regs(ann_per_reg):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
275 ''' Get list of domains, annotations, longest common annotations and
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
276 counts of positions with certain annotation per regions '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
277 ## tranform list of lists (potential multiple annotations for every position ) to flat list of all annotations
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
278 all_annotations = [item for sublist in ann_per_reg for item in sublist]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
279 unique_annotations = list(set(all_annotations))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
280 ann_pos_counts = [all_annotations.count(x) for x in unique_annotations]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
281 unique_annotations = list(set(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
282 [item for sublist in ann_per_reg for item in sublist]))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
283 domain_type = list(set([annotation.split("|")[0]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
284 for annotation in unique_annotations]))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
285 classification_list = [annotation.split("|")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
286 for annotation in unique_annotations]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
287 ann_substring = "|".join(os.path.commonprefix(classification_list))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
288 domain_type = "/".join(domain_type)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
289 return domain_type, ann_substring, unique_annotations, ann_pos_counts
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
290
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
291
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
292 def best_score(scores, region):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
293 ''' From overlapping intervals take the one with the highest score '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
294 ## if more hits have the same best score take only the first one
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
295 best_idx = region[np.where(scores == max(scores))[0][0]]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
296 best_idx_reg = np.where(scores == max(scores))[0][0]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
297 return best_idx, best_idx_reg
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
298
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
299
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
300 def create_gff3(domain_type, ann_substring, unique_annotations, ann_pos_counts,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
301 dom_start, dom_end, step, best_idx, annotation_best,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
302 db_name_best, db_starts_best, db_ends_best, strand, score,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
303 seq_id, db_seq, query_seq, domain_size, positions, gff):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
304 ''' Record obtained information about domain corresponding to individual cluster to common gff file '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
305 best_start = positions[best_idx][0]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
306 best_end = positions[best_idx][1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
307 best_score = score[best_idx]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
308 ## proportion of length of the best hit to the whole region length found by base
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
309 length_proportion = int((best_end - best_start + 1) /
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
310 (dom_end - dom_start + 1) * 100)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
311 db_seq_best = db_seq[best_idx]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
312 query_seq_best = query_seq[best_idx]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
313 domain_size_best = domain_size[best_idx]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
314 [percent_ident, align_similarity, relat_align_len, relat_interrupt,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
315 db_len_proportion
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
316 ] = filter_params(db_seq_best, query_seq_best, domain_size_best)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
317 ann_substring = "|".join(ann_substring.split("|")[1:])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
318 annotation_best = "|".join([db_name_best] + annotation_best.split("|")[1:])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
319 if "DANTE_PART" in seq_id:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
320 part = int(seq_id.split("DANTE_PART")[1].split(":")[0].split("_")[0])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
321 dom_start = dom_start + (part - 1) * step
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
322 dom_end = dom_end + (part - 1) * step
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
323 best_start = best_start + (part - 1) * step
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
324 best_end = best_end + (part - 1) * step
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
325 if ann_substring is '':
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
326 ann_substring = "NONE(Annotations from different classes)"
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
327 if len(unique_annotations) > 1:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
328 unique_annotations = ",".join(["{}[{}bp]".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
329 ann, pos) for ann, pos in zip(unique_annotations, ann_pos_counts)])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
330 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
331 unique_annotations = unique_annotations[0]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
332 if __name__ == '__main__':
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
333 SOURCE = configuration.SOURCE_DANTE
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
334 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
335 SOURCE = configuration.SOURCE_PROFREP
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
336 if "/" in domain_type:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
337 gff.write(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
338 "{}\t{}\t{}\t{}\t{}\t.\t{}\t{}\tName={};Final_Classification=Ambiguous_domain;Region_Hits_Classifications_={}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
339 seq_id, SOURCE, configuration.DOMAINS_FEATURE, dom_start,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
340 dom_end, strand, configuration.PHASE, domain_type,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
341 unique_annotations))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
342 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
343 gff.write(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
344 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Final_Classification={};Region_Hits_Classifications={};Best_Hit={}:{}-{}[{}percent];Best_Hit_DB_Pos={}:{}of{};DB_Seq={};Query_Seq={};Identity={};Similarity={};Relat_Length={};Relat_Interruptions={};Hit_to_DB_Length={}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
345 seq_id, SOURCE, configuration.DOMAINS_FEATURE, dom_start,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
346 dom_end, best_score, strand, configuration.PHASE, domain_type,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
347 ann_substring, unique_annotations, annotation_best, best_start,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
348 best_end, length_proportion, db_starts_best, db_ends_best,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
349 domain_size_best, db_seq_best, query_seq_best, percent_ident,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
350 align_similarity, relat_align_len, relat_interrupt,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
351 db_len_proportion))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
352
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
353
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
354 def filter_params(db, query, protein_len):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
355 ''' Calculate basic statistics of the quality of the alignment '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
356 score_dict = alignment_scoring()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
357 num_ident = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
358 count_interrupt = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
359 count_similarity = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
360 alignment_len = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
361 for i, j in zip(db.upper(), query.upper()):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
362 if i == j and i != "X":
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
363 num_ident += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
364 if j == "/" or j == "\\" or j == "*":
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
365 count_interrupt += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
366 if (i.isalpha() or i == "*") and (j.isalpha() or j == "*"):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
367 if int(score_dict["{}{}".format(i, j)]) > 0:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
368 count_similarity += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
369 ## gapless alignment length proportional to the domain protein length
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
370 relat_align_len = round((len(db) - db.count("-")) / protein_len, 3)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
371 ## proportional identical bases (except of X) to al.length
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
372 align_identity = round(num_ident / len(db), 2)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
373 ## proportional count of positive scores from scoring matrix to al. length
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
374 align_similarity = round(count_similarity / len(db), 2)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
375 ## number of interruptions per 100 bp
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
376 relat_interrupt = round(count_interrupt / math.ceil((len(query) / 100)), 2)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
377 ## Proportion of alignment to the original length of protein domain from database (indels included)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
378 db_len_proportion = round(len(db) / protein_len, 2)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
379 return align_identity, align_similarity, relat_align_len, relat_interrupt, db_len_proportion
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
380
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
381
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
382 def line_generator(tab_pipe, maf_pipe, start):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
383 ''' Yield individual lines of LASTAL stdout for single sequence '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
384 if hasattr(line_generator, "dom"):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
385 seq_id = line_generator.dom.split("\t")[6]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
386 yield line_generator.dom.encode("utf-8")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
387 del line_generator.dom
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
388 line_tab = ""
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
389 for line_tab in tab_pipe:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
390 line_tab = line_tab.decode("utf-8")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
391 if not line_tab.startswith('#'):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
392 if start:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
393 if not ('seq_id' in locals() and
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
394 seq_id != line_tab.split("\t")[6]):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
395 seq_id = line_tab.split("\t")[6]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
396 start = False
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
397 line_maf = [maf_pipe.readline() for line_count in range(4)]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
398 db_seq = line_maf[1].decode("utf-8").rstrip().split(" ")[-1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
399 alignment_seq = line_maf[2].decode("utf-8").rstrip().split(" ")[-1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
400 line = "{}\t{}\t{}".format(line_tab, db_seq, alignment_seq)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
401 line_id = line.split("\t")[6]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
402 if seq_id != line_id:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
403 line_generator.dom = line
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
404 return
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
405 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
406 yield line.encode("utf-8")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
407 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
408 maf_pipe.readline()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
409 if line_tab == "":
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
410 raise RuntimeError
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
411 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
412 return
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
413
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
414
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
415 def get_version(path, LAST_DB):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
416 branch = subprocess.check_output("git rev-parse --abbrev-ref HEAD",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
417 shell=True,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
418 cwd=path).decode('ascii').strip()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
419 shorthash = subprocess.check_output("git log --pretty=format:'%h' -n 1 ",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
420 shell=True,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
421 cwd=path).decode('ascii').strip()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
422 revcount = len(subprocess.check_output("git log --oneline",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
423 shell=True,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
424 cwd=path).decode('ascii').split())
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
425 version_string = (
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
426 "##-----------------------------------------------\n"
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
427 "##PIPELINE VERSION : "
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
428 "{branch}-rv-{revcount}({shorthash})\n"
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
429 "##PROTEIN DATABASE VERSION : {PD}\n"
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
430 "##-----------------------------------------------\n").format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
431 branch=branch,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
432 shorthash=shorthash,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
433 revcount=revcount,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
434 PD=os.path.basename(LAST_DB))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
435 return version_string
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
436
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
437
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
438 def write_info(dom_gff_tmp, version_string):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
439 dom_gff_tmp.write("{}\n".format(configuration.HEADER_GFF))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
440 dom_gff_tmp.write(version_string)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
441
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
442
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
443 def domain_search(QUERY, LAST_DB, CLASSIFICATION, OUTPUT_DOMAIN,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
444 THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
445 ''' Search for protein domains using our protein database and external tool LAST,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
446 stdout is parsed in real time and hits for a single sequence undergo further processing
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
447 - tabular format(TAB) to get info about position, score, orientation
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
448 - MAF format to gain alignment and original sequence
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
449 '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
450
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
451 step = WIN_DOM - OVERLAP_DOM
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
452 [headers, above_win, below_win, lens_above_win, seq_starts, seq_ends
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
453 ] = characterize_fasta(QUERY, WIN_DOM)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
454 query_temp = split_fasta(QUERY, WIN_DOM, step, headers, above_win,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
455 below_win, lens_above_win, seq_starts, seq_ends)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
456
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
457 ## TAB output contains all the alignment scores, positions, strands...
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
458 tab = subprocess.Popen(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
459 "lastal -F15 {} {} -L 10 -m 70 -p BL80 -f TAB".format(LAST_DB,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
460 query_temp),
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
461 stdout=subprocess.PIPE,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
462 shell=True)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
463 ## MAF output contains alignment sequences
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
464 maf = subprocess.Popen(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
465 "lastal -F15 {} {} -L 10 -m 70 -p BL80 -f MAF".format(LAST_DB,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
466 query_temp),
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
467 stdout=subprocess.PIPE,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
468 shell=True)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
469
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
470 tab_pipe = tab.stdout
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
471 maf_pipe = maf.stdout
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
472 maf_pipe.readline()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
473
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
474 seq_ids = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
475 dom_tmp = NamedTemporaryFile(delete=False)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
476 with open(dom_tmp.name, "w") as dom_gff_tmp:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
477 path = os.path.dirname(os.path.realpath(__file__))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
478 version_string = get_version(path, LAST_DB)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
479 write_info(dom_gff_tmp, version_string)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
480 gff = open(dom_tmp.name, "a")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
481 start = True
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
482 while True:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
483 try:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
484 with warnings.catch_warnings():
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
485 warnings.simplefilter("ignore")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
486 sequence_hits = np.genfromtxt(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
487 line_generator(tab_pipe, maf_pipe, start),
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
488 names=
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
489 "score, name_db, start_db, al_size_db, strand_db, seq_size_db, name_q, start_q, al_size_q, strand_q, seq_size_q, block1, block2, block3, db_seq, q_seq",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
490 usecols=
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
491 "score, name_q, start_q, al_size_q, strand_q, seq_size_q, name_db, db_seq, q_seq, seq_size_db, start_db, al_size_db",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
492 dtype=None,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
493 comments=None)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
494 except RuntimeError:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
495 break
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
496 ## if there are no domains found
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
497 if sequence_hits.size is 0:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
498 gff.write("##NO DOMAINS")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
499 return [], [], [], []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
500
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
501 ############# PARSING LASTAL OUTPUT ############################
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
502 sequence_hits = np.atleast_1d(sequence_hits)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
503 score = sequence_hits['score'].astype("int")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
504 seq_id = sequence_hits['name_q'][0].astype("str")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
505 start_hit = sequence_hits['start_q'].astype("int")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
506 end_hit = start_hit + sequence_hits['al_size_q'].astype("int")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
507 strand = sequence_hits['strand_q'].astype("str")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
508 seq_len = sequence_hits['seq_size_q'].astype("int")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
509 domain_db = sequence_hits['name_db'].astype("str")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
510 db_seq = sequence_hits['db_seq'].astype("str")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
511 query_seq = sequence_hits['q_seq'].astype("str")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
512 domain_size = sequence_hits['seq_size_db'].astype("int")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
513 db_start = sequence_hits['start_db'].astype("int") + 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
514 db_end = sequence_hits['start_db'].astype("int") + sequence_hits[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
515 'al_size_db'].astype("int")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
516
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
517 [reverse_strand_idx, positions_plus, positions_minus
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
518 ] = hits_processing(seq_len, start_hit, end_hit, strand)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
519 strand_gff = "+"
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
520 [mins_plus, maxs_plus, data_plus, indices_plus
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
521 ] = overlapping_regions(positions_plus)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
522 [mins_minus, maxs_minus, data_minus, indices_minus
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
523 ] = overlapping_regions(positions_minus)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
524 positions = positions_plus + positions_minus
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
525 indices_overal = indices_plus + [x + reverse_strand_idx
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
526 for x in indices_minus]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
527 mins = mins_plus + mins_minus
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
528 maxs = maxs_plus + maxs_minus
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
529 data = data_plus + data_minus
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
530 ## process every region (cluster) of overlapping hits sequentially
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
531 count_region = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
532 for region in indices_overal:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
533 db_names = domain_db[np.array(region)]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
534 db_starts = db_start[np.array(region)]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
535 db_ends = db_end[np.array(region)]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
536 scores = score[np.array(region)]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
537 annotations = domain_annotation(db_names, CLASSIFICATION)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
538 [score_matrix, classes_dict] = score_table(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
539 mins[count_region], maxs[count_region], data[count_region],
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
540 annotations, scores, CLASSIFICATION)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
541 ann_per_reg = score_matrix_evaluation(score_matrix, classes_dict,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
542 THRESHOLD_SCORE)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
543 [domain_type, ann_substring, unique_annotations, ann_pos_counts
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
544 ] = group_annot_regs(ann_per_reg)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
545 [best_idx, best_idx_reg] = best_score(scores, region)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
546 annotation_best = annotations[best_idx_reg]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
547 db_name_best = db_names[best_idx_reg]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
548 db_starts_best = db_starts[best_idx_reg]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
549 db_ends_best = db_ends[best_idx_reg]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
550 if count_region == len(indices_plus):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
551 strand_gff = "-"
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
552 create_gff3(domain_type, ann_substring, unique_annotations,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
553 ann_pos_counts, mins[count_region], maxs[count_region],
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
554 step, best_idx, annotation_best, db_name_best,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
555 db_starts_best, db_ends_best, strand_gff, score,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
556 seq_id, db_seq, query_seq, domain_size, positions, gff)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
557 count_region += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
558 seq_ids.append(seq_id)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
559 os.unlink(query_temp)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
560 gff.close()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
561 dom_tmp.close()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
562 ## if any sequence from input data was split into windows, merge and adjust the data from individual windows
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
563 if any("DANTE_PART" in x for x in seq_ids):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
564 adjust_gff(OUTPUT_DOMAIN, dom_tmp.name, WIN_DOM, OVERLAP_DOM, step)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
565 ## otherwise use the temporary output as the final domains gff
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
566 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
567 shutil.copy2(dom_tmp.name, OUTPUT_DOMAIN)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
568 os.unlink(dom_tmp.name)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
569
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
570
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
571 def adjust_gff(OUTPUT_DOMAIN, gff, WIN_DOM, OVERLAP_DOM, step):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
572 ''' Original gff file is adjusted in case of containing cut parts
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
573 - for consecutive sequences overlap is divided to half with first half
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
574 of records(domains) belonging to the first sequence and second to the following one.
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
575 Duplicate domains going through the middle of the overlap are removed.
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
576 First and the last part (marked as LAST) of a certain sequence are
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
577 handled separately as the are overlapped from one side only '''
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
578
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
579 seq_id_all = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
580 class_dict = defaultdict(int)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
581 seen = set()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
582 with open(OUTPUT_DOMAIN, "w") as adjusted_gff:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
583 with open(gff, "r") as primary_gff:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
584 start = True
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
585 for line in primary_gff:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
586 if line.startswith("#"):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
587 adjusted_gff.write(line)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
588 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
589 split_line = line.split("\t")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
590 classification = split_line[-1].split(";")[1].split("=")[1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
591 if start:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
592 seq_id_all.append(split_line[0].split("_DANTE_PART")[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
593 0])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
594 start = False
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
595 seq_id = split_line[0].split("_DANTE_PART")[0]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
596 if "DANTE_PART" in line:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
597 line_without_id = "\t".join(split_line[1:])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
598 part = int(split_line[0].split("_DANTE_PART")[1].split(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
599 ":")[0].split("_")[0])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
600 if seq_id != seq_id_all[-1]:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
601 seq_id_all.append(seq_id)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
602
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
603 ## first part of the sequence
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
604 if part == 1:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
605 cut_end = WIN_DOM - OVERLAP_DOM / 2
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
606 if int(split_line[3]) <= cut_end <= int(split_line[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
607 4]):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
608 if line_without_id not in seen:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
609 adjusted_gff.write("{}\t{}".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
610 seq_id, line_without_id))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
611 class_dict[classification] += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
612 seen.add(line_without_id)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
613 elif int(split_line[4]) < cut_end:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
614 adjusted_gff.write("{}\t{}".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
615 seq_id, line_without_id))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
616 class_dict[classification] += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
617
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
618 ## last part of the sequence
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
619 elif "LAST" in split_line[0]:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
620 cut_start = OVERLAP_DOM / 2 + (part - 1) * step
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
621 if int(split_line[3]) <= cut_start <= int(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
622 split_line[4]):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
623 if line_without_id not in seen:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
624 adjusted_gff.write("{}\t{}".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
625 seq_id, line_without_id))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
626 class_dict[classification] += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
627 seen.add(line_without_id)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
628 elif int(split_line[3]) > cut_start:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
629 adjusted_gff.write("{}\t{}".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
630 seq_id, line_without_id))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
631 class_dict[classification] += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
632
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
633 ## middle part of the sequence
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
634 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
635 cut_start = OVERLAP_DOM / 2 + (part - 1) * step
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
636 cut_end = WIN_DOM - OVERLAP_DOM / 2 + (part -
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
637 1) * step
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
638 if int(split_line[3]) <= cut_start <= int(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
639 split_line[4]) or int(split_line[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
640 3]) <= cut_end <= int(split_line[4]):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
641 if line_without_id not in seen:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
642 adjusted_gff.write("{}\t{}".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
643 seq_id, line_without_id))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
644 class_dict[classification] += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
645 seen.add(line_without_id)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
646 elif int(split_line[3]) > cut_start and int(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
647 split_line[4]) < cut_end:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
648 adjusted_gff.write("{}\t{}".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
649 seq_id, line_without_id))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
650 class_dict[classification] += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
651 ## not divived
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
652 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
653 if seq_id != seq_id_all[-1]:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
654 seq_id_all.append(seq_id)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
655 adjusted_gff.write(line)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
656 class_dict[classification] += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
657
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
658
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
659 def main(args):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
660
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
661 t = time.time()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
662
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
663 QUERY = args.query
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
664 LAST_DB = args.protein_database
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
665 CLASSIFICATION = args.classification
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
666 OUTPUT_DOMAIN = args.domain_gff
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
667 NEW_LDB = args.new_ldb
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
668 OUTPUT_DIR = args.output_dir
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
669 THRESHOLD_SCORE = args.threshold_score
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
670 WIN_DOM = args.win_dom
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
671 OVERLAP_DOM = args.overlap_dom
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
672
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
673 if OUTPUT_DOMAIN is None:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
674 OUTPUT_DOMAIN = configuration.DOMAINS_GFF
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
675 if os.path.isdir(LAST_DB):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
676 LAST_DB = os.path.join(LAST_DB, configuration.LAST_DB_FILE)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
677 if os.path.isdir(CLASSIFICATION):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
678 CLASSIFICATION = os.path.join(CLASSIFICATION, configuration.CLASS_FILE)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
679
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
680 if NEW_LDB:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
681 subprocess.call("lastdb -p -cR01 {} {}".format(LAST_DB, LAST_DB),
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
682 shell=True)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
683
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
684 if OUTPUT_DIR and not os.path.exists(OUTPUT_DIR):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
685 os.makedirs(OUTPUT_DIR)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
686
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
687 if not os.path.isabs(OUTPUT_DOMAIN):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
688 if OUTPUT_DIR is None:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
689 OUTPUT_DIR = configuration.TMP
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
690 if not os.path.exists(OUTPUT_DIR):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
691 os.makedirs(OUTPUT_DIR)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
692 OUTPUT_DOMAIN = os.path.join(OUTPUT_DIR,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
693 os.path.basename(OUTPUT_DOMAIN))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
694 domain_search(QUERY, LAST_DB, CLASSIFICATION, OUTPUT_DOMAIN,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
695 THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
696
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
697 print("ELAPSED_TIME_DOMAINS = {} s".format(time.time() - t))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
698
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
699
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
700 if __name__ == "__main__":
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
701 import argparse
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
702 from argparse import RawDescriptionHelpFormatter
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
703
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
704 class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
705 argparse.RawDescriptionHelpFormatter):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
706 pass
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
707
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
708 parser = argparse.ArgumentParser(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
709 description=
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
710 '''Script performs similarity search on given DNA sequence(s) in (multi)fasta against our protein domains database of all Transposable element for certain group of organisms (Viridiplantae or Metazoans). Domains are subsequently annotated and classified - in case certain domain has multiple annotations assigned, classifation is derived from the common classification level of all of them. Domains search is accomplished engaging LASTAL alignment tool.
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
711
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
712 DEPENDENCIES:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
713 - python 3.4 or higher with packages:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
714 -numpy
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
715 - lastal 744 or higher [http://last.cbrc.jp/]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
716 - configuration.py module
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
717
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
718 EXAMPLE OF USAGE:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
719
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
720 ./protein_domains_pd.py -q PATH_TO_INPUT_SEQ -pdb PATH_TO_PROTEIN_DB -cs PATH_TO_CLASSIFICATION_FILE
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
721
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
722 When running for the first time with a new database use -nld option allowing lastal to create indexed database files:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
723
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
724 -nld True
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
725
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
726 ''',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
727 epilog="""""",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
728 formatter_class=CustomFormatter)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
729
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
730 requiredNamed = parser.add_argument_group('required named arguments')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
731 requiredNamed.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
732 "-q",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
733 "--query",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
734 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
735 required=True,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
736 help=
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
737 'input DNA sequence to search for protein domains in a fasta format. Multifasta format allowed.')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
738 requiredNamed.add_argument('-pdb',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
739 "--protein_database",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
740 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
741 required=True,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
742 help='protein domains database file')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
743 requiredNamed.add_argument('-cs',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
744 '--classification',
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
745 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
746 required=True,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
747 help='protein domains classification file')
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
748 parser.add_argument("-oug",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
749 "--domain_gff",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
750 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
751 help="output domains gff format")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
752 parser.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
753 "-nld",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
754 "--new_ldb",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
755 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
756 default=False,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
757 help=
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
758 "create indexed database files for lastal in case of working with new protein db")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
759 parser.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
760 "-dir",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
761 "--output_dir",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
762 type=str,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
763 help="specify if you want to change the output directory")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
764 parser.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
765 "-thsc",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
766 "--threshold_score",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
767 type=int,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
768 default=80,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
769 help=
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
770 "percentage of the best score in the cluster to be tolerated when assigning annotations per base")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
771 parser.add_argument(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
772 "-wd",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
773 "--win_dom",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
774 type=int,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
775 default=10000000,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
776 help="window to process large input sequences sequentially")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
777 parser.add_argument("-od",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
778 "--overlap_dom",
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
779 type=int,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
780 default=10000,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
781 help="overlap of sequences in two consecutive windows")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
782
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
783 args = parser.parse_args()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
784 main(args)