#!/usr/bin/env python3

import numpy as np
import subprocess
import math
import time
from operator import itemgetter
from collections import Counter
from itertools import groupby
import os
import re
import configuration
from tempfile import NamedTemporaryFile
import sys
import warnings
import shutil
from collections import defaultdict

np.set_printoptions(threshold=sys.maxsize)


def alignment_scoring():
    ''' Create a hash table for alignment similarity scoring: to every
    combination of amino acids that can occur in an alignment, assign the
    score from the protein scoring matrix defined in the configuration file '''
    score_dict = {}
    with open(configuration.SC_MATRIX) as smatrix:
        count = 1
        for line in smatrix:
            if not line.startswith("#"):
                if count == 1:
                    aa_all = line.rstrip().replace(" ", "")
                else:
                    count_aa = 1
                    line = list(filter(None, line.rstrip().split(" ")))
                    for aa in aa_all:
                        score_dict["{}{}".format(line[0], aa)] = line[count_aa]
                        count_aa += 1
                count += 1
    return score_dict
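# A minimal sketch of the resulting lookup table (hypothetical values): the
# keys are amino-acid pairs and the values are kept as strings read from the
# matrix file, e.g. with a BLOSUM62 matrix configured:
#   >>> score_dict = alignment_scoring()
#   >>> score_dict["AA"]
#   '4'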


def characterize_fasta(QUERY, WIN_DOM):
    ''' Find the sequences, their lengths, starts, ends and whether
    they exceed the window '''
    with open(QUERY) as query:
        headers = []
        fasta_lengths = []
        seq_starts = []
        seq_ends = []
        fasta_chunk_len = 0
        count_line = 1
        for line in query:
            line = line.rstrip()
            if line.startswith(">"):
                headers.append(line.rstrip())
                fasta_lengths.append(fasta_chunk_len)
                fasta_chunk_len = 0
                seq_starts.append(count_line + 1)
                seq_ends.append(count_line - 1)
            else:
                fasta_chunk_len += len(line)
            count_line += 1
        seq_ends.append(count_line)
        seq_ends = seq_ends[1:]
        fasta_lengths.append(fasta_chunk_len)
        fasta_lengths = fasta_lengths[1:]
        # check that the names of the individual seqs are unique:
        # LASTAL takes seq IDs up to the first space, which can then create
        # problems with ambiguous records
        if len(headers) > len(set([header.split(" ")[0]
                                   for header in headers])):
            raise NameError(
                '''Sequences in multifasta format are not named correctly:
                seq IDs (before the first space) are the same''')

    above_win = [idx
                 for idx, value in enumerate(fasta_lengths) if value > WIN_DOM]
    below_win = [idx
                 for idx, value in enumerate(fasta_lengths)
                 if value <= WIN_DOM]
    lens_above_win = np.array(fasta_lengths)[above_win]
    return headers, above_win, below_win, lens_above_win, seq_starts, seq_ends
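# A sketch of the returned values, assuming a toy two-record multifasta
# (">seq1" with 7 bases on lines 2-3, ">seq2" with 2 bases on line 5) and a
# window larger than both sequences:
#   headers    -> ['>seq1', '>seq2']
#   above_win  -> []    below_win -> [0, 1]    lens_above_win -> array([])
#   seq_starts -> [2, 5]    seq_ends -> [3, 6]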


def split_fasta(QUERY, WIN_DOM, step, headers, above_win, below_win,
                lens_above_win, seq_starts, seq_ends):
    ''' Create a temporary file containing all sequences - the ones that exceed
    the window are cut with a set overlap (greater than the domain size with a reserve) '''
    with open(QUERY, "r") as query:
        count_fasta_divided = 0
        count_fasta_not_divided = 0
        ntf = NamedTemporaryFile(delete=False)
        divided = np.array(headers)[above_win]
        row_length = configuration.FASTA_LINE
        for line in query:
            line = line.rstrip()
            if line.startswith(">") and line in divided:
                stop_line = seq_ends[above_win[
                    count_fasta_divided]] - seq_starts[above_win[
                        count_fasta_divided]] + 1
                count_line = 0
                whole_seq = []
                for line2 in query:
                    whole_seq.append(line2.rstrip())
                    count_line += 1
                    if count_line == stop_line:
                        break
                whole_seq = "".join(whole_seq)
                ## create a list of starting positions for the individual parts
                ## of a seq with a step given by the window and overlap
                windows_starts = list(range(0, lens_above_win[
                    count_fasta_divided], step))
                ## create a list of ending positions (start pos + window);
                ## the last element is the whole seq length
                windows_ends = [
                    x + WIN_DOM
                    if x + WIN_DOM < lens_above_win[count_fasta_divided] else
                    lens_above_win[count_fasta_divided] for x in windows_starts
                ]
                count_part = 1
                for start_part, end_part in zip(windows_starts, windows_ends):
                    seq_part = whole_seq[start_part:end_part]
                    if count_part == len(windows_starts):
                        ntf.write("{}_DANTE_PART{}_LAST:{}-{}\n{}\n".format(
                            line.split(" ")[0], count_part, start_part + 1,
                            end_part,
                            "\n".join([seq_part[i:i + row_length]
                                       for i in range(0, len(seq_part),
                                                      row_length)
                                       ])).encode("utf-8"))
                    else:
                        ntf.write("{}_DANTE_PART{}:{}-{}\n{}\n".format(
                            line.split(" ")[0], count_part, start_part + 1,
                            end_part,
                            "\n".join([seq_part[i:i + row_length]
                                       for i in range(0, len(seq_part),
                                                      row_length)
                                       ])).encode("utf-8"))
                    count_part += 1
                count_fasta_divided += 1
            elif line.startswith(">") and line not in divided:
                length_seq = seq_ends[below_win[
                    count_fasta_not_divided]] - seq_starts[below_win[
                        count_fasta_not_divided]] + 1
                ntf.write("{}\n{}".format(line, "".join(
                    [query.readline()
                     for x in range(length_seq)])).encode("utf-8"))
                count_fasta_not_divided += 1
        query_temp = ntf.name
        ntf.close()
    return query_temp
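# A sketch of the part-naming scheme (hypothetical coordinates), assuming the
# default WIN_DOM=10000000 and OVERLAP_DOM=10000 (step 9990000): a 15 Mb
# record ">chr1 descr" would be written as two entries,
#   >chr1_DANTE_PART1:1-10000000
#   >chr1_DANTE_PART2_LAST:9990001-15000000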


def domain_annotation(elements, CLASSIFICATION):
    ''' Assign a protein domain to each hit from the protein database '''
    domains = []
    annotations = []
    with open(CLASSIFICATION, "r") as cl_tbl:
        annotation = {}
        for line in cl_tbl:
            record = line.rstrip().split("\t")
            annotation[record[0]] = record[1:]
    for i in range(len(elements)):
        domains.append(elements[i].split("__")[0].split("-")[1])
        element_name = "__".join(elements[i].split("__")[1:])
        if element_name in annotation.keys():
            annotations.append("|".join([elements[i].split("__")[0].split(
                "-")[1], ("|".join(annotation[element_name]))]))
        else:
            annotations.append("unknown|unknown")
    return annotations


def hits_processing(seq_len, start, end, strand):
    ''' Get the hit intervals separately for the forward and reverse strand '''
    reverse_strand_idx = np.where(strand == "-")[0]
    if not reverse_strand_idx.any():
        start_pos_plus = start + 1
        end_pos_plus = end
        regions_plus = list(zip(start_pos_plus, end_pos_plus))
        regions_minus = []
    else:
        reverse_strand_idx = reverse_strand_idx[0]
        start_pos_plus = start[0:reverse_strand_idx] + 1
        end_pos_plus = end[0:reverse_strand_idx]
        start_pos_minus = seq_len[0] - end[reverse_strand_idx:] + 1
        end_pos_minus = seq_len[0] - start[reverse_strand_idx:]
        regions_plus = list(zip(start_pos_plus, end_pos_plus))
        regions_minus = list(zip(start_pos_minus, end_pos_minus))
    return reverse_strand_idx, regions_plus, regions_minus
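# A sketch with hypothetical coordinates: one plus- and one minus-strand hit
# on a 100 bp sequence; minus-strand hits are flipped to forward coordinates:
#   >>> hits_processing(np.array([100, 100]), np.array([10, 20]),
#   ...                 np.array([15, 30]), np.array(["+", "-"]))
#   (1, [(11, 15)], [(71, 80)])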


def overlapping_regions(input_data):
    ''' Join all overlapping intervals (hits) into clusters (potential domains),
    get the list of start-end positions of the individual hits within each interval,
    the lists of minimums and maximums, as well as the indices in the original
    sequence_hits structure for the hits belonging to the same clusters '''
    if input_data:
        sorted_idx, sorted_data = zip(*sorted(
            [(index, data) for index, data in enumerate(input_data)],
            key=itemgetter(1)))
        merged_ends = input_data[sorted_idx[0]][1]
        intervals = []
        data = []
        output_intervals = []
        output_data = []
        for i, j in zip(sorted_idx, sorted_data):
            if input_data[i][0] < merged_ends:
                merged_ends = max(input_data[i][1], merged_ends)
                intervals.append(i)
                data.append(j)
            else:
                output_intervals.append(intervals)
                output_data.append(data)
                intervals = []
                data = []
                intervals.append(i)
                data.append(j)
                merged_ends = input_data[i][1]
        output_intervals.append(intervals)
        output_data.append(data)
        mins = [x[0][0] for x in output_data]
        maxs = [max(x, key=itemgetter(1))[1] for x in output_data]
    else:
        mins = []
        maxs = []
        output_intervals = []
        output_data = []
    return mins, maxs, output_data, output_intervals
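# A minimal sketch with hypothetical intervals: the first two hits overlap and
# form one cluster, the third is separate:
#   >>> overlapping_regions([(1, 5), (4, 10), (12, 20)])
#   ([1, 12], [10, 20], [[(1, 5), (4, 10)], [(12, 20)]], [[0, 1], [2]])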


def annotations_dict(annotations):
    ''' Hash table where the annotations of the hits within a cluster are the keys.
    Each annotation gets a serial number assigned which indexes its row in the score_table '''
    classes_dict = {classes: idx
                    for idx, classes in enumerate(set(annotations))}
    return classes_dict


def score_table(mins, maxs, data, annotations, scores, CLASSIFICATION):
    ''' The score table is created based on the occurrence of annotations in the cluster.
    Matrix axis y corresponds to the individual annotations (indexed according to classes_dict),
    axis x represents the positions of the analyzed seq in a given cluster.
    For every hit within the cluster, the array of scores at the corresponding
    positions is recorded to the table if the score is so far the highest
    for the given position and annotation '''
    classes_dict = annotations_dict(annotations)
    score_matrix = np.zeros((len(classes_dict), maxs - mins + 1), dtype=int)
    count = 0
    for item in annotations:
        saved_scores = score_matrix[classes_dict[item], data[count][0] - mins:
                                    data[count][1] - mins + 1]
        new_scores = [scores[count]] * len(saved_scores)
        score_matrix[classes_dict[item], data[count][0] - mins:data[count][
            1] - mins + 1] = [max(*pos_score)
                              for pos_score in zip(saved_scores, new_scores)]
        count += 1
    return score_matrix, classes_dict
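# A minimal worked example with hypothetical values: one annotation class "A"
# and two hits, (1, 5) scoring 50 and (3, 10) scoring 80, over a cluster
# spanning positions 1-10. Each position keeps the highest score seen:
#   >>> score_table(1, 10, [(1, 5), (3, 10)], ["A", "A"], [50, 80], None)
#   (array([[50, 50, 80, 80, 80, 80, 80, 80, 80, 80]]), {'A': 0})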


def score_matrix_evaluation(score_matrix, classes_dict, THRESHOLD_SCORE):
    ''' The score matrix is evaluated position by position.
    For every position, the list of annotations whose score reaches
    a certain percentage of the overall best score of the cluster is stored '''
    ann_per_reg = []
    overall_best_score_reg = max((score_matrix.max(axis=1)))
    for position in score_matrix.T:
        ## score threshold calculated as a percentage of the OVERALL best score in the cluster
        threshold = overall_best_score_reg * THRESHOLD_SCORE / 100
        above_th = [idx
                    for idx, score in enumerate(position)
                    if position[idx] >= threshold]
        ## select the unique annotations at one position that are above the threshold
        ann_per_pos = list(set(
            [key for key, value in classes_dict.items() if value in above_th]))
        ann_per_reg.append(ann_per_pos)
    return ann_per_reg
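# A sketch of the threshold arithmetic, assuming THRESHOLD_SCORE=80: if the
# overall best score in the cluster is 100, a position keeps an annotation
# only when its score is >= 100 * 80 / 100 = 80. Continuing the score_table
# example above, positions 0-1 (score 50) would report no annotation, while
# positions 2-9 (score 80) would report ['A'].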


def group_annot_regs(ann_per_reg):
    ''' Get the list of domains, annotations, the longest common annotation and
    the counts of positions with a certain annotation per region '''
    ## transform the list of lists (potentially multiple annotations for every position) to a flat list of all annotations
    all_annotations = [item for sublist in ann_per_reg for item in sublist]
    unique_annotations = list(set(all_annotations))
    ann_pos_counts = [all_annotations.count(x) for x in unique_annotations]
    domain_type = list(set([annotation.split("|")[0]
                            for annotation in unique_annotations]))
    classification_list = [annotation.split("|")
                           for annotation in unique_annotations]
    ann_substring = "|".join(os.path.commonprefix(classification_list))
    domain_type = "/".join(domain_type)
    return domain_type, ann_substring, unique_annotations, ann_pos_counts
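# A minimal sketch with hypothetical annotations: the common classification
# prefix is taken element-wise (os.path.commonprefix accepts any sequences,
# here lists of classification levels):
#   >>> group_annot_regs([["Ty1|LTR|Ty1_copia|Ale"],
#   ...                   ["Ty1|LTR|Ty1_copia|Angela"]])
#   domain_type   -> 'Ty1'
#   ann_substring -> 'Ty1|LTR|Ty1_copia'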


def best_score(scores, region):
    ''' From the overlapping intervals take the one with the highest score '''
    ## if more hits share the best score, take only the first one
    best_idx = region[np.where(scores == max(scores))[0][0]]
    best_idx_reg = np.where(scores == max(scores))[0][0]
    return best_idx, best_idx_reg


def create_gff3(domain_type, ann_substring, unique_annotations, ann_pos_counts,
                dom_start, dom_end, step, best_idx, annotation_best,
                db_name_best, db_starts_best, db_ends_best, strand, score,
                seq_id, db_seq, query_seq, domain_size, positions, gff,
                consensus):
    ''' Record the obtained information about the domain corresponding to
    an individual cluster in the common gff file '''
    best_start = positions[best_idx][0]
    best_end = positions[best_idx][1]
    best_score = score[best_idx]
    ## proportion of the length of the best hit to the whole region length found by base
    length_proportion = int((best_end - best_start + 1) /
                            (dom_end - dom_start + 1) * 100)
    db_seq_best = db_seq[best_idx]
    query_seq_best = query_seq[best_idx]
    domain_size_best = domain_size[best_idx]
    [percent_ident, align_similarity, relat_align_len, relat_interrupt,
     db_len_proportion
     ] = filter_params(db_seq_best, query_seq_best, domain_size_best)
    ann_substring = "|".join(ann_substring.split("|")[1:])
    annotation_best = "|".join([db_name_best] + annotation_best.split("|")[1:])
    if "DANTE_PART" in seq_id:
        part = int(seq_id.split("DANTE_PART")[1].split(":")[0].split("_")[0])
        dom_start = dom_start + (part - 1) * step
        dom_end = dom_end + (part - 1) * step
        best_start = best_start + (part - 1) * step
        best_end = best_end + (part - 1) * step
    if ann_substring == '':
        ann_substring = "NONE(Annotations from different classes)"
    if len(unique_annotations) > 1:
        unique_annotations = ",".join(["{}[{}bp]".format(
            ann, pos) for ann, pos in zip(unique_annotations, ann_pos_counts)])
    else:
        unique_annotations = unique_annotations[0]
    ## when run standalone, report DANTE as the source; when imported (PROFREP), report PROFREP
    if __name__ == '__main__':
        SOURCE = configuration.SOURCE_DANTE
    else:
        SOURCE = configuration.SOURCE_PROFREP
    if "/" in domain_type:
        gff.write(
            "{}\t{}\t{}\t{}\t{}\t.\t{}\t{}\tName={};Final_Classification=Ambiguous_domain;Region_Hits_Classifications_={}\n".format(
                seq_id, SOURCE, configuration.DOMAINS_FEATURE, dom_start,
                dom_end, strand, configuration.PHASE, domain_type,
                unique_annotations))
    else:
        gff.write(
            "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Final_Classification={};Region_Hits_Classifications={};Best_Hit={}:{}-{}[{}percent];Best_Hit_DB_Pos={}:{}of{};DB_Seq={};Region_Seq={};Query_Seq={};Identity={};Similarity={};Relat_Length={};Relat_Interruptions={};Hit_to_DB_Length={}\n".format(
                seq_id, SOURCE, configuration.DOMAINS_FEATURE, dom_start,
                dom_end, best_score, strand, configuration.PHASE, domain_type,
                ann_substring, unique_annotations, annotation_best, best_start,
                best_end, length_proportion, db_starts_best, db_ends_best,
                domain_size_best, db_seq_best, consensus, query_seq_best,
                percent_ident, align_similarity, relat_align_len,
                relat_interrupt, db_len_proportion))


def filter_params(db, query, protein_len):
    ''' Calculate basic statistics of the quality of the alignment '''
    score_dict = alignment_scoring()
    num_ident = 0
    count_interrupt = 0
    count_similarity = 0
    for i, j in zip(db.upper(), query.upper()):
        if i == j and i != "X":
            num_ident += 1
        if j == "/" or j == "\\" or j == "*":
            count_interrupt += 1
        if (i.isalpha() or i == "*") and (j.isalpha() or j == "*"):
            if int(score_dict["{}{}".format(i, j)]) > 0:
                count_similarity += 1
    ## gapless alignment length relative to the domain protein length
    relat_align_len = round((len(db) - db.count("-")) / protein_len, 3)
    ## proportion of identical residues (except X) in the alignment length
    align_identity = round(num_ident / len(db), 2)
    ## proportion of positions with a positive score in the scoring matrix to the alignment length
    align_similarity = round(count_similarity / len(db), 2)
    ## number of interruptions per 100 bp
    relat_interrupt = round(count_interrupt / math.ceil((len(query) / 100)), 2)
    ## proportion of the alignment to the original length of the protein domain from the database (indels included)
    db_len_proportion = round(len(db) / protein_len, 2)
    return align_identity, align_similarity, relat_align_len, relat_interrupt, db_len_proportion
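# A worked sketch with hypothetical sequences, assuming identities score
# positively in the matrix: db="MK-L", query="MKAL", protein_len=10 gives
# 3 identical positions out of 4 aligned columns, so
#   align_identity    -> 0.75
#   align_similarity  -> 0.75
#   relat_align_len   -> 0.3   ((4 columns - 1 gap) / 10)
#   relat_interrupt   -> 0     (no '/', '\' or '*' in the query)
#   db_len_proportion -> 0.4   (4 / 10)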


def line_generator(tab_pipe, maf_pipe, start):
    ''' Yield the individual lines of LASTAL stdout for a single sequence '''
    # the first hit of the next sequence is stashed on the function object
    # (line_generator.dom) by the previous call and re-emitted here
    if hasattr(line_generator, "dom"):
        seq_id = line_generator.dom.split("\t")[6]
        yield line_generator.dom.encode("utf-8")
        del line_generator.dom
    line_tab = ""
    for line_tab in tab_pipe:
        line_tab = line_tab.decode("utf-8")
        if not line_tab.startswith('#'):
            if start:
                if not ('seq_id' in locals() and
                        seq_id != line_tab.split("\t")[6]):
                    seq_id = line_tab.split("\t")[6]
                    start = False
            line_maf = [maf_pipe.readline() for line_count in range(4)]
            db_seq = line_maf[1].decode("utf-8").rstrip().split(" ")[-1]
            alignment_seq = line_maf[2].decode("utf-8").rstrip().split(" ")[-1]
            line = "{}\t{}\t{}".format(line_tab, db_seq, alignment_seq)
            line_id = line.split("\t")[6]
            if seq_id != line_id:
                # the hit belongs to the next sequence - stash it and stop
                line_generator.dom = line
                return
            else:
                yield line.encode("utf-8")
        else:
            maf_pipe.readline()
    if line_tab == "":
        # nothing was read from the pipe - signal the end of the LASTAL output
        raise RuntimeError
    else:
        return


def get_version(path, LAST_DB):
    ''' Return the version string; the pipeline version is included when run from a git repository '''
    version_string = (
        "##-----------------------------------------------\n"
        "##PROTEIN DATABASE VERSION : {PD}\n"
        "##-----------------------------------------------\n").format(
            PD=os.path.basename(LAST_DB))
    if os.path.exists(".git"):
        try:
            branch = subprocess.check_output("git rev-parse --abbrev-ref HEAD",
                                             shell=True,
                                             cwd=path).decode('ascii').strip()
            shorthash = subprocess.check_output(
                "git log --pretty=format:'%h' -n 1",
                shell=True,
                cwd=path).decode('ascii').strip()
            revcount = len(subprocess.check_output(
                "git log --oneline",
                shell=True,
                cwd=path).decode('ascii').splitlines())
            version_string = (
                "##-----------------------------------------------\n"
                "##PIPELINE VERSION : "
                "{branch}-rv-{revcount}({shorthash})\n"
                "##PROTEIN DATABASE VERSION : {PD}\n"
                "##-----------------------------------------------\n").format(
                    branch=branch,
                    shorthash=shorthash,
                    revcount=revcount,
                    PD=os.path.basename(LAST_DB))
        except Exception:
            # git metadata is unavailable - keep the database-only version string
            pass
    return version_string


def write_info(dom_gff_tmp, version_string):
    ''' Write the GFF header and the version info to the output file '''
    dom_gff_tmp.write("{}\n".format(configuration.HEADER_GFF))
    dom_gff_tmp.write(version_string)


def domain_search(QUERY, LAST_DB, CLASSIFICATION, OUTPUT_DOMAIN,
                  THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM, SCORING_MATRIX):
    ''' Search for protein domains using our protein database and the external tool LAST;
    stdout is parsed in real time and the hits for a single sequence undergo further processing
    - tabular format (TAB) to get info about position, score, orientation
    - MAF format to gain the alignment and the original sequence
    '''

    step = WIN_DOM - OVERLAP_DOM
    [headers, above_win, below_win, lens_above_win, seq_starts, seq_ends
     ] = characterize_fasta(QUERY, WIN_DOM)
    query_temp = split_fasta(QUERY, WIN_DOM, step, headers, above_win,
                             below_win, lens_above_win, seq_starts, seq_ends)

    ## TAB output contains all the alignment scores, positions, strands...
    lastal_columns = {
        "BL80": ("score, name_db, start_db, al_size_db, strand_db,"
                 " seq_size_db, name_q, start_q, al_size_q, strand_q,"
                 " seq_size_q, block1, block2, block3, db_seq, q_seq"),
        "BL62": ("score, name_db, start_db, al_size_db, strand_db,"
                 " seq_size_db, name_q, start_q, al_size_q, strand_q,"
                 " seq_size_q, block1, block2, block3, db_seq, q_seq"),
        "MIQS": ("score, name_db, start_db, al_size_db, strand_db,"
                 " seq_size_db, name_q, start_q, al_size_q, strand_q,"
                 " seq_size_q, block1, db_seq, q_seq"),
    }
    tab = subprocess.Popen(
        "lastal -F15 {} {} -L 10 -m 70 -p {} -e 80 -f TAB".format(
            LAST_DB, query_temp, SCORING_MATRIX),
        stdout=subprocess.PIPE,
        shell=True)
    ## MAF output contains the alignment sequences
    maf = subprocess.Popen(
        "lastal -F15 {} {} -L 10 -m 70 -p {} -e 80 -f MAF".format(
            LAST_DB, query_temp, SCORING_MATRIX),
        stdout=subprocess.PIPE,
        shell=True)

    tab_pipe = tab.stdout
    maf_pipe = maf.stdout
    maf_pipe.readline()

    seq_ids = []
    dom_tmp = NamedTemporaryFile(delete=False)
    with open(dom_tmp.name, "w") as dom_gff_tmp:
        path = os.path.dirname(os.path.realpath(__file__))
        version_string = get_version(path, LAST_DB)
        write_info(dom_gff_tmp, version_string)
    gff = open(dom_tmp.name, "a")
    start = True
    while True:
        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                sequence_hits = np.genfromtxt(
                    line_generator(tab_pipe, maf_pipe, start),
                    names=lastal_columns[SCORING_MATRIX],
                    usecols=("score, name_q, start_q, al_size_q,"
                             " strand_q, seq_size_q, name_db, db_seq,"
                             " q_seq, seq_size_db, start_db, al_size_db"),
                    dtype=None,
                    comments=None)
        except RuntimeError:
            break
        ## if there are no domains found
        if sequence_hits.size == 0:
            gff.write("##NO DOMAINS")
            return [], [], [], []

        ############# PARSING LASTAL OUTPUT ############################
        sequence_hits = np.atleast_1d(sequence_hits)
        score = sequence_hits['score'].astype("int")
        seq_id = sequence_hits['name_q'][0].astype("str")
        start_hit = sequence_hits['start_q'].astype("int")
        end_hit = start_hit + sequence_hits['al_size_q'].astype("int")
        strand = sequence_hits['strand_q'].astype("str")
        seq_len = sequence_hits['seq_size_q'].astype("int")
        domain_db = sequence_hits['name_db'].astype("str")
        db_seq = sequence_hits['db_seq'].astype("str")
        query_seq = sequence_hits['q_seq'].astype("str")
        domain_size = sequence_hits['seq_size_db'].astype("int")
        db_start = sequence_hits['start_db'].astype("int") + 1
        db_end = sequence_hits['start_db'].astype("int") + sequence_hits[
            'al_size_db'].astype("int")

        [reverse_strand_idx, positions_plus, positions_minus
         ] = hits_processing(seq_len, start_hit, end_hit, strand)
        strand_gff = "+"
        [mins_plus, maxs_plus, data_plus, indices_plus
         ] = overlapping_regions(positions_plus)
        [mins_minus, maxs_minus, data_minus, indices_minus
         ] = overlapping_regions(positions_minus)
        positions = positions_plus + positions_minus
        indices_overall = indices_plus + [x + reverse_strand_idx
                                          for x in indices_minus]
        mins = mins_plus + mins_minus
        maxs = maxs_plus + maxs_minus
        data = data_plus + data_minus
        ## process every region (cluster) of overlapping hits sequentially
        count_region = 0
        for region in indices_overall:
            db_names = domain_db[np.array(region)]
            db_starts = db_start[np.array(region)]
            db_ends = db_end[np.array(region)]
            scores = score[np.array(region)]
            regions_above_threshold = [
                region[i]
                for i, _ in enumerate(region)
                if max(scores) / 100 * THRESHOLD_SCORE < scores[i]
            ]
            ## sort by score first:
            consensus = get_full_translation(
                translation_alignments(
                    query_seq=sortby(query_seq[regions_above_threshold],
                                     score[regions_above_threshold], True),
                    start_hit=sortby(start_hit[regions_above_threshold],
                                     score[regions_above_threshold], True),
                    end_hit=sortby(end_hit[regions_above_threshold],
                                   score[regions_above_threshold], True)))

            annotations = domain_annotation(db_names, CLASSIFICATION)
            [score_matrix, classes_dict] = score_table(
                mins[count_region], maxs[count_region], data[count_region],
                annotations, scores, CLASSIFICATION)
            ann_per_reg = score_matrix_evaluation(score_matrix, classes_dict,
                                                  THRESHOLD_SCORE)
            [domain_type, ann_substring, unique_annotations, ann_pos_counts
             ] = group_annot_regs(ann_per_reg)
            [best_idx, best_idx_reg] = best_score(scores, region)
            annotation_best = annotations[best_idx_reg]
            db_name_best = db_names[best_idx_reg]
            db_starts_best = db_starts[best_idx_reg]
            db_ends_best = db_ends[best_idx_reg]
            if count_region == len(indices_plus):
                strand_gff = "-"
            if strand_gff == "+":
                feature_start = min(start_hit[regions_above_threshold]) + 1
                feature_end = max(end_hit[regions_above_threshold])
            else:
                feature_end = seq_len[region][0] - min(start_hit[
                    regions_above_threshold])
                feature_start = seq_len[region][0] - max(end_hit[
                    regions_above_threshold]) + 1
            create_gff3(domain_type, ann_substring, unique_annotations,
                        ann_pos_counts, feature_start, feature_end,
                        step, best_idx, annotation_best, db_name_best,
                        db_starts_best, db_ends_best, strand_gff, score,
                        seq_id, db_seq, query_seq, domain_size, positions,
                        gff, consensus)
            count_region += 1
        seq_ids.append(seq_id)
    os.unlink(query_temp)
    gff.close()
    dom_tmp.close()
    ## if any sequence from the input data was split into windows, merge and adjust the data from the individual windows
    if any("DANTE_PART" in x for x in seq_ids):
        adjust_gff(OUTPUT_DOMAIN, dom_tmp.name, WIN_DOM, OVERLAP_DOM, step)
    ## otherwise use the temporary output as the final domains gff
    else:
        shutil.copy2(dom_tmp.name, OUTPUT_DOMAIN)
    os.unlink(dom_tmp.name)


def sortby(a, by, reverse=False):
    ''' Sort the items of a according to the values in the by list '''
    a_sorted = [i[0] for i in
                sorted(
                    zip(a, by),
                    key=lambda i: i[1],
                    reverse=reverse)]
    return a_sorted
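# A minimal sketch:
#   >>> sortby(["a", "b", "c"], [3, 1, 2], reverse=True)
#   ['a', 'c', 'b']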


def a2nnn(s):
    ''' Expand an aligned protein string to nucleotide length: each residue
    becomes three characters, gaps are dropped and frameshift marks consume
    characters from the flanking codons '''
    s1 = "".join([c if c in ['/', '\\'] else c + c + c
                  for c in s.replace("-", "")])
    # collapse frameshifts (/)
    s2 = re.sub("[a-zA-Z*]{2}//[a-zA-Z*]{2}", "//", s1)
    s3 = re.sub("[a-zA-Z*]/[a-zA-Z*]", "/", s2)
    return s3
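# A minimal sketch: residues are expanded to codon length, and a frameshift
# mark consumes flanking characters so the overall nucleotide length is kept:
#   >>> a2nnn("MK")
#   'MMMKKK'
#   >>> a2nnn("M/K")
#   'MM/KK'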


def rle(s):
    ''' Run length encoding where the maximum run length is 3 (codon) '''
    prev = ""
    count = 1
    char = []
    length = []
    L = 0
    for n in s:
        if n == prev and count < (3 - L):
            count += 1
        else:
            char.append(prev)
            length.append(count)
            L = 1 if prev == "/" else 0
            prev = n
            count = 1
    char.append(prev)
    length.append(count)
    sequence = char[1:]
    return sequence, length[1:]
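# A minimal sketch: runs longer than one codon are split, and after a
# frameshift ('/') the run limit drops to 2 to complete the broken codon:
#   >>> rle("AAAAB")
#   (['A', 'A', 'B'], [3, 1, 1])
#   >>> rle("/AAA")
#   (['/', 'A', 'A'], [1, 2, 1])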


def get_full_translation(translations):
    ''' Get one full-length translation from multiple partial
    aligned translations '''
    # find a minimal set of alignments
    minimal_set = []
    not_filled_prev = len(translations[0])
    for s in translations:
        minimal_set.append(s)
        # check the defined positions - is there only the '-' character?
        not_filled = sum([set(i) == {"-"} for i in zip(*minimal_set)])
        if not_filled == 0:
            break
        if not_filled == not_filled_prev:
            # the last added sequence did not improve the coverage - remove it
            minimal_set.pop()
        not_filled_prev = not_filled
    # merge the translations
    final_translation = minimal_set[0]
    # record the positions of joins to correct the frameshift reporting
    position_of_joins = set()
    position_of_joins_rle = set()
    if len(minimal_set) > 1:  # the translations must be merged
        for s in minimal_set[1:]:
            s1 = re.search(r"-\w", final_translation)
            s2 = re.search(r"\w-", final_translation)
            if s1:
                position_of_joins.add(s1.start())
            if s2:
                position_of_joins.add((s2.end() - 1))
            final_translation = "".join(
                [b if a == "-" else a for a, b in zip(final_translation, s)])
    translation_rle = rle(final_translation)
    cumsumed_positions = np.cumsum(translation_rle[1])
    for p in position_of_joins:
        position_of_joins_rle.add(sum(cumsumed_positions <= p))
    # insert / or \ at the joins when necessary
    for p in position_of_joins_rle:
        if translation_rle[0][p] not in ['/', "//", "\\", "\\\\"]:
            if translation_rle[1][p] == 2:
                translation_rle[0][p] = translation_rle[0][p] + "/"
            if translation_rle[1][p] == 1:
                translation_rle[0][p] = "\\"
    consensus = "".join(translation_rle[0])
    return consensus
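# A minimal sketch with two hypothetical partial translations covering
# complementary halves of the region:
#   >>> get_full_translation(["AAA---", "---CCC"])
#   'AAACCC'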


def translation_alignments(query_seq, start_hit, end_hit):
    ''' Pad the nucleotide-length translations with '-' so that all
    alignments cover the same region '''
    pstart = min(start_hit)
    pend = max(end_hit)
    nnn = list()
    for s, start, end in zip(query_seq, start_hit, end_hit):
        nnn.append("-" * (start - pstart) + a2nnn(s) + "-" * (pend - end))
    return nnn
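# A minimal sketch with hypothetical hits: two overlapping alignments are
# padded to span the same window:
#   >>> translation_alignments(["MK", "KL"], [3, 6], [9, 12])
#   ['MMMKKK---', '---KKKLLL']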


def adjust_gff(OUTPUT_DOMAIN, gff, WIN_DOM, OVERLAP_DOM, step):
    ''' The original gff file is adjusted if it contains cut parts
    - for consecutive sequences the overlap is divided in half, with the first half
    of the records (domains) belonging to the first sequence and the second to the following one.
    Duplicate domains crossing the middle of the overlap are removed.
    The first and the last part (marked as LAST) of a certain sequence are
    handled separately as they are overlapped from one side only '''

    seq_id_all = []
    class_dict = defaultdict(int)
    seen = set()
    with open(OUTPUT_DOMAIN, "w") as adjusted_gff:
        with open(gff, "r") as primary_gff:
            start = True
            for line in primary_gff:
                if line.startswith("#"):
                    adjusted_gff.write(line)
                else:
                    split_line = line.split("\t")
                    classification = split_line[-1].split(";")[1].split("=")[1]
                    if start:
                        seq_id_all.append(split_line[0].split(
                            "_DANTE_PART")[0])
                        start = False
                    seq_id = split_line[0].split("_DANTE_PART")[0]
                    if "DANTE_PART" in line:
                        line_without_id = "\t".join(split_line[1:])
                        part = int(split_line[0].split("_DANTE_PART")[1].split(
                            ":")[0].split("_")[0])
                        if seq_id != seq_id_all[-1]:
                            seq_id_all.append(seq_id)

                        ## first part of the sequence
                        if part == 1:
                            cut_end = WIN_DOM - OVERLAP_DOM / 2
                            if int(split_line[3]) <= cut_end <= int(
                                    split_line[4]):
                                if line_without_id not in seen:
                                    adjusted_gff.write("{}\t{}".format(
                                        seq_id, line_without_id))
                                    class_dict[classification] += 1
                                    seen.add(line_without_id)
                            elif int(split_line[4]) < cut_end:
                                adjusted_gff.write("{}\t{}".format(
                                    seq_id, line_without_id))
                                class_dict[classification] += 1

                        ## last part of the sequence
                        elif "LAST" in split_line[0]:
                            cut_start = OVERLAP_DOM / 2 + (part - 1) * step
                            if int(split_line[3]) <= cut_start <= int(
                                    split_line[4]):
                                if line_without_id not in seen:
                                    adjusted_gff.write("{}\t{}".format(
                                        seq_id, line_without_id))
                                    class_dict[classification] += 1
                                    seen.add(line_without_id)
                            elif int(split_line[3]) > cut_start:
                                adjusted_gff.write("{}\t{}".format(
                                    seq_id, line_without_id))
                                class_dict[classification] += 1

                        ## middle part of the sequence
                        else:
                            cut_start = OVERLAP_DOM / 2 + (part - 1) * step
                            cut_end = WIN_DOM - OVERLAP_DOM / 2 + (part -
                                                                   1) * step
                            if int(split_line[3]) <= cut_start <= int(
                                    split_line[4]) or int(split_line[
                                        3]) <= cut_end <= int(split_line[4]):
                                if line_without_id not in seen:
                                    adjusted_gff.write("{}\t{}".format(
                                        seq_id, line_without_id))
                                    class_dict[classification] += 1
                                    seen.add(line_without_id)
                            elif int(split_line[3]) > cut_start and int(
                                    split_line[4]) < cut_end:
                                adjusted_gff.write("{}\t{}".format(
                                    seq_id, line_without_id))
                                class_dict[classification] += 1
                    ## not divided
                    else:
                        if seq_id != seq_id_all[-1]:
                            seq_id_all.append(seq_id)
                        adjusted_gff.write(line)
                        class_dict[classification] += 1


def main(args):

    t = time.time()

    QUERY = args.query
    LAST_DB = args.protein_database
    CLASSIFICATION = args.classification
    OUTPUT_DOMAIN = args.domain_gff
    NEW_LDB = args.new_ldb
    OUTPUT_DIR = args.output_dir
    THRESHOLD_SCORE = args.threshold_score
    WIN_DOM = args.win_dom
    OVERLAP_DOM = args.overlap_dom
    SCORING_MATRIX = args.scoring_matrix
    configuration.SC_MATRIX = configuration.SC_MATRIX_SKELETON.format(
        SCORING_MATRIX)

    if OUTPUT_DOMAIN is None:
        OUTPUT_DOMAIN = configuration.DOMAINS_GFF
    if os.path.isdir(LAST_DB):
        LAST_DB = os.path.join(LAST_DB, configuration.LAST_DB_FILE)
    if os.path.isdir(CLASSIFICATION):
        CLASSIFICATION = os.path.join(CLASSIFICATION, configuration.CLASS_FILE)

    if NEW_LDB:
        subprocess.call("lastdb -p -cR01 {} {}".format(LAST_DB, LAST_DB),
                        shell=True)

    if OUTPUT_DIR and not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    if not os.path.isabs(OUTPUT_DOMAIN):
        if OUTPUT_DIR is None:
            OUTPUT_DIR = configuration.TMP
            if not os.path.exists(OUTPUT_DIR):
                os.makedirs(OUTPUT_DIR)
        OUTPUT_DOMAIN = os.path.join(OUTPUT_DIR,
                                     os.path.basename(OUTPUT_DOMAIN))
    domain_search(QUERY, LAST_DB, CLASSIFICATION, OUTPUT_DOMAIN,
                  THRESHOLD_SCORE, WIN_DOM, OVERLAP_DOM, SCORING_MATRIX)

    print("ELAPSED_TIME_DOMAINS = {} s".format(time.time() - t))


if __name__ == "__main__":
    import argparse
    from argparse import RawDescriptionHelpFormatter

    class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
                          argparse.RawDescriptionHelpFormatter):
        pass

    parser = argparse.ArgumentParser(
        description=
        '''The script performs a similarity search of the given DNA sequence(s) in (multi)fasta format against our protein domain database of all transposable elements for a certain group of organisms (Viridiplantae or Metazoans). Domains are subsequently annotated and classified - in case a certain domain has multiple annotations assigned, the classification is derived from the common classification level of all of them. The domain search is accomplished using the LASTAL alignment tool.

    DEPENDENCIES:
        - python 3.4 or higher with packages:
            - numpy
        - lastal 744 or higher [http://last.cbrc.jp/]
        - configuration.py module

    EXAMPLE OF USAGE:

        ./protein_domains_pd.py -q PATH_TO_INPUT_SEQ -pdb PATH_TO_PROTEIN_DB -cs PATH_TO_CLASSIFICATION_FILE

    When running for the first time with a new database, use the -nld option, allowing lastal to create the indexed database files:

        -nld True

        ''',
        epilog="""""",
        formatter_class=CustomFormatter)

    requiredNamed = parser.add_argument_group('required named arguments')
    requiredNamed.add_argument(
        "-q",
        "--query",
        type=str,
        required=True,
        help=
        'input DNA sequence in fasta format to search for protein domains. Multifasta format allowed.')
    requiredNamed.add_argument('-pdb',
                               "--protein_database",
                               type=str,
                               required=True,
                               help='protein domains database file')
    requiredNamed.add_argument('-cs',
                               '--classification',
                               type=str,
                               required=True,
                               help='protein domains classification file')
    parser.add_argument("-oug",
                        "--domain_gff",
                        type=str,
                        help="output domains gff format")
    parser.add_argument(
        "-nld",
        "--new_ldb",
        type=str,
        default=False,
        help=
        "create indexed database files for lastal when working with a new protein db")
    parser.add_argument(
        "-dir",
        "--output_dir",
        type=str,
        help="specify if you want to change the output directory")
    parser.add_argument(
        "-M",
        "--scoring_matrix",
        type=str,
        default="BL80",
        choices=['BL80', 'BL62', 'MIQS'],
        help="specify the scoring matrix to use for the similarity search (BL80, BL62, MIQS)")
    parser.add_argument(
        "-thsc",
        "--threshold_score",
        type=int,
        default=80,
        help=
        "percentage of the best score in the cluster to be tolerated when assigning annotations per base")
    parser.add_argument(
        "-wd",
        "--win_dom",
        type=int,
        default=10000000,
        help="window to process large input sequences sequentially")
    parser.add_argument("-od",
                        "--overlap_dom",
                        type=int,
                        default=10000,
                        help="overlap of sequences in two consecutive windows")

    args = parser.parse_args()
    main(args)