13
|
1 import time
|
|
2 import subprocess
|
|
3 import operator
|
|
4
|
|
5 from ragoo_utilities.SeqReader import SeqReader
|
|
6
|
|
7 """ A collection of various helper functions"""
|
|
8
|
|
9 complements = str.maketrans("ACGTNURYSWKMBVDHacgtnuryswkmbvdh", "TGCANAYRSWMKVBHDtgcanayrswmkvbhd")
|
|
10
|
|
11
|
|
12 def reverse_complement(seq):
|
|
13 """
|
|
14 Reverse complement a nucleotide sequence.
|
|
15 :param seq: Sequence to be reverse complemented
|
|
16 :return: A reverse complemented sequence
|
|
17 """
|
|
18 return seq.translate(complements)[::-1]
|
|
19
|
|
20
|
|
21 def run(cmnd):
|
|
22 """ Run command and report status. """
|
|
23 log('Running : %s' % cmnd)
|
|
24 if subprocess.call(cmnd, shell=True, executable='/bin/bash') != 0:
|
|
25 raise RuntimeError('Failed : %s ' % cmnd)
|
|
26
|
|
27
|
|
28 def log(message):
|
|
29 """ Log messages to standard output. """
|
|
30 print(time.ctime() + ' --- ' + message, flush=True)
|
|
31
|
|
32
|
|
33 def read_contigs(in_file):
|
|
34 d = dict()
|
|
35 x = SeqReader(in_file)
|
|
36 for header, seq in x.parse_fasta():
|
|
37 d[header.replace('>', '').split(' ')[0]] = seq
|
|
38 return d
|
|
39
|
|
40
|
|
41 def read_gz_contigs(in_file):
|
|
42 d = dict()
|
|
43 x = SeqReader(in_file)
|
|
44 for header, seq in x.parse_gzip_fasta():
|
|
45 d[header.replace('>', '').split(' ')[0]] = seq
|
|
46 return d
|
|
47
|
|
48
|
|
49 def binary_search(query, numbers, left, right):
|
|
50 """
|
|
51 The contents of this method are either influenced by or directly copied from "Assemblytics_uniq_anchor.py"
|
|
52 written by Maria Nattestad. The original script can be found here:
|
|
53
|
|
54 https://github.com/MariaNattestad/Assemblytics
|
|
55
|
|
56 And the publication associated with Maria's work is here:
|
|
57
|
|
58 Nattestad, Maria, and Michael C. Schatz. "Assemblytics: a
|
|
59 web analytics tool for the detection of variants from an
|
|
60 assembly." Bioinformatics 32.19 (2016): 3021-3023.
|
|
61 """
|
|
62 # Returns index of the matching element or the first element to the right
|
|
63
|
|
64 if left >= right:
|
|
65 return right
|
|
66 mid = (right + left) // 2
|
|
67
|
|
68 if query == numbers[mid]:
|
|
69 return mid
|
|
70 elif query < numbers[mid]:
|
|
71 return binary_search(query, numbers, left, mid)
|
|
72 else: # if query > numbers[mid]:
|
|
73 return binary_search(query, numbers, mid + 1, right)
|
|
74
|
|
75
|
|
76 def summarize_planesweep(lines, unique_length_required, keep_small_uniques=False):
|
|
77 """
|
|
78 The contents of this method are either influenced by or directly copied from "Assemblytics_uniq_anchor.py"
|
|
79 written by Maria Nattestad. The original script can be found here:
|
|
80
|
|
81 https://github.com/MariaNattestad/Assemblytics
|
|
82
|
|
83 And the publication associated with Maria's work is here:
|
|
84
|
|
85 Nattestad, Maria, and Michael C. Schatz. "Assemblytics: a
|
|
86 web analytics tool for the detection of variants from an
|
|
87 assembly." Bioinformatics 32.19 (2016): 3021-3023.
|
|
88 """
|
|
89 alignments_to_keep = []
|
|
90 # print len(lines)
|
|
91
|
|
92 # If no alignments:
|
|
93 if len(lines) == 0:
|
|
94 return []
|
|
95
|
|
96 # If only one alignment:
|
|
97 if len(lines) == 1:
|
|
98 if keep_small_uniques == True or abs(lines[0][1] - lines[0][0]) >= unique_length_required:
|
|
99 return [0]
|
|
100 else:
|
|
101 return []
|
|
102
|
|
103 starts_and_stops = []
|
|
104 for query_min, query_max in lines:
|
|
105 # print query_min, query_max
|
|
106 starts_and_stops.append((query_min, "start"))
|
|
107 starts_and_stops.append((query_max, "stop"))
|
|
108
|
|
109 sorted_starts_and_stops = sorted(starts_and_stops, key=operator.itemgetter(0))
|
|
110
|
|
111 current_coverage = 0
|
|
112 last_position = -1
|
|
113 sorted_unique_intervals_left = []
|
|
114 sorted_unique_intervals_right = []
|
|
115 for pos, change in sorted_starts_and_stops:
|
|
116
|
|
117 if current_coverage == 1:
|
|
118 # sorted_unique_intervals.append((last_position,pos))
|
|
119 sorted_unique_intervals_left.append(last_position)
|
|
120 sorted_unique_intervals_right.append(pos)
|
|
121
|
|
122 if change == "start":
|
|
123 current_coverage += 1
|
|
124 else:
|
|
125 current_coverage -= 1
|
|
126 last_position = pos
|
|
127
|
|
128 linecounter = 0
|
|
129 for query_min, query_max in lines:
|
|
130
|
|
131 i = binary_search(query_min, sorted_unique_intervals_left, 0, len(sorted_unique_intervals_left))
|
|
132
|
|
133 exact_match = False
|
|
134 if sorted_unique_intervals_left[i] == query_min and sorted_unique_intervals_right[i] == query_max:
|
|
135 exact_match = True
|
|
136 sum_uniq = 0
|
|
137 while i < len(sorted_unique_intervals_left) and sorted_unique_intervals_left[i] >= query_min and \
|
|
138 sorted_unique_intervals_right[i] <= query_max:
|
|
139 sum_uniq += sorted_unique_intervals_right[i] - sorted_unique_intervals_left[i]
|
|
140 i += 1
|
|
141
|
|
142 # print query_min,query_max,sum_uniq
|
|
143 if sum_uniq >= unique_length_required:
|
|
144 alignments_to_keep.append(linecounter)
|
|
145 elif keep_small_uniques == True and exact_match == True:
|
|
146 alignments_to_keep.append(linecounter)
|
|
147 # print "Keeping small alignment:", query_min, query_max
|
|
148 # print sorted_unique_intervals_left[i-1],sorted_unique_intervals_right[i-1]
|
|
149
|
|
150 linecounter += 1
|
|
151
|
|
152 return alignments_to_keep
|