5
|
1 """
|
|
Module that contains functions related to
finding open reading frames in the sequence.
|
|
4
|
|
5 Author Nedias Sept, 2016
|
|
6 """
|
|
7
|
|
8 # TODO: Currently using regular expression to match string, may change to other algorithms
|
|
9 import re
|
|
10
|
|
11
|
|
# Locate every occurrence of a pattern inside the sequence data
# input: 1.seq: sequence data to scan; it is passed straight to re.finditer
#        2.tag: pattern to look for, such as a start or end triplet from the codon table;
#               it is used as a regular expression, so RE syntax applies
# return: a list with the offset where each (non-overlapping) match begins, in scan order
def find_locations(seq, tag):
    return [hit.start() for hit in re.finditer(tag, seq)]
|
|
21
|
|
22
|
|
# Collect the offsets of every start and end triplet found in the sequence data
# input: 1.seq: sequence data, forwarded unchanged to find_locations
#        2.rev: True to search with the -strand triplets (TAC / ATT, ACT, ATC),
#               False to search with the +strand triplets (ATG / TAA, TGA, TAG)
# return: a dictionary with two keys: "starts" (offsets of the start triplet) and
#         "ends" (offsets of all three end triplets, sorted in ascending order)
def get_all_orf(seq, rev):
    if rev:
        start_tag, end_tags = "TAC", ("ATT", "ACT", "ATC")
    else:
        start_tag, end_tags = "ATG", ("TAA", "TGA", "TAG")

    start_positions = find_locations(seq, start_tag)

    end_positions = []
    for end_tag in end_tags:
        end_positions.extend(find_locations(seq, end_tag))

    # Each end triplet is searched separately, so the merged hits must be sorted
    # to put the positions back in ascending order
    # TODO: May use other RE to match all 3 end tags at the same time
    return {"starts": start_positions, "ends": sorted(end_positions)}
|
|
49
|
|
50
|
|
# Pair each start position with the first end position that closes its reading frame
# Each pair represents a possible ORF
# input: pos_dic: dictionary holding a "starts" list and an "ends" list of offsets.
#        The "ends" list is scanned in the order given, so it should be in ascending
#        order for the first qualifying end to also be the nearest one
# return: a list of pairs, one for every start that has a later end at a distance that is
#         a multiple of 3. A pair is a list of two elements: the start offset and the end
#         offset plus 3 (the position just past the end triplet).
#         The last element of the list is always an extra copy of the longest pair
#         (the earliest one when several share the longest length), or an empty list
#         when no pair was found at all
def find_all_orf(pos_dic):
    starts = pos_dic["starts"]
    ends = pos_dic["ends"]

    result = []
    max_pair = []

    for start in starts:
        # Every start scans the end list from the beginning: a start in another
        # reading frame can be closed by an end that an earlier start skipped
        for end in ends:
            if start < end and (end - start) % 3 == 0:
                stop = end + 3
                result.append([start, stop])
                # Strict comparison keeps the first pair found among equally long ones.
                # A separate list is stored so the trailing copy is independent of
                # the entry already placed in the result
                if not max_pair or (max_pair[1] - max_pair[0]) < (stop - start):
                    max_pair = [start, stop]
                # Only the first in-frame end belongs to this start
                break

    result.append(max_pair)
    return result
|
|
88
|
|
89
|
|
# Keep only the pairs that are at least as long as the designated length
# input: 1.pairs: all pairs of start and end positions; the final element is skipped
#                 (find_all_orf stores its copy of the longest pair in that slot)
#        2.length: minimum length, compared directly with (end - start) of each pair
#                  NOTE(review): an earlier comment described this as a percentage of the
#                  longest match, but the comparison is against the raw span - presumably
#                  the caller converts the percentage first; confirm against the caller
# return: list, pairs of start and end whose span is >= length, in their original order
def get_desi_pairs(pairs, length):
    return [pair for pair in pairs[:-1] if pair[1] - pair[0] >= length]
|
|
101
|
|
102
|
|
# Get the length of the longest pair found on either strand
# input: 1.pairs: all pairs of start and end positions of +strand
#        2.rev_pairs: all pairs of start and end positions of -strand
# return: the larger of the two strands' longest lengths (end - start of the last pair
#         in each list); note that this is a length, not the pair itself.
#         A list with fewer than two elements contributes 0
# TODO: Temporary use, need replace by formal method
def get_longest_pair(pairs, rev_pairs):
    # The longest pair of each strand is stored in the last position of its list,
    # so it is read from there directly. A list of length one or less holds no
    # real pair and counts as length 0
    longest_per_strand = []
    for strand_pairs in (pairs, rev_pairs):
        if len(strand_pairs) > 1:
            tail = strand_pairs[-1]
            longest_per_strand.append(tail[1] - tail[0])
        else:
            longest_per_strand.append(0)
    return max(longest_per_strand)
|
|
121
|
|
122
|
|
123
|
|
124
|
|
125
|
|
126
|
|
127
|
|
128
|
|
129
|
|
130
|