Mercurial > repos > nedias > orf_tools
comparison ORFFinder.py @ 5:d42adca5ecc2 draft
Uploaded
author | nedias |
---|---|
date | Wed, 12 Oct 2016 00:05:12 -0400 |
parents | |
children | e5616d5101c0 |
comparison
equal
deleted
inserted
replaced
4:04de7d352a3d | 5:d42adca5ecc2 |
---|---|
1 """ | |
2 Class that contains functions related to | |
3 finding open reading frames in the sequence. | |
4 | |
5 Author Nedias Sept, 2016 | |
6 """ | |
7 | |
8 # TODO: Currently using regular expression to match string, may change to other algorithms | |
9 import re | |
10 | |
11 | |
12 # Find location of certain sequence in the sequenced data | |
13 # input: 1.seq: Sequenced data in Seq format | |
14 # 2.tag: Specific sequence such as a start or end codon from the codon table | |
15 # return: a list of locations where the designated sequence are found | |
def find_locations(seq, tag):
    """Return the start index of every occurrence of ``tag`` in ``seq``.

    Unlike a plain ``re.finditer`` scan, occurrences that overlap a previous
    hit are reported too (e.g. ``"AAA"`` occurs at 0 and 1 in ``"AAAA"``).

    Args:
        seq: The sequence to search. It is handed straight to the ``re``
            module, so it must be something ``re`` accepts (presumably a
            plain ``str`` -- TODO confirm what callers pass).
        tag: The sub-sequence to look for, e.g. a start or stop codon. It is
            compiled as a regular expression, exactly as before.

    Returns:
        A list of 0-based start positions in ascending order; empty when
        ``tag`` does not occur.
    """
    pattern = re.compile(tag)
    locs = []
    pos = 0
    while True:
        match = pattern.search(seq, pos)
        if match is None:
            break
        locs.append(match.start())
        # Resume one character after this hit (not after its end) so that
        # an occurrence overlapping the current one is still found.
        pos = match.start() + 1
    return locs
21 | |
22 | |
23 # Get all start and end positions from the sequenced data | |
24 # input: 1.seq: Sequenced data in Seq format | |
25 # 2.rev: True for -strand and False for +strand | |
26 # return: a dictionary contains all start and end positions | |
def get_all_orf(seq, rev):
    """Collect the positions of the start tag and of all end tags in ``seq``.

    Args:
        seq: The sequence to scan; passed unchanged to ``find_locations``.
        rev: Truthy selects the tag set ``TAC`` / ``ATT``, ``ACT``, ``ATC``;
            falsy selects ``ATG`` / ``TAA``, ``TGA``, ``TAG``. The header
            comment describes these as the - strand and + strand cases.

    Returns:
        A dictionary with two keys: ``"starts"`` holds the positions of the
        start tag, ``"ends"`` holds the positions of all three end tags
        merged into one ascending list.
    """
    if rev:
        start_tag = "TAC"
        end_tags = ("ATT", "ACT", "ATC")
    else:
        start_tag = "ATG"
        end_tags = ("TAA", "TGA", "TAG")

    start_positions = find_locations(seq, start_tag)

    end_positions = []
    for end_tag in end_tags:
        end_positions.extend(find_locations(seq, end_tag))
    # Each tag is searched separately, so the merged list has to be sorted
    # to put the end positions in ascending order.
    # TODO: May use other RE to match all 3 end tags at the same time
    end_positions.sort()

    return {"starts": start_positions, "ends": end_positions}
49 | |
50 | |
51 # Pair all start and end position data | |
52 # Each pair represents a possible ORF | |
53 # input: dictionary contains all start and end positions | |
54 # return: a list containing all pairs of starts and ends; the longest pair is stored at the end of the list | |
55 # a pair is a list of two elements, first is start and last is end | |
def find_all_orf(pos_dic):
    """Pair every start position with its first in-frame end position.

    For each start, the ``ends`` list is walked from its beginning and the
    first end that lies after the start at a distance that is a multiple of
    3 is taken. A start with no such end produces no pair.

    Args:
        pos_dic: Dictionary with the keys ``"starts"`` and ``"ends"``, each
            a list of integer positions (the shape ``get_all_orf`` returns).

    Returns:
        A list of ``[start, end + 3]`` pairs in the order of ``starts``,
        followed by one extra trailing element: a copy of the longest pair
        (the earliest one on ties), or an empty list when no pair was found.
    """
    starts = pos_dic["starts"]
    ends = pos_dic["ends"]

    result = []
    max_pair = []

    for start in starts:
        for end in ends:
            # Only an end located after the start and in the same reading
            # frame (distance divisible by 3) closes this start.
            if start < end and (end - start) % 3 == 0:
                # end + 3 so the pair also spans the 3-character end tag.
                result.append([start, end + 3])
                # Strict '<' keeps the earliest pair when lengths tie.
                if not max_pair or (max_pair[1] - max_pair[0]) < (end + 3 - start):
                    # A separate list object, so the trailing "longest"
                    # entry never aliases an element of the result.
                    max_pair = [start, end + 3]
                break

    # The longest pair (or [] when there is none) always goes last.
    result.append(max_pair)
    return result
88 | |
89 | |
90 # Get all pairs longer than the designated length | |
91 # input: 1.pairs: all pairs of start and end positions | |
92 # 2.length: designated length in percentage of the longest match | |
93 # return: list, pairs of start and end that are at least as long as the designated length | |
def get_desi_pairs(pairs, length):
    """Return the pairs whose span is at least ``length``.

    Args:
        pairs: List of ``[start, end]`` pairs whose last element is skipped;
            in the list built by ``find_all_orf`` that last element is the
            duplicate "longest pair" entry.
        length: Minimum span. It is compared directly against
            ``end - start``, so it is an absolute length here. The original
            comment calls it a percentage of the longest match; presumably
            the caller converts it first -- TODO confirm.

    Returns:
        A new list holding the qualifying pairs, in their original order.
    """
    # pairs[:-1] drops the trailing element, which is not a pair of its own.
    return [pair for pair in pairs[:-1] if pair[1] - pair[0] >= length]
101 | |
102 | |
103 # Get the longest pair of start and end position | |
104 # input: 1.pairs: all pairs of start and end positions of +strand | |
105 # 2.rev_pairs: all pairs of start and end positions of -strand | |
106 # return: longest pair of start and end position | |
107 # TODO: Temporary use, need replace by formal method | |
def get_longest_pair(pairs, rev_pairs):
    """Return the length of the longest pair across both strands.

    Despite the name, the value returned is a length (``end - start``), not
    the pair itself.

    Args:
        pairs: Pair list for the + strand; its last element is read as that
            strand's longest pair (the layout ``find_all_orf`` produces).
        rev_pairs: Pair list for the - strand, same layout.

    Returns:
        The larger of the two strands' longest lengths. A strand whose list
        is empty, or whose last element is the empty list ``find_all_orf``
        appends when it found no pair, counts as length 0.
    """
    # TODO: Temporary use, need replace by formal method

    def strand_longest(strand_pairs):
        # Guard against "nothing found" on this strand: either no list
        # content at all or the empty trailing placeholder.
        if not strand_pairs or not strand_pairs[-1]:
            return 0
        longest = strand_pairs[-1]
        return longest[1] - longest[0]

    return max(strand_longest(pairs), strand_longest(rev_pairs))
115 | |
116 | |
117 | |
118 | |
119 | |
120 | |
121 | |
122 | |
123 | |
124 |