comparison ORFFinder.py @ 5:d42adca5ecc2 draft

Uploaded
author nedias
date Wed, 12 Oct 2016 00:05:12 -0400
parents
children e5616d5101c0
comparison
equal deleted inserted replaced
4:04de7d352a3d 5:d42adca5ecc2
1 """
2 Module that contains functions related to
3 finding open reading frames in a sequence.
4
5 Author Nedias Sept, 2016
6 """
7
8 # TODO: Currently using regular expression to match string, may change to other algorithms
9 import re
10
11
def find_locations(seq, tag):
    """Return the start index of every match of ``tag`` in ``seq``.

    ``tag`` is passed to ``re.finditer`` as a regular-expression pattern, so
    matches are reported from left to right and never overlap one another.

    input:  1.seq: sequenced data to scan; it must be an object that
              ``re.finditer`` can search (e.g. a plain string)
            2.tag: pattern to look for, such as a start or end codon from
              the codon table
    return: a list of 0-based positions where the pattern was found, in
            ascending order (empty when there is no match)
    """
    # NOTE(review): the original header says seq arrives "in Seq format";
    # presumably the caller converts it to a string first -- confirm.
    return [match.start() for match in re.finditer(tag, seq)]
21
22
def get_all_orf(seq, rev):
    """Collect every start-tag and end-tag position found in ``seq``.

    input:  1.seq: sequenced data to scan
            2.rev: true value for the - strand, false value for the + strand
    return: a dictionary with two keys: "starts" holds the positions of the
            start tag, "ends" holds the positions of all three end tags
            merged into one list sorted in ascending order
    """
    # The - strand tags are the base-by-base complements of the + strand
    # tags, written in the same left-to-right order.
    # NOTE(review): presumably the caller supplies seq in the orientation
    # these tags expect -- confirm against the caller.
    if rev:
        start_tag, end_tags = "TAC", ("ATT", "ACT", "ATC")
    else:
        start_tag, end_tags = "ATG", ("TAA", "TGA", "TAG")

    start_positions = find_locations(seq, start_tag)

    end_positions = []
    for end_tag in end_tags:
        end_positions.extend(find_locations(seq, end_tag))

    # The three end-tag scans are independent, so their merged positions have
    # to be sorted before they can be handed on in ascending order.
    # TODO: May use other RE to match all 3 end tags at the same time
    return {"starts": start_positions, "ends": sorted(end_positions)}
49
50
def find_all_orf(pos_dic):
    """Pair every start position with the first in-frame end after it.

    Each pair represents a possible ORF.

    input:  pos_dic: dictionary with a "starts" list and an "ends" list of
            positions; "ends" must be sorted in ascending order
    return: a list of pairs. Each pair is a two-element list whose first
            element is the start position and whose second element is the
            matched end position plus 3. One pair is produced for every
            start that has an end after it at a distance that is a multiple
            of 3. A final extra entry holds a separate copy of the longest
            pair (the earliest one on ties); it is an empty list when no
            pair was found.
    """
    # Imported locally so the function does not depend on a new file-level
    # import.
    from bisect import bisect_right

    starts = pos_dic["starts"]
    ends = pos_dic["ends"]
    end_count = len(ends)

    result = []
    max_pair = []

    for start in starts:
        # Jump straight to the first end located after this start instead of
        # rescanning the sorted ends from the beginning for every start.
        index_end = bisect_right(ends, start)
        while index_end < end_count:
            end = ends[index_end]
            # Only an end whose distance from the start is a multiple of 3
            # lies in the same reading frame.
            if (end - start) % 3 == 0:
                result.append([start, end + 3])
                # Strict comparison: on equal lengths the earlier pair stays.
                if not max_pair or (max_pair[1] - max_pair[0]) < (end + 3 - start):
                    max_pair = [start, end + 3]
                break
            index_end += 1

    # Callers read the longest pair from the last slot of the list.
    result.append(max_pair)
    return result
88
89
def get_desi_pairs(pairs, length):
    """Return the pairs whose span is at least ``length``.

    input:  1.pairs: list of [start, end] pairs whose final entry is the
              extra longest-pair slot appended by find_all_orf; that final
              entry is always skipped
            2.length: minimum span (end - start) a pair must reach. The value
              is compared directly against each span, so a threshold given
              as a percentage of the longest match has to be converted to an
              absolute length before calling.
    return: list of the pairs (the same pair objects, in their original
            order) whose span is greater than or equal to ``length``
    """
    return [pair for pair in pairs[:-1] if pair[1] - pair[0] >= length]
101
102
# TODO: Temporary use, need replace by formal method
def get_longest_pair(pairs, rev_pairs):
    """Return the length of the longest pair found on either strand.

    input:  1.pairs: all pairs of start and end positions of the + strand
            2.rev_pairs: all pairs of start and end positions of the - strand
    return: the larger of the two strands' longest spans (end - start), as a
            number; 0 when neither strand has a pair

    The longest pair of each strand is stored in the last position of its
    pair list, so only that last entry is inspected.
    """
    def longest_span(strand_pairs):
        # The last slot is an empty list when the strand produced no pair,
        # and the list itself may be empty; both count as length 0 instead
        # of raising IndexError.
        if not strand_pairs or not strand_pairs[-1]:
            return 0
        longest = strand_pairs[-1]
        return longest[1] - longest[0]

    return max(longest_span(pairs), longest_span(rev_pairs))
115
116
117
118
119
120
121
122
123
124