annotate ORFFinder.py @ 5:d42adca5ecc2 draft

Uploaded
author nedias
date Wed, 12 Oct 2016 00:05:12 -0400
parents
children e5616d5101c0
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
1 """
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
2 Class that contains functions related to
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
3 finding open reading frames in the sequence.
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
4
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
5 Author Nedias Sept, 2016
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
6 """
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
7
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
8 # TODO: Currently using regular expression to match string, may change to other algorithms
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
9 import re
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
10
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
11
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
12 # Find location of certain sequence in the sequenced data
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
13 # input: 1.seq: Sequenced data in Seq format
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
14 # 2.tag: Specific sequence such as start and end in the codon table
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
15 # return: a list of locations where the designated sequence are found
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
def find_locations(seq, tag):
    """Return the start offset of every match of ``tag`` in ``seq``.

    ``tag`` is handed to ``re.finditer`` unchanged, so it is treated as a
    regular-expression pattern and matches are reported left to right
    without overlapping.

    input:  1.seq: text to scan
            2.tag: pattern to look for (e.g. a start or end codon)
    return: list of match start offsets, in the order they were found
    """
    return [match.start() for match in re.finditer(tag, seq)]
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
21
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
22
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
23 # Get all start and end positions from the sequenced data
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
24 # input: 1.seq: Sequenced data in Seq format
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
25 # 2.rev: True for -strand and False for +strand
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
26 # return: a dictionary contains all start and end positions
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
def get_all_orf(seq, rev):
    """Collect the offsets of the start tag and of the three end tags in ``seq``.

    input:  1.seq: sequence text to scan
            2.rev: True to use the -strand tag set, False for the +strand set
    return: dictionary with two keys, "starts" and "ends", each holding a
            list of offsets; "ends" is sorted in ascending order
    """
    if rev:
        start_tag, end_tags = "TAC", ("ATT", "ACT", "ATC")
    else:
        start_tag, end_tags = "ATG", ("TAA", "TGA", "TAG")

    start_positions = find_locations(seq, start_tag)

    # Each end tag is searched separately, so the merged list must be sorted
    # afterwards to put the offsets back in ascending order.
    # TODO: May use other RE to match all 3 end tags at the same time
    end_positions = []
    for end_tag in end_tags:
        end_positions.extend(find_locations(seq, end_tag))
    end_positions.sort()

    return {"starts": start_positions, "ends": end_positions}
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
49
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
50
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
51 # Pair all start and end position data
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
52 # Each pair represents a possible ORF
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
53 # input: dictionary contains all start and end positions
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
54 # return: a list containing all pairs of starts and ends; the longest pair is stored at the end of the list
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
55 # a pair is a list of two elements, first is start and last is end
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
def find_all_orf(pos_dic):
    """Pair every start position with the first in-frame end position after it.

    input:  dictionary holding the position lists under "starts" and "ends"
    return: list of [start, end + 3] pairs, one per start that has a matching
            end, in the order the starts are listed; one extra element is
            appended last: the longest pair found, or an empty list when no
            pair was found at all
    """
    start_positions = pos_dic["starts"]
    end_positions = pos_dic["ends"]

    orfs = []
    longest = []

    for begin in start_positions:
        # First end (in list order) that lies after this start and whose
        # distance from it is a multiple of 3.
        stop = next(
            (pos for pos in end_positions
             if begin < pos and (pos - begin) % 3 == 0),
            None,
        )
        if stop is None:
            # No usable end for this start: it contributes no pair.
            continue

        # The stored end is shifted by 3 (presumably so the pair also covers
        # the 3-letter end tag itself).
        finish = stop + 3
        orfs.append([begin, finish])

        # Strict comparison: on a tie the earlier pair remains the longest.
        if not longest or (longest[1] - longest[0]) < (finish - begin):
            longest = [begin, finish]

    # The longest pair (or [] if none) always travels as the last element.
    orfs.append(longest)
    return orfs
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
88
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
89
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
90 # Get all pairs longer than the designated length
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
91 # input: 1.pairs: all pairs of start and end positions
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
92 # 2.length: designated length in percentage of the longest match
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
93 # return: list, pairs of start and end that longer than the designated length
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
def get_desi_pairs(pairs, length):
    """Select the pairs whose span reaches the designated length.

    The last element of ``pairs`` is never examined, so it is never part of
    the result.

    input:  1.pairs: list of [start, end] pairs
            2.length: minimum value of end - start for a pair to be kept
    return: list of the kept pairs, in their original order
    """
    candidates = pairs[:-1]
    return [orf for orf in candidates if orf[1] - orf[0] >= length]
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
101
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
102
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
103 # Get the longest pair of start and end position
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
104 # input: 1.pairs: all pairs of start and end positions of +strand
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
105 # 2.rev_pairs: all pairs of start and end positions of -strand
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
106 # return: length (end - start) of the longest pair over both strands
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
107 # TODO: Temporary use, need replace by formal method
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
def get_longest_pair(pairs, rev_pairs):
    """Return the length of the longest pair found on either strand.

    Only the last element of each list is read, because find_all_orf stores
    the longest pair of a strand there. When a strand produced no pair at
    all, that last element is an empty list; such a strand (and an empty
    list of pairs) counts as length 0 instead of raising IndexError.

    input:  1.pairs: all pairs of start and end positions of +strand
            2.rev_pairs: all pairs of start and end positions of -strand
    return: the larger of the two end - start values (0 if neither strand
            has a pair)
    """
    # TODO: Temporary use, need replace by formal method

    def longest_span(pair_list):
        # Empty list, or the empty "no pair found" marker in last position.
        if not pair_list or not pair_list[-1]:
            return 0
        start, end = pair_list[-1][0], pair_list[-1][1]
        return end - start

    return max(longest_span(pairs), longest_span(rev_pairs))
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
115
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
116
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
117
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
118
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
119
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
120
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
121
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
122
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
123
d42adca5ecc2 Uploaded
nedias
parents:
diff changeset
124