# HG changeset patch
# User nedias
# Date 1476245112 14400
# Node ID d42adca5ecc24ddf5849872551fc3a6c9f7b62d4
# Parent  04de7d352a3db8c51f791d02d8cfcb5ce58845f0
Uploaded

diff -r 04de7d352a3d -r d42adca5ecc2 ORFFinder.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ORFFinder.py	Wed Oct 12 00:05:12 2016 -0400
@@ -0,0 +1,124 @@
+"""
+ Module that contains functions related to
+ finding open reading frames in the sequence.
+
+ Author Nedias Sept, 2016
+"""
+
+# TODO: Currently using regular expression to match string, may change to other algorithms
+import re
+
+
+# Find the locations of a certain sequence in the sequenced data
+# input: 1.seq: Sequenced data in Seq format
+#        2.tag: Specific sequence, such as a start or end codon in the codon table
+# return: a list of locations where the designated sequence is found
+def find_locations(seq, tag):
+    locs = []
+    for m in re.finditer(tag, seq):
+        locs.append(m.start())
+    return locs
+
+
+# Get all start and end positions from the sequenced data
+# input: 1.seq: Sequenced data in Seq format
+#        2.rev: True for -strand and False for +strand
+# return: a dictionary containing all start and end positions under the keys "starts" and "ends"
+def get_all_orf(seq, rev):
+    result = dict()
+
+    if rev:
+        sta = "TAC"
+        end_1 = "ATT"
+        end_2 = "ACT"
+        end_3 = "ATC"
+    else:
+        sta = "ATG"
+        end_1 = "TAA"
+        end_2 = "TGA"
+        end_3 = "TAG"
+
+    result["starts"] = find_locations(seq, sta)
+    result["ends"] = find_locations(seq, end_1)
+    result["ends"] += find_locations(seq, end_2)
+    result["ends"] += find_locations(seq, end_3)
+    # Must be sorted to make sure the positions are in ascending order
+    # TODO: May use other RE to match all 3 end tags at the same time
+    result["ends"].sort()
+    return result
+
+
+# Pair all start and end position data
+# Each pair represents a possible ORF
+# input: dictionary containing all start and end positions
+# return: a list containing all pairs of starts and ends; the longest pair is stored again at the end of the list
+#         a pair is a list of two elements, first is the start and last is the matched end position plus 3
+def find_all_orf(pos_dic):
+
+    starts = pos_dic["starts"]
+    ends = pos_dic["ends"]
+
+    result = []
+
+    max_pair = []
+
+    index_end = 0
+
+    # Loop all starts
+    for start in starts:
+        # Loop till the end of the ends list
+        while index_end < len(ends):
+            end = ends[index_end]
+            # If the start is before the end, and the length between start and end is a multiple of 3
+            if start < end and (end - start) % 3 == 0:
+                # It will be a possible ORF, store in the result list
+                result.append([start, end + 3])
+                # Find if it is longest of all ORFs
+                if len(max_pair) == 0:
+                    max_pair = [start, end + 3]
+                elif (max_pair[1] - max_pair[0]) < (end + 3 - start):
+                    max_pair = [start, end + 3]
+                index_end += 1
+                break
+            else:
+                index_end += 1
+        index_end = 0
+    result.append(max_pair)
+    return result
+
+
+# Get all pairs at least as long as the designated length
+# input: 1.pairs: all pairs of start and end positions; the last element (the longest pair) is skipped
+#        2.length: designated minimum length (e.g. derived from a percentage of the longest match), compared as-is
+# return: list, pairs of start and end that are at least as long as the designated length
+def get_desi_pairs(pairs, length):
+    desi_pairs = []
+    for pair in pairs[:-1]:
+        if pair[1] - pair[0] >= length:
+            desi_pairs.append(pair)
+
+    return desi_pairs
+
+
+# Get the length of the longest pair of start and end position
+# input: 1.pairs: all pairs of start and end positions of +strand
+#        2.rev_pairs: all pairs of start and end positions of -strand
+# return: length of the longest pair of start and end position over both strands
+# TODO: Temporary use, need replace by formal method
+def get_longest_pair(pairs, rev_pairs):
+
+    # The longest pair of each strand is stored in the last position of the pair list,
+    # so just pull it out directly
+    pos_longest = pairs[-1][1] - pairs[-1][0]
+    rev_longest = rev_pairs[-1][1] - rev_pairs[-1][0]
+    return max(pos_longest, rev_longest)
+
+
+
+
+
+
+
+
+
+