orf_tools: ORFFinder.py comparison

comparison ORFFinder.py @ 5:d42adca5ecc2 draft

Uploaded

author	nedias
date	Wed, 12 Oct 2016 00:05:12 -0400
parents
children	e5616d5101c0

comparison

equal deleted inserted replaced

-:04de7d352a3d
+:d42adca5ecc2
+"""
+Module that contains functions related to
+finding open reading frames (ORFs) in a sequence.
+Author Nedias Sept, 2016
+"""
+# TODO: Currently using regular expression to match string, may change to other algorithms
+import re
def find_locations(seq, tag):
    """Return the start offset of every match of ``tag`` in ``seq``.

    ``tag`` is passed to ``re.finditer`` unchanged, so it is interpreted as a
    regular expression; matches are non-overlapping and are reported from
    left to right.

    input: 1.seq: sequence text to scan (must be something ``re.finditer``
                  can search, e.g. a plain string)
           2.tag: pattern to look for, such as a start or stop codon from
                  the codon table
    return: a list of 0-based offsets where the pattern was found, in
            ascending order (empty if there is no match)
    """
    return [match.start() for match in re.finditer(tag, seq)]
def get_all_orf(seq, rev):
    """Collect every start-codon and stop-codon position in ``seq``.

    For the + strand (``rev`` false) the start codon is ATG and the stop
    codons are TAA, TGA and TAG.  For the - strand (``rev`` true) the
    base-wise complements are searched instead: TAC for the start and
    ATT, ACT, ATC for the stops.  These are complements, not reverse
    complements.
    NOTE(review): this presumably expects the caller to pass the sequence
    in the orientation in which those complemented codons read correctly;
    confirm against the caller.

    input: 1.seq: sequence text to scan
           2.rev: True for -strand and False for +strand
    return: a dictionary with two keys, "starts" and "ends", each holding
            an ascending list of 0-based codon positions
    """
    if rev:
        start_codon = "TAC"
        stop_codons = ("ATT", "ACT", "ATC")
    else:
        start_codon = "ATG"
        stop_codons = ("TAA", "TGA", "TAG")

    # find_locations treats its tag as a regular expression, so one
    # alternation finds all three stop codons in a single left-to-right pass.
    # None of the stop codons can overlap another one, so this yields exactly
    # the positions of three separate scans, already in ascending order.
    return {
        "starts": find_locations(seq, start_codon),
        "ends": find_locations(seq, "|".join(stop_codons)),
    }
def find_all_orf(pos_dic):
    """Pair every start position with its first in-frame stop position.

    Each pair represents a possible ORF.  A start is paired with the first
    end that lies after it and whose distance from it is a multiple of 3.
    Starts with no such end produce no pair.

    input: pos_dic: dictionary with the keys "starts" and "ends".
           "ends" must be sorted in ascending order (get_all_orf returns
           it that way), because the search for each start begins at the
           first end located after that start.
    return: a list of [start, end + 3] pairs, one per matched start, in
            the order of "starts".  The end of a pair is the position just
            past the 3-letter stop codon.  One extra element is always
            appended at the end of the list: a copy of the longest pair
            (the earliest one on ties), or an empty list when no pair was
            found.
    """
    from bisect import bisect_right

    starts = pos_dic["starts"]
    ends = pos_dic["ends"]
    result = []
    max_pair = []

    for start in starts:
        # Jump straight to the first end located after this start instead
        # of rescanning the list of ends from the beginning every time.
        index_end = bisect_right(ends, start)
        while index_end < len(ends):
            end = ends[index_end]
            if (end - start) % 3 == 0:
                # Same reading frame: this is the stop that closes the ORF.
                pair_end = end + 3
                result.append([start, pair_end])
                if not max_pair or (max_pair[1] - max_pair[0]) < (pair_end - start):
                    # Separate list object, so the trailing "longest" entry
                    # never aliases one of the regular pairs.
                    max_pair = [start, pair_end]
                break
            index_end += 1

    result.append(max_pair)
    return result
def get_desi_pairs(pairs, length):
    """Select the pairs whose span is at least ``length``.

    The last element of ``pairs`` is skipped: find_all_orf stores a copy of
    the longest pair there, so it is not a regular entry.

    input: 1.pairs: all pairs of start and end positions, followed by the
                    trailing longest-pair entry
           2.length: minimum span (end - start) a pair must have to be
                     kept.  It is compared directly against the span, so a
                     caller that works with a percentage of the longest
                     match has to convert it to an absolute length first.
    return: list of the pairs (the same list objects, not copies) whose
            span is greater than or equal to ``length``
    """
    return [pair for pair in pairs[:-1] if pair[1] - pair[0] >= length]
def _longest_span(pair_list):
    """Return the span of the trailing (longest) pair, or 0 if there is none."""
    # find_all_orf leaves an empty list as the trailing entry when a strand
    # has no ORF at all; treat that (and an empty list of pairs) as length 0.
    if not pair_list or not pair_list[-1]:
        return 0
    longest = pair_list[-1]
    return longest[1] - longest[0]


# TODO: Temporary use, need replace by formal method
def get_longest_pair(pairs, rev_pairs):
    """Return the length of the longest ORF found on either strand.

    Despite the name, the result is a length (end - start), not a pair.
    The longest pair of each strand is stored in the last position of its
    pair list, so only that element is inspected.

    input: 1.pairs: all pairs of start and end positions of +strand
           2.rev_pairs: all pairs of start and end positions of -strand
    return: the larger of the two longest spans; a strand without any ORF
            counts as 0
    """
    return max(_longest_span(pairs), _longest_span(rev_pairs))

Mercurial > repos > nedias > orf_tools

comparison ORFFinder.py @ 5:d42adca5ecc2 draft