# HG changeset patch # User nedias # Date 1476245112 14400 # Node ID d42adca5ecc24ddf5849872551fc3a6c9f7b62d4 # Parent 04de7d352a3db8c51f791d02d8cfcb5ce58845f0 Uploaded diff -r 04de7d352a3d -r d42adca5ecc2 ORFFinder.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ORFFinder.py Wed Oct 12 00:05:12 2016 -0400 @@ -0,0 +1,124 @@ +""" + Class that contains functions related to + finding open reading frames in the sequence. + + Author Nedias Sept, 2016 +""" + +# TODO: Currently using regular expression to match string, may change to other algorithms +import re + + +# Find location of certain sequence in the sequenced data +# input: 1.seq: Sequenced data in Seq format +# 2.tag: Specific sequence such as start and end in the condon table +# return: a list of locations where the designated sequence are found +def find_locations(seq, tag): + locs = [] + for m in re.finditer(tag, seq): + locs.append(m.start()) + return locs + + +# Get all start and end positions from the sequenced data +# input: 1.seq: Sequenced data in Seq format +# 2.rev: True for -strand and False for +strand +# return: a dictionary contains all start and end positions +def get_all_orf(seq, rev): + result = dict() + + if rev: + sta = "TAC" + end_1 = "ATT" + end_2 = "ACT" + end_3 = "ATC" + else: + sta = "ATG" + end_1 = "TAA" + end_2 = "TGA" + end_3 = "TAG" + + result["starts"] = find_locations(seq, sta) + result["ends"] = find_locations(seq, end_1) + result["ends"] += find_locations(seq, end_2) + result["ends"] += find_locations(seq, end_3) + # Must sorted to make sure the positions are in ascension trend + # TODO: May use other RE to match all 3 end tags at the same time + result["ends"].sort() + return result + + +# Pair all start and end position data +# Each pair represents a possible ORF +# input: dictionary contains all start and end positions +# return: a list contain all pairs of starts and ends, the longest pair are store in the end of the list +# a pair is a list of two elements, first is start and last is end +def find_all_orf(pos_dic): + + starts = pos_dic["starts"] + ends = pos_dic["ends"] + + result = [] + + max_pair = [] + + index_end = 0 + + # Loop all starts + for start in starts: + # Loop till the end of the ends list + while index_end < len(ends): + end = ends[index_end] + # If start is before than the end, and the length between start and end is a multiple of 3 + if start < end and (end - start) % 3 == 0: + # It will be a possible ORF, store in the result list + result.append([start, end + 3]) + # Find if it is longest of all ORFs + if len(max_pair) == 0: + max_pair = [start, end + 3] + elif (max_pair[1] - max_pair[0]) < (end + 3 - start): + max_pair = [start, end + 3] + index_end += 1 + break + else: + index_end += 1 + index_end = 0 + result.append(max_pair) + return result + + +# Get all pairs longer than the designated length +# input: 1.pairs: all pairs of start and end positions +# 2.length: designated length in percentage of the longest match +# return: list, pairs of start and end that longer than the designated length +def get_desi_pairs(pairs, length): + desi_pairs = [] + for pair in pairs[:-1]: + if pair[1] - pair[0] >= length: + desi_pairs.append(pair) + + return desi_pairs + + +# Get the longest pair of start and end position +# input: 1.pairs: all pairs of start and end positions of +strand +# 2.rev_pairs: all pairs of start and end positions of -strand +# return: longest pair of start and end position +# TODO: Temporary use, need replace by formal method +def get_longest_pair(pairs, rev_pairs): + + # The longest pair of each strand is store in the last position of the pair list, + # so just pull it out directly + pos_longest = pairs[-1][1] - pairs[-1][0] + rev_longest = rev_pairs[-1][1] - rev_pairs[-1][0] + return max(pos_longest, rev_longest) + + + + + + + + + +