Mercurial > repos > nedias > orf_tools
view ORFFinder.py @ 8:e5616d5101c0 draft default tip
Bug fix - Null strand give index out of bound error
author | nedias |
---|---|
date | Wed, 19 Oct 2016 14:24:31 -0400 |
parents | d42adca5ecc2 |
children |
line wrap: on
line source
""" Class that contains functions related to finding open reading frames in the sequence. Author Nedias Sept, 2016 """ # TODO: Currently using regular expression to match string, may change to other algorithms import re # Find location of certain sequence in the sequenced data # input: 1.seq: Sequenced data in Seq format # 2.tag: Specific sequence such as start and end in the condon table # return: a list of locations where the designated sequence are found def find_locations(seq, tag): locs = [] for m in re.finditer(tag, seq): locs.append(m.start()) return locs # Get all start and end positions from the sequenced data # input: 1.seq: Sequenced data in Seq format # 2.rev: True for -strand and False for +strand # return: a dictionary contains all start and end positions def get_all_orf(seq, rev): result = dict() if rev: sta = "TAC" end_1 = "ATT" end_2 = "ACT" end_3 = "ATC" else: sta = "ATG" end_1 = "TAA" end_2 = "TGA" end_3 = "TAG" result["starts"] = find_locations(seq, sta) result["ends"] = find_locations(seq, end_1) result["ends"] += find_locations(seq, end_2) result["ends"] += find_locations(seq, end_3) # Must sorted to make sure the positions are in ascension trend # TODO: May use other RE to match all 3 end tags at the same time result["ends"].sort() return result # Pair all start and end position data # Each pair represents a possible ORF # input: dictionary contains all start and end positions # return: a list contain all pairs of starts and ends, the longest pair are store in the end of the list # a pair is a list of two elements, first is start and last is end def find_all_orf(pos_dic): starts = pos_dic["starts"] ends = pos_dic["ends"] result = [] max_pair = [] index_end = 0 # Loop all starts for start in starts: # Loop till the end of the ends list while index_end < len(ends): end = ends[index_end] # If start is before than the end, and the length between start and end is a multiple of 3 if start < end and (end - start) % 3 == 0: # It will be a possible ORF, store in the result list result.append([start, end + 3]) # Find if it is longest of all ORFs if len(max_pair) == 0: max_pair = [start, end + 3] elif (max_pair[1] - max_pair[0]) < (end + 3 - start): max_pair = [start, end + 3] index_end += 1 break else: index_end += 1 index_end = 0 result.append(max_pair) return result # Get all pairs longer than the designated length # input: 1.pairs: all pairs of start and end positions # 2.length: designated length in percentage of the longest match # return: list, pairs of start and end that longer than the designated length def get_desi_pairs(pairs, length): desi_pairs = [] for pair in pairs[:-1]: if pair[1] - pair[0] >= length: desi_pairs.append(pair) return desi_pairs # Get the longest pair of start and end position # input: 1.pairs: all pairs of start and end positions of +strand # 2.rev_pairs: all pairs of start and end positions of -strand # return: longest pair of start and end position # TODO: Temporary use, need replace by formal method def get_longest_pair(pairs, rev_pairs): # The longest pair of each strand is store in the last position of the pair list, # so just pull it out directly if len(pairs) > 1: pos_longest = pairs[-1][1] - pairs[-1][0] else: pos_longest = 0 if len(rev_pairs) > 1: rev_longest = rev_pairs[-1][1] - rev_pairs[-1][0] else: rev_longest = 0 return max(pos_longest, rev_longest)