Mercurial > repos > nedias > orf_tools

"""
 Class that contains functions related to
 finding open reading frames in the sequence.

 Author Nedias Sept, 2016
"""

# TODO: Currently using regular expression to match string, may change to other algorithms
import re


# Find location of certain sequence in the sequenced data
# input: 1.seq: Sequenced data in Seq format
#        2.tag: Specific sequence such as start and end in the condon table
# return: a list of locations where the designated sequence are found
def find_locations(seq, tag):
    locs = []
    for m in re.finditer(tag, seq):
        locs.append(m.start())
    return locs


# Get all start and end positions from the sequenced data
# input: 1.seq: Sequenced data in Seq format
#        2.rev: True for -strand and False for +strand
# return: a dictionary contains all start and end positions
def get_all_orf(seq, rev):
    result = dict()

    if rev:
        sta = "TAC"
        end_1 = "ATT"
        end_2 = "ACT"
        end_3 = "ATC"
    else:
        sta = "ATG"
        end_1 = "TAA"
        end_2 = "TGA"
        end_3 = "TAG"

    result["starts"] = find_locations(seq, sta)
    result["ends"] = find_locations(seq, end_1)
    result["ends"] += find_locations(seq, end_2)
    result["ends"] += find_locations(seq, end_3)
    # Must sorted to make sure the positions are in ascension trend
    # TODO: May use other RE to match all 3 end tags at the same time
    result["ends"].sort()
    return result


# Pair all start and end position data
# Each pair represents a possible ORF
# input: dictionary contains all start and end positions
# return: a list contain all pairs of starts and ends, the longest pair are store in the end of the list
#         a pair is a list of two elements, first is start and last is end
def find_all_orf(pos_dic):

    starts = pos_dic["starts"]
    ends = pos_dic["ends"]

    result = []

    max_pair = []

    index_end = 0

    # Loop all starts
    for start in starts:
        # Loop till the end of the ends list
        while index_end < len(ends):
            end = ends[index_end]
            # If start is before than the end, and the length between start and end is a multiple of 3
            if start < end and (end - start) % 3 == 0:
                # It will be a possible ORF, store in the result list
                result.append([start, end + 3])
                # Find if it is longest of all ORFs
                if len(max_pair) == 0:
                    max_pair = [start, end + 3]
                elif (max_pair[1] - max_pair[0]) < (end + 3 - start):
                    max_pair = [start, end + 3]
                index_end += 1
                break
            else:
                index_end += 1
        index_end = 0
    result.append(max_pair)
    return result


# Get all pairs longer than the designated length
# input: 1.pairs: all pairs of start and end positions
#        2.length: designated length in percentage of the longest match
# return: list, pairs of start and end that longer than the designated length
def get_desi_pairs(pairs, length):
    desi_pairs = []
    for pair in pairs[:-1]:
        if pair[1] - pair[0] >= length:
            desi_pairs.append(pair)

    return desi_pairs


# Get the longest pair of start and end position
# input: 1.pairs: all pairs of start and end positions of +strand
#        2.rev_pairs: all pairs of start and end positions of -strand
# return: longest pair of start and end position
# TODO: Temporary use, need replace by formal method
def get_longest_pair(pairs, rev_pairs):

    # The longest pair of each strand is store in the last position of the pair list,
    # so just pull it out directly
    if len(pairs) > 1:
        pos_longest = pairs[-1][1] - pairs[-1][0]
    else:
        pos_longest = 0
    if len(rev_pairs) > 1:
        rev_longest = rev_pairs[-1][1] - rev_pairs[-1][0]
    else:
        rev_longest = 0
    return max(pos_longest, rev_longest)
author	nedias
date	Wed, 19 Oct 2016 14:24:31 -0400
parents	d42adca5ecc2
children