# HG changeset patch # User nedias # Date 1476245033 14400 # Node ID 0095bf758b19bd26cc3174108a8ea41260f43a11 # Parent c56b8a6bd02ee6305f3a91dd7c6df17fa9f4e236 Uploaded diff -r c56b8a6bd02e -r 0095bf758b19 orf_tool.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orf_tool.py Wed Oct 12 00:03:53 2016 -0400 @@ -0,0 +1,122 @@ +""" + Actual function class of open reading frame searching tool + Served as a bridge between util class and entry. + + Author Nedias Sept, 2016 +""" +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +import ORFFinder +import os +import GTranslator + + +# Get command and parameter from entry and call corresponding function +def exec_tool(options): + # If format is fasta + if options.format and options.format == "fasta": + exec_fasta(options.input, options.outputa, options.outputd, options.length) + # TODO: If format is fastq + elif options.format and options.format == "fastq": + print("Process Fastq File(TODO:Not Implemented)") + # TODO: If format is sam + elif options.format and options.format == "sam": + print("Process Sam File(TODO:Not Implemented)") + # TODO: If format is bam + elif options.format and options.format == "bam": + print("Process Bam File(TODO:Not Implemented)") + + +# Read the input fasta file find all open reading frames(ORFs) +# input: 1.in_file: input file in fasta format +# 2.outputa: all ORFs found are saved in this file +# 3.outputd: all ORFs longer than designated length are saved in this file +# 4.length: filter all ORFs if less than percentage of the length of the longest ORF found +# return: execute status code +# TODO: Seq and Rev_seq need to be process in the same time +def exec_fasta(in_file, output_all, output_dest, length): + + # Open input file(read only) + input_file = open(in_file, "rU") + # Open all match file(create or override) + all_mth_file = open(output_all, "w+") + # Open all match file(create or override) + desi_file = open(output_dest, "w+") + + # Scan through all Sequenced data in input file + for record in SeqIO.parse(input_file, "fasta"): + + # for each sequence, use function in ORFFinder to abstract all ORFs + seq = record.seq + # Get all start and end positions in +strand + result = ORFFinder.get_all_orf(str(seq), False) + # Get all start-end pairs in +strand + pairs = ORFFinder.find_all_orf(result) + + # Reverse the sequenced data + rev_seq = seq[::-1] + # Get all start and end positions in -strand + rev_result = ORFFinder.get_all_orf(str(rev_seq), True) + # Get all start-end pairs in -strand + rev_pairs = ORFFinder.find_all_orf(rev_result) + + # Get longest start-end pair of both strands + longest_match = ORFFinder.get_longest_pair(pairs, rev_pairs) + + # Calculate the designated length + match_length = int(longest_match * int(length) / 100) + + # All ORFs + all_frags = [] + # All designated ORFs + desi_frags = [] + + # TODO: considering make the result in dictionary and make this four for-loop into 2 or 1 loop + # For each pair in the +strand + for pair in pairs[:-1]: + # Intercept ORF from the original sequence using the start-end pair, and than translate the sequence + # into polypeptide sequence + frag = SeqRecord(GTranslator.nucleotide_to_polypeptide(record.seq[pair[0]:pair[1]], False), record.id + "|" + + str(pair[0]) + "-" + str(pair[1]), + '', '') + all_frags.append(frag) + + # For each pair in the -strand + for pair2 in rev_pairs[:-1]: + # Intercept ORF from the original sequence using the start-end pair, and than translate the sequence + # into polypeptide sequence + frag = SeqRecord(GTranslator.nucleotide_to_polypeptide(rev_seq[pair2[0]:pair2[1]], True), + record.id + "|" + str(len(rev_seq) - pair2[0]) + "-" + str(len(rev_seq) - pair2[1]), + '', '') + all_frags.append(frag) + + desi_pairs = ORFFinder.get_desi_pairs(pairs, match_length) + rev_desi_pairs = ORFFinder.get_desi_pairs(rev_pairs, match_length) + + # For each designated pair in the +strand + for pair in desi_pairs: + # Intercept ORF from the original sequence using the start-end pair, and than translate the sequence + # into polypeptide sequence + frag = SeqRecord(GTranslator.nucleotide_to_polypeptide(seq[pair[0]:pair[1]], False), + record.id + "|" + str(pair[0]) + "-" + str(pair[1]), '', '') + desi_frags.append(frag) + + # For each designated pair in the strand + for pair in rev_desi_pairs: + # Intercept ORF from the original sequence using the start-end pair, and than translate the sequence + # into polypeptide sequence + frag = SeqRecord(GTranslator.nucleotide_to_polypeptide(rev_seq[pair[0]:pair[1]], True), + record.id + "|" + str(len(rev_seq) - pair[0]) + "-" + str(len(rev_seq) - pair[1]), + '', '') + desi_frags.append(frag) + + # Write the result to output file + SeqIO.write(all_frags, all_mth_file, "fasta") + SeqIO.write(desi_frags, desi_file, "fasta") + + # Close file entry + input_file.close() + all_mth_file.close() + desi_file.close() + + return 0