changeset 3:0095bf758b19 draft

Uploaded
author nedias
date Wed, 12 Oct 2016 00:03:53 -0400
parents c56b8a6bd02e
children 04de7d352a3d
files orf_tool.py
diffstat 1 files changed, 122 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/orf_tool.py	Wed Oct 12 00:03:53 2016 -0400
@@ -0,0 +1,122 @@
+"""
+ Actual function class of open reading frame searching tool
+ Served as a bridge between util class and entry.
+
+ Author Nedias Sept, 2016
+"""
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+import ORFFinder
+import os
+import GTranslator
+
+
+# Get command and parameter from entry and call corresponding function
+def exec_tool(options):
+    # If format is fasta
+    if options.format and options.format == "fasta":
+        exec_fasta(options.input, options.outputa, options.outputd, options.length)
+    # TODO: If format is fastq
+    elif options.format and options.format == "fastq":
+        print("Process Fastq File(TODO:Not Implemented)")
+    # TODO: If format is sam
+    elif options.format and options.format == "sam":
+        print("Process Sam File(TODO:Not Implemented)")
+    # TODO: If format is bam
+    elif options.format and options.format == "bam":
+        print("Process Bam File(TODO:Not Implemented)")
+
+
+# Read the input fasta file find all open reading frames(ORFs)
+# input: 1.in_file: input file in fasta format
+#        2.outputa: all ORFs found are saved in this file
+#        3.outputd: all ORFs longer than designated length are saved in this file
+#        4.length: filter all ORFs if less than percentage of the length of the longest ORF found
+# return: execute status code
+# TODO: Seq and Rev_seq need to be process in the same time
+def exec_fasta(in_file, output_all, output_dest, length):
+
+    # Open input file(read only)
+    input_file = open(in_file, "rU")
+    # Open all match file(create or override)
+    all_mth_file = open(output_all, "w+")
+    # Open all match file(create or override)
+    desi_file = open(output_dest, "w+")
+
+    # Scan through all Sequenced data in input file
+    for record in SeqIO.parse(input_file, "fasta"):
+
+        # for each sequence, use function in ORFFinder to abstract all ORFs
+        seq = record.seq
+        # Get all start and end positions in +strand
+        result = ORFFinder.get_all_orf(str(seq), False)
+        # Get all start-end pairs in +strand
+        pairs = ORFFinder.find_all_orf(result)
+
+        # Reverse the sequenced data
+        rev_seq = seq[::-1]
+        # Get all start and end positions in -strand
+        rev_result = ORFFinder.get_all_orf(str(rev_seq), True)
+        # Get all start-end pairs in -strand
+        rev_pairs = ORFFinder.find_all_orf(rev_result)
+
+        # Get longest start-end pair of both strands
+        longest_match = ORFFinder.get_longest_pair(pairs, rev_pairs)
+
+        # Calculate the designated length
+        match_length = int(longest_match * int(length) / 100)
+
+        # All ORFs
+        all_frags = []
+        # All designated ORFs
+        desi_frags = []
+
+        # TODO: considering make the result in dictionary and make this four for-loop into 2 or 1 loop
+        # For each pair in the +strand
+        for pair in pairs[:-1]:
+            # Intercept ORF from the original sequence using the start-end pair, and than translate the sequence
+            # into polypeptide sequence
+            frag = SeqRecord(GTranslator.nucleotide_to_polypeptide(record.seq[pair[0]:pair[1]], False), record.id + "|"
+                             + str(pair[0]) + "-" + str(pair[1]),
+                             '', '')
+            all_frags.append(frag)
+
+        # For each pair in the -strand
+        for pair2 in rev_pairs[:-1]:
+            # Intercept ORF from the original sequence using the start-end pair, and than translate the sequence
+            # into polypeptide sequence
+            frag = SeqRecord(GTranslator.nucleotide_to_polypeptide(rev_seq[pair2[0]:pair2[1]], True),
+                             record.id + "|" + str(len(rev_seq) - pair2[0]) + "-" + str(len(rev_seq) - pair2[1]),
+                             '', '')
+            all_frags.append(frag)
+
+        desi_pairs = ORFFinder.get_desi_pairs(pairs, match_length)
+        rev_desi_pairs = ORFFinder.get_desi_pairs(rev_pairs, match_length)
+
+        # For each designated pair in the +strand
+        for pair in desi_pairs:
+            # Intercept ORF from the original sequence using the start-end pair, and than translate the sequence
+            # into polypeptide sequence
+            frag = SeqRecord(GTranslator.nucleotide_to_polypeptide(seq[pair[0]:pair[1]], False),
+                             record.id + "|" + str(pair[0]) + "-" + str(pair[1]), '', '')
+            desi_frags.append(frag)
+
+        # For each designated pair in the strand
+        for pair in rev_desi_pairs:
+            # Intercept ORF from the original sequence using the start-end pair, and than translate the sequence
+            # into polypeptide sequence
+            frag = SeqRecord(GTranslator.nucleotide_to_polypeptide(rev_seq[pair[0]:pair[1]], True),
+                             record.id + "|" + str(len(rev_seq) - pair[0]) + "-" + str(len(rev_seq) - pair[1]),
+                             '', '')
+            desi_frags.append(frag)
+
+        # Write the result to output file
+        SeqIO.write(all_frags, all_mth_file, "fasta")
+        SeqIO.write(desi_frags, desi_file, "fasta")
+
+    # Close file entry
+    input_file.close()
+    all_mth_file.close()
+    desi_file.close()
+
+    return 0