annotate orf_tool.py @ 5:d42adca5ecc2 draft

Uploaded
author nedias
date Wed, 12 Oct 2016 00:05:12 -0400
parents 0095bf758b19
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
0095bf758b19 Uploaded
nedias
parents:
diff changeset
1 """
0095bf758b19 Uploaded
nedias
parents:
diff changeset
2 Actual function class of open reading frame searching tool
0095bf758b19 Uploaded
nedias
parents:
diff changeset
3 Served as a bridge between util class and entry.
0095bf758b19 Uploaded
nedias
parents:
diff changeset
4
0095bf758b19 Uploaded
nedias
parents:
diff changeset
5 Author Nedias Sept, 2016
0095bf758b19 Uploaded
nedias
parents:
diff changeset
6 """
0095bf758b19 Uploaded
nedias
parents:
diff changeset
7 from Bio import SeqIO
0095bf758b19 Uploaded
nedias
parents:
diff changeset
8 from Bio.SeqRecord import SeqRecord
0095bf758b19 Uploaded
nedias
parents:
diff changeset
9 import ORFFinder
0095bf758b19 Uploaded
nedias
parents:
diff changeset
10 import os
0095bf758b19 Uploaded
nedias
parents:
diff changeset
11 import GTranslator
0095bf758b19 Uploaded
nedias
parents:
diff changeset
12
0095bf758b19 Uploaded
nedias
parents:
diff changeset
13
0095bf758b19 Uploaded
nedias
parents:
diff changeset
14 # Get command and parameter from entry and call corresponding function
0095bf758b19 Uploaded
nedias
parents:
diff changeset
15 def exec_tool(options):
0095bf758b19 Uploaded
nedias
parents:
diff changeset
16 # If format is fasta
0095bf758b19 Uploaded
nedias
parents:
diff changeset
17 if options.format and options.format == "fasta":
0095bf758b19 Uploaded
nedias
parents:
diff changeset
18 exec_fasta(options.input, options.outputa, options.outputd, options.length)
0095bf758b19 Uploaded
nedias
parents:
diff changeset
19 # TODO: If format is fastq
0095bf758b19 Uploaded
nedias
parents:
diff changeset
20 elif options.format and options.format == "fastq":
0095bf758b19 Uploaded
nedias
parents:
diff changeset
21 print("Process Fastq File(TODO:Not Implemented)")
0095bf758b19 Uploaded
nedias
parents:
diff changeset
22 # TODO: If format is sam
0095bf758b19 Uploaded
nedias
parents:
diff changeset
23 elif options.format and options.format == "sam":
0095bf758b19 Uploaded
nedias
parents:
diff changeset
24 print("Process Sam File(TODO:Not Implemented)")
0095bf758b19 Uploaded
nedias
parents:
diff changeset
25 # TODO: If format is bam
0095bf758b19 Uploaded
nedias
parents:
diff changeset
26 elif options.format and options.format == "bam":
0095bf758b19 Uploaded
nedias
parents:
diff changeset
27 print("Process Bam File(TODO:Not Implemented)")
0095bf758b19 Uploaded
nedias
parents:
diff changeset
28
0095bf758b19 Uploaded
nedias
parents:
diff changeset
29
0095bf758b19 Uploaded
nedias
parents:
diff changeset
30 # Read the input fasta file find all open reading frames(ORFs)
0095bf758b19 Uploaded
nedias
parents:
diff changeset
31 # input: 1.in_file: input file in fasta format
0095bf758b19 Uploaded
nedias
parents:
diff changeset
32 # 2.outputa: all ORFs found are saved in this file
0095bf758b19 Uploaded
nedias
parents:
diff changeset
33 # 3.outputd: all ORFs longer than designated length are saved in this file
0095bf758b19 Uploaded
nedias
parents:
diff changeset
34 # 4.length: filter all ORFs if less than percentage of the length of the longest ORF found
0095bf758b19 Uploaded
nedias
parents:
diff changeset
35 # return: execute status code
0095bf758b19 Uploaded
nedias
parents:
diff changeset
36 # TODO: Seq and Rev_seq need to be process in the same time
0095bf758b19 Uploaded
nedias
parents:
diff changeset
37 def exec_fasta(in_file, output_all, output_dest, length):
0095bf758b19 Uploaded
nedias
parents:
diff changeset
38
0095bf758b19 Uploaded
nedias
parents:
diff changeset
39 # Open input file(read only)
0095bf758b19 Uploaded
nedias
parents:
diff changeset
40 input_file = open(in_file, "rU")
0095bf758b19 Uploaded
nedias
parents:
diff changeset
41 # Open all match file(create or override)
0095bf758b19 Uploaded
nedias
parents:
diff changeset
42 all_mth_file = open(output_all, "w+")
0095bf758b19 Uploaded
nedias
parents:
diff changeset
43 # Open all match file(create or override)
0095bf758b19 Uploaded
nedias
parents:
diff changeset
44 desi_file = open(output_dest, "w+")
0095bf758b19 Uploaded
nedias
parents:
diff changeset
45
0095bf758b19 Uploaded
nedias
parents:
diff changeset
46 # Scan through all Sequenced data in input file
0095bf758b19 Uploaded
nedias
parents:
diff changeset
47 for record in SeqIO.parse(input_file, "fasta"):
0095bf758b19 Uploaded
nedias
parents:
diff changeset
48
0095bf758b19 Uploaded
nedias
parents:
diff changeset
49 # for each sequence, use function in ORFFinder to abstract all ORFs
0095bf758b19 Uploaded
nedias
parents:
diff changeset
50 seq = record.seq
0095bf758b19 Uploaded
nedias
parents:
diff changeset
51 # Get all start and end positions in +strand
0095bf758b19 Uploaded
nedias
parents:
diff changeset
52 result = ORFFinder.get_all_orf(str(seq), False)
0095bf758b19 Uploaded
nedias
parents:
diff changeset
53 # Get all start-end pairs in +strand
0095bf758b19 Uploaded
nedias
parents:
diff changeset
54 pairs = ORFFinder.find_all_orf(result)
0095bf758b19 Uploaded
nedias
parents:
diff changeset
55
0095bf758b19 Uploaded
nedias
parents:
diff changeset
56 # Reverse the sequenced data
0095bf758b19 Uploaded
nedias
parents:
diff changeset
57 rev_seq = seq[::-1]
0095bf758b19 Uploaded
nedias
parents:
diff changeset
58 # Get all start and end positions in -strand
0095bf758b19 Uploaded
nedias
parents:
diff changeset
59 rev_result = ORFFinder.get_all_orf(str(rev_seq), True)
0095bf758b19 Uploaded
nedias
parents:
diff changeset
60 # Get all start-end pairs in -strand
0095bf758b19 Uploaded
nedias
parents:
diff changeset
61 rev_pairs = ORFFinder.find_all_orf(rev_result)
0095bf758b19 Uploaded
nedias
parents:
diff changeset
62
0095bf758b19 Uploaded
nedias
parents:
diff changeset
63 # Get longest start-end pair of both strands
0095bf758b19 Uploaded
nedias
parents:
diff changeset
64 longest_match = ORFFinder.get_longest_pair(pairs, rev_pairs)
0095bf758b19 Uploaded
nedias
parents:
diff changeset
65
0095bf758b19 Uploaded
nedias
parents:
diff changeset
66 # Calculate the designated length
0095bf758b19 Uploaded
nedias
parents:
diff changeset
67 match_length = int(longest_match * int(length) / 100)
0095bf758b19 Uploaded
nedias
parents:
diff changeset
68
0095bf758b19 Uploaded
nedias
parents:
diff changeset
69 # All ORFs
0095bf758b19 Uploaded
nedias
parents:
diff changeset
70 all_frags = []
0095bf758b19 Uploaded
nedias
parents:
diff changeset
71 # All designated ORFs
0095bf758b19 Uploaded
nedias
parents:
diff changeset
72 desi_frags = []
0095bf758b19 Uploaded
nedias
parents:
diff changeset
73
0095bf758b19 Uploaded
nedias
parents:
diff changeset
74 # TODO: considering make the result in dictionary and make this four for-loop into 2 or 1 loop
0095bf758b19 Uploaded
nedias
parents:
diff changeset
75 # For each pair in the +strand
0095bf758b19 Uploaded
nedias
parents:
diff changeset
76 for pair in pairs[:-1]:
0095bf758b19 Uploaded
nedias
parents:
diff changeset
77 # Intercept ORF from the original sequence using the start-end pair, and than translate the sequence
0095bf758b19 Uploaded
nedias
parents:
diff changeset
78 # into polypeptide sequence
0095bf758b19 Uploaded
nedias
parents:
diff changeset
79 frag = SeqRecord(GTranslator.nucleotide_to_polypeptide(record.seq[pair[0]:pair[1]], False), record.id + "|"
0095bf758b19 Uploaded
nedias
parents:
diff changeset
80 + str(pair[0]) + "-" + str(pair[1]),
0095bf758b19 Uploaded
nedias
parents:
diff changeset
81 '', '')
0095bf758b19 Uploaded
nedias
parents:
diff changeset
82 all_frags.append(frag)
0095bf758b19 Uploaded
nedias
parents:
diff changeset
83
0095bf758b19 Uploaded
nedias
parents:
diff changeset
84 # For each pair in the -strand
0095bf758b19 Uploaded
nedias
parents:
diff changeset
85 for pair2 in rev_pairs[:-1]:
0095bf758b19 Uploaded
nedias
parents:
diff changeset
86 # Intercept ORF from the original sequence using the start-end pair, and than translate the sequence
0095bf758b19 Uploaded
nedias
parents:
diff changeset
87 # into polypeptide sequence
0095bf758b19 Uploaded
nedias
parents:
diff changeset
88 frag = SeqRecord(GTranslator.nucleotide_to_polypeptide(rev_seq[pair2[0]:pair2[1]], True),
0095bf758b19 Uploaded
nedias
parents:
diff changeset
89 record.id + "|" + str(len(rev_seq) - pair2[0]) + "-" + str(len(rev_seq) - pair2[1]),
0095bf758b19 Uploaded
nedias
parents:
diff changeset
90 '', '')
0095bf758b19 Uploaded
nedias
parents:
diff changeset
91 all_frags.append(frag)
0095bf758b19 Uploaded
nedias
parents:
diff changeset
92
0095bf758b19 Uploaded
nedias
parents:
diff changeset
93 desi_pairs = ORFFinder.get_desi_pairs(pairs, match_length)
0095bf758b19 Uploaded
nedias
parents:
diff changeset
94 rev_desi_pairs = ORFFinder.get_desi_pairs(rev_pairs, match_length)
0095bf758b19 Uploaded
nedias
parents:
diff changeset
95
0095bf758b19 Uploaded
nedias
parents:
diff changeset
96 # For each designated pair in the +strand
0095bf758b19 Uploaded
nedias
parents:
diff changeset
97 for pair in desi_pairs:
0095bf758b19 Uploaded
nedias
parents:
diff changeset
98 # Intercept ORF from the original sequence using the start-end pair, and than translate the sequence
0095bf758b19 Uploaded
nedias
parents:
diff changeset
99 # into polypeptide sequence
0095bf758b19 Uploaded
nedias
parents:
diff changeset
100 frag = SeqRecord(GTranslator.nucleotide_to_polypeptide(seq[pair[0]:pair[1]], False),
0095bf758b19 Uploaded
nedias
parents:
diff changeset
101 record.id + "|" + str(pair[0]) + "-" + str(pair[1]), '', '')
0095bf758b19 Uploaded
nedias
parents:
diff changeset
102 desi_frags.append(frag)
0095bf758b19 Uploaded
nedias
parents:
diff changeset
103
0095bf758b19 Uploaded
nedias
parents:
diff changeset
104 # For each designated pair in the strand
0095bf758b19 Uploaded
nedias
parents:
diff changeset
105 for pair in rev_desi_pairs:
0095bf758b19 Uploaded
nedias
parents:
diff changeset
106 # Intercept ORF from the original sequence using the start-end pair, and than translate the sequence
0095bf758b19 Uploaded
nedias
parents:
diff changeset
107 # into polypeptide sequence
0095bf758b19 Uploaded
nedias
parents:
diff changeset
108 frag = SeqRecord(GTranslator.nucleotide_to_polypeptide(rev_seq[pair[0]:pair[1]], True),
0095bf758b19 Uploaded
nedias
parents:
diff changeset
109 record.id + "|" + str(len(rev_seq) - pair[0]) + "-" + str(len(rev_seq) - pair[1]),
0095bf758b19 Uploaded
nedias
parents:
diff changeset
110 '', '')
0095bf758b19 Uploaded
nedias
parents:
diff changeset
111 desi_frags.append(frag)
0095bf758b19 Uploaded
nedias
parents:
diff changeset
112
0095bf758b19 Uploaded
nedias
parents:
diff changeset
113 # Write the result to output file
0095bf758b19 Uploaded
nedias
parents:
diff changeset
114 SeqIO.write(all_frags, all_mth_file, "fasta")
0095bf758b19 Uploaded
nedias
parents:
diff changeset
115 SeqIO.write(desi_frags, desi_file, "fasta")
0095bf758b19 Uploaded
nedias
parents:
diff changeset
116
0095bf758b19 Uploaded
nedias
parents:
diff changeset
117 # Close file entry
0095bf758b19 Uploaded
nedias
parents:
diff changeset
118 input_file.close()
0095bf758b19 Uploaded
nedias
parents:
diff changeset
119 all_mth_file.close()
0095bf758b19 Uploaded
nedias
parents:
diff changeset
120 desi_file.close()
0095bf758b19 Uploaded
nedias
parents:
diff changeset
121
0095bf758b19 Uploaded
nedias
parents:
diff changeset
122 return 0