annotate blast_tool.py @ 1:0c471588f014 draft

Uploaded
author nedias
date Wed, 12 Oct 2016 18:11:55 -0400
parents 45fbe538fa01
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
1 """
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
2 Util class of BLAST API
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
3 Use Biopython's BLAST interface to access BLAST API
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
4 BLAST call may take more than 5 min to complete.
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
5
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
6 Author Nedias Sept, 2016
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
7
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
8 """
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
9
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
10
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
11 # BLAST Web api, use to calling BLAST using url
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
12 from Bio.Blast import NCBIWWW
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
13 # BLAST XML util, use to read data from BLAST's response
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
14 from Bio.Blast import NCBIXML
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
15
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
16
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
17 # Get command and parameter from entry and call corresponding function
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
18 def exec_tool(options):
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
19 if options.format and options.format == "fasta":
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
20 blast_call(options.input, options.output, options.program, options.database)
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
21
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
22
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
23 # Call BLAST protein API
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
24 # input: 1.in_file: an fasta format file with polypeptide data
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
25 # 2.out_file: an XML format file with BLAST result
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
26 # 3.program: define which program of BLAST is used
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
27 # 4.database: define which database of BLAST is used
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
28 # output: the searching result file from BLAST in XML format
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
29 # return: execute status code(developing)
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
30 def blast_call(in_file, out_file, program, database):
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
31
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
32 # Open input file(read only)
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
33 input_file = open(in_file, "rU")
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
34 # Create an output file(create or override)
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
35 output_file = open(out_file, "w+")
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
36
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
37 # Read all data from the input fasta file
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
38 fasta_string = input_file.read()
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
39
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
40 # Call BLAST to search similar protein data
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
41 result_handle = NCBIWWW.qblast(program, database, fasta_string)
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
42
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
43 # Read result form responded data and save them in output file
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
44 result = result_handle.read()
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
45 output_file.write(result)
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
46
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
47 return 0
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
48
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
49
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
50 # Call BLAST protein API (text version)
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
51 # input: 1.in_text: polypeptide data text
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
52 # 2.program: define which program of BLAST is used
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
53 # 3.database: define which database of BLAST is used
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
54 # return: the searching result from BLAST in StringIO format
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
55 def blast_call_text(in_text, program, database):
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
56
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
57 # Call BLAST to search similar protein data
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
58 result_handle = NCBIWWW.qblast(program, database, in_text)
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
59
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
60 return result_handle
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
61
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
62
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
63 # Abstract data from BLAST responded XML
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
64 # input: the searching result file from BLAST in XML format
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
65 # return: an dictionary include all data from BLAST result.
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
66 # the structure of the dictionary is:
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
67 # dict -- hits: Integer, number of total hits during searching
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
68 # -- alignments: List, alignments for all polypeptide chains(there could be many chains in a single file)
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
69 # | (TODO:May consider change to dictionary and use the tag of every polypeptide chain as entry)
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
70 # |
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
71 # -- hsps: List, all high-scoring pairs(alignment) for one polypeptide chain
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
72 # |
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
73 # hps: Dictionary, contains all data of the high-scoring pair
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
74 def read_blast_result(blast_xml):
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
75
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
76 # Open the BLAST result
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
77 result_file = open(blast_xml)
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
78 # Read all result using biopython
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
79 blast_record = NCBIXML.read(result_file)
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
80 # E-value limit, every data with a less-than 96% e-value will be filtered
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
81 E_VALUE_THRESH = 0.04
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
82
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
83 # Initialize the result dictionary
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
84 result_dict = dict()
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
85
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
86 # Store the hits number
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
87 result_dict['hits': len(blast_record.alignments)]
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
88 # Initialize the alignment list
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
89 aligns = list()
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
90 # Scan through all alignments
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
91 for alignment in blast_record.alignments:
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
92 # Initialize the high-score pairs list
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
93 hsps = list()
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
94 # Scan through all hsps
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
95 for hsp in alignment.hsps:
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
96 # filter all hsp has less e-value than E_VALUE_THRESH
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
97 if hsp.expect < E_VALUE_THRESH:
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
98 # Initialize high-score pair dictionary
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
99 hsp_dict = dict()
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
100 # Store all data obtained from BLAST into the dictionary
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
101 hsp_dict['seq'] = alignment.title
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
102 hsp_dict['len'] = alignment.length
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
103 hsp_dict['eval'] = hsp.expect
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
104 # Identity is calculated as the percentage of identities in the alignment
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
105 hsp_dict['ident'] = int(round(hsp.identities * float(100) / alignment.length))
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
106 hsp_dict['query'] = hsp.query
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
107 hsp_dict['match'] = hsp.match
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
108 hsp_dict['sbjct'] = hsp.sbjct
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
109 hsps.append(hsp_dict)
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
110 # Store the hsps into the alignment data
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
111 aligns.append(hsps)
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
112
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
113 # Store alignment data into the dictionary
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
114 result_dict['alignments', aligns]
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
115 return result_dict
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
116
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
117
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
118 # Abstract data from BLAST responded XML (text version)
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
119 # input: the searching result file from BLAST in StringIO
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
120 # return: an dictionary include all data from BLAST result.
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
121 def read_blast_result_text(blast_handle):
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
122
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
123 blast_record = NCBIXML.read(blast_handle)
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
124
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
125 E_VALUE_THRESH = 0.04
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
126
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
127 result_dict = dict()
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
128
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
129 result_dict['hits'] = len(blast_record.alignments)
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
130 aligns = list()
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
131 for alignment in blast_record.alignments:
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
132 hsps = list()
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
133 for hsp in alignment.hsps:
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
134
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
135 if hsp.expect < E_VALUE_THRESH:
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
136 hsp_dict = dict()
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
137 hsp_dict['seq'] = alignment.title
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
138 hsp_dict['len'] = alignment.length
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
139 hsp_dict['eval'] = hsp.expect
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
140 hsp_dict['ident'] = int(round(hsp.identities * float(100) / alignment.length))
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
141 hsp_dict['query'] = hsp.query
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
142 hsp_dict['match'] = hsp.match
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
143 hsp_dict['sbjct'] = hsp.sbjct
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
144 hsps.append(hsp_dict)
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
145 aligns.append(hsps)
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
146
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
147 result_dict['alignments'] = aligns
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
148 return result_dict
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
149
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
150
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
151 # Sample of usage (text version)
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
152 def text():
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
153 result = blast_call_text('MTSQTTINKPGPGDGSPAGSAPAKGWRGHPWVTLVTVAVGVMMVALDGTIVAIANPAIQD\
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
154 DLDASLADVQWITNAYFLALAVALITAGKLGDRFGHRQTFLIGVAGFAASSGAIGLSGSI\
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
155 AAVIVFRVFQGLFGALLMPAALGLLRATFPAEKLNMAIGIWGMVIGASTAGGPILGGVLV\
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
156 EHVNWQSVFFINVPVGIVAVVLGVMILLDHRAANAPRSFDVVGIVLLSASMFALVWALIK\
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
157 APEWGWGSGQTWVYIGGSVVGFVLFSVWETKVKEPLIPLAMFRSVPLSAGVVLMVLMAIA\
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
158 FMGGLFFVTFYLQNVHGMSPVDAGLHLLPLTGMMIVASPLAGAMITKVGPRIPLAGGMVC\
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
159 TAVAMFGISTLETDTGSGLMSIWFGLLGLGLAPVMVGATEVIVGNAPMELSGVAGGLQQA\
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
160 AMQIGGSLGTAVLGAVMASKVDSDLAGNWKDAGLPELTPQQADQASEAVRVGVPPVAPGT\
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
161 PAEVAGKITDVAHDTFISGMSLASLVAAGVAVVAVFVAFLTKRGENAEAGAGVGHI'
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
162 )
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
163 re_dict = read_blast_result_text(result)
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
164 print(re_dict['hits'])
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
165 for alignments in re_dict['alignments']:
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
166 for hsp in alignments:
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
167 print(hsp['seq'])
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
168 print(hsp['len'])
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
169 print(hsp['eval'])
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
170 print(hsp['ident'])
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
171 print(hsp['query'][0:75] + "...")
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
172 print(hsp['match'][0:75] + "...")
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
173 print(hsp['sbjct'][0:75] + "...")
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
174
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
175
45fbe538fa01 Uploaded
nedias
parents:
diff changeset
176