Mercurial > repos > nedias > blast_tool
comparison blast_tool.py @ 0:45fbe538fa01 draft
Uploaded
author | nedias |
---|---|
date | Wed, 12 Oct 2016 18:11:41 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:45fbe538fa01 |
---|---|
1 """ | |
2 Util class of BLAST API | |
3 Use Biopython's BLAST interface to access BLAST API | |
4 BLAST call may take more than 5 min to complete. | |
5 | |
6 Author Nedias Sept, 2016 | |
7 | |
8 """ | |
9 | |
10 | |
11 # BLAST Web api, use to calling BLAST using url | |
12 from Bio.Blast import NCBIWWW | |
13 # BLAST XML util, use to read data from BLAST's response | |
14 from Bio.Blast import NCBIXML | |
15 | |
16 | |
17 # Get command and parameter from entry and call corresponding function | |
18 def exec_tool(options): | |
19 if options.format and options.format == "fasta": | |
20 blast_call(options.input, options.output, options.program, options.database) | |
21 | |
22 | |
23 # Call BLAST protein API | |
24 # input: 1.in_file: an fasta format file with polypeptide data | |
25 # 2.out_file: an XML format file with BLAST result | |
26 # 3.program: define which program of BLAST is used | |
27 # 4.database: define which database of BLAST is used | |
28 # output: the searching result file from BLAST in XML format | |
29 # return: execute status code(developing) | |
30 def blast_call(in_file, out_file, program, database): | |
31 | |
32 # Open input file(read only) | |
33 input_file = open(in_file, "rU") | |
34 # Create an output file(create or override) | |
35 output_file = open(out_file, "w+") | |
36 | |
37 # Read all data from the input fasta file | |
38 fasta_string = input_file.read() | |
39 | |
40 # Call BLAST to search similar protein data | |
41 result_handle = NCBIWWW.qblast(program, database, fasta_string) | |
42 | |
43 # Read result form responded data and save them in output file | |
44 result = result_handle.read() | |
45 output_file.write(result) | |
46 | |
47 return 0 | |
48 | |
49 | |
50 # Call BLAST protein API (text version) | |
51 # input: 1.in_text: polypeptide data text | |
52 # 2.program: define which program of BLAST is used | |
53 # 3.database: define which database of BLAST is used | |
54 # return: the searching result from BLAST in StringIO format | |
55 def blast_call_text(in_text, program, database): | |
56 | |
57 # Call BLAST to search similar protein data | |
58 result_handle = NCBIWWW.qblast(program, database, in_text) | |
59 | |
60 return result_handle | |
61 | |
62 | |
63 # Abstract data from BLAST responded XML | |
64 # input: the searching result file from BLAST in XML format | |
65 # return: an dictionary include all data from BLAST result. | |
66 # the structure of the dictionary is: | |
67 # dict -- hits: Integer, number of total hits during searching | |
68 # -- alignments: List, alignments for all polypeptide chains(there could be many chains in a single file) | |
69 # | (TODO:May consider change to dictionary and use the tag of every polypeptide chain as entry) | |
70 # | | |
71 # -- hsps: List, all high-scoring pairs(alignment) for one polypeptide chain | |
72 # | | |
73 # hps: Dictionary, contains all data of the high-scoring pair | |
74 def read_blast_result(blast_xml): | |
75 | |
76 # Open the BLAST result | |
77 result_file = open(blast_xml) | |
78 # Read all result using biopython | |
79 blast_record = NCBIXML.read(result_file) | |
80 # E-value limit, every data with a less-than 96% e-value will be filtered | |
81 E_VALUE_THRESH = 0.04 | |
82 | |
83 # Initialize the result dictionary | |
84 result_dict = dict() | |
85 | |
86 # Store the hits number | |
87 result_dict['hits': len(blast_record.alignments)] | |
88 # Initialize the alignment list | |
89 aligns = list() | |
90 # Scan through all alignments | |
91 for alignment in blast_record.alignments: | |
92 # Initialize the high-score pairs list | |
93 hsps = list() | |
94 # Scan through all hsps | |
95 for hsp in alignment.hsps: | |
96 # filter all hsp has less e-value than E_VALUE_THRESH | |
97 if hsp.expect < E_VALUE_THRESH: | |
98 # Initialize high-score pair dictionary | |
99 hsp_dict = dict() | |
100 # Store all data obtained from BLAST into the dictionary | |
101 hsp_dict['seq'] = alignment.title | |
102 hsp_dict['len'] = alignment.length | |
103 hsp_dict['eval'] = hsp.expect | |
104 # Identity is calculated as the percentage of identities in the alignment | |
105 hsp_dict['ident'] = int(round(hsp.identities * float(100) / alignment.length)) | |
106 hsp_dict['query'] = hsp.query | |
107 hsp_dict['match'] = hsp.match | |
108 hsp_dict['sbjct'] = hsp.sbjct | |
109 hsps.append(hsp_dict) | |
110 # Store the hsps into the alignment data | |
111 aligns.append(hsps) | |
112 | |
113 # Store alignment data into the dictionary | |
114 result_dict['alignments', aligns] | |
115 return result_dict | |
116 | |
117 | |
118 # Abstract data from BLAST responded XML (text version) | |
119 # input: the searching result file from BLAST in StringIO | |
120 # return: an dictionary include all data from BLAST result. | |
121 def read_blast_result_text(blast_handle): | |
122 | |
123 blast_record = NCBIXML.read(blast_handle) | |
124 | |
125 E_VALUE_THRESH = 0.04 | |
126 | |
127 result_dict = dict() | |
128 | |
129 result_dict['hits'] = len(blast_record.alignments) | |
130 aligns = list() | |
131 for alignment in blast_record.alignments: | |
132 hsps = list() | |
133 for hsp in alignment.hsps: | |
134 | |
135 if hsp.expect < E_VALUE_THRESH: | |
136 hsp_dict = dict() | |
137 hsp_dict['seq'] = alignment.title | |
138 hsp_dict['len'] = alignment.length | |
139 hsp_dict['eval'] = hsp.expect | |
140 hsp_dict['ident'] = int(round(hsp.identities * float(100) / alignment.length)) | |
141 hsp_dict['query'] = hsp.query | |
142 hsp_dict['match'] = hsp.match | |
143 hsp_dict['sbjct'] = hsp.sbjct | |
144 hsps.append(hsp_dict) | |
145 aligns.append(hsps) | |
146 | |
147 result_dict['alignments'] = aligns | |
148 return result_dict | |
149 | |
150 | |
151 # Sample of usage (text version) | |
152 def text(): | |
153 result = blast_call_text('MTSQTTINKPGPGDGSPAGSAPAKGWRGHPWVTLVTVAVGVMMVALDGTIVAIANPAIQD\ | |
154 DLDASLADVQWITNAYFLALAVALITAGKLGDRFGHRQTFLIGVAGFAASSGAIGLSGSI\ | |
155 AAVIVFRVFQGLFGALLMPAALGLLRATFPAEKLNMAIGIWGMVIGASTAGGPILGGVLV\ | |
156 EHVNWQSVFFINVPVGIVAVVLGVMILLDHRAANAPRSFDVVGIVLLSASMFALVWALIK\ | |
157 APEWGWGSGQTWVYIGGSVVGFVLFSVWETKVKEPLIPLAMFRSVPLSAGVVLMVLMAIA\ | |
158 FMGGLFFVTFYLQNVHGMSPVDAGLHLLPLTGMMIVASPLAGAMITKVGPRIPLAGGMVC\ | |
159 TAVAMFGISTLETDTGSGLMSIWFGLLGLGLAPVMVGATEVIVGNAPMELSGVAGGLQQA\ | |
160 AMQIGGSLGTAVLGAVMASKVDSDLAGNWKDAGLPELTPQQADQASEAVRVGVPPVAPGT\ | |
161 PAEVAGKITDVAHDTFISGMSLASLVAAGVAVVAVFVAFLTKRGENAEAGAGVGHI' | |
162 ) | |
163 re_dict = read_blast_result_text(result) | |
164 print(re_dict['hits']) | |
165 for alignments in re_dict['alignments']: | |
166 for hsp in alignments: | |
167 print(hsp['seq']) | |
168 print(hsp['len']) | |
169 print(hsp['eval']) | |
170 print(hsp['ident']) | |
171 print(hsp['query'][0:75] + "...") | |
172 print(hsp['match'][0:75] + "...") | |
173 print(hsp['sbjct'][0:75] + "...") | |
174 | |
175 | |
176 |