0
|
1 """
|
|
2 Util class of BLAST API
|
|
3 Use Biopython's BLAST interface to access BLAST API
|
|
4 BLAST call may take more than 5 min to complete.
|
|
5
|
|
6 Author Nedias Sept, 2016
|
|
7
|
|
8 """
|
|
9
|
|
10
|
|
11 # BLAST Web api, use to calling BLAST using url
|
|
12 from Bio.Blast import NCBIWWW
|
|
13 # BLAST XML util, use to read data from BLAST's response
|
|
14 from Bio.Blast import NCBIXML
|
|
15
|
|
16
|
|
17 def exec_tool(options):
|
|
18 if options.format and options.format == "fasta":
|
|
19 blast_call(options.input, options.output)
|
|
20
|
|
21
|
|
22
|
|
23
|
|
24 # Call BLAST protein API
|
|
25 # input: an fasta format file with polypeptide data
|
|
26 # output: the searching result file from BLAST in XML format
|
|
27 # return: execute status code(developing)
|
|
28 def blast_call(in_file, out_file):
|
|
29
|
|
30 # Open input file(read only)
|
|
31 input_file = open(in_file, "rU")
|
|
32 # Create an output file(create or override)
|
|
33 output_file = open(out_file, "w+")
|
|
34
|
|
35 # Read all data from the input fasta file
|
|
36 fasta_string = input_file.read()
|
|
37
|
|
38 # Call BLAST to search similar protein data
|
|
39 # blastp's "p" means using the protein search API
|
|
40 # "nr" means not specific database, which will perform a search base on all possible database
|
|
41 # TODO:May consider parametrize these options
|
|
42 result_handle = NCBIWWW.qblast("blastp", "nr", fasta_string)
|
|
43
|
|
44 # Read result form responded data and save them in output file
|
|
45 result = result_handle.read()
|
|
46 output_file.write(result)
|
|
47
|
|
48 return 0
|
|
49
|
|
50
|
|
51 # Call BLAST protein API (text version)
|
|
52 # input: polypeptide data text
|
|
53 # return: the searching result from BLAST in StringIO format
|
|
54 def blast_call_text(in_text):
|
|
55
|
|
56 # Call BLAST to search similar protein data
|
|
57 result_handle = NCBIWWW.qblast("blastp", "nr", in_text)
|
|
58
|
|
59 return result_handle
|
|
60
|
|
61
|
|
62 # Abstract data from BLAST responded XML
|
|
63 # input: the searching result file from BLAST in XML format
|
|
64 # return: an dictionary include all data from BLAST result.
|
|
65 # the structure of the dictionary is:
|
|
66 # dict -- hits: Integer, number of total hits during searching
|
|
67 # -- alignments: List, alignments for all polypeptide chains(there could be many chains in a single file)
|
|
68 # | (TODO:May consider change to dictionary and use the tag of every polypeptide chain as entry)
|
|
69 # |
|
|
70 # -- hsps: List, all high-scoring pairs(alignment) for one polypeptide chain
|
|
71 # |
|
|
72 # hps: Dictionary, contains all data of the high-scoring pair
|
|
73 def read_blast_result(blast_xml):
|
|
74
|
|
75 # Open the BLAST result
|
|
76 result_file = open(blast_xml)
|
|
77 # Read all result using biopython
|
|
78 blast_record = NCBIXML.read(result_file)
|
|
79 # E-value limit, every data with a less-than 96% e-value will be filtered
|
|
80 E_VALUE_THRESH = 0.04
|
|
81
|
|
82 # Initialize the result dictionary
|
|
83 result_dict = dict()
|
|
84
|
|
85 # Store the hits number
|
|
86 result_dict['hits': len(blast_record.alignments)]
|
|
87 # Initialize the alignment list
|
|
88 aligns = list()
|
|
89 # Scan through all alignments
|
|
90 for alignment in blast_record.alignments:
|
|
91 # Initialize the high-score pairs list
|
|
92 hsps = list()
|
|
93 # Scan through all hsps
|
|
94 for hsp in alignment.hsps:
|
|
95 # filter all hsp has less e-value than E_VALUE_THRESH
|
|
96 if hsp.expect < E_VALUE_THRESH:
|
|
97 # Initialize high-score pair dictionary
|
|
98 hsp_dict = dict()
|
|
99 # Store all data obtained from BLAST into the dictionary
|
|
100 hsp_dict['seq'] = alignment.title
|
|
101 hsp_dict['len'] = alignment.length
|
|
102 hsp_dict['eval'] = hsp.expect
|
|
103 # Identity is calculated as the percentage of identities in the alignment
|
|
104 hsp_dict['ident'] = int(round(hsp.identities * float(100) / alignment.length))
|
|
105 hsp_dict['query'] = hsp.query
|
|
106 hsp_dict['match'] = hsp.match
|
|
107 hsp_dict['sbjct'] = hsp.sbjct
|
|
108 hsps.append(hsp_dict)
|
|
109 # Store the hsps into the alignment data
|
|
110 aligns.append(hsps)
|
|
111
|
|
112 # Store alignment data into the dictionary
|
|
113 result_dict['alignments', aligns]
|
|
114 return result_dict
|
|
115
|
|
116
|
|
117 # Abstract data from BLAST responded XML (text version)
|
|
118 # input: the searching result file from BLAST in StringIO
|
|
119 # return: an dictionary include all data from BLAST result.
|
|
120 def read_blast_result_text(blast_handle):
|
|
121
|
|
122 blast_record = NCBIXML.read(blast_handle)
|
|
123
|
|
124 E_VALUE_THRESH = 0.04
|
|
125
|
|
126 result_dict = dict()
|
|
127
|
|
128 result_dict['hits'] = len(blast_record.alignments)
|
|
129 aligns = list()
|
|
130 for alignment in blast_record.alignments:
|
|
131 hsps = list()
|
|
132 for hsp in alignment.hsps:
|
|
133
|
|
134 if hsp.expect < E_VALUE_THRESH:
|
|
135 hsp_dict = dict()
|
|
136 hsp_dict['seq'] = alignment.title
|
|
137 hsp_dict['len'] = alignment.length
|
|
138 hsp_dict['eval'] = hsp.expect
|
|
139 hsp_dict['ident'] = int(round(hsp.identities * float(100) / alignment.length))
|
|
140 hsp_dict['query'] = hsp.query
|
|
141 hsp_dict['match'] = hsp.match
|
|
142 hsp_dict['sbjct'] = hsp.sbjct
|
|
143 hsps.append(hsp_dict)
|
|
144 aligns.append(hsps)
|
|
145
|
|
146 result_dict['alignments'] = aligns
|
|
147 return result_dict
|
|
148
|
|
149
|
|
150 # Sample of usage (text version)
|
|
151 def text():
|
|
152 result = blast_call_text('MTSQTTINKPGPGDGSPAGSAPAKGWRGHPWVTLVTVAVGVMMVALDGTIVAIANPAIQD\
|
|
153 DLDASLADVQWITNAYFLALAVALITAGKLGDRFGHRQTFLIGVAGFAASSGAIGLSGSI\
|
|
154 AAVIVFRVFQGLFGALLMPAALGLLRATFPAEKLNMAIGIWGMVIGASTAGGPILGGVLV\
|
|
155 EHVNWQSVFFINVPVGIVAVVLGVMILLDHRAANAPRSFDVVGIVLLSASMFALVWALIK\
|
|
156 APEWGWGSGQTWVYIGGSVVGFVLFSVWETKVKEPLIPLAMFRSVPLSAGVVLMVLMAIA\
|
|
157 FMGGLFFVTFYLQNVHGMSPVDAGLHLLPLTGMMIVASPLAGAMITKVGPRIPLAGGMVC\
|
|
158 TAVAMFGISTLETDTGSGLMSIWFGLLGLGLAPVMVGATEVIVGNAPMELSGVAGGLQQA\
|
|
159 AMQIGGSLGTAVLGAVMASKVDSDLAGNWKDAGLPELTPQQADQASEAVRVGVPPVAPGT\
|
|
160 PAEVAGKITDVAHDTFISGMSLASLVAAGVAVVAVFVAFLTKRGENAEAGAGVGHI'
|
|
161 )
|
|
162 re_dict = read_blast_result_text(result)
|
|
163 print(re_dict['hits'])
|
|
164 for alignments in re_dict['alignments']:
|
|
165 for hsp in alignments:
|
|
166 print(hsp['seq'])
|
|
167 print(hsp['len'])
|
|
168 print(hsp['eval'])
|
|
169 print(hsp['ident'])
|
|
170 print(hsp['query'][0:75] + "...")
|
|
171 print(hsp['match'][0:75] + "...")
|
|
172 print(hsp['sbjct'][0:75] + "...")
|
|
173
|
|
174
|
|
175
|