0
|
1 """
|
|
2 Util class of BLAST API
|
|
3 Use Biopython's BLAST interface to access BLAST API
|
|
4 BLAST call may take more than 5 min to complete.
|
|
5
|
|
6 Author Nedias Sept, 2016
|
|
7
|
|
8 """
|
|
9
|
|
10
|
|
11 # BLAST Web api, use to calling BLAST using url
|
|
12 from Bio.Blast import NCBIWWW
|
|
13 # BLAST XML util, use to read data from BLAST's response
|
|
14 from Bio.Blast import NCBIXML
|
|
15
|
|
16
|
|
17 # Get command and parameter from entry and call corresponding function
|
|
18 def exec_tool(options):
|
|
19 if options.format and options.format == "fasta":
|
|
20 blast_call(options.input, options.output, options.program, options.database)
|
|
21
|
|
22
|
|
23 # Call BLAST protein API
|
|
24 # input: 1.in_file: an fasta format file with polypeptide data
|
|
25 # 2.out_file: an XML format file with BLAST result
|
|
26 # 3.program: define which program of BLAST is used
|
|
27 # 4.database: define which database of BLAST is used
|
|
28 # output: the searching result file from BLAST in XML format
|
|
29 # return: execute status code(developing)
|
|
30 def blast_call(in_file, out_file, program, database):
|
|
31
|
|
32 # Open input file(read only)
|
|
33 input_file = open(in_file, "rU")
|
|
34 # Create an output file(create or override)
|
|
35 output_file = open(out_file, "w+")
|
|
36
|
|
37 # Read all data from the input fasta file
|
|
38 fasta_string = input_file.read()
|
|
39
|
|
40 # Call BLAST to search similar protein data
|
|
41 result_handle = NCBIWWW.qblast(program, database, fasta_string)
|
|
42
|
|
43 # Read result form responded data and save them in output file
|
|
44 result = result_handle.read()
|
|
45 output_file.write(result)
|
|
46
|
|
47 return 0
|
|
48
|
|
49
|
|
50 # Call BLAST protein API (text version)
|
|
51 # input: 1.in_text: polypeptide data text
|
|
52 # 2.program: define which program of BLAST is used
|
|
53 # 3.database: define which database of BLAST is used
|
|
54 # return: the searching result from BLAST in StringIO format
|
|
55 def blast_call_text(in_text, program, database):
|
|
56
|
|
57 # Call BLAST to search similar protein data
|
|
58 result_handle = NCBIWWW.qblast(program, database, in_text)
|
|
59
|
|
60 return result_handle
|
|
61
|
|
62
|
|
63 # Abstract data from BLAST responded XML
|
|
64 # input: the searching result file from BLAST in XML format
|
|
65 # return: an dictionary include all data from BLAST result.
|
|
66 # the structure of the dictionary is:
|
|
67 # dict -- hits: Integer, number of total hits during searching
|
|
68 # -- alignments: List, alignments for all polypeptide chains(there could be many chains in a single file)
|
|
69 # | (TODO:May consider change to dictionary and use the tag of every polypeptide chain as entry)
|
|
70 # |
|
|
71 # -- hsps: List, all high-scoring pairs(alignment) for one polypeptide chain
|
|
72 # |
|
|
73 # hps: Dictionary, contains all data of the high-scoring pair
|
|
74 def read_blast_result(blast_xml):
|
|
75
|
|
76 # Open the BLAST result
|
|
77 result_file = open(blast_xml)
|
|
78 # Read all result using biopython
|
|
79 blast_record = NCBIXML.read(result_file)
|
|
80 # E-value limit, every data with a less-than 96% e-value will be filtered
|
|
81 E_VALUE_THRESH = 0.04
|
|
82
|
|
83 # Initialize the result dictionary
|
|
84 result_dict = dict()
|
|
85
|
|
86 # Store the hits number
|
|
87 result_dict['hits': len(blast_record.alignments)]
|
|
88 # Initialize the alignment list
|
|
89 aligns = list()
|
|
90 # Scan through all alignments
|
|
91 for alignment in blast_record.alignments:
|
|
92 # Initialize the high-score pairs list
|
|
93 hsps = list()
|
|
94 # Scan through all hsps
|
|
95 for hsp in alignment.hsps:
|
|
96 # filter all hsp has less e-value than E_VALUE_THRESH
|
|
97 if hsp.expect < E_VALUE_THRESH:
|
|
98 # Initialize high-score pair dictionary
|
|
99 hsp_dict = dict()
|
|
100 # Store all data obtained from BLAST into the dictionary
|
|
101 hsp_dict['seq'] = alignment.title
|
|
102 hsp_dict['len'] = alignment.length
|
|
103 hsp_dict['eval'] = hsp.expect
|
|
104 # Identity is calculated as the percentage of identities in the alignment
|
|
105 hsp_dict['ident'] = int(round(hsp.identities * float(100) / alignment.length))
|
|
106 hsp_dict['query'] = hsp.query
|
|
107 hsp_dict['match'] = hsp.match
|
|
108 hsp_dict['sbjct'] = hsp.sbjct
|
|
109 hsps.append(hsp_dict)
|
|
110 # Store the hsps into the alignment data
|
|
111 aligns.append(hsps)
|
|
112
|
|
113 # Store alignment data into the dictionary
|
|
114 result_dict['alignments', aligns]
|
|
115 return result_dict
|
|
116
|
|
117
|
|
118 # Abstract data from BLAST responded XML (text version)
|
|
119 # input: the searching result file from BLAST in StringIO
|
|
120 # return: an dictionary include all data from BLAST result.
|
|
121 def read_blast_result_text(blast_handle):
|
|
122
|
|
123 blast_record = NCBIXML.read(blast_handle)
|
|
124
|
|
125 E_VALUE_THRESH = 0.04
|
|
126
|
|
127 result_dict = dict()
|
|
128
|
|
129 result_dict['hits'] = len(blast_record.alignments)
|
|
130 aligns = list()
|
|
131 for alignment in blast_record.alignments:
|
|
132 hsps = list()
|
|
133 for hsp in alignment.hsps:
|
|
134
|
|
135 if hsp.expect < E_VALUE_THRESH:
|
|
136 hsp_dict = dict()
|
|
137 hsp_dict['seq'] = alignment.title
|
|
138 hsp_dict['len'] = alignment.length
|
|
139 hsp_dict['eval'] = hsp.expect
|
|
140 hsp_dict['ident'] = int(round(hsp.identities * float(100) / alignment.length))
|
|
141 hsp_dict['query'] = hsp.query
|
|
142 hsp_dict['match'] = hsp.match
|
|
143 hsp_dict['sbjct'] = hsp.sbjct
|
|
144 hsps.append(hsp_dict)
|
|
145 aligns.append(hsps)
|
|
146
|
|
147 result_dict['alignments'] = aligns
|
|
148 return result_dict
|
|
149
|
|
150
|
|
151 # Sample of usage (text version)
|
|
152 def text():
|
|
153 result = blast_call_text('MTSQTTINKPGPGDGSPAGSAPAKGWRGHPWVTLVTVAVGVMMVALDGTIVAIANPAIQD\
|
|
154 DLDASLADVQWITNAYFLALAVALITAGKLGDRFGHRQTFLIGVAGFAASSGAIGLSGSI\
|
|
155 AAVIVFRVFQGLFGALLMPAALGLLRATFPAEKLNMAIGIWGMVIGASTAGGPILGGVLV\
|
|
156 EHVNWQSVFFINVPVGIVAVVLGVMILLDHRAANAPRSFDVVGIVLLSASMFALVWALIK\
|
|
157 APEWGWGSGQTWVYIGGSVVGFVLFSVWETKVKEPLIPLAMFRSVPLSAGVVLMVLMAIA\
|
|
158 FMGGLFFVTFYLQNVHGMSPVDAGLHLLPLTGMMIVASPLAGAMITKVGPRIPLAGGMVC\
|
|
159 TAVAMFGISTLETDTGSGLMSIWFGLLGLGLAPVMVGATEVIVGNAPMELSGVAGGLQQA\
|
|
160 AMQIGGSLGTAVLGAVMASKVDSDLAGNWKDAGLPELTPQQADQASEAVRVGVPPVAPGT\
|
|
161 PAEVAGKITDVAHDTFISGMSLASLVAAGVAVVAVFVAFLTKRGENAEAGAGVGHI'
|
|
162 )
|
|
163 re_dict = read_blast_result_text(result)
|
|
164 print(re_dict['hits'])
|
|
165 for alignments in re_dict['alignments']:
|
|
166 for hsp in alignments:
|
|
167 print(hsp['seq'])
|
|
168 print(hsp['len'])
|
|
169 print(hsp['eval'])
|
|
170 print(hsp['ident'])
|
|
171 print(hsp['query'][0:75] + "...")
|
|
172 print(hsp['match'][0:75] + "...")
|
|
173 print(hsp['sbjct'][0:75] + "...")
|
|
174
|
|
175
|
|
176
|