comparison blast_tool.py @ 0:45fbe538fa01 draft

Uploaded
author nedias
date Wed, 12 Oct 2016 18:11:41 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:45fbe538fa01
1 """
2 Util class of BLAST API
3 Use Biopython's BLAST interface to access BLAST API
4 BLAST call may take more than 5 min to complete.
5
6 Author Nedias Sept, 2016
7
8 """
9
10
11 # BLAST Web api, use to calling BLAST using url
12 from Bio.Blast import NCBIWWW
13 # BLAST XML util, use to read data from BLAST's response
14 from Bio.Blast import NCBIXML
15
16
17 # Get command and parameter from entry and call corresponding function
18 def exec_tool(options):
19 if options.format and options.format == "fasta":
20 blast_call(options.input, options.output, options.program, options.database)
21
22
23 # Call BLAST protein API
24 # input: 1.in_file: an fasta format file with polypeptide data
25 # 2.out_file: an XML format file with BLAST result
26 # 3.program: define which program of BLAST is used
27 # 4.database: define which database of BLAST is used
28 # output: the searching result file from BLAST in XML format
29 # return: execute status code(developing)
30 def blast_call(in_file, out_file, program, database):
31
32 # Open input file(read only)
33 input_file = open(in_file, "rU")
34 # Create an output file(create or override)
35 output_file = open(out_file, "w+")
36
37 # Read all data from the input fasta file
38 fasta_string = input_file.read()
39
40 # Call BLAST to search similar protein data
41 result_handle = NCBIWWW.qblast(program, database, fasta_string)
42
43 # Read result form responded data and save them in output file
44 result = result_handle.read()
45 output_file.write(result)
46
47 return 0
48
49
50 # Call BLAST protein API (text version)
51 # input: 1.in_text: polypeptide data text
52 # 2.program: define which program of BLAST is used
53 # 3.database: define which database of BLAST is used
54 # return: the searching result from BLAST in StringIO format
55 def blast_call_text(in_text, program, database):
56
57 # Call BLAST to search similar protein data
58 result_handle = NCBIWWW.qblast(program, database, in_text)
59
60 return result_handle
61
62
63 # Abstract data from BLAST responded XML
64 # input: the searching result file from BLAST in XML format
65 # return: an dictionary include all data from BLAST result.
66 # the structure of the dictionary is:
67 # dict -- hits: Integer, number of total hits during searching
68 # -- alignments: List, alignments for all polypeptide chains(there could be many chains in a single file)
69 # | (TODO:May consider change to dictionary and use the tag of every polypeptide chain as entry)
70 # |
71 # -- hsps: List, all high-scoring pairs(alignment) for one polypeptide chain
72 # |
73 # hps: Dictionary, contains all data of the high-scoring pair
74 def read_blast_result(blast_xml):
75
76 # Open the BLAST result
77 result_file = open(blast_xml)
78 # Read all result using biopython
79 blast_record = NCBIXML.read(result_file)
80 # E-value limit, every data with a less-than 96% e-value will be filtered
81 E_VALUE_THRESH = 0.04
82
83 # Initialize the result dictionary
84 result_dict = dict()
85
86 # Store the hits number
87 result_dict['hits': len(blast_record.alignments)]
88 # Initialize the alignment list
89 aligns = list()
90 # Scan through all alignments
91 for alignment in blast_record.alignments:
92 # Initialize the high-score pairs list
93 hsps = list()
94 # Scan through all hsps
95 for hsp in alignment.hsps:
96 # filter all hsp has less e-value than E_VALUE_THRESH
97 if hsp.expect < E_VALUE_THRESH:
98 # Initialize high-score pair dictionary
99 hsp_dict = dict()
100 # Store all data obtained from BLAST into the dictionary
101 hsp_dict['seq'] = alignment.title
102 hsp_dict['len'] = alignment.length
103 hsp_dict['eval'] = hsp.expect
104 # Identity is calculated as the percentage of identities in the alignment
105 hsp_dict['ident'] = int(round(hsp.identities * float(100) / alignment.length))
106 hsp_dict['query'] = hsp.query
107 hsp_dict['match'] = hsp.match
108 hsp_dict['sbjct'] = hsp.sbjct
109 hsps.append(hsp_dict)
110 # Store the hsps into the alignment data
111 aligns.append(hsps)
112
113 # Store alignment data into the dictionary
114 result_dict['alignments', aligns]
115 return result_dict
116
117
118 # Abstract data from BLAST responded XML (text version)
119 # input: the searching result file from BLAST in StringIO
120 # return: an dictionary include all data from BLAST result.
121 def read_blast_result_text(blast_handle):
122
123 blast_record = NCBIXML.read(blast_handle)
124
125 E_VALUE_THRESH = 0.04
126
127 result_dict = dict()
128
129 result_dict['hits'] = len(blast_record.alignments)
130 aligns = list()
131 for alignment in blast_record.alignments:
132 hsps = list()
133 for hsp in alignment.hsps:
134
135 if hsp.expect < E_VALUE_THRESH:
136 hsp_dict = dict()
137 hsp_dict['seq'] = alignment.title
138 hsp_dict['len'] = alignment.length
139 hsp_dict['eval'] = hsp.expect
140 hsp_dict['ident'] = int(round(hsp.identities * float(100) / alignment.length))
141 hsp_dict['query'] = hsp.query
142 hsp_dict['match'] = hsp.match
143 hsp_dict['sbjct'] = hsp.sbjct
144 hsps.append(hsp_dict)
145 aligns.append(hsps)
146
147 result_dict['alignments'] = aligns
148 return result_dict
149
150
151 # Sample of usage (text version)
152 def text():
153 result = blast_call_text('MTSQTTINKPGPGDGSPAGSAPAKGWRGHPWVTLVTVAVGVMMVALDGTIVAIANPAIQD\
154 DLDASLADVQWITNAYFLALAVALITAGKLGDRFGHRQTFLIGVAGFAASSGAIGLSGSI\
155 AAVIVFRVFQGLFGALLMPAALGLLRATFPAEKLNMAIGIWGMVIGASTAGGPILGGVLV\
156 EHVNWQSVFFINVPVGIVAVVLGVMILLDHRAANAPRSFDVVGIVLLSASMFALVWALIK\
157 APEWGWGSGQTWVYIGGSVVGFVLFSVWETKVKEPLIPLAMFRSVPLSAGVVLMVLMAIA\
158 FMGGLFFVTFYLQNVHGMSPVDAGLHLLPLTGMMIVASPLAGAMITKVGPRIPLAGGMVC\
159 TAVAMFGISTLETDTGSGLMSIWFGLLGLGLAPVMVGATEVIVGNAPMELSGVAGGLQQA\
160 AMQIGGSLGTAVLGAVMASKVDSDLAGNWKDAGLPELTPQQADQASEAVRVGVPPVAPGT\
161 PAEVAGKITDVAHDTFISGMSLASLVAAGVAVVAVFVAFLTKRGENAEAGAGVGHI'
162 )
163 re_dict = read_blast_result_text(result)
164 print(re_dict['hits'])
165 for alignments in re_dict['alignments']:
166 for hsp in alignments:
167 print(hsp['seq'])
168 print(hsp['len'])
169 print(hsp['eval'])
170 print(hsp['ident'])
171 print(hsp['query'][0:75] + "...")
172 print(hsp['match'][0:75] + "...")
173 print(hsp['sbjct'][0:75] + "...")
174
175
176