comparison blast_tool.py @ 0:ec8fe63afdb8 draft

Uploaded
author nedias
date Wed, 12 Oct 2016 00:03:16 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ec8fe63afdb8
1 """
2 Util class of BLAST API
3 Use Biopython's BLAST interface to access BLAST API
4 BLAST call may take more than 5 min to complete.
5
6 Author Nedias Sept, 2016
7
8 """
9
10
11 # BLAST Web api, use to calling BLAST using url
12 from Bio.Blast import NCBIWWW
13 # BLAST XML util, use to read data from BLAST's response
14 from Bio.Blast import NCBIXML
15
16
17 def exec_tool(options):
18 if options.format and options.format == "fasta":
19 blast_call(options.input, options.output)
20
21
22
23
24 # Call BLAST protein API
25 # input: an fasta format file with polypeptide data
26 # output: the searching result file from BLAST in XML format
27 # return: execute status code(developing)
28 def blast_call(in_file, out_file):
29
30 # Open input file(read only)
31 input_file = open(in_file, "rU")
32 # Create an output file(create or override)
33 output_file = open(out_file, "w+")
34
35 # Read all data from the input fasta file
36 fasta_string = input_file.read()
37
38 # Call BLAST to search similar protein data
39 # blastp's "p" means using the protein search API
40 # "nr" means not specific database, which will perform a search base on all possible database
41 # TODO:May consider parametrize these options
42 result_handle = NCBIWWW.qblast("blastp", "nr", fasta_string)
43
44 # Read result form responded data and save them in output file
45 result = result_handle.read()
46 output_file.write(result)
47
48 return 0
49
50
51 # Call BLAST protein API (text version)
52 # input: polypeptide data text
53 # return: the searching result from BLAST in StringIO format
54 def blast_call_text(in_text):
55
56 # Call BLAST to search similar protein data
57 result_handle = NCBIWWW.qblast("blastp", "nr", in_text)
58
59 return result_handle
60
61
62 # Abstract data from BLAST responded XML
63 # input: the searching result file from BLAST in XML format
64 # return: an dictionary include all data from BLAST result.
65 # the structure of the dictionary is:
66 # dict -- hits: Integer, number of total hits during searching
67 # -- alignments: List, alignments for all polypeptide chains(there could be many chains in a single file)
68 # | (TODO:May consider change to dictionary and use the tag of every polypeptide chain as entry)
69 # |
70 # -- hsps: List, all high-scoring pairs(alignment) for one polypeptide chain
71 # |
72 # hps: Dictionary, contains all data of the high-scoring pair
73 def read_blast_result(blast_xml):
74
75 # Open the BLAST result
76 result_file = open(blast_xml)
77 # Read all result using biopython
78 blast_record = NCBIXML.read(result_file)
79 # E-value limit, every data with a less-than 96% e-value will be filtered
80 E_VALUE_THRESH = 0.04
81
82 # Initialize the result dictionary
83 result_dict = dict()
84
85 # Store the hits number
86 result_dict['hits': len(blast_record.alignments)]
87 # Initialize the alignment list
88 aligns = list()
89 # Scan through all alignments
90 for alignment in blast_record.alignments:
91 # Initialize the high-score pairs list
92 hsps = list()
93 # Scan through all hsps
94 for hsp in alignment.hsps:
95 # filter all hsp has less e-value than E_VALUE_THRESH
96 if hsp.expect < E_VALUE_THRESH:
97 # Initialize high-score pair dictionary
98 hsp_dict = dict()
99 # Store all data obtained from BLAST into the dictionary
100 hsp_dict['seq'] = alignment.title
101 hsp_dict['len'] = alignment.length
102 hsp_dict['eval'] = hsp.expect
103 # Identity is calculated as the percentage of identities in the alignment
104 hsp_dict['ident'] = int(round(hsp.identities * float(100) / alignment.length))
105 hsp_dict['query'] = hsp.query
106 hsp_dict['match'] = hsp.match
107 hsp_dict['sbjct'] = hsp.sbjct
108 hsps.append(hsp_dict)
109 # Store the hsps into the alignment data
110 aligns.append(hsps)
111
112 # Store alignment data into the dictionary
113 result_dict['alignments', aligns]
114 return result_dict
115
116
117 # Abstract data from BLAST responded XML (text version)
118 # input: the searching result file from BLAST in StringIO
119 # return: an dictionary include all data from BLAST result.
120 def read_blast_result_text(blast_handle):
121
122 blast_record = NCBIXML.read(blast_handle)
123
124 E_VALUE_THRESH = 0.04
125
126 result_dict = dict()
127
128 result_dict['hits'] = len(blast_record.alignments)
129 aligns = list()
130 for alignment in blast_record.alignments:
131 hsps = list()
132 for hsp in alignment.hsps:
133
134 if hsp.expect < E_VALUE_THRESH:
135 hsp_dict = dict()
136 hsp_dict['seq'] = alignment.title
137 hsp_dict['len'] = alignment.length
138 hsp_dict['eval'] = hsp.expect
139 hsp_dict['ident'] = int(round(hsp.identities * float(100) / alignment.length))
140 hsp_dict['query'] = hsp.query
141 hsp_dict['match'] = hsp.match
142 hsp_dict['sbjct'] = hsp.sbjct
143 hsps.append(hsp_dict)
144 aligns.append(hsps)
145
146 result_dict['alignments'] = aligns
147 return result_dict
148
149
150 # Sample of usage (text version)
151 def text():
152 result = blast_call_text('MTSQTTINKPGPGDGSPAGSAPAKGWRGHPWVTLVTVAVGVMMVALDGTIVAIANPAIQD\
153 DLDASLADVQWITNAYFLALAVALITAGKLGDRFGHRQTFLIGVAGFAASSGAIGLSGSI\
154 AAVIVFRVFQGLFGALLMPAALGLLRATFPAEKLNMAIGIWGMVIGASTAGGPILGGVLV\
155 EHVNWQSVFFINVPVGIVAVVLGVMILLDHRAANAPRSFDVVGIVLLSASMFALVWALIK\
156 APEWGWGSGQTWVYIGGSVVGFVLFSVWETKVKEPLIPLAMFRSVPLSAGVVLMVLMAIA\
157 FMGGLFFVTFYLQNVHGMSPVDAGLHLLPLTGMMIVASPLAGAMITKVGPRIPLAGGMVC\
158 TAVAMFGISTLETDTGSGLMSIWFGLLGLGLAPVMVGATEVIVGNAPMELSGVAGGLQQA\
159 AMQIGGSLGTAVLGAVMASKVDSDLAGNWKDAGLPELTPQQADQASEAVRVGVPPVAPGT\
160 PAEVAGKITDVAHDTFISGMSLASLVAAGVAVVAVFVAFLTKRGENAEAGAGVGHI'
161 )
162 re_dict = read_blast_result_text(result)
163 print(re_dict['hits'])
164 for alignments in re_dict['alignments']:
165 for hsp in alignments:
166 print(hsp['seq'])
167 print(hsp['len'])
168 print(hsp['eval'])
169 print(hsp['ident'])
170 print(hsp['query'][0:75] + "...")
171 print(hsp['match'][0:75] + "...")
172 print(hsp['sbjct'][0:75] + "...")
173
174
175