Mercurial > repos > nedias > orf_tools
view blast_tool.py @ 3:0095bf758b19 draft
Uploaded
author | nedias |
---|---|
date | Wed, 12 Oct 2016 00:03:53 -0400 |
parents | ec8fe63afdb8 |
children |
line wrap: on
line source
""" Util class of BLAST API Use Biopython's BLAST interface to access BLAST API BLAST call may take more than 5 min to complete. Author Nedias Sept, 2016 """ # BLAST Web api, use to calling BLAST using url from Bio.Blast import NCBIWWW # BLAST XML util, use to read data from BLAST's response from Bio.Blast import NCBIXML def exec_tool(options): if options.format and options.format == "fasta": blast_call(options.input, options.output) # Call BLAST protein API # input: an fasta format file with polypeptide data # output: the searching result file from BLAST in XML format # return: execute status code(developing) def blast_call(in_file, out_file): # Open input file(read only) input_file = open(in_file, "rU") # Create an output file(create or override) output_file = open(out_file, "w+") # Read all data from the input fasta file fasta_string = input_file.read() # Call BLAST to search similar protein data # blastp's "p" means using the protein search API # "nr" means not specific database, which will perform a search base on all possible database # TODO:May consider parametrize these options result_handle = NCBIWWW.qblast("blastp", "nr", fasta_string) # Read result form responded data and save them in output file result = result_handle.read() output_file.write(result) return 0 # Call BLAST protein API (text version) # input: polypeptide data text # return: the searching result from BLAST in StringIO format def blast_call_text(in_text): # Call BLAST to search similar protein data result_handle = NCBIWWW.qblast("blastp", "nr", in_text) return result_handle # Abstract data from BLAST responded XML # input: the searching result file from BLAST in XML format # return: an dictionary include all data from BLAST result. # the structure of the dictionary is: # dict -- hits: Integer, number of total hits during searching # -- alignments: List, alignments for all polypeptide chains(there could be many chains in a single file) # | (TODO:May consider change to dictionary and use the tag of every polypeptide chain as entry) # | # -- hsps: List, all high-scoring pairs(alignment) for one polypeptide chain # | # hps: Dictionary, contains all data of the high-scoring pair def read_blast_result(blast_xml): # Open the BLAST result result_file = open(blast_xml) # Read all result using biopython blast_record = NCBIXML.read(result_file) # E-value limit, every data with a less-than 96% e-value will be filtered E_VALUE_THRESH = 0.04 # Initialize the result dictionary result_dict = dict() # Store the hits number result_dict['hits': len(blast_record.alignments)] # Initialize the alignment list aligns = list() # Scan through all alignments for alignment in blast_record.alignments: # Initialize the high-score pairs list hsps = list() # Scan through all hsps for hsp in alignment.hsps: # filter all hsp has less e-value than E_VALUE_THRESH if hsp.expect < E_VALUE_THRESH: # Initialize high-score pair dictionary hsp_dict = dict() # Store all data obtained from BLAST into the dictionary hsp_dict['seq'] = alignment.title hsp_dict['len'] = alignment.length hsp_dict['eval'] = hsp.expect # Identity is calculated as the percentage of identities in the alignment hsp_dict['ident'] = int(round(hsp.identities * float(100) / alignment.length)) hsp_dict['query'] = hsp.query hsp_dict['match'] = hsp.match hsp_dict['sbjct'] = hsp.sbjct hsps.append(hsp_dict) # Store the hsps into the alignment data aligns.append(hsps) # Store alignment data into the dictionary result_dict['alignments', aligns] return result_dict # Abstract data from BLAST responded XML (text version) # input: the searching result file from BLAST in StringIO # return: an dictionary include all data from BLAST result. def read_blast_result_text(blast_handle): blast_record = NCBIXML.read(blast_handle) E_VALUE_THRESH = 0.04 result_dict = dict() result_dict['hits'] = len(blast_record.alignments) aligns = list() for alignment in blast_record.alignments: hsps = list() for hsp in alignment.hsps: if hsp.expect < E_VALUE_THRESH: hsp_dict = dict() hsp_dict['seq'] = alignment.title hsp_dict['len'] = alignment.length hsp_dict['eval'] = hsp.expect hsp_dict['ident'] = int(round(hsp.identities * float(100) / alignment.length)) hsp_dict['query'] = hsp.query hsp_dict['match'] = hsp.match hsp_dict['sbjct'] = hsp.sbjct hsps.append(hsp_dict) aligns.append(hsps) result_dict['alignments'] = aligns return result_dict # Sample of usage (text version) def text(): result = blast_call_text('MTSQTTINKPGPGDGSPAGSAPAKGWRGHPWVTLVTVAVGVMMVALDGTIVAIANPAIQD\ DLDASLADVQWITNAYFLALAVALITAGKLGDRFGHRQTFLIGVAGFAASSGAIGLSGSI\ AAVIVFRVFQGLFGALLMPAALGLLRATFPAEKLNMAIGIWGMVIGASTAGGPILGGVLV\ EHVNWQSVFFINVPVGIVAVVLGVMILLDHRAANAPRSFDVVGIVLLSASMFALVWALIK\ APEWGWGSGQTWVYIGGSVVGFVLFSVWETKVKEPLIPLAMFRSVPLSAGVVLMVLMAIA\ FMGGLFFVTFYLQNVHGMSPVDAGLHLLPLTGMMIVASPLAGAMITKVGPRIPLAGGMVC\ TAVAMFGISTLETDTGSGLMSIWFGLLGLGLAPVMVGATEVIVGNAPMELSGVAGGLQQA\ AMQIGGSLGTAVLGAVMASKVDSDLAGNWKDAGLPELTPQQADQASEAVRVGVPPVAPGT\ PAEVAGKITDVAHDTFISGMSLASLVAAGVAVVAVFVAFLTKRGENAEAGAGVGHI' ) re_dict = read_blast_result_text(result) print(re_dict['hits']) for alignments in re_dict['alignments']: for hsp in alignments: print(hsp['seq']) print(hsp['len']) print(hsp['eval']) print(hsp['ident']) print(hsp['query'][0:75] + "...") print(hsp['match'][0:75] + "...") print(hsp['sbjct'][0:75] + "...")