# HG changeset patch # User nedias # Date 1476310382 14400 # Node ID 0eda2c588bcd15a76a0d10ac08007cf2c4d77f05 # Parent c2096761a8c33c222a91ba4e151a25e31e61f9cf Deleted selected files diff -r c2096761a8c3 -r 0eda2c588bcd blast_tool.py --- a/blast_tool.py Wed Oct 12 00:05:20 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,175 +0,0 @@ -""" - Util class of BLAST API - Use Biopython's BLAST interface to access BLAST API - BLAST call may take more than 5 min to complete. - - Author Nedias Sept, 2016 - -""" - - -# BLAST Web api, use to calling BLAST using url -from Bio.Blast import NCBIWWW -# BLAST XML util, use to read data from BLAST's response -from Bio.Blast import NCBIXML - - -def exec_tool(options): - if options.format and options.format == "fasta": - blast_call(options.input, options.output) - - - - -# Call BLAST protein API -# input: an fasta format file with polypeptide data -# output: the searching result file from BLAST in XML format -# return: execute status code(developing) -def blast_call(in_file, out_file): - - # Open input file(read only) - input_file = open(in_file, "rU") - # Create an output file(create or override) - output_file = open(out_file, "w+") - - # Read all data from the input fasta file - fasta_string = input_file.read() - - # Call BLAST to search similar protein data - # blastp's "p" means using the protein search API - # "nr" means not specific database, which will perform a search base on all possible database - # TODO:May consider parametrize these options - result_handle = NCBIWWW.qblast("blastp", "nr", fasta_string) - - # Read result form responded data and save them in output file - result = result_handle.read() - output_file.write(result) - - return 0 - - -# Call BLAST protein API (text version) -# input: polypeptide data text -# return: the searching result from BLAST in StringIO format -def blast_call_text(in_text): - - # Call BLAST to search similar protein data - result_handle = NCBIWWW.qblast("blastp", "nr", in_text) - - return result_handle - - -# Abstract data from BLAST responded XML -# input: the searching result file from BLAST in XML format -# return: an dictionary include all data from BLAST result. -# the structure of the dictionary is: -# dict -- hits: Integer, number of total hits during searching -# -- alignments: List, alignments for all polypeptide chains(there could be many chains in a single file) -# | (TODO:May consider change to dictionary and use the tag of every polypeptide chain as entry) -# | -# -- hsps: List, all high-scoring pairs(alignment) for one polypeptide chain -# | -# hps: Dictionary, contains all data of the high-scoring pair -def read_blast_result(blast_xml): - - # Open the BLAST result - result_file = open(blast_xml) - # Read all result using biopython - blast_record = NCBIXML.read(result_file) - # E-value limit, every data with a less-than 96% e-value will be filtered - E_VALUE_THRESH = 0.04 - - # Initialize the result dictionary - result_dict = dict() - - # Store the hits number - result_dict['hits': len(blast_record.alignments)] - # Initialize the alignment list - aligns = list() - # Scan through all alignments - for alignment in blast_record.alignments: - # Initialize the high-score pairs list - hsps = list() - # Scan through all hsps - for hsp in alignment.hsps: - # filter all hsp has less e-value than E_VALUE_THRESH - if hsp.expect < E_VALUE_THRESH: - # Initialize high-score pair dictionary - hsp_dict = dict() - # Store all data obtained from BLAST into the dictionary - hsp_dict['seq'] = alignment.title - hsp_dict['len'] = alignment.length - hsp_dict['eval'] = hsp.expect - # Identity is calculated as the percentage of identities in the alignment - hsp_dict['ident'] = int(round(hsp.identities * float(100) / alignment.length)) - hsp_dict['query'] = hsp.query - hsp_dict['match'] = hsp.match - hsp_dict['sbjct'] = hsp.sbjct - hsps.append(hsp_dict) - # Store the hsps into the alignment data - aligns.append(hsps) - - # Store alignment data into the dictionary - result_dict['alignments', aligns] - return result_dict - - -# Abstract data from BLAST responded XML (text version) -# input: the searching result file from BLAST in StringIO -# return: an dictionary include all data from BLAST result. -def read_blast_result_text(blast_handle): - - blast_record = NCBIXML.read(blast_handle) - - E_VALUE_THRESH = 0.04 - - result_dict = dict() - - result_dict['hits'] = len(blast_record.alignments) - aligns = list() - for alignment in blast_record.alignments: - hsps = list() - for hsp in alignment.hsps: - - if hsp.expect < E_VALUE_THRESH: - hsp_dict = dict() - hsp_dict['seq'] = alignment.title - hsp_dict['len'] = alignment.length - hsp_dict['eval'] = hsp.expect - hsp_dict['ident'] = int(round(hsp.identities * float(100) / alignment.length)) - hsp_dict['query'] = hsp.query - hsp_dict['match'] = hsp.match - hsp_dict['sbjct'] = hsp.sbjct - hsps.append(hsp_dict) - aligns.append(hsps) - - result_dict['alignments'] = aligns - return result_dict - - -# Sample of usage (text version) -def text(): - result = blast_call_text('MTSQTTINKPGPGDGSPAGSAPAKGWRGHPWVTLVTVAVGVMMVALDGTIVAIANPAIQD\ - DLDASLADVQWITNAYFLALAVALITAGKLGDRFGHRQTFLIGVAGFAASSGAIGLSGSI\ - AAVIVFRVFQGLFGALLMPAALGLLRATFPAEKLNMAIGIWGMVIGASTAGGPILGGVLV\ - EHVNWQSVFFINVPVGIVAVVLGVMILLDHRAANAPRSFDVVGIVLLSASMFALVWALIK\ - APEWGWGSGQTWVYIGGSVVGFVLFSVWETKVKEPLIPLAMFRSVPLSAGVVLMVLMAIA\ - FMGGLFFVTFYLQNVHGMSPVDAGLHLLPLTGMMIVASPLAGAMITKVGPRIPLAGGMVC\ - TAVAMFGISTLETDTGSGLMSIWFGLLGLGLAPVMVGATEVIVGNAPMELSGVAGGLQQA\ - AMQIGGSLGTAVLGAVMASKVDSDLAGNWKDAGLPELTPQQADQASEAVRVGVPPVAPGT\ - PAEVAGKITDVAHDTFISGMSLASLVAAGVAVVAVFVAFLTKRGENAEAGAGVGHI' - ) - re_dict = read_blast_result_text(result) - print(re_dict['hits']) - for alignments in re_dict['alignments']: - for hsp in alignments: - print(hsp['seq']) - print(hsp['len']) - print(hsp['eval']) - print(hsp['ident']) - print(hsp['query'][0:75] + "...") - print(hsp['match'][0:75] + "...") - print(hsp['sbjct'][0:75] + "...") - - -