Mercurial > repos > nedias > orf_tools

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/blast_tool.py	Wed Oct 12 00:03:16 2016 -0400
@@ -0,0 +1,175 @@
+"""
+ Util class of BLAST API
+ Use Biopython's BLAST interface to access BLAST API
+ BLAST call may take more than 5 min to complete.
+
+ Author Nedias  Sept, 2016
+
+"""
+
+
+# BLAST Web api, use to calling BLAST using url
+from Bio.Blast import NCBIWWW
+# BLAST XML util, use to read data from BLAST's response
+from Bio.Blast import NCBIXML
+
+
+def exec_tool(options):
+    if options.format and options.format == "fasta":
+        blast_call(options.input, options.output)
+
+
+
+
+# Call BLAST protein API
+# input: an fasta format file with polypeptide data
+# output: the searching result file from BLAST in XML format
+# return: execute status code(developing)
+def blast_call(in_file, out_file):
+
+    # Open input file(read only)
+    input_file = open(in_file, "rU")
+    # Create an output file(create or override)
+    output_file = open(out_file, "w+")
+
+    # Read all data from the input fasta file
+    fasta_string = input_file.read()
+
+    # Call BLAST to search similar protein data
+    # blastp's "p" means using the protein search API
+    # "nr" means not specific database, which will perform a search base on all possible database
+    # TODO:May consider parametrize these options
+    result_handle = NCBIWWW.qblast("blastp", "nr", fasta_string)
+
+    # Read result form responded data and save them in output file
+    result = result_handle.read()
+    output_file.write(result)
+
+    return 0
+
+
+# Call BLAST protein API (text version)
+# input: polypeptide data text
+# return: the searching result from BLAST in StringIO format
+def blast_call_text(in_text):
+
+    # Call BLAST to search similar protein data
+    result_handle = NCBIWWW.qblast("blastp", "nr", in_text)
+
+    return result_handle
+
+
+# Abstract data from BLAST responded XML
+# input: the searching result file from BLAST in XML format
+# return: an dictionary include all data from BLAST result.
+#      the structure of the dictionary is:
+#      dict -- hits: Integer, number of total hits during searching
+#           -- alignments: List, alignments for all polypeptide chains(there could be many chains in a single file)
+#                |          (TODO:May consider change to dictionary and use the tag of every polypeptide chain as entry)
+#                |
+#                -- hsps: List, all high-scoring pairs(alignment) for one polypeptide chain
+#                    |
+#                    hps: Dictionary, contains all data of the high-scoring pair
+def read_blast_result(blast_xml):
+
+    # Open the BLAST result
+    result_file = open(blast_xml)
+    # Read all result using biopython
+    blast_record = NCBIXML.read(result_file)
+    # E-value limit, every data with a less-than 96% e-value will be filtered
+    E_VALUE_THRESH = 0.04
+
+    # Initialize the result dictionary
+    result_dict = dict()
+
+    # Store the hits number
+    result_dict['hits': len(blast_record.alignments)]
+    # Initialize the alignment list
+    aligns = list()
+    # Scan through all alignments
+    for alignment in blast_record.alignments:
+        # Initialize the high-score pairs list
+        hsps = list()
+        # Scan through all hsps
+        for hsp in alignment.hsps:
+            # filter all hsp has less e-value than E_VALUE_THRESH
+            if hsp.expect < E_VALUE_THRESH:
+                # Initialize high-score pair dictionary
+                hsp_dict = dict()
+                # Store all data obtained from BLAST into the dictionary
+                hsp_dict['seq'] = alignment.title
+                hsp_dict['len'] = alignment.length
+                hsp_dict['eval'] = hsp.expect
+                # Identity is calculated as the percentage of identities in the alignment
+                hsp_dict['ident'] = int(round(hsp.identities * float(100) / alignment.length))
+                hsp_dict['query'] = hsp.query
+                hsp_dict['match'] = hsp.match
+                hsp_dict['sbjct'] = hsp.sbjct
+            hsps.append(hsp_dict)
+        # Store the hsps into the alignment data
+        aligns.append(hsps)
+
+    # Store alignment data into the dictionary
+    result_dict['alignments', aligns]
+    return result_dict
+
+
+# Abstract data from BLAST responded XML (text version)
+# input: the searching result file from BLAST in StringIO
+# return: an dictionary include all data from BLAST result.
+def read_blast_result_text(blast_handle):
+
+    blast_record = NCBIXML.read(blast_handle)
+
+    E_VALUE_THRESH = 0.04
+
+    result_dict = dict()
+
+    result_dict['hits'] =  len(blast_record.alignments)
+    aligns = list()
+    for alignment in blast_record.alignments:
+        hsps = list()
+        for hsp in alignment.hsps:
+
+            if hsp.expect < E_VALUE_THRESH:
+                hsp_dict = dict()
+                hsp_dict['seq'] = alignment.title
+                hsp_dict['len'] = alignment.length
+                hsp_dict['eval'] = hsp.expect
+                hsp_dict['ident'] = int(round(hsp.identities * float(100) / alignment.length))
+                hsp_dict['query'] = hsp.query
+                hsp_dict['match'] = hsp.match
+                hsp_dict['sbjct'] = hsp.sbjct
+            hsps.append(hsp_dict)
+        aligns.append(hsps)
+
+    result_dict['alignments'] = aligns
+    return result_dict
+
+
+# Sample of usage (text version)
+def text():
+    result = blast_call_text('MTSQTTINKPGPGDGSPAGSAPAKGWRGHPWVTLVTVAVGVMMVALDGTIVAIANPAIQD\
+                            DLDASLADVQWITNAYFLALAVALITAGKLGDRFGHRQTFLIGVAGFAASSGAIGLSGSI\
+                            AAVIVFRVFQGLFGALLMPAALGLLRATFPAEKLNMAIGIWGMVIGASTAGGPILGGVLV\
+                            EHVNWQSVFFINVPVGIVAVVLGVMILLDHRAANAPRSFDVVGIVLLSASMFALVWALIK\
+                            APEWGWGSGQTWVYIGGSVVGFVLFSVWETKVKEPLIPLAMFRSVPLSAGVVLMVLMAIA\
+                            FMGGLFFVTFYLQNVHGMSPVDAGLHLLPLTGMMIVASPLAGAMITKVGPRIPLAGGMVC\
+                            TAVAMFGISTLETDTGSGLMSIWFGLLGLGLAPVMVGATEVIVGNAPMELSGVAGGLQQA\
+                            AMQIGGSLGTAVLGAVMASKVDSDLAGNWKDAGLPELTPQQADQASEAVRVGVPPVAPGT\
+                            PAEVAGKITDVAHDTFISGMSLASLVAAGVAVVAVFVAFLTKRGENAEAGAGVGHI'
+                            )
+    re_dict = read_blast_result_text(result)
+    print(re_dict['hits'])
+    for alignments in re_dict['alignments']:
+        for hsp in alignments:
+            print(hsp['seq'])
+            print(hsp['len'])
+            print(hsp['eval'])
+            print(hsp['ident'])
+            print(hsp['query'][0:75] + "...")
+            print(hsp['match'][0:75] + "...")
+            print(hsp['sbjct'][0:75] + "...")
+
+
+