diff pfam_utils/__init__.py @ 0:8918de535391 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/rna_commander/tools/rna_tools/rna_commender commit 2fc7f3c08f30e2d81dc4ad19759dfe7ba9b0a3a1
author rnateam
date Tue, 31 May 2016 05:41:03 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pfam_utils/__init__.py	Tue May 31 05:41:03 2016 -0400
@@ -0,0 +1,139 @@
+"""Utils for PFAM."""
+
+import xml.etree.ElementTree as ET
+from math import ceil
+from time import sleep
+from xml.etree.ElementTree import ParseError
+
+import requests
+
+import pandas as pd
+
+import fasta_utils
+
+__author__ = "Gianluca Corrado"
+__copyright__ = "Copyright 2016, Gianluca Corrado"
+__license__ = "MIT"
+__maintainer__ = "Gianluca Corrado"
+__email__ = "gianluca.corrado@unitn.it"
+__status__ = "Production"
+
+
+def search_header():
+    """Return the header of a Pfam scan search."""
+    return "<seq id>        <alignment start>       <alignment end> \
+    <envelope start>        <envelope end>  <hmm acc>       <hmm name>\
+          <type>  <hmm start>     <hmm end>       <hmm length>    <bit score>\
+               <E-value>       <significance>  <clan>\n"
+
+
+def sequence_search(seq_id, seq):
+    """
+    Search a sequence against PFAM.
+
+    Input
+    -----
+    seq_id : str
+        Name of the protein sequence.
+    seq : str
+        Protein sequence.
+
+    Output
+    ------
+    ret : str
+        Formatted string containing the results of the Pfam scan for the
+        given sequence
+    """
+    def add_spaces(text, mul=8):
+        """Add spaces to a string."""
+        l = len(text)
+        next_mul = int(ceil(l / mul) + 1) * mul
+        offset = next_mul - l
+        if offset == 0:
+            offset = 8
+        return text + " " * offset
+
+    url = "http://pfam.xfam.org/search/sequence"
+    params = {'seq': seq,
+              'evalue': '1.0',
+              'output': 'xml'}
+    req = requests.get(url, params=params)
+    xml = req.text
+    try:
+        root = ET.fromstring(xml)
+    # sometimes Pfam returns the HTML code
+    except ParseError:
+        print "resending: %s" % seq_id
+        return "%s" % sequence_search(seq_id, seq)
+
+    result_url = root[0][1].text
+    # wait for Pfam to compute the results
+    sleep(4)
+    while True:
+        req2 = requests.get(result_url)
+        if req2.status_code == 200:
+            break
+        else:
+            sleep(1)
+    result_xml = req2.text
+    root = ET.fromstring(result_xml)
+    try:
+        matches = root[0][0][0][0][:]
+    # Sometimes raised when the sequence has no matches
+    except IndexError:
+        return ""
+    ret = ""
+    for match in matches:
+        for location in match:
+            ret += add_spaces(seq_id)
+            ret += add_spaces(location.attrib['ali_start'])
+            ret += add_spaces(location.attrib['ali_end'])
+            ret += add_spaces(location.attrib['start'])
+            ret += add_spaces(location.attrib['end'])
+            ret += add_spaces(match.attrib['accession'])
+            ret += add_spaces(match.attrib['id'])
+            ret += add_spaces(match.attrib['class'])
+            ret += add_spaces(location.attrib['hmm_start'])
+            ret += add_spaces(location.attrib['hmm_end'])
+            ret += add_spaces("None")
+            ret += add_spaces(location.attrib['bitscore'])
+            ret += add_spaces(location.attrib['evalue'])
+            ret += add_spaces(location.attrib['significant'])
+            ret += "None\n"
+    return ret
+
+
+def read_pfam_output(pfam_out_file):
+    """Read the output of PFAM scan."""
+    cols = ["seq_id", "alignment_start", "alignment_end", "envelope_start",
+            "envelope_end", "hmm_acc", "hmm_name", "type", "hmm_start",
+            "hmm_end", "hmm_length", "bit_score", "E-value", "significance",
+            "clan"]
+    try:
+        data = pd.read_table(pfam_out_file,
+                             sep="\s*", skip_blank_lines=True, skiprows=1,
+                             names=cols, engine='python')
+    except:
+        return None
+    return data
+
+
+def download_seed_seqs(acc):
+    """
+    Download seed sequences from PFAM.
+
+    Input
+    -----
+    acc : str
+        Accession number of a Pfam domain
+
+    Output
+    ------
+    fasta : str
+        Seed sequences in fasta format
+    """
+    url = "http://pfam.xfam.org/family/%s/alignment/seed" % acc
+    req = requests.get(url)
+    stockholm = req.text
+    fasta = fasta_utils.stockholm2fasta(stockholm)
+    return fasta