comparison pfam_utils/__init__.py @ 0:8918de535391 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/rna_commander/tools/rna_tools/rna_commender commit 2fc7f3c08f30e2d81dc4ad19759dfe7ba9b0a3a1
author rnateam
date Tue, 31 May 2016 05:41:03 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:8918de535391
1 """Utils for PFAM."""
2
3 import xml.etree.ElementTree as ET
4 from math import ceil
5 from time import sleep
6 from xml.etree.ElementTree import ParseError
7
8 import requests
9
10 import pandas as pd
11
12 import fasta_utils
13
14 __author__ = "Gianluca Corrado"
15 __copyright__ = "Copyright 2016, Gianluca Corrado"
16 __license__ = "MIT"
17 __maintainer__ = "Gianluca Corrado"
18 __email__ = "gianluca.corrado@unitn.it"
19 __status__ = "Production"
20
21
22 def search_header():
23 """Return the header of a Pfam scan search."""
24 return "<seq id> <alignment start> <alignment end> \
25 <envelope start> <envelope end> <hmm acc> <hmm name>\
26 <type> <hmm start> <hmm end> <hmm length> <bit score>\
27 <E-value> <significance> <clan>\n"
28
29
30 def sequence_search(seq_id, seq):
31 """
32 Search a sequence against PFAM.
33
34 Input
35 -----
36 seq_id : str
37 Name of the protein sequence.
38 seq : str
39 Protein sequence.
40
41 Output
42 ------
43 ret : str
44 Formatted string containing the results of the Pfam scan for the
45 given sequence
46 """
47 def add_spaces(text, mul=8):
48 """Add spaces to a string."""
49 l = len(text)
50 next_mul = int(ceil(l / mul) + 1) * mul
51 offset = next_mul - l
52 if offset == 0:
53 offset = 8
54 return text + " " * offset
55
56 url = "http://pfam.xfam.org/search/sequence"
57 params = {'seq': seq,
58 'evalue': '1.0',
59 'output': 'xml'}
60 req = requests.get(url, params=params)
61 xml = req.text
62 try:
63 root = ET.fromstring(xml)
64 # sometimes Pfam returns the HTML code
65 except ParseError:
66 print "resending: %s" % seq_id
67 return "%s" % sequence_search(seq_id, seq)
68
69 result_url = root[0][1].text
70 # wait for Pfam to compute the results
71 sleep(4)
72 while True:
73 req2 = requests.get(result_url)
74 if req2.status_code == 200:
75 break
76 else:
77 sleep(1)
78 result_xml = req2.text
79 root = ET.fromstring(result_xml)
80 try:
81 matches = root[0][0][0][0][:]
82 # Sometimes raised when the sequence has no matches
83 except IndexError:
84 return ""
85 ret = ""
86 for match in matches:
87 for location in match:
88 ret += add_spaces(seq_id)
89 ret += add_spaces(location.attrib['ali_start'])
90 ret += add_spaces(location.attrib['ali_end'])
91 ret += add_spaces(location.attrib['start'])
92 ret += add_spaces(location.attrib['end'])
93 ret += add_spaces(match.attrib['accession'])
94 ret += add_spaces(match.attrib['id'])
95 ret += add_spaces(match.attrib['class'])
96 ret += add_spaces(location.attrib['hmm_start'])
97 ret += add_spaces(location.attrib['hmm_end'])
98 ret += add_spaces("None")
99 ret += add_spaces(location.attrib['bitscore'])
100 ret += add_spaces(location.attrib['evalue'])
101 ret += add_spaces(location.attrib['significant'])
102 ret += "None\n"
103 return ret
104
105
106 def read_pfam_output(pfam_out_file):
107 """Read the output of PFAM scan."""
108 cols = ["seq_id", "alignment_start", "alignment_end", "envelope_start",
109 "envelope_end", "hmm_acc", "hmm_name", "type", "hmm_start",
110 "hmm_end", "hmm_length", "bit_score", "E-value", "significance",
111 "clan"]
112 try:
113 data = pd.read_table(pfam_out_file,
114 sep="\s*", skip_blank_lines=True, skiprows=1,
115 names=cols, engine='python')
116 except:
117 return None
118 return data
119
120
121 def download_seed_seqs(acc):
122 """
123 Download seed sequences from PFAM.
124
125 Input
126 -----
127 acc : str
128 Accession number of a Pfam domain
129
130 Output
131 ------
132 fasta : str
133 Seed sequences in fasta format
134 """
135 url = "http://pfam.xfam.org/family/%s/alignment/seed" % acc
136 req = requests.get(url)
137 stockholm = req.text
138 fasta = fasta_utils.stockholm2fasta(stockholm)
139 return fasta