Mercurial > repos > rnateam > rnacommender
comparison pfam_utils/__init__.py @ 0:8918de535391 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/rna_commander/tools/rna_tools/rna_commender commit 2fc7f3c08f30e2d81dc4ad19759dfe7ba9b0a3a1
author | rnateam |
---|---|
date | Tue, 31 May 2016 05:41:03 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:8918de535391 |
---|---|
1 """Utils for PFAM.""" | |
2 | |
3 import xml.etree.ElementTree as ET | |
4 from math import ceil | |
5 from time import sleep | |
6 from xml.etree.ElementTree import ParseError | |
7 | |
8 import requests | |
9 | |
10 import pandas as pd | |
11 | |
12 import fasta_utils | |
13 | |
14 __author__ = "Gianluca Corrado" | |
15 __copyright__ = "Copyright 2016, Gianluca Corrado" | |
16 __license__ = "MIT" | |
17 __maintainer__ = "Gianluca Corrado" | |
18 __email__ = "gianluca.corrado@unitn.it" | |
19 __status__ = "Production" | |
20 | |
21 | |
22 def search_header(): | |
23 """Return the header of a Pfam scan search.""" | |
24 return "<seq id> <alignment start> <alignment end> \ | |
25 <envelope start> <envelope end> <hmm acc> <hmm name>\ | |
26 <type> <hmm start> <hmm end> <hmm length> <bit score>\ | |
27 <E-value> <significance> <clan>\n" | |
28 | |
29 | |
30 def sequence_search(seq_id, seq): | |
31 """ | |
32 Search a sequence against PFAM. | |
33 | |
34 Input | |
35 ----- | |
36 seq_id : str | |
37 Name of the protein sequence. | |
38 seq : str | |
39 Protein sequence. | |
40 | |
41 Output | |
42 ------ | |
43 ret : str | |
44 Formatted string containing the results of the Pfam scan for the | |
45 given sequence | |
46 """ | |
47 def add_spaces(text, mul=8): | |
48 """Add spaces to a string.""" | |
49 l = len(text) | |
50 next_mul = int(ceil(l / mul) + 1) * mul | |
51 offset = next_mul - l | |
52 if offset == 0: | |
53 offset = 8 | |
54 return text + " " * offset | |
55 | |
56 url = "http://pfam.xfam.org/search/sequence" | |
57 params = {'seq': seq, | |
58 'evalue': '1.0', | |
59 'output': 'xml'} | |
60 req = requests.get(url, params=params) | |
61 xml = req.text | |
62 try: | |
63 root = ET.fromstring(xml) | |
64 # sometimes Pfam returns the HTML code | |
65 except ParseError: | |
66 print "resending: %s" % seq_id | |
67 return "%s" % sequence_search(seq_id, seq) | |
68 | |
69 result_url = root[0][1].text | |
70 # wait for Pfam to compute the results | |
71 sleep(4) | |
72 while True: | |
73 req2 = requests.get(result_url) | |
74 if req2.status_code == 200: | |
75 break | |
76 else: | |
77 sleep(1) | |
78 result_xml = req2.text | |
79 root = ET.fromstring(result_xml) | |
80 try: | |
81 matches = root[0][0][0][0][:] | |
82 # Sometimes raised when the sequence has no matches | |
83 except IndexError: | |
84 return "" | |
85 ret = "" | |
86 for match in matches: | |
87 for location in match: | |
88 ret += add_spaces(seq_id) | |
89 ret += add_spaces(location.attrib['ali_start']) | |
90 ret += add_spaces(location.attrib['ali_end']) | |
91 ret += add_spaces(location.attrib['start']) | |
92 ret += add_spaces(location.attrib['end']) | |
93 ret += add_spaces(match.attrib['accession']) | |
94 ret += add_spaces(match.attrib['id']) | |
95 ret += add_spaces(match.attrib['class']) | |
96 ret += add_spaces(location.attrib['hmm_start']) | |
97 ret += add_spaces(location.attrib['hmm_end']) | |
98 ret += add_spaces("None") | |
99 ret += add_spaces(location.attrib['bitscore']) | |
100 ret += add_spaces(location.attrib['evalue']) | |
101 ret += add_spaces(location.attrib['significant']) | |
102 ret += "None\n" | |
103 return ret | |
104 | |
105 | |
106 def read_pfam_output(pfam_out_file): | |
107 """Read the output of PFAM scan.""" | |
108 cols = ["seq_id", "alignment_start", "alignment_end", "envelope_start", | |
109 "envelope_end", "hmm_acc", "hmm_name", "type", "hmm_start", | |
110 "hmm_end", "hmm_length", "bit_score", "E-value", "significance", | |
111 "clan"] | |
112 try: | |
113 data = pd.read_table(pfam_out_file, | |
114 sep="\s*", skip_blank_lines=True, skiprows=1, | |
115 names=cols, engine='python') | |
116 except: | |
117 return None | |
118 return data | |
119 | |
120 | |
121 def download_seed_seqs(acc): | |
122 """ | |
123 Download seed sequences from PFAM. | |
124 | |
125 Input | |
126 ----- | |
127 acc : str | |
128 Accession number of a Pfam domain | |
129 | |
130 Output | |
131 ------ | |
132 fasta : str | |
133 Seed sequences in fasta format | |
134 """ | |
135 url = "http://pfam.xfam.org/family/%s/alignment/seed" % acc | |
136 req = requests.get(url) | |
137 stockholm = req.text | |
138 fasta = fasta_utils.stockholm2fasta(stockholm) | |
139 return fasta |