comparison rbpfeatures.py @ 0:8918de535391 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/rna_commander/tools/rna_tools/rna_commender commit 2fc7f3c08f30e2d81dc4ad19759dfe7ba9b0a3a1
author rnateam
date Tue, 31 May 2016 05:41:03 -0400
parents
children a609d6dc8047
comparison
equal deleted inserted replaced
-1:000000000000 0:8918de535391
1 """Compute the RBP features."""
2
3 import re
4 import subprocess as sp
5 import uuid
6 from os import mkdir
7 from os import listdir
8 from os.path import isfile, join
9 from os import devnull
10 from shutil import rmtree
11
12 import numpy as np
13
14 import pandas as pd
15
16 import fasta_utils
17 import pfam_utils
18
19 __author__ = "Gianluca Corrado"
20 __copyright__ = "Copyright 2016, Gianluca Corrado"
21 __license__ = "MIT"
22 __maintainer__ = "Gianluca Corrado"
23 __email__ = "gianluca.corrado@unitn.it"
24 __status__ = "Production"
25
26
27 class RBPVectorizer():
28 """Compute the RBP features."""
29
30 def __init__(self, fasta):
31 """
32 Constructor.
33
34 Parameters
35 ----------
36 fasta : str
37 Fasta file containing the RBP sequences to predict.
38 """
39 self.fasta = fasta
40
41 self._mod_fold = "AURA_Human_data/RBP_features/mod"
42 self._reference_fisher_scores = \
43 "AURA_Human_data/RBP_features/fisher_scores_ref"
44 self._train_rbps_file = \
45 "AURA_Human_data/RBP_features/rbps_in_train.txt"
46
47 self._temp_fold = "temp_" + str(uuid.uuid4())
48 self.pfam_scan = "%s/pfam_scan.txt" % self._temp_fold
49 self._dom_fold = "%s/domains" % self._temp_fold
50 self._seeds_fold = "%s/seeds" % self._temp_fold
51 self._fisher_fold = "%s/fisher_scores" % self._temp_fold
52
53 def _pfam_scan(self):
54 """Scan the sequences against the Pfam database."""
55 nf = open(self.pfam_scan, "w")
56 nf.write(pfam_utils.search_header())
57
58 fasta = fasta_utils.import_fasta(self.fasta)
59
60 for rbp in sorted(fasta.keys()):
61 seq = fasta[rbp]
62 text = pfam_utils.sequence_search(rbp, seq)
63 nf.write(text)
64
65 nf.close()
66
67 def _overlapping_domains(self):
68 """Compute the set of domains contributing to the similarity."""
69 reference_domains = set([dom.replace(".mod", "") for dom in
70 listdir(self._mod_fold) if
71 isfile(join(self._mod_fold, dom))])
72
73 data = pfam_utils.read_pfam_output(self.pfam_scan)
74 if data is None:
75 return []
76
77 prot_domains = set([a.split('.')[0] for a in data["hmm_acc"]])
78 dom_list = sorted(list(reference_domains & prot_domains))
79
80 return dom_list
81
82 def _prepare_domains(self, dom_list):
83 """Select domain subsequences from the entire protein sequences."""
84 def prepare_domains(fasta_dic, dom_list, pfam_scan, out_folder):
85 out_file_dic = {}
86 for acc in dom_list:
87 out_file_dic[acc] = open("%s/%s.fa" % (out_folder, acc), "w")
88
89 f = open(pfam_scan)
90 f.readline()
91 for line in f:
92 split = line.split()
93 rbp = split[0]
94 start = int(split[3])
95 stop = int(split[4])
96 acc = split[5].split('.')[0]
97 if acc in out_file_dic.keys():
98 out_file_dic[acc].write(
99 ">%s:%i-%i\n%s\n" % (rbp, start, stop,
100 fasta_dic[rbp][start:stop]))
101 f.close()
102
103 for acc in dom_list:
104 out_file_dic[acc].close()
105
106 mkdir(self._dom_fold)
107 fasta = fasta_utils.import_fasta(self.fasta)
108 prepare_domains(fasta, dom_list, self.pfam_scan,
109 self._dom_fold)
110
111 def _compute_fisher_scores(self, dom_list):
112 """Wrapper for SAM 3.5 get_fisher_scores."""
113 def get_fisher_scores(dom_list, mod_fold, dom_fold, fisher_fold):
114 for acc in dom_list:
115 _FNULL = open(devnull, 'w')
116 cmd = "get_fisher_scores run -i %s/%s.mod -db %s/%s.fa" % (
117 mod_fold, acc, dom_fold, acc)
118 fisher = sp.check_output(
119 cmd, shell=True, stderr=_FNULL)
120 nf = open("%s/%s.txt" % (fisher_fold, acc), "w")
121 nf.write(fisher)
122 nf.close()
123
124 mkdir(self._fisher_fold)
125 get_fisher_scores(dom_list, self._mod_fold, self._dom_fold,
126 self._fisher_fold)
127
128 def _ekm(self, dom_list):
129 """Compute the empirical kernel map from the Fisher scores."""
130 def process_seg(e):
131 """Process segment of a SAM 3.5 get_fisher_scores output file."""
132 seg = e.split()
133 c = seg[0].split(':')[0]
134 m = map(float, seg[3:])
135 return c, m
136
137 def read_sam_file(samfile):
138 """Read a SAM 3.5 get_fisher_scores output file."""
139 f = open(samfile)
140 data = f.read()
141 f.close()
142
143 columns = []
144 m = []
145 split = re.split(">A ", data)[1:]
146 for e in split:
147 c, m_ = process_seg(e)
148 columns.append(c)
149 m.append(m_)
150
151 m = np.matrix(m)
152 df = pd.DataFrame(data=m.T, columns=columns)
153 return df
154
155 def dom_features(fisher_fold, dom_list, names=None):
156 """Compute the features with respect to a domain type."""
157 dfs = []
158 for acc in dom_list:
159 df = read_sam_file("%s/%s.txt" % (fisher_fold, acc))
160 df = df.groupby(df.columns, axis=1).mean()
161 dfs.append(df)
162
163 con = pd.concat(dfs, ignore_index=True)
164
165 if names is not None:
166 add = sorted(list(set(names) - set(con.columns)))
167 fil = sorted(list(set(names) - set(add)))
168 con = con[fil]
169 for c in add:
170 con[c] = np.zeros(len(con.index), dtype='float64')
171 con = con[names]
172
173 con = con.fillna(0.0)
174 return con
175
176 f = open(self._train_rbps_file)
177 train_rbps = f.read().strip().split('\n')
178 f.close()
179 ref = dom_features(self._reference_fisher_scores, dom_list,
180 names=train_rbps)
181 ekm_ref = ref.T.dot(ref)
182 ekm_ref.index = ekm_ref.columns
183
184 sel = dom_features(self._fisher_fold, dom_list)
185
186 ekm_sel = sel.T.dot(sel)
187 ekm_sel.index = ekm_sel.columns
188
189 ekm = ref.T.dot(sel)
190
191 for rs in ekm.columns:
192 for rr in ekm.index:
193 if ekm_ref[rr][rr] > 0 and ekm_sel[rs][rs] > 0:
194 ekm[rs][rr] /= np.sqrt(ekm_ref[rr][rr] * ekm_sel[rs][rs])
195 return ekm
196
197 def vectorize(self):
198 """Produce the RBP features."""
199 # create a temporary folder
200 mkdir(self._temp_fold)
201 # scan the RBP sequences against Pfam
202 self._pfam_scan()
203 # determine the accession numbers of the pfam domains needed for
204 # computing the features
205 dom_list = self._overlapping_domains()
206 if len(dom_list) == 0:
207 rmtree(self._temp_fold)
208 return None
209 # prepare fasta file with the sequence of the domains
210 self._prepare_domains(dom_list)
211 # compute fisher scores using SAM 3.5
212 self._compute_fisher_scores(dom_list)
213 # compute the empirical kernel map
214 ekm = self._ekm(dom_list)
215 # remove the temporary folder
216 rmtree(self._temp_fold)
217 return ekm