Mercurial > repos > rnateam > rnacommender
comparison rbpfeatures.py @ 0:8918de535391 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/rna_commander/tools/rna_tools/rna_commender commit 2fc7f3c08f30e2d81dc4ad19759dfe7ba9b0a3a1
author | rnateam |
---|---|
date | Tue, 31 May 2016 05:41:03 -0400 |
parents | |
children | a609d6dc8047 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:8918de535391 |
---|---|
1 """Compute the RBP features.""" | |
2 | |
3 import re | |
4 import subprocess as sp | |
5 import uuid | |
6 from os import mkdir | |
7 from os import listdir | |
8 from os.path import isfile, join | |
9 from os import devnull | |
10 from shutil import rmtree | |
11 | |
12 import numpy as np | |
13 | |
14 import pandas as pd | |
15 | |
16 import fasta_utils | |
17 import pfam_utils | |
18 | |
19 __author__ = "Gianluca Corrado" | |
20 __copyright__ = "Copyright 2016, Gianluca Corrado" | |
21 __license__ = "MIT" | |
22 __maintainer__ = "Gianluca Corrado" | |
23 __email__ = "gianluca.corrado@unitn.it" | |
24 __status__ = "Production" | |
25 | |
26 | |
27 class RBPVectorizer(): | |
28 """Compute the RBP features.""" | |
29 | |
30 def __init__(self, fasta): | |
31 """ | |
32 Constructor. | |
33 | |
34 Parameters | |
35 ---------- | |
36 fasta : str | |
37 Fasta file containing the RBP sequences to predict. | |
38 """ | |
39 self.fasta = fasta | |
40 | |
41 self._mod_fold = "AURA_Human_data/RBP_features/mod" | |
42 self._reference_fisher_scores = \ | |
43 "AURA_Human_data/RBP_features/fisher_scores_ref" | |
44 self._train_rbps_file = \ | |
45 "AURA_Human_data/RBP_features/rbps_in_train.txt" | |
46 | |
47 self._temp_fold = "temp_" + str(uuid.uuid4()) | |
48 self.pfam_scan = "%s/pfam_scan.txt" % self._temp_fold | |
49 self._dom_fold = "%s/domains" % self._temp_fold | |
50 self._seeds_fold = "%s/seeds" % self._temp_fold | |
51 self._fisher_fold = "%s/fisher_scores" % self._temp_fold | |
52 | |
53 def _pfam_scan(self): | |
54 """Scan the sequences against the Pfam database.""" | |
55 nf = open(self.pfam_scan, "w") | |
56 nf.write(pfam_utils.search_header()) | |
57 | |
58 fasta = fasta_utils.import_fasta(self.fasta) | |
59 | |
60 for rbp in sorted(fasta.keys()): | |
61 seq = fasta[rbp] | |
62 text = pfam_utils.sequence_search(rbp, seq) | |
63 nf.write(text) | |
64 | |
65 nf.close() | |
66 | |
67 def _overlapping_domains(self): | |
68 """Compute the set of domains contributing to the similarity.""" | |
69 reference_domains = set([dom.replace(".mod", "") for dom in | |
70 listdir(self._mod_fold) if | |
71 isfile(join(self._mod_fold, dom))]) | |
72 | |
73 data = pfam_utils.read_pfam_output(self.pfam_scan) | |
74 if data is None: | |
75 return [] | |
76 | |
77 prot_domains = set([a.split('.')[0] for a in data["hmm_acc"]]) | |
78 dom_list = sorted(list(reference_domains & prot_domains)) | |
79 | |
80 return dom_list | |
81 | |
82 def _prepare_domains(self, dom_list): | |
83 """Select domain subsequences from the entire protein sequences.""" | |
84 def prepare_domains(fasta_dic, dom_list, pfam_scan, out_folder): | |
85 out_file_dic = {} | |
86 for acc in dom_list: | |
87 out_file_dic[acc] = open("%s/%s.fa" % (out_folder, acc), "w") | |
88 | |
89 f = open(pfam_scan) | |
90 f.readline() | |
91 for line in f: | |
92 split = line.split() | |
93 rbp = split[0] | |
94 start = int(split[3]) | |
95 stop = int(split[4]) | |
96 acc = split[5].split('.')[0] | |
97 if acc in out_file_dic.keys(): | |
98 out_file_dic[acc].write( | |
99 ">%s:%i-%i\n%s\n" % (rbp, start, stop, | |
100 fasta_dic[rbp][start:stop])) | |
101 f.close() | |
102 | |
103 for acc in dom_list: | |
104 out_file_dic[acc].close() | |
105 | |
106 mkdir(self._dom_fold) | |
107 fasta = fasta_utils.import_fasta(self.fasta) | |
108 prepare_domains(fasta, dom_list, self.pfam_scan, | |
109 self._dom_fold) | |
110 | |
111 def _compute_fisher_scores(self, dom_list): | |
112 """Wrapper for SAM 3.5 get_fisher_scores.""" | |
113 def get_fisher_scores(dom_list, mod_fold, dom_fold, fisher_fold): | |
114 for acc in dom_list: | |
115 _FNULL = open(devnull, 'w') | |
116 cmd = "get_fisher_scores run -i %s/%s.mod -db %s/%s.fa" % ( | |
117 mod_fold, acc, dom_fold, acc) | |
118 fisher = sp.check_output( | |
119 cmd, shell=True, stderr=_FNULL) | |
120 nf = open("%s/%s.txt" % (fisher_fold, acc), "w") | |
121 nf.write(fisher) | |
122 nf.close() | |
123 | |
124 mkdir(self._fisher_fold) | |
125 get_fisher_scores(dom_list, self._mod_fold, self._dom_fold, | |
126 self._fisher_fold) | |
127 | |
128 def _ekm(self, dom_list): | |
129 """Compute the empirical kernel map from the Fisher scores.""" | |
130 def process_seg(e): | |
131 """Process segment of a SAM 3.5 get_fisher_scores output file.""" | |
132 seg = e.split() | |
133 c = seg[0].split(':')[0] | |
134 m = map(float, seg[3:]) | |
135 return c, m | |
136 | |
137 def read_sam_file(samfile): | |
138 """Read a SAM 3.5 get_fisher_scores output file.""" | |
139 f = open(samfile) | |
140 data = f.read() | |
141 f.close() | |
142 | |
143 columns = [] | |
144 m = [] | |
145 split = re.split(">A ", data)[1:] | |
146 for e in split: | |
147 c, m_ = process_seg(e) | |
148 columns.append(c) | |
149 m.append(m_) | |
150 | |
151 m = np.matrix(m) | |
152 df = pd.DataFrame(data=m.T, columns=columns) | |
153 return df | |
154 | |
155 def dom_features(fisher_fold, dom_list, names=None): | |
156 """Compute the features with respect to a domain type.""" | |
157 dfs = [] | |
158 for acc in dom_list: | |
159 df = read_sam_file("%s/%s.txt" % (fisher_fold, acc)) | |
160 df = df.groupby(df.columns, axis=1).mean() | |
161 dfs.append(df) | |
162 | |
163 con = pd.concat(dfs, ignore_index=True) | |
164 | |
165 if names is not None: | |
166 add = sorted(list(set(names) - set(con.columns))) | |
167 fil = sorted(list(set(names) - set(add))) | |
168 con = con[fil] | |
169 for c in add: | |
170 con[c] = np.zeros(len(con.index), dtype='float64') | |
171 con = con[names] | |
172 | |
173 con = con.fillna(0.0) | |
174 return con | |
175 | |
176 f = open(self._train_rbps_file) | |
177 train_rbps = f.read().strip().split('\n') | |
178 f.close() | |
179 ref = dom_features(self._reference_fisher_scores, dom_list, | |
180 names=train_rbps) | |
181 ekm_ref = ref.T.dot(ref) | |
182 ekm_ref.index = ekm_ref.columns | |
183 | |
184 sel = dom_features(self._fisher_fold, dom_list) | |
185 | |
186 ekm_sel = sel.T.dot(sel) | |
187 ekm_sel.index = ekm_sel.columns | |
188 | |
189 ekm = ref.T.dot(sel) | |
190 | |
191 for rs in ekm.columns: | |
192 for rr in ekm.index: | |
193 if ekm_ref[rr][rr] > 0 and ekm_sel[rs][rs] > 0: | |
194 ekm[rs][rr] /= np.sqrt(ekm_ref[rr][rr] * ekm_sel[rs][rs]) | |
195 return ekm | |
196 | |
197 def vectorize(self): | |
198 """Produce the RBP features.""" | |
199 # create a temporary folder | |
200 mkdir(self._temp_fold) | |
201 # scan the RBP sequences against Pfam | |
202 self._pfam_scan() | |
203 # determine the accession numbers of the pfam domains needed for | |
204 # computing the features | |
205 dom_list = self._overlapping_domains() | |
206 if len(dom_list) == 0: | |
207 rmtree(self._temp_fold) | |
208 return None | |
209 # prepare fasta file with the sequence of the domains | |
210 self._prepare_domains(dom_list) | |
211 # compute fisher scores using SAM 3.5 | |
212 self._compute_fisher_scores(dom_list) | |
213 # compute the empirical kernel map | |
214 ekm = self._ekm(dom_list) | |
215 # remove the temporary folder | |
216 rmtree(self._temp_fold) | |
217 return ekm |