Mercurial > repos > rnateam > rnacommender
diff rbpfeatures.py @ 0:8918de535391 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/rna_commander/tools/rna_tools/rna_commender commit 2fc7f3c08f30e2d81dc4ad19759dfe7ba9b0a3a1
author | rnateam |
---|---|
date | Tue, 31 May 2016 05:41:03 -0400 |
parents | |
children | a609d6dc8047 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rbpfeatures.py Tue May 31 05:41:03 2016 -0400 @@ -0,0 +1,217 @@ +"""Compute the RBP features.""" + +import re +import subprocess as sp +import uuid +from os import mkdir +from os import listdir +from os.path import isfile, join +from os import devnull +from shutil import rmtree + +import numpy as np + +import pandas as pd + +import fasta_utils +import pfam_utils + +__author__ = "Gianluca Corrado" +__copyright__ = "Copyright 2016, Gianluca Corrado" +__license__ = "MIT" +__maintainer__ = "Gianluca Corrado" +__email__ = "gianluca.corrado@unitn.it" +__status__ = "Production" + + +class RBPVectorizer(): + """Compute the RBP features.""" + + def __init__(self, fasta): + """ + Constructor. + + Parameters + ---------- + fasta : str + Fasta file containing the RBP sequences to predict. + """ + self.fasta = fasta + + self._mod_fold = "AURA_Human_data/RBP_features/mod" + self._reference_fisher_scores = \ + "AURA_Human_data/RBP_features/fisher_scores_ref" + self._train_rbps_file = \ + "AURA_Human_data/RBP_features/rbps_in_train.txt" + + self._temp_fold = "temp_" + str(uuid.uuid4()) + self.pfam_scan = "%s/pfam_scan.txt" % self._temp_fold + self._dom_fold = "%s/domains" % self._temp_fold + self._seeds_fold = "%s/seeds" % self._temp_fold + self._fisher_fold = "%s/fisher_scores" % self._temp_fold + + def _pfam_scan(self): + """Scan the sequences against the Pfam database.""" + nf = open(self.pfam_scan, "w") + nf.write(pfam_utils.search_header()) + + fasta = fasta_utils.import_fasta(self.fasta) + + for rbp in sorted(fasta.keys()): + seq = fasta[rbp] + text = pfam_utils.sequence_search(rbp, seq) + nf.write(text) + + nf.close() + + def _overlapping_domains(self): + """Compute the set of domains contributing to the similarity.""" + reference_domains = set([dom.replace(".mod", "") for dom in + listdir(self._mod_fold) if + isfile(join(self._mod_fold, dom))]) + + data = pfam_utils.read_pfam_output(self.pfam_scan) + if data is None: + return [] + + prot_domains = set([a.split('.')[0] for a in data["hmm_acc"]]) + dom_list = sorted(list(reference_domains & prot_domains)) + + return dom_list + + def _prepare_domains(self, dom_list): + """Select domain subsequences from the entire protein sequences.""" + def prepare_domains(fasta_dic, dom_list, pfam_scan, out_folder): + out_file_dic = {} + for acc in dom_list: + out_file_dic[acc] = open("%s/%s.fa" % (out_folder, acc), "w") + + f = open(pfam_scan) + f.readline() + for line in f: + split = line.split() + rbp = split[0] + start = int(split[3]) + stop = int(split[4]) + acc = split[5].split('.')[0] + if acc in out_file_dic.keys(): + out_file_dic[acc].write( + ">%s:%i-%i\n%s\n" % (rbp, start, stop, + fasta_dic[rbp][start:stop])) + f.close() + + for acc in dom_list: + out_file_dic[acc].close() + + mkdir(self._dom_fold) + fasta = fasta_utils.import_fasta(self.fasta) + prepare_domains(fasta, dom_list, self.pfam_scan, + self._dom_fold) + + def _compute_fisher_scores(self, dom_list): + """Wrapper for SAM 3.5 get_fisher_scores.""" + def get_fisher_scores(dom_list, mod_fold, dom_fold, fisher_fold): + for acc in dom_list: + _FNULL = open(devnull, 'w') + cmd = "get_fisher_scores run -i %s/%s.mod -db %s/%s.fa" % ( + mod_fold, acc, dom_fold, acc) + fisher = sp.check_output( + cmd, shell=True, stderr=_FNULL) + nf = open("%s/%s.txt" % (fisher_fold, acc), "w") + nf.write(fisher) + nf.close() + + mkdir(self._fisher_fold) + get_fisher_scores(dom_list, self._mod_fold, self._dom_fold, + self._fisher_fold) + + def _ekm(self, dom_list): + """Compute the empirical kernel map from the Fisher scores.""" + def process_seg(e): + """Process segment of a SAM 3.5 get_fisher_scores output file.""" + seg = e.split() + c = seg[0].split(':')[0] + m = map(float, seg[3:]) + return c, m + + def read_sam_file(samfile): + """Read a SAM 3.5 get_fisher_scores output file.""" + f = open(samfile) + data = f.read() + f.close() + + columns = [] + m = [] + split = re.split(">A ", data)[1:] + for e in split: + c, m_ = process_seg(e) + columns.append(c) + m.append(m_) + + m = np.matrix(m) + df = pd.DataFrame(data=m.T, columns=columns) + return df + + def dom_features(fisher_fold, dom_list, names=None): + """Compute the features with respect to a domain type.""" + dfs = [] + for acc in dom_list: + df = read_sam_file("%s/%s.txt" % (fisher_fold, acc)) + df = df.groupby(df.columns, axis=1).mean() + dfs.append(df) + + con = pd.concat(dfs, ignore_index=True) + + if names is not None: + add = sorted(list(set(names) - set(con.columns))) + fil = sorted(list(set(names) - set(add))) + con = con[fil] + for c in add: + con[c] = np.zeros(len(con.index), dtype='float64') + con = con[names] + + con = con.fillna(0.0) + return con + + f = open(self._train_rbps_file) + train_rbps = f.read().strip().split('\n') + f.close() + ref = dom_features(self._reference_fisher_scores, dom_list, + names=train_rbps) + ekm_ref = ref.T.dot(ref) + ekm_ref.index = ekm_ref.columns + + sel = dom_features(self._fisher_fold, dom_list) + + ekm_sel = sel.T.dot(sel) + ekm_sel.index = ekm_sel.columns + + ekm = ref.T.dot(sel) + + for rs in ekm.columns: + for rr in ekm.index: + if ekm_ref[rr][rr] > 0 and ekm_sel[rs][rs] > 0: + ekm[rs][rr] /= np.sqrt(ekm_ref[rr][rr] * ekm_sel[rs][rs]) + return ekm + + def vectorize(self): + """Produce the RBP features.""" + # create a temporary folder + mkdir(self._temp_fold) + # scan the RBP sequences against Pfam + self._pfam_scan() + # determine the accession numbers of the pfam domains needed for + # computing the features + dom_list = self._overlapping_domains() + if len(dom_list) == 0: + rmtree(self._temp_fold) + return None + # prepare fasta file with the sequence of the domains + self._prepare_domains(dom_list) + # compute fisher scores using SAM 3.5 + self._compute_fisher_scores(dom_list) + # compute the empirical kernel map + ekm = self._ekm(dom_list) + # remove the temporary folder + rmtree(self._temp_fold) + return ekm