Mercurial > repos > rnateam > infer_rnaformer

--- a/infer_rnaformer.xml	Thu Jul 11 20:56:23 2024 +0000
+++ b/infer_rnaformer.xml	Fri Mar 07 15:22:24 2025 +0000
@@ -1,19 +1,15 @@
-<tool id="infer_rnaformer" name="@EXECUTABLE@" version="@TOOL_VERSION@" profile="22.05">
+<tool id="infer_rnaformer" name="@EXECUTABLE@" version="@TOOL_VERSION@+galaxy1" profile="22.05">
     <description>Predict the secondary structure of an RNA with RNAformer</description>
     <macros>
         <import>macros.xml</import>
     </macros>
     <expand macro="requirements">
         <requirement type="package" version="1.83">biopython</requirement>
+        <requirement type="package" version="3.7.2">matplotlib</requirement>
+        <requirement type="package" version="0.13.2">seaborn</requirement>
+        <requirement type="package" version="2.32.3">requests</requirement>
     </expand>
-    <command detect_errors="exit_code"><![CDATA[
-    mkdir -p './model' &&
-    wget -O './model/RNAformer_32M_state_dict_intra_family_finetuned.pth' 'https://ml.informatik.uni-freiburg.de/research-artifacts/RNAformer/models/RNAformer_32M_state_dict_intra_family_finetuned.pth'
-    &&
-    wget -O './model/RNAformer_32M_config_intra_family_finetuned.yml' 'https://ml.informatik.uni-freiburg.de/research-artifacts/RNAformer/models/RNAformer_32M_config_intra_family_finetuned.yml'
-    &&
-    python '$script_file' > '$output'
-]]></command>
+    <command detect_errors="exit_code"><![CDATA[python '$script_file' > '$output']]></command>
 <configfiles>
         <configfile name="script_file"><![CDATA[import RNAformer
 import os
@@ -32,19 +28,55 @@
 import logging
 import sys

+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import seaborn as sns
+import shutil
+import datetime
+import requests
+
 def is_valid_rna_sequence(sequence):
     """Check if the sequence contains only RNA bases."""
     valid_bases = {'A', 'C', 'G', 'U', 'N'}  # Include 'N' if unknown bases are allowed
     return all(base in valid_bases for base in sequence.upper())

-config_file_path = 'model/RNAformer_32M_config_intra_family_finetuned.yml'
-model_file_path = 'model/RNAformer_32M_state_dict_intra_family_finetuned.pth'
+def download_file(url, destination):  # Stream the resource at url into the local file destination; HTTP and network failures raise requests exceptions.
+    with requests.get(url, stream=True, timeout=(10, 60)) as response:  # (connect, read) limits so a stalled server cannot hang the job; the context manager releases the connection
+        response.raise_for_status()  # fail on 4xx/5xx instead of saving an error page as the downloaded file
+        with open(destination, 'wb') as file:
+            for chunk in response.iter_content(chunk_size=8192):  # write piecewise so the whole body is never held in memory
+                file.write(chunk)
+
+model_url = "https://ml.informatik.uni-freiburg.de/research-artifacts/RNAformer/models/RNAformer_32M_state_dict_intra_family_finetuned.pth"
+config_url = "https://ml.informatik.uni-freiburg.de/research-artifacts/RNAformer/models/RNAformer_32M_config_intra_family_finetuned.yml"
+# Local destinations used when the files have to be downloaded.
+model_file_path = "./model/RNAformer_32M_state_dict_intra_family_finetuned.pth"
+config_file_path = "./model/RNAformer_32M_config_intra_family_finetuned.yml"
+
+os.makedirs("./model", exist_ok=True)
+# Values filled in from the optional Model and Model configuration inputs of the tool form.
+model = '$model'
+model_config = '$model_config'
+# Use the supplied model file when it exists on disk; otherwise download it from model_url.
+if model and os.path.exists(model):
+    model_file_path = model
+else:
+    download_file(model_url, model_file_path)
+# Same choice for the model configuration file.
+if model_config and os.path.exists(model_config):
+    config_file_path = model_config
+else:
+    download_file(config_url, config_file_path)

 config = Config(config_file=config_file_path)
 config.RNAformer.cycling = 6
 model = RiboFormer(config.RNAformer)
 state_dict_file = model_file_path

+matrix_color_type = '$matrix_color_type'
+matrix_out_type = '$matrix_out_type'
+
 #if str($input_type.input_type) == 'True'
 fasta_path = '$input_type.fasta_input'
 sequences = [str(record.seq) for record in SeqIO.parse(fasta_path, 'fasta')]
@@ -112,7 +144,22 @@
 model.eval()
 predicted_structures = []

+orig_seq = ""
+# The job name is used as a directory name and as the archive root, so keep only characters that cannot form a path.
+job_name = "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in '$job_name'.strip())
+# Fall back to a timestamped name when the field was left empty.
+if not job_name:
+    job_name = f"RNAformer_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
+
+count = 0  # index of the sequence currently being processed
+os.makedirs(f'./{job_name}', exist_ok=True)
+
+total_output_buffer = []  # per-sequence reports, later joined into the job-level output file
+
 for sequence in sequences:
+    output_buffer = []
+    seq_dir = f'./{job_name}/{job_name}_{count}'
+    os.makedirs(seq_dir, exist_ok=True)
     with torch.no_grad():
         device = "cpu"

@@ -124,6 +171,8 @@
         length = len(sequence)
         src_seq = torch.LongTensor(list(map(seq_stoi.get, sequence)))

+        orig_seq = sequence
+
         sample = {}
         sample['src_seq'] = src_seq.clone()
         sample['length'] = torch.LongTensor([length])[0]
@@ -141,10 +190,12 @@
         pos1_id = pos_id[0].cpu().tolist()
         pos2_id = pos_id[1].cpu().tolist()
         predicted_structure = f"Pairing index 1: {pos1_id} \nPairing index 2: {pos2_id}"
-        print(predicted_structure)
+        pairs = [[a, b] for a, b in zip(pos1_id, pos2_id)]

         seqlen = len(sample['src_seq'])
         dot_bracket =['.'] * seqlen
+        pk_count = 0
+        pk_list = []
         for i in range(len(pos1_id)):
             open_index = pos1_id[i]
             close_index = pos2_id[i]
@@ -153,9 +204,62 @@
                 if dot_bracket[open_index] == '.' and dot_bracket[close_index] == '.':
                     dot_bracket[open_index] = '('
                     dot_bracket[close_index] = ')'
+                else:
+                    ## pseudoknots or multiplets present in structure- cannot represent with dot-bracket
+                    pk_count += 1
+                    pk_list.append(pairs[i])
         dot_bracket_str_pred = ''.join(dot_bracket)

+        output_buffer.append(f"Job name: {job_name}_{count}\n")
+        output_buffer.append(f"Sequence: {orig_seq}\n")
+        output_buffer.append(f"Length: {len(orig_seq)}\n")
+        output_buffer.append(f"Base pairs: {str(pairs)}\n")
+        output_buffer.append(f"Predicted Structure: {dot_bracket_str_pred}\n")

+        if pk_count > 0:
+            output_buffer.append(f"NOTE: {pk_count} pseudoknots and/or multiplets present in predicted structure excluded from dot-bracket notation: {pk_list}\n")
+
+        # Draw the pairing matrix as a heatmap whose axes are labelled with the sequence bases.
+        # The figure is saved into this sequence's own output directory (seq_dir).
+        plt.figure(figsize=(12, 10))
+
+        if matrix_color_type == 'color':
+            raw_pred_mat = torch.sigmoid(logits[0, :, :, -1])  # map logits into the 0..1 range used by vmin/vmax
+            sns.heatmap(raw_pred_mat, cmap="inferno", vmin=0.0, vmax=1.0,
+                        xticklabels=list(orig_seq),
+                        yticklabels=list(orig_seq)
+            )
+            plt.title("RNAformer Base-pair Probability Matrix")
+            plt.xticks(rotation=90)
+            plt.tight_layout()
+            plt.savefig(f'{seq_dir}/RNAformer_structure_adjacency_matrix_color_{count}.{matrix_out_type}', dpi=150)
+            plt.close()
+        else:
+            sns.heatmap(pred_mat, cmap="gray", vmin=0.0, vmax=1.0,
+                        xticklabels=list(orig_seq),
+                        yticklabels=list(orig_seq)
+            )
+            plt.title("RNAformer Base-pair Binary Probability Matrix")
+            plt.xticks(rotation=90)
+            plt.tight_layout()
+            plt.savefig(f'{seq_dir}/RNAformer_structure_adjacency_matrix_binary_{count}.{matrix_out_type}', dpi=150)
+            plt.close()
+
+        full_text = "".join(output_buffer)
+        with open(f"{seq_dir}/RNAformer_output.txt", "w", encoding="utf-8") as txt_file:
+            txt_file.write(full_text)
+            txt_file.write("\n")
+        # Queue this report for the combined job-level file that is written after the loop.
+        total_output_buffer.append(full_text)
+        total_output_buffer.append("\n")
+
+    count += 1  # advance the index used in seq_dir and in the heatmap file name
+
+# Write the combined report once, after all sequences, instead of rewriting it on every iteration.
+full_job_output = "".join(total_output_buffer)
+with open("./RNAformer_job_output.txt", "w", encoding="utf-8") as txt_file:
+    txt_file.write(full_job_output)
+shutil.make_archive('output', "zip", job_name)  # pack the job directory into output.zip

 ]]></configfile>
     </configfiles>
@@ -178,35 +282,63 @@
                 <param format="fasta" name="fasta_input" type="data" label="Sequence to fold (FASTA file)"/>
             </when>
         </conditional>
+            <param name="job_name" type="text" label="Job name" value="" help="Please edit job name for output files. Default will be RNAformer_{date}_{time}."/>
+            <param name="model" type="data" format="binary" value="None" optional="true" label="Model" help="Optional: select a previously uploaded RNAformer model file to skip downloading it during the run. The file can be fetched from: https://ml.informatik.uni-freiburg.de/research-artifacts/RNAformer/models/RNAformer_32M_state_dict_intra_family_finetuned.pth"/>
+            <param name="model_config" type="data" format="yml" value="None" optional="true" label="Model configuration" help="Optional: select a previously uploaded RNAformer model configuration file to skip downloading it during the run. The file can be fetched from: https://ml.informatik.uni-freiburg.de/research-artifacts/RNAformer/models/RNAformer_32M_config_intra_family_finetuned.yml"/>
+            <param name="matrix_color_type" type="select" label="Coloring of base probability matrix">
+                <option value="color">Color: base pair probabilities are shown and colored</option>
+                <option value="binary">Binary: probabilities are binarized to show only final predicted structure without probabilities</option>
+            </param>
+            <param name="matrix_out_type" type="select" label="Base probability adjacency matrix heatmap format">
+                <option value="pdf">PDF</option>
+                <option value="eps">EPS</option>
+                <option value="png">PNG</option>
+                <option value="svg">SVG</option>
+            </param>
     </inputs>
     <outputs>
-        <data name="output" format="txt" label="output"/>
+        <data name="output" format="txt" label="RNAformer Base Pair Predictions" from_work_dir="RNAformer_job_output.txt"/>
+        <data name="output_files" format="zip" label="RNAformer Predicted Structures Output" from_work_dir="output.zip"/>
     </outputs>
     <tests>
+        <!-- Test 1: Single sequence as text input, color matrix PDF -->
         <test>
             <param name="input_type" value="False"/>
             <param name="rna_input_string" value="GCCCGCAUGGUGAAAUCGGUAAACACAUCGCACUAAUGCGCCGCCUCUGGCUUGCCGGUUCAAGUCCGGCUGCGGGCACCA"/>
-            <output name="output" file="rna_2d_pred_text.txt"/>
-        </test>
-        <test>
-            <param name="input_type" value="True"/>
-            <param name="fasta_input" value="fasta_input1.fa"/>
-            <output name="output" file="rna_2d_pred_FASTA.txt"/>
+            <param name="job_name" value="RNAformer_Prediction_Test_1"/>
+            <param name="matrix_out_type" value="pdf"/>
+            <param name="matrix_color_type" value="color"/>
+            <output name="output" file="RNAformer_job_output.txt"/>
+            <output name="output_files">
+                <assert_contents>
+                    <has_size min="1" />
+                </assert_contents>
+            </output>
         </test>
     </tests>
     <help><![CDATA[
     **RNAformer**
-        This tool reads RNA sequences and predicts their secondary structure using RNAformer.
-
+        The tool reads RNA sequences and predicts their secondary structure using RNAformer. Unlike conventional methods, RNAformer can predict all possible secondary structure motifs, including pseudoknots and multiplets. These motifs cannot currently be represented in dot-bracket notation, so in such cases the dot-bracket output is partial: the excluded base pairs are listed in the output file below the (partial) dot-bracket structure, while the full structure is still shown in the adjacency matrix heatmap. Tip: to speed up inference, you can fetch the model and model configuration files once (select "Upload Data" -> "Paste/Fetch Data" and paste the following URLs) and then select them as the optional "Model" and "Model configuration" inputs: `https://ml.informatik.uni-freiburg.de/research-artifacts/RNAformer/models/RNAformer_32M_state_dict_intra_family_finetuned.pth https://ml.informatik.uni-freiburg.de/research-artifacts/RNAformer/models/RNAformer_32M_config_intra_family_finetuned.yml`
+
     **Input format**

     RNAformer requires one or more RNA sequences either as a single FASTA file or as plain text.

     **Outputs**

-    - Predicted secondary structure as a text file in the following formats:
-        - base pair positions
-        - dot-bracket notation
+    - Predicted secondary structures as a text file for all sequences provided containing the following:
+        - Job name (with index based on order of input sequences in either plain text or FASTA file)
+        - RNA input sequence
+        - Length of input sequence
+        - Base pairs of predicted secondary structure
+        - Predicted secondary structure in dot-bracket notation (excluding pseudoknots and multiplets)
+        - Optional: pseudoknots and/or multiplets present in predicted structure excluded from dot-bracket notation
+    - A zip file containing:
+        - Sub-directory for each input sequence with the name <job_name>_<index> containing:
+            - Predicted secondary structure text file (same as above)
+            - Heatmap of base pair probability matrix
+
+
     ]]></help>
     <expand macro="citations" />
 </tool>
\ No newline at end of file
--- a/macros.xml	Thu Jul 11 20:56:23 2024 +0000
+++ b/macros.xml	Fri Mar 07 15:22:24 2025 +0000
@@ -1,6 +1,6 @@
 <macros>
     <token name="@EXECUTABLE@">RNAformer</token>
-    <token name="@TOOL_VERSION@">1.0.0</token>
+    <token name="@TOOL_VERSION@">1.2.0</token>
     <token name="@profile@">22.05</token>
     <xml name="requirements">
         <requirements>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/RNAformer_job_output.txt	Fri Mar 07 15:22:24 2025 +0000
@@ -0,0 +1,7 @@
+Job name: RNAformer_Prediction_Test_1_0
+Sequence: GCCCGCAUGGUGAAAUCGGUAAACACAUCGCACUAAUGCGCCGCCUCUGGCUUGCCGGUUCAAGUCCGGCUGCGGGCACCA
+Length: 81
+Base pairs: [[0, 76], [1, 75], [2, 74], [3, 73], [4, 72], [5, 71], [6, 70], [7, 13], [7, 21], [8, 23], [9, 25], [10, 24], [11, 23], [12, 22], [13, 7], [17, 59], [18, 60], [21, 7], [21, 13], [22, 12], [23, 8], [23, 11], [24, 10], [25, 9], [39, 30], [40, 29], [42, 50], [43, 49], [44, 48], [48, 44], [49, 43], [50, 42], [53, 69], [54, 68], [55, 67], [56, 66], [57, 65], [58, 62], [59, 17], [60, 18], [62, 58], [65, 57], [66, 56], [67, 55], [68, 54], [69, 53], [70, 6], [71, 5], [72, 4], [73, 3], [74, 2], [75, 1], [76, 0]]
+Predicted Structure: (((((((((((.()...((...))))...))........((.(((...)))..(((((()).)..))))))))))))....
+NOTE: 28 pseudoknots and/or multiplets present in predicted structure excluded from dot-bracket notation: [[7, 21], [11, 23], [13, 7], [21, 7], [21, 13], [22, 12], [23, 8], [23, 11], [24, 10], [25, 9], [48, 44], [49, 43], [50, 42], [59, 17], [60, 18], [62, 58], [65, 57], [66, 56], [67, 55], [68, 54], [69, 53], [70, 6], [71, 5], [72, 4], [73, 3], [74, 2], [75, 1], [76, 0]]
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fasta_input_false1.fa	Fri Mar 07 15:22:24 2025 +0000
@@ -0,0 +1,4 @@
+>Anolis_caro_chrUn_GL343590.trna2_AlaAGC (218800-218872)  Ala (AGC) 73 bp  Sc: 49.55
+TGGGAATTAGCTCAAATGGAAGAGCGCTCGCTTAGCATGTGAGAGGTAGTGGGATCGATGCCCACATTCTCCA
+>Anolis_caro_chrUn_GL343207.trna3_AlaAGC (1513626-1513698)  Ala (AGC) 73 bp  Sc: 56.15
+GGGAATTAGCTCAAATGGAAGAGCGCTCGCTTAGCATGCGAGAGGTAGCGGGATTGATGCCCGCATTCTCCA
\ No newline at end of file
Binary file test-data/output.zip has changed
--- a/test-data/rna_2d_pred_FASTA.txt	Thu Jul 11 20:56:23 2024 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,4 +0,0 @@
-Pairing index 1: [0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 8, 9, 9, 10, 11, 12, 13, 13, 17, 18, 20, 21, 21, 22, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 48, 49, 50, 51, 52, 53, 54, 55, 57, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71]
-Pairing index 2: [71, 70, 69, 68, 67, 66, 65, 13, 14, 20, 47, 22, 24, 44, 23, 22, 21, 7, 20, 54, 55, 7, 12, 45, 8, 11, 10, 9, 43, 42, 41, 40, 39, 38, 37, 36, 32, 31, 30, 29, 28, 27, 26, 25, 9, 21, 64, 63, 62, 61, 60, 57, 17, 18, 53, 52, 51, 50, 49, 48, 6, 5, 4, 3, 2, 1, 0]
-Pairing index 1: [0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 8, 9, 9, 10, 11, 12, 13, 13, 14, 20, 20, 21, 21, 22, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 48, 49, 50, 51, 52, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71]
-Pairing index 2: [71, 70, 69, 68, 67, 66, 65, 13, 14, 20, 22, 24, 44, 23, 22, 21, 7, 20, 7, 7, 13, 12, 45, 8, 11, 10, 9, 43, 42, 41, 40, 39, 38, 37, 32, 31, 30, 29, 28, 27, 26, 25, 9, 21, 64, 63, 62, 61, 60, 52, 51, 50, 49, 48, 6, 5, 4, 3, 2, 1, 0]
--- a/test-data/rna_2d_pred_text.txt	Thu Jul 11 20:56:23 2024 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-Pairing index 1: [0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 10, 11, 12, 13, 17, 18, 21, 21, 22, 23, 23, 24, 25, 39, 40, 42, 43, 44, 48, 49, 50, 53, 54, 55, 56, 57, 58, 59, 60, 62, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76]
-Pairing index 2: [76, 75, 74, 73, 72, 71, 70, 13, 21, 23, 25, 24, 23, 22, 7, 59, 60, 7, 13, 12, 8, 11, 10, 9, 30, 29, 50, 49, 48, 44, 43, 42, 69, 68, 67, 66, 65, 62, 17, 18, 58, 57, 56, 55, 54, 53, 6, 5, 4, 3, 2, 1, 0]