view blasttoolssearch/blasttoolssearch.xml @ 1:5687b8f1ad69 draft

fix datatype for summary
author fubar
date Wed, 19 Jul 2023 05:48:53 +0000
parents ee581a90a85e
children e213ae40f480
line wrap: on
line source

<tool name="blasttoolssearch" id="blasttoolssearch" version="3.0">
  <!--Source in git at: https://github.com/fubar2/galaxy-->
  <!--Created by toolfactory@galaxy.org at 19/07/2023 12:39:19 using the Galaxy Tool Factory.-->
  <description>Runs a legacy Java jar called blasttools from https://github.com/schmidda/blast-tools/tree/master</description>
  <!-- Runtime software: csvtk re-orders the BLAST columns and openjdk provides the java command that runs BlastTools.jar. -->
  <requirements>
    <requirement type="package">csvtk</requirement>
    <requirement type="package">openjdk</requirement>
  </requirements>
  <!-- Exit codes of 1 or greater are treated as fatal errors. -->
  <stdio>
    <exit_code level="fatal" range="1:"></exit_code>
  </stdio>
  <!-- Prints the fixed string 3.0, which matches the version attribute on the tool element. -->
  <version_command>echo "3.0"</version_command>
  <!-- Runs the runme configfile with bash. Positional arguments: (1) the blastn tabular input,
       (2) the path to BlastTools.jar shipped in the tool directory, (3) the summary output dataset.
       Every substituted path is single-quoted so whitespace or shell metacharacters cannot split it. -->
  <command><![CDATA[bash
'$runme'
'$blastn_search_outputs'
'$__tool_directory__/BlastTools.jar'
'$summary_viruses_viroids'
]]></command>
  <configfiles>
    <configfile name="runme"><![CDATA[#raw

## eResearch Office, QUT
## Created:  31 March 2021
## Last modified: 28 September 2022
## Script: Processes Galaxy Australia generated blastN outputs to summarise and report hits to REGULATED and ENDEMIC viruses/viroids.
## Changed to accept a single input file name passed as $1
## Ross Lazarus for a ToolFactory wrapper for Roberto Barrero
## July 18 2023
##
## Usage: bash runme <blastn.tabular> <BlastTools.jar> <summary_output>
##   $1  tabular blastN output to summarise
##   $2  path to BlastTools.jar
##   $3  file that receives the final summary report
## Intermediate files are written to the current working directory.
## Exits with status 1 when the arguments are wrong or when BlastTools.jar fails.

if [ "$#" -ne 3 ]; then
    echo "Usage: $0 <blastn.tabular> <BlastTools.jar> <summary_output>" >&2
    exit 1
fi

file=$1
jar=$2
report=$3

# Intermediate file names are derived from the input file's base name.
var=$(basename "$file")

#STEP0: fetch Top 1 Hits
# Collect the distinct query ids (column 1) in sorted order, then emit, for each id, the first
# row of the input whose column 1 is exactly that id. An exact column match is used because a
# pattern search over the whole line would also hit rows where the id is only a substring of
# another id or appears in a different column.
echo "fetching top hits..."
awk '{print $1}' "$file" | sort | uniq > "${var}.top1.ids"
awk 'NR == FNR { if (NF && !($1 in first)) first[$1] = $0; next }
     NF { print first[$1] }' "$file" "${var}.top1.ids" > "${var}.top1Hits.txt"

#STEP1: modify the columns of Galaxy Australia (GA) blast output to the expected format by the BlastTools.jar tool
######  namely: qseqid sgi sacc length pident mismatch gapopen qstart qend qlen sstart send slen sstrand evalue bitscore qcovhsp stitle staxids qseq sseq sseqid qcovs qframe sframe
# Spaces are replaced with underscores after the columns have been re-ordered.
csvtk cut -H -t -f 1,19,20,4,3,5,6,7,8,17,9,10,18,22,11,12,24,21,25,15,16,2,23,13,14 < "${var}.top1Hits.txt" | sed 's/ /_/g' > "${var}.txt"

#STEP2: summarise the GA blastN files
# Stop here if the jar fails; otherwise the remaining steps would run on a missing summary
# and the script would still finish with exit status 0 and an empty report.
if ! java -jar "$jar" -t blastn "${var}.txt"; then
    echo "BlastTools.jar failed while summarising ${var}.txt" >&2
    exit 1
fi
#filter virus/viroid/endo
grep "virus\|viroid\|endo" "summary_${var}.txt" > "summary_${var}_filtered.txt"

#STEP3: fetch unique names from Blast summary reports
# Column 7 of the filtered summary is split on "|"; its second part, minus the "Species:" prefix, is the name.
awk '{print $7}' "summary_${var}_filtered.txt" | awk -F "|" '{print $2}' | sort | uniq | sed 's/Species://' > "${var}_uniq.ids"

#STEP4: retrieve the best hit for each virus/viroid
echo "processing top hits ..."
: > "${var}_filtered.txt"
while IFS= read -r id; do
    # Skip blank lines, which occur when column 7 had no second "|"-separated part.
    [ -n "$id" ] || continue
    # Fetch the rows naming this virus/viroid from the full summary and keep the one with the
    # longest alignment (column 3) and highest genome coverage (column 5).
    # -F makes the name a literal string rather than a regular expression.
    grep -F -- "$id" "summary_${var}.txt" | sort -k3,3nr -k5,5nr | head -1 >> "${var}_filtered.txt"
done < "${var}_uniq.ids"

#print the header of the initial summary_blastn file
head -1 "summary_${var}.txt" > header
#report 1: a "#" line, then the summary header, then the best hit per virus/viroid
echo "#" > "$report"
cat header "${var}_filtered.txt" >> "$report"

#end raw]]></configfile>
  </configfiles>
  <!-- One required tabular dataset: the blastn search output to summarise. -->
  <inputs>
    <param name="blastn_search_outputs"
           label="blastn_search_outputs"
           type="data"
           format="tabular"
           multiple="false"
           optional="false"
           help=""/>
  </inputs>
  <!-- One visible tabular dataset holding the summary report. -->
  <outputs>
    <data name="summary_viruses_viroids"
          label="summary_viruses_viroids"
          format="tabular"
          hidden="false"/>
  </outputs>
  <!-- The generated summary must match the stored sample with no differing lines. -->
  <tests>
    <test>
      <param name="blastn_search_outputs" value="blastn_search_outputs_sample"/>
      <output name="summary_viruses_viroids"
              value="summary_viruses_viroids_sample"
              compare="diff"
              lines_diff="0"/>
    </test>
  </tests>
  <help><![CDATA[

**What it Does**

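Wraps the legacy BlastTools Java jar from https://github.com/schmidda/blast-tools/tree/master as a Galaxy tool, as a demonstration for Roberto Barrero. The tool takes one tabular blastn search output, keeps one top hit per query sequence, summarises those hits with BlastTools, and reports the best hit for each virus/viroid name found in that summary.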

 

------


Script::

    ## eResearch Office, QUT
    ## Created:  31 March 2021
    ## Last modified: 28 September 2022
    ## Script: Processes Galaxy Australia generated blastN outputs to summarise and report hits to REGULATED and ENDEMIC viruses/viroids.
    ## Usage: ./run_VirReport_Summary.sh
    ## changed to accept a single input file name passed as $1
    ## Ross Lazarus for a ToolFactory wrapper for Robert Barrero
    ## July 18 2023
    dataPath=${PWD}
    # Requirement: One or more GA-VSD .tabular outputs need to be in the folder where the command above (Usage)is executed.
    # The script will Look for all files with the suffix *.tabular
    #Processing tabular files
    file=$1
        var=$(basename $file)
        #STEP0: fetch Top 1 Hits
        cat $file | awk '{print $1}' | sort | uniq > ${var}.top1.ids
        for i in `cat ${var}.top1.ids`
          do
            echo "fetching top hits..." $i;
            grep $i $file | head -1 >> ${var}.top1Hits.txt;
          done
        #STEP1: modify the columns of Galaxy Australia (GA) blast output to the expected format by the BlastTools.jar tool
        ######  namely: qseqid sgi sacc length pident mismatch gapopen qstart qend qlen sstart send slen sstrand evalue bitscore qcovhsp stitle staxids qseq sseq sseqid qcovs qframe sframe
        cat ${var}.top1Hits.txt |csvtk cut -H -t -f 1,19,20,4,3,5,6,7,8,17,9,10,18,22,11,12,24,21,25,15,16,2,23,13,14 | sed 's/ /_/g' > ${var}.txt
        #STEP2: summarise the GA blastN files
        #java -jar /mnt/c/Users/lelwala/HTS/BlastTools.jar -t blastn ${var}.txt
        java -jar $2 -t blastn ${var}.txt
        #filter virus/viroid/endo
        cat summary_${var}.txt | grep "virus\|viroid\|endo" > summary_${var}_filtered.txt
        #STEP3: fetch unique names from Blast summary reports
        cat summary_${var}_filtered.txt | awk '{print $7}' | awk -F "|" '{print $2}'| sort | uniq | sed 's/Species://' > ${var}_uniq.ids
        #STEP4: retrieve the best hit for each virus/viroid
        echo "processing top hits ..."
        touch ${var}_filtered.txt
        for id in `cat ${var}_uniq.ids`
          do
            #print on the screen the name of the virus/viroids to search
            #echo "fetching species matches ..." $id
            #fetch the virus name on the summary_blastn file by selecting the longest alignment (column 3) and highest genome coverage (column 5)
            grep $id summary_${var}.txt | sort -k3,3nr -k5,5nr | head -1 >> ${var}_filtered.txt
          done
        #print the header of the inital summary_blastn file
        cat summary_${var}.txt | head -1 > header
        #report 1
        echo "#" > $3
        cat header ${var}_filtered.txt >> $3
        #removing intermediate files
        rm summary_${file}.txt ${file}.txt ${file}.top1.ids ${file}_uniq.ids summary_${file}_filtered.txt header* ${var}_filtered.txt *top1Hits.txt

]]></help>
  <citations>
    <!-- NOTE(review): presumably the Galaxy Tool Factory publication, since the header comment says this wrapper was created with the Tool Factory; confirm what the DOI resolves to. -->
    <citation type="doi">10.1093/bioinformatics/bts573</citation>
  </citations>
</tool>