Mercurial > repos > fubar > blasttools_search_test
view blasttoolssearch/blasttoolssearch.xml @ 1:5687b8f1ad69 draft
fix datatype for summary
author | fubar |
---|---|
date | Wed, 19 Jul 2023 05:48:53 +0000 |
parents | ee581a90a85e |
children | e213ae40f480 |
line wrap: on
line source
<tool name="blasttoolssearch" id="blasttoolssearch" version="3.0">
    <!--Source in git at: https://github.com/fubar2/galaxy-->
    <!--Created by toolfactory@galaxy.org at 19/07/2023 12:39:19 using the Galaxy Tool Factory.-->
    <!--
        Wraps BlastTools.jar (shipped in the tool directory) behind a small bash
        driver script. The driver is written to a configfile ($runme) and is called
        with three positional arguments: the blastN tabular input, the path to the
        jar, and the output dataset path.
    -->
    <description>Runs a legacy Java jar called blasttools from https://github.com/schmidda/blast-tools/tree/master</description>
    <requirements>
        <requirement type="package">csvtk</requirement>
        <requirement type="package">openjdk</requirement>
    </requirements>
    <stdio>
        <exit_code range="1:" level="fatal"/>
    </stdio>
    <version_command><![CDATA[echo "3.0"]]></version_command>
    <!-- Every path is single-quoted so whitespace or shell metacharacters in a path cannot split or alter the command line. -->
    <command><![CDATA[bash '$runme' '$blastn_search_outputs' '$__tool_directory__/BlastTools.jar' '$summary_viruses_viroids']]></command>
    <configfiles>
        <configfile name="runme"><![CDATA[#raw
## eResearch Office, QUT
## Created:  31 March 2021
## Last modified: 28 September 2022
## Script: Processes Galaxy Australia generated blastN outputs to summarise and report hits to REGULATED and ENDEMIC viruses/viroids.
## Usage: ./run_VirReport_Summary.sh
## changed to accept a single input file name passed as $1
## Ross Lazarus for a ToolFactory wrapper for Roberto Barrero
## July 18 2023
##
## Positional arguments (as supplied by the Galaxy command line):
##   $1  blastN tabular file to summarise
##   $2  path to BlastTools.jar
##   $3  path of the summary report to write
## All intermediate files are written to the current working directory.

file="$1"
jar="$2"
report="$3"
var=$(basename "$file")

#STEP0: fetch Top 1 Hits (the first line seen for each distinct query id in column 1)
awk 'NF { print $1 }' "$file" | sort | uniq > "${var}.top1.ids"
# Start from an empty file so later steps still find it when there are no ids.
: > "${var}.top1Hits.txt"
while IFS= read -r i
do
    [ -n "$i" ] || continue
    echo "fetching top hits..." "$i"
    # Exact string comparison on column 1: a regex/substring match would let
    # "contig_1" also select lines belonging to "contig_10". The id is passed
    # through the environment so awk does not interpret backslashes in it, and
    # the "" concatenation forces a string (not numeric) comparison.
    QUERY_ID="$i" awk '($1 "") == ENVIRON["QUERY_ID"] { print; exit }' "$file" >> "${var}.top1Hits.txt"
done < "${var}.top1.ids"

#STEP1: modify the columns of Galaxy Australia (GA) blast output to the expected format by the BlastTools.jar tool
###### namely: qseqid sgi sacc length pident mismatch gapopen qstart qend qlen sstart send slen sstrand evalue bitscore qcovhsp stitle staxids qseq sseq sseqid qcovs qframe sframe
csvtk cut -H -t -f 1,19,20,4,3,5,6,7,8,17,9,10,18,22,11,12,24,21,25,15,16,2,23,13,14 < "${var}.top1Hits.txt" | sed 's/ /_/g' > "${var}.txt"

#STEP2: summarise the GA blastN files
java -jar "$jar" -t blastn "${var}.txt"

#filter virus/viroid/endo
grep "virus\|viroid\|endo" "summary_${var}.txt" > "summary_${var}_filtered.txt"

#STEP3: fetch unique names from Blast summary reports
awk '{print $7}' "summary_${var}_filtered.txt" | awk -F "|" '{print $2}' | sort | uniq | sed 's/Species://' > "${var}_uniq.ids"

#STEP4: retrieve the best hit for each virus/viroid
echo "processing top hits ..."
touch "${var}_filtered.txt"
while IFS= read -r id
do
    [ -n "$id" ] || continue
    #fetch the virus name on the summary_blastn file by selecting the longest alignment (column 3) and highest genome coverage (column 5)
    # -F: the name is matched as a literal string, so characters such as "." or "[" in a name are not treated as a regex.
    grep -F -- "$id" "summary_${var}.txt" | sort -k3,3nr -k5,5nr | head -1 >> "${var}_filtered.txt"
done < "${var}_uniq.ids"

#print the header of the initial summary_blastn file
head -1 "summary_${var}.txt" > header

#report 1: a "#" line, then the summary header, then the best hit per virus/viroid
echo "#" > "$report"
cat header "${var}_filtered.txt" >> "$report"
#end raw]]></configfile>
    </configfiles>
    <inputs>
        <param name="blastn_search_outputs" type="data" optional="false" label="blastn_search_outputs" help="" format="tabular" multiple="false"/>
    </inputs>
    <outputs>
        <data name="summary_viruses_viroids" format="tabular" label="summary_viruses_viroids" hidden="false"/>
    </outputs>
    <tests>
        <test>
            <output name="summary_viruses_viroids" value="summary_viruses_viroids_sample" compare="diff" lines_diff="0"/>
            <param name="blastn_search_outputs" value="blastn_search_outputs_sample"/>
        </test>
    </tests>
    <help><![CDATA[

**What it Does**

Wraps https://github.com/schmidda/blast-tools/tree/master as a Galaxy tool as a demonstration for Roberto Barrero

The tool takes one blastN tabular file, keeps the first hit listed for each query,
reorders the columns into the layout BlastTools.jar expects, runs the jar to produce a
summary, and reports the best summary line (longest alignment, then highest genome
coverage) for each virus/viroid name found.

------

Script::

    ## eResearch Office, QUT
    ## Created:  31 March 2021
    ## Last modified: 28 September 2022
    ## Script: Processes Galaxy Australia generated blastN outputs to summarise and report hits to REGULATED and ENDEMIC viruses/viroids.
    ## Usage: ./run_VirReport_Summary.sh
    ## changed to accept a single input file name passed as $1
    ## Ross Lazarus for a ToolFactory wrapper for Roberto Barrero
    ## July 18 2023
    ##
    ## Positional arguments (as supplied by the Galaxy command line):
    ##   $1  blastN tabular file to summarise
    ##   $2  path to BlastTools.jar
    ##   $3  path of the summary report to write
    ## All intermediate files are written to the current working directory.

    file="$1"
    jar="$2"
    report="$3"
    var=$(basename "$file")

    #STEP0: fetch Top 1 Hits (the first line seen for each distinct query id in column 1)
    awk 'NF { print $1 }' "$file" | sort | uniq > "${var}.top1.ids"
    : > "${var}.top1Hits.txt"
    while IFS= read -r i
    do
        [ -n "$i" ] || continue
        echo "fetching top hits..." "$i"
        QUERY_ID="$i" awk '($1 "") == ENVIRON["QUERY_ID"] { print; exit }' "$file" >> "${var}.top1Hits.txt"
    done < "${var}.top1.ids"

    #STEP1: modify the columns of Galaxy Australia (GA) blast output to the expected format by the BlastTools.jar tool
    ###### namely: qseqid sgi sacc length pident mismatch gapopen qstart qend qlen sstart send slen sstrand evalue bitscore qcovhsp stitle staxids qseq sseq sseqid qcovs qframe sframe
    csvtk cut -H -t -f 1,19,20,4,3,5,6,7,8,17,9,10,18,22,11,12,24,21,25,15,16,2,23,13,14 < "${var}.top1Hits.txt" | sed 's/ /_/g' > "${var}.txt"

    #STEP2: summarise the GA blastN files
    java -jar "$jar" -t blastn "${var}.txt"

    #filter virus/viroid/endo
    grep "virus\|viroid\|endo" "summary_${var}.txt" > "summary_${var}_filtered.txt"

    #STEP3: fetch unique names from Blast summary reports
    awk '{print $7}' "summary_${var}_filtered.txt" | awk -F "|" '{print $2}' | sort | uniq | sed 's/Species://' > "${var}_uniq.ids"

    #STEP4: retrieve the best hit for each virus/viroid
    echo "processing top hits ..."
    touch "${var}_filtered.txt"
    while IFS= read -r id
    do
        [ -n "$id" ] || continue
        #fetch the virus name on the summary_blastn file by selecting the longest alignment (column 3) and highest genome coverage (column 5)
        grep -F -- "$id" "summary_${var}.txt" | sort -k3,3nr -k5,5nr | head -1 >> "${var}_filtered.txt"
    done < "${var}_uniq.ids"

    #print the header of the initial summary_blastn file
    head -1 "summary_${var}.txt" > header

    #report 1: a "#" line, then the summary header, then the best hit per virus/viroid
    echo "#" > "$report"
    cat header "${var}_filtered.txt" >> "$report"

]]></help>
    <citations>
        <citation type="doi">10.1093/bioinformatics/bts573</citation>
    </citations>
</tool>