# HG changeset patch # User fubar # Date 1689749126 0 # Node ID 2051ee2bedc49ac86b1309d287fcbd4b0f4c2807 # Parent a6593725b7280ac51e79663e537d342e3ab1afed Uploaded diff -r a6593725b728 -r 2051ee2bedc4 blasttoolssearch/blasttoolssearch.xml --- a/blasttoolssearch/blasttoolssearch.xml Wed Jul 19 06:06:07 2023 +0000 +++ b/blasttoolssearch/blasttoolssearch.xml Wed Jul 19 06:45:26 2023 +0000 @@ -98,59 +98,6 @@ ------- - - -Script:: - - ## eResearch Office, QUT - ## Created: 31 March 2021 - ## Last modified: 28 September 2022 - ## Script: Processes Galaxy Australia generated blastN outputs to summarise and report hits to REGULATED and ENDEMIC viruses/viroids. - ## Usage: ./run_VirReport_Summary.sh - ## changed to accept a single input file name passed as $1 - ## Ross Lazarus for a ToolFactory wrapper for Robert Barrero - ## July 18 2023 - dataPath=${PWD} - # Requirement: One or more GA-VSD .tabular outputs need to be in the folder where the command above (Usage)is executed. - # The script will Look for all files with the suffix *.tabular - #Processing tabular files - file=$1 - var=$(basename $file) - #STEP0: fetch Top 1 Hits - cat $file | awk '{print $1}' | sort | uniq > ${var}.top1.ids - for i in `cat ${var}.top1.ids` - do - echo "fetching top hits..." $i; - grep $i $file | head -1 >> ${var}.top1Hits.txt; - done - #STEP1: modify the columns of Galaxy Australia (GA) blast output to the expected format by the BlastTools.jar tool - ###### namely: qseqid sgi sacc length pident mismatch gapopen qstart qend qlen sstart send slen sstrand evalue bitscore qcovhsp stitle staxids qseq sseq sseqid qcovs qframe sframe - cat ${var}.top1Hits.txt |csvtk cut -H -t -f 1,19,20,4,3,5,6,7,8,17,9,10,18,22,11,12,24,21,25,15,16,2,23,13,14 | sed 's/ /_/g' > ${var}.txt - #STEP2: summarise the GA blastN files - #java -jar /mnt/c/Users/lelwala/HTS/BlastTools.jar -t blastn ${var}.txt - java -jar $3 -t blastn ${var}.txt - #filter virus/viroid/endo - cat summary_${var}.txt | grep "virus\|viroid\|endo" > summary_${var}_filtered.txt - #STEP3: fetch unique names from Blast summary reports - cat summary_${var}_filtered.txt | awk '{print $7}' | awk -F "|" '{print $2}'| sort | uniq | sed 's/Species://' > ${var}_uniq.ids - #STEP4: retrieve the best hit for each virus/viroid - echo "processing top hits ..." - touch ${var}_filtered.txt - for id in `cat ${var}_uniq.ids` - do - #print on the screen the name of the virus/viroids to search - #echo "fetching species matches ..." $id - #fetch the virus name on the summary_blastn file by selecting the longest alignment (column 3) and highest genome coverage (column 5) - grep $id summary_${var}.txt | sort -k3,3nr -k5,5nr | head -1 >> ${var}_filtered.txt - done - #print the header of the inital summary_blastn file - cat summary_${var}.txt | head -1 > header - #report 1 - cat header ${var}_filtered.txt > $2 - #removing intermediate files - rm summary_${file}.txt ${file}.txt ${file}.top1.ids ${file}_uniq.ids summary_${file}_filtered.txt header* ${var}_filtered.txt *top1Hits.txt - ]]> 10.1093/bioinformatics/bts573