view pal_finder_wrapper.sh @ 8:4e625d3672ba draft

Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
author pjbriggs
date Wed, 16 May 2018 07:39:16 -0400
parents 5e133b7b79a6
children
line wrap: on
line source

#!/bin/sh
#
# pal_finder_wrapper.sh: run pal_finder perl script as a Galaxy tool
#
# Usage: run_palfinder.sh FASTQ_R1 FASTQ_R2 MICROSAT_SUMMARY PAL_SUMMARY FILTERED_MICROSATS [OPTIONS]
#        run_palfinder.sh --454    FASTA    MICROSAT_SUMMARY PAL_SUMMARY [OPTIONS]
#
# Options:
#
# --primer-prefix PREFIX: prefix added to the beginning of all primer names (prPrefixName)
# --2merMinReps N: miniumum number of 2-mer repeat units to detect (0=ignore units of this size)
# --3merMinReps N
# --4merMinReps N
# --5merMinReps N
# --6merMinReps N
# --primer-mispriming-library FASTA: specify a Fasta file with sequences to avoid amplifying
# --primer-opt-size VALUE: optimum primer length
# --primer-min-size VALUE: minimum acceptable primer length
# --primer-max-size VALUE: maximum acceptable primer length
# --primer-min-gc VALUE: minimum allowable percentage of Gs and Cs in any primer
# --primer-max-gc VALUE: maximum allowable percentage of Gs and Cs
# --primer-gc-clamp VALUE: number of consecutive Gs and Cs at 3' end of both left and right primer
# --primer-max-end-gc VALUE: max number of Gs or Cs in last five 3' bases of left or right primer
# --primer-min-tm VALUE: minimum acceptable melting temperature (Celsius) for a primer oligo
# --primer-max-tm VALUE: maximum acceptable melting temperature (Celsius)
# --primer-opt-tm VALUE: optimum melting temperature (Celsius)
# --primer-pair-max-diff-tm VALUE: max difference between melting temps of left & right primers
# --output_config_file FNAME: write a copy of the config.txt file to FNAME
# --bad_primer_ranges FNAME: write a list of the read IDs generating bad primer ranges to FNAME
# --filter_microsats FNAME: write output of filter options to FNAME
# -assembly FNAME: run the 'assembly' filter option and write to FNAME
# -primers: run the 'primers' filter option
# -occurrences: run the 'occurrences' filter option
# -rankmotifs: run the 'rankmotifs' filter option
# --subset N: use a subset of reads of size N
#
# pal_finder is available from http://sourceforge.net/projects/palfinder/
#
# primer3 is available from http://primer3.sourceforge.net/releases.php
# (nb needs version 2.0.0-alpha)
#
# Explicitly set the locations of the pal_finder script, data files and the primer3
# executable by setting the following variables in the environment:
#
#  * PALFINDER_SCRIPT_DIR: location of the pal_finder Perl script (defaults to
#    /usr/bin)
#  * PALFINDER_DATA_DIR: location of the pal_finder data files (specifically
#    config.txt and simple.ref; defaults to /usr/share/pal_finder_v0.02.04)
#  * PRIMER3_CORE_EXE: name of the primer3_core program, which should include the
#    full path if it's not on the Galaxy user's PATH (defaults to primer3_core)
#
echo "### $(basename $0) ###"
echo $*
#
# Maximum size reporting log file contents
MAX_LINES=500
#
# Get helper functions
. $(dirname $0)/pal_finder_wrapper_utils.sh
#
# Initialise locations of scripts, data and executables
#
# Set these in the environment to overide at execution time
: ${PALFINDER_SCRIPT_DIR:=/usr/bin}
: ${PALFINDER_DATA_DIR:=/usr/share/pal_finder_v0.02.04}
: ${PRIMER3_CORE_EXE:=primer3_core}
#
# Filter script is in the same directory as this script
PALFINDER_FILTER=$(dirname $0)/pal_filter.py
if [ ! -f $PALFINDER_FILTER ] ; then
    fatal No $PALFINDER_FILTER script
fi
#
# Check that we have all the components
if [ "$(have_program $PRIMER3_CORE_EXE)" == "no" ] ; then
    fatal "primer3_core missing: ${PRIMER3_CORE_EXE} not found"
fi
if [ ! -f "${PALFINDER_DATA_DIR}/config.txt" ] ; then
    fatal "pal_finder config.txt not found in ${PALFINDER_DATA_DIR}"
fi
if [ ! -f "${PALFINDER_SCRIPT_DIR}/pal_finder_v0.02.04.pl" ] ; then
    fatal "pal_finder_v0.02.04.pl not found in ${PALFINDER_SCRIPT_DIR}"
fi
#
# Initialise parameters used in the config.txt file
PRIMER_PREFIX="test"
MIN_2_MER_REPS=6
MIN_3_MER_REPS=0
MIN_4_MER_REPS=0
MIN_5_MER_REPS=0
MIN_6_MER_REPS=0
PRIMER_MISPRIMING_LIBRARY=$PALFINDER_DATA_DIR/simple.ref
PRIMER_OPT_SIZE=
PRIMER_MAX_SIZE=
PRIMER_MIN_SIZE=
PRIMER_MAX_GC=
PRIMER_MIN_GC=
PRIMER_GC_CLAMP=
PRIMER_MAX_END_GC=
PRIMER_OPT_TM=
PRIMER_MAX_TM=
PRIMER_MIN_TM=
PRIMER_PAIR_MAX_DIFF_TM=
OUTPUT_CONFIG_FILE=
OUTPUT_ASSEMBLY=
FILTERED_MICROSATS=
FILTER_OPTIONS=
SUBSET=
RANDOM_SEED=568765
#
# Collect command line arguments
if [ $# -lt 2 ] ; then
  echo "Usage: $0 FASTQ_R1 FASTQ_R2 MICROSAT_SUMMARY PAL_SUMMARY [OPTIONS]"
  echo "       $0 --454    FASTA    MICROSAT_SUMMARY PAL_SUMMARY [OPTIONS]"
  fatal "Bad command line"
fi
if [ "$1" == "--454" ] ; then
    PLATFORM="454"
    FNA=$2
else
    PLATFORM="Illumina"
    FASTQ_R1=$1
    FASTQ_R2=$2
fi
MICROSAT_SUMMARY=$3
PAL_SUMMARY=$4
shift; shift; shift; shift
#
# Collect command line options
while [ ! -z "$1" ] ; do
    case "$1" in
	--primer-prefix)
	    shift
	    # Convert all non-alphanumeric characters to underscores in prefix
	    PRIMER_PREFIX=$(echo -n $1 | tr -s -c "[:alnum:]" "_")
	    ;;
	--2merMinReps)
	    shift
	    MIN_2_MER_REPS=$1
	    ;;
	--3merMinReps)
	    shift
	    MIN_3_MER_REPS=$1
	    ;;
	--4merMinReps)
	    shift
	    MIN_4_MER_REPS=$1
	    ;;
	--5merMinReps)
	    shift
	    MIN_5_MER_REPS=$1
	    ;;
	--6merMinReps)
	    shift
	    MIN_6_MER_REPS=$1
	    ;;
	--primer-mispriming-library)
	    shift
	    PRIMER_MISPRIMING_LIBRARY=$1
	    ;;
	--primer-opt-size)
	    shift
	    PRIMER_OPT_SIZE=$1
	    ;;
	--primer-max-size)
	    shift
	    PRIMER_MAX_SIZE=$1
	    ;;
	--primer-min-size)
	    shift
	    PRIMER_MIN_SIZE=$1
	    ;;
	--primer-max-gc)
	    shift
	    PRIMER_MAX_GC=$1
	    ;;
	--primer-min-gc)
	    shift
	    PRIMER_MIN_GC=$1
	    ;;
	--primer-gc-clamp)
	    shift
	    PRIMER_GC_CLAMP=$1
	    ;;
	--primer-max-end-gc)
	    shift
	    PRIMER_MAX_END_GC=$1
	    ;;
	--primer-opt-tm)
	    shift
	    PRIMER_OPT_TM=$1
	    ;;
	--primer-max-tm)
	    shift
	    PRIMER_MAX_TM=$1
	    ;;
	--primer-min-tm)
	    shift
	    PRIMER_MIN_TM=$1
	    ;;
	--primer-pair-max-diff-tm)
	    shift
	    PRIMER_PAIR_MAX_DIFF_TM=$1
	    ;;
	--output_config_file)
	    shift
	    OUTPUT_CONFIG_FILE=$1
	    ;;
	--bad_primer_ranges)
	    shift
	    BAD_PRIMER_RANGES=$1
	    ;;
	--filter_microsats)
	    shift
	    FILTERED_MICROSATS=$1
	    ;;
	-primers|-occurrences|-rankmotifs)
	    FILTER_OPTIONS="$FILTER_OPTIONS $1"
	    ;;
	-assembly)
	    FILTER_OPTIONS="$FILTER_OPTIONS $1"
	    shift
	    OUTPUT_ASSEMBLY=$1
	    ;;
	--subset)
	    shift
	    SUBSET=$1
	    ;;
	*)
	    echo Unknown option: $1 >&2
	    exit 1
	    ;;
    esac
    shift
done
#
# Check that primer3_core is available
got_primer3=`which $PRIMER3_CORE_EXE 2>&1 | grep -v "no primer3_core in"`
if [ -z "$got_primer3" ] ; then
  fatal "primer3_core not found"
fi
#
# Check the n-mers specification
if [ $MIN_6_MER_REPS -ne 0 ] ; then
    if [ $MIN_5_MER_REPS -eq 0 ] ; then
	fatal "Minimum number of 5-mers cannot be zero if number of 6-mers is non-zero"
    fi
fi
if [ $MIN_5_MER_REPS -ne 0 ] ; then
    if [ $MIN_4_MER_REPS -eq 0 ] ; then
	fatal "Minimum number of 4-mers cannot be zero if number of 5-mers is non-zero"
    fi
fi
if [ $MIN_4_MER_REPS -ne 0 ] ; then
    if [ $MIN_3_MER_REPS -eq 0 ] ; then
	fatal "Minimum number of 3-mers cannot be zero if number of 4-mers is non-zero"
    fi
fi
if [ $MIN_2_MER_REPS -eq 0 ] ; then
    fatal "Minimum number of 2-mer repeats cannot be zero"
fi
# Set up the working dir
if [ "$PLATFORM" == "Illumina" ] ; then
    # Paired end Illumina data as input
    if [ $FASTQ_R1 == $FASTQ_R2 ] ; then
	fatal ERROR R1 and R2 fastqs are the same file
    fi
    ln -s $FASTQ_R1
    ln -s $FASTQ_R2
    fastq_r1=$(basename $FASTQ_R1)
    fastq_r2=$(basename $FASTQ_R2)
else
    # 454 data as input
    ln -s $FNA
    fna=$(basename $FNA)
fi
ln -s $PRIMER_MISPRIMING_LIBRARY
PRIMER_MISPRIMING_LIBRARY=$(basename $PRIMER_MISPRIMING_LIBRARY)
mkdir Output
#
# Use a subset of reads
if [ ! -z "$SUBSET" ] ; then
    echo "### Extracting subset of reads ###"
    $(dirname $0)/fastq_subset.py -n $SUBSET -s $RANDOM_SEED $fastq_r1 $fastq_r2
    fastq_r1="subset_r1.fq"
    fastq_r2="subset_r2.fq"
fi
#
# Copy in the default config.txt file
echo "### Creating config.txt file for pal_finder run ###"
/bin/cp $PALFINDER_DATA_DIR/config.txt .
#
# Update the config.txt file with new values
# Input files
set_config_value platform $PLATFORM config.txt
if [ "$PLATFORM" == "Illumina" ] ; then
    set_config_value inputFormat fastq config.txt
    set_config_value pairedEnd 1 config.txt
    set_config_value inputReadFile $fastq_r1 config.txt
    set_config_value pairedReadFile $fastq_r2 config.txt
else
    set_config_value inputFormat fasta config.txt
    set_config_value pairedEnd 0 config.txt
    set_config_value input454reads $fna config.txt
fi
# Output files
set_config_value MicrosatSumOut Output/microsat_summary.txt config.txt
set_config_value PALsummaryOut Output/PAL_summary.txt config.txt
# Microsat info
set_config_value 2merMinReps $MIN_2_MER_REPS config.txt
set_config_value 3merMinReps $MIN_3_MER_REPS config.txt
set_config_value 4merMinReps $MIN_4_MER_REPS config.txt
set_config_value 5merMinReps $MIN_5_MER_REPS config.txt
set_config_value 6merMinReps $MIN_6_MER_REPS config.txt
# Primer3 settings
set_config_value primer3input Output/pr3in.txt config.txt
set_config_value primer3output Output/pr3out.txt config.txt
set_config_value keepPrimer3files 1 config.txt
set_config_value primer3executable $PRIMER3_CORE_EXE config.txt
set_config_value prNamePrefix ${PRIMER_PREFIX}_ config.txt
set_config_value PRIMER_MISPRIMING_LIBRARY "$PRIMER_MISPRIMING_LIBRARY" config.txt
set_config_value PRIMER_OPT_SIZE "$PRIMER_OPT_SIZE" config.txt
set_config_value PRIMER_MIN_SIZE "$PRIMER_MIN_SIZE" config.txt
set_config_value PRIMER_MAX_SIZE "$PRIMER_MAX_SIZE" config.txt
set_config_value PRIMER_MIN_GC "$PRIMER_MIN_GC" config.txt
set_config_value PRIMER_MAX_GC "$PRIMER_MAX_GC" config.txt
set_config_value PRIMER_GC_CLAMP "$PRIMER_GC_CLAMP" config.txt
set_config_value PRIMER_MAX_END_GC "$PRIMER_MAX_END_GC" config.txt
set_config_value PRIMER_MIN_TM "$PRIMER_MIN_TM" config.txt
set_config_value PRIMER_MAX_TM "$PRIMER_MAX_TM" config.txt
set_config_value PRIMER_OPT_TM "$PRIMER_OPT_TM" config.txt
set_config_value PRIMER_PAIR_MAX_DIFF_TM "$PRIMER_PAIR_MAX_DIFF_TM" config.txt
#
# Run pal_finder
echo "### Running pal_finder ###"
perl $PALFINDER_SCRIPT_DIR/pal_finder_v0.02.04.pl config.txt 1>pal_finder.log 2>&1
echo "### pal_finder finished ###"
#
# Handlers the pal_finder log file
echo "### Output from pal_finder ###"
if [ $(wc -l pal_finder.log | cut -d" " -f1) -gt $MAX_LINES ] ; then
    echo WARNING output too long, truncated to last $MAX_LINES lines:
    echo ...
fi
tail -$MAX_LINES pal_finder.log
#
# Check for success/failure
if [ ! -z "$(tail -n 1 pal_finder.log | grep 'No microsatellites found in any reads. Ending script.')" ] ; then
    # No microsatellites found
    fatal ERROR pal_finder failed to locate any microsatellites
    exit 1
elif [ -z "$(tail -n 1 pal_finder.log | grep Done!!)" ] ; then
    # Log doesn't end with "Done!!" (indicates failure)
    fatal ERROR pal_finder failed to complete successfully
fi
echo "### pal_finder finished ###"
#
# Check for errors in pal_finder output
echo "### Checking for errors ###"
if [ ! -z "$(grep 'primer3_core: Illegal element in PRIMER_PRODUCT_SIZE_RANGE' pal_finder.log)" ] ; then
    echo WARNING primer3 terminated prematurely due to bad product size ranges
    $(find_bad_primer_ranges Output/pr3in.txt bad_primer_ranges.txt)
    N_BAD_PRIMERS=$(cat bad_primer_ranges.txt | wc -l)
    if [ -z "$BAD_PRIMER_RANGES" ] ; then
	# No output file so report to stderr
	cat <<EOF

Pal_finder generated bad ranges for the following read IDs:

EOF
	cat bad_primer_ranges.txt
	cat <<EOF

This error can occur when input data contains short R1 reads and has
has not been properly trimmed and filtered.

EOF
    else
	# Move the bad ranges to the specified file
	echo "### Writing read IDs with bad primer ranges ###"
	/bin/mv bad_primer_ranges.txt "$BAD_PRIMER_RANGES"
    fi
else
    N_BAD_PRIMERS=0
fi
#
# Sort microsat_summary output
echo "### Sorting microsat summary output ###"
head -n 7 Output/microsat_summary.txt | sort >microsat_summary.sorted
echo "readsWithBadRanges:"$'\t'"$((N_BAD_PRIMERS * 2))" >>microsat_summary.sorted
grep "^$" Output/microsat_summary.txt>>microsat_summary.sorted
grep "^Microsat Type" Output/microsat_summary.txt >>microsat_summary.sorted
tail -n +11 Output/microsat_summary.txt | sort -r -n -k 5 >>microsat_summary.sorted
mv microsat_summary.sorted Output/microsat_summary.txt
#
# Sort PAL_summary output
echo "### Sorting PAL summary output ###"
head -1 Output/PAL_summary.txt > Output/PAL_summary.sorted.txt
if [ "$PLATFORM" == "Illumina" ] ; then
    grep -v "^readPairID" Output/PAL_summary.txt | sort -k 1 >> Output/PAL_summary.sorted.txt
else
    grep -v "^SequenceID" Output/PAL_summary.txt | sort -k 1 >> Output/PAL_summary.sorted.txt
fi
mv Output/PAL_summary.sorted.txt Output/PAL_summary.txt
#
# Run the filtering & assembly script
if [ ! -z "$FILTERED_MICROSATS" ] || [ ! -z "$OUTPUT_ASSEMBLY" ] ; then
    echo "### Running filtering & assembly script ###"
    python $PALFINDER_FILTER -i $fastq_r1 -j $fastq_r2 -p Output/PAL_summary.txt $FILTER_OPTIONS 1>pal_filter.log 2>&1
    echo "### Output from pal_filter ###"
    if [ $(wc -l pal_filter.log | cut -d" " -f1) -gt $MAX_LINES ] ; then
	echo WARNING output too long, truncated to last $MAX_LINES lines:
	echo ...
    fi
    tail -$MAX_LINES pal_filter.log
    if [ $? -ne 0 ] ; then
	fatal $PALFINDER_FILTER exited with non-zero status
    elif [ ! -f PAL_summary.filtered ] ; then
	fatal no output from $PALFINDER_FILTER
    fi
fi
#
# Clean up
echo "### Handling output files ###"
if [ -f Output/microsat_summary.txt ] ; then
    /bin/mv Output/microsat_summary.txt $MICROSAT_SUMMARY
fi
if [ -f Output/PAL_summary.txt ] ; then
    /bin/mv Output/PAL_summary.txt $PAL_SUMMARY
fi
if [ ! -z "$FILTERED_MICROSATS" ] && [ -f PAL_summary.filtered ] ; then
    /bin/mv PAL_summary.filtered $FILTERED_MICROSATS
fi
if [ ! -z "$OUTPUT_ASSEMBLY" ] ; then
    assembly=${fastq_r1%.*}_pal_filter_assembly_output.txt
    if [ -f "$assembly" ] ; then
	/bin/mv $assembly "$OUTPUT_ASSEMBLY"
    else
	fatal no assembly output found
    fi
fi
if [ ! -z "$OUTPUT_CONFIG_FILE" ] && [ -f config.txt ] ; then
    /bin/mv config.txt $OUTPUT_CONFIG_FILE
fi
#
echo "### Pal_finder tool completed ###"
##
#