comparison pal_finder_wrapper.sh @ 8:4e625d3672ba draft

Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
author pjbriggs
date Wed, 16 May 2018 07:39:16 -0400
parents 5e133b7b79a6
children
comparison
equal deleted inserted replaced
7:5e133b7b79a6 8:4e625d3672ba
24 # --primer-min-tm VALUE: minimum acceptable melting temperature (Celsius) for a primer oligo 24 # --primer-min-tm VALUE: minimum acceptable melting temperature (Celsius) for a primer oligo
25 # --primer-max-tm VALUE: maximum acceptable melting temperature (Celsius) 25 # --primer-max-tm VALUE: maximum acceptable melting temperature (Celsius)
26 # --primer-opt-tm VALUE: optimum melting temperature (Celsius) 26 # --primer-opt-tm VALUE: optimum melting temperature (Celsius)
27 # --primer-pair-max-diff-tm VALUE: max difference between melting temps of left & right primers 27 # --primer-pair-max-diff-tm VALUE: max difference between melting temps of left & right primers
28 # --output_config_file FNAME: write a copy of the config.txt file to FNAME 28 # --output_config_file FNAME: write a copy of the config.txt file to FNAME
29 # --filter_microsats FNAME: write output of filter options FNAME 29 # --bad_primer_ranges FNAME: write a list of the read IDs generating bad primer ranges to FNAME
30 # --filter_microsats FNAME: write output of filter options to FNAME
30 # -assembly FNAME: run the 'assembly' filter option and write to FNAME 31 # -assembly FNAME: run the 'assembly' filter option and write to FNAME
31 # -primers: run the 'primers' filter option 32 # -primers: run the 'primers' filter option
32 # -occurrences: run the 'occurrences' filter option 33 # -occurrences: run the 'occurrences' filter option
33 # -rankmotifs: run the 'rankmotifs' filter option 34 # -rankmotifs: run the 'rankmotifs' filter option
35 # --subset N: use a subset of reads of size N
34 # 36 #
35 # pal_finder is available from http://sourceforge.net/projects/palfinder/ 37 # pal_finder is available from http://sourceforge.net/projects/palfinder/
36 # 38 #
37 # primer3 is available from http://primer3.sourceforge.net/releases.php 39 # primer3 is available from http://primer3.sourceforge.net/releases.php
38 # (nb needs version 2.0.0-alpha) 40 # (nb needs version 2.0.0-alpha)
51 echo $* 53 echo $*
52 # 54 #
53 # Maximum size reporting log file contents 55 # Maximum size reporting log file contents
54 MAX_LINES=500 56 MAX_LINES=500
55 # 57 #
58 # Get helper functions
59 . $(dirname $0)/pal_finder_wrapper_utils.sh
60 #
56 # Initialise locations of scripts, data and executables 61 # Initialise locations of scripts, data and executables
57 # 62 #
58 # Set these in the environment to overide at execution time 63 # Set these in the environment to overide at execution time
59 : ${PALFINDER_SCRIPT_DIR:=/usr/bin} 64 : ${PALFINDER_SCRIPT_DIR:=/usr/bin}
60 : ${PALFINDER_DATA_DIR:=/usr/share/pal_finder_v0.02.04} 65 : ${PALFINDER_DATA_DIR:=/usr/share/pal_finder_v0.02.04}
61 : ${PRIMER3_CORE_EXE:=primer3_core} 66 : ${PRIMER3_CORE_EXE:=primer3_core}
62 # 67 #
63 # Filter script is in the same directory as this script 68 # Filter script is in the same directory as this script
64 PALFINDER_FILTER=$(dirname $0)/pal_filter.py 69 PALFINDER_FILTER=$(dirname $0)/pal_filter.py
65 if [ ! -f $PALFINDER_FILTER ] ; then 70 if [ ! -f $PALFINDER_FILTER ] ; then
66 echo No $PALFINDER_FILTER script >&2 71 fatal No $PALFINDER_FILTER script
67 exit 1
68 fi 72 fi
69 # 73 #
70 # Check that we have all the components 74 # Check that we have all the components
71 function have_program() {
72 local program=$1
73 local got_program=$(which $program 2>&1 | grep "no $(basename $program) in")
74 if [ -z "$got_program" ] ; then
75 echo yes
76 else
77 echo no
78 fi
79 }
80 if [ "$(have_program $PRIMER3_CORE_EXE)" == "no" ] ; then 75 if [ "$(have_program $PRIMER3_CORE_EXE)" == "no" ] ; then
81 echo "ERROR primer3_core missing: ${PRIMER3_CORE_EXE} not found" >&2 76 fatal "primer3_core missing: ${PRIMER3_CORE_EXE} not found"
82 exit 1
83 fi 77 fi
84 if [ ! -f "${PALFINDER_DATA_DIR}/config.txt" ] ; then 78 if [ ! -f "${PALFINDER_DATA_DIR}/config.txt" ] ; then
85 echo "ERROR pal_finder config.txt not found in ${PALFINDER_DATA_DIR}" >&2 79 fatal "pal_finder config.txt not found in ${PALFINDER_DATA_DIR}"
86 exit 1
87 fi 80 fi
88 if [ ! -f "${PALFINDER_SCRIPT_DIR}/pal_finder_v0.02.04.pl" ] ; then 81 if [ ! -f "${PALFINDER_SCRIPT_DIR}/pal_finder_v0.02.04.pl" ] ; then
89 echo "ERROR pal_finder_v0.02.04.pl not found in ${PALFINDER_SCRIPT_DIR}" >&2 82 fatal "pal_finder_v0.02.04.pl not found in ${PALFINDER_SCRIPT_DIR}"
90 exit 1
91 fi 83 fi
92 # 84 #
93 # Initialise parameters used in the config.txt file 85 # Initialise parameters used in the config.txt file
94 PRIMER_PREFIX="test" 86 PRIMER_PREFIX="test"
95 MIN_2_MER_REPS=6 87 MIN_2_MER_REPS=6
111 PRIMER_PAIR_MAX_DIFF_TM= 103 PRIMER_PAIR_MAX_DIFF_TM=
112 OUTPUT_CONFIG_FILE= 104 OUTPUT_CONFIG_FILE=
113 OUTPUT_ASSEMBLY= 105 OUTPUT_ASSEMBLY=
114 FILTERED_MICROSATS= 106 FILTERED_MICROSATS=
115 FILTER_OPTIONS= 107 FILTER_OPTIONS=
108 SUBSET=
109 RANDOM_SEED=568765
116 # 110 #
117 # Collect command line arguments 111 # Collect command line arguments
118 if [ $# -lt 2 ] ; then 112 if [ $# -lt 2 ] ; then
119 echo "Usage: $0 FASTQ_R1 FASTQ_R2 MICROSAT_SUMMARY PAL_SUMMARY [OPTIONS]" 113 echo "Usage: $0 FASTQ_R1 FASTQ_R2 MICROSAT_SUMMARY PAL_SUMMARY [OPTIONS]"
120 echo " $0 --454 FASTA MICROSAT_SUMMARY PAL_SUMMARY [OPTIONS]" 114 echo " $0 --454 FASTA MICROSAT_SUMMARY PAL_SUMMARY [OPTIONS]"
121 exits 115 fatal "Bad command line"
122 fi 116 fi
123 if [ "$1" == "--454" ] ; then 117 if [ "$1" == "--454" ] ; then
124 PLATFORM="454" 118 PLATFORM="454"
125 FNA=$2 119 FNA=$2
126 else 120 else
210 ;; 204 ;;
211 --output_config_file) 205 --output_config_file)
212 shift 206 shift
213 OUTPUT_CONFIG_FILE=$1 207 OUTPUT_CONFIG_FILE=$1
214 ;; 208 ;;
209 --bad_primer_ranges)
210 shift
211 BAD_PRIMER_RANGES=$1
212 ;;
215 --filter_microsats) 213 --filter_microsats)
216 shift 214 shift
217 FILTERED_MICROSATS=$1 215 FILTERED_MICROSATS=$1
218 ;; 216 ;;
219 -primers|-occurrences|-rankmotifs) 217 -primers|-occurrences|-rankmotifs)
221 ;; 219 ;;
222 -assembly) 220 -assembly)
223 FILTER_OPTIONS="$FILTER_OPTIONS $1" 221 FILTER_OPTIONS="$FILTER_OPTIONS $1"
224 shift 222 shift
225 OUTPUT_ASSEMBLY=$1 223 OUTPUT_ASSEMBLY=$1
224 ;;
225 --subset)
226 shift
227 SUBSET=$1
226 ;; 228 ;;
227 *) 229 *)
228 echo Unknown option: $1 >&2 230 echo Unknown option: $1 >&2
229 exit 1 231 exit 1
230 ;; 232 ;;
233 done 235 done
234 # 236 #
235 # Check that primer3_core is available 237 # Check that primer3_core is available
236 got_primer3=`which $PRIMER3_CORE_EXE 2>&1 | grep -v "no primer3_core in"` 238 got_primer3=`which $PRIMER3_CORE_EXE 2>&1 | grep -v "no primer3_core in"`
237 if [ -z "$got_primer3" ] ; then 239 if [ -z "$got_primer3" ] ; then
238 echo ERROR primer3_core not found >&2 240 fatal "primer3_core not found"
239 exit 1 241 fi
240 fi 242 #
241 # 243 # Check the n-mers specification
244 if [ $MIN_6_MER_REPS -ne 0 ] ; then
245 if [ $MIN_5_MER_REPS -eq 0 ] ; then
246 fatal "Minimum number of 5-mers cannot be zero if number of 6-mers is non-zero"
247 fi
248 fi
249 if [ $MIN_5_MER_REPS -ne 0 ] ; then
250 if [ $MIN_4_MER_REPS -eq 0 ] ; then
251 fatal "Minimum number of 4-mers cannot be zero if number of 5-mers is non-zero"
252 fi
253 fi
254 if [ $MIN_4_MER_REPS -ne 0 ] ; then
255 if [ $MIN_3_MER_REPS -eq 0 ] ; then
256 fatal "Minimum number of 3-mers cannot be zero if number of 4-mers is non-zero"
257 fi
258 fi
259 if [ $MIN_2_MER_REPS -eq 0 ] ; then
260 fatal "Minimum number of 2-mer repeats cannot be zero"
261 fi
242 # Set up the working dir 262 # Set up the working dir
243 if [ "$PLATFORM" == "Illumina" ] ; then 263 if [ "$PLATFORM" == "Illumina" ] ; then
244 # Paired end Illumina data as input 264 # Paired end Illumina data as input
245 if [ $FASTQ_R1 == $FASTQ_R2 ] ; then 265 if [ $FASTQ_R1 == $FASTQ_R2 ] ; then
246 echo ERROR R1 and R2 fastqs are the same file >&2 266 fatal ERROR R1 and R2 fastqs are the same file
247 exit 1
248 fi 267 fi
249 ln -s $FASTQ_R1 268 ln -s $FASTQ_R1
250 ln -s $FASTQ_R2 269 ln -s $FASTQ_R2
251 fastq_r1=$(basename $FASTQ_R1) 270 fastq_r1=$(basename $FASTQ_R1)
252 fastq_r2=$(basename $FASTQ_R2) 271 fastq_r2=$(basename $FASTQ_R2)
257 fi 276 fi
258 ln -s $PRIMER_MISPRIMING_LIBRARY 277 ln -s $PRIMER_MISPRIMING_LIBRARY
259 PRIMER_MISPRIMING_LIBRARY=$(basename $PRIMER_MISPRIMING_LIBRARY) 278 PRIMER_MISPRIMING_LIBRARY=$(basename $PRIMER_MISPRIMING_LIBRARY)
260 mkdir Output 279 mkdir Output
261 # 280 #
281 # Use a subset of reads
282 if [ ! -z "$SUBSET" ] ; then
283 echo "### Extracting subset of reads ###"
284 $(dirname $0)/fastq_subset.py -n $SUBSET -s $RANDOM_SEED $fastq_r1 $fastq_r2
285 fastq_r1="subset_r1.fq"
286 fastq_r2="subset_r2.fq"
287 fi
288 #
262 # Copy in the default config.txt file 289 # Copy in the default config.txt file
263 echo "### Creating config.txt file for pal_finder run ###" 290 echo "### Creating config.txt file for pal_finder run ###"
264 /bin/cp $PALFINDER_DATA_DIR/config.txt . 291 /bin/cp $PALFINDER_DATA_DIR/config.txt .
265 # 292 #
266 # Update the config.txt file with new values 293 # Update the config.txt file with new values
267 function set_config_value() {
268 local key=$1
269 local value=$2
270 local config_txt=$3
271 if [ -z "$value" ] ; then
272 echo "No value for $key, left as default"
273 else
274 echo Setting "$key" to "$value"
275 sed -i 's,^'"$key"' .*,'"$key"' '"$value"',' $config_txt
276 fi
277 }
278 # Input files 294 # Input files
279 set_config_value platform $PLATFORM config.txt 295 set_config_value platform $PLATFORM config.txt
280 if [ "$PLATFORM" == "Illumina" ] ; then 296 if [ "$PLATFORM" == "Illumina" ] ; then
281 set_config_value inputFormat fastq config.txt 297 set_config_value inputFormat fastq config.txt
282 set_config_value pairedEnd 1 config.txt 298 set_config_value pairedEnd 1 config.txt
297 set_config_value 5merMinReps $MIN_5_MER_REPS config.txt 313 set_config_value 5merMinReps $MIN_5_MER_REPS config.txt
298 set_config_value 6merMinReps $MIN_6_MER_REPS config.txt 314 set_config_value 6merMinReps $MIN_6_MER_REPS config.txt
299 # Primer3 settings 315 # Primer3 settings
300 set_config_value primer3input Output/pr3in.txt config.txt 316 set_config_value primer3input Output/pr3in.txt config.txt
301 set_config_value primer3output Output/pr3out.txt config.txt 317 set_config_value primer3output Output/pr3out.txt config.txt
318 set_config_value keepPrimer3files 1 config.txt
302 set_config_value primer3executable $PRIMER3_CORE_EXE config.txt 319 set_config_value primer3executable $PRIMER3_CORE_EXE config.txt
303 set_config_value prNamePrefix ${PRIMER_PREFIX}_ config.txt 320 set_config_value prNamePrefix ${PRIMER_PREFIX}_ config.txt
304 set_config_value PRIMER_MISPRIMING_LIBRARY "$PRIMER_MISPRIMING_LIBRARY" config.txt 321 set_config_value PRIMER_MISPRIMING_LIBRARY "$PRIMER_MISPRIMING_LIBRARY" config.txt
305 set_config_value PRIMER_OPT_SIZE "$PRIMER_OPT_SIZE" config.txt 322 set_config_value PRIMER_OPT_SIZE "$PRIMER_OPT_SIZE" config.txt
306 set_config_value PRIMER_MIN_SIZE "$PRIMER_MIN_SIZE" config.txt 323 set_config_value PRIMER_MIN_SIZE "$PRIMER_MIN_SIZE" config.txt
325 echo WARNING output too long, truncated to last $MAX_LINES lines: 342 echo WARNING output too long, truncated to last $MAX_LINES lines:
326 echo ... 343 echo ...
327 fi 344 fi
328 tail -$MAX_LINES pal_finder.log 345 tail -$MAX_LINES pal_finder.log
329 # 346 #
330 # Check that log ends with "Done!!" message 347 # Check for success/failure
331 if [ -z "$(tail -n 1 pal_finder.log | grep Done!!)" ] ; then 348 if [ ! -z "$(tail -n 1 pal_finder.log | grep 'No microsatellites found in any reads. Ending script.')" ] ; then
332 echo ERROR pal_finder failed to complete successfully >&2 349 # No microsatellites found
350 fatal ERROR pal_finder failed to locate any microsatellites
333 exit 1 351 exit 1
352 elif [ -z "$(tail -n 1 pal_finder.log | grep Done!!)" ] ; then
353 # Log doesn't end with "Done!!" (indicates failure)
354 fatal ERROR pal_finder failed to complete successfully
355 fi
356 echo "### pal_finder finished ###"
357 #
358 # Check for errors in pal_finder output
359 echo "### Checking for errors ###"
360 if [ ! -z "$(grep 'primer3_core: Illegal element in PRIMER_PRODUCT_SIZE_RANGE' pal_finder.log)" ] ; then
361 echo WARNING primer3 terminated prematurely due to bad product size ranges
362 $(find_bad_primer_ranges Output/pr3in.txt bad_primer_ranges.txt)
363 N_BAD_PRIMERS=$(cat bad_primer_ranges.txt | wc -l)
364 if [ -z "$BAD_PRIMER_RANGES" ] ; then
365 # No output file so report to stderr
366 cat <<EOF
367
368 Pal_finder generated bad ranges for the following read IDs:
369
370 EOF
371 cat bad_primer_ranges.txt
372 cat <<EOF
373
374 This error can occur when input data contains short R1 reads and has
375 has not been properly trimmed and filtered.
376
377 EOF
378 else
379 # Move the bad ranges to the specified file
380 echo "### Writing read IDs with bad primer ranges ###"
381 /bin/mv bad_primer_ranges.txt "$BAD_PRIMER_RANGES"
382 fi
383 else
384 N_BAD_PRIMERS=0
334 fi 385 fi
335 # 386 #
336 # Sort microsat_summary output 387 # Sort microsat_summary output
337 echo "### Sorting microsat summary output ###" 388 echo "### Sorting microsat summary output ###"
338 head -n 7 Output/microsat_summary.txt | sort >microsat_summary.sorted 389 head -n 7 Output/microsat_summary.txt | sort >microsat_summary.sorted
390 echo "readsWithBadRanges:"$'\t'"$((N_BAD_PRIMERS * 2))" >>microsat_summary.sorted
339 grep "^$" Output/microsat_summary.txt>>microsat_summary.sorted 391 grep "^$" Output/microsat_summary.txt>>microsat_summary.sorted
340 grep "^Microsat Type" Output/microsat_summary.txt >>microsat_summary.sorted 392 grep "^Microsat Type" Output/microsat_summary.txt >>microsat_summary.sorted
341 tail -n +11 Output/microsat_summary.txt >>microsat_summary.sorted 393 tail -n +11 Output/microsat_summary.txt | sort -r -n -k 5 >>microsat_summary.sorted
342 mv microsat_summary.sorted Output/microsat_summary.txt 394 mv microsat_summary.sorted Output/microsat_summary.txt
343 # 395 #
344 # Sort PAL_summary output 396 # Sort PAL_summary output
345 echo "### Sorting PAL summary output ###" 397 echo "### Sorting PAL summary output ###"
346 head -1 Output/PAL_summary.txt > Output/PAL_summary.sorted.txt 398 head -1 Output/PAL_summary.txt > Output/PAL_summary.sorted.txt
360 echo WARNING output too long, truncated to last $MAX_LINES lines: 412 echo WARNING output too long, truncated to last $MAX_LINES lines:
361 echo ... 413 echo ...
362 fi 414 fi
363 tail -$MAX_LINES pal_filter.log 415 tail -$MAX_LINES pal_filter.log
364 if [ $? -ne 0 ] ; then 416 if [ $? -ne 0 ] ; then
365 echo ERROR $PALFINDER_FILTER exited with non-zero status >&2 417 fatal $PALFINDER_FILTER exited with non-zero status
366 exit 1
367 elif [ ! -f PAL_summary.filtered ] ; then 418 elif [ ! -f PAL_summary.filtered ] ; then
368 echo ERROR no output from $PALFINDER_FILTER >&2 419 fatal no output from $PALFINDER_FILTER
369 exit 1
370 fi 420 fi
371 fi 421 fi
372 # 422 #
373 # Clean up 423 # Clean up
374 echo "### Handling output files ###" 424 echo "### Handling output files ###"
384 if [ ! -z "$OUTPUT_ASSEMBLY" ] ; then 434 if [ ! -z "$OUTPUT_ASSEMBLY" ] ; then
385 assembly=${fastq_r1%.*}_pal_filter_assembly_output.txt 435 assembly=${fastq_r1%.*}_pal_filter_assembly_output.txt
386 if [ -f "$assembly" ] ; then 436 if [ -f "$assembly" ] ; then
387 /bin/mv $assembly "$OUTPUT_ASSEMBLY" 437 /bin/mv $assembly "$OUTPUT_ASSEMBLY"
388 else 438 else
389 echo ERROR no assembly output found >&2 439 fatal no assembly output found
390 exit 1
391 fi 440 fi
392 fi 441 fi
393 if [ ! -z "$OUTPUT_CONFIG_FILE" ] && [ -f config.txt ] ; then 442 if [ ! -z "$OUTPUT_CONFIG_FILE" ] && [ -f config.txt ] ; then
394 /bin/mv config.txt $OUTPUT_CONFIG_FILE 443 /bin/mv config.txt $OUTPUT_CONFIG_FILE
395 fi 444 fi