Mercurial > repos > xuebing > sharplabtool
diff tools/human_genome_variation/sift_variants_wrapper.sh @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/human_genome_variation/sift_variants_wrapper.sh Fri Mar 09 19:37:19 2012 -0500 @@ -0,0 +1,184 @@ +#!/usr/bin/env bash + +input_file=$1 +output_file=$2 +org=$3 +db_loc=$4 +chrom_col=$5 +pos_col=$6 +base=$7 +allele_col=$8 +strand_col=$9 +comment_col=${10} +output_opts=${11} + +working_dir=$PWD +sift_input="$working_dir/sift_input.txt" +sift_output="$working_dir/sift_output.txt" + +################################################################################ +## make sure input file column selections are mutually exclusive ## +################################################################################ +ERROR=0 +declare -a col_use + +function check_col () { + local col=$1 + local use=$2 + local int=$3 + + if [ -n "${col//[0-9]}" ]; then + if [ $int -eq 1 ]; then + echo "ERROR: invalid value for $use column: $col" 1>&2 + ERROR=1 + fi + return + fi + + local cur=${col_use[$col]} + if [ -n "$cur" ]; then + echo "ERROR: $use column is the same as $cur column" 1>&2 + col_use[$col]="${cur},$use" + ERROR=1 + else + col_use[$col]=$use + fi +} + +check_col $chrom_col 'chromosome' 1 +check_col $pos_col 'position' 1 +check_col $allele_col 'allele' 1 +check_col $strand_col 'strand' 0 +check_col $comment_col 'comment' 0 + +if [ $ERROR -ne 0 ]; then + exit 1 +fi + +################################################################################ +## get/check the db directory from the argument org,db_loc ## +################################################################################ +db_dir=$( awk '$1 == org { print $2 }' org=$org $db_loc ) + +if [ -z "$db_dir" ]; then + echo "Can't find dbkey \"$org\" in loc file \"$db_loc\"" 1>&2 + exit 1 +fi + +if [ ! -d "$db_dir" ]; then + echo "Can't access SIFT database directory \"$db_dir\"" 1>&2 + exit 1 +fi + +################################################################################ +## create input file for SIFT_exome_nssnvs.pl ## +################################################################################ +if [ ! -r "$input_file" ]; then + echo "Can't read input file \"$input_file\"" 1>&2 + exit 1 +fi + +if [ $base -eq 0 ]; then + beg_col="$pos_col" + end_col="$pos_col + 1" + pos_adj='$2 = $2 - 1' +else + beg_col="$pos_col - 1" + end_col="$pos_col" + pos_adj='' +fi + +strand_cvt='' +if [ \( "$strand_col" = "+" \) ]; then + strand='"1"' +elif [ \( "$strand_col" = "-" \) ]; then + strand='"-1"' +else + strand="\$$strand_col" + strand_cvt='if ('"${strand}"' == "+") {'"${strand}"' = "1"} else if ('"${strand}"' == "-") {'"${strand}"' = "-1"}' +fi + +print_row='print $'"${chrom_col}"', $'"${beg_col}"', $'"${end_col}"', '"${strand}"', $'"${allele_col}"'' +if [ "$comment_col" != "-" ]; then + print_row=''"${print_row}"', $'"${comment_col}"'' +fi + +awk ' +BEGIN {FS="\t";OFS=","} +$'"${chrom_col}"' ~ /^[cC][hH][rR]/ {$'"${chrom_col}"' = substr($'"${chrom_col}"',4)} +{ + '"${strand_cvt}"' + '"${print_row}"' +} +' "$input_file" > "$sift_input" + +################################################################################ +## run SIFT_exome_nssnvs.pl command line program ## +################################################################################ +if [ "$output_opts" = "None" ]; then + output_opts="" +else + output_opts=$( echo "$output_opts" | sed -e 's/,/ 1 -/g' ) + output_opts="-$output_opts 1" +fi + +SIFT_exome_nssnvs.pl -i "$sift_input" -d "$db_dir" -o "$working_dir" $output_opts &> "$sift_output" +if [ $? -ne 0 ]; then + echo "failed: SIFT_exome_nssnvs.pl -i \"$sift_input\" -d \"$db_dir\" -o \"$working_dir\" $output_opts" + exit 1 +fi + +################################################################################ +## locate the SIFT_exome_nssnvs.pl output file ## +################################################################################ +sift_pid=$( sed -n -e 's/^.*Your job id is \([0-9][0-9]*\) and is currently running.*$/\1/p' "$sift_output" ) + +if [ -z "$sift_pid" ]; then + echo "Can't find SIFT pid in \"$sift_output\"" 1>&2 + exit 1 +fi + +sift_outdir="$working_dir/$sift_pid" +if [ ! -d "$sift_outdir" ]; then + echo "Can't access SIFT output directory \"$sift_outdir\"" 1>&2 + exit 1 +fi + +sift_outfile="$sift_outdir/${sift_pid}_predictions.tsv" +if [ ! -r "$sift_outfile" ]; then + echo "Can't access SIFT output file \"$sift_outfile\"" 1>&2 + exit 1 +fi + +################################################################################ +## create galaxy output file ## +################################################################################ +awk ' +BEGIN {FS="\t";OFS="\t"} +NR == 1 { + $12 = "Num seqs at position" + $1 = "Chrom\tPosition\tStrand\tAllele" + print +} +NR != 1 { + $1 = "chr" $1 + gsub(/,/, "\t", $1) + print +} +' "$sift_outfile" | awk ' +BEGIN {FS="\t";OFS="\t"} +NR == 1 { + print "#" $0 +} +NR != 1 { + if ($3 == "1") {$3 = "+"} else if ($3 == "-1") {$3 = "-"} + '"${pos_adj}"' + print +} +' > "$output_file" + +################################################################################ +## cleanup ## +################################################################################ +rm -rf "$sift_outdir" "$sift_input" "$sift_output" +