comparison tools/human_genome_variation/sift_variants_wrapper.sh @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 #!/usr/bin/env bash
2
3 input_file=$1
4 output_file=$2
5 org=$3
6 db_loc=$4
7 chrom_col=$5
8 pos_col=$6
9 base=$7
10 allele_col=$8
11 strand_col=$9
12 comment_col=${10}
13 output_opts=${11}
14
15 working_dir=$PWD
16 sift_input="$working_dir/sift_input.txt"
17 sift_output="$working_dir/sift_output.txt"
18
19 ################################################################################
20 ## make sure input file column selections are mutually exclusive ##
21 ################################################################################
22 ERROR=0
23 declare -a col_use
24
25 function check_col () {
26 local col=$1
27 local use=$2
28 local int=$3
29
30 if [ -n "${col//[0-9]}" ]; then
31 if [ $int -eq 1 ]; then
32 echo "ERROR: invalid value for $use column: $col" 1>&2
33 ERROR=1
34 fi
35 return
36 fi
37
38 local cur=${col_use[$col]}
39 if [ -n "$cur" ]; then
40 echo "ERROR: $use column is the same as $cur column" 1>&2
41 col_use[$col]="${cur},$use"
42 ERROR=1
43 else
44 col_use[$col]=$use
45 fi
46 }
47
48 check_col $chrom_col 'chromosome' 1
49 check_col $pos_col 'position' 1
50 check_col $allele_col 'allele' 1
51 check_col $strand_col 'strand' 0
52 check_col $comment_col 'comment' 0
53
54 if [ $ERROR -ne 0 ]; then
55 exit 1
56 fi
57
58 ################################################################################
59 ## get/check the db directory from the argument org,db_loc ##
60 ################################################################################
61 db_dir=$( awk '$1 == org { print $2 }' org=$org $db_loc )
62
63 if [ -z "$db_dir" ]; then
64 echo "Can't find dbkey \"$org\" in loc file \"$db_loc\"" 1>&2
65 exit 1
66 fi
67
68 if [ ! -d "$db_dir" ]; then
69 echo "Can't access SIFT database directory \"$db_dir\"" 1>&2
70 exit 1
71 fi
72
73 ################################################################################
74 ## create input file for SIFT_exome_nssnvs.pl ##
75 ################################################################################
76 if [ ! -r "$input_file" ]; then
77 echo "Can't read input file \"$input_file\"" 1>&2
78 exit 1
79 fi
80
81 if [ $base -eq 0 ]; then
82 beg_col="$pos_col"
83 end_col="$pos_col + 1"
84 pos_adj='$2 = $2 - 1'
85 else
86 beg_col="$pos_col - 1"
87 end_col="$pos_col"
88 pos_adj=''
89 fi
90
91 strand_cvt=''
92 if [ \( "$strand_col" = "+" \) ]; then
93 strand='"1"'
94 elif [ \( "$strand_col" = "-" \) ]; then
95 strand='"-1"'
96 else
97 strand="\$$strand_col"
98 strand_cvt='if ('"${strand}"' == "+") {'"${strand}"' = "1"} else if ('"${strand}"' == "-") {'"${strand}"' = "-1"}'
99 fi
100
101 print_row='print $'"${chrom_col}"', $'"${beg_col}"', $'"${end_col}"', '"${strand}"', $'"${allele_col}"''
102 if [ "$comment_col" != "-" ]; then
103 print_row=''"${print_row}"', $'"${comment_col}"''
104 fi
105
106 awk '
107 BEGIN {FS="\t";OFS=","}
108 $'"${chrom_col}"' ~ /^[cC][hH][rR]/ {$'"${chrom_col}"' = substr($'"${chrom_col}"',4)}
109 {
110 '"${strand_cvt}"'
111 '"${print_row}"'
112 }
113 ' "$input_file" > "$sift_input"
114
115 ################################################################################
116 ## run SIFT_exome_nssnvs.pl command line program ##
117 ################################################################################
118 if [ "$output_opts" = "None" ]; then
119 output_opts=""
120 else
121 output_opts=$( echo "$output_opts" | sed -e 's/,/ 1 -/g' )
122 output_opts="-$output_opts 1"
123 fi
124
125 SIFT_exome_nssnvs.pl -i "$sift_input" -d "$db_dir" -o "$working_dir" $output_opts &> "$sift_output"
126 if [ $? -ne 0 ]; then
127 echo "failed: SIFT_exome_nssnvs.pl -i \"$sift_input\" -d \"$db_dir\" -o \"$working_dir\" $output_opts"
128 exit 1
129 fi
130
131 ################################################################################
132 ## locate the SIFT_exome_nssnvs.pl output file ##
133 ################################################################################
134 sift_pid=$( sed -n -e 's/^.*Your job id is \([0-9][0-9]*\) and is currently running.*$/\1/p' "$sift_output" )
135
136 if [ -z "$sift_pid" ]; then
137 echo "Can't find SIFT pid in \"$sift_output\"" 1>&2
138 exit 1
139 fi
140
141 sift_outdir="$working_dir/$sift_pid"
142 if [ ! -d "$sift_outdir" ]; then
143 echo "Can't access SIFT output directory \"$sift_outdir\"" 1>&2
144 exit 1
145 fi
146
147 sift_outfile="$sift_outdir/${sift_pid}_predictions.tsv"
148 if [ ! -r "$sift_outfile" ]; then
149 echo "Can't access SIFT output file \"$sift_outfile\"" 1>&2
150 exit 1
151 fi
152
153 ################################################################################
154 ## create galaxy output file ##
155 ################################################################################
156 awk '
157 BEGIN {FS="\t";OFS="\t"}
158 NR == 1 {
159 $12 = "Num seqs at position"
160 $1 = "Chrom\tPosition\tStrand\tAllele"
161 print
162 }
163 NR != 1 {
164 $1 = "chr" $1
165 gsub(/,/, "\t", $1)
166 print
167 }
168 ' "$sift_outfile" | awk '
169 BEGIN {FS="\t";OFS="\t"}
170 NR == 1 {
171 print "#" $0
172 }
173 NR != 1 {
174 if ($3 == "1") {$3 = "+"} else if ($3 == "-1") {$3 = "-"}
175 '"${pos_adj}"'
176 print
177 }
178 ' > "$output_file"
179
180 ################################################################################
181 ## cleanup ##
182 ################################################################################
183 rm -rf "$sift_outdir" "$sift_input" "$sift_output"
184