0
|
1 #!/usr/bin/env bash
|
|
2
|
|
3 input_file=$1
|
|
4 output_file=$2
|
|
5 org=$3
|
|
6 db_loc=$4
|
|
7 chrom_col=$5
|
|
8 pos_col=$6
|
|
9 base=$7
|
|
10 allele_col=$8
|
|
11 strand_col=$9
|
|
12 comment_col=${10}
|
|
13 output_opts=${11}
|
|
14
|
|
15 working_dir=$PWD
|
|
16 sift_input="$working_dir/sift_input.txt"
|
|
17 sift_output="$working_dir/sift_output.txt"
|
|
18
|
|
19 ################################################################################
|
|
20 ## make sure input file column selections are mutually exclusive ##
|
|
21 ################################################################################
|
|
22 ERROR=0
|
|
23 declare -a col_use
|
|
24
|
|
25 function check_col () {
|
|
26 local col=$1
|
|
27 local use=$2
|
|
28 local int=$3
|
|
29
|
|
30 if [ -n "${col//[0-9]}" ]; then
|
|
31 if [ $int -eq 1 ]; then
|
|
32 echo "ERROR: invalid value for $use column: $col" 1>&2
|
|
33 ERROR=1
|
|
34 fi
|
|
35 return
|
|
36 fi
|
|
37
|
|
38 local cur=${col_use[$col]}
|
|
39 if [ -n "$cur" ]; then
|
|
40 echo "ERROR: $use column is the same as $cur column" 1>&2
|
|
41 col_use[$col]="${cur},$use"
|
|
42 ERROR=1
|
|
43 else
|
|
44 col_use[$col]=$use
|
|
45 fi
|
|
46 }
|
|
47
|
|
48 check_col $chrom_col 'chromosome' 1
|
|
49 check_col $pos_col 'position' 1
|
|
50 check_col $allele_col 'allele' 1
|
|
51 check_col $strand_col 'strand' 0
|
|
52 check_col $comment_col 'comment' 0
|
|
53
|
|
54 if [ $ERROR -ne 0 ]; then
|
|
55 exit 1
|
|
56 fi
|
|
57
|
|
58 ################################################################################
|
|
59 ## get/check the db directory from the argument org,db_loc ##
|
|
60 ################################################################################
|
|
61 db_dir=$( awk '$1 == org { print $2 }' org=$org $db_loc )
|
|
62
|
|
63 if [ -z "$db_dir" ]; then
|
|
64 echo "Can't find dbkey \"$org\" in loc file \"$db_loc\"" 1>&2
|
|
65 exit 1
|
|
66 fi
|
|
67
|
|
68 if [ ! -d "$db_dir" ]; then
|
|
69 echo "Can't access SIFT database directory \"$db_dir\"" 1>&2
|
|
70 exit 1
|
|
71 fi
|
|
72
|
|
73 ################################################################################
|
|
74 ## create input file for SIFT_exome_nssnvs.pl ##
|
|
75 ################################################################################
|
|
76 if [ ! -r "$input_file" ]; then
|
|
77 echo "Can't read input file \"$input_file\"" 1>&2
|
|
78 exit 1
|
|
79 fi
|
|
80
|
|
81 if [ $base -eq 0 ]; then
|
|
82 beg_col="$pos_col"
|
|
83 end_col="$pos_col + 1"
|
|
84 pos_adj='$2 = $2 - 1'
|
|
85 else
|
|
86 beg_col="$pos_col - 1"
|
|
87 end_col="$pos_col"
|
|
88 pos_adj=''
|
|
89 fi
|
|
90
|
|
91 strand_cvt=''
|
|
92 if [ \( "$strand_col" = "+" \) ]; then
|
|
93 strand='"1"'
|
|
94 elif [ \( "$strand_col" = "-" \) ]; then
|
|
95 strand='"-1"'
|
|
96 else
|
|
97 strand="\$$strand_col"
|
|
98 strand_cvt='if ('"${strand}"' == "+") {'"${strand}"' = "1"} else if ('"${strand}"' == "-") {'"${strand}"' = "-1"}'
|
|
99 fi
|
|
100
|
|
101 print_row='print $'"${chrom_col}"', $'"${beg_col}"', $'"${end_col}"', '"${strand}"', $'"${allele_col}"''
|
|
102 if [ "$comment_col" != "-" ]; then
|
|
103 print_row=''"${print_row}"', $'"${comment_col}"''
|
|
104 fi
|
|
105
|
|
106 awk '
|
|
107 BEGIN {FS="\t";OFS=","}
|
|
108 $'"${chrom_col}"' ~ /^[cC][hH][rR]/ {$'"${chrom_col}"' = substr($'"${chrom_col}"',4)}
|
|
109 {
|
|
110 '"${strand_cvt}"'
|
|
111 '"${print_row}"'
|
|
112 }
|
|
113 ' "$input_file" > "$sift_input"
|
|
114
|
|
115 ################################################################################
|
|
116 ## run SIFT_exome_nssnvs.pl command line program ##
|
|
117 ################################################################################
|
|
118 if [ "$output_opts" = "None" ]; then
|
|
119 output_opts=""
|
|
120 else
|
|
121 output_opts=$( echo "$output_opts" | sed -e 's/,/ 1 -/g' )
|
|
122 output_opts="-$output_opts 1"
|
|
123 fi
|
|
124
|
|
125 SIFT_exome_nssnvs.pl -i "$sift_input" -d "$db_dir" -o "$working_dir" $output_opts &> "$sift_output"
|
|
126 if [ $? -ne 0 ]; then
|
|
127 echo "failed: SIFT_exome_nssnvs.pl -i \"$sift_input\" -d \"$db_dir\" -o \"$working_dir\" $output_opts"
|
|
128 exit 1
|
|
129 fi
|
|
130
|
|
131 ################################################################################
|
|
132 ## locate the SIFT_exome_nssnvs.pl output file ##
|
|
133 ################################################################################
|
|
134 sift_pid=$( sed -n -e 's/^.*Your job id is \([0-9][0-9]*\) and is currently running.*$/\1/p' "$sift_output" )
|
|
135
|
|
136 if [ -z "$sift_pid" ]; then
|
|
137 echo "Can't find SIFT pid in \"$sift_output\"" 1>&2
|
|
138 exit 1
|
|
139 fi
|
|
140
|
|
141 sift_outdir="$working_dir/$sift_pid"
|
|
142 if [ ! -d "$sift_outdir" ]; then
|
|
143 echo "Can't access SIFT output directory \"$sift_outdir\"" 1>&2
|
|
144 exit 1
|
|
145 fi
|
|
146
|
|
147 sift_outfile="$sift_outdir/${sift_pid}_predictions.tsv"
|
|
148 if [ ! -r "$sift_outfile" ]; then
|
|
149 echo "Can't access SIFT output file \"$sift_outfile\"" 1>&2
|
|
150 exit 1
|
|
151 fi
|
|
152
|
|
153 ################################################################################
|
|
154 ## create galaxy output file ##
|
|
155 ################################################################################
|
|
156 awk '
|
|
157 BEGIN {FS="\t";OFS="\t"}
|
|
158 NR == 1 {
|
|
159 $12 = "Num seqs at position"
|
|
160 $1 = "Chrom\tPosition\tStrand\tAllele"
|
|
161 print
|
|
162 }
|
|
163 NR != 1 {
|
|
164 $1 = "chr" $1
|
|
165 gsub(/,/, "\t", $1)
|
|
166 print
|
|
167 }
|
|
168 ' "$sift_outfile" | awk '
|
|
169 BEGIN {FS="\t";OFS="\t"}
|
|
170 NR == 1 {
|
|
171 print "#" $0
|
|
172 }
|
|
173 NR != 1 {
|
|
174 if ($3 == "1") {$3 = "+"} else if ($3 == "-1") {$3 = "-"}
|
|
175 '"${pos_adj}"'
|
|
176 print
|
|
177 }
|
|
178 ' > "$output_file"
|
|
179
|
|
180 ################################################################################
|
|
181 ## cleanup ##
|
|
182 ################################################################################
|
|
183 rm -rf "$sift_outdir" "$sift_input" "$sift_output"
|
|
184
|