Mercurial > repos > bornea > prohits_dotplot_generator
diff Dotplot_Release/dotplot.bash @ 3:bc752a05f16d draft
Uploaded
| author | bornea | 
|---|---|
| date | Tue, 15 Mar 2016 15:25:15 -0400 | 
| parents | |
| children | |
line wrap: on
 line diff
#!/bin/bash
#
# dotplot.bash - driver for the dot plot pipeline.
#
# Parses the command-line options, optionally converts the input with
# SaintConvert.pl (-i 1) and normalizes it (-N 1 or -N 2), validates the
# clustering options, then runs the helper programs that live in the same
# directory as this script and collects their outputs under Output_<name>/,
# where <name> is the -f value with everything from its first "." removed.
#
# Required options: -f, -s, -t, -m and -c. Anything else missing or unknown
# prints the usage text and exits with status 1.

# Directory containing this script; every helper program is looked up here.
SCRIPTPATH=$(cd -- "$(dirname -- "$0")" > /dev/null && pwd) || {
  printf 'Cannot determine the directory of %s\n' "$0" >&2
  exit 1
}

# Print the usage text to stderr and exit with status 1.
# The script name is passed as a printf argument (not embedded in the format)
# so a "%" or backslash in the path cannot corrupt the output.
usage() {
  {
    printf 'Usage: %s\n' "$0"
    printf '%s\n' \
      '[-f <saint_file_name.txt>]' \
      '[-i <0 for SaintExpress format, 1 for other>]' \
      '[-c <clustering to perform. Options: b (biclustering), h (hierarchical), n (none, requires input text files for bait and prey ordering; see options -b and -p)>]' \
      '[-n <clustering type to be performed if option -c is set to "h">]' \
      '[-d <distance metric to use if option -c is set to "h">]' \
      '[-b <list of bait proteins in display order (see option -c n)>]' \
      '[-p <list of prey proteins in display order (see option -c n). Set this to "all" if you want to include all preys and cluster them>]' \
      '[-s <primary FDR cutoff [0-1, recommended=0.01]>]' \
      '[-t <secondary FDR cutoff [must be less than the primary, recommended=0.025]>' \
      '[-x <spectral count minimum. Only preys with >= this will be used]>' \
      '[-m <maximum spectral count>]' \
      '[-N <normalization, 0 for no (default), 1 for yes, 2 for normalization based on significant preys counts (prey FDR <= option -t)>]' \
      '[-C <FDR cutoff for normalization if using option -N 2 (deafult is -t)>]'
  } >&2
  exit 1
}

# Move every remaining argument into the directory given as $1.
# Items are moved one at a time so that a missing item does not prevent the
# others from being collected.
move_into() {
  local dest=$1 item
  shift
  for item in "$@"; do
    mv -- "${item}" "${dest}"
  done
}

# Option defaults.
N=0
n="ward"
d="canberra"
x=0
i=0

# Leading ":" selects silent error reporting; unknown options and missing
# arguments both end up in the "*" arm.
while getopts ":f:i:s:t:x:m:c:n:d:b:p:N:C:" o; do
  case "${o}" in
    f) f=${OPTARG} ;;
    i) i=${OPTARG} ;;
    s) s=${OPTARG} ;;
    t) t=${OPTARG} ;;
    x) x=${OPTARG} ;;
    m) m=${OPTARG} ;;
    c) c=${OPTARG} ;;
    n) n=${OPTARG} ;;
    d) d=${OPTARG} ;;
    b) b=${OPTARG} ;;
    p) p=${OPTARG} ;;
    N) N=${OPTARG} ;;
    C) C=${OPTARG} ;;
    *) usage ;;
  esac
done
shift $((OPTIND - 1))

# Base name used for intermediate files and the output directories. It is
# taken from the original -f value, before any conversion or normalization
# replaces ${f} below.
# NOTE(review): "%%.*" cuts at the FIRST dot, so a path such as "./dir/in.txt"
# yields an empty name - confirm callers only pass plain file names.
filename=${f%%.*}
echo "Saint input file = ${f}"
echo "Primary FDR cutoff = ${s}"
echo "Secondary FDR cutoff for dotplot = ${t}"
echo "Minimum spectral count for significant preys = ${x}"
echo "Maximum spectral count for dot plot = ${m}"

if [[ -z "${f}" || -z "${s}" || -z "${t}" || -z "${m}" || -z "${c}" ]]; then
  usage
fi

out_dir="Output_${filename}"
tmp_dir="${out_dir}/TempData_${filename}"

# Non-SaintExpress input is converted first; the converted file replaces ${f}.
if [[ "${i}" == 1 ]]; then
  "$SCRIPTPATH/SaintConvert.pl" -i "${f}"
  f="mockSaintExpress.txt"
fi

# "[ ... ]" is kept for the numeric tests on purpose: inside "[[ ... ]]" the
# operands of -ge/-lt are evaluated as arithmetic expressions.
if [ "${x}" -ge "${m}" ]; then
  echo "spectral count minimum (${x}) cannot be greater than or equal to the maximum (${m})"
  exit 1
elif [ "${x}" -lt 0 ]; then
  echo "spectral count minimum (${x}) cannot be less than 0. Setting to 0 and continuing"
  x=0
fi

###Check for normalization

if [[ "${N}" == 1 ]]; then
  printf "\nNormalization is being performed\n"
  "$SCRIPTPATH/Normalization.R" "${f}"
  f="norm_saint.txt"
elif [[ "${N}" == 2 ]]; then
  printf "\nNormalization is being performed\n"
  # The normalization cutoff falls back to the secondary FDR cutoff.
  if [[ -z "${C}" ]]; then
    C=${t}
  fi
  "$SCRIPTPATH/Normalization_sigpreys.R" "${f}" "${C}"
  f="norm_saint.txt"
fi

###Check for clustering etc

if [[ "${c}" == "h" ]]; then
  # Clustering method (-n). It is only empty when the caller passed -n "".
  if [[ -z "${n}" ]]; then
    printf "\nHierarchial clustering was selected (-c = h), but no clustering method (-n) was chosen.\n"
    printf "The input parameter -n must be set to one of \"average\", \"centroid\", \"complete\", \"mcquitty\",\n"
    printf "\"median\", \"single\" or \"ward\". \"ward\" will be selected as default.\n\n"
    n="ward"
  else
    case "${n}" in
      average | centroid | complete | mcquitty | median | single | ward)
        printf '\nHierarchical clustering (method = %s) will be performed\n\n' "${n}"
        ;;
      *)
        printf '\n%s is not a valid Hierarchical clustering method.\n' "${n}"
        printf "Choose one of \"average\", \"centroid\", \"complete\", \"mcquitty\", \"median\", \"single\" or \"ward\"\n\n"
        exit 1
        ;;
    esac
  fi
fi

# Set to 1 below when baits are ordered from a list but preys are clustered.
p_c=0
if [[ "${c}" == "h" ]]; then
  # Distance metric (-d). It is only empty when the caller passed -d "".
  if [[ -z "${d}" ]]; then
    printf "\nHierarchial clustering was selected (-c = h), but no distance metric (-d) was chosen.\n"
    printf "The input parameter -d must be set to one of \"binary\", \"canberra\", \"euclidean\",\n"
    printf "\"manhattan\", \"maximum\" or \"minkowski\". \"canberra\" will be selected as default.\n\n"
    d="canberra"
  else
    case "${d}" in
      binary | canberra | euclidean | manhattan | maximum | minkowski)
        printf '\nHierarchical clustering (distance metric = %s) will be performed\n\n' "${d}"
        ;;
      *)
        printf '\n%s is not a valid Hierarchical clustering distance metric.\n' "${d}"
        printf "Choose one of \"binary\", \"canberra\", \"euclidean\", \"manhattan\", \"maximum\" or \"minkowski\"\n\n"
        exit 1
        ;;
    esac
  fi
fi

if [[ "${c}" == "n" ]]; then
  if [[ -z "${b}" ]]; then
    printf "\n\"No Clustering\" option was selected (-c = n), but no bait list was included (option -b).\n"
    printf "Bait list must be in .txt formart.\n\n"
    exit 1
  elif [[ -z "${p}" ]]; then
    printf "\n\"No Clustering\" option was selected (-c = n), but no prey list was included (option -p).\n"
    printf "Prey list must be in .txt formart.\n\n"
    exit 1
  elif [[ "${p}" == "all" ]]; then
    printf "\n\"No Clustering\" option was selected (-c = n) for baits, but preys will still be clustered.\n"
    printf "using \"ward\" and \"canberra\" as defaults or options as supplied on command line.\n\n"
    p="empty"
    p_c=1
    # NOTE(review): the message above mentions options supplied on the command
    # line, but -n/-d are overwritten unconditionally here. Kept as-is; confirm
    # which behavior is intended.
    n="ward"
    d="canberra"
  fi
fi

###Check number of baits

bait_n=$(perl "$SCRIPTPATH/BaitCheck.pl" -i "${f}")
echo "Number of baits = ${bait_n}"
printf "\n\n"

# Quoted so that an empty result from BaitCheck.pl is a plain "not equal"
# instead of a "[" syntax error.
if [[ "${c}" == "b" && "${bait_n}" == 2 ]]; then
  printf "\nWarning only 2 baits are present. Biclustering will not performed.\n"
  printf "Hierarchical clustering (method = ward) will be performed instead.\n\n"
  c="h"
  n="ward"
fi

###Generate plots

if [[ "${c}" == "b" ]]; then
  printf "\nBiclustering will be performed\n\n"
  "$SCRIPTPATH/Step1_data_reformating.R" "${f}" "${s}" "${filename}"
  "$SCRIPTPATH/Step2_data_filtering.R" "${filename}_matrix.txt" "${x}" "${filename}"
  # Fixed seed is passed to the nested clustering step through the environment.
  GSL_RNG_SEED=123 "$SCRIPTPATH/Step3_nestedcluster" "${filename}.dat" "$SCRIPTPATH/biclust_param.txt"
  "$SCRIPTPATH/Step4_biclustering.R" "${filename}.dat"

  "$SCRIPTPATH/SOFD.pl" -i "${f}" -s "${s}" -x "${x}"
  "$SCRIPTPATH/R_dotPlot.R" "${s}" "${t}" "${m}"

  # -p creates both levels and does not complain when they already exist.
  mkdir -p -- "${tmp_dir}"
  move_into "${tmp_dir}" \
    bait_lists \
    Clusters \
    MCMCparameters \
    NestedClusters \
    NestedMu \
    NestedSigma2 \
    OPTclusters \
    "${filename}_matrix.txt" \
    "${filename}.dat" \
    SC_data.txt \
    FDR_data.txt \
    clustered_matrix.txt \
    singletons.txt \
    bait2bait_matrix.txt \
    baitClusters \
    clusteredData
  move_into "${out_dir}" \
    dotplot.pdf \
    bait2bait.pdf \
    estimated.pdf \
    stats.pdf
  cp -- "$SCRIPTPATH/legend.pdf" "${out_dir}"
elif [[ "${c}" == "h" ]]; then

  "$SCRIPTPATH/SOFD.pl" -i "${f}" -s "${s}" -x "${x}"
  "$SCRIPTPATH/R_dotPlot_hc.R" "${s}" "${t}" "${m}" "${n}" "${d}" "$SCRIPTPATH"

  mkdir -p -- "${tmp_dir}"
  move_into "${out_dir}" \
    dotplot.pdf \
    heatmap_borders.pdf \
    heatmap_no_borders.pdf \
    bait2bait.pdf
  move_into "${tmp_dir}" \
    SC_data.txt \
    FDR_data.txt
  cp -- "$SCRIPTPATH/legend.pdf" "${out_dir}"
elif [[ "${c}" == "n" ]]; then

  "$SCRIPTPATH/SOFD.pl" -i "${f}" -s "${s}" -x "${x}"
  # Log the plotting command before running it.
  echo "$SCRIPTPATH/R_dotPlot_nc.R ${s} ${t} ${m} ${b} $p_c ${p} ${n} ${d} $SCRIPTPATH"
  "$SCRIPTPATH/R_dotPlot_nc.R" "${s}" "${t}" "${m}" "${b}" "${p_c}" "${p}" "${n}" "${d}" "$SCRIPTPATH"

  mkdir -p -- "${tmp_dir}"
  move_into "${out_dir}" \
    dotplot.pdf \
    heatmap_borders.pdf \
    heatmap_no_borders.pdf
  move_into "${tmp_dir}" \
    SC_data.txt \
    FDR_data.txt
  cp -- "$SCRIPTPATH/legend.pdf" "${out_dir}"
else
  printf -- "-c must be one of [b, h, n]: b (biclustering), h (hierarchical), n (none, requires input text files for bait and prey ordering>\n"
  exit 1
fi

# The normalized input is an intermediate file; keep it with the other
# temporary data.
if [[ "${N}" == "1" || "${N}" == "2" ]]; then
  mv -- norm_saint.txt "${tmp_dir}"
fi
