diff chromeister/bin/generate-one-score.sh @ 0:7fdf47a0bae8 draft

Uploaded
author alvarofaure
date Wed, 12 Dec 2018 07:18:40 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/chromeister/bin/generate-one-score.sh	Wed Dec 12 07:18:40 2018 -0500
@@ -0,0 +1,181 @@
+#!/usr/bin/env bash
+CSV=$1   # path of the input CSV; the script reads fields 6 and 8 of every line after the first
+TH=$2    # threshold that the difference between neighbouring sorted values is compared against
+ # Exactly two positional arguments are required; otherwise print the usage text and stop.
+if [ $# -ne 2 ]; then
+   echo " ==== ERROR ... you called this script inappropriately."
+   echo ""
+   echo "   usage:  $0 <index.csv> <threshold>"
+   echo ""
+   exit -1   # NOTE(review): exit statuses are limited to 0-255, so bash reports this as 255
+fi
+
+
+BINDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"   # absolute directory of this script; not referenced again in the visible script
+
+# get first genome in list (they are sorted)
+currgenome=$(tail -n +2 "$CSV" | head -1 | awk -F "," '{print $6}')   # field 6 of line 2; reassigned on the first loop pass because condition starts at 0
+
+# fill array of chromosomes similarity
+ # Working state shared by the row loop and by the final-group pass after it:
+array=()            # score of each finished group, written at index $highest
+arraytosort=()      # field-8 values collected for the group currently being read
+names=()            # label (field 6) of each finished group, appended in order from index 0
+homologies=()       # usedValues of each finished group, same index as array
+condition=0         # 0 means the next row read assigns currgenome; later reused as the inner loop flag
+othergencounter=0   # next free slot in arraytosort
+# for problems with chromo X and Y
+highest=1    # next index to fill in array and homologies, so those two are 1-based
+# For all lines
+ # Copy every line except the first (presumably the header) to $1.temp for the row loop.
+cat $CSV | tail -n +2 > $1.temp   # NOTE(review): $CSV and $1 are unquoted, so paths with spaces or glob characters are split or expanded
+
+while IFS= read -r i   # i holds one raw line of $1.temp
+do
+	# Per row: read the group label (field 6), then either extend the current group or close and score it.
+	othergenome=$(echo "$i" | awk -F "," '{print $6}')
+	if [ "$condition" -eq 0 ]; then   # first row overall, or first row read after a group was closed
+		currgenome=$othergenome
+		condition=1
+	fi
+	
+	if [ "$othergenome" != "$currgenome" ]; then   # label changed: score the values collected so far
+	
+		# Sort the array with temporal values
+		#printf '%s\n' "${arraytosort[@]}"
+		#echo "name is $currgenome"
+		
+		sorted=($(printf '%s\n' "${arraytosort[@]}"|sort))   # plain sort compares strings in the current locale, not numbers; NOTE(review): confirm the values always order correctly this way
+		
+		#echo "For chroomo $currgenome we have "
+		#echo $(printf '%s,' "${sorted[@]}")
+		# accumulate sum until threshold is reached
+		usedValues=1       # index of next in sorted; also the number of values summed so far
+		usedValuesNext=2   # index of nextofnext in sorted
+		first=${sorted[0]}
+		next=${sorted[${usedValues}]}
+		nextofnext=${sorted[${usedValuesNext}]}
+		finalvalue=$first   # running sum, seeded with the first sorted value
+		divisor=0           # grows by 0.1 per loop pass and is subtracted from usedValues in the final division
+		currdiff=$(LC_NUMERIC=POSIX awk -v a="$next" -v b="$nextofnext" 'BEGIN {print b-a }')   # nextofnext minus next
+		TH=$(printf '%4.6f' $TH)   # reformat the threshold with six decimals
+		#echo "$(LC_NUMERIC=POSIX awk -v a="$currdiff" -v b="$TH" 'BEGIN { printf("comp %f > %f = %d",a,b,a>b)} ')"
+		condition=$(LC_NUMERIC=POSIX awk -v a="$currdiff" -v b="$TH" 'BEGIN { printf("%d",a>b)} ')   # 1 when currdiff > TH, else 0
+		#echo "first $first next $next result $currdiff condition $condition divisor $divisor th $TH finalvalue $finalvalue"
+		while [ $condition -eq 1 -a $usedValuesNext -lt ${#sorted[@]} ];   # continue while the difference exceeds TH and sorted still has an element at usedValuesNext
+		do
+			usedValues=`expr $usedValues + 1`
+			usedValuesNext=`expr $usedValuesNext + 1`
+			finalvalue=$(LC_NUMERIC=POSIX awk -v a="$finalvalue" -v b="$next" 'BEGIN {print (a+b)}')   # add the previous next to the running sum
+			next=${sorted[${usedValues}]}
+			nextofnext=${sorted[${usedValuesNext}]}
+			
+			currdiff=$(LC_NUMERIC=POSIX awk -v a="$nextofnext" -v b="$next" 'BEGIN {printf("%f", b-a) }')   # next minus nextofnext: operands are swapped relative to the pre-loop difference; NOTE(review): confirm the sign flip is intended
+			condition=$(LC_NUMERIC=POSIX awk -v a="$currdiff" -v b="$TH" 'BEGIN { printf("%d", a>b)} ')
+			divisor=$(LC_NUMERIC=POSIX awk -v a="$divisor" 'BEGIN {print a+0.1}')
+			
+		done
+
+		#echo "so this is what we got $finalvalue, when divided using $divisor"
+		
+		# array holds the results
+		#array[$highest]=$(awk -v a="$currsum" -v b="$othergencounter" 'BEGIN {print a/b}')
+		finalvalue=$(LC_NUMERIC=POSIX awk -v a="$finalvalue" -v b="$usedValues" -v c="$divisor" 'BEGIN {printf("%f", a/(b-c))}')   # score = running sum / (usedValues - divisor)
+		array[$highest]=$finalvalue
+		homologies[$highest]=$usedValues
+		
+		highest=`expr $highest + 1`
+		condition=0   # the next row read will overwrite currgenome; NOTE(review): a group made of a single row is therefore merged into the group that follows it
+		names+=($currgenome)   # label of the group that was just scored
+		othergencounter=0
+		unset arraytosort
+		
+		
+		
+		
+		getvalue=$(echo "$i" | awk -F "," '{print $8}')   # the current row becomes the first value of the new collection
+		# Copy value to array 
+		arraytosort[$othergencounter]=$getvalue
+		#currsum=$(awk -v a="$currsum" -v b="$getvalue" 'BEGIN {print a=a+(1-b); exit}')
+		othergencounter=`expr $othergencounter + 1`
+	else
+		getvalue=$(echo "$i" | awk -F "," '{print $8}')   # same label as before: append field 8 to the current collection
+		# Copy value to array 
+		arraytosort[$othergencounter]=$getvalue
+		#currsum=$(awk -v a="$currsum" -v b="$getvalue" 'BEGIN {print a=a+(1-b); exit}')
+		othergencounter=`expr $othergencounter + 1`
+
+	fi
+
+done < "$1.temp"
+
+# do the last!!!
+#if [ "$lastprint" == "$currgenome" ]; then
+# Sort the array with temporal values
+sorted=($(printf '%s\n' "${arraytosort[@]}"|sort))   # plain sort: string order in the current locale, not numeric
+ # The row loop only scores a group once a different label is read, so the values still held in arraytosort are scored here with the same steps.
+usedValues=1       # index of next in sorted; also the number of values summed so far
+usedValuesNext=2   # index of nextofnext in sorted
+first=${sorted[0]}
+next=${sorted[${usedValues}]}
+nextofnext=${sorted[${usedValuesNext}]}
+finalvalue=$first   # running sum, seeded with the first sorted value
+divisor=0           # grows by 0.1 per loop pass and is subtracted from usedValues in the final division
+currdiff=$(LC_NUMERIC=POSIX awk -v a="$next" -v b="$nextofnext" 'BEGIN {print b-a }')   # nextofnext minus next
+TH=$(printf '%4.6f' $TH)   # reformat the threshold with six decimals
+#echo "$(LC_NUMERIC=POSIX awk -v a="$currdiff" -v b="$TH" 'BEGIN { printf("comp %f > %f = %d",a,b,a>b)} ')"
+condition=$(LC_NUMERIC=POSIX awk -v a="$currdiff" -v b="$TH" 'BEGIN { printf("%d",a>b)} ')   # 1 when currdiff > TH, else 0
+#echo "first $first next $next result $currdiff condition $condition divisor $divisor th $TH finalvalue $finalvalue"
+while [ $condition -eq 1 -a $usedValuesNext -lt ${#sorted[@]} ];   # continue while the difference exceeds TH and sorted still has an element at usedValuesNext
+do
+        usedValues=`expr $usedValues + 1`
+        usedValuesNext=`expr $usedValuesNext + 1`
+        finalvalue=$(LC_NUMERIC=POSIX awk -v a="$finalvalue" -v b="$next" 'BEGIN {print (a+b)}')   # add the previous next to the running sum
+        next=${sorted[${usedValues}]}
+        nextofnext=${sorted[${usedValuesNext}]}
+
+        currdiff=$(LC_NUMERIC=POSIX awk -v a="$nextofnext" -v b="$next" 'BEGIN {printf("%f", b-a) }')   # next minus nextofnext: operands swapped relative to the pre-loop difference; NOTE(review): confirm the sign flip is intended
+        condition=$(LC_NUMERIC=POSIX awk -v a="$currdiff" -v b="$TH" 'BEGIN { printf("%d", a>b)} ')
+        divisor=$(LC_NUMERIC=POSIX awk -v a="$divisor" 'BEGIN {print a+0.1}')
+
+done
+
+#echo "so this is what we got $finalvalue, when divided using $divisor"
+
+# array holds the results
+#array[$highest]=$(awk -v a="$currsum" -v b="$othergencounter" 'BEGIN {print a/b}')
+finalvalue=$(LC_NUMERIC=POSIX awk -v a="$finalvalue" -v b="$usedValues" -v c="$divisor" 'BEGIN {printf("%f", a/(b-c))}')   # score = running sum / (usedValues - divisor)
+array[$highest]=$finalvalue
+homologies[$highest]=$usedValues
+
+
+highest=`expr $highest + 1`
+currsum=0   # only referenced by commented-out code in the visible script
+names+=($currgenome)	# NOTE(review): currgenome is only refreshed by the row after a label change, so when the last group has a single row this appears to record the previous label
+currgenome=$othergenome   # not read again in the visible remainder of the script
+othergencounter=0         # not read again in the visible remainder of the script
+#fi
+
+
+highest=`expr $highest - 1`   # step back to the index of the last group stored
+rm $1.temp   # remove the temporary copy made before the row loop; NOTE(review): $1 is unquoted
+ # Write one line per scored group to $1.inter: label, score, number of values used.
+tsum=0   # only referenced by commented-out code in the visible script
+echo "deleteme" > $1.inter   # create then delete, so no earlier $1.inter is left for the appends below to extend
+rm $1.inter
+aux=0   # names is filled from index 0 while array and homologies are filled from index 1
+for ((i = 1; i <= highest; i++)); do
+	echo "${names[${aux}]} ${array[${i}]} ${homologies[${i}]}" >> $1.inter
+	#echo "${names[${aux}]} ${array[${i}]} ${homologies[${i}]}"
+	aux=`expr $aux + 1`
+	#val=${array[${i}]}
+	#tsum=$(awk -v a="$tsum" -v b="$val" '{print a=a+b}')
+done
+
+#awk -v a="$tsum" b="$highest" '{print a/b}'
+
+#sumfirst=$(awk -F "," 'BEGIN{suma=0}{suma = suma + $8}END{print suma}' "$CSV")
+#echo "$sumfirst"
+
+
+