0
|
1 #!/usr/bin/env bash
|
|
2 CSV=$1
|
|
3 TH=$2
|
|
4
|
|
5 if [ $# -ne 2 ]; then
|
|
6 echo " ==== ERROR ... you called this script inappropriately."
|
|
7 echo ""
|
|
8 echo " usage: $0 <index.csv> <threshold>"
|
|
9 echo ""
|
|
10 exit -1
|
|
11 fi
|
|
12
|
|
13
|
|
14 BINDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
|
|
15
|
|
16 # get first genome in list (they are sorted)
|
|
17 currgenome=$(tail -n +2 "$CSV" | head -1 | awk -F "," '{print $6}')
|
|
18
|
|
19 # fill array of chromosomes similarity
|
|
20
|
|
21 array=()
|
|
22 arraytosort=()
|
|
23 names=()
|
|
24 homologies=()
|
|
25 condition=0
|
|
26 othergencounter=0
|
|
27 # for problems with chromo X and Y
|
|
28 highest=1
|
|
29 # For all lines
|
|
30
|
|
31 cat $CSV | tail -n +2 > $1.temp
|
|
32
|
|
33 while IFS= read -r i
|
|
34 do
|
|
35
|
|
36 othergenome=$(echo "$i" | awk -F "," '{print $6}')
|
|
37 if [ "$condition" -eq 0 ]; then
|
|
38 currgenome=$othergenome
|
|
39 condition=1
|
|
40 fi
|
|
41
|
|
42 if [ "$othergenome" != "$currgenome" ]; then
|
|
43
|
|
44 # Sort the array with temporal values
|
|
45 #printf '%s\n' "${arraytosort[@]}"
|
|
46 #echo "name is $currgenome"
|
|
47
|
|
48 sorted=($(printf '%s\n' "${arraytosort[@]}"|sort))
|
|
49
|
|
50 #echo "For chroomo $currgenome we have "
|
|
51 #echo $(printf '%s,' "${sorted[@]}")
|
|
52 # accumulate sum until threshold is reached
|
|
53 usedValues=1
|
|
54 usedValuesNext=2
|
|
55 first=${sorted[0]}
|
|
56 next=${sorted[${usedValues}]}
|
|
57 nextofnext=${sorted[${usedValuesNext}]}
|
|
58 finalvalue=$first
|
|
59 divisor=0
|
|
60 currdiff=$(LC_NUMERIC=POSIX awk -v a="$next" -v b="$nextofnext" 'BEGIN {print b-a }')
|
|
61 TH=$(printf '%4.6f' $TH)
|
|
62 #echo "$(LC_NUMERIC=POSIX awk -v a="$currdiff" -v b="$TH" 'BEGIN { printf("comp %f > %f = %d",a,b,a>b)} ')"
|
|
63 condition=$(LC_NUMERIC=POSIX awk -v a="$currdiff" -v b="$TH" 'BEGIN { printf("%d",a>b)} ')
|
|
64 #echo "first $first next $next result $currdiff condition $condition divisor $divisor th $TH finalvalue $finalvalue"
|
|
65 while [ $condition -eq 1 -a $usedValuesNext -lt ${#sorted[@]} ];
|
|
66 do
|
|
67 usedValues=`expr $usedValues + 1`
|
|
68 usedValuesNext=`expr $usedValuesNext + 1`
|
|
69 finalvalue=$(LC_NUMERIC=POSIX awk -v a="$finalvalue" -v b="$next" 'BEGIN {print (a+b)}')
|
|
70 next=${sorted[${usedValues}]}
|
|
71 nextofnext=${sorted[${usedValuesNext}]}
|
|
72
|
|
73 currdiff=$(LC_NUMERIC=POSIX awk -v a="$nextofnext" -v b="$next" 'BEGIN {printf("%f", b-a) }')
|
|
74 condition=$(LC_NUMERIC=POSIX awk -v a="$currdiff" -v b="$TH" 'BEGIN { printf("%d", a>b)} ')
|
|
75 divisor=$(LC_NUMERIC=POSIX awk -v a="$divisor" 'BEGIN {print a+0.1}')
|
|
76
|
|
77 done
|
|
78
|
|
79 #echo "so this is what we got $finalvalue, when divided using $divisor"
|
|
80
|
|
81 # array holds the results
|
|
82 #array[$highest]=$(awk -v a="$currsum" -v b="$othergencounter" 'BEGIN {print a/b}')
|
|
83 finalvalue=$(LC_NUMERIC=POSIX awk -v a="$finalvalue" -v b="$usedValues" -v c="$divisor" 'BEGIN {printf("%f", a/(b-c))}')
|
|
84 array[$highest]=$finalvalue
|
|
85 homologies[$highest]=$usedValues
|
|
86
|
|
87 highest=`expr $highest + 1`
|
|
88 condition=0
|
|
89 names+=($currgenome)
|
|
90 othergencounter=0
|
|
91 unset arraytosort
|
|
92
|
|
93
|
|
94
|
|
95
|
|
96 getvalue=$(echo "$i" | awk -F "," '{print $8}')
|
|
97 # Copy value to array
|
|
98 arraytosort[$othergencounter]=$getvalue
|
|
99 #currsum=$(awk -v a="$currsum" -v b="$getvalue" 'BEGIN {print a=a+(1-b); exit}')
|
|
100 othergencounter=`expr $othergencounter + 1`
|
|
101 else
|
|
102 getvalue=$(echo "$i" | awk -F "," '{print $8}')
|
|
103 # Copy value to array
|
|
104 arraytosort[$othergencounter]=$getvalue
|
|
105 #currsum=$(awk -v a="$currsum" -v b="$getvalue" 'BEGIN {print a=a+(1-b); exit}')
|
|
106 othergencounter=`expr $othergencounter + 1`
|
|
107
|
|
108 fi
|
|
109
|
|
110 done < "$1.temp"
|
|
111
|
|
112 # do the last!!!
|
|
113 #if [ "$lastprint" == "$currgenome" ]; then
|
|
114 # Sort the array with temporal values
|
|
115 sorted=($(printf '%s\n' "${arraytosort[@]}"|sort))
|
|
116
|
|
117 usedValues=1
|
|
118 usedValuesNext=2
|
|
119 first=${sorted[0]}
|
|
120 next=${sorted[${usedValues}]}
|
|
121 nextofnext=${sorted[${usedValuesNext}]}
|
|
122 finalvalue=$first
|
|
123 divisor=0
|
|
124 currdiff=$(LC_NUMERIC=POSIX awk -v a="$next" -v b="$nextofnext" 'BEGIN {print b-a }')
|
|
125 TH=$(printf '%4.6f' $TH)
|
|
126 #echo "$(LC_NUMERIC=POSIX awk -v a="$currdiff" -v b="$TH" 'BEGIN { printf("comp %f > %f = %d",a,b,a>b)} ')"
|
|
127 condition=$(LC_NUMERIC=POSIX awk -v a="$currdiff" -v b="$TH" 'BEGIN { printf("%d",a>b)} ')
|
|
128 #echo "first $first next $next result $currdiff condition $condition divisor $divisor th $TH finalvalue $finalvalue"
|
|
129 while [ $condition -eq 1 -a $usedValuesNext -lt ${#sorted[@]} ];
|
|
130 do
|
|
131 usedValues=`expr $usedValues + 1`
|
|
132 usedValuesNext=`expr $usedValuesNext + 1`
|
|
133 finalvalue=$(LC_NUMERIC=POSIX awk -v a="$finalvalue" -v b="$next" 'BEGIN {print (a+b)}')
|
|
134 next=${sorted[${usedValues}]}
|
|
135 nextofnext=${sorted[${usedValuesNext}]}
|
|
136
|
|
137 currdiff=$(LC_NUMERIC=POSIX awk -v a="$nextofnext" -v b="$next" 'BEGIN {printf("%f", b-a) }')
|
|
138 condition=$(LC_NUMERIC=POSIX awk -v a="$currdiff" -v b="$TH" 'BEGIN { printf("%d", a>b)} ')
|
|
139 divisor=$(LC_NUMERIC=POSIX awk -v a="$divisor" 'BEGIN {print a+0.1}')
|
|
140
|
|
141 done
|
|
142
|
|
143 #echo "so this is what we got $finalvalue, when divided using $divisor"
|
|
144
|
|
145 # array holds the results
|
|
146 #array[$highest]=$(awk -v a="$currsum" -v b="$othergencounter" 'BEGIN {print a/b}')
|
|
147 finalvalue=$(LC_NUMERIC=POSIX awk -v a="$finalvalue" -v b="$usedValues" -v c="$divisor" 'BEGIN {printf("%f", a/(b-c))}')
|
|
148 array[$highest]=$finalvalue
|
|
149 homologies[$highest]=$usedValues
|
|
150
|
|
151
|
|
152 highest=`expr $highest + 1`
|
|
153 currsum=0
|
|
154 names+=($currgenome)
|
|
155 currgenome=$othergenome
|
|
156 othergencounter=0
|
|
157 #fi
|
|
158
|
|
159
|
|
160 highest=`expr $highest - 1`
|
|
161 rm $1.temp
|
|
162
|
|
163 tsum=0
|
|
164 echo "deleteme" > $1.inter
|
|
165 rm $1.inter
|
|
166 aux=0
|
|
167 for ((i = 1; i <= highest; i++)); do
|
|
168 echo "${names[${aux}]} ${array[${i}]} ${homologies[${i}]}" >> $1.inter
|
|
169 #echo "${names[${aux}]} ${array[${i}]} ${homologies[${i}]}"
|
|
170 aux=`expr $aux + 1`
|
|
171 #val=${array[${i}]}
|
|
172 #tsum=$(awk -v a="$tsum" -v b="$val" '{print a=a+b}')
|
|
173 done
|
|
174
|
|
175 #awk -v a="$tsum" b="$highest" '{print a/b}'
|
|
176
|
|
177 #sumfirst=$(awk -F "," 'BEGIN{suma=0}{suma = suma + $8}END{print suma}' "$CSV")
|
|
178 #echo "$sumfirst"
|
|
179
|
|
180
|
|
181
|