Mercurial > repos > saskia-hiltemann > file_manipulation
comparison filter_columns.sh @ 0:e77c9484b2d0 draft default tip
Uploaded
| author | saskia-hiltemann | 
|---|---|
| date | Thu, 22 Oct 2015 09:18:30 -0400 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:e77c9484b2d0 | 
|---|---|
#!/bin/bash
#
# filter_columns.sh - split a tab-separated file into "removed" and
# "retained" rows according to one or more column filters.
#
# Usage:
#   filter_columns.sh --infile FILE --outfile_rem FILE --outfile_ret FILE \
#                     [--filter COLUMN,OP,VALUE]...
#
# Each --filter is "COLUMN,OP,VALUE":
#   COLUMN  1-based field number (0 means the whole line, as in awk)
#   OP      equals | nequals | contains | ncontains | empty | nonempty |
#           lt | le | gt | ge
#   VALUE   operand; everything after the second comma, so it may itself
#           contain commas. It is compared literally (no escape processing).
#
# Filters are applied in the order given. A row matching a filter goes to
# the "removed" output; only the rows that survive are passed on to the next
# filter. The first line (header), lines starting with '#', '<' or '>', and
# empty lines are never filtered: they are copied once into both outputs.
#
# Progress information is written to stdout.
# Requires the enhanced (util-linux) getopt.

#######################################
# Print the command-line synopsis to stdout.
#######################################
print_usage() {
  cat <<EOF
Usage: ${0##*/} --infile FILE --outfile_rem FILE --outfile_ret FILE
       [--filter COLUMN,OP,VALUE]...

  OP is one of: equals nequals contains ncontains empty nonempty lt le gt ge
EOF
}

#######################################
# Run every filter over the input and write the two result files.
# The body is a subshell so that the EXIT trap removes the scratch
# directory on every exit path without touching the caller's traps.
# Arguments:
#   $1 - input file
#   $2 - output file for removed rows
#   $3 - output file for retained rows
#   $4.. - filter specifications "COLUMN,OP,VALUE"
# Outputs:   per-filter progress on stdout, errors on stderr
# Returns:   0 on success, 1 on any failure
#######################################
apply_filters() (
  infile=$1
  outfile_rem=$2
  outfile_ret=$3
  shift 3

  workdir=$(mktemp -d) || exit 1
  trap 'rm -rf -- "$workdir"' EXIT

  current="$workdir/input"      # rows still to be examined
  removed="$workdir/removed"    # accumulates over all passes
  retained="$workdir/retained"  # survivors of the current pass

  cp -- "$infile" "$current" || exit 1
  : > "$removed" || exit 1

  pass=0
  for spec in "$@"; do
    pass=$((pass + 1))
    printf 'filter %d: %s\n' "$pass" "$spec"

    # The last variable receives the rest of the line, so VALUE keeps any
    # commas it contains; -r keeps backslashes intact.
    IFS=',' read -r column op value <<< "$spec"

    printf 'column: %s\n' "$column"
    printf 'op : %s\n' "$op"
    printf 'value: %s\n' "$value"

    if [[ ! "$column" =~ ^[0-9]+$ ]]; then
      printf '%s: invalid column "%s" in filter "%s"\n' \
        "${0##*/}" "$column" "$spec" >&2
      exit 1
    fi

    : > "$retained" || exit 1

    # Strings are handed to awk through the environment rather than pasted
    # into the program text, so quotes or backslashes in a filter value can
    # neither break nor inject awk code.
    FC_OP=$op FC_VALUE=$value FC_REMOVED=$removed FC_RETAINED=$retained \
      awk -v col="$column" -v first_pass="$((pass == 1))" '
        BEGIN {
          FS = OFS = "\t"
          op       = ENVIRON["FC_OP"]
          value    = ENVIRON["FC_VALUE"] ""  # "" forces string comparison
          removed  = ENVIRON["FC_REMOVED"]
          retained = ENVIRON["FC_RETAINED"]
        }

        # Header, comment/markup and empty lines bypass the filter. They
        # always stay in the retained stream, and reach the removed file
        # only on the first pass so that they are not repeated per filter.
        FNR == 1 || /^[#<>]/ || NF == 0 {
          if (first_pass) {
            print >> removed
          }
          print >> retained
          next
        }

        {
          cell = $col ""

          if      (op == "equals")    hit = (cell == value)
          else if (op == "nequals")   hit = (cell != value)
          else if (op == "contains")  hit = (index(cell, value) != 0)
          else if (op == "ncontains") hit = (index(cell, value) == 0)
          else if (op == "empty")     hit = (cell == "")
          else if (op == "nonempty")  hit = (cell != "")
          else if (op == "lt")        hit = (cell + 0 <  value + 0)
          else if (op == "le")        hit = (cell + 0 <= value + 0)
          else if (op == "gt")        hit = (cell + 0 >  value + 0)
          else if (op == "ge")        hit = (cell + 0 >= value + 0)
          else                        hit = 0  # unknown op matches nothing

          if (hit) {
            print >> removed
          } else {
            print >> retained
          }
        }
      ' "$current" || exit 1

    # The next filter only sees the rows that were retained.
    mv -- "$retained" "$current" || exit 1
  done

  # Adjacent duplicate lines are collapsed, as this tool has always done.
  uniq < "$removed" > "$outfile_rem" || exit 1
  uniq < "$current" > "$outfile_ret" || exit 1
)

#######################################
# Parse the command line and run the filters.
# Arguments: the script's command-line arguments
# Outputs:   diagnostics on stdout, usage and errors on stderr
# Returns:   0 on success, 1 on processing errors, 2 on usage errors
#######################################
main() {
  local infile="" outfile_rem="" outfile_ret=""
  local -a filters=()
  local parsed

  # Log the invocation.
  printf '%s\n' "$*"

  # Quoted getopt output plus `eval set --` is the usage documented in
  # getopt(1); unlike the unquoted -u mode it preserves arguments that
  # contain whitespace.
  parsed=$(getopt -n "$0" -a \
    --longoptions "infile:,outfile_rem:,outfile_ret:,filter:" \
    -o "h" -- "$@") || {
    print_usage >&2
    return 2
  }
  eval set -- "$parsed"

  while (($# > 0)); do
    case "$1" in
      --infile)
        infile=$2
        shift 2
        ;;
      --outfile_rem)
        outfile_rem=$2
        shift 2
        ;;
      --outfile_ret)
        outfile_ret=$2
        shift 2
        ;;
      --filter)
        filters+=("$2")
        shift 2
        ;;
      -h)
        print_usage
        return 0
        ;;
      --)
        shift
        break
        ;;
      *)
        print_usage >&2
        return 2
        ;;
    esac
  done

  if [[ -z "$infile" || -z "$outfile_rem" || -z "$outfile_ret" ]]; then
    printf '%s: --infile, --outfile_rem and --outfile_ret are required\n' \
      "${0##*/}" >&2
    print_usage >&2
    return 2
  fi
  if [[ ! -r "$infile" ]]; then
    printf '%s: cannot read input file: %s\n' "${0##*/}" "$infile" >&2
    return 1
  fi

  # Preview: the first ten lines that follow the header line.
  sed -n -e '2,11p' -- "$infile"

  apply_filters "$infile" "$outfile_rem" "$outfile_ret" "${filters[@]}"
}

if (($# > 0)); then
  main "$@"
else
  # Called without arguments: just show how to use the tool.
  print_usage
fi
