Mercurial > repos > saskia-hiltemann > file_manipulation
comparison filter_columns.sh @ 0:e77c9484b2d0 draft default tip
Uploaded
author | saskia-hiltemann |
---|---|
date | Thu, 22 Oct 2015 09:18:30 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e77c9484b2d0 |
---|---|
#!/bin/bash
#
# filter_columns.sh - split the rows of a tab-separated file into a "removed"
# file and a "retained" file according to one or more column filters.
#
# Usage:
#   filter_columns.sh --infile FILE --outfile_rem FILE --outfile_ret FILE \
#                     [--filter COLUMN,OP,VALUE]...
#
# Each --filter is "COLUMN,OP,VALUE":
#   COLUMN  1-based column number (tab-separated)
#   OP      equals | nequals | contains | ncontains | empty | nonempty |
#           lt | le | gt | ge
#   VALUE   comparison operand (may be empty for empty/nonempty; may itself
#           contain commas)
#
# A row matching a filter is written to --outfile_rem. Filters are applied in
# the order given, each one only to the rows retained by the previous one.
# Rows surviving every filter are written to --outfile_ret.
#
# The first line of the input, blank lines, and lines starting with '#', '<'
# or '>' are treated as header/comment lines: they are never filtered and are
# copied exactly once to BOTH output files.
#
# Requires the enhanced (util-linux) getopt for long-option parsing.

set -euo pipefail

# Print a message on stderr and abort.
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

# Print the usage text and exit with status $1 (default 2).
# Goes to stdout when the status is 0 (explicit -h), to stderr otherwise.
usage() {
  local status=${1:-2}
  local text
  text="Usage: ${0##*/} --infile FILE --outfile_rem FILE --outfile_ret FILE [--filter COLUMN,OP,VALUE]...
  OP is one of: equals nequals contains ncontains empty nonempty lt le gt ge"
  if (( status == 0 )); then
    printf '%s\n' "$text"
  else
    printf '%s\n' "$text" >&2
  fi
  exit "$status"
}

#######################################
# Run one filter over a file.
# Arguments:
#   $1 - column number
#   $2 - operator name
#   $3 - comparison value
#   $4 - input file
#   $5 - file that matching (removed) rows are appended to
#   $6 - file that non-matching (retained) rows are appended to
#   $7 - 1 if this is the first pass, else 0; header/comment lines are copied
#        to the removed file only on the first pass so they appear there once
#######################################
run_filter_pass() {
  local column=$1 op=$2 value=$3 input=$4 removed=$5 retained=$6 first_pass=$7

  # Operands are handed to awk with -v instead of being spliced into the
  # program text, so quotes in a value cannot break or inject awk code.
  awk -v column="$column" -v op="$op" -v value="$value" \
      -v removed="$removed" -v retained="$retained" \
      -v first_pass="$first_pass" '
    BEGIN {
      FS = "\t"
      OFS = "\t"
      col = column + 0
      # Force string semantics: a numeric-looking -v value would otherwise
      # make == and != compare numerically (so "1.0" would equal "1").
      text = value ""
      num = value + 0
    }

    # Header and comment lines: keep in both outputs, never filter them.
    FNR == 1 || /^[#<>]/ || NF == 0 {
      if (first_pass) print $0 >> removed
      print $0 >> retained
      next
    }

    {
      field = $col
      hit = 0
      if      (op == "equals")    hit = ((field "") == text)
      else if (op == "nequals")   hit = ((field "") != text)
      else if (op == "contains")  hit = (index(field, text) != 0)
      else if (op == "ncontains") hit = (index(field, text) == 0)
      else if (op == "empty")     hit = (field == "")
      else if (op == "nonempty")  hit = (field != "")
      else if (op == "lt")        hit = ((field + 0) <  num)
      else if (op == "le")        hit = ((field + 0) <= num)
      else if (op == "gt")        hit = ((field + 0) >  num)
      else if (op == "ge")        hit = ((field + 0) >= num)

      if (hit) print $0 >> removed
      else     print $0 >> retained
    }
  ' "$input"
}

# Echo the raw command line (kept from the original script as a run log).
printf '%s\n' "$*"

# Must be tested before getopt rewrites the arguments: its output always
# contains at least "--", so a later "$# -eq 0" test could never be true.
(( $# > 0 )) || usage 2

infile=''
outfile_rem=''
outfile_ret=''
filters=()

# Quoted (non -u) getopt output keeps option values containing whitespace
# intact. "eval set --" is the idiom documented in getopt(1); the string
# being evaluated is shell-quoted by getopt itself.
parsed=$(getopt -n "$0" -a \
  --longoptions 'infile:,outfile_rem:,outfile_ret:,filter:' \
  -o 'h' -- "$@") || usage 2
eval set -- "$parsed"

while (( $# > 0 )); do
  case "$1" in
    --infile)
      infile=$2
      shift 2
      ;;
    --outfile_rem)
      outfile_rem=$2
      shift 2
      ;;
    --outfile_ret)
      outfile_ret=$2
      shift 2
      ;;
    --filter)
      filters+=("$2")
      shift 2
      ;;
    -h)
      usage 0
      ;;
    --)
      shift
      break
      ;;
    *)
      usage 2
      ;;
  esac
done

[[ -n "$infile" ]] || die "--infile is required"
[[ -n "$outfile_rem" ]] || die "--outfile_rem is required"
[[ -n "$outfile_ret" ]] || die "--outfile_ret is required"
[[ -r "$infile" ]] || die "cannot read input file: $infile"

# Private scratch directory, so leftovers from an earlier run in the same
# working directory can never leak into this run's output.
workdir=$(mktemp -d "${TMPDIR:-/tmp}/filter_columns.XXXXXX") \
  || die "cannot create temporary directory"
trap 'rm -rf -- "$workdir"' EXIT

current=$workdir/current      # rows still retained after the passes so far
next_pass=$workdir/next       # retained rows being produced by this pass
removed=$workdir/removed      # rows removed by any pass

# Preview of the first data lines (input without its header line), kept
# from the original script as a run log.
awk 'NR > 11 { exit } NR > 1' "$infile"

cp -- "$infile" "$current"
: > "$removed"

count=1
first_pass=1
if (( ${#filters[@]} > 0 )); then
  for f in "${filters[@]}"; do
    printf 'filter %s: %s\n' "$count" "$f"
    count=$(( count + 1 ))

    # The last variable receives the rest of the string, so a VALUE that
    # itself contains commas is preserved.
    column='' op='' value=''
    IFS=',' read -r column op value <<< "$f"

    printf 'column: %s\n' "$column"
    printf 'op : %s\n' "$op"
    printf 'value: %s\n' "$value"

    case "$op" in
      equals | nequals | contains | ncontains | empty | nonempty | lt | le | gt | ge) ;;
      *)
        # Unknown operators remove nothing (as before); make that visible.
        printf '%s: warning: unknown operator "%s" in filter "%s"; it removes no rows\n' \
          "${0##*/}" "$op" "$f" >&2
        ;;
    esac

    : > "$next_pass"
    run_filter_pass "$column" "$op" "$value" \
      "$current" "$removed" "$next_pass" "$first_pass"

    # The next filter only sees the rows retained by this one.
    mv -- "$next_pass" "$current"
    first_pass=0
  done
fi

# Header lines are now written exactly once per output, so no "uniq"
# post-processing is needed. It used to hide duplicated headers but also
# dropped genuine adjacent duplicate data rows.
cat -- "$removed" > "$outfile_rem"
cat -- "$current" > "$outfile_ret"
116 |