Mercurial > repos > saskia-hiltemann > file_manipulation
view filter_columns.sh @ 0:e77c9484b2d0 draft default tip
Uploaded
author | saskia-hiltemann |
---|---|
date | Thu, 22 Oct 2015 09:18:30 -0400 |
parents | |
children |
line wrap: on
line source
#!/bin/bash
#
# filter_columns.sh - split a tab-separated file into "removed" and
# "retained" rows according to one or more column filters.
#
# Usage:
#   filter_columns.sh --infile FILE --outfile_rem FILE --outfile_ret FILE \
#                     [--filter COLUMN,OP,VALUE]...
#
# Options:
#   --infile FILE        tab-separated input file
#   --outfile_rem FILE   receives every row that matched a filter
#   --outfile_ret FILE   receives every row that matched no filter
#   --filter SPEC        COLUMN,OP,VALUE; may be repeated. COLUMN is a
#                        1-based field number (0 means the whole line).
#                        OP is one of:
#                          equals  nequals    string comparison with VALUE
#                          contains ncontains substring test for VALUE
#                          empty   nonempty   field is / is not empty
#                          lt le gt ge        numeric comparison with VALUE
#                        VALUE is everything after the second comma, so it
#                        may itself contain commas.
#   -h ARG               accepted and ignored (the original option string
#                        declared it as taking an argument; kept so existing
#                        callers do not break).
#
# Behaviour:
#   * Filters are applied in the order given; each filter only sees the rows
#     retained by the previous one, so removed rows are grouped per filter.
#   * The first line, blank lines and lines starting with '#', '<' or '>'
#     are treated as header/comment lines: they are never filtered and are
#     copied to both output files (once).
#   * Adjacent identical lines in each output file are collapsed (uniq).
#
# Requires util-linux (enhanced) getopt for long-option parsing.

set -eo pipefail

# Scratch directory; global so the EXIT trap can still see it after main
# has returned.
workdir=''

# Print a usage summary to stderr and exit with status 2.
usage() {
  cat >&2 <<EOF
Usage: ${0##*/} --infile FILE --outfile_rem FILE --outfile_ret FILE [--filter COLUMN,OP,VALUE]...

Operators: equals nequals contains ncontains empty nonempty lt le gt ge
Rows matching any filter go to --outfile_rem, all other rows to --outfile_ret.
EOF
  exit 2
}

# Print an error message to stderr and exit with status 1.
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

# Remove the scratch directory (installed as EXIT trap).
cleanup() {
  if [[ -n "$workdir" ]]; then
    rm -rf -- "$workdir"
  fi
}

# Run one filter pass.
# Arguments: $1 column, $2 operator, $3 value, $4 input file,
#            $5 file receiving retained rows (overwritten),
#            $6 file receiving removed rows (appended),
#            $7 1 on the first pass, 0 afterwards
# Header/comment lines are copied to the removed file only on the first
# pass; otherwise every later pass would append them again in the middle
# of the removed rows.
apply_filter() {
  local column=$1 op=$2 value=$3 src=$4 kept=$5 dropped=$6 first=$7

  # All values travel through the environment (ENVIRON) instead of being
  # pasted into the program text or passed with -v: that keeps quotes and
  # backslashes in user-supplied values literal.
  FILTER_COLUMN="$column" FILTER_OP="$op" FILTER_VALUE="$value" \
  FILTER_REMOVED="$dropped" FILTER_FIRST_PASS="$first" \
    awk '
      BEGIN {
        FS = "\t"
        OFS = "\t"
        col     = ENVIRON["FILTER_COLUMN"] + 0
        op      = ENVIRON["FILTER_OP"]
        # Concatenating "" forces string type, so equals/nequals compare
        # text ("1.0" is not "1"), as the original string literal did.
        val     = ENVIRON["FILTER_VALUE"] ""
        num     = val + 0
        removed = ENVIRON["FILTER_REMOVED"]
        first   = ENVIRON["FILTER_FIRST_PASS"] + 0
      }

      # Header, comment and blank lines bypass the filter entirely.
      FNR == 1 || /^[#<>]/ || NF == 0 {
        if (first) print $0 >> removed
        print $0
        next
      }

      {
        field = $col ""
        if      (op == "equals")    hit = (field == val)
        else if (op == "nequals")   hit = (field != val)
        else if (op == "contains")  hit = (index(field, val) != 0)
        else if (op == "ncontains") hit = (index(field, val) == 0)
        else if (op == "empty")     hit = (field == "")
        else if (op == "nonempty")  hit = (field != "")
        else if (op == "lt")        hit = (field + 0 <  num)
        else if (op == "le")        hit = (field + 0 <= num)
        else if (op == "gt")        hit = (field + 0 >  num)
        else if (op == "ge")        hit = (field + 0 >= num)
        else                        hit = 0

        if (hit) print $0 >> removed
        else     print $0
      }
    ' "$src" > "$kept"
}

main() {
  local infile='' outfile_rem='' outfile_ret=''
  local -a filters=()
  local parsed spec column op value
  local current removed next
  local pass=0

  # Must be checked before getopt runs: its output always contains "--".
  (( $# > 0 )) || usage

  # Echo the invocation, as the original script did, for the job log.
  printf '%s\n' "$*"

  # Quoted getopt output + eval keeps whitespace inside arguments intact.
  parsed=$(getopt -n "$0" -a \
    --longoptions 'infile:,outfile_rem:,outfile_ret:,filter:' \
    -o 'h:' -- "$@") || usage
  eval set -- "$parsed"

  while (( $# > 0 )); do
    case "$1" in
      --infile)
        infile=$2
        shift 2
        ;;
      --outfile_rem)
        outfile_rem=$2
        shift 2
        ;;
      --outfile_ret)
        outfile_ret=$2
        shift 2
        ;;
      --filter)
        filters+=("$2")
        shift 2
        ;;
      -h)
        shift 2
        ;;
      --)
        shift
        break
        ;;
      *)
        usage
        ;;
    esac
  done

  [[ -n "$infile" && -n "$outfile_rem" && -n "$outfile_ret" ]] || usage
  [[ -r "$infile" ]] || die "cannot read input file: $infile"

  # Private scratch space instead of fixed names in the current directory,
  # so leftovers from an earlier run can never leak into the output.
  workdir=$(mktemp -d) || die "cannot create temporary directory"
  trap cleanup EXIT

  current="$workdir/current"
  next="$workdir/next"
  removed="$workdir/removed"

  cp -- "$infile" "$current"
  : > "$removed"

  for spec in "${filters[@]}"; do
    pass=$((pass + 1))

    # The last variable receives the remainder, so VALUE may contain commas.
    IFS=',' read -r column op value <<< "$spec"

    printf 'filter %d: %s\n' "$pass" "$spec"
    printf 'column: %s\n' "$column"
    printf 'op : %s\n' "$op"
    printf 'value: %s\n' "$value"

    [[ "$column" =~ ^[0-9]+$ ]] \
      || die "filter $pass: column must be a non-negative integer, got '$column'"
    case "$op" in
      equals | nequals | contains | ncontains | empty | nonempty | lt | le | gt | ge) ;;
      *) die "filter $pass: unknown operator '$op'" ;;
    esac

    apply_filter "$column" "$op" "$value" \
      "$current" "$next" "$removed" "$(( pass == 1 ))"

    # The next filter only sees what this one retained.
    mv -- "$next" "$current"
  done

  # Collapse adjacent duplicate lines in both outputs.
  uniq < "$removed" > "$outfile_rem"
  uniq < "$current" > "$outfile_ret"
}

main "$@"