# HG changeset patch
# User saskia-hiltemann
# Date 1445519910 14400
# Node ID e77c9484b2d06563f90b307a3ff2de15f1a3fe4c
Uploaded
diff -r 000000000000 -r e77c9484b2d0 chrprefix.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/chrprefix.sh Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,61 @@
+#!/bin/bash
+#
+# chrprefix.sh - add or remove the "chr" prefix in one column of a
+# tab-delimited file.
+#
+# Usage: chrprefix.sh <inputfile> <column> <addremove> <outputfile>
+#   inputfile   tab-delimited input file
+#   column      column selector; only its leading digits are used as the
+#               column number (e.g. "3" or "3: chrom")
+#   addremove   "add" prepends "chr"; any other value removes the prefix
+#   outputfile  file the converted table is written to
+#
+# Lines starting with "#" are copied through unchanged.
+
+inputfile=$1
+col=$2
+addremove=$3
+outputfile=$4
+
+echo "args: $*"
+echo "inputfile: $inputfile"
+# Report the selector as passed in; the parsed number is printed below.
+echo "column: $col"
+echo "addremove: $addremove"
+echo "outputfile: $outputfile"
+
+# Keep only the leading digits of the selector as the column number.
+column=${col%%[!0-9]*}
+echo "colnumber: $column"
+
+if [[ "$addremove" == "add" ]]; then
+  echo "adding prefix to column $column"
+  awk -v c="$column" 'BEGIN{
+      FS="\t"
+      OFS="\t"
+    }{
+      if (index($0,"#")!=1){
+        $c="chr"$c
+      }
+      print $0
+    }' "$inputfile" > "$outputfile"
+else # remove prefix
+  echo "removing prefix from column $column"
+  awk -v c="$column" 'BEGIN{
+      FS="\t"
+      OFS="\t"
+    }{
+      # Line 1 is skipped here. Only a real "chr" prefix is stripped, so
+      # other values no longer lose their first three characters.
+      if (FNR>1 && index($0,"#")!=1 && index($c,"chr")==1){
+        $c=substr($c,4)
+      }
+      print $0
+    }' "$inputfile" > "$outputfile"
+fi
+
+echo "inputfile: "
+head -n 5 "$inputfile"
+
+echo "outputfile: "
+head -n 5 "$outputfile"
diff -r 000000000000 -r e77c9484b2d0 chrprefix.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/chrprefix.xml Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,47 @@
+
+
+ add or remove chr prefix from a column
+
+
+
+
+ chrprefix.sh
+ $infile
+ "${go.column}"
+ $addremove
+ $outputfile
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+Removes or adds the "chr" prefix in a column of a file. Some tools expect you to indicate chromosomes as "chr1,chr2,chrX", while others expect only 1,2,X as input. This tool allows you to easily switch notations.
+
+
+
diff -r 000000000000 -r e77c9484b2d0 column_extract.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/column_extract.sh Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,62 @@
+#!/bin/bash
+#
+# column_extract.sh - extract/rearrange columns of a tab-delimited file.
+#
+# Usage: column_extract.sh <inputfile> <outputfile> <removeheader> <col>...
+#   inputfile     tab-delimited input file
+#   outputfile    file the selected columns are written to
+#   removeheader  "N" keeps header lines (first field empty or starting
+#                 with "#"); any other value drops them
+#   col...        column selectors in output order; after word splitting
+#                 each one must be two words, "<number>:" and a label
+
+inputfile=$1
+outputfile=$2
+removeheader=$3
+columns="$*"
+
+# Debug output: the selector words (everything after the first three
+# arguments), comma-separated and with the colons removed.
+cols="${columns// /,}"
+for _ in 1 2 3; do
+  cols="${cols#*,}"
+done
+cols="${cols//:/}"
+echo "colums to print: $cols"
+
+for x in ${cols//,/ }; do
+  echo "$x"
+done
+
+# Every selector is a "<number>:" word followed by a label word, so the
+# column numbers sit at indices 3, 5, 7, ... of the word list.
+read -r -a words <<< "$columns"
+len=${#words[@]}
+mycols=""
+echo "len: $len"
+for (( i = 3; i < len; i += 2 )); do
+  echo "myarray: ${words[i]}"
+  mycols+=${words[i]}
+done
+mycols="${mycols//:/,}" # "1:3:" -> "1,3,"
+mycols="${mycols%,}"    # drop the trailing comma
+echo "mycols: $mycols"
+
+awk -v columns="$mycols" -v removeheader="$removeheader" 'BEGIN{
+    FS="\t";
+    OFS="\t";
+    len=split(columns,arr,",")
+  }{
+    if (index($1,"#")==1 || $1==""){ # print header as-is
+      if (removeheader=="N"){
+        print $0
+      }
+    }
+    else{
+      # selected fields in the requested order, tab-separated
+      for (i=1;i<len;i++){
+        printf "%s\t", $(arr[i])
+      }
+      print $(arr[len])
+    }
+  }' "$inputfile" > "$outputfile"
diff -r 000000000000 -r e77c9484b2d0 column_extract.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/column_extract.xml Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,50 @@
+
+
+ extract/rearrange columns from a tab-delimited file
+
+
+
+ column_extract.sh
+ $infile
+ $outputfile
+ $removeheader
+ #for $c in $go.columns
+ ${c.column}
+ #end for
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+Outputs columns of input file in order specified by user. Columns not selected will not be output.
+
+
+
diff -r 000000000000 -r e77c9484b2d0 concatenate.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/concatenate.sh Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,8 @@
+#!/bin/bash
+#
+# concatenate.sh - write the contents of two files, in order, to a third.
+#
+# Usage: concatenate.sh <first> <second> <outputfile>
+
+# Quoted so that paths containing whitespace or glob characters stay intact.
+cat -- "$1" "$2" > "$3"
diff -r 000000000000 -r e77c9484b2d0 concatenate.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/concatenate.xml Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,31 @@
+
+
+ concatenate 2 files
+
+
+ concatenate.sh
+ $infile
+ $infile2
+ $outputfile
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+Concatenates 2 files
+
+
+
diff -r 000000000000 -r e77c9484b2d0 filter_columns.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_columns.sh Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,125 @@
+#!/bin/bash
+#
+# filter_columns.sh - split a tab-delimited file into removed and retained
+# lines using one or more column filters.
+#
+# Usage: filter_columns.sh --infile FILE --outfile_rem FILE \
+#                          --outfile_ret FILE [--filter COL,OP,VALUE]...
+#   --infile       tab-delimited input file
+#   --outfile_rem  receives the lines matched by a filter
+#   --outfile_ret  receives the lines matched by no filter
+#   --filter       COL is a column number, VALUE the operand and OP one of
+#                  equals nequals contains ncontains empty nonempty
+#                  lt le gt ge. Filters run in the order given, each on
+#                  the lines the previous one retained.
+#
+# Line 1, empty lines and lines starting with "#", "<" or ">" count as
+# header lines: they are not filtered and go to both output files.
+
+usage() {
+  echo "usage: $0 --infile FILE --outfile_rem FILE --outfile_ret FILE" \
+    "[--filter COL,OP,VALUE]..." >&2
+  exit 1
+}
+
+echo "$*"
+
+[ $# -eq 0 ] && usage
+
+# getopt without -u emits shell-quoted words, so option values containing
+# whitespace survive the eval/set round trip.
+parsed=$(getopt -n "$0" -a \
+  --longoptions "infile:,outfile_rem:,outfile_ret:,filter:" \
+  -o "h:" -- "$@") || usage
+eval set -- "$parsed"
+
+filters=()
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --infile) infile=$2; shift ;;
+    --outfile_rem) outfile_rem=$2; shift ;;
+    --outfile_ret) outfile_ret=$2; shift ;;
+    --filter) filters+=("$2"); shift ;;
+    -h) shift ;;
+    --) shift; break ;;
+    -*) usage ;;
+    *) break ;;
+  esac
+  shift
+done
+
+[[ -n "$infile" && -n "$outfile_rem" && -n "$outfile_ret" ]] || usage
+
+# Private scratch directory instead of fixed file names in the working
+# directory, so stale files from an earlier run cannot leak into the output.
+workdir=$(mktemp -d) || exit 1
+trap 'rm -rf -- "$workdir"' EXIT
+
+current="$workdir/current"   # lines still retained after the last filter
+removed="$workdir/removed"   # lines removed so far
+retained="$workdir/retained" # output of the filter being applied
+cp -- "$infile" "$current" || exit 1
+: > "$removed"
+
+# Debug output: first lines of the input after line 1.
+tail -n +2 "$current" | head
+
+count=1
+for f in "${filters[@]}"; do
+  echo "filter $count: $f"
+
+  # VALUE is read last, so it may itself contain commas.
+  IFS=',' read -r column op value <<< "$f"
+
+  echo "column: $column"
+  echo "op : $op"
+  echo "value: $value"
+
+  : > "$retained"
+  awk -v col="$column" -v op="$op" -v value="$value" -v pass="$count" \
+      -v removed="$removed" -v retained="$retained" '
+    BEGIN {
+      FS = "\t"
+      OFS = "\t"
+      text = value ""    # string operand for the textual operators
+      number = value + 0 # numeric operand for lt/le/gt/ge
+    }
+    # Header lines skip the filter. They reach the removed file on the
+    # first pass only; later passes would just repeat them there.
+    FNR == 1 || index($0, "#") == 1 || index($0, "<") == 1 ||
+    index($0, ">") == 1 || NF == 0 {
+      if (pass == 1) {
+        print $0 >> removed
+      }
+      print $0 >> retained
+      next
+    }
+    {
+      field = $col ""
+      if (op == "equals") hit = (field == text)
+      else if (op == "nequals") hit = (field != text)
+      else if (op == "contains") hit = (index(field, text) != 0)
+      else if (op == "ncontains") hit = (index(field, text) == 0)
+      else if (op == "empty") hit = (field == "")
+      else if (op == "nonempty") hit = (field != "")
+      else if (op == "lt") hit = (field + 0 < number)
+      else if (op == "le") hit = (field + 0 <= number)
+      else if (op == "gt") hit = (field + 0 > number)
+      else if (op == "ge") hit = (field + 0 >= number)
+      else hit = 0
+
+      if (hit) {
+        print $0 >> removed
+      } else {
+        print $0 >> retained
+      }
+    }' "$current"
+
+  # The next filter only sees the lines this one retained.
+  mv -- "$retained" "$current"
+  count=$((count + 1))
+done
+
+# Collapse adjacent identical lines in both outputs, as before.
+uniq "$removed" > "$outfile_rem"
+uniq "$current" > "$outfile_ret"
diff -r 000000000000 -r e77c9484b2d0 filter_columns.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_columns.xml Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,53 @@
+
+
+ filter file based on column values
+
+
+ filter_columns.sh
+ --infile $infile
+ #for $f in $filters
+ --filter ${f.column},${f.condition},${f.value}
+ #end for
+ --outfile_rem $outputfile_removed
+ --outfile_ret $outputfile_retained
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
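+Splits a tab-delimited file in two based on column filters: lines matching a filter go to the removed output, all other lines go to the retained output.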
+
+
+
diff -r 000000000000 -r e77c9484b2d0 getcolumnnames.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/getcolumnnames.py Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,36 @@
+import os, sys
+import fnmatch
+import csv
+
+def get_headers(inputfile):
+    """Build column options from the header line of a tab-delimited file.
+
+    The header line is the first line that is neither blank nor starts
+    with "#".
+
+    Args:
+        inputfile: object whose get_file_name() returns the path to read.
+
+    Returns:
+        A list of [label, label, False] entries, one per tab-separated
+        header field, where label is "<1-based index>: <field>". The list
+        is empty when the file cannot be opened or has no header line.
+    """
+    columnList = []
+    filename = inputfile.get_file_name()
+    try:
+        # "with" closes the file; iterating stops cleanly at end of file
+        # instead of indexing into the empty string readline() returns.
+        with open(filename) as f:
+            for line in f:
+                if line.startswith('#') or not line.strip():
+                    continue
+                for i, col in enumerate(line.strip().split("\t"), 1):
+                    label = str(i) + ': ' + str(col)
+                    columnList.append([label, label, False])
+                break
+    except IOError:
+        # best effort: an unreadable file yields no options
+        pass
+
+    return columnList
diff -r 000000000000 -r e77c9484b2d0 sort_chromosomal_position.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sort_chromosomal_position.sh Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,35 @@
+#!/bin/bash
+#
+# sort_chromosomal_position.sh - sort a file by chromosome, then by
+# position, leaving its leading header lines in place.
+#
+# Usage: sort_chromosomal_position.sh <infile> <chrcol> <startcol> \
+#                                     <endcol> <num_headerlines> <outfile>
+#   infile           file to sort
+#   chrcol           column with the chromosome (version sort)
+#   startcol         column with the start position (numeric sort)
+#   endcol           accepted but not used for sorting
+#   num_headerlines  number of leading lines copied through unsorted
+#   outfile          file the result is written to
+
+infile=$1
+chrcol=$2
+startcol=$3
+num_headerlines=${5:-0}
+outfile=$6
+
+# Build the result in a private temp file rather than fixed file names in
+# the working directory; it is removed again on exit.
+tmpout=$(mktemp) || exit 1
+trap 'rm -f -- "$tmpout"' EXIT
+
+# Header first, then the sorted body. "tail -n +K" starts at line K, so a
+# header count of 0 keeps every line in the body (GNU sed "1,0 d" would
+# still delete line 1).
+{
+  head -n "$num_headerlines" -- "$infile"
+  tail -n "+$((num_headerlines + 1))" -- "$infile" \
+    | sort -k "${chrcol},${chrcol}V" -k "${startcol},${startcol}n"
+} > "$tmpout"
+
+cat -- "$tmpout" > "$outfile"
diff -r 000000000000 -r e77c9484b2d0 sort_chromosomal_position.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sort_chromosomal_position.xml Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,31 @@
+
+ sort file by chromosome, then by position
+
+ sort_chromosomal_position.sh $infile $chrcol $startcol $endcol $num_headerlines $sorted_file
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r e77c9484b2d0 strip_header.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/strip_header.sh Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,28 @@
+#!/bin/bash
+#
+# strip_header.sh - split a file into its comment header and its body.
+#
+# Usage: strip_header.sh <inputfile> <outputfile> <header> <commentchar>
+#   inputfile    file to split
+#   outputfile   receives the lines that neither start with commentchar
+#                nor are empty
+#   header       receives the lines that start with commentchar
+#   commentchar  comment prefix; it is inserted into a sed regular
+#                expression, so regex metacharacters keep their meaning
+
+inputfile=$1
+outputfile=$2
+header=$3
+commentchar=$4
+
+echo "commentchar: -${commentchar}-"
+
+if [[ -z "$commentchar" ]]; then
+  # An empty prefix would turn the pattern into /^/, which matches every
+  # line; treat it as "no comment lines" instead.
+  sed -e '/^$/d' -- "$inputfile" > "$outputfile"
+  : > "$header"
+else
+  sed -e "/^${commentchar}/d" -e '/^$/d' -- "$inputfile" > "$outputfile"
+  sed -n "/^${commentchar}/p" -- "$inputfile" > "$header"
+fi
diff -r 000000000000 -r e77c9484b2d0 strip_header.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/strip_header.xml Thu Oct 22 09:18:30 2015 -0400
@@ -0,0 +1,35 @@
+