diff bin/maf-sort @ 0:06f8460885ff

migrate from GitHub
author yutaka-saito
date Sun, 19 Apr 2015 20:51:13 +0900
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/maf-sort	Sun Apr 19 20:51:13 2015 +0900
@@ -0,0 +1,77 @@
+#! /bin/sh
+
+# Sort MAF-format alignments by sequence name, then strand, then start
+# position, then end position, of the top sequence (or the n-th one, with
+# option "-n").  Also, merge identical alignments.  Comment lines starting
+# with "#" are written at the top, in unchanged order.  If option "-d" is
+# specified, alignments that appear only once are omitted (like uniq -d).
+
+# Minor flaws that do not matter for typical MAF input:
+# 1) It might not work if the input includes TABs.
+# 2) Preceding whitespace is considered part of the sequence name.  I
+# want to use sort -b, but it seems to be broken in different ways for
+# different versions of sort!
+# 3) Alignments with differences in whitespace are considered
+# non-identical.
+
+# This script uses perl instead of specialized commands like uniq.
+# The reason is that, on some systems (e.g. Mac OS X), uniq doesn't
+# work with long lines.
+
+# Make "sort" use a standard ordering:
+LC_ALL=C
+export LC_ALL
+
+uniqOpt=1
+whichSequence=1
+while getopts hdn: opt
+do
+    case $opt in
+	h)  cat <<EOF
+Usage: $(basename $0) [options] my-alignments.maf
+
+Options:
+  -h  show this help message and exit
+  -d  only print duplicate alignments
+  -n  sort by the n-th sequence (default: 1)
+EOF
+	    exit
+	    ;;
+	d)  uniqOpt=2
+            ;;
+	n)  whichSequence="$OPTARG"
+	    ;;
+    esac
+done
+shift $((OPTIND - 1))
+
+baseField=$((6 * $whichSequence))
+a=$(($baseField - 4))
+a=$a,$a
+b=$(($baseField - 1))
+b=$b,$b
+c=$(($baseField - 3))
+c=$c,$c
+d=$(($baseField - 2))
+d=$d,$d
+
+# 1) Add digits to "#" lines, so that sorting won't change their order.
+# 2) Replace spaces, except in "s" lines.
+# 3) Join each alignment into one big line.
+perl -pe '
+s/^#/sprintf("#%.9d",$c++)/e;
+y/ /\a/ unless /^s/;
+y/\n/\b/ if /^\w/;
+' "$@" |
+
+sort -k$a -k$b -k${c}n -k${d}n |  # sort the lines
+
+# Print only the first (or second) of each run of identical lines:
+perl -ne '$c = 0 if $x ne $_; $x = $_; print if ++$c == '$uniqOpt |
+
+# 1) Remove the digits from "#" lines.
+# 2) Restore spaces and newlines.
+perl -pe '
+s/^#.{9}/#/;
+y/\a\b/ \n/;
+'