view bin/maf-sort @ 0:06f8460885ff

migrate from GitHub
author yutaka-saito
date Sun, 19 Apr 2015 20:51:13 +0900
parents
children
line wrap: on
line source

#! /bin/sh

# Sort MAF-format alignments by sequence name, then strand, then start
# position, then end position, of the top sequence (or of the n-th
# sequence, if option "-n" is specified).  Also, merge identical
# alignments.  Comment lines starting with "#" are written at the top,
# in unchanged order.  If option "-d" is specified, then alignments
# that appear only once are omitted (like uniq -d).

# Minor flaws that do not matter for typical MAF input:
# 1) It might not work if the input includes TABs.
# 2) Preceding whitespace is considered part of the sequence name.  I
# want to use sort -b, but it seems to be broken in different ways for
# different versions of sort!
# 3) Alignments with differences in whitespace are considered
# non-identical.

# This script uses perl instead of specialized commands like uniq.
# The reason is that, on some systems (e.g. Mac OS X), uniq doesn't
# work with long lines.

# Run every command below (in particular "sort") in the C locale, so
# that the ordering is the same standard one on every system:
export LC_ALL=C

# Defaults, which the command-line options below may override:
#   uniqOpt        which line of each run of identical alignments is
#                  printed: 1 = the first (each alignment once),
#                  2 = the second (only alignments seen at least twice).
#   whichSequence  1-based index of the sequence to sort by.
uniqOpt=1
whichSequence=1

# Print the help message to standard output.
usage() {
    cat <<EOF
Usage: $(basename "$0") [options] my-alignments.maf

Options:
  -h  show this help message and exit
  -d  only print duplicate alignments
  -n  sort by the n-th sequence (default: 1)
EOF
}

# Succeed if $1 is a positive decimal integer without leading zeros.
# Leading zeros are refused because shell arithmetic may read such a
# number as octal (so "08" would be an error).
isPositiveInteger() {
    case $1 in
        '' | 0* | *[!0-9]*) return 1 ;;
    esac
    return 0
}

while getopts hdn: opt
do
    case $opt in
        h)  usage
            exit
            ;;
        d)  uniqOpt=2
            ;;
        n)  # Validate before the value reaches the arithmetic below.
            if ! isPositiveInteger "$OPTARG"
            then
                printf '%s: option -n requires a positive integer, got "%s"\n' \
                    "$(basename "$0")" "$OPTARG" >&2
                exit 2
            fi
            whichSequence=$OPTARG
            ;;
        *)  # Unknown option, or -n without a value: getopts has already
            # printed a diagnostic, so show the usage and stop instead of
            # silently sorting with the default settings.
            usage >&2
            exit 2
            ;;
    esac
done
shift $((OPTIND - 1))

# Field numbers of the sort keys inside each joined one-line alignment.
# Assuming the usual 7-column MAF "s" line, every sequence adds 6
# space-separated fields to the joined line, hence the factor of 6.
# Each key is written as "N,N" so that sort looks at that one field only.
baseField=$((6 * $whichSequence))

a=$(($baseField - 4))    # 1st key: sequence name
a=$a,$a

b=$(($baseField - 1))    # 2nd key: strand
b=$b,$b

c=$(($baseField - 3))    # 3rd key: start position (sorted numerically)
c=$c,$c

d=$(($baseField - 2))    # 4th key: decides the end-position order (numeric)
d=$d,$d

# Stage 1: turn every alignment into a single sortable line.
#   - Put a 9-digit serial number after the "#" of each comment line, so
#     that sorting leaves the comment lines in their input order.
#   - On every line except "s" lines, swap each space for a "\a" character.
#   - On every line that starts with a word character, swap the newline
#     for a "\b" character, which glues each alignment into one big line.
perl -pe '
s/^#/sprintf("#%.9d", $serial++)/e;
tr/ /\a/ unless /^s/;
tr/\n/\b/ if /^\w/;
' "$@" |

# Stage 2: sort the joined lines by the four key fields; the last two
# keys are compared numerically.
sort -k "$a" -k "$b" -k "${c}n" -k "${d}n" |

# Stage 3: from each run of identical lines, keep only the line whose
# position within the run equals $uniqOpt (the first, or the second):
perl -ne '$seen = 0 unless $prev eq $_; $prev = $_; print if ++$seen == '"$uniqOpt" |

# Stage 4: undo stage 1.
#   - Strip the 9 serial-number characters from the "#" lines.
#   - Turn "\a" back into a space and "\b" back into a newline.
perl -pe '
s/^#.{9}/#/;
tr/\a\b/ \n/;
'