Mercurial > repos > yutaka-saito > bsfcall
diff bin/maf-sort @ 0:06f8460885ff
migrate from GitHub
author | yutaka-saito |
---|---|
date | Sun, 19 Apr 2015 20:51:13 +0900 |
parents | |
children |
line wrap: on
line diff
#! /bin/sh

# Sort MAF-format alignments by sequence name, then strand, then start
# position, then end position, of the top sequence.  Also, merge
# identical alignments.  Comment lines starting with "#" are written
# at the top, in unchanged order.  If option "-d" is specified, then
# alignments that appear only once are omitted (like uniq -d).
#
# Usage: maf-sort [-h] [-d] [-n N] [my-alignments.maf ...]
#   -h    show a help message and exit
#   -d    only print duplicate alignments
#   -n N  sort by the N-th sequence of each alignment (default: 1);
#         N must be a positive decimal integer
# Input is read from the named files, or from standard input if none
# are given; the result is written to standard output.  Exit status is
# 2 for a usage error.
#
# NOTE: with "-d", "#" lines are omitted as well: each of them is
# tagged with a distinct counter before duplicates are detected, so no
# "#" line is ever seen twice.

# Minor flaws, that do not matter for typical MAF input:
# 1) It might not work if the input includes TABs.
# 2) Preceding whitespace is considered part of the sequence name.  I
# want to use sort -b, but it seems to be broken in different ways for
# different versions of sort!
# 3) Alignments with differences in whitespace are considered
# non-identical.

# This script uses perl instead of specialized commands like uniq.
# The reason is that, on some systems (e.g. Mac OS X), uniq doesn't
# work with long lines.

# Make "sort" use a standard ordering:
LC_ALL=C
export LC_ALL

# Quote $0 so that a script path containing whitespace still works.
progName=$(basename "$0")

# Print the help text on standard output.
usage() {
    cat <<EOF
Usage: $progName [options] my-alignments.maf

Options:
  -h  show this help message and exit
  -d  only print duplicate alignments
  -n  sort by the n-th sequence (default: 1)
EOF
}

# Report a usage error on standard error and stop.
die() {
    echo "$progName: $*" >&2
    exit 2
}

uniqOpt=1        # which occurrence in a run of identical lines to print
whichSequence=1  # 1-based index of the sequence to sort by
while getopts hdn: opt
do
    case $opt in
        h)  usage
            exit
            ;;
        d)  uniqOpt=2
            ;;
        n)  whichSequence="$OPTARG"
            ;;
        \?) # Unknown option or missing argument: getopts has already
            # printed a diagnostic, so just show the usage and stop
            # instead of silently carrying on with default settings.
            usage >&2
            exit 2
            ;;
    esac
done
shift $((OPTIND - 1))

# The -n value is used in shell arithmetic and in "sort -k" keys below,
# so insist on a plain positive decimal integer.
case $whichSequence in
    ''|*[!0-9]*)
        die "option -n needs a positive integer, not '$whichSequence'"
        ;;
esac
# Drop leading zeros: otherwise the shell would read the number as octal
# (and reject values such as 08).
while :
do
    case $whichSequence in
        0?*) whichSequence=${whichSequence#0} ;;
        *)   break ;;
    esac
done
[ "$whichSequence" -ge 1 ] || die "option -n needs a positive integer, not 0"

# After the first perl step below, each alignment is one long line in
# which only the "s" lines still contain spaces.  Every "s" line has 7
# space-separated columns (s, name, start, size, strand, sequence
# length, alignment text), and the last column of one "s" line is glued
# to the leading "s" of the next, so each sequence adds 6 fields.  For
# the n-th sequence: field 6n-4 is the name, 6n-3 the start, 6n-2 the
# size and 6n-1 the strand.  (Among equal starts, ordering by size is
# the same as ordering by end position.)
baseField=$((6 * whichSequence))
nameKey=$((baseField - 4))
nameKey=$nameKey,$nameKey
strandKey=$((baseField - 1))
strandKey=$strandKey,$strandKey
startKey=$((baseField - 3))
startKey=$startKey,$startKey
sizeKey=$((baseField - 2))
sizeKey=$sizeKey,$sizeKey

# 1) Add digits to "#" lines, so that sorting won't change their order.
# 2) Replace spaces, except in "s" lines.
# 3) Join each alignment into one big line.
perl -pe '
s/^#/sprintf("#%.9d",$c++)/e;
y/ /\a/ unless /^s/;
y/\n/\b/ if /^\w/;
' "$@" |

sort -k"$nameKey" -k"$strandKey" -k"${startKey}n" -k"${sizeKey}n" |  # sort the lines

# Print only the first (or second) of each run of identical lines:
perl -ne '$c = 0 if $x ne $_; $x = $_; print if ++$c == '"$uniqOpt" |

# 1) Remove the digits from "#" lines.
# 2) Restore spaces and newlines.
perl -pe '
s/^#.{9}/#/;
y/\a\b/ \n/;
'