#! /bin/sh

# Sort MAF-format alignments by sequence name, then strand, then start
# position, then end position, of the top sequence (or of the n-th
# sequence, if option "-n" is given). Also, merge identical
# alignments. Comment lines starting with "#" are written at the top,
# in unchanged order. If option "-d" is specified, then alignments
# that appear only once are omitted (like uniq -d).

# Minor flaws, that do not matter for typical MAF input:
# 1) It might not work if the input includes TABs.
# 2) Preceding whitespace is considered part of the sequence name. I
# want to use sort -b, but it seems to be broken in different ways for
# different versions of sort!
# 3) Alignments with differences in whitespace are considered
# non-identical.

# This script uses perl instead of specialized commands like uniq.
# The reason is that, on some systems (e.g. Mac OS X), uniq doesn't
# work with long lines.

# Make "sort" use a standard ordering: the C locale is selected so that
# the result does not depend on the locale settings of whoever runs
# this script.  The variable is exported so that the commands started
# below inherit it.
LC_ALL=C
export LC_ALL

# Parse the command-line options.
#
# Sets:
#   uniqOpt        1 = print the first line of each run of identical
#                  lines (merge duplicates); 2 = print the second line
#                  of each run (so only duplicated alignments appear).
#   whichSequence  which sequence of each alignment to sort by
#                  (1 = the top sequence).
# Afterwards the options are shifted away, so that "$@" holds only the
# input file names (if any).
#
# Exits with status 0 after printing the help message, and with
# status 1 on an unknown option, a missing option argument, or a "-n"
# value that is not a positive decimal integer.
progName=$(basename "$0")
uniqOpt=1
whichSequence=1
while getopts hdn: opt
do
    case $opt in
        h)  cat <<EOF
Usage: $progName [options] my-alignments.maf

Options:
  -h  show this help message and exit
  -d  only print duplicate alignments
  -n  sort by the n-th sequence (default: 1)
EOF
            exit
            ;;
        d)  uniqOpt=2
            ;;
        n)  # The value is later used in shell arithmetic and in "sort -k"
            # field numbers, so accept only a positive integer.  A leading
            # zero is rejected too: it would be read as octal.
            case $OPTARG in
                '' | *[!0-9]* | 0*)
                    printf '%s: option -n needs a positive integer, not "%s"\n' \
                        "$progName" "$OPTARG" >&2
                    exit 1
                    ;;
            esac
            whichSequence="$OPTARG"
            ;;
        *)  # Unknown option or missing argument: getopts has already
            # written a diagnostic, so just point at the help and stop.
            printf 'Try "%s -h" for more information.\n' "$progName" >&2
            exit 1
            ;;
    esac
done
shift $((OPTIND - 1))

# Work out which whitespace-separated fields "sort" should use.
#
# Each sequence is taken to occupy 6 fields of the joined-up line that
# is sorted, so the fields of the n-th sequence are found by counting
# back from field number 6 * n.

# Print a "sort -k" field range "F,F" that restricts one sort key to
# the single field F = 6 * $1 - $2.
#   $1 - which sequence to sort by (1 = the top sequence)
#   $2 - how many fields to count back from field 6 * $1
fieldKey() {
    printf '%s,%s\n' "$((6 * $1 - $2))" "$((6 * $1 - $2))"
}

# The comments below assume the standard layout of a MAF "s" line
# (s, name, start, size, strand, sequence length, alignment text).
a=$(fieldKey "$whichSequence" 4)  # sequence name
b=$(fieldKey "$whichSequence" 1)  # strand
c=$(fieldKey "$whichSequence" 3)  # start position
d=$(fieldKey "$whichSequence" 2)  # size (orders by end position, for equal starts)

# The main pipeline: flatten each alignment into one line, sort the
# lines, drop (or keep only) repeats, then un-flatten.  Input comes
# from the files named in "$@", or from standard input if there are
# none; output goes to standard output.  The sort keys $a, $b, $c, $d
# and the setting $uniqOpt must already have been set.

# 1) Add digits to "#" lines, so that sorting won't change their order.
# 2) Replace spaces, except in "s" lines.
# 3) Join each alignment into one big line.
# (Spaces become \a and newlines become \b, which are assumed not to
# occur in the input.  Only lines starting with a word character are
# joined to the next line, so "#" lines and the blank line that ends
# an alignment keep their newlines.)
perl -pe '
s/^#/sprintf("#%.9d",$c++)/e;
y/ /\a/ unless /^s/;
y/\n/\b/ if /^\w/;
' "$@" |

# Sort the lines.  Name and strand are compared as text, start and size
# as numbers.
sort -k"$a" -k"$b" -k"${c}n" -k"${d}n" |

# Print only the first (or second) of each run of identical lines:
perl -ne '$c = 0 if $x ne $_; $x = $_; print if ++$c == '"$uniqOpt" |

# 1) Remove the digits from "#" lines.
# 2) Restore spaces and newlines.
perl -pe '
s/^#.{9}/#/;
y/\a\b/ \n/;
'