Mercurial > repos > yutaka-saito > bsfcall
comparison bin/maf-sort @ 0:06f8460885ff
migrate from GitHub
author | yutaka-saito |
---|---|
date | Sun, 19 Apr 2015 20:51:13 +0900 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:06f8460885ff |
---|---|
1 #! /bin/sh | |
2 | |
3 # Sort MAF-format alignments by sequence name, then strand, then start | |
4 # position, then end position, of the top sequence. Also, merge | |
5 # identical alignments. Comment lines starting with "#" are written | |
6 # at the top, in unchanged order. If option "-d" is specified, then | |
7 # alignments that appear only once are omitted (like uniq -d). | |
8 | |
9 # Minor flaws, that do not matter for typical MAF input: | |
10 # 1) It might not work if the input includes TABs. | |
11 # 2) Preceding whitespace is considered part of the sequence name. I | |
12 # want to use sort -b, but it seems to be broken in different ways for | |
13 # different versions of sort! | |
14 # 3) Alignments with differences in whitespace are considered | |
15 # non-identical. | |
16 | |
17 # This script uses perl instead of specialized commands like uniq. | |
18 # The reason is that, on some systems (e.g. Mac OS X), uniq doesn't | |
19 # work with long lines. | |
20 | |
21 # Make "sort" use a standard ordering: | |
22 LC_ALL=C | |
23 export LC_ALL | |
24 | |
25 uniqOpt=1 | |
26 whichSequence=1 | |
27 while getopts hdn: opt | |
28 do | |
29 case $opt in | |
30 h) cat <<EOF | |
31 Usage: $(basename $0) [options] my-alignments.maf | |
32 | |
33 Options: | |
34 -h show this help message and exit | |
35 -d only print duplicate alignments | |
36 -n sort by the n-th sequence (default: 1) | |
37 EOF | |
38 exit | |
39 ;; | |
40 d) uniqOpt=2 | |
41 ;; | |
42 n) whichSequence="$OPTARG" | |
43 ;; | |
44 esac | |
45 done | |
46 shift $((OPTIND - 1)) | |
47 | |
48 baseField=$((6 * $whichSequence)) | |
49 a=$(($baseField - 4)) | |
50 a=$a,$a | |
51 b=$(($baseField - 1)) | |
52 b=$b,$b | |
53 c=$(($baseField - 3)) | |
54 c=$c,$c | |
55 d=$(($baseField - 2)) | |
56 d=$d,$d | |
57 | |
58 # 1) Add digits to "#" lines, so that sorting won't change their order. | |
59 # 2) Replace spaces, except in "s" lines. | |
60 # 3) Join each alignment into one big line. | |
61 perl -pe ' | |
62 s/^#/sprintf("#%.9d",$c++)/e; | |
63 y/ /\a/ unless /^s/; | |
64 y/\n/\b/ if /^\w/; | |
65 ' "$@" | | |
66 | |
67 sort -k$a -k$b -k${c}n -k${d}n | # sort the lines | |
68 | |
69 # Print only the first (or second) of each run of identical lines: | |
70 perl -ne '$c = 0 if $x ne $_; $x = $_; print if ++$c == '$uniqOpt | | |
71 | |
72 # 1) Remove the digits from "#" lines. | |
73 # 2) Restore spaces and newlines. | |
74 perl -pe ' | |
75 s/^#.{9}/#/; | |
76 y/\a\b/ \n/; | |
77 ' |