#!/bin/sh
#
# CHECK-entries.sh
# Marco van Zwetselaar <zwets@kcri.ac.tz>
#
# Checks that each sequence in the *.fsa files has an entry in phenotypes.txt,
# and that every entry in phenotypes.txt has a corresponding sequence in an
# *.fsa file.
#
# Writes to stdout the lists of missing entries, as well as close matches
# based on allele ID or accession.
#

# Echo all identifiers from the phenotypes.txt file, one per line.
#
# Reads phenotypes.txt from the current directory.  The first line is skipped,
# the first tab-separated column of every other line is taken as the
# identifier, and trailing spaces are stripped from it.
# Takes no arguments; writes the identifiers to stdout.
phenotype_ids() {
    awk -F '\t' 'NR > 1 { sub(/ +$/, "", $1); print $1 }' phenotypes.txt
}

# Echo all sequence identifiers from the *.fsa files, one per line.
#
# Reads every *.fsa file in the current directory.  A header line is one that
# starts with '>'; the identifier is everything between the '>' and the first
# space (or the end of the line when there is no space).
# Takes no arguments; writes the identifiers to stdout.
sequence_ids() {
    # Plain grep replaces the obsolescent fgrep; '--' keeps a file name that
    # starts with '-' from being read as an option.
    grep -h '^>' -- *.fsa | sed -e 's/^>\([^ ]*\).*/\1/'
}

# Filter stdin for near matches of seqid $1.  A near match is when either
# the allele ID (without accession) or the accession matches.
#
# Arguments: $1 - the sequence ID to look for.  It is split at its second
#                 underscore: the part before it is taken as the allele ID,
#                 the part from that underscore onward as the accession.
# Input:     candidate IDs on stdin, one per line.
# Output:    the matching candidates on a single line, each followed by a
#            space.  Nothing is written when $1 is empty.
near_matches() {
    local regex

    # An empty ID would give an empty regex, which matches every line.
    [ -n "$1" ] || return 0

    # First expression: backslash-escape the ERE metacharacters in the ID.
    # Second expression: turn "allele_accession" into two alternatives, the
    # allele ID anchored at line start and the accession anchored at line
    # end.  The allele ID must be followed by '_' or end of line, so that
    # "x_1" does not also match "x_10_...".
    # An ID with fewer than two underscores is left as a plain, unanchored
    # pattern.
    regex="$(printf '%s\n' "$1" |
        sed -e 's,\([][(){}.+?|*^$]\),\\\1,g' \
            -e 's,^\([^_]*_[^_]*\)\(_.*\),^\1(_|$)|\2$,')" || return

    # -e protects a pattern that happens to start with '-'.
    grep -E -e "$regex" | tr '\n' ' '
}


# Section I: print a banner, then list the identifiers (column 1 of
# phenotypes.txt, header line skipped) that contain a space.
# The banner is emitted from a quoted here-doc so that none of its text is
# ever interpreted as a printf format or expanded by the shell.
cat <<'EOF'

===============================================================================
I. Entries in phenotypes.txt with trailing whitespace in their identifier (col 1).
(This whitespace breaks simple key based lookups.)
-------------------------------------------------------------------------------
EOF
# NOTE(review): this reports an identifier with a space anywhere in it, not
# only at the end, although the heading says "trailing" -- confirm whether
# an end-anchored match was intended.
cut -f1 phenotypes.txt | tail -n +2 | grep -F ' '


# Section II: print a banner, then one line per phenotypes.txt identifier
# that is not exactly equal to any sequence ID.  Each line holds the
# identifier, a tab, and the near matches found among the sequence IDs.
cat <<'EOF'

===============================================================================
II. Entries in phenotypes.txt with no matching sequence in an *.fsa file.
Second column lists close matches (having identical alleleID or accession).
-------------------------------------------------------------------------------
EOF
# Collect the sequence IDs once instead of re-scanning every *.fsa file for
# each phenotype entry.
ALL_SEQUENCE_IDS="$(sequence_ids)"
# read -r keeps backslashes in an ID literal; the default IFS still trims
# leading and trailing blanks from each ID.
phenotype_ids | while read -r SEQID; do
    # -F -x: whole-line fixed-string comparison; -e protects IDs that start
    # with '-'.  The ID is passed to printf as data, never as the format.
    printf '%s\n' "$ALL_SEQUENCE_IDS" | grep -Fxq -e "$SEQID" ||
        printf '%s\t%s\n' "$SEQID" \
            "$(printf '%s\n' "$ALL_SEQUENCE_IDS" | near_matches "$SEQID")"
done


# Section III: print a banner, then one line per sequence ID that is not
# exactly equal to any phenotypes.txt identifier.  Each line holds the
# sequence ID, a tab, and the near matches found among the phenotype IDs.
cat <<'EOF'

===============================================================================
III. Sequences in *.fsa that have no corresponding entry in phenotypes.txt.
Second column lists close matches (having identical alleleID or accession).
-------------------------------------------------------------------------------
EOF
# Collect the phenotype IDs once instead of re-reading phenotypes.txt for
# each sequence.
ALL_PHENOTYPE_IDS="$(phenotype_ids)"
# read -r keeps backslashes in an ID literal; the default IFS still trims
# leading and trailing blanks from each ID.
sequence_ids | while read -r SEQID; do
    # -F -x: whole-line fixed-string comparison; -e protects IDs that start
    # with '-'.  The ID is passed to printf as data, never as the format.
    printf '%s\n' "$ALL_PHENOTYPE_IDS" | grep -Fxq -e "$SEQID" ||
        printf '%s\t%s\n' "$SEQID" \
            "$(printf '%s\n' "$ALL_PHENOTYPE_IDS" | near_matches "$SEQID")"
done

# vim: sts=4:sw=4:si:et:ai