annotate resfinder/db_resfinder/CHECK-entries.sh @ 0:55051a9bc58d draft default tip

Uploaded
author dcouvin
date Mon, 10 Jan 2022 20:06:07 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
1 #!/bin/sh
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
2 #
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
3 # CHECK-entries.sh
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
4 # Marco van Zwetselaar <zwets@kcri.ac.tz>
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
5 #
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
6 # Checks that each sequence in the fsa files has an entry in phenotypes.txt,
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
7 # and that every entry in phenotypes has a corresponding sequence in an fsa.
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
8 #
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
9 # Writes to stdout the lists of missing entries, as well as close matches
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
10 # based on allele ID or accession.
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
11 #
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
12
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
# Echo all identifiers from the phenotypes.txt file, one per line.
#
# Reads phenotypes.txt from the current directory, skips its first (header)
# line, takes the first tab-separated column and strips trailing spaces.
# Done in a single awk pass because this function is invoked repeatedly.
phenotype_ids() {
    awk -F '\t' 'NR > 1 { sub(/ *$/, "", $1); print $1 }' phenotypes.txt
}
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
17
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
# Echo all sequence identifiers from the *.fsa, one per line.
#
# For every line containing '>' in the *.fsa files of the current directory,
# prints the text between the '>' and the first space (or end of line).
# A single sed replaces the former fgrep|sed pair: fgrep is obsolescent and
# newer GNU grep versions emit a warning on every invocation.
sequence_ids() {
    sed -n -e 's/>\([^ ]*\).*/\1/p' -- *.fsa
}
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
22
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
# Filter stdin for near matches of seqid $1.  A near match is when either
# the allele ID (without accession) or the accession matches.
#
# Arguments: $1 - sequence ID of the form <gene>_<variant>_<accession>
# Input:     candidate IDs on stdin, one per line
# Output:    the matching IDs on a single line, each followed by one space
#
# If $1 has fewer than two underscores it cannot be split, and the escaped ID
# is then used as an unanchored pattern.
near_matches() {
    local regex
    # Build an ERE from the ID:
    #  1. backslash-escape every character that is special in an ERE, so the
    #     ID is matched literally ('}' is only special after an unescaped '{');
    #  2. split at the second underscore into allele ID and accession, and
    #     match either the complete allele ID at the start of a line (it must
    #     be followed by '_' or end of line, so that gene_1 does not match
    #     gene_10_...) or the accession at the end of a line.
    regex="$(printf '%s\n' "$1" |
        sed -e 's,\([][\\^$.|?*+(){]\),\\\1,g' \
            -e 's,^\([^_]*_[^_]*\)\(_.*\),^\1(_|$)|\2$,')" || return
    # -e keeps a pattern that starts with '-' from being parsed as an option.
    grep -E -e "$regex" | tr '\n' ' '
}
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
32
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
33
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
# Section I: print the heading, then every first-column value of
# phenotypes.txt (header line skipped) that contains a space character.
printf "
===============================================================================
I. Entries in phenotypes.txt with trailing whitespace in their identifier (col 1).
(This whitespace breaks simple key based lookups.)
-------------------------------------------------------------------------------\n"
# grep -F replaces the obsolescent fgrep; the fixed string is a single space.
cut -f1 phenotypes.txt | tail -n +2 | grep -F ' '
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
40
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
41
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
# Section II: for every identifier in phenotypes.txt that is not the ID of
# any sequence in the *.fsa files, print the identifier, a tab, and the
# near-matching sequence IDs.
printf "
===============================================================================
II. Entries in phenotypes.txt with no matching sequence in an *.fsa file.
Second column lists close matches (having identical alleleID or accession).
-------------------------------------------------------------------------------\n"
# Collect the sequence IDs once instead of re-scanning all *.fsa files twice
# for every phenotype entry.
all_sequence_ids="$(sequence_ids)"
phenotype_ids | while read -r SEQID; do
    # -F -x: exact whole-line comparison; -e protects IDs starting with '-'.
    printf '%s\n' "$all_sequence_ids" | grep -Fxq -e "$SEQID" ||
        # The ID is passed as an argument, never as part of the format string,
        # so '%' or '\' inside an ID cannot corrupt the output.
        printf '%s\t%s\n' "$SEQID" \
            "$(printf '%s\n' "$all_sequence_ids" | near_matches "$SEQID")"
done
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
51
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
52
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
# Section III: for every sequence ID in the *.fsa files that has no entry in
# phenotypes.txt, print the ID, a tab, and the near-matching phenotype IDs.
printf "
===============================================================================
III. Sequences in *.fsa that have no corresponding entry in phenotypes.txt.
Second column lists close matches (having identical alleleID or accession).
-------------------------------------------------------------------------------\n"
# Collect the phenotype IDs once instead of re-reading phenotypes.txt twice
# for every sequence.
all_phenotype_ids="$(phenotype_ids)"
sequence_ids | while read -r SEQID; do
    # -F -x: exact whole-line comparison; -e protects IDs starting with '-'.
    printf '%s\n' "$all_phenotype_ids" | grep -Fxq -e "$SEQID" ||
        # The ID is passed as an argument, never as part of the format string,
        # so '%' or '\' inside an ID cannot corrupt the output.
        printf '%s\t%s\n' "$SEQID" \
            "$(printf '%s\n' "$all_phenotype_ids" | near_matches "$SEQID")"
done
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
62
55051a9bc58d Uploaded
dcouvin
parents:
diff changeset
63 # vim: sts=4:sw=4:si:et:ai