Mercurial > repos > earlhaminst > blast_parser
annotate blast_parser.py @ 3:70df762b48a8 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 32272744ad83a704fd427b48aae574496a279901-dirty
author | earlhaminst |
---|---|
date | Tue, 03 Oct 2017 04:51:45 -0400 |
parents | |
children | 363f3480622d |
rev | line source |
---|---|
3
70df762b48a8
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 32272744ad83a704fd427b48aae574496a279901-dirty
earlhaminst
parents:
diff
changeset
|
1 """ |
70df762b48a8
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 32272744ad83a704fd427b48aae574496a279901-dirty
earlhaminst
parents:
diff
changeset
|
2 Simple parser to convert a BLAST 12-column or 24-column tabular output into a |
70df762b48a8
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 32272744ad83a704fd427b48aae574496a279901-dirty
earlhaminst
parents:
diff
changeset
|
3 3-column tabular input for hcluster_sg (id1, id2, weight). |
70df762b48a8
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 32272744ad83a704fd427b48aae574496a279901-dirty
earlhaminst
parents:
diff
changeset
|
4 """ |
70df762b48a8
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 32272744ad83a704fd427b48aae574496a279901-dirty
earlhaminst
parents:
diff
changeset
|
5 import argparse |
70df762b48a8
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 32272744ad83a704fd427b48aae574496a279901-dirty
earlhaminst
parents:
diff
changeset
|
6 import math |
70df762b48a8
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 32272744ad83a704fd427b48aae574496a279901-dirty
earlhaminst
parents:
diff
changeset
|
7 from collections import OrderedDict |
70df762b48a8
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 32272744ad83a704fd427b48aae574496a279901-dirty
earlhaminst
parents:
diff
changeset
|
8 |
70df762b48a8
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 32272744ad83a704fd427b48aae574496a279901-dirty
earlhaminst
parents:
diff
changeset
|
9 |
70df762b48a8
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 32272744ad83a704fd427b48aae574496a279901-dirty
earlhaminst
parents:
diff
changeset
|
def _evalue_to_weight(evalue):
    """Convert a BLAST e-value into a weight capped at 100.

    An e-value of exactly 0 maps to the maximum weight (100), since its
    logarithm is undefined. Otherwise the weight is ``-log10(evalue) / 2``
    rounded to the nearest whole number and capped at 100.
    """
    if evalue == 0.0:
        return 100
    return min(100, round(math.log10(evalue) / -2.0))


def _collect_weights(lines):
    """Return an OrderedDict mapping (id1, id2) to the best weight seen.

    ``lines`` is an iterable of tab-separated BLAST rows. Column 1 and 2 are
    read as the two sequence ids and column 11 as the e-value. Self hits are
    ignored; when a pair occurs several times the highest weight is kept.
    Pairs keep the order in which they were first encountered.
    """
    results = OrderedDict()
    for line in lines:
        line = line.rstrip()
        # Blank lines (e.g. a trailing empty line) carry no hit; skipping
        # them avoids an IndexError when indexing the columns below.
        if not line:
            continue
        line_cols = line.split('\t')
        sequence1_id = line_cols[0]
        sequence2_id = line_cols[1]
        evalue = float(line_cols[10])

        # Ignore self-matching hits
        if sequence1_id == sequence2_id:
            continue

        pair = (sequence1_id, sequence2_id)
        weight = _evalue_to_weight(evalue)
        if pair not in results or weight > results[pair]:
            results[pair] = weight
    return results


def _write_weights(results, out_file, reciprocal):
    """Write ``id1<TAB>id2<TAB>weight`` rows for the collected pairs.

    When ``reciprocal`` is true, a pair is written only if the reverse pair
    (id2, id1) is also present in ``results``.
    """
    for (sequence1_id, sequence2_id), weight in results.items():
        if not reciprocal or (sequence2_id, sequence1_id) in results:
            out_file.write("%s\t%s\t%d\n" % (sequence1_id, sequence2_id, weight))


def main(argv=None):
    """Command-line entry point.

    Parses the options, reads the BLAST tabular input given with ``-i`` and
    writes the 3-column (id1, id2, weight) table to the file given with
    ``-o``. With ``-r`` only pairs that were hit in both directions are
    written.

    ``argv`` is the list of command-line arguments to parse; when it is
    None (the default) argparse falls back to ``sys.argv[1:]``.
    """
    import sys

    parser = argparse.ArgumentParser()

    parser.add_argument('-i', metavar='in-file', type=argparse.FileType('rt'), required=True, help='Path to input file')

    parser.add_argument('-o', metavar='out-file', type=argparse.FileType('wt'), required=True, help='Path to output file')

    parser.add_argument('-r', action='store_true', default=False,
                        dest='reciprocal',
                        help='Annotate homolog pair')

    parser.add_argument('--version', action='version', version='%(prog)s 1.0')

    options = parser.parse_args(argv)

    try:
        results = _collect_weights(options.i)
        _write_weights(results, options.o, options.reciprocal)
    finally:
        # argparse.FileType opens the files but never closes them. Close
        # them here so the output is flushed deterministically, but leave
        # the standard streams (selected with '-') open for the caller.
        for handle in (options.i, options.o):
            if handle is not sys.stdin and handle is not sys.stdout:
                handle.close()


if __name__ == "__main__":
    main()