Mercurial > repos > earlhaminst > blast_parser
comparison blast_parser.py @ 3:70df762b48a8 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 32272744ad83a704fd427b48aae574496a279901-dirty
author | earlhaminst |
---|---|
date | Tue, 03 Oct 2017 04:51:45 -0400 |
parents | |
children | 363f3480622d |
comparison
equal
deleted
inserted
replaced
2:376ed15e0d27 | 3:70df762b48a8 |
---|---|
1 """ | |
2 Simple parser to convert a BLAST 12-column or 24-column tabular output into a | |
3 3-column tabular input for hcluster_hg (id1, id2, weight): | |
4 """ | |
5 import argparse | |
6 import math | |
7 from collections import OrderedDict | |
8 | |
9 | |
10 def main(): | |
11 parser = argparse.ArgumentParser() | |
12 | |
13 parser.add_argument('-i', metavar='in-file', type=argparse.FileType('rt'), required=True, help='Path to input file') | |
14 | |
15 parser.add_argument('-o', metavar='out-file', type=argparse.FileType('wt'), required=True, help='Path to output file') | |
16 | |
17 parser.add_argument('-r', action='store_true', default=False, | |
18 dest='reciprocal', | |
19 help='Annotate homolog pair') | |
20 | |
21 parser.add_argument('--version', action='version', version='%(prog)s 1.0') | |
22 | |
23 options = parser.parse_args() | |
24 | |
25 results = OrderedDict() | |
26 | |
27 for line in options.i: | |
28 line = line.rstrip() | |
29 line_cols = line.split('\t') | |
30 sequence1_id = line_cols[0] | |
31 sequence2_id = line_cols[1] | |
32 evalue = float(line_cols[10]) | |
33 | |
34 # Ignore self-matching hits | |
35 if sequence1_id != sequence2_id: | |
36 # Convert evalue to an integer weight with max 100 | |
37 weight = 100 | |
38 | |
39 # If the evalue is 0, leave weight at 100 | |
40 if evalue != 0.0: | |
41 weight = min(100, round(math.log10(evalue) / -2.0)) | |
42 | |
43 if (sequence1_id, sequence2_id) not in results: | |
44 results[(sequence1_id, sequence2_id)] = weight | |
45 else: | |
46 results[(sequence1_id, sequence2_id)] = max(results[(sequence1_id, sequence2_id)], weight) | |
47 | |
48 for (sequence1_id, sequence2_id), weight in results.items(): | |
49 if not options.reciprocal or (sequence2_id, sequence1_id) in results: | |
50 options.o.write("%s\t%s\t%d\n" % (sequence1_id, sequence2_id, weight)) | |
51 | |
52 | |
53 if __name__ == "__main__": | |
54 main() |