Mercurial > repos > earlhaminst > blast_parser
comparison blast_parser.py @ 4:363f3480622d draft default tip
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 598e396ffaf5834a1c8f32868b1568a2b48a0a78-dirty
author | earlhaminst |
---|---|
date | Thu, 12 Oct 2017 07:25:29 -0400 |
parents | 70df762b48a8 |
children |
comparison
equal
deleted
inserted
replaced
3:70df762b48a8 | 4:363f3480622d |
---|---|
2 Simple parser to convert a BLAST 12-column or 24-column tabular output into a | 2 Simple parser to convert a BLAST 12-column or 24-column tabular output into a |
3 3-column tabular input for hcluster_hg (id1, id2, weight): | 3 3-column tabular input for hcluster_hg (id1, id2, weight): |
4 """ | 4 """ |
5 import argparse | 5 import argparse |
6 import math | 6 import math |
7 from collections import OrderedDict | 7 import sqlite3 |
8 import tempfile | |
9 | |
10 BATCH_SIZE = 2000 | |
11 | |
12 | |
13 def create_tables(conn): | |
14 cur = conn.cursor() | |
15 cur.execute('''CREATE TABLE alignment ( | |
16 id INTEGER PRIMARY KEY, | |
17 sequence1_id VARCHAR NOT NULL, | |
18 sequence2_id VARCHAR NOT NULL, | |
19 weight INTEGER NOT NULL)''') | |
20 conn.commit() | |
8 | 21 |
9 | 22 |
10 def main(): | 23 def main(): |
11 parser = argparse.ArgumentParser() | 24 parser = argparse.ArgumentParser() |
12 | |
13 parser.add_argument('-i', metavar='in-file', type=argparse.FileType('rt'), required=True, help='Path to input file') | 25 parser.add_argument('-i', metavar='in-file', type=argparse.FileType('rt'), required=True, help='Path to input file') |
14 | |
15 parser.add_argument('-o', metavar='out-file', type=argparse.FileType('wt'), required=True, help='Path to output file') | 26 parser.add_argument('-o', metavar='out-file', type=argparse.FileType('wt'), required=True, help='Path to output file') |
16 | |
17 parser.add_argument('-r', action='store_true', default=False, | 27 parser.add_argument('-r', action='store_true', default=False, |
18 dest='reciprocal', | 28 dest='reciprocal', |
19 help='Annotate homolog pair') | 29 help='Annotate homolog pair') |
20 | |
21 parser.add_argument('--version', action='version', version='%(prog)s 1.0') | |
22 | |
23 options = parser.parse_args() | 30 options = parser.parse_args() |
24 | 31 |
25 results = OrderedDict() | 32 db_file = tempfile.NamedTemporaryFile(suffix=".sqlite") |
26 | 33 |
34 conn = sqlite3.connect(db_file.name) | |
35 conn.execute('PRAGMA foreign_keys = ON') | |
36 | |
37 create_tables(conn) | |
38 | |
39 cur = conn.cursor() | |
40 | |
41 i = 0 | |
27 for line in options.i: | 42 for line in options.i: |
28 line = line.rstrip() | 43 line = line.rstrip() |
29 line_cols = line.split('\t') | 44 line_cols = line.split('\t') |
30 sequence1_id = line_cols[0] | 45 sequence1_id = line_cols[0] |
31 sequence2_id = line_cols[1] | 46 sequence2_id = line_cols[1] |
47 | |
48 # Ignore self-matching hits | |
49 if sequence1_id == sequence2_id: | |
50 continue | |
51 | |
52 i = i + 1 | |
32 evalue = float(line_cols[10]) | 53 evalue = float(line_cols[10]) |
33 | 54 |
34 # Ignore self-matching hits | 55 # Convert evalue to an integer weight with max 100 |
35 if sequence1_id != sequence2_id: | 56 if evalue != 0.0: |
36 # Convert evalue to an integer weight with max 100 | 57 weight = min(100, round(math.log10(evalue) / -2.0)) |
58 else: | |
59 # If the evalue is 0, leave weight at 100 | |
37 weight = 100 | 60 weight = 100 |
38 | 61 |
39 # If the evalue is 0, leave weight at 100 | 62 cur.execute('INSERT INTO alignment (id, sequence1_id, sequence2_id, weight) VALUES (?, ?, ?, ?)', |
40 if evalue != 0.0: | 63 (i, sequence1_id, sequence2_id, weight)) |
41 weight = min(100, round(math.log10(evalue) / -2.0)) | |
42 | 64 |
43 if (sequence1_id, sequence2_id) not in results: | 65 # Commit transaction at every BATCH_SIZE rows to save memory |
44 results[(sequence1_id, sequence2_id)] = weight | 66 if i % BATCH_SIZE == 0: |
45 else: | 67 conn.commit() |
46 results[(sequence1_id, sequence2_id)] = max(results[(sequence1_id, sequence2_id)], weight) | 68 # Commit final transaction |
69 conn.commit() | |
70 options.i.close() | |
47 | 71 |
48 for (sequence1_id, sequence2_id), weight in results.items(): | 72 # Delete alternative alignments keeping only one with max weight |
49 if not options.reciprocal or (sequence2_id, sequence1_id) in results: | 73 cur.execute('DELETE FROM alignment WHERE id NOT IN (SELECT id FROM alignment GROUP BY sequence1_id, sequence2_id HAVING weight=max(weight))') |
50 options.o.write("%s\t%s\t%d\n" % (sequence1_id, sequence2_id, weight)) | 74 conn.commit() |
75 | |
76 if options.reciprocal: | |
77 query = 'SELECT a1.sequence1_id, a1.sequence2_id, a1.weight FROM alignment a1, alignment a2 WHERE a1.sequence1_id = a2.sequence2_id AND a1.sequence2_id = a2.sequence1_id ORDER BY a1.id' | |
78 else: | |
79 query = 'SELECT sequence1_id, sequence2_id, weight FROM alignment ORDER BY id' | |
80 | |
81 cur.execute(query) | |
82 while True: | |
83 rows = cur.fetchmany(BATCH_SIZE) | |
84 if not rows: | |
85 break | |
86 for row in rows: | |
87 options.o.write("%s\t%s\t%d\n" % row) | |
88 | |
89 conn.close() | |
90 db_file.close() | |
91 options.o.close() | |
51 | 92 |
52 | 93 |
53 if __name__ == "__main__": | 94 if __name__ == "__main__": |
54 main() | 95 main() |