comparison blast_parser.py @ 4:363f3480622d draft default tip

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 598e396ffaf5834a1c8f32868b1568a2b48a0a78-dirty
author earlhaminst
date Thu, 12 Oct 2017 07:25:29 -0400
parents 70df762b48a8
children
comparison
equal deleted inserted replaced
3:70df762b48a8 4:363f3480622d
2 Simple parser to convert a BLAST 12-column or 24-column tabular output into a 2 Simple parser to convert a BLAST 12-column or 24-column tabular output into a
3 3-column tabular input for hcluster_hg (id1, id2, weight): 3 3-column tabular input for hcluster_hg (id1, id2, weight):
4 """ 4 """
5 import argparse 5 import argparse
6 import math 6 import math
7 from collections import OrderedDict 7 import sqlite3
8 import tempfile
9
10 BATCH_SIZE = 2000
11
12
13 def create_tables(conn):
14 cur = conn.cursor()
15 cur.execute('''CREATE TABLE alignment (
16 id INTEGER PRIMARY KEY,
17 sequence1_id VARCHAR NOT NULL,
18 sequence2_id VARCHAR NOT NULL,
19 weight INTEGER NOT NULL)''')
20 conn.commit()
8 21
9 22
10 def main(): 23 def main():
11 parser = argparse.ArgumentParser() 24 parser = argparse.ArgumentParser()
12
13 parser.add_argument('-i', metavar='in-file', type=argparse.FileType('rt'), required=True, help='Path to input file') 25 parser.add_argument('-i', metavar='in-file', type=argparse.FileType('rt'), required=True, help='Path to input file')
14
15 parser.add_argument('-o', metavar='out-file', type=argparse.FileType('wt'), required=True, help='Path to output file') 26 parser.add_argument('-o', metavar='out-file', type=argparse.FileType('wt'), required=True, help='Path to output file')
16
17 parser.add_argument('-r', action='store_true', default=False, 27 parser.add_argument('-r', action='store_true', default=False,
18 dest='reciprocal', 28 dest='reciprocal',
19 help='Annotate homolog pair') 29 help='Annotate homolog pair')
20
21 parser.add_argument('--version', action='version', version='%(prog)s 1.0')
22
23 options = parser.parse_args() 30 options = parser.parse_args()
24 31
25 results = OrderedDict() 32 db_file = tempfile.NamedTemporaryFile(suffix=".sqlite")
26 33
34 conn = sqlite3.connect(db_file.name)
35 conn.execute('PRAGMA foreign_keys = ON')
36
37 create_tables(conn)
38
39 cur = conn.cursor()
40
41 i = 0
27 for line in options.i: 42 for line in options.i:
28 line = line.rstrip() 43 line = line.rstrip()
29 line_cols = line.split('\t') 44 line_cols = line.split('\t')
30 sequence1_id = line_cols[0] 45 sequence1_id = line_cols[0]
31 sequence2_id = line_cols[1] 46 sequence2_id = line_cols[1]
47
48 # Ignore self-matching hits
49 if sequence1_id == sequence2_id:
50 continue
51
52 i = i + 1
32 evalue = float(line_cols[10]) 53 evalue = float(line_cols[10])
33 54
34 # Ignore self-matching hits 55 # Convert evalue to an integer weight with max 100
35 if sequence1_id != sequence2_id: 56 if evalue != 0.0:
36 # Convert evalue to an integer weight with max 100 57 weight = min(100, round(math.log10(evalue) / -2.0))
58 else:
59 # If the evalue is 0, leave weight at 100
37 weight = 100 60 weight = 100
38 61
39 # If the evalue is 0, leave weight at 100 62 cur.execute('INSERT INTO alignment (id, sequence1_id, sequence2_id, weight) VALUES (?, ?, ?, ?)',
40 if evalue != 0.0: 63 (i, sequence1_id, sequence2_id, weight))
41 weight = min(100, round(math.log10(evalue) / -2.0))
42 64
43 if (sequence1_id, sequence2_id) not in results: 65 # Commit transaction at every BATCH_SIZE rows to save memory
44 results[(sequence1_id, sequence2_id)] = weight 66 if i % BATCH_SIZE == 0:
45 else: 67 conn.commit()
46 results[(sequence1_id, sequence2_id)] = max(results[(sequence1_id, sequence2_id)], weight) 68 # Commit final transaction
69 conn.commit()
70 options.i.close()
47 71
48 for (sequence1_id, sequence2_id), weight in results.items(): 72 # Delete alternative alignments keeping only one with max weight
49 if not options.reciprocal or (sequence2_id, sequence1_id) in results: 73 cur.execute('DELETE FROM alignment WHERE id NOT IN (SELECT id FROM alignment GROUP BY sequence1_id, sequence2_id HAVING weight=max(weight))')
50 options.o.write("%s\t%s\t%d\n" % (sequence1_id, sequence2_id, weight)) 74 conn.commit()
75
76 if options.reciprocal:
77 query = 'SELECT a1.sequence1_id, a1.sequence2_id, a1.weight FROM alignment a1, alignment a2 WHERE a1.sequence1_id = a2.sequence2_id AND a1.sequence2_id = a2.sequence1_id ORDER BY a1.id'
78 else:
79 query = 'SELECT sequence1_id, sequence2_id, weight FROM alignment ORDER BY id'
80
81 cur.execute(query)
82 while True:
83 rows = cur.fetchmany(BATCH_SIZE)
84 if not rows:
85 break
86 for row in rows:
87 options.o.write("%s\t%s\t%d\n" % row)
88
89 conn.close()
90 db_file.close()
91 options.o.close()
51 92
52 93
53 if __name__ == "__main__": 94 if __name__ == "__main__":
54 main() 95 main()