# HG changeset patch # User earlhaminst # Date 1507807529 14400 # Node ID 363f3480622d1b8d43ecc77184eee48f2218af09 # Parent 70df762b48a8484c5c03d6b8f3eee3b0164faaa9 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 598e396ffaf5834a1c8f32868b1568a2b48a0a78-dirty diff -r 70df762b48a8 -r 363f3480622d blast_parser.py --- a/blast_parser.py Tue Oct 03 04:51:45 2017 -0400 +++ b/blast_parser.py Thu Oct 12 07:25:29 2017 -0400 @@ -4,50 +4,91 @@ """ import argparse import math -from collections import OrderedDict +import sqlite3 +import tempfile + +BATCH_SIZE = 2000 + + +def create_tables(conn): + cur = conn.cursor() + cur.execute('''CREATE TABLE alignment ( + id INTEGER PRIMARY KEY, + sequence1_id VARCHAR NOT NULL, + sequence2_id VARCHAR NOT NULL, + weight INTEGER NOT NULL)''') + conn.commit() def main(): parser = argparse.ArgumentParser() - parser.add_argument('-i', metavar='in-file', type=argparse.FileType('rt'), required=True, help='Path to input file') - parser.add_argument('-o', metavar='out-file', type=argparse.FileType('wt'), required=True, help='Path to output file') - parser.add_argument('-r', action='store_true', default=False, dest='reciprocal', help='Annotate homolog pair') - - parser.add_argument('--version', action='version', version='%(prog)s 1.0') - options = parser.parse_args() - results = OrderedDict() + db_file = tempfile.NamedTemporaryFile(suffix=".sqlite") + + conn = sqlite3.connect(db_file.name) + conn.execute('PRAGMA foreign_keys = ON') + create_tables(conn) + + cur = conn.cursor() + + i = 0 for line in options.i: line = line.rstrip() line_cols = line.split('\t') sequence1_id = line_cols[0] sequence2_id = line_cols[1] - evalue = float(line_cols[10]) # Ignore self-matching hits - if sequence1_id != sequence2_id: - # Convert evalue to an integer weight with max 100 + if sequence1_id == sequence2_id: + continue + + i = i + 1 + evalue = float(line_cols[10]) + + # Convert evalue to an integer weight with max 100 + if evalue != 0.0: + weight = min(100, round(math.log10(evalue) / -2.0)) + else: + # If the evalue is 0, leave weight at 100 weight = 100 - # If the evalue is 0, leave weight at 100 - if evalue != 0.0: - weight = min(100, round(math.log10(evalue) / -2.0)) + cur.execute('INSERT INTO alignment (id, sequence1_id, sequence2_id, weight) VALUES (?, ?, ?, ?)', + (i, sequence1_id, sequence2_id, weight)) + + # Commit transaction at every BATCH_SIZE rows to save memory + if i % BATCH_SIZE == 0: + conn.commit() + # Commit final transaction + conn.commit() + options.i.close() + + # Delete alternative alignments keeping only one with max weight + cur.execute('DELETE FROM alignment WHERE id NOT IN (SELECT id FROM alignment GROUP BY sequence1_id, sequence2_id HAVING weight=max(weight))') + conn.commit() - if (sequence1_id, sequence2_id) not in results: - results[(sequence1_id, sequence2_id)] = weight - else: - results[(sequence1_id, sequence2_id)] = max(results[(sequence1_id, sequence2_id)], weight) + if options.reciprocal: + query = 'SELECT a1.sequence1_id, a1.sequence2_id, a1.weight FROM alignment a1, alignment a2 WHERE a1.sequence1_id = a2.sequence2_id AND a1.sequence2_id = a2.sequence1_id ORDER BY a1.id' + else: + query = 'SELECT sequence1_id, sequence2_id, weight FROM alignment ORDER BY id' - for (sequence1_id, sequence2_id), weight in results.items(): - if not options.reciprocal or (sequence2_id, sequence1_id) in results: - options.o.write("%s\t%s\t%d\n" % (sequence1_id, sequence2_id, weight)) + cur.execute(query) + while True: + rows = cur.fetchmany(BATCH_SIZE) + if not rows: + break + for row in rows: + options.o.write("%s\t%s\t%d\n" % row) + + conn.close() + db_file.close() + options.o.close() if __name__ == "__main__": diff -r 70df762b48a8 -r 363f3480622d blast_parser.xml --- a/blast_parser.xml Tue Oct 03 04:51:45 2017 -0400 +++ b/blast_parser.xml Thu Oct 12 07:25:29 2017 -0400 @@ -2,7 +2,6 @@ Convert 12- or 24-column BLAST output into 3-column hcluster_sg input -