# HG changeset patch
# User earlhaminst
# Date 1507807529 14400
# Node ID 363f3480622d1b8d43ecc77184eee48f2218af09
# Parent 70df762b48a8484c5c03d6b8f3eee3b0164faaa9
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 598e396ffaf5834a1c8f32868b1568a2b48a0a78-dirty
diff -r 70df762b48a8 -r 363f3480622d blast_parser.py
--- a/blast_parser.py Tue Oct 03 04:51:45 2017 -0400
+++ b/blast_parser.py Thu Oct 12 07:25:29 2017 -0400
@@ -4,50 +4,91 @@
"""
import argparse
import math
-from collections import OrderedDict
+import sqlite3
+import tempfile
+
+BATCH_SIZE = 2000
+
+
+def create_tables(conn):
+ cur = conn.cursor()
+ cur.execute('''CREATE TABLE alignment (
+ id INTEGER PRIMARY KEY,
+ sequence1_id VARCHAR NOT NULL,
+ sequence2_id VARCHAR NOT NULL,
+ weight INTEGER NOT NULL)''')
+ conn.commit()
def main():
parser = argparse.ArgumentParser()
-
parser.add_argument('-i', metavar='in-file', type=argparse.FileType('rt'), required=True, help='Path to input file')
-
parser.add_argument('-o', metavar='out-file', type=argparse.FileType('wt'), required=True, help='Path to output file')
-
parser.add_argument('-r', action='store_true', default=False,
dest='reciprocal',
help='Annotate homolog pair')
-
- parser.add_argument('--version', action='version', version='%(prog)s 1.0')
-
options = parser.parse_args()
- results = OrderedDict()
+ db_file = tempfile.NamedTemporaryFile(suffix=".sqlite")
+
+ conn = sqlite3.connect(db_file.name)
+ conn.execute('PRAGMA foreign_keys = ON')
+ create_tables(conn)
+
+ cur = conn.cursor()
+
+ i = 0
for line in options.i:
line = line.rstrip()
line_cols = line.split('\t')
sequence1_id = line_cols[0]
sequence2_id = line_cols[1]
- evalue = float(line_cols[10])
# Ignore self-matching hits
- if sequence1_id != sequence2_id:
- # Convert evalue to an integer weight with max 100
+ if sequence1_id == sequence2_id:
+ continue
+
+ i = i + 1
+ evalue = float(line_cols[10])
+
+ # Convert evalue to an integer weight with max 100
+ if evalue != 0.0:
+ weight = min(100, round(math.log10(evalue) / -2.0))
+ else:
+ # If the evalue is 0, leave weight at 100
weight = 100
- # If the evalue is 0, leave weight at 100
- if evalue != 0.0:
- weight = min(100, round(math.log10(evalue) / -2.0))
+ cur.execute('INSERT INTO alignment (id, sequence1_id, sequence2_id, weight) VALUES (?, ?, ?, ?)',
+ (i, sequence1_id, sequence2_id, weight))
+
+ # Commit transaction at every BATCH_SIZE rows to save memory
+ if i % BATCH_SIZE == 0:
+ conn.commit()
+ # Commit final transaction
+ conn.commit()
+ options.i.close()
+
+ # Delete alternative alignments keeping only one with max weight
+ cur.execute('DELETE FROM alignment WHERE id NOT IN (SELECT id FROM alignment GROUP BY sequence1_id, sequence2_id HAVING weight=max(weight))')
+ conn.commit()
- if (sequence1_id, sequence2_id) not in results:
- results[(sequence1_id, sequence2_id)] = weight
- else:
- results[(sequence1_id, sequence2_id)] = max(results[(sequence1_id, sequence2_id)], weight)
+ if options.reciprocal:
+ query = 'SELECT a1.sequence1_id, a1.sequence2_id, a1.weight FROM alignment a1, alignment a2 WHERE a1.sequence1_id = a2.sequence2_id AND a1.sequence2_id = a2.sequence1_id ORDER BY a1.id'
+ else:
+ query = 'SELECT sequence1_id, sequence2_id, weight FROM alignment ORDER BY id'
- for (sequence1_id, sequence2_id), weight in results.items():
- if not options.reciprocal or (sequence2_id, sequence1_id) in results:
- options.o.write("%s\t%s\t%d\n" % (sequence1_id, sequence2_id, weight))
+ cur.execute(query)
+ while True:
+ rows = cur.fetchmany(BATCH_SIZE)
+ if not rows:
+ break
+ for row in rows:
+ options.o.write("%s\t%s\t%d\n" % row)
+
+ conn.close()
+ db_file.close()
+ options.o.close()
if __name__ == "__main__":
diff -r 70df762b48a8 -r 363f3480622d blast_parser.xml
--- a/blast_parser.xml Tue Oct 03 04:51:45 2017 -0400
+++ b/blast_parser.xml Thu Oct 12 07:25:29 2017 -0400
@@ -2,7 +2,6 @@
Convert 12- or 24-column BLAST output into 3-column hcluster_sg input
-