diff blast_parser.py @ 4:363f3480622d draft default tip

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 598e396ffaf5834a1c8f32868b1568a2b48a0a78-dirty
author earlhaminst
date Thu, 12 Oct 2017 07:25:29 -0400
parents 70df762b48a8
children
line wrap: on
line diff
--- a/blast_parser.py	Tue Oct 03 04:51:45 2017 -0400
+++ b/blast_parser.py	Thu Oct 12 07:25:29 2017 -0400
@@ -4,50 +4,91 @@
 """
 import argparse
 import math
-from collections import OrderedDict
+import sqlite3
+import tempfile
+
+BATCH_SIZE = 2000
+
+
+def create_tables(conn):
+    cur = conn.cursor()
+    cur.execute('''CREATE TABLE alignment (
+        id INTEGER PRIMARY KEY,
+        sequence1_id VARCHAR NOT NULL,
+        sequence2_id VARCHAR NOT NULL,
+        weight INTEGER NOT NULL)''')
+    conn.commit()
 
 
 def main():
     parser = argparse.ArgumentParser()
-
     parser.add_argument('-i', metavar='in-file', type=argparse.FileType('rt'), required=True, help='Path to input file')
-
     parser.add_argument('-o', metavar='out-file', type=argparse.FileType('wt'), required=True, help='Path to output file')
-
     parser.add_argument('-r', action='store_true', default=False,
                         dest='reciprocal',
                         help='Annotate homolog pair')
-
-    parser.add_argument('--version', action='version', version='%(prog)s 1.0')
-
     options = parser.parse_args()
 
-    results = OrderedDict()
+    db_file = tempfile.NamedTemporaryFile(suffix=".sqlite")
+
+    conn = sqlite3.connect(db_file.name)
+    conn.execute('PRAGMA foreign_keys = ON')
 
+    create_tables(conn)
+
+    cur = conn.cursor()
+
+    i = 0
     for line in options.i:
         line = line.rstrip()
         line_cols = line.split('\t')
         sequence1_id = line_cols[0]
         sequence2_id = line_cols[1]
-        evalue = float(line_cols[10])
 
         # Ignore self-matching hits
-        if sequence1_id != sequence2_id:
-            # Convert evalue to an integer weight with max 100
+        if sequence1_id == sequence2_id:
+            continue
+
+        i = i + 1
+        evalue = float(line_cols[10])
+
+        # Convert evalue to an integer weight with max 100
+        if evalue != 0.0:
+            weight = min(100, round(math.log10(evalue) / -2.0))
+        else:
+            # If the evalue is 0, leave weight at 100
             weight = 100
 
-            # If the evalue is 0, leave weight at 100
-            if evalue != 0.0:
-                weight = min(100, round(math.log10(evalue) / -2.0))
+        cur.execute('INSERT INTO alignment (id, sequence1_id, sequence2_id, weight) VALUES (?, ?, ?, ?)',
+                    (i, sequence1_id, sequence2_id, weight))
+
+        # Commit transaction at every BATCH_SIZE rows to save memory
+        if i % BATCH_SIZE == 0:
+            conn.commit()
+    # Commit final transaction
+    conn.commit()
+    options.i.close()
+
+    # Delete alternative alignments keeping only one with max weight
+    cur.execute('DELETE FROM alignment WHERE id NOT IN (SELECT id FROM alignment GROUP BY sequence1_id, sequence2_id HAVING weight=max(weight))')
+    conn.commit()
 
-            if (sequence1_id, sequence2_id) not in results:
-                results[(sequence1_id, sequence2_id)] = weight
-            else:
-                results[(sequence1_id, sequence2_id)] = max(results[(sequence1_id, sequence2_id)], weight)
+    if options.reciprocal:
+        query = 'SELECT a1.sequence1_id, a1.sequence2_id, a1.weight FROM alignment a1, alignment a2 WHERE a1.sequence1_id = a2.sequence2_id AND a1.sequence2_id = a2.sequence1_id ORDER BY a1.id'
+    else:
+        query = 'SELECT sequence1_id, sequence2_id, weight FROM alignment ORDER BY id'
 
-    for (sequence1_id, sequence2_id), weight in results.items():
-        if not options.reciprocal or (sequence2_id, sequence1_id) in results:
-            options.o.write("%s\t%s\t%d\n" % (sequence1_id, sequence2_id, weight))
+    cur.execute(query)
+    while True:
+        rows = cur.fetchmany(BATCH_SIZE)
+        if not rows:
+            break
+        for row in rows:
+            options.o.write("%s\t%s\t%d\n" % row)
+
+    conn.close()
+    db_file.close()
+    options.o.close()
 
 
 if __name__ == "__main__":