diff blast_parser.py @ 3:70df762b48a8 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/blast_parser commit 32272744ad83a704fd427b48aae574496a279901-dirty
author earlhaminst
date Tue, 03 Oct 2017 04:51:45 -0400
parents
children 363f3480622d
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/blast_parser.py	Tue Oct 03 04:51:45 2017 -0400
@@ -0,0 +1,54 @@
+"""
+Simple parser to convert a BLAST 12-column or 24-column tabular output into a
+3-column tabular input for hcluster_hg (id1, id2, weight):
+"""
+import argparse
+import math
+from collections import OrderedDict
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('-i', metavar='in-file', type=argparse.FileType('rt'), required=True, help='Path to input file')
+
+    parser.add_argument('-o', metavar='out-file', type=argparse.FileType('wt'), required=True, help='Path to output file')
+
+    parser.add_argument('-r', action='store_true', default=False,
+                        dest='reciprocal',
+                        help='Annotate homolog pair')
+
+    parser.add_argument('--version', action='version', version='%(prog)s 1.0')
+
+    options = parser.parse_args()
+
+    results = OrderedDict()
+
+    for line in options.i:
+        line = line.rstrip()
+        line_cols = line.split('\t')
+        sequence1_id = line_cols[0]
+        sequence2_id = line_cols[1]
+        evalue = float(line_cols[10])
+
+        # Ignore self-matching hits
+        if sequence1_id != sequence2_id:
+            # Convert evalue to an integer weight with max 100
+            weight = 100
+
+            # If the evalue is 0, leave weight at 100
+            if evalue != 0.0:
+                weight = min(100, round(math.log10(evalue) / -2.0))
+
+            if (sequence1_id, sequence2_id) not in results:
+                results[(sequence1_id, sequence2_id)] = weight
+            else:
+                results[(sequence1_id, sequence2_id)] = max(results[(sequence1_id, sequence2_id)], weight)
+
+    for (sequence1_id, sequence2_id), weight in results.items():
+        if not options.reciprocal or (sequence2_id, sequence1_id) in results:
+            options.o.write("%s\t%s\t%d\n" % (sequence1_id, sequence2_id, weight))
+
+
+if __name__ == "__main__":
+    main()