diff fasta2database.py @ 10:d0431a839606 draft

Uploaded
author petr-novak
date Wed, 14 Aug 2019 11:24:15 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta2database.py	Wed Aug 14 11:24:15 2019 -0400
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+'''
+Helper script to create DANTE database which can be used in second iteration.
+
+Usage:
+    fasta2database.py FASTA_INPUT DB_FASTA_OUTPUT DB_CLASSIFICATION_OUTPUT
+
+Every header of FASTA_INPUT must consist of three fields separated by single
+spaces: ">NAME DOMAIN CLASSIFICATION", where CLASSIFICATION is a "|"-separated
+list of levels. The header is rewritten to ">NA-DOMAIN__NAME" (with "-" in NAME
+replaced by "_"); all other lines are copied unchanged to DB_FASTA_OUTPUT.
+DB_CLASSIFICATION_OUTPUT receives one tab-separated row (NAME followed by the
+classification levels) per distinct NAME/CLASSIFICATION pair, sorted.
+'''
+import sys
+
+fasta_input = sys.argv[1]
+db_fasta_output_file = sys.argv[2]
+db_classification_file = sys.argv[3]
+classification_table = set()
+# fasta header will be reformatted to correct REXdb classification
+with open(fasta_input, 'r') as f, open(db_fasta_output_file, 'w') as out:
+    for line_number, line in enumerate(f, 1):
+        if not line.startswith(">"):
+            # sequence data is passed through untouched
+            out.write(line)
+            continue
+        # Strip the line terminator before splitting so that it does not end
+        # up inside the last field; rows get their own "\n" below.
+        fields = line.rstrip("\n").split(" ")
+        if len(fields) != 3:
+            raise ValueError(
+                "{}: line {}: expected header '>NAME DOMAIN CLASSIFICATION', "
+                "got {!r}".format(fasta_input, line_number, line))
+        name, domain, classification = fields
+        name_clean = name[1:].replace("-", "_")
+        out.write(">NA-{}__{}\n".format(domain, name_clean))
+        classification_string = "\t".join(classification.split("|"))
+        # the set collapses rows repeated for the same name and classification
+        classification_table.add(
+            "{}\t{}\n".format(name_clean, classification_string))
+# sorted so that the table is identical between runs (set order is arbitrary)
+with open(db_classification_file, 'w') as f:
+    f.writelines(sorted(classification_table))