diff tn93_filter.py @ 1:84849140a3bc draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit eec640a7c26b728f8175885926fe368b0756d9e5"
author iuc
date Fri, 23 Apr 2021 03:04:45 +0000
parents
children 1d2ec0b0a0a7
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tn93_filter.py	Fri Apr 23 03:04:45 2021 +0000
@@ -0,0 +1,43 @@
+import argparse
+import csv
+
+from Bio import SeqIO
+
+arguments = argparse.ArgumentParser(description='Combine alignments into a single file, adding a reference sequence as well')
+
+arguments.add_argument('-f', '--reference', help='Reference sequence', required=True, type=str)
+arguments.add_argument('-d', '--distances', help='Calculated pairwise distances', required=True, type=str)
+arguments.add_argument('-r', '--reads', help='Output file for filtered reads', required=True, type=str)
+arguments.add_argument('-q', '--clusters', help='Compressed clusters', required=True, type=str)
+settings = arguments.parse_args()
+
+reference_name = 'REFERENCE'
+reference_seq = ''
+
+with open(settings.reference) as seq_fh:
+    for seq_record in SeqIO.parse(seq_fh, 'fasta'):
+        reference_name = seq_record.name
+        reference_seq = seq_record.seq
+        break
+
+with open(settings.distances) as fh:
+    reader = csv.reader(fh, delimiter=',')
+    next(reader)
+    seqs_to_filter = set()
+    for line in reader:
+        if line[1] not in seqs_to_filter:
+            seqs_to_filter.add(line[1])
+    if reference_name in seqs_to_filter:
+        seqs_to_filter.remove(reference_name)
+
+with open(settings.reads, "a+") as fh:
+    seqs_filtered = list()
+    for seq_record in SeqIO.parse(settings.clusters, "fasta"):
+        if seq_record.name not in seqs_to_filter:
+            if seq_record.name == reference_name:
+                if seq_record.name not in seqs_filtered:
+                    seqs_filtered.append(seq_record.name)
+                else:
+                    continue
+    if reference_name not in seqs_filtered:
+        fh.write('\n>REFERENCE\n%s' % reference_seq)