Mercurial > repos > iuc > tn93_cluster
diff tn93_filter.py @ 1:112d80c9ccca draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit 98c0d716cbd1237ae735ce83e0153ee246abd5d8"
author | iuc |
---|---|
date | Wed, 20 Apr 2022 17:00:11 +0000 |
parents | af03f3398f03 |
children |
line wrap: on
line diff
--- a/tn93_filter.py Fri Apr 23 03:04:15 2021 +0000 +++ b/tn93_filter.py Wed Apr 20 17:00:11 2022 +0000 @@ -1,5 +1,6 @@ import argparse import csv +import random from Bio import SeqIO @@ -8,15 +9,22 @@ arguments.add_argument('-f', '--reference', help='Reference sequence', required=True, type=str) arguments.add_argument('-d', '--distances', help='Calculated pairwise distances', required=True, type=str) arguments.add_argument('-r', '--reads', help='Output file for filtered reads', required=True, type=str) -arguments.add_argument('-q', '--clusters', help='Compressed clusters', required=True, type=str) +arguments.add_argument('-q', '--clusters', help='Compressed background clusters', required=True, type=str) settings = arguments.parse_args() reference_name = 'REFERENCE' reference_seq = '' + +def unique_id(new_id, existing_ids): + while new_id in existing_ids: + new_id += '_' + ''.join(random.choices('0123456789abcdef', k=10)) + return new_id + + with open(settings.reference) as seq_fh: for seq_record in SeqIO.parse(seq_fh, 'fasta'): - reference_name = seq_record.name + reference_name = seq_record.name.split(' ')[0] reference_seq = seq_record.seq break @@ -27,17 +35,19 @@ for line in reader: if line[1] not in seqs_to_filter: seqs_to_filter.add(line[1]) + else: + seqs_to_filter.add(unique_id(line[1], seqs_to_filter)) if reference_name in seqs_to_filter: seqs_to_filter.remove(reference_name) with open(settings.reads, "a+") as fh: seqs_filtered = list() for seq_record in SeqIO.parse(settings.clusters, "fasta"): + if seq_record.name.split(' ')[0] == reference_name: + continue if seq_record.name not in seqs_to_filter: - if seq_record.name == reference_name: - if seq_record.name not in seqs_filtered: - seqs_filtered.append(seq_record.name) - else: - continue + unique_name = unique_id(seq_record.name, seqs_filtered) + fh.write('\n>%s\n%s' % (unique_name, seq_record.seq)) + seqs_filtered.append(unique_name) if reference_name not in seqs_filtered: fh.write('\n>REFERENCE\n%s' % reference_seq)