Mercurial > repos > iuc > tn93
comparison tn93_filter.py @ 2:b38f620a3628 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit 98c0d716cbd1237ae735ce83e0153ee246abd5d8"
author | iuc |
---|---|
date | Wed, 20 Apr 2022 16:59:49 +0000 |
parents | 9d793e88e15f |
children |
comparison
equal
deleted
inserted
replaced
1:9d793e88e15f | 2:b38f620a3628 |
---|---|
1 import argparse | 1 import argparse |
2 import csv | 2 import csv |
3 import random | |
3 | 4 |
4 from Bio import SeqIO | 5 from Bio import SeqIO |
5 | 6 |
6 arguments = argparse.ArgumentParser(description='Combine alignments into a single file, adding a reference sequence as well') | 7 arguments = argparse.ArgumentParser(description='Combine alignments into a single file, adding a reference sequence as well') |
7 | 8 |
8 arguments.add_argument('-f', '--reference', help='Reference sequence', required=True, type=str) | 9 arguments.add_argument('-f', '--reference', help='Reference sequence', required=True, type=str) |
9 arguments.add_argument('-d', '--distances', help='Calculated pairwise distances', required=True, type=str) | 10 arguments.add_argument('-d', '--distances', help='Calculated pairwise distances', required=True, type=str) |
10 arguments.add_argument('-r', '--reads', help='Output file for filtered reads', required=True, type=str) | 11 arguments.add_argument('-r', '--reads', help='Output file for filtered reads', required=True, type=str) |
11 arguments.add_argument('-q', '--clusters', help='Compressed clusters', required=True, type=str) | 12 arguments.add_argument('-q', '--clusters', help='Compressed background clusters', required=True, type=str) |
12 settings = arguments.parse_args() | 13 settings = arguments.parse_args() |
13 | 14 |
14 reference_name = 'REFERENCE' | 15 reference_name = 'REFERENCE' |
15 reference_seq = '' | 16 reference_seq = '' |
16 | 17 |
18 | |
19 def unique_id(new_id, existing_ids): | |
20 while new_id in existing_ids: | |
21 new_id += '_' + ''.join(random.choices('0123456789abcdef', k=10)) | |
22 return new_id | |
23 | |
24 | |
17 with open(settings.reference) as seq_fh: | 25 with open(settings.reference) as seq_fh: |
18 for seq_record in SeqIO.parse(seq_fh, 'fasta'): | 26 for seq_record in SeqIO.parse(seq_fh, 'fasta'): |
19 reference_name = seq_record.name | 27 reference_name = seq_record.name.split(' ')[0] |
20 reference_seq = seq_record.seq | 28 reference_seq = seq_record.seq |
21 break | 29 break |
22 | 30 |
23 with open(settings.distances) as fh: | 31 with open(settings.distances) as fh: |
24 reader = csv.reader(fh, delimiter=',') | 32 reader = csv.reader(fh, delimiter=',') |
25 next(reader) | 33 next(reader) |
26 seqs_to_filter = set() | 34 seqs_to_filter = set() |
27 for line in reader: | 35 for line in reader: |
28 if line[1] not in seqs_to_filter: | 36 if line[1] not in seqs_to_filter: |
29 seqs_to_filter.add(line[1]) | 37 seqs_to_filter.add(line[1]) |
38 else: | |
39 seqs_to_filter.add(unique_id(line[1], seqs_to_filter)) | |
30 if reference_name in seqs_to_filter: | 40 if reference_name in seqs_to_filter: |
31 seqs_to_filter.remove(reference_name) | 41 seqs_to_filter.remove(reference_name) |
32 | 42 |
33 with open(settings.reads, "a+") as fh: | 43 with open(settings.reads, "a+") as fh: |
34 seqs_filtered = list() | 44 seqs_filtered = list() |
35 for seq_record in SeqIO.parse(settings.clusters, "fasta"): | 45 for seq_record in SeqIO.parse(settings.clusters, "fasta"): |
46 if seq_record.name.split(' ')[0] == reference_name: | |
47 continue | |
36 if seq_record.name not in seqs_to_filter: | 48 if seq_record.name not in seqs_to_filter: |
37 if seq_record.name == reference_name: | 49 unique_name = unique_id(seq_record.name, seqs_filtered) |
38 if seq_record.name not in seqs_filtered: | 50 fh.write('\n>%s\n%s' % (unique_name, seq_record.seq)) |
39 seqs_filtered.append(seq_record.name) | 51 seqs_filtered.append(unique_name) |
40 else: | |
41 continue | |
42 if reference_name not in seqs_filtered: | 52 if reference_name not in seqs_filtered: |
43 fh.write('\n>REFERENCE\n%s' % reference_seq) | 53 fh.write('\n>REFERENCE\n%s' % reference_seq) |