Mercurial > repos > iuc > tn93_readreduce
annotate tn93_filter.py @ 3:c176164dc8a5 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit e9f254ea1c6712a96512cae4df91bfec8207a492
author | iuc |
---|---|
date | Sat, 28 Sep 2024 16:34:19 +0000 |
parents | 1d2ec0b0a0a7 |
children |
rev | line source |
---|---|
1
84849140a3bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit eec640a7c26b728f8175885926fe368b0756d9e5"
iuc
parents:
diff
changeset
|
1 import argparse |
84849140a3bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit eec640a7c26b728f8175885926fe368b0756d9e5"
iuc
parents:
diff
changeset
|
2 import csv |
2
1d2ec0b0a0a7
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit 98c0d716cbd1237ae735ce83e0153ee246abd5d8"
iuc
parents:
1
diff
changeset
|
3 import random |
1
84849140a3bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit eec640a7c26b728f8175885926fe368b0756d9e5"
iuc
parents:
diff
changeset
|
4 |
84849140a3bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit eec640a7c26b728f8175885926fe368b0756d9e5"
iuc
parents:
diff
changeset
|
5 from Bio import SeqIO |
84849140a3bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit eec640a7c26b728f8175885926fe368b0756d9e5"
iuc
parents:
diff
changeset
|
6 |
84849140a3bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit eec640a7c26b728f8175885926fe368b0756d9e5"
iuc
parents:
diff
changeset
|
# Command-line interface: four options, each a required string argument.
arguments = argparse.ArgumentParser(description='Combine alignments into a single file, adding a reference sequence as well')

arguments.add_argument('-f', '--reference', help='Reference sequence', required=True, type=str)
arguments.add_argument('-d', '--distances', help='Calculated pairwise distances', required=True, type=str)
arguments.add_argument('-r', '--reads', help='Output file for filtered reads', required=True, type=str)
arguments.add_argument('-q', '--clusters', help='Compressed background clusters', required=True, type=str)
settings = arguments.parse_args()

# Defaults for the reference name and sequence; they are overwritten later in
# the script when the reference FASTA file provides a record.
reference_name = 'REFERENCE'
reference_seq = ''
84849140a3bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit eec640a7c26b728f8175885926fe368b0756d9e5"
iuc
parents:
diff
changeset
|
17 |
2
1d2ec0b0a0a7
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit 98c0d716cbd1237ae735ce83e0153ee246abd5d8"
iuc
parents:
1
diff
changeset
|
18 |
1d2ec0b0a0a7
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit 98c0d716cbd1237ae735ce83e0153ee246abd5d8"
iuc
parents:
1
diff
changeset
|
def unique_id(new_id, existing_ids):
    """Return ``new_id``, extended until it is absent from ``existing_ids``.

    While the candidate is already present in ``existing_ids``, an underscore
    followed by ten random lowercase hexadecimal characters is appended to it.
    An id that is not present is returned unchanged.

    ``existing_ids`` is only tested for membership (``in``); it is never
    modified, so the caller is responsible for recording the returned id.
    """
    while new_id in existing_ids:
        new_id += '_' + ''.join(random.choices('0123456789abcdef', k=10))
    return new_id
1d2ec0b0a0a7
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit 98c0d716cbd1237ae735ce83e0153ee246abd5d8"
iuc
parents:
1
diff
changeset
|
23 |
1d2ec0b0a0a7
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit 98c0d716cbd1237ae735ce83e0153ee246abd5d8"
iuc
parents:
1
diff
changeset
|
24 |
1
84849140a3bc
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/tn93/ commit eec640a7c26b728f8175885926fe368b0756d9e5"
iuc
parents:
diff
changeset
|
# Take the name and sequence from the first record of the reference FASTA.
# If the file holds no records, the module-level defaults stay in place.
with open(settings.reference) as seq_fh:
    for seq_record in SeqIO.parse(seq_fh, 'fasta'):
        reference_name = seq_record.name.split(' ')[0]
        reference_seq = seq_record.seq
        break

# Collect the ids found in the second column of the distances CSV.
with open(settings.distances) as fh:
    reader = csv.reader(fh, delimiter=',')
    # Skip the first (header) row; the default keeps an empty file from
    # raising StopIteration.
    next(reader, None)
    seqs_to_filter = set()
    for line in reader:
        # unique_id() returns the id unchanged when it is not in the set yet,
        # and a randomly suffixed variant when it is.
        seqs_to_filter.add(unique_id(line[1], seqs_to_filter))
    # The reference itself must never be filtered out.
    if reference_name in seqs_to_filter:
        seqs_to_filter.remove(reference_name)

# Append ("a+" mode) every cluster sequence that is not marked for filtering
# to the reads file, then add the reference sequence.
with open(settings.reads, "a+") as fh:
    # Names written so far; a set keeps the membership tests constant-time.
    seqs_filtered = set()
    for seq_record in SeqIO.parse(settings.clusters, "fasta"):
        # Skip the reference here; it is written once after the loop.
        if seq_record.name.split(' ')[0] == reference_name:
            continue
        if seq_record.name not in seqs_to_filter:
            # Give repeated names a random suffix so each header is distinct.
            unique_name = unique_id(seq_record.name, seqs_filtered)
            fh.write('\n>%s\n%s' % (unique_name, seq_record.seq))
            seqs_filtered.add(unique_name)
    if reference_name not in seqs_filtered:
        fh.write('\n>REFERENCE\n%s' % reference_seq)