annotate utils/preprocess.py @ 0:b856d3d95413 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
author iuc
date Mon, 09 Jan 2023 13:27:09 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
1 #!/usr/bin/env python
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
2 # -*- coding: utf-8 -*-
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
3 # Credits: Grigorii Sukhorukov, Macha Nikolski
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
4
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
5 import math
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
6 import os
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
7 import pathlib
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
8 import random
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
9
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
10 import h5py
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
11 import numpy as np
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
12 from Bio import SeqIO
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
13 from Bio.Seq import Seq
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
14 from Bio.SeqRecord import SeqRecord
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
15 from sklearn.utils import shuffle
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
16
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
17
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
def reverse_complement(fragment):
    """
    returns the reverse complement of a nucleotide string
    Input:
        fragment - string of nucleotides
    Output:
        the string read backwards with A<->T and C<->G swapped
        (letter case is preserved); every other character,
        e.g. '-' or 'N', is kept as is
    """
    # lowercase bases are mapped as well, otherwise they would be
    # reversed but silently left uncomplemented
    complement = str.maketrans('ACGTacgt', 'TGCAtgca')
    return fragment[::-1].translate(complement)
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
36
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
37
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
def introduce_mutations(seqs, mut_rate, rs=None):
    """
    Builds mutated copies of the given sequence records.
    For every record int(mut_rate * length) distinct positions are drawn
    at random; a drawn position holding A, C, T or G is replaced by one
    of the three other nucleotides, any other character (e.g. a gap)
    is left untouched.
    Input:
        seqs - iterable of records exposing .seq and .id
        mut_rate - proportion from 0.0 to 1.0, float
        rs - value handed to random.seed (None by default)
    Output:
        list with one new SeqRecord per input record; its id is the
        original id followed by "mut_<mut_rate>"
    """
    # seeding happens before the range check, as callers may rely on
    # the global generator being reseeded on every call
    random.seed(a=rs)
    assert 0.0 <= mut_rate <= 1.0
    nucleotides = ("A", "C", "T", "G")
    result = []
    for record in seqs:
        bases = list(str(record.seq))
        n_bases = len(bases)
        n_mutated = int(mut_rate * n_bases)
        for pos in random.sample(range(n_bases), n_mutated):
            current = bases[pos]
            if current not in nucleotides:
                # gaps / ambiguous symbols are not mutated
                continue
            candidates = [nt for nt in nucleotides if nt != current]
            bases[pos] = random.sample(candidates, 1)[0]
        result.append(
            SeqRecord(
                seq=Seq("".join(bases)),
                id=record.id + f"mut_{mut_rate}",
                name="",
                description="",
            )
        )
    return result
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
66
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
67
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
def separate_by_length(length_, seq_list, fold=None):
    """
    Splits sequence records into two lists according to their length.
    Input:
        length_ - minimal length; shorter records are dropped and counted
        seq_list - iterable of records exposing .seq
        fold - optional multiplier of length_ used as an upper limit
    Output:
        included - records at least length_ long and, when fold is given,
            shorter than length_ * fold
        to_process - records at least length_ long that are not shorter
            than length_ * fold (always empty when fold is None)
    The number of dropped records is printed.
    """
    kept = []
    oversized = []
    excluded = 0
    for record in seq_list:
        size = len(record.seq)
        if size >= length_:
            fits = fold is None or size < length_ * fold
            target = kept if fits else oversized
            target.append(record)
        else:
            excluded += 1
    print(f"A total of {excluded} sequences was excluded due to being smaller than {length_}")
    return kept, oversized
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
86
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
87
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
def chunks(lst, n):
    """Lazily split lst into consecutive slices of length n.

    The last slice is shorter when len(lst) is not a multiple of n.
    https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks"""
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
93
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
94
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
def correct(frag):
    """
    uppercases a fragment and masks everything that is not
    unambiguous DNA code (ACTG-) with '-'
    Input:
        frag - string of nucleotides
    Output:
        uppercased string in which each character outside ACTG-
        has been replaced by '-'
    """
    allowed = {"A", "C", "G", "T", "-"}
    upper_frag = frag.upper()
    return "".join(base if base in allowed else "-" for base in upper_frag)
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
109
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
110
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
111 def fragmenting(sequences, sl_wind_size, max_gap=0.05, sl_wind_step=None):
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
112 """
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
113 slices sequences in fragments by sliding window
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
114 based on its size and step.
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
115 last fragment is padded by '-'
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
116 fragments have ambiguous bases replaced by '-'
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
117 fragments with many '-' are discarded
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
118 Input:
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
119 sequences - list with SeqRecord sequences in fasta format
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
120 max_gap - max allowed proportion of '-'
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
121 sl_wind_size - sliding window step
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
122 sl_wind_step - sliding window step, by default equals
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
123 sliding window size (None is replaced by it)
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
124 Output:
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
125 fragments - list with sequence fragments
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
126 """
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
127 if sl_wind_step is None:
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
128 sl_wind_step = sl_wind_size
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
129 fragments = []
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
130 fragments_rc = []
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
131 out_sequences = []
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
132 for sequence in sequences:
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
133 seq = str(sequence.seq)
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
134 n_fragments = 1 + max(0, math.ceil((len(seq) - sl_wind_size) / sl_wind_step))
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
135 for n in range(n_fragments):
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
136 if n + 1 != n_fragments:
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
137 frag = seq[n * sl_wind_step: n * sl_wind_step + sl_wind_size]
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
138 elif n_fragments == 1:
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
139 # padding the shorter fragment to sl_wind_size
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
140 frag_short = seq[n * sl_wind_step: n * sl_wind_step + sl_wind_size]
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
141 frag = frag_short + (sl_wind_size - len(frag_short)) * "-"
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
142 else:
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
143 frag = seq[(len(seq) - sl_wind_size):]
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
144 # replace ambiguous characters
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
145 frag = correct(frag)
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
146 assert len(frag) == sl_wind_size, f"{len(frag)} vs {sl_wind_size}"
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
147 # skipping sequences with many gaps
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
148 if frag.count("-") / sl_wind_size <= max_gap:
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
149 fragments.append(frag)
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
150 # generating reverse complement
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
151 fragments_rc.append(reverse_complement(frag))
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
152 fr_seq = SeqRecord(
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
153 seq=Seq(frag),
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
154 id=f"{sequence.id}_{n*sl_wind_step}_{sl_wind_size}",
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
155 name="",
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
156 description="",
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
157 )
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
158 out_sequences.append(fr_seq)
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
159 return fragments, fragments_rc, out_sequences
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
160
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
161
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
def label_fasta_fragments(sequences, label):
    """
    Appends a label suffix to the id of every fragment record.
    The records are modified in place: the returned list contains the
    same objects that were passed in, in the same order.
    Input:
    sequences - list with SeqRecord sequences
    label - label to append; it is formatted into the id as "_<label>"
    Output:
    labeled_fragments - list with labeled SeqRecord sequences
    """
    def _tag(record):
        # rewrite the id on the record itself, then hand the record back
        record.id = record.id + f"_{label}"
        return record

    return [_tag(record) for record in sequences]
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
177
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
178
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
def one_hot_encode(fragments):
    """
    produces one-hot matrices from fragments
    'A', 'C', 'G', 'T' are mapped to channels 0, 1, 2, 3
    '-' is given all zeros
    Input:
    fragments - list with sequence fragments (strings of equal length)
    Output:
    encoded_fragments - int8 numpy array with one-hot encoded fragments,
    shape (number of fragments, fragment length, 4)
    Raises:
    ValueError - if a fragment contains a symbol other than A, C, G, T, '-';
    also raised by np.stack when fragments is empty or the fragments
    differ in length
    """
    map_dict = {"A": 0, "C": 1, "G": 2, "T": 3, "-": -1}
    # Rows 0..3 are the identity matrix; the extra last row is all zeros, so
    # the gap code -1 (numpy's "last row") selects the all-zero vector.
    lookup = np.zeros((5, 4), dtype=np.int8)
    lookup[:4] = np.eye(4, dtype=np.int8)
    encoded_fragments = []
    for frag in fragments:
        try:
            integer_encoded = np.array(
                [map_dict[symbol] for symbol in frag], dtype=np.intp
            )
        except KeyError as err:
            raise ValueError(
                f"unexpected symbol {err.args[0]!r} in fragment; "
                "expected one of A, C, G, T, -"
            ) from err
        # integer-array indexing picks one lookup row per position
        encoded_fragments.append(lookup[integer_encoded])
    return np.stack(encoded_fragments)
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
201
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
202
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
def prepare_labels(fragments, label, label_depth):
    """
    produces one-hot labels
    every fragment receives the same label
    Input:
    fragments - list with sequence fragments (only its length is used)
    label - type of label (int, 0 <= label < label_depth)
    label_depth - number of possible labels
    Output:
    labels - float32 numpy array with one-hot encoded labels,
    shape (number of fragments, label_depth)
    Raises:
    ValueError - if label is outside the range [0, label_depth)
    """
    # An out-of-range label cannot be represented as a one-hot row; fail
    # loudly instead of returning rows that do not encode the label.
    if not 0 <= label < label_depth:
        raise ValueError(
            f"label {label} is outside the valid range [0, {label_depth})"
        )
    n_fragments = len(fragments)
    labels = np.zeros((n_fragments, label_depth), dtype=np.float32)
    # all rows are identical: a single 1 in the column of the label
    labels[:, int(label)] = 1.0
    return labels
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
219
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
220
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
221 # TODO: write docs for functions
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
def calculate_total_length(seq_path):
    """
    Calculate total length of the sequences in the fasta file.
    Needed for weighted sampling
    Input:
    seq_path - path to the file with sequences
    Output:
    seq_length - total length of all sequences in the file
    (0 if the file holds no records)
    """
    # SeqIO.parse yields records lazily; summing over the iterator avoids
    # keeping every record of the file in memory at once.
    return sum(len(record.seq) for record in SeqIO.parse(seq_path, "fasta"))
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
236
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
237
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
def prepare_seq_lists(in_paths, n_fragments, weights=None,):
    """
    selects files with sequences based on extension
    and calculates number of fragments to be sampled
    Input:
    in_paths - list of paths to folders with sequence files.
    Can also be a single path (string or path-like object) pointing
    either to a folder or to one .fna/.fasta file
    n_fragments - number of fragments to be sampled
    weights - upsampling of fragments, one fraction per folder.
    fractions should sum to one. Folders are weighted equally if omitted
    Output:
    list of (path to file with sequences, number of fragments to be sampled)
    pairs; the number of fragments is a float
    lists are zipped to work with ray iterators
    Raises:
    ValueError - if weights does not match in_paths in length
    or the weights do not sum to one
    """
    fasta_extensions = ('.fna', '.fasta')
    # a lone path may arrive as a string or as a path-like object
    if isinstance(in_paths, (str, os.PathLike)):
        single_path = os.fspath(in_paths)
        # case when we receive a single sequence file
        if single_path.endswith(fasta_extensions):
            return [[single_path, n_fragments]]
        # otherwise it is a single folder: transform it to a list
        in_paths = [single_path]

    if weights:
        if len(weights) != len(in_paths):
            raise ValueError(
                f"got {len(weights)} weights for {len(in_paths)} input paths"
            )
        if not 1.01 > round(sum(weights), 2) > 0.99:
            raise ValueError(f"weights should sum to one, got {sum(weights)}")
    else:
        n_paths = len(in_paths)
        # equal share per folder; nothing to weight when there are no folders
        weights = [1 / n_paths] * n_paths if n_paths else []

    n_fragments_list_all = []
    seqs_list_all = []
    for dir_path, w_ in zip(in_paths, weights):
        seqs_list = []
        seq_length_list = []
        for file in os.listdir(dir_path):
            if file.endswith(fasta_extensions):
                seq_path = os.path.join(dir_path, file)
                seqs_list.append(seq_path)
                seq_length_list.append(calculate_total_length(seq_path))
        total_length = sum(seq_length_list)
        # each file gets a share of this folder's fragments proportional to
        # its total sequence length
        # + 1 may lead to a slightly bigger number than desired
        n_fragments_list = [
            (seq_length / total_length) * n_fragments * w_ + 1
            for seq_length in seq_length_list
        ]
        n_fragments_list_all.extend(n_fragments_list)
        seqs_list_all.extend(seqs_list)
    print("list calculation done")
    return list(zip(seqs_list_all, n_fragments_list_all))
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
284
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
285
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
def sample_fragments(seq_container, length, random_seed=1, limit=None, max_gap=0.05, sl_wind_step=None):
    """
    Randomly samples fragments from sequences in the list.
    Is a bit cumbersome written to work with ray.
    Input:
    seq_container - iterable of entries; entry[0] is the path to a fasta file,
        entry[1] is the number of fragments to sample from that file
    length - desired length of sampled fragments
    random_seed - value passed to random.seed(); this reseeds the module-level
        random generator on every call
    limit - when truthy, at most limit * entry[1] sampling attempts are made
        for each entry; when falsy the number of attempts is unbounded
    max_gap, sl_wind_step - forwarded unchanged to fragmenting()
    Output:
    fragments - list with sequence fragments
    fragments_rc - list with the second output of fragmenting() for the same
        samples (presumably reverse complements - confirm in fragmenting())
    seqs - list with the SeqRecord objects the fragments were cut from
    Raises:
    AssertionError - if limit is set and the attempt budget of an entry is
        exhausted before enough fragments were sampled
    """
    random.seed(a=random_seed)
    total_fragments = []
    total_fragments_rc = []
    total_seqs = []
    for entry in seq_container:
        records = list(SeqIO.parse(entry[0], "fasta"))
        n_fragments = entry[1]
        seqs = []
        fragments = []
        fragments_rc = []
        n_sampled = 0
        n_attempts = 0
        while n_sampled < n_fragments:
            # select chromosomes if there are any
            fragment_full = random.choice(records)
            r_end = len(fragment_full.seq) - length
            try:
                # randrange raises ValueError when r_end <= 0, i.e. when the
                # record is not longer than `length`; such draws are skipped
                r_start = random.randrange(r_end)
                fragment = SeqRecord(
                    seq=fragment_full.seq[r_start:(r_start + length)],
                    id=f"{fragment_full.id}_{length}_{r_start}",
                    name="",
                    description="",
                )
                temp_, temp_rc, _ = fragmenting([fragment], length, max_gap, sl_wind_step=sl_wind_step)
                # only samples for which fragmenting() returned both outputs are counted
                if temp_ and temp_rc:
                    seqs.append(fragment)
                    fragments.extend(temp_)
                    fragments_rc.extend(temp_rc)
                    n_sampled += 1
            except ValueError:
                # NOTE(review): this also hides a ValueError raised by
                # SeqRecord() or fragmenting(); kept to preserve behaviour
                pass
            n_attempts += 1
            # Explicit raise instead of `assert`: assert statements are removed
            # under `python -O`, which would turn this guard into an endless loop.
            if limit and n_attempts > limit * n_fragments:
                raise AssertionError(
                    f"While cycle iterated more than {limit}, data is ambiguous."
                    f" Only {len(fragments)} fragments were sampled out of {n_fragments}"
                )
        total_fragments.extend(fragments)
        total_fragments_rc.extend(fragments_rc)
        total_seqs.extend(seqs)
    return total_fragments, total_fragments_rc, total_seqs
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
339
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
340
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
def prepare_ds_fragmenting(in_seq, label, label_int, fragment_length, sl_wind_step, max_gap=0.05, n_cpus=1):
    """
    Cuts all sequences of one fasta file into fragments, encodes them and
    keeps only the unique fragments.
    Input:
    in_seq - path (or handle) of the fasta file, read with SeqIO.parse
    label - label passed to label_fasta_fragments() and used in the log line
    label_int - label passed to prepare_labels()
    fragment_length - fragment length passed to fragmenting()
    sl_wind_step - sliding window step; None means int(fragment_length / 2)
    max_gap - forwarded to fragmenting()
    n_cpus - not used by this function
    Output:
    unique encoded fragments, the matching rows of the encoded second
    fragmenting() output, matching labels, matching labelled sequences,
    number of unique fragments
    """
    step = int(fragment_length / 2) if sl_wind_step is None else sl_wind_step
    # generating viral fragments and labels
    records = list(SeqIO.parse(in_seq, "fasta"))
    frags, frags_rc, frag_records = fragmenting(records, fragment_length, max_gap=max_gap, sl_wind_step=step)
    encoded = one_hot_encode(frags)
    encoded_rc = one_hot_encode(frags_rc)
    labs = prepare_labels(frags, label=label_int, label_depth=2)
    frag_records = label_fasta_fragments(frag_records, label=label)
    # subsetting to unique fragments: `keep` holds the index of the first
    # occurrence of every unique row, so all outputs stay aligned
    u_encoded, keep = np.unique(encoded, axis=0, return_index=True)
    u_encoded_rc = encoded_rc[keep]
    u_labs = labs[keep]
    u_seqs = [frag_records[idx] for idx in keep]
    n_frags = np.shape(u_encoded)[0]
    assert n_frags == np.shape(u_encoded_rc)[0]
    print(f"Encoding {label} sequences finished")
    return u_encoded, u_encoded_rc, u_labs, u_seqs, n_frags
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
361
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
362
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
def prepare_ds_sampling(in_seqs, fragment_length, n_frags, label, label_int, random_seed, n_cpus=1, limit=100):
    """
    Samples random fragments from a set of sequence files, shuffles and
    encodes them.
    Input:
    in_seqs - passed to prepare_seq_lists() together with n_frags
    fragment_length - length of the sampled fragments
    n_frags - number of fragments kept after shuffling (int(n_frags))
    label - label passed to label_fasta_fragments() and used in the log line
    label_int - label passed to prepare_labels()
    random_seed - seed used for both sampling and shuffling
    n_cpus - not used by this function
    limit - attempt limit forwarded to sample_fragments()
    Output:
    encoded fragments, encoded second sample_fragments() output, labels,
    labelled sequences, n_frags as it was passed in
    """
    # generating plant fragments and labels
    sampling_plan = prepare_seq_lists(in_seqs, n_frags)
    sampled, sampled_rc, sampled_records = sample_fragments(
        sampling_plan, fragment_length, random_seed, limit=limit, max_gap=0.05
    )
    # the three lists are shuffled together and cut to n_frags entries
    sampled, sampled_rc, sampled_records = shuffle(
        sampled, sampled_rc, sampled_records, random_state=random_seed, n_samples=int(n_frags)
    )
    encoded = one_hot_encode(sampled)
    encoded_rc = one_hot_encode(sampled_rc)
    labs = prepare_labels(sampled, label=label_int, label_depth=2)
    sampled_records = label_fasta_fragments(sampled_records, label=label)
    assert np.shape(encoded)[0] == np.shape(encoded_rc)[0]
    print(f"Encoding {label} sequences finished")
    return encoded, encoded_rc, labs, sampled_records, n_frags
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
376
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
377
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
def storing_encoded(encoded, encoded_rc, labs, out_path):
    """
    Writes encoded fragments and their labels into an HDF5 file.
    Input:
    encoded - data stored as dataset "fragments"
    encoded_rc - data stored as dataset "fragments_rc"
    labs - data stored as dataset "labels"
    out_path - path of the output file; it is opened in "w" mode, so an
        existing file at that path is overwritten
    """
    # the context manager closes the file even if a create_dataset call fails
    with h5py.File(out_path, "w") as f:
        f.create_dataset("fragments", data=encoded)
        f.create_dataset("fragments_rc", data=encoded_rc)
        f.create_dataset("labels", data=labs)