Mercurial > repos > iuc > decontaminator
annotate utils/preprocess.py @ 0:b856d3d95413 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
author | iuc |
---|---|
date | Mon, 09 Jan 2023 13:27:09 +0000 |
parents | |
children |
rev | line source |
---|---|
0
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
1 #!/usr/bin/env python |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
2 # -*- coding: utf-8 -*- |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
3 # Credits: Grigorii Sukhorukov, Macha Nikolski |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
4 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
5 import math |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
6 import os |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
7 import pathlib |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
8 import random |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
9 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
10 import h5py |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
11 import numpy as np |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
12 from Bio import SeqIO |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
13 from Bio.Seq import Seq |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
14 from Bio.SeqRecord import SeqRecord |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
15 from sklearn.utils import shuffle |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
16 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
17 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
# Complement lookup for the unambiguous DNA bases (both cases), built once.
_COMPLEMENT_TABLE = str.maketrans("ACGTacgt", "TGCAtgca")


def reverse_complement(fragment):
    """
    provides the reverse complement of a nucleotide string
    The string is reversed and every A, C, G, T (upper or lower case)
    is swapped for its complementary base. Any other character
    (e.g. the '-' gap symbol or 'N') is kept unchanged.
    Input:
    fragment - string of nucleotides
    Output:
    reverse complemented string of the same length
    """
    return fragment[::-1].translate(_COMPLEMENT_TABLE)
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
36 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
37 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
def introduce_mutations(seqs, mut_rate, rs=None):
    """
    Builds mutated copies of the given sequence records.
    For each record int(mut_rate * length) distinct positions are drawn
    at random; the character at each drawn position is replaced by a
    randomly picked nucleotide that differs from the current one.
    A position holding anything other than A, C, T or G (a gap, for
    instance) is replaced by any of the four nucleotides.
    Input:
    seqs - iterable of records exposing .seq and .id
    mut_rate - proportion from 0.0 to 1.0, float
    rs - value handed to random.seed; note this reseeds the
    module-level generator of the random module
    Output:
    list of new SeqRecord objects whose id is the original id followed
    by "mut_<mut_rate>", with empty name and description
    """
    random.seed(a=rs)
    assert 0.0 <= mut_rate <= 1.0
    nucleotides = ("A", "C", "T", "G")
    result = []
    for record in seqs:
        bases = list(str(record.seq))
        n_bases = len(bases)
        n_mutations = int(mut_rate * n_bases)
        for pos in random.sample(range(n_bases), n_mutations):
            current = bases[pos]
            # every nucleotide except the one already present at this site
            candidates = [nt for nt in nucleotides if nt != current]
            bases[pos] = random.sample(candidates, 1)[0]
        result.append(
            SeqRecord(
                seq=Seq("".join(bases)),
                id=record.id + f"mut_{mut_rate}",
                name="",
                description="",
            )
        )
    return result
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
66 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
67 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
def separate_by_length(length_, seq_list, fold=None):
    """
    Sorts sequence records into two lists according to their length.
    Records shorter than length_ are dropped; how many were dropped
    is printed once at the end.
    Input:
    length_ - minimal sequence length to keep
    seq_list - iterable of records exposing .seq
    fold - optional multiplier; when given, records whose length is
    at least length_ * fold are put aside instead of being included
    Output:
    included - kept records (all of them when fold is None, otherwise
    only those shorter than length_ * fold)
    to_process - kept records with length of at least length_ * fold
    (always empty when fold is None)
    """
    kept, oversized = [], []
    n_short = 0
    for record in seq_list:
        size = len(record.seq)
        if size < length_:
            n_short += 1
            continue
        if fold is None or size < length_ * fold:
            kept.append(record)
        else:
            oversized.append(record)
    print(f"A total of {n_short} sequences was excluded due to being smaller than {length_}")
    return kept, oversized
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
86 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
87 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
def chunks(lst, n):
    """Lazily produce consecutive slices of lst, each holding n items.
    The last slice is shorter when len(lst) is not a multiple of n.
    https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks"""
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
93 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
94 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
def correct(frag):
    """
    upper-cases the fragment and masks everything that is not
    unambiguous DNA code (ACTG-) with '-'
    Input:
    frag - string of nucleotides
    Output:
    upper-cased string in which every character other than
    A, C, G, T or '-' has been replaced by '-'
    """
    allowed = {"A", "C", "G", "T", "-"}
    upper = frag.upper()
    # map each distinct disallowed character to the gap symbol
    masking = {ord(symbol): "-" for symbol in set(upper) - allowed}
    return upper.translate(masking)
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
109 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
110 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
111 def fragmenting(sequences, sl_wind_size, max_gap=0.05, sl_wind_step=None): |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
112 """ |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
113 slices sequences in fragments by sliding window |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
114 based on its size and step. |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
115 last fragment is padded by '-' |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
116 fragments have ambiguous bases replaced by '-' |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
117 fragments with many '-' are discarded |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
118 Input: |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
119 sequences - list with SeqRecord sequences in fasta format |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
120 max_gap - max allowed proportion of '-' |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
121 sl_wind_size - sliding window step |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
122 sl_wind_step - sliding window step, by default equals |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
123 sliding window size (None is replaced by it) |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
124 Output: |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
125 fragments - list with sequence fragments |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
126 """ |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
127 if sl_wind_step is None: |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
128 sl_wind_step = sl_wind_size |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
129 fragments = [] |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
130 fragments_rc = [] |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
131 out_sequences = [] |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
132 for sequence in sequences: |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
133 seq = str(sequence.seq) |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
134 n_fragments = 1 + max(0, math.ceil((len(seq) - sl_wind_size) / sl_wind_step)) |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
135 for n in range(n_fragments): |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
136 if n + 1 != n_fragments: |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
137 frag = seq[n * sl_wind_step: n * sl_wind_step + sl_wind_size] |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
138 elif n_fragments == 1: |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
139 # padding the shorter fragment to sl_wind_size |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
140 frag_short = seq[n * sl_wind_step: n * sl_wind_step + sl_wind_size] |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
141 frag = frag_short + (sl_wind_size - len(frag_short)) * "-" |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
142 else: |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
143 frag = seq[(len(seq) - sl_wind_size):] |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
144 # replace ambiguous characters |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
145 frag = correct(frag) |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
146 assert len(frag) == sl_wind_size, f"{len(frag)} vs {sl_wind_size}" |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
147 # skipping sequences with many gaps |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
148 if frag.count("-") / sl_wind_size <= max_gap: |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
149 fragments.append(frag) |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
150 # generating reverse complement |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
151 fragments_rc.append(reverse_complement(frag)) |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
152 fr_seq = SeqRecord( |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
153 seq=Seq(frag), |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
154 id=f"{sequence.id}_{n*sl_wind_step}_{sl_wind_size}", |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
155 name="", |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
156 description="", |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
157 ) |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
158 out_sequences.append(fr_seq) |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
159 return fragments, fragments_rc, out_sequences |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
160 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
161 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
def label_fasta_fragments(sequences, label):
    """
    Appends a label suffix to the id of every fragment record.
    The records are modified in place; the returned list holds the very
    same objects, in the same order.
    Input:
    sequences - iterable with SeqRecord-like objects (anything with a str `id`)
    label - label to append, formatted as "_{label}" (e.g. bacteria, virus, plant)
    Output:
    labeled_fragments - list with the relabeled records
    """
    # NOTE: the label value is not validated here.
    def _tag(record):
        record.id = record.id + f"_{label}"
        return record

    return [_tag(record) for record in sequences]
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
177 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
178 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
def one_hot_encode(fragments):
    """
    Produces one-hot matrices from sequence fragments.
    'A', 'C', 'G', 'T' are mapped to channels 0, 1, 2, 3;
    '-' is given all zeros.
    Input:
    fragments - list with sequence fragments (strings of equal length)
    Output:
    encoded_fragments - int8 numpy array of shape
    (number of fragments, fragment length, 4)
    Raises:
    ValueError - if a fragment contains a character other than A, C, G, T, -
    or (raised by np.stack) if the list is empty or the
    fragments differ in length
    """
    map_dict = {"A": 0, "C": 1, "G": 2, "T": 3, "-": -1}
    # Rows 0-3 are the one-hot vectors of A, C, G, T. The extra last row is
    # all zeros, so the gap code -1 selects it through negative indexing.
    one_hot_rows = np.zeros((5, 4), dtype=np.int8)
    one_hot_rows[:4] = np.eye(4, dtype=np.int8)
    encoded_fragments = []
    for frag in fragments:
        try:
            integer_encoded = np.array(
                [map_dict[base] for base in frag], dtype=np.int8
            )
        except KeyError as err:
            raise ValueError(
                f"unexpected character {err.args[0]!r} in fragment"
            ) from err
        encoded_fragments.append(one_hot_rows[integer_encoded])
    return np.stack(encoded_fragments)
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
201 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
202 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
def prepare_labels(fragments, label, label_depth):
    """
    Produces one-hot labels, one identical row per fragment.
    Input:
    fragments - list with sequence fragments (only its length is used)
    label - class index of the fragments (0 <= label < label_depth);
    an index outside that range yields all-zero rows
    label_depth - number of possible labels
    Output:
    labels - float32 numpy array of shape (number of fragments, label_depth)
    """
    n_fragments = len(fragments)
    labels = np.zeros((n_fragments, label_depth), dtype=np.float32)
    class_index = int(label)
    # Out-of-range labels are left as all-zero rows instead of raising.
    if 0 <= class_index < label_depth:
        labels[:, class_index] = 1.0
    return labels
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
219 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
220 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
221 # TODO: write docs for functions |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
def calculate_total_length(seq_path):
    """
    Calculate total length of the sequences in the fasta file.
    Needed for weighted sampling
    Input:
    seq_path - path to the file with sequences
    Output:
    seq_length - total length of all sequences in the file
    (0 if the file holds no records)
    """
    # Stream the records instead of loading the whole file into a list:
    # only the running sum of lengths is needed.
    return sum(len(record.seq) for record in SeqIO.parse(seq_path, "fasta"))
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
236 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
237 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
def prepare_seq_lists(in_paths, n_fragments, weights=None):
    """
    Selects files with sequences based on extension
    and calculates number of fragments to be sampled
    Input:
    in_paths - list of paths to folders with sequence files.
    Can also be a single path (string or path object) to one folder,
    or to one sequence file ending with .fna or .fasta
    n_fragments - number of fragments to be sampled
    weights - upsampling of fragments, one fraction per folder.
    Fractions should sum to one; folders are weighted equally if omitted
    Output:
    list of (path to sequence file, number of fragments to sample) pairs;
    for a single sequence file the list is [[in_paths, n_fragments]].
    The lists are zipped to work with ray iterators
    Raises:
    AssertionError - if weights do not match the folders or do not sum to one
    ZeroDivisionError - if the sequence files of a folder are all empty
    """
    is_single_path = isinstance(in_paths, (str, os.PathLike))
    # case when we receive a single sequence file
    if is_single_path and os.fspath(in_paths).endswith((".fna", ".fasta")):
        return [[in_paths, n_fragments]]

    # a single folder is handled as a one-element list of folders
    folders = [in_paths] if is_single_path else list(in_paths)

    if weights:
        assert len(weights) == len(folders), "one weight per folder is required"
        assert 1.01 > round(sum(weights), 2) > 0.99, "weights should sum to one"
    else:
        weights = [1 / len(folders)] * len(folders)

    n_fragments_list_all = []
    seqs_list_all = []
    for folder, w_ in zip(folders, weights):
        seqs_list = []
        seq_length_list = []
        for file in os.listdir(folder):
            # the suffix is matched without the dot, as before
            if file.endswith(("fna", "fasta")):
                seq_path = os.path.join(folder, file)
                seqs_list.append(seq_path)
                seq_length_list.append(calculate_total_length(seq_path))
        total_length = sum(seq_length_list)
        # every file gets a share proportional to its sequence length;
        # + 1 may lead to a slightly bigger number than desired
        n_fragments_list_all.extend(
            (seq_length / total_length) * n_fragments * w_ + 1
            for seq_length in seq_length_list
        )
        seqs_list_all.extend(seqs_list)
    print("list calculation done")
    return list(zip(seqs_list_all, n_fragments_list_all))
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
284 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
285 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
def sample_fragments(seq_container, length, random_seed=1, limit=None, max_gap=0.05, sl_wind_step=None):
    """
    Randomly samples fragments from sequences in the list.
    Is written in a somewhat cumbersome way to work with ray.
    Input:
    seq_container - iterable of entries; entry[0] is the path to a FASTA file,
        entry[1] is the number of fragments to sample from that file
    length - desired length of sampled fragments
    random_seed - seed given to random.seed() before any sampling is done
    limit - optional cap on sampling attempts: at most limit * n_fragments
        attempts are made per file; a falsy value (None, 0) disables the cap
    max_gap, sl_wind_step - forwarded unchanged to fragmenting()
    Output:
    fragments - list accumulated from the first output of fragmenting()
    fragments_rc - list accumulated from the second output of fragmenting()
    seqs - list with the sampled SeqRecord windows that were accepted
    Raises:
    AssertionError - if fragments are requested from a file that has no
        record longer than `length`, or if the attempt cap is exceeded
    """
    random.seed(a=random_seed)
    total_fragments = []
    total_fragments_rc = []
    total_seqs = []
    for entry in seq_container:
        records = list(SeqIO.parse(entry[0], "fasta"))
        n_fragments = entry[1]
        # A window can only be drawn from a record strictly longer than
        # `length` (randrange needs a positive bound).  Without such a record
        # the loop below could never succeed: it would spin forever when no
        # limit is set, or die in random.choice() on a file with no records.
        # AssertionError is kept as the failure type of this function, but is
        # raised explicitly so the check is not stripped by `python -O`.
        if n_fragments > 0 and not any(len(record.seq) > length for record in records):
            raise AssertionError(
                f"No sequence in {entry[0]} is longer than {length}, "
                f"so none of the {n_fragments} requested fragments can be sampled."
            )
        seqs = []
        fragments = []
        fragments_rc = []
        n_sampled = 0
        n_attempts = 0
        while n_sampled < n_fragments:
            # select chromosomes if there are any
            fragment_full = random.choice(records)
            r_end = len(fragment_full.seq) - length
            # Records that are too short are skipped; the attempt is still
            # counted below.  Testing r_end instead of catching ValueError
            # keeps errors raised by SeqRecord()/fragmenting() visible.
            if r_end > 0:
                r_start = random.randrange(r_end)
                fragment = SeqRecord(
                    seq=fragment_full.seq[r_start:(r_start + length)],
                    id=f"{fragment_full.id}_{length}_{r_start}",
                    name="",
                    description="",
                )
                temp_, temp_rc, _ = fragmenting([fragment], length, max_gap, sl_wind_step=sl_wind_step)
                # only windows for which fragmenting() returned something count
                if temp_ and temp_rc:
                    seqs.append(fragment)
                    fragments.extend(temp_)
                    fragments_rc.extend(temp_rc)
                    n_sampled += 1
            n_attempts += 1
            if limit and n_attempts > limit * n_fragments:
                # explicit raise instead of `assert` so the cap survives -O
                raise AssertionError(
                    f"While cycle iterated more than {limit}, data is ambiguous."
                    f" Only {len(fragments)} fragments were sampled out of {n_fragments}"
                )
        total_fragments.extend(fragments)
        total_fragments_rc.extend(fragments_rc)
        total_seqs.extend(seqs)
    return total_fragments, total_fragments_rc, total_seqs
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
339 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
340 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
def prepare_ds_fragmenting(in_seq, label, label_int, fragment_length, sl_wind_step, max_gap=0.05, n_cpus=1):
    """
    Fragments all sequences of a FASTA file, encodes the fragments and keeps
    only those whose forward encoding is unique.
    Input:
    in_seq - FASTA file given to SeqIO.parse
    label - label given to label_fasta_fragments and shown in the progress message
    label_int - label given to prepare_labels
    fragment_length - window size given to fragmenting
    sl_wind_step - window step given to fragmenting;
        None is replaced by int(fragment_length / 2)
    max_gap - forwarded to fragmenting
    n_cpus - not used by this function
    Output:
    unique encoded fragments, the "_rc" encodings, labels and labelled
    sequences at the same positions, and the number of unique fragments
    """
    step = int(fragment_length / 2) if sl_wind_step is None else sl_wind_step
    # generating viral fragments and labels
    records = list(SeqIO.parse(in_seq, "fasta"))
    fwd_frags, rc_frags, frag_seqs = fragmenting(records, fragment_length, max_gap=max_gap, sl_wind_step=step)
    fwd_encoded = one_hot_encode(fwd_frags)
    rc_encoded = one_hot_encode(rc_frags)
    all_labels = prepare_labels(fwd_frags, label=label_int, label_depth=2)
    frag_seqs = label_fasta_fragments(frag_seqs, label=label)
    # np.unique reports the index of the first occurrence of every distinct
    # forward encoding; the same indices select the companion entries
    unique_fwd, first_seen = np.unique(fwd_encoded, axis=0, return_index=True)
    unique_rc = rc_encoded[first_seen]
    unique_labels = all_labels[first_seen]
    unique_seqs = [frag_seqs[idx] for idx in first_seen]
    n_unique = np.shape(unique_fwd)[0]
    assert (n_unique == np.shape(unique_rc)[0])
    print(f"Encoding {label} sequences finished")
    return unique_fwd, unique_rc, unique_labels, unique_seqs, n_unique
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
361 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
362 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
def prepare_ds_sampling(in_seqs, fragment_length, n_frags, label, label_int, random_seed, n_cpus=1, limit=100):
    """
    Samples random fragments from a set of sequences, draws n_frags of them
    with shuffle() and encodes the result.
    Input:
    in_seqs - input given to prepare_seq_lists
    fragment_length - length of the fragments to sample
    n_frags - number of fragments to keep; also given to prepare_seq_lists
    label - label given to label_fasta_fragments and shown in the progress message
    label_int - label given to prepare_labels
    random_seed - seed used for both sampling and shuffling
    n_cpus - not used by this function
    limit - forwarded to sample_fragments
    Output:
    encoded fragments, encoded "_rc" fragments, labels, labelled sequences
    and n_frags exactly as it was passed in
    """
    # generating plant fragments and labels
    sampling_plan = prepare_seq_lists(in_seqs, n_frags)
    sampled = sample_fragments(sampling_plan, fragment_length, random_seed, limit=limit, max_gap=0.05)
    # the three lists are shuffled together so that their entries stay aligned
    picked, picked_rc, picked_seqs = shuffle(*sampled, random_state=random_seed, n_samples=int(n_frags))
    fwd_encoded = one_hot_encode(picked)
    rc_encoded = one_hot_encode(picked_rc)
    labels = prepare_labels(picked, label=label_int, label_depth=2)
    labelled_seqs = label_fasta_fragments(picked_seqs, label=label)
    assert (np.shape(fwd_encoded)[0] == np.shape(rc_encoded)[0])
    print(f"Encoding {label} sequences finished")
    return fwd_encoded, rc_encoded, labels, labelled_seqs, n_frags
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
376 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
377 |
b856d3d95413
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff
changeset
|
def storing_encoded(encoded, encoded_rc, labs, out_path):
    """
    Stores encoded fragments and their labels in an HDF5 file.
    Input:
    encoded - data written to the "fragments" dataset
    encoded_rc - data written to the "fragments_rc" dataset
    labs - data written to the "labels" dataset
    out_path - path of the file to write; it is opened in "w" mode
    """
    # the context manager closes the file even when writing a dataset fails
    with h5py.File(out_path, "w") as f:
        f.create_dataset("fragments", data=encoded)
        f.create_dataset("fragments_rc", data=encoded_rc)
        f.create_dataset("labels", data=labs)