Mercurial > repos > iuc > virhunter
annotate utils/preprocess.py @ 2:ea2cccb9f73e draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit c3685ed6a70b47012b62b95a2a3db062bd3b7475
author | iuc |
---|---|
date | Thu, 05 Jan 2023 14:27:54 +0000 |
parents | 457fd8fd681a |
children |
rev | line source |
---|---|
0
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
1 #!/usr/bin/env python |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
2 # -*- coding: utf-8 -*- |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
3 # Credits: Grigorii Sukhorukov, Macha Nikolski |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
4 import math |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
5 import os |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
6 import pathlib |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
7 import random |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
8 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
9 import h5py |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
10 import numpy as np |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
11 from Bio import SeqIO |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
12 from Bio.Seq import Seq |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
13 from Bio.SeqRecord import SeqRecord |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
14 from sklearn.utils import shuffle |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
15 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
16 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
# Translation table built once at import time: maps every nucleotide to its
# complement and keeps the case of the input character.
_COMPLEMENT_TABLE = str.maketrans("ACGTacgt", "TGCAtgca")


def reverse_complement(fragment):
    """
    provides reverse complement of a nucleotide string
    Input:
        fragment - string of nucleotides
    Output:
        string of the same length read in the opposite direction with
        A<->T and C<->G swapped (case is preserved); any other character,
        e.g. the gap symbol '-', is kept as is
    """
    return fragment[::-1].translate(_COMPLEMENT_TABLE)
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
35 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
36 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
def introduce_mutations(seqs, mut_rate, rs=None):
    """
    Function that returns mutated copies of the entering sequences
    For every sequence int(mut_rate * length) distinct positions are drawn
    and the character at each of them is replaced by a different nucleotide
    picked among A, C, T, G
    A character that is not one of these four (e.g. a gap) is replaced by
    any of the four; gaps are not treated specially yet
    Input:
        seqs - iterable of records exposing .seq and .id
        mut_rate - proportion from 0.0 to 1.0, float
        rs - seed handed to random.seed (None lets random pick its own seed)
    Output:
        mutated_seqs - list with SeqRecord sequences, one per input record,
            with id equal to the original id followed by "mut_<mut_rate>"
    Raises:
        ValueError - if mut_rate is outside [0.0, 1.0]
    """
    # NOTE: this reseeds the module-level random generator, so it also
    # affects random numbers drawn by the caller afterwards.
    random.seed(a=rs)
    if not 0.0 <= mut_rate <= 1.0:
        raise ValueError(f"mut_rate must be within [0.0, 1.0], got {mut_rate}")
    nucleotides = ("A", "C", "T", "G")
    mutated_seqs = []
    for seq in seqs:
        mut_seq = list(str(seq.seq))
        l_ = len(mut_seq)
        # positions are sampled without replacement: each site mutates once
        mutated_sites_i = random.sample(range(l_), int(mut_rate * l_))
        for mut_site_i in mutated_sites_i:
            mut_site = mut_seq[mut_site_i]
            # never substitute a nucleotide by itself
            candidates = [nuc for nuc in nucleotides if nuc != mut_site]
            mut_seq[mut_site_i] = random.choice(candidates)
        mutated_seq = SeqRecord(
            seq=Seq("".join(mut_seq)),
            id=seq.id + f"mut_{mut_rate}",
            name="",
            description="",
        )
        mutated_seqs.append(mutated_seq)
    return mutated_seqs
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
65 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
66 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
def separate_by_length(length_, seq_list, fold=None):
    """
    splits sequences in two groups according to their length
    Sequences shorter than length_ are dropped; their number is printed.
    Input:
        length_ - minimal length a sequence needs to be kept
        seq_list - iterable of records exposing .seq
        fold - optional multiplier; when given, kept sequences of length
            length_ * fold or more are put in to_process instead of included
    Output:
        included - list with sequences with length_ <= length
            (and length < length_ * fold when fold is given)
        to_process - list with sequences with length >= length_ * fold,
            always empty when fold is None
    """
    included = []
    to_process = []
    excluded = 0
    for seq_ in seq_list:
        l_ = len(seq_.seq)
        if l_ < length_:
            excluded += 1
        elif fold is None or l_ < length_ * fold:
            included.append(seq_)
        else:
            to_process.append(seq_)
    print(f"A total of {excluded} sequences was excluded due to being smaller than {length_}")
    return included, to_process
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
85 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
86 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
def chunks(lst, n):
    """Lazily split lst into consecutive slices holding at most n items.

    The last slice is shorter when len(lst) is not a multiple of n.
    Recipe taken from
    https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
92 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
93 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
def correct(frag):
    """
    leaves only unambiguous DNA code (ACTG-)
    The fragment is upper-cased first, then every character that is not
    one of A, C, G, T or '-' is replaced by '-'.
    Input:
        frag - string of nucleotides
    Output:
        pr_frag - corrected string of nucleotides
    """
    pr_frag = frag.upper()
    ambiguous = set(pr_frag) - {"A", "C", "G", "T", "-"}
    if ambiguous:
        # one pass over the string whatever the number of distinct
        # ambiguous letters
        pr_frag = pr_frag.translate(dict.fromkeys(map(ord, ambiguous), "-"))
    return pr_frag
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
108 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
109 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
110 def fragmenting(sequences, sl_wind_size, max_gap=0.05, sl_wind_step=None): |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
111 """ |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
112 slices sequences in fragments by sliding window |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
113 based on its size and step. |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
114 last fragment is padded by '-' |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
115 fragments have ambiguous bases replaced by '-' |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
116 fragments with many '-' are discarded |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
117 Input: |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
118 sequences - list with SeqRecord sequences in fasta format |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
119 max_gap - max allowed proportion of '-' |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
120 sl_wind_size - sliding window step |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
121 sl_wind_step - sliding window step, by default equals |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
122 sliding window size (None is replaced by it) |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
123 Output: |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
124 fragments - list with sequence fragments |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
125 """ |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
126 if sl_wind_step is None: |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
127 sl_wind_step = sl_wind_size |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
128 fragments = [] |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
129 fragments_rc = [] |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
130 out_sequences = [] |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
131 for sequence in sequences: |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
132 seq = str(sequence.seq) |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
133 n_fragments = 1 + max(0, math.ceil((len(seq) - sl_wind_size) / sl_wind_step)) |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
134 for n in range(n_fragments): |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
135 if n + 1 != n_fragments: |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
136 frag = seq[n * sl_wind_step: n * sl_wind_step + sl_wind_size] |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
137 elif n_fragments == 1: |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
138 # padding the shorter fragment to sl_wind_size |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
139 frag_short = seq[n * sl_wind_step: n * sl_wind_step + sl_wind_size] |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
140 frag = frag_short + (sl_wind_size - len(frag_short)) * "-" |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
141 else: |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
142 frag = seq[(len(seq) - sl_wind_size):] |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
143 # replace ambiguous characters |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
144 frag = correct(frag) |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
145 assert len(frag) == sl_wind_size, f"{len(frag)} vs {sl_wind_size}" |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
146 # skipping sequences with many gaps |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
147 if frag.count("-") / sl_wind_size <= max_gap: |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
148 fragments.append(frag) |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
149 # generating reverse complement |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
150 fragments_rc.append(reverse_complement(frag)) |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
151 fr_seq = SeqRecord( |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
152 seq=Seq(frag), |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
153 id=f"{sequence.id}_{n*sl_wind_step}_{sl_wind_size}", |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
154 name="", |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
155 description="", |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
156 ) |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
157 out_sequences.append(fr_seq) |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
158 return fragments, fragments_rc, out_sequences |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
159 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
160 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
def label_fasta_fragments(sequences, label):
    """
    Appends the class label to the id of every generated fragment
    The records are modified in place and returned in the same order
    Input:
        sequences - iterable with SeqRecord-like objects (anything with an `id` attribute)
        label - type of label (bacteria, virus, plant)
    Output:
        labeled_fragments - list with the same records, ids suffixed with "_<label>"
    """
    assert label in ["virus", "plant", "bacteria"]
    suffix = f"_{label}"

    def _tag(record):
        # in-place rename: the caller's objects carry the new id as well
        record.id = record.id + suffix
        return record

    return [_tag(record) for record in sequences]
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
176 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
177 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
def one_hot_encode(fragments):
    """
    produces one-hot matrices from fragments
    '-' is given all zeros
    Input:
        fragments - list with sequence fragments of equal length,
                    built from the characters A, C, G, T and '-'
    Output:
        encoded_fragments - int8 array of shape (n_fragments, fragment_length, 4),
                            channels ordered as A, C, G, T
    Raises:
        ValueError - if a fragment contains an unsupported character,
                     or if fragments is empty / fragments differ in length (from np.stack)
    """
    map_dict = {"A": 0, "C": 1, "G": 2, "T": 3, "-": -1}
    # Rows 0-3 are the one-hot vectors of A, C, G, T. The extra last row is all
    # zeros, so the index -1 used for '-' selects it through negative indexing.
    lookup = np.zeros((5, 4), dtype=np.int8)
    lookup[np.arange(4), np.arange(4)] = 1

    encoded_fragments = []
    for frag in fragments:
        try:
            integer_encoded = np.array([map_dict[base] for base in frag], dtype=np.intp)
        except KeyError as err:
            raise ValueError(f"unsupported character {err.args[0]!r} in fragment") from err
        # fancy indexing copies the matching rows: result has shape (len(frag), 4)
        encoded_fragments.append(lookup[integer_encoded])
    return np.stack(encoded_fragments)
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
200 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
201 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
def prepare_labels(fragments, label, label_depth):
    """
    produces one-hot labels
    every fragment receives the same label
    Input:
        fragments - list with sequence fragments (only its length is used)
        label - index of the class (int, 0 <= label < label_depth);
                an index outside of this range gives all-zero rows
        label_depth - number of possible labels
    Output:
        labels - float32 array of shape (len(fragments), label_depth)
                 with one-hot encoded labels
    """
    n_fragments = len(fragments)
    labels = np.zeros((n_fragments, label_depth), dtype=np.float32)
    # all rows are identical, so a single column assignment is enough
    if 0 <= label < label_depth:
        labels[:, int(label)] = 1.0
    return labels
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
218 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
219 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
220 # TODO: write docs for functions |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
def calculate_total_length(seq_path):
    """
    Calculate total length of the sequences in the fasta file.
    Needed for weighted sampling
    Input:
        seq_path - path to the file with sequences
    Output:
        seq_length - total length of all sequences in the file (0 for a file without records)
    """
    # records are consumed one by one, so the parsed file is never held in memory as a whole
    return sum(len(record.seq) for record in SeqIO.parse(seq_path, "fasta"))
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
235 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
236 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
def prepare_seq_lists(in_paths, n_fragments, weights=None,):
    """
    selects files with sequences based on extension
    and calculates number of fragments to be sampled
    Input:
        in_paths - list of paths to folders with sequence files.
                   Can also be a single string / path object pointing either
                   to one folder or to one sequence file (.fna / .fasta)
        n_fragments - number of fragments to be sampled
        weights - upsampling of fragments, one fraction per folder. fractions should sum to one.
                  If not given, every folder gets the same weight
    Output:
        list of (path to sequence file, number of fragments to be sampled) pairs.
        For folders the numbers are proportional to the share of the file in the
        total sequence length of its folder, scaled by the folder weight, plus one
        (they are not rounded). For a single sequence file the result is
        [[in_paths, n_fragments]]
    """
    if isinstance(in_paths, (str, os.PathLike)):
        # case when we receive a single sequence file
        if os.fspath(in_paths).endswith(('.fna', '.fasta')):
            return [[in_paths, n_fragments]]
        # a single folder: wrap it so that it is processed like a list of folders
        in_paths = [in_paths]

    if weights:
        assert len(weights) == len(in_paths), "one weight per input folder is required"
        assert 1.01 > round(sum(weights), 2) > 0.99, "weights should sum to one"
    else:
        n_folders = len(in_paths)
        # an empty list of folders simply produces an empty result
        weights = [1 / n_folders] * n_folders if n_folders else []

    n_fragments_list_all = []
    seqs_list_all = []
    for folder, w_ in zip(in_paths, weights):
        seqs_list = []
        seq_length_list = []
        for file in os.listdir(folder):
            if file.endswith(("fna", "fasta")):
                seq_path = os.path.join(folder, file)
                seqs_list.append(seq_path)
                seq_length_list.append(calculate_total_length(seq_path))
        total_length = sum(seq_length_list)
        for seq_length in seq_length_list:
            # a folder holding only empty files has zero total length: avoid dividing by it
            share = seq_length / total_length if total_length else 0
            # + 1 may lead to a slightly bigger number than desired
            n_fragments_list_all.append(share * n_fragments * w_ + 1)
        seqs_list_all.extend(seqs_list)
    print("list calculation done")
    return list(zip(seqs_list_all, n_fragments_list_all))
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
282 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
283 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
def sample_fragments(seq_container, length, random_seed=1, limit=None, max_gap=0.05, sl_wind_step=None):
    """
    Randomly samples fragments from sequences in the list.
    Input:
        seq_container - list with each entry containing path to sequence,
        and n samples from this sequence.
        length - desired length of sampled fragments
        random_seed - seed given to random.seed() before any sampling
        limit - when truthy, caps the number of sampling attempts per entry
        at limit * n; when None/0 the number of attempts is unbounded
        max_gap, sl_wind_step - forwarded unchanged to fragmenting()
    Output:
        fragments - list with sequence fragments
        fragments_rc - list with the second output of fragmenting()
        (presumably reverse complements - confirm against fragmenting())
        seqs - list with the SeqRecord objects the fragments were cut from
    Raises:
        AssertionError - if limit is set and more than limit * n attempts
        were needed for one entry
    """
    random.seed(a=random_seed)
    total_fragments = []
    total_fragments_rc = []
    total_seqs = []
    for entry in seq_container:
        seq = list(SeqIO.parse(entry[0], "fasta"))
        n_fragments = entry[1]
        seqs = []
        fragments = []
        fragments_rc = []
        accepted = 0
        attempts = 0
        while accepted < n_fragments:
            # a FASTA file may hold several records (e.g. chromosomes); pick one at random
            fragment_full = random.choice(seq)
            r_end = len(fragment_full.seq) - length
            try:
                # randrange raises ValueError when r_end <= 0, i.e. the record is not
                # longer than `length`. NOTE: a record of exactly `length` is therefore
                # never sampled; kept as is so that seeded runs stay reproducible.
                r_start = random.randrange(r_end)
                fragment = SeqRecord(
                    seq=fragment_full.seq[r_start:(r_start + length)],
                    id=f"{fragment_full.id}_{length}_{r_start}",
                    name="",
                    description="",
                )
                temp_, temp_rc, _ = fragmenting([fragment], length, max_gap, sl_wind_step=sl_wind_step)
                # only count the attempt as a success when fragmenting kept the fragment
                if temp_ and temp_rc:
                    seqs.append(fragment)
                    fragments.extend(temp_)
                    fragments_rc.extend(temp_rc)
                    accepted += 1
            except ValueError:
                # record too short to be sampled (or a ValueError raised further down
                # in the try body): skip this attempt and draw again
                pass
            attempts += 1
            # Explicit raise instead of `assert`: assert statements are removed under
            # `python -O`, which would leave this loop without any upper bound.
            # AssertionError is kept so existing callers see the same exception type.
            if limit and attempts > limit * n_fragments:
                raise AssertionError(
                    f"While cycle iterated more than {limit}, data is ambiguous."
                    f" Only {len(fragments)} fragments were sampled out of {n_fragments}"
                )
        total_fragments.extend(fragments)
        total_fragments_rc.extend(fragments_rc)
        total_seqs.extend(seqs)
    return total_fragments, total_fragments_rc, total_seqs
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
336 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
337 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
def prepare_ds_fragmenting(in_seq, label, label_int, fragment_length, sl_wind_step, max_gap=0.05, n_cpus=1):
    """
    Fragments all records of a fasta file, encodes the fragments and keeps
    only the unique ones.
    Input:
        in_seq - path to fasta file
        label - text label, passed to label_fasta_fragments() and printed
        label_int - integer label, passed to prepare_labels()
        fragment_length - length of fragments
        sl_wind_step - sliding window step; None means half of fragment_length
        max_gap - forwarded to fragmenting()
        n_cpus - not used by this function
    Output:
        unique encoded fragments, the matching rows of the second encoded
        array, matching labels, matching labelled records, number of
        unique fragments
    """
    step = int(fragment_length / 2) if sl_wind_step is None else sl_wind_step
    records = list(SeqIO.parse(in_seq, "fasta"))
    fwd, rev, frag_records = fragmenting(records, fragment_length, max_gap=max_gap, sl_wind_step=step)
    fwd_encoded = one_hot_encode(fwd)
    rev_encoded = one_hot_encode(rev)
    labels = prepare_labels(fwd, label=label_int, label_depth=3)
    frag_records = label_fasta_fragments(frag_records, label=label)
    # return_index gives the position of the first occurrence of every unique
    # row, which is then used to subset the companion collections consistently
    unique_fwd, keep = np.unique(fwd_encoded, axis=0, return_index=True)
    unique_rev = rev_encoded[keep]
    unique_labels = labels[keep]
    unique_records = [frag_records[idx] for idx in keep]
    n_unique = np.shape(unique_fwd)[0]
    assert (n_unique == np.shape(unique_rev)[0])
    print(f"Encoding {label} sequences finished")
    return unique_fwd, unique_rev, unique_labels, unique_records, n_unique
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
358 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
359 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
def prepare_ds_sampling(in_seqs, fragment_length, n_frags, label, label_int, random_seed, n_cpus=1, limit=100):
    """
    Samples random fragments from the given sequences and encodes them.
    Input:
        in_seqs - passed to prepare_seq_lists() together with n_frags
        fragment_length - length of sampled fragments
        n_frags - number of fragments kept after shuffling
        label - text label, passed to label_fasta_fragments() and printed
        label_int - integer label, passed to prepare_labels()
        random_seed - seed used for both sampling and shuffling
        n_cpus - not used by this function
        limit - forwarded to sample_fragments() as the retry cap
    Output:
        encoded fragments, second encoded array, labels, labelled records,
        n_frags (returned as it was passed in)
    """
    # generating plant fragments and labels
    seqs_list = prepare_seq_lists(in_seqs, n_frags)
    # sample_fragments is a plain function in this module, so it is called
    # directly; the previous `sample_fragments.remote(...)` raised AttributeError
    frags, frags_rc, seqs_ = sample_fragments(seqs_list, fragment_length, random_seed, limit=limit, max_gap=0.05)
    # sampling may yield more fragments than requested; shuffle and keep n_frags
    frags, frags_rc, seqs_ = shuffle(frags, frags_rc, seqs_, random_state=random_seed, n_samples=int(n_frags))
    encoded = one_hot_encode(frags)
    encoded_rc = one_hot_encode(frags_rc)
    labs = prepare_labels(frags, label=label_int, label_depth=3)
    seqs_ = label_fasta_fragments(seqs_, label=label)
    assert (np.shape(encoded)[0] == np.shape(encoded_rc)[0])
    print(f"Encoding {label} sequences finished")
    return encoded, encoded_rc, labs, seqs_, n_frags
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
373 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
374 |
457fd8fd681a
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc
iuc
parents:
diff
changeset
|
def storing_encoded(encoded, encoded_rc, labs, out_path, ):
    """
    Writes encoded fragments and labels into an HDF5 file.
    Input:
        encoded - stored as dataset "fragments"
        encoded_rc - stored as dataset "fragments_rc"
        labs - stored as dataset "labels"
        out_path - path of the file to write; opened in "w" mode
    """
    # the context manager closes the file even if writing a dataset fails
    with h5py.File(out_path, "w") as f:
        f.create_dataset("fragments", data=encoded)
        f.create_dataset("fragments_rc", data=encoded_rc)
        f.create_dataset("labels", data=labs)
    print(f"encoded fragments and labels stored in {out_path}")