Mercurial > repos > cstrittmatter > ss2v110
annotate build/scripts-3.6/SeqSero2_update_kmer_database.py @ 12:08832c0d3cbd draft
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
author | cstrittmatter |
---|---|
date | Fri, 15 May 2020 10:19:08 -0400 |
parents | e6437d423693 |
children |
rev | line source |
---|---|
10
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
1 #!/Users/charles.strittmatter/miniconda3/bin/python |
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
2 |
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
3 import argparse |
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
4 import os,subprocess |
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
5 import pickle |
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
6 |
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
7 ### SeqSero Kmer |
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
def parse_args():
    """Parse the command line and return the resulting namespace.

    The parser declares no options of its own, so the only thing a user can
    pass is '-h' to display the usage message.
    """
    usage_text = 'Just type "SeqSero2_update_kmer_database.py", it will update kmer database automatically'
    cli = argparse.ArgumentParser(usage=usage_text)
    namespace = cli.parse_args()
    return namespace
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
12 |
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
# IUPAC nucleotide codes mapped to their complements. Built once at import
# time: reverse_complement() is called for every k-mer, so rebuilding the
# table on each call is wasted work.
_COMPLEMENT = {
    'A': 'T',
    'C': 'G',
    'G': 'C',
    'T': 'A',
    'N': 'N',
    'M': 'K',
    'R': 'Y',
    'W': 'W',
    'S': 'S',
    'Y': 'R',
    'K': 'M',
    'V': 'B',
    'H': 'D',
    'D': 'H',
    'B': 'V',
}
# Lower-case bases complement to lower-case bases, so case is preserved.
_COMPLEMENT.update(
    {base.lower(): comp.lower() for base, comp in _COMPLEMENT.items()})


def reverse_complement(sequence):
    """Return the reverse complement of a nucleotide sequence.

    Args:
        sequence: string of IUPAC nucleotide codes (A, C, G, T, N and the
            ambiguity codes M, R, W, S, Y, K, V, H, D, B), upper or lower
            case. An empty string yields an empty string.

    Returns:
        The complemented sequence read in the opposite direction, with the
        case of every base preserved.

    Raises:
        KeyError: if the sequence contains a character that is not one of
            the codes listed above.
    """
    return "".join(_COMPLEMENT[base] for base in reversed(sequence))
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
32 |
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
def multifasta_dict(multifasta):
    """Read a multi-FASTA file into a ``{header: sequence}`` dictionary.

    Args:
        multifasta: path of the FASTA file to read.

    Returns:
        A dict keyed by header line (without the leading '>') whose values
        are the record's sequence lines joined into one string. Lines are
        stripped of surrounding whitespace and blank lines are ignored.
        Records that have no sequence line are left out, as is any text
        before the first header. When a header occurs more than once, only
        its first record is used.
    """
    chunks = {}     # header -> list of sequence lines, in file order
    seen = set()    # every header met so far, with or without sequence
    current = None  # header of the record being read; None means "skip"
    # The context manager closes the file even if reading fails part-way.
    with open(multifasta, 'r') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue
            if line[0] == '>':
                name = line[1:]
                if name in seen:
                    # Repeated header: keep the first record only rather
                    # than mixing several records under one key.
                    current = None
                else:
                    seen.add(name)
                    current = name
            elif current is not None:
                chunks.setdefault(current, []).append(line)
    # Join once per record instead of growing a string line by line.
    return {name: "".join(parts) for name, parts in chunks.items()}
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
50 |
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
def createKmerDict_reads(list_of_strings, kmer):
    """Count the k-mers of every string on both strands.

    Each window of length ``kmer`` is upper-cased and counted once, and its
    reverse complement is counted once as well; a window that equals its own
    reverse complement is therefore counted twice. Strings shorter than
    ``kmer`` contribute nothing.

    Returns a dict mapping each k-mer to its count.
    """
    counts = {}
    for entry in list_of_strings:
        bases = entry.strip('\n')
        window_total = len(bases) - kmer + 1
        for offset in range(window_total):
            forward = bases[offset:offset + kmer].upper()
            backward = reverse_complement(forward)
            # Forward strand first, then the reverse strand.
            for word in (forward, backward):
                key = word.upper()
                if word in counts:
                    counts[key] += 1
                else:
                    counts[key] = 1
    return counts
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
67 |
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
def multifasta_to_kmers_dict(multifasta, kmer=27):
    """Build a ``{header: set of k-mers}`` library from a multi-FASTA file.

    Args:
        multifasta: path of the FASTA file, read with ``multifasta_dict``.
        kmer: k-mer length handed to ``createKmerDict_reads``. Defaults to
            27, the value that used to be hard-coded here.

    Returns:
        A dict keyed by FASTA header. Each value is the set of distinct
        k-mers that ``createKmerDict_reads`` reports for that record's
        sequence; the occurrence counts are discarded.
    """
    multi_seq_dict = multifasta_dict(multifasta)
    return {
        header: set(createKmerDict_reads([sequence], kmer))
        for header, sequence in multi_seq_dict.items()
    }
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
75 |
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
def get_salmid_invA_database(ex_dir):
    """Load the pickled invA k-mer database stored in ``ex_dir``.

    Args:
        ex_dir: directory that contains the ``invA_mers_dict`` pickle file.

    Returns:
        The unpickled object with its 'version' entry removed when it has
        one.
    """
    # NOTE: pickle.load can run arbitrary code from the file it reads; only
    # point this at a database file from a trusted source.
    with open(ex_dir + '/invA_mers_dict', 'rb') as handle:
        invA_dict = pickle.load(handle)
    try:
        del invA_dict['version']
    except (KeyError, TypeError):
        # Best effort: no 'version' entry, or the object does not support
        # deleting by key. Either way it is returned as loaded.
        pass
    return invA_dict
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
85 |
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
def get_salmid_rpoB_database(ex_dir):
    """Load the pickled rpoB k-mer database stored in ``ex_dir``.

    Args:
        ex_dir: directory that contains the ``rpoB_mers_dict`` pickle file.

    Returns:
        The unpickled object with its 'version' entry removed when it has
        one.
    """
    # NOTE: pickle.load can run arbitrary code from the file it reads; only
    # point this at a database file from a trusted source.
    with open(ex_dir + '/rpoB_mers_dict', 'rb') as handle:
        rpoB_dict = pickle.load(handle)
    try:
        del rpoB_dict['version']
    except (KeyError, TypeError):
        # Best effort: no 'version' entry, or the object does not support
        # deleting by key. Either way it is returned as loaded.
        pass
    return rpoB_dict
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
95 |
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
def main():
    """Rebuild ``antigens.pickle`` in the directory that holds this script.

    The k-mer sets derived from ``H_and_O_and_specific_genes.fasta`` are
    merged with the invA k-mer database and the result is pickled to
    ``antigens.pickle``. All three files live next to the script itself.
    """
    # Called for its side effect only: lets argparse answer '-h'. The
    # returned namespace carries no options, so it is not kept.
    parse_args()
    ex_dir = os.path.dirname(os.path.realpath(__file__))
    lib_dict = multifasta_to_kmers_dict(ex_dir + '/H_and_O_and_specific_genes.fasta')
    invA_dict = get_salmid_invA_database(ex_dir)
    # Only the invA database is merged in; the rpoB database is not added.
    lib_dict_new = lib_dict.copy()
    lib_dict_new.update(invA_dict)
    # The context manager closes the output file even if pickling fails.
    with open(ex_dir + '/antigens.pickle', "wb") as f:
        pickle.dump(lib_dict_new, f)
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
111 |
e6437d423693
planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff
changeset
|
# Rebuild the database only when this file is executed as a script, so that
# importing it has no side effects.
if __name__ == '__main__':
    main()