annotate build/scripts-3.6/SeqSero2_update_kmer_database.py @ 12:08832c0d3cbd draft

planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
author cstrittmatter
date Fri, 15 May 2020 10:19:08 -0400
parents e6437d423693
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
10
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
1 #!/Users/charles.strittmatter/miniconda3/bin/python
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
2
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
3 import argparse
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
4 import os,subprocess
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
5 import pickle
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
6
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
7 ### SeqSero Kmer
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def parse_args():
    """Build the command-line parser and parse the process arguments.

    The parser defines no options of its own, so its only visible effects
    are printing the usage text for ``-h`` and rejecting unexpected
    arguments.

    Returns:
        argparse.Namespace: The parsed argument namespace.
    """
    usage_text = 'Just type "SeqSero2_update_kmer_database.py", it will update kmer database automatically'
    cli = argparse.ArgumentParser(usage=usage_text)
    parsed = cli.parse_args()
    return parsed
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
12
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
# Complement of every upper-case IUPAC nucleotide code. Built once at import
# time because reverse_complement() is called for every k-mer window.
_IUPAC_COMPLEMENT = {
    'A': 'T',
    'C': 'G',
    'G': 'C',
    'T': 'A',
    'N': 'N',
    'M': 'K',
    'R': 'Y',
    'W': 'W',
    'S': 'S',
    'Y': 'R',
    'K': 'M',
    'V': 'B',
    'H': 'D',
    'D': 'H',
    'B': 'V',
}


def reverse_complement(sequence):
    """Return the reverse complement of a nucleotide sequence.

    Args:
        sequence: Sequence of upper-case IUPAC nucleotide codes
            (A, C, G, T, N and the ambiguity codes M, R, W, S, Y, K, V, H,
            D, B). An empty sequence yields an empty string.

    Returns:
        str: The complement of each base, emitted in reverse order.

    Raises:
        KeyError: If ``sequence`` contains a character that is not in the
            complement table (this includes lower-case bases).
    """
    complement = _IUPAC_COMPLEMENT
    return "".join(complement[base] for base in reversed(sequence))
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
32
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def multifasta_dict(multifasta):
    """Parse a multi-FASTA file into a ``{record name: sequence}`` dict.

    The file is read in a single pass. Every line is stripped of
    surrounding whitespace and blank lines are skipped. A line starting
    with ``>`` opens a new record whose name is the rest of that line; the
    following lines are joined into the record's sequence.

    Args:
        multifasta: Path of the FASTA file to read.

    Returns:
        dict: Record name (header without the leading ``>``) mapped to its
        concatenated sequence string, in order of first appearance.
        Headers that are not followed by any sequence line are omitted,
        and lines that appear before the first header are ignored.

    Notes:
        If the same header occurs more than once, the sequence lines of
        each occurrence are appended in file order. The previous
        implementation located records with ``list.index``, which always
        found the first occurrence and therefore re-appended the first
        record's sequence instead.
    """
    fragments = {}  # record name -> list of sequence lines, joined at the end
    current = None  # name of the record being read; None before any header
    with open(multifasta, 'r') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue
            if line[0] == '>':
                current = line[1:]
            elif current is not None:
                # The key is created lazily so that a header without any
                # sequence line never appears in the result.
                fragments.setdefault(current, []).append(line)
    return {name: "".join(parts) for name, parts in fragments.items()}
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
50
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def createKmerDict_reads(list_of_strings, kmer):
    """Count every k-mer and its reverse complement in the given sequences.

    Args:
        list_of_strings: Iterable of sequence strings; newline characters
            at either end of each string are removed before counting.
        kmer: Length of the sliding window.

    Returns:
        dict: Upper-case k-mer mapped to the number of times it was seen.
        Each window contributes one count for itself and one for its
        reverse complement, so a window equal to its own reverse
        complement is counted twice.
    """
    counts = {}
    for raw in list_of_strings:
        seq = raw.strip('\n')
        window_count = len(seq) - kmer + 1
        for start in range(window_count):
            forward = seq[start:start + kmer].upper()
            backward = reverse_complement(forward)
            # Forward strand first, then its reverse complement.
            for word in (forward, backward):
                counts[word] = counts.get(word, 0) + 1
    return counts
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
67
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def multifasta_to_kmers_dict(multifasta):
    """Map each record of a multi-FASTA file to its set of 27-mers.

    Args:
        multifasta: Path of the FASTA file to read.

    Returns:
        dict: Record name mapped to the set of distinct 27-mers found in
        that record's sequence, including reverse complements as produced
        by ``createKmerDict_reads``.
    """
    records = multifasta_dict(multifasta)
    return {
        name: set(createKmerDict_reads([sequence], 27))
        for name, sequence in records.items()
    }
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
75
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def get_salmid_invA_database(ex_dir):
    """Load the pickled invA k-mer database and drop its version entry.

    Args:
        ex_dir: Directory that contains the ``invA_mers_dict`` pickle file.

    Returns:
        The unpickled object with its ``'version'`` entry removed when
        one is present.

    Raises:
        OSError: If the database file cannot be opened.
    """
    # NOTE(security): pickle.load can execute arbitrary code; this file must
    # only ever be a trusted database shipped alongside the script.
    with open(ex_dir + '/invA_mers_dict', 'rb') as handle:
        invA_dict = pickle.load(handle)
    try:
        del invA_dict['version']
    except (KeyError, TypeError):
        # Best effort: no 'version' entry, or the object does not support
        # deleting by key.
        pass
    return invA_dict
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
85
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def get_salmid_rpoB_database(ex_dir):
    """Load the pickled rpoB k-mer database and drop its version entry.

    Args:
        ex_dir: Directory that contains the ``rpoB_mers_dict`` pickle file.

    Returns:
        The unpickled object with its ``'version'`` entry removed when
        one is present.

    Raises:
        OSError: If the database file cannot be opened.
    """
    # NOTE(security): pickle.load can execute arbitrary code; this file must
    # only ever be a trusted database shipped alongside the script.
    with open(ex_dir + '/rpoB_mers_dict', 'rb') as handle:
        rpoB_dict = pickle.load(handle)
    try:
        del rpoB_dict['version']
    except (KeyError, TypeError):
        # Best effort: no 'version' entry, or the object does not support
        # deleting by key.
        pass
    return rpoB_dict
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
95
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
def main():
    """Rebuild ``antigens.pickle`` in the directory that holds this script.

    Steps:
        1. Parse the command line (no options are read).
        2. Build the per-record k-mer sets from
           ``H_and_O_and_specific_genes.fasta``.
        3. Merge in the invA k-mer database; on a key clash the invA entry
           replaces the FASTA-derived one.
        4. Pickle the merged dictionary to ``antigens.pickle``.

    All input and output files are resolved relative to the real path of
    this script, not the current working directory.
    """
    # The result is not used; the call is kept so '-h' prints the usage
    # text and unexpected arguments are rejected.
    parse_args()
    ex_dir = os.path.dirname(os.path.realpath(__file__))
    lib_dict = multifasta_to_kmers_dict(ex_dir + '/H_and_O_and_specific_genes.fasta')
    invA_dict = get_salmid_invA_database(ex_dir)
    # The rpoB database (get_salmid_rpoB_database) is currently not merged
    # into the output.
    lib_dict_new = lib_dict.copy()
    lib_dict_new.update(invA_dict)
    # The context manager closes the output file even if pickling fails.
    with open(ex_dir + '/antigens.pickle', "wb") as out_file:
        pickle.dump(lib_dict_new, out_file)
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
111
e6437d423693 planemo upload commit 70dc513aa7d7ac6785847dfd86323687613b6b68-dirty
cstrittmatter
parents:
diff changeset
# Run the database update only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()