comparison scripts/S03_remove_site_with_not_enough_species_represented.py @ 1:c79bdda8abfb draft default tip

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3a118aa934e6406cc8b0b24d006af6365c277519
author abims-sbr
date Thu, 09 Jun 2022 12:40:00 +0000
parents eb95bf7f90ae
children
comparison
equal deleted inserted replaced
0:eb95bf7f90ae 1:c79bdda8abfb
10 ###### DEF 2 ####### 10 ###### DEF 2 #######
11 #################### 11 ####################
12 def remove_position_with_too_much_missing_data(bash_aa, bash_nuc, MIN_SPECIES_NB): 12 def remove_position_with_too_much_missing_data(bash_aa, bash_nuc, MIN_SPECIES_NB):
13 13
14 ## 1 ## Get alignment length 14 ## 1 ## Get alignment length
15 fasta_name0 = bash_aa.keys()[0] 15 fasta_name0 = list(bash_aa.keys())[0]
16 ln_aa = len(bash_aa[fasta_name0]) 16 ln_aa = len(bash_aa[fasta_name0])
17 17
18 ln_nuc = len(bash_nuc[fasta_name0]) 18 ln_nuc = len(bash_nuc[fasta_name0])
19 19
20 20
21 ## 2 ## Get positions keeped in aa alignment 21 ## 2 ## Get positions keeped in aa alignment
22 LIST_POSITION_KEEPED_aa = [] 22 LIST_POSITION_KEEPED_aa = []
23 i=0 23 i=0
24 while i < ln_aa: 24 while i < ln_aa:
25 site = [] 25 site = []
26 for fasta_name in bash_aa.keys(): 26 for fasta_name in list(bash_aa.keys()):
27 pos = bash_aa[fasta_name][i] 27 pos = bash_aa[fasta_name][i]
28 28
29 if pos != "-" and pos != "?" and pos != "X": 29 if pos != "-" and pos != "?" and pos != "X":
30 site.append(pos) 30 site.append(pos)
31 if len(site) >= MIN_SPECIES_NB: 31 if len(site) >= MIN_SPECIES_NB:
43 LIST_POSITION_KEEPED_nuc.append(position3) 43 LIST_POSITION_KEEPED_nuc.append(position3)
44 44
45 ## 4 ## Create entries for "filtered_bash" for aa & nuc 45 ## 4 ## Create entries for "filtered_bash" for aa & nuc
46 filtered_bash_aa = {} 46 filtered_bash_aa = {}
47 filtered_bash_nuc = {} 47 filtered_bash_nuc = {}
48 for fasta_name in bash_aa.keys(): 48 for fasta_name in list(bash_aa.keys()):
49 filtered_bash_aa[fasta_name] = "" 49 filtered_bash_aa[fasta_name] = ""
50 for fasta_name in bash_nuc.keys(): 50 for fasta_name in list(bash_nuc.keys()):
51 filtered_bash_nuc[fasta_name] = "" 51 filtered_bash_nuc[fasta_name] = ""
52 52
53 ## 5 ## Write "filtered_bash" for aa 53 ## 5 ## Write "filtered_bash" for aa
54 j=0 54 j=0
55 while j < ln_aa: 55 while j < ln_aa:
56 for fasta_name in bash_aa.keys(): 56 for fasta_name in list(bash_aa.keys()):
57 seq=filtered_bash_aa[fasta_name] 57 seq=filtered_bash_aa[fasta_name]
58 pos=bash_aa[fasta_name][j] 58 pos=bash_aa[fasta_name][j]
59 59
60 if j in LIST_POSITION_KEEPED_aa: 60 if j in LIST_POSITION_KEEPED_aa:
61 seq = seq + pos 61 seq = seq + pos
62 filtered_bash_aa[fasta_name] = seq 62 filtered_bash_aa[fasta_name] = seq
63 j = j + 1 63 j = j + 1
64 64
65 ## 6 ## Remove empty sequence 65 ## 6 ## Remove empty sequence
66 for name in filtered_bash_aa.keys(): 66 for name in list(filtered_bash_aa.keys()):
67 seq = filtered_bash_aa[name] 67 seq = filtered_bash_aa[name]
68 if seq == '': 68 if seq == '':
69 del filtered_bash_aa[name] 69 del filtered_bash_aa[name]
70 70
71 71
72 ## 7 ## Write "filtered_bash" for nuc 72 ## 7 ## Write "filtered_bash" for nuc
73 j=0 73 j=0
74 while j < ln_nuc: 74 while j < ln_nuc:
75 for fasta_name in bash_nuc.keys(): 75 for fasta_name in list(bash_nuc.keys()):
76 seq=filtered_bash_nuc[fasta_name] 76 seq=filtered_bash_nuc[fasta_name]
77 #print seq 77 #print seq
78 pos=bash_nuc[fasta_name][j] 78 pos=bash_nuc[fasta_name][j]
79 79
80 if j in LIST_POSITION_KEEPED_nuc: 80 if j in LIST_POSITION_KEEPED_nuc:
81 seq = seq + pos 81 seq = seq + pos
82 filtered_bash_nuc[fasta_name] = seq 82 filtered_bash_nuc[fasta_name] = seq
83 j = j + 1 83 j = j + 1
84 84
85 ## 8 ## Remove empty sequence 85 ## 8 ## Remove empty sequence
86 for name in filtered_bash_nuc.keys(): 86 for name in list(filtered_bash_nuc.keys()):
87 seq = filtered_bash_nuc[name] 87 seq = filtered_bash_nuc[name]
88 if seq == '': 88 if seq == '':
89 del filtered_bash_nuc[name] 89 del filtered_bash_nuc[name]
90 90
91 return(filtered_bash_aa, filtered_bash_nuc) 91 return(filtered_bash_aa, filtered_bash_nuc)
145 dico_nuc = dico(file_INnuc) ### DEF 1 ### 145 dico_nuc = dico(file_INnuc) ### DEF 1 ###
146 146
147 ## 4.1 ## REMOVE POSITIONS WITH TOO MUCH MISSING DATA (i.e. not enough taxa represented at each position in the alignment) 147 ## 4.1 ## REMOVE POSITIONS WITH TOO MUCH MISSING DATA (i.e. not enough taxa represented at each position in the alignment)
148 filtered_bash_aa, filtered_bash_nuc = remove_position_with_too_much_missing_data(dico_aa, dico_nuc, MIN_SPECIES_NB) ### DEF 2 ### 148 filtered_bash_aa, filtered_bash_nuc = remove_position_with_too_much_missing_data(dico_aa, dico_nuc, MIN_SPECIES_NB) ### DEF 2 ###
149 149
150 k = filtered_bash_nuc.keys() 150 k = list(filtered_bash_nuc.keys())
151 new_leng_nuc = 0 151 new_leng_nuc = 0
152 if k != []: 152 if k != []:
153 k0 = k[0] 153 k0 = k[0]
154 seq0 = filtered_bash_nuc[k0] 154 seq0 = filtered_bash_nuc[k0]
155 new_leng_nuc = len(seq0) 155 new_leng_nuc = len(seq0)
156 156
157 ## 4.3 ## Change file name for output, depending the number of species remaining in the alignment 157 ## 4.3 ## Change file name for output, depending the number of species remaining in the alignment
158 n0+=1 158 n0+=1
159 #name_elems[1] = str(n0) 159 #name_elems[1] = str(n0)
160 name_elems[1] = file.split('_')[1] 160 name_elems[1] = file.split('_')[1]
161 name_elems[3] = str(len(filtered_bash_aa.keys())) 161 name_elems[3] = str(len(list(filtered_bash_aa.keys())))
162 new_name = "_".join(name_elems) 162 new_name = "_".join(name_elems)
163 163
164 ## 4.5 ## Write filtered alignment in OUTPUTs 164 ## 4.5 ## Write filtered alignment in OUTPUTs
165 ## aa 165 ## aa
166 if filtered_bash_aa != {} and new_leng_nuc >= MIN_LENGTH_FINAL_ALIGNMENT_NUC: 166 if filtered_bash_aa != {} and new_leng_nuc >= MIN_LENGTH_FINAL_ALIGNMENT_NUC:
167 OUTaa=open("%s/%s" %(path_OUT1, new_name), "w") 167 OUTaa=open("%s/%s" %(path_OUT1, new_name), "w")
168 for fasta_name in filtered_bash_aa.keys(): 168 for fasta_name in list(filtered_bash_aa.keys()):
169 seq_aa = filtered_bash_aa[fasta_name] 169 seq_aa = filtered_bash_aa[fasta_name]
170 OUTaa.write("%s\n" %fasta_name) 170 OUTaa.write("%s\n" %fasta_name)
171 OUTaa.write("%s\n" %seq_aa) 171 OUTaa.write("%s\n" %seq_aa)
172 OUTaa.close() 172 OUTaa.close()
173 # nuc 173 # nuc
174 if filtered_bash_nuc != {} and new_leng_nuc >= MIN_LENGTH_FINAL_ALIGNMENT_NUC: 174 if filtered_bash_nuc != {} and new_leng_nuc >= MIN_LENGTH_FINAL_ALIGNMENT_NUC:
175 good+=1 175 good+=1
176 OUTnuc=open("%s/%s" %(path_OUT2, new_name), "w") 176 OUTnuc=open("%s/%s" %(path_OUT2, new_name), "w")
177 for fasta_name in filtered_bash_nuc.keys(): 177 for fasta_name in list(filtered_bash_nuc.keys()):
178 seq_nuc = filtered_bash_nuc[fasta_name] 178 seq_nuc = filtered_bash_nuc[fasta_name]
179 OUTnuc.write("%s\n" %fasta_name) 179 OUTnuc.write("%s\n" %fasta_name)
180 OUTnuc.write("%s\n" %seq_nuc) 180 OUTnuc.write("%s\n" %seq_nuc)
181 OUTnuc.close() 181 OUTnuc.close()
182 else: 182 else:
183 bad+=1 183 bad+=1
184 184
185 185
186 ## 5 ## Print 186 ## 5 ## Print
187 print "*************** 2nd Filter : removal of the indel ***************" 187 print("*************** 2nd Filter : removal of the indel ***************")
188 print "\nTotal number of locus recorded = %d" %n0 188 print("\nTotal number of locus recorded = %d" %n0)
189 print "\tTotal number of locus with no indels (SAVED) = %d" %good 189 print("\tTotal number of locus with no indels (SAVED) = %d" %good)
190 print "\tTotal number of locus, when removing indel, wich are empty (EXCLUDED) = %d" %bad 190 print("\tTotal number of locus, when removing indel, wich are empty (EXCLUDED) = %d" %bad)
191 print "" 191 print("")