Mercurial > repos > abims-sbr > cds_search
comparison scripts/S03_remove_site_with_not_enough_species_represented.py @ 1:c79bdda8abfb draft default tip
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3a118aa934e6406cc8b0b24d006af6365c277519
author | abims-sbr |
---|---|
date | Thu, 09 Jun 2022 12:40:00 +0000 |
parents | eb95bf7f90ae |
children |
comparison
equal
deleted
inserted
replaced
0:eb95bf7f90ae | 1:c79bdda8abfb |
---|---|
10 ###### DEF 2 ####### | 10 ###### DEF 2 ####### |
11 #################### | 11 #################### |
12 def remove_position_with_too_much_missing_data(bash_aa, bash_nuc, MIN_SPECIES_NB): | 12 def remove_position_with_too_much_missing_data(bash_aa, bash_nuc, MIN_SPECIES_NB): |
13 | 13 |
14 ## 1 ## Get alignment length | 14 ## 1 ## Get alignment length |
15 fasta_name0 = bash_aa.keys()[0] | 15 fasta_name0 = list(bash_aa.keys())[0] |
16 ln_aa = len(bash_aa[fasta_name0]) | 16 ln_aa = len(bash_aa[fasta_name0]) |
17 | 17 |
18 ln_nuc = len(bash_nuc[fasta_name0]) | 18 ln_nuc = len(bash_nuc[fasta_name0]) |
19 | 19 |
20 | 20 |
21 ## 2 ## Get positions keeped in aa alignment | 21 ## 2 ## Get positions keeped in aa alignment |
22 LIST_POSITION_KEEPED_aa = [] | 22 LIST_POSITION_KEEPED_aa = [] |
23 i=0 | 23 i=0 |
24 while i < ln_aa: | 24 while i < ln_aa: |
25 site = [] | 25 site = [] |
26 for fasta_name in bash_aa.keys(): | 26 for fasta_name in list(bash_aa.keys()): |
27 pos = bash_aa[fasta_name][i] | 27 pos = bash_aa[fasta_name][i] |
28 | 28 |
29 if pos != "-" and pos != "?" and pos != "X": | 29 if pos != "-" and pos != "?" and pos != "X": |
30 site.append(pos) | 30 site.append(pos) |
31 if len(site) >= MIN_SPECIES_NB: | 31 if len(site) >= MIN_SPECIES_NB: |
43 LIST_POSITION_KEEPED_nuc.append(position3) | 43 LIST_POSITION_KEEPED_nuc.append(position3) |
44 | 44 |
45 ## 4 ## Create entries for "filtered_bash" for aa & nuc | 45 ## 4 ## Create entries for "filtered_bash" for aa & nuc |
46 filtered_bash_aa = {} | 46 filtered_bash_aa = {} |
47 filtered_bash_nuc = {} | 47 filtered_bash_nuc = {} |
48 for fasta_name in bash_aa.keys(): | 48 for fasta_name in list(bash_aa.keys()): |
49 filtered_bash_aa[fasta_name] = "" | 49 filtered_bash_aa[fasta_name] = "" |
50 for fasta_name in bash_nuc.keys(): | 50 for fasta_name in list(bash_nuc.keys()): |
51 filtered_bash_nuc[fasta_name] = "" | 51 filtered_bash_nuc[fasta_name] = "" |
52 | 52 |
53 ## 5 ## Write "filtered_bash" for aa | 53 ## 5 ## Write "filtered_bash" for aa |
54 j=0 | 54 j=0 |
55 while j < ln_aa: | 55 while j < ln_aa: |
56 for fasta_name in bash_aa.keys(): | 56 for fasta_name in list(bash_aa.keys()): |
57 seq=filtered_bash_aa[fasta_name] | 57 seq=filtered_bash_aa[fasta_name] |
58 pos=bash_aa[fasta_name][j] | 58 pos=bash_aa[fasta_name][j] |
59 | 59 |
60 if j in LIST_POSITION_KEEPED_aa: | 60 if j in LIST_POSITION_KEEPED_aa: |
61 seq = seq + pos | 61 seq = seq + pos |
62 filtered_bash_aa[fasta_name] = seq | 62 filtered_bash_aa[fasta_name] = seq |
63 j = j + 1 | 63 j = j + 1 |
64 | 64 |
65 ## 6 ## Remove empty sequence | 65 ## 6 ## Remove empty sequence |
66 for name in filtered_bash_aa.keys(): | 66 for name in list(filtered_bash_aa.keys()): |
67 seq = filtered_bash_aa[name] | 67 seq = filtered_bash_aa[name] |
68 if seq == '': | 68 if seq == '': |
69 del filtered_bash_aa[name] | 69 del filtered_bash_aa[name] |
70 | 70 |
71 | 71 |
72 ## 7 ## Write "filtered_bash" for nuc | 72 ## 7 ## Write "filtered_bash" for nuc |
73 j=0 | 73 j=0 |
74 while j < ln_nuc: | 74 while j < ln_nuc: |
75 for fasta_name in bash_nuc.keys(): | 75 for fasta_name in list(bash_nuc.keys()): |
76 seq=filtered_bash_nuc[fasta_name] | 76 seq=filtered_bash_nuc[fasta_name] |
77 #print seq | 77 #print seq |
78 pos=bash_nuc[fasta_name][j] | 78 pos=bash_nuc[fasta_name][j] |
79 | 79 |
80 if j in LIST_POSITION_KEEPED_nuc: | 80 if j in LIST_POSITION_KEEPED_nuc: |
81 seq = seq + pos | 81 seq = seq + pos |
82 filtered_bash_nuc[fasta_name] = seq | 82 filtered_bash_nuc[fasta_name] = seq |
83 j = j + 1 | 83 j = j + 1 |
84 | 84 |
85 ## 8 ## Remove empty sequence | 85 ## 8 ## Remove empty sequence |
86 for name in filtered_bash_nuc.keys(): | 86 for name in list(filtered_bash_nuc.keys()): |
87 seq = filtered_bash_nuc[name] | 87 seq = filtered_bash_nuc[name] |
88 if seq == '': | 88 if seq == '': |
89 del filtered_bash_nuc[name] | 89 del filtered_bash_nuc[name] |
90 | 90 |
91 return(filtered_bash_aa, filtered_bash_nuc) | 91 return(filtered_bash_aa, filtered_bash_nuc) |
145 dico_nuc = dico(file_INnuc) ### DEF 1 ### | 145 dico_nuc = dico(file_INnuc) ### DEF 1 ### |
146 | 146 |
147 ## 4.1 ## REMOVE POSITIONS WITH TOO MUCH MISSING DATA (i.e. not enough taxa represented at each position in the alignment) | 147 ## 4.1 ## REMOVE POSITIONS WITH TOO MUCH MISSING DATA (i.e. not enough taxa represented at each position in the alignment) |
148 filtered_bash_aa, filtered_bash_nuc = remove_position_with_too_much_missing_data(dico_aa, dico_nuc, MIN_SPECIES_NB) ### DEF 2 ### | 148 filtered_bash_aa, filtered_bash_nuc = remove_position_with_too_much_missing_data(dico_aa, dico_nuc, MIN_SPECIES_NB) ### DEF 2 ### |
149 | 149 |
150 k = filtered_bash_nuc.keys() | 150 k = list(filtered_bash_nuc.keys()) |
151 new_leng_nuc = 0 | 151 new_leng_nuc = 0 |
152 if k != []: | 152 if k != []: |
153 k0 = k[0] | 153 k0 = k[0] |
154 seq0 = filtered_bash_nuc[k0] | 154 seq0 = filtered_bash_nuc[k0] |
155 new_leng_nuc = len(seq0) | 155 new_leng_nuc = len(seq0) |
156 | 156 |
157 ## 4.3 ## Change file name for output, depending the number of species remaining in the alignment | 157 ## 4.3 ## Change file name for output, depending the number of species remaining in the alignment |
158 n0+=1 | 158 n0+=1 |
159 #name_elems[1] = str(n0) | 159 #name_elems[1] = str(n0) |
160 name_elems[1] = file.split('_')[1] | 160 name_elems[1] = file.split('_')[1] |
161 name_elems[3] = str(len(filtered_bash_aa.keys())) | 161 name_elems[3] = str(len(list(filtered_bash_aa.keys()))) |
162 new_name = "_".join(name_elems) | 162 new_name = "_".join(name_elems) |
163 | 163 |
164 ## 4.5 ## Write filtered alignment in OUTPUTs | 164 ## 4.5 ## Write filtered alignment in OUTPUTs |
165 ## aa | 165 ## aa |
166 if filtered_bash_aa != {} and new_leng_nuc >= MIN_LENGTH_FINAL_ALIGNMENT_NUC: | 166 if filtered_bash_aa != {} and new_leng_nuc >= MIN_LENGTH_FINAL_ALIGNMENT_NUC: |
167 OUTaa=open("%s/%s" %(path_OUT1, new_name), "w") | 167 OUTaa=open("%s/%s" %(path_OUT1, new_name), "w") |
168 for fasta_name in filtered_bash_aa.keys(): | 168 for fasta_name in list(filtered_bash_aa.keys()): |
169 seq_aa = filtered_bash_aa[fasta_name] | 169 seq_aa = filtered_bash_aa[fasta_name] |
170 OUTaa.write("%s\n" %fasta_name) | 170 OUTaa.write("%s\n" %fasta_name) |
171 OUTaa.write("%s\n" %seq_aa) | 171 OUTaa.write("%s\n" %seq_aa) |
172 OUTaa.close() | 172 OUTaa.close() |
173 # nuc | 173 # nuc |
174 if filtered_bash_nuc != {} and new_leng_nuc >= MIN_LENGTH_FINAL_ALIGNMENT_NUC: | 174 if filtered_bash_nuc != {} and new_leng_nuc >= MIN_LENGTH_FINAL_ALIGNMENT_NUC: |
175 good+=1 | 175 good+=1 |
176 OUTnuc=open("%s/%s" %(path_OUT2, new_name), "w") | 176 OUTnuc=open("%s/%s" %(path_OUT2, new_name), "w") |
177 for fasta_name in filtered_bash_nuc.keys(): | 177 for fasta_name in list(filtered_bash_nuc.keys()): |
178 seq_nuc = filtered_bash_nuc[fasta_name] | 178 seq_nuc = filtered_bash_nuc[fasta_name] |
179 OUTnuc.write("%s\n" %fasta_name) | 179 OUTnuc.write("%s\n" %fasta_name) |
180 OUTnuc.write("%s\n" %seq_nuc) | 180 OUTnuc.write("%s\n" %seq_nuc) |
181 OUTnuc.close() | 181 OUTnuc.close() |
182 else: | 182 else: |
183 bad+=1 | 183 bad+=1 |
184 | 184 |
185 | 185 |
186 ## 5 ## Print | 186 ## 5 ## Print |
187 print "*************** 2nd Filter : removal of the indel ***************" | 187 print("*************** 2nd Filter : removal of the indel ***************") |
188 print "\nTotal number of locus recorded = %d" %n0 | 188 print("\nTotal number of locus recorded = %d" %n0) |
189 print "\tTotal number of locus with no indels (SAVED) = %d" %good | 189 print("\tTotal number of locus with no indels (SAVED) = %d" %good) |
190 print "\tTotal number of locus, when removing indel, wich are empty (EXCLUDED) = %d" %bad | 190 print("\tTotal number of locus, when removing indel, wich are empty (EXCLUDED) = %d" %bad) |
191 print "" | 191 print("") |