Mercurial > repos > abims-sbr > cds_search
comparison scripts/S03_remove_site_with_not_enough_species_represented.py @ 0:eb95bf7f90ae draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author | abims-sbr |
---|---|
date | Fri, 01 Feb 2019 10:26:37 -0500 |
parents | |
children | c79bdda8abfb |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:eb95bf7f90ae |
---|---|
1 #!/usr/bin/env python | |
2 # coding: utf8 | |
3 ## Author: Eric Fontanillas | |
4 ## Modification: 03/09/14 by Julie BAFFARD | |
5 ## Last modification : 05/03/18 by Victor Mataigne | |
6 | |
7 ## Description : find and remove indels | |
8 | |
9 #################### | |
10 ###### DEF 2 ####### | |
11 #################### | |
12 def remove_position_with_too_much_missing_data(bash_aa, bash_nuc, MIN_SPECIES_NB): | |
13 | |
14 ## 1 ## Get alignment length | |
15 fasta_name0 = bash_aa.keys()[0] | |
16 ln_aa = len(bash_aa[fasta_name0]) | |
17 | |
18 ln_nuc = len(bash_nuc[fasta_name0]) | |
19 | |
20 | |
21 ## 2 ## Get positions keeped in aa alignment | |
22 LIST_POSITION_KEEPED_aa = [] | |
23 i=0 | |
24 while i < ln_aa: | |
25 site = [] | |
26 for fasta_name in bash_aa.keys(): | |
27 pos = bash_aa[fasta_name][i] | |
28 | |
29 if pos != "-" and pos != "?" and pos != "X": | |
30 site.append(pos) | |
31 if len(site) >= MIN_SPECIES_NB: | |
32 LIST_POSITION_KEEPED_aa.append(i) | |
33 i = i+1 | |
34 | |
35 ## 3 ## Get positions keeped in nuc alignment | |
36 LIST_POSITION_KEEPED_nuc = [] | |
37 for position in LIST_POSITION_KEEPED_aa: | |
38 position1 = position*3 | |
39 position2 = position*3 + 1 | |
40 position3 = position*3 + 2 | |
41 LIST_POSITION_KEEPED_nuc.append(position1) | |
42 LIST_POSITION_KEEPED_nuc.append(position2) | |
43 LIST_POSITION_KEEPED_nuc.append(position3) | |
44 | |
45 ## 4 ## Create entries for "filtered_bash" for aa & nuc | |
46 filtered_bash_aa = {} | |
47 filtered_bash_nuc = {} | |
48 for fasta_name in bash_aa.keys(): | |
49 filtered_bash_aa[fasta_name] = "" | |
50 for fasta_name in bash_nuc.keys(): | |
51 filtered_bash_nuc[fasta_name] = "" | |
52 | |
53 ## 5 ## Write "filtered_bash" for aa | |
54 j=0 | |
55 while j < ln_aa: | |
56 for fasta_name in bash_aa.keys(): | |
57 seq=filtered_bash_aa[fasta_name] | |
58 pos=bash_aa[fasta_name][j] | |
59 | |
60 if j in LIST_POSITION_KEEPED_aa: | |
61 seq = seq + pos | |
62 filtered_bash_aa[fasta_name] = seq | |
63 j = j + 1 | |
64 | |
65 ## 6 ## Remove empty sequence | |
66 for name in filtered_bash_aa.keys(): | |
67 seq = filtered_bash_aa[name] | |
68 if seq == '': | |
69 del filtered_bash_aa[name] | |
70 | |
71 | |
72 ## 7 ## Write "filtered_bash" for nuc | |
73 j=0 | |
74 while j < ln_nuc: | |
75 for fasta_name in bash_nuc.keys(): | |
76 seq=filtered_bash_nuc[fasta_name] | |
77 #print seq | |
78 pos=bash_nuc[fasta_name][j] | |
79 | |
80 if j in LIST_POSITION_KEEPED_nuc: | |
81 seq = seq + pos | |
82 filtered_bash_nuc[fasta_name] = seq | |
83 j = j + 1 | |
84 | |
85 ## 8 ## Remove empty sequence | |
86 for name in filtered_bash_nuc.keys(): | |
87 seq = filtered_bash_nuc[name] | |
88 if seq == '': | |
89 del filtered_bash_nuc[name] | |
90 | |
91 return(filtered_bash_aa, filtered_bash_nuc) | |
92 #################################### | |
93 | |
94 | |
95 ####################### | |
96 ##### RUN RUN RUN ##### | |
97 ####################### | |
98 import string, os, time, re, sys | |
99 from dico import dico | |
100 | |
101 ### 0 ### PARAMETERS | |
102 MIN_SPECIES_NB = int(sys.argv[1]) | |
103 MIN_LENGTH_FINAL_ALIGNMENT_NUC = int(sys.argv[2]) | |
104 n0 = 0 | |
105 bad = 0 | |
106 good = 0 | |
107 list_new_file = [] | |
108 dicoco = {} | |
109 list_file = [] | |
110 name_elems = ["orthogroup", "0", "with", "0", "species.fasta"] | |
111 | |
112 ### 1 ### IN | |
113 path_IN1 = "./07_CDS_aa/" | |
114 L_IN1 = os.listdir(path_IN1) | |
115 lenght = len(L_IN1) | |
116 path_IN2 = "./07_CDS_nuc/" | |
117 L_IN2 = os.listdir(path_IN2) | |
118 | |
119 ## 2 ## OUT | |
120 os.mkdir("08_CDS_aa_MINIMUM_MISSING_SEQUENCES") | |
121 path_OUT1 = "08_CDS_aa_MINIMUM_MISSING_SEQUENCES" | |
122 os.mkdir("08_CDS_nuc_MINIMUM_MISSING_SEQUENCES") | |
123 path_OUT2 = "08_CDS_nuc_MINIMUM_MISSING_SEQUENCES" | |
124 | |
125 | |
126 for file in L_IN1: | |
127 file_INaa = "%s/%s" %(path_IN1, file) | |
128 file_INnuc = "%s/%s" %(path_IN2, file) | |
129 | |
130 dico_aa = dico(file_INaa) ### DEF 1 ### | |
131 dico_nuc = dico(file_INnuc) ### DEF 1 ### | |
132 | |
133 if len(dico_aa) < MIN_SPECIES_NB : | |
134 list_file.append(file) | |
135 | |
136 if list_file == lenght : | |
137 MIN_SPECIES_NB == MIN_SPECIES_NB - 1 | |
138 | |
139 | |
140 for file in L_IN1 : | |
141 file_INaa = "%s/%s" %(path_IN1, file) | |
142 file_INnuc = "%s/%s" %(path_IN2, file) | |
143 | |
144 dico_aa = dico(file_INaa) ### DEF 1 ### | |
145 dico_nuc = dico(file_INnuc) ### DEF 1 ### | |
146 | |
147 ## 4.1 ## REMOVE POSITIONS WITH TOO MUCH MISSING DATA (i.e. not enough taxa represented at each position in the alignment) | |
148 filtered_bash_aa, filtered_bash_nuc = remove_position_with_too_much_missing_data(dico_aa, dico_nuc, MIN_SPECIES_NB) ### DEF 2 ### | |
149 | |
150 k = filtered_bash_nuc.keys() | |
151 new_leng_nuc = 0 | |
152 if k != []: | |
153 k0 = k[0] | |
154 seq0 = filtered_bash_nuc[k0] | |
155 new_leng_nuc = len(seq0) | |
156 | |
157 ## 4.3 ## Change file name for output, depending the number of species remaining in the alignment | |
158 n0+=1 | |
159 #name_elems[1] = str(n0) | |
160 name_elems[1] = file.split('_')[1] | |
161 name_elems[3] = str(len(filtered_bash_aa.keys())) | |
162 new_name = "_".join(name_elems) | |
163 | |
164 ## 4.5 ## Write filtered alignment in OUTPUTs | |
165 ## aa | |
166 if filtered_bash_aa != {} and new_leng_nuc >= MIN_LENGTH_FINAL_ALIGNMENT_NUC: | |
167 OUTaa=open("%s/%s" %(path_OUT1, new_name), "w") | |
168 for fasta_name in filtered_bash_aa.keys(): | |
169 seq_aa = filtered_bash_aa[fasta_name] | |
170 OUTaa.write("%s\n" %fasta_name) | |
171 OUTaa.write("%s\n" %seq_aa) | |
172 OUTaa.close() | |
173 # nuc | |
174 if filtered_bash_nuc != {} and new_leng_nuc >= MIN_LENGTH_FINAL_ALIGNMENT_NUC: | |
175 good+=1 | |
176 OUTnuc=open("%s/%s" %(path_OUT2, new_name), "w") | |
177 for fasta_name in filtered_bash_nuc.keys(): | |
178 seq_nuc = filtered_bash_nuc[fasta_name] | |
179 OUTnuc.write("%s\n" %fasta_name) | |
180 OUTnuc.write("%s\n" %seq_nuc) | |
181 OUTnuc.close() | |
182 else: | |
183 bad+=1 | |
184 | |
185 | |
186 ## 5 ## Print | |
187 print "*************** 2nd Filter : removal of the indel ***************" | |
188 print "\nTotal number of locus recorded = %d" %n0 | |
189 print "\tTotal number of locus with no indels (SAVED) = %d" %good | |
190 print "\tTotal number of locus, when removing indel, wich are empty (EXCLUDED) = %d" %bad | |
191 print "" |