comparison scripts/S03_remove_site_with_not_enough_species_represented.py @ 0:eb95bf7f90ae draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author abims-sbr
date Fri, 01 Feb 2019 10:26:37 -0500
parents
children c79bdda8abfb
comparison
equal deleted inserted replaced
-1:000000000000 0:eb95bf7f90ae
1 #!/usr/bin/env python
2 # coding: utf8
3 ## Author: Eric Fontanillas
4 ## Modification: 03/09/14 by Julie BAFFARD
5 ## Last modification : 05/03/18 by Victor Mataigne
6
7 ## Description : find and remove indels
8
9 ####################
10 ###### DEF 2 #######
11 ####################
12 def remove_position_with_too_much_missing_data(bash_aa, bash_nuc, MIN_SPECIES_NB):
13
14 ## 1 ## Get alignment length
15 fasta_name0 = bash_aa.keys()[0]
16 ln_aa = len(bash_aa[fasta_name0])
17
18 ln_nuc = len(bash_nuc[fasta_name0])
19
20
21 ## 2 ## Get positions keeped in aa alignment
22 LIST_POSITION_KEEPED_aa = []
23 i=0
24 while i < ln_aa:
25 site = []
26 for fasta_name in bash_aa.keys():
27 pos = bash_aa[fasta_name][i]
28
29 if pos != "-" and pos != "?" and pos != "X":
30 site.append(pos)
31 if len(site) >= MIN_SPECIES_NB:
32 LIST_POSITION_KEEPED_aa.append(i)
33 i = i+1
34
35 ## 3 ## Get positions keeped in nuc alignment
36 LIST_POSITION_KEEPED_nuc = []
37 for position in LIST_POSITION_KEEPED_aa:
38 position1 = position*3
39 position2 = position*3 + 1
40 position3 = position*3 + 2
41 LIST_POSITION_KEEPED_nuc.append(position1)
42 LIST_POSITION_KEEPED_nuc.append(position2)
43 LIST_POSITION_KEEPED_nuc.append(position3)
44
45 ## 4 ## Create entries for "filtered_bash" for aa & nuc
46 filtered_bash_aa = {}
47 filtered_bash_nuc = {}
48 for fasta_name in bash_aa.keys():
49 filtered_bash_aa[fasta_name] = ""
50 for fasta_name in bash_nuc.keys():
51 filtered_bash_nuc[fasta_name] = ""
52
53 ## 5 ## Write "filtered_bash" for aa
54 j=0
55 while j < ln_aa:
56 for fasta_name in bash_aa.keys():
57 seq=filtered_bash_aa[fasta_name]
58 pos=bash_aa[fasta_name][j]
59
60 if j in LIST_POSITION_KEEPED_aa:
61 seq = seq + pos
62 filtered_bash_aa[fasta_name] = seq
63 j = j + 1
64
65 ## 6 ## Remove empty sequence
66 for name in filtered_bash_aa.keys():
67 seq = filtered_bash_aa[name]
68 if seq == '':
69 del filtered_bash_aa[name]
70
71
72 ## 7 ## Write "filtered_bash" for nuc
73 j=0
74 while j < ln_nuc:
75 for fasta_name in bash_nuc.keys():
76 seq=filtered_bash_nuc[fasta_name]
77 #print seq
78 pos=bash_nuc[fasta_name][j]
79
80 if j in LIST_POSITION_KEEPED_nuc:
81 seq = seq + pos
82 filtered_bash_nuc[fasta_name] = seq
83 j = j + 1
84
85 ## 8 ## Remove empty sequence
86 for name in filtered_bash_nuc.keys():
87 seq = filtered_bash_nuc[name]
88 if seq == '':
89 del filtered_bash_nuc[name]
90
91 return(filtered_bash_aa, filtered_bash_nuc)
92 ####################################
93
94
95 #######################
96 ##### RUN RUN RUN #####
97 #######################
98 import string, os, time, re, sys
99 from dico import dico
100
101 ### 0 ### PARAMETERS
102 MIN_SPECIES_NB = int(sys.argv[1])
103 MIN_LENGTH_FINAL_ALIGNMENT_NUC = int(sys.argv[2])
104 n0 = 0
105 bad = 0
106 good = 0
107 list_new_file = []
108 dicoco = {}
109 list_file = []
110 name_elems = ["orthogroup", "0", "with", "0", "species.fasta"]
111
112 ### 1 ### IN
113 path_IN1 = "./07_CDS_aa/"
114 L_IN1 = os.listdir(path_IN1)
115 lenght = len(L_IN1)
116 path_IN2 = "./07_CDS_nuc/"
117 L_IN2 = os.listdir(path_IN2)
118
119 ## 2 ## OUT
120 os.mkdir("08_CDS_aa_MINIMUM_MISSING_SEQUENCES")
121 path_OUT1 = "08_CDS_aa_MINIMUM_MISSING_SEQUENCES"
122 os.mkdir("08_CDS_nuc_MINIMUM_MISSING_SEQUENCES")
123 path_OUT2 = "08_CDS_nuc_MINIMUM_MISSING_SEQUENCES"
124
125
126 for file in L_IN1:
127 file_INaa = "%s/%s" %(path_IN1, file)
128 file_INnuc = "%s/%s" %(path_IN2, file)
129
130 dico_aa = dico(file_INaa) ### DEF 1 ###
131 dico_nuc = dico(file_INnuc) ### DEF 1 ###
132
133 if len(dico_aa) < MIN_SPECIES_NB :
134 list_file.append(file)
135
136 if list_file == lenght :
137 MIN_SPECIES_NB == MIN_SPECIES_NB - 1
138
139
140 for file in L_IN1 :
141 file_INaa = "%s/%s" %(path_IN1, file)
142 file_INnuc = "%s/%s" %(path_IN2, file)
143
144 dico_aa = dico(file_INaa) ### DEF 1 ###
145 dico_nuc = dico(file_INnuc) ### DEF 1 ###
146
147 ## 4.1 ## REMOVE POSITIONS WITH TOO MUCH MISSING DATA (i.e. not enough taxa represented at each position in the alignment)
148 filtered_bash_aa, filtered_bash_nuc = remove_position_with_too_much_missing_data(dico_aa, dico_nuc, MIN_SPECIES_NB) ### DEF 2 ###
149
150 k = filtered_bash_nuc.keys()
151 new_leng_nuc = 0
152 if k != []:
153 k0 = k[0]
154 seq0 = filtered_bash_nuc[k0]
155 new_leng_nuc = len(seq0)
156
157 ## 4.3 ## Change file name for output, depending the number of species remaining in the alignment
158 n0+=1
159 #name_elems[1] = str(n0)
160 name_elems[1] = file.split('_')[1]
161 name_elems[3] = str(len(filtered_bash_aa.keys()))
162 new_name = "_".join(name_elems)
163
164 ## 4.5 ## Write filtered alignment in OUTPUTs
165 ## aa
166 if filtered_bash_aa != {} and new_leng_nuc >= MIN_LENGTH_FINAL_ALIGNMENT_NUC:
167 OUTaa=open("%s/%s" %(path_OUT1, new_name), "w")
168 for fasta_name in filtered_bash_aa.keys():
169 seq_aa = filtered_bash_aa[fasta_name]
170 OUTaa.write("%s\n" %fasta_name)
171 OUTaa.write("%s\n" %seq_aa)
172 OUTaa.close()
173 # nuc
174 if filtered_bash_nuc != {} and new_leng_nuc >= MIN_LENGTH_FINAL_ALIGNMENT_NUC:
175 good+=1
176 OUTnuc=open("%s/%s" %(path_OUT2, new_name), "w")
177 for fasta_name in filtered_bash_nuc.keys():
178 seq_nuc = filtered_bash_nuc[fasta_name]
179 OUTnuc.write("%s\n" %fasta_name)
180 OUTnuc.write("%s\n" %seq_nuc)
181 OUTnuc.close()
182 else:
183 bad+=1
184
185
186 ## 5 ## Print
187 print "*************** 2nd Filter : removal of the indel ***************"
188 print "\nTotal number of locus recorded = %d" %n0
189 print "\tTotal number of locus with no indels (SAVED) = %d" %good
190 print "\tTotal number of locus, when removing indel, wich are empty (EXCLUDED) = %d" %bad
191 print ""