Mercurial > repos > abims-sbr > concatphyl
comparison scripts/S01_concatenate.py @ 0:b186cae246bd draft default tip
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author | abims-sbr |
---|---|
date | Fri, 01 Feb 2019 10:27:42 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b186cae246bd |
---|---|
1 #!/usr/bin/python | |
2 ## Author: Eric Fontanillas | |
3 ## Last modification: 17/06/2011 | |
4 ## Subject: find and remove indels | |
5 | |
6 ############################### | |
7 ##### DEF 0 : Dico fasta ##### | |
8 ############################### | |
9 def dico(F2): | |
10 dicoco = {} | |
11 with open(F2, "r") as file: | |
12 for name, query in itertools.izip_longest(*[file]*2): | |
13 if not name: | |
14 break | |
15 if name[0] == ">": | |
16 fasta_name_query = name[:-1] | |
17 Sn = string.split(fasta_name_query, "||") | |
18 fasta_name_query = Sn[0] | |
19 fasta_seq_query = query[:-1] | |
20 dicoco[fasta_name_query]=fasta_seq_query | |
21 return dicoco | |
22 ################################################################################### | |
23 | |
24 | |
25 #################### | |
26 ###### DEF 11 ###### | |
27 #################### | |
28 ## Concatenate sequences | |
29 ########################### | |
30 def concatenate(L_IN, SPECIES_ID_LIST): | |
31 ## 4 ## Process files | |
32 ## 4.1 ## Create the bash and the fasta names entries (name of the species) | |
33 bash_concat = {} | |
34 | |
35 for species_ID in SPECIES_ID_LIST: | |
36 bash_concat[species_ID] = '' | |
37 | |
38 ln_concat = 0 | |
39 nb_locus = 0 | |
40 pos=1 | |
41 list_genes_position=[] | |
42 ## 4.2 ## Concatenate | |
43 for file in L_IN: | |
44 nb_locus=nb_locus+1 | |
45 | |
46 ## a ## Open alignments | |
47 dico_seq = dico(file) ### DEF 0 ### | |
48 ## b ## Get alignment length + genes positions for RAxML | |
49 key0 = dico_seq.keys()[0] | |
50 ln = len(dico_seq[key0]) | |
51 ln_concat = ln_concat + ln | |
52 | |
53 pos_start = pos | |
54 pos_end = pos+ln-1 | |
55 pos=pos_end+1 | |
56 position="%d-%d" %(pos_start, pos_end) | |
57 RAxML_name = file[:-6] | |
58 sublist = [RAxML_name, position] | |
59 list_genes_position.append(sublist) | |
60 | |
61 ## c ## Generate "empty" sequence with alignment length * "-" | |
62 empty_seq = "-" * ln | |
63 | |
64 ## d ## Concatenate | |
65 ## d.1 ## Detect missing species in this alignment | |
66 list_ID=[] | |
67 list_absent_ID=[] | |
68 bash_fastaName={} | |
69 for fasta_name in dico_seq: | |
70 ID = fasta_name[1:3] | |
71 list_ID.append(ID) | |
72 seq = dico_seq[fasta_name] | |
73 bash_fastaName[ID]=fasta_name | |
74 for sp_ID in SPECIES_ID_LIST: | |
75 if sp_ID not in list_ID: | |
76 list_absent_ID.append(sp_ID) | |
77 | |
78 for ID in SPECIES_ID_LIST: | |
79 if ID in list_absent_ID: | |
80 bash_concat[ID] = bash_concat[ID] + empty_seq | |
81 else: | |
82 fasta_name = bash_fastaName[ID] | |
83 seq = dico_seq[fasta_name] | |
84 bash_concat[ID] = bash_concat[ID] + seq | |
85 | |
86 return(bash_concat, ln_concat, nb_locus, list_genes_position) | |
87 #################################### | |
88 | |
89 | |
90 ######################################## | |
91 ##### DEF 12 : get codon position ##### | |
92 ######################################## | |
93 def get_codon_position(seq_inORF): | |
94 | |
95 ln = len(seq_inORF) | |
96 | |
97 i=0 | |
98 seq_pos1="" | |
99 seq_pos2="" | |
100 seq_pos12="" | |
101 seq_pos3="" | |
102 while i<ln: | |
103 pos1 = seq_inORF[i] | |
104 pos2 = seq_inORF[i+1] | |
105 pos3 = seq_inORF[i+2] | |
106 | |
107 seq_pos1 = seq_pos1 + pos1 | |
108 seq_pos2 = seq_pos2 + pos2 | |
109 seq_pos12 = seq_pos12 + pos1 + pos2 | |
110 seq_pos3 = seq_pos3 + pos3 | |
111 | |
112 i = i+3 | |
113 | |
114 return(seq_pos1, seq_pos2, seq_pos12, seq_pos3) | |
115 ############################################################################### | |
116 | |
117 | |
118 | |
119 ####################### | |
120 ##### RUN RUN RUN ##### | |
121 ####################### | |
122 import string, os, time, re, sys, itertools | |
123 | |
124 list_species = [] | |
125 SPECIES_ID_LIST = [] | |
126 fasta = "^.*fasta$" | |
127 i=3 | |
128 | |
129 ## Arguments | |
130 infiles_filter_assemblies = sys.argv[1] | |
131 format_run = sys.argv[2] | |
132 | |
133 ## add file to list_species | |
134 list_species = str.split(infiles_filter_assemblies,",") | |
135 | |
136 ## in SPECIES_ID_LIST, only the 2 first letters of name of species | |
137 for name in list_species : | |
138 name = name[:2] | |
139 SPECIES_ID_LIST.append(name) | |
140 | |
141 ## add alignment files to L_IN | |
142 list_files = [] | |
143 with open(sys.argv[3], 'r') as f: | |
144 for line in f.readlines(): | |
145 list_files.append(line.strip('\n')) | |
146 | |
147 L_IN = [] | |
148 for file in list_files: | |
149 L_IN.append(file) | |
150 | |
151 #L_IN = str.split(input_alignments,",") | |
152 print(L_IN) | |
153 | |
154 ### 1 ### Proteic | |
155 if format_run == "proteic" : | |
156 | |
157 OUT1 = open("02_Concatenation_aa.fas", "w") | |
158 OUT2 = open("02_Concatenation_aa.phy", "w") | |
159 OUT3 = open("02_Concatenation_aa.nex", "w") | |
160 OUT_PARTITION_gene_AA = open("06_partitions_gene_AA","w") | |
161 | |
162 | |
163 ## Get bash with concatenation | |
164 bash_concatenation, ln, nb_locus,list_genes_position= concatenate(L_IN, SPECIES_ID_LIST) ### DEF 11 ## | |
165 | |
166 ## Write gene AA partition file for RAxML | |
167 for sublist in list_genes_position: | |
168 name = sublist[0] | |
169 positions=sublist[1] | |
170 OUT_PARTITION_gene_AA.write("DNA,%s=%s\n"%(name,positions)) | |
171 OUT_PARTITION_gene_AA.close() | |
172 | |
173 ## Get "ntax" for NEXUS HEADER | |
174 nb_taxa = len(bash_concatenation.keys()) | |
175 | |
176 print "******************** CONCATENATION ********************\n" | |
177 print "Process amino-acid concatenation:" | |
178 print "\tNumber of taxa aligned = %d" %nb_taxa | |
179 print "\tNumber of loci concatenated = %d\n" %nb_locus | |
180 print "\tTotal length of the concatenated sequences = %d" %ln | |
181 | |
182 | |
183 ## Print NEXUS HEADER: | |
184 OUT3.write("#NEXUS\n\n") | |
185 OUT3.write("Begin data;\n") | |
186 OUT3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln)) | |
187 OUT3.write("\tFormat datatype=aa gap=-;\n") | |
188 OUT3.write("\tMatrix\n") | |
189 | |
190 ## Print PHYLIP HEADER: | |
191 OUT2.write(" %d %d\n" %(nb_taxa, ln)) | |
192 | |
193 ## 3.5 ## Print outputs | |
194 for seq_name in bash_concatenation.keys(): | |
195 seq = bash_concatenation[seq_name] | |
196 | |
197 ## Filtering the sequence in case of remaining "?" | |
198 seq = string.replace(seq, "?", "-") | |
199 | |
200 #print seq FASTA FORMAT | |
201 OUT1.write(">%s\n" %seq_name) | |
202 OUT1.write("%s\n" %seq) | |
203 | |
204 #print seq PHYLIP FORMAT | |
205 OUT2.write("%s\n" %seq_name) | |
206 OUT2.write("%s\n" %seq) | |
207 | |
208 #print seq NEXUS FORMAT | |
209 OUT3.write("%s" %seq_name) | |
210 OUT3.write(" %s\n" %seq) | |
211 | |
212 OUT3.write("\t;\n") | |
213 OUT3.write("End;\n") | |
214 OUT1.close() | |
215 OUT2.close() | |
216 OUT2.close() | |
217 | |
218 | |
219 ### 2 ### Nucleic | |
220 elif format_run == "nucleic" : | |
221 | |
222 OUT1 = open("03_Concatenation_nuc.fas", "w") | |
223 OUT2 = open("03_Concatenation_nuc.phy", "w") | |
224 OUT3 = open("03_Concatenation_nuc.nex", "w") | |
225 | |
226 OUT1_pos12 = open("03_Concatenation_pos12_nuc.fas", "w") | |
227 OUT2_pos12 = open("03_Concatenation_pos12_nuc.phy", "w") | |
228 OUT3_pos12 = open("03_Concatenation_pos12_nuc.nex", "w") | |
229 | |
230 OUT1_pos3 = open("03_Concatenation_pos3_nuc.fas", "w") | |
231 OUT2_pos3 = open("03_Concatenation_pos3_nuc.phy", "w") | |
232 OUT3_pos3 = open("03_Concatenation_pos3_nuc.nex", "w") | |
233 | |
234 OUT_PARTITION_codon_12_3 = open("05_partitions_codon12_3","w") | |
235 OUT_PARTITION_gene_NUC = open("05_partitions_gene_NUC","w") | |
236 OUT_PARTITION_gene_PLUS_codon_12_3 = open("05_partitions_gene_PLUS_codon12_3","w") | |
237 | |
238 ## Get bash with concatenation | |
239 bash_concatenation, ln, nb_locus, list_genes_position = concatenate(L_IN, SPECIES_ID_LIST) ### DEF 11 ## | |
240 ln_12 = ln/3*2 ### length of the alignment when only the 2 first codon position | |
241 ln_3 = ln/3 ### length of the alignment when only the third codon position | |
242 | |
243 ## Write partition files for RAxML subsequent runs | |
244 # a # Codon partition | |
245 OUT_PARTITION_codon_12_3.write("DNA, p1=1-%d\\3,2-%d\\3\n" %(ln, ln)) | |
246 OUT_PARTITION_codon_12_3.write("DNA, p2=3-%d\\3\n" %(ln)) | |
247 OUT_PARTITION_codon_12_3.close() | |
248 | |
249 # b # Gene partition | |
250 for sublist in list_genes_position: | |
251 name=sublist[0] | |
252 positions=sublist[1] | |
253 OUT_PARTITION_gene_NUC.write("DNA,%s=%s\n"%(name,positions)) | |
254 OUT_PARTITION_gene_NUC.close() | |
255 | |
256 # c # Mixed partition (codon + gene) | |
257 for sublist in list_genes_position: | |
258 name = sublist[0] | |
259 positions = sublist[1] | |
260 S1 = string.split(positions, "-") | |
261 pos_start1 = string.atoi(S1[0]) | |
262 pos_end = string.atoi(S1[1]) | |
263 pos_start2=pos_start1+1 | |
264 pos_start3=pos_start2+1 | |
265 partition1 = "DNA, %s_1=%d-%d\\3,%d-%d\\3\n" %(name,pos_start1, pos_end, pos_start2, pos_end) | |
266 partition2 = "DNA, %s_2=%d-%d\\3\n" %(name,pos_start3, pos_end) | |
267 OUT_PARTITION_gene_PLUS_codon_12_3.write(partition1) | |
268 OUT_PARTITION_gene_PLUS_codon_12_3.write(partition2) | |
269 | |
270 OUT_PARTITION_gene_PLUS_codon_12_3.close() | |
271 | |
272 | |
273 ## Get "ntax" for NEXUS HEADER | |
274 nb_taxa = len(bash_concatenation.keys()) | |
275 | |
276 print "******************** CONCATENATION ********************\n" | |
277 print "Process nucleotides concatenation:" | |
278 print "\tNumber of taxa aligned = %d" %nb_taxa | |
279 print "\tNumber of loci concatenated = %d\n" %nb_locus | |
280 print "\tTotal length of the concatenated sequences [All codon positions] = %d" %ln | |
281 print "\t\tTotal length of the concatenated sequences [Codon positions 1 & 2] = %d" %ln_12 | |
282 print "\t\tTotal length of the concatenated sequences [Codon position 3] = %d" %ln_3 | |
283 | |
284 | |
285 ## Print NEXUS HEADER: | |
286 OUT3.write("#NEXUS\n\n") | |
287 OUT3.write("Begin data;\n") | |
288 OUT3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln)) | |
289 OUT3.write("\tFormat datatype=dna gap=-;\n") | |
290 OUT3.write("\tMatrix\n") | |
291 | |
292 OUT3_pos12.write("#NEXUS\n\n") | |
293 OUT3_pos12.write("Begin data;\n") | |
294 OUT3_pos12.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln_12)) | |
295 OUT3_pos12.write("\tFormat datatype=dna gap=-;\n") | |
296 OUT3_pos12.write("\tMatrix\n") | |
297 | |
298 OUT3_pos3.write("#NEXUS\n\n") | |
299 OUT3_pos3.write("Begin data;\n") | |
300 OUT3_pos3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln_3)) | |
301 OUT3_pos3.write("\tFormat datatype=dna gap=-;\n") | |
302 OUT3_pos3.write("\tMatrix\n") | |
303 | |
304 ## Print PHYLIP HEADER: | |
305 OUT2.write(" %d %d\n" %(nb_taxa, ln)) | |
306 OUT2_pos12.write(" %d %d\n" %(nb_taxa, ln_12)) | |
307 OUT2_pos3.write(" %d %d\n" %(nb_taxa, ln_3)) | |
308 | |
309 ## Print outputs | |
310 for seq_name in bash_concatenation.keys(): | |
311 seq = bash_concatenation[seq_name] | |
312 | |
313 ## Filtering the sequence in case of remaining "?" | |
314 seq = string.replace(seq, "?", "-") | |
315 | |
316 ## Get the differentes codons partitions | |
317 seq_pos1, seq_pos2, seq_pos12, seq_pos3 = get_codon_position(seq) ### DEF 12 ### | |
318 | |
319 #print seq FASTA FORMAT | |
320 OUT1.write(">%s\n" %seq_name) | |
321 OUT1.write("%s\n" %seq) | |
322 OUT1_pos12.write(">%s\n" %seq_name) | |
323 OUT1_pos12.write("%s\n" %seq_pos12) | |
324 OUT1_pos3.write(">%s\n" %seq_name) | |
325 OUT1_pos3.write("%s\n" %seq_pos3) | |
326 | |
327 #print seq PHYLIP FORMAT | |
328 OUT2.write("%s\n" %seq_name) | |
329 OUT2.write("%s\n" %seq) | |
330 OUT2_pos12.write("%s\n" %seq_name) | |
331 OUT2_pos12.write("%s\n" %seq_pos12) | |
332 OUT2_pos3.write("%s\n" %seq_name) | |
333 OUT2_pos3.write("%s\n" %seq_pos3) | |
334 | |
335 #print seq NEXUS FORMAT | |
336 OUT3.write("%s" %seq_name) | |
337 OUT3.write(" %s\n" %seq) | |
338 OUT3_pos12.write("%s" %seq_name) | |
339 OUT3_pos12.write(" %s\n" %seq_pos12) | |
340 OUT3_pos3.write("%s" %seq_name) | |
341 OUT3_pos3.write(" %s\n" %seq_pos3) | |
342 | |
343 | |
344 OUT3.write("\t;\n") | |
345 OUT3.write("End;\n") | |
346 OUT3_pos12.write("\t;\n") | |
347 OUT3_pos12.write("End;\n") | |
348 OUT3_pos3.write("\t;\n") | |
349 OUT3_pos3.write("End;\n") | |
350 | |
351 OUT1.close() | |
352 OUT2.close() | |
353 OUT3.close() | |
354 OUT1_pos12.close() | |
355 OUT2_pos12.close() | |
356 OUT3_pos12.close() | |
357 OUT1_pos3.close() | |
358 OUT2_pos3.close() | |
359 OUT3_pos3.close() | |
360 | |
361 print "\n\n\n******************** RAxML RUN ********************\n" |