comparison scripts/S01_concatenate.py @ 0:b186cae246bd draft default tip

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author abims-sbr
date Fri, 01 Feb 2019 10:27:42 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:b186cae246bd
1 #!/usr/bin/python
2 ## Author: Eric Fontanillas
3 ## Last modification: 17/06/2011
4 ## Subject: find and remove indels
5
6 ###############################
7 ##### DEF 0 : Dico fasta #####
8 ###############################
9 def dico(F2):
10 dicoco = {}
11 with open(F2, "r") as file:
12 for name, query in itertools.izip_longest(*[file]*2):
13 if not name:
14 break
15 if name[0] == ">":
16 fasta_name_query = name[:-1]
17 Sn = string.split(fasta_name_query, "||")
18 fasta_name_query = Sn[0]
19 fasta_seq_query = query[:-1]
20 dicoco[fasta_name_query]=fasta_seq_query
21 return dicoco
22 ###################################################################################
23
24
25 ####################
26 ###### DEF 11 ######
27 ####################
28 ## Concatenate sequences
29 ###########################
30 def concatenate(L_IN, SPECIES_ID_LIST):
31 ## 4 ## Process files
32 ## 4.1 ## Create the bash and the fasta names entries (name of the species)
33 bash_concat = {}
34
35 for species_ID in SPECIES_ID_LIST:
36 bash_concat[species_ID] = ''
37
38 ln_concat = 0
39 nb_locus = 0
40 pos=1
41 list_genes_position=[]
42 ## 4.2 ## Concatenate
43 for file in L_IN:
44 nb_locus=nb_locus+1
45
46 ## a ## Open alignments
47 dico_seq = dico(file) ### DEF 0 ###
48 ## b ## Get alignment length + genes positions for RAxML
49 key0 = dico_seq.keys()[0]
50 ln = len(dico_seq[key0])
51 ln_concat = ln_concat + ln
52
53 pos_start = pos
54 pos_end = pos+ln-1
55 pos=pos_end+1
56 position="%d-%d" %(pos_start, pos_end)
57 RAxML_name = file[:-6]
58 sublist = [RAxML_name, position]
59 list_genes_position.append(sublist)
60
61 ## c ## Generate "empty" sequence with alignment length * "-"
62 empty_seq = "-" * ln
63
64 ## d ## Concatenate
65 ## d.1 ## Detect missing species in this alignment
66 list_ID=[]
67 list_absent_ID=[]
68 bash_fastaName={}
69 for fasta_name in dico_seq:
70 ID = fasta_name[1:3]
71 list_ID.append(ID)
72 seq = dico_seq[fasta_name]
73 bash_fastaName[ID]=fasta_name
74 for sp_ID in SPECIES_ID_LIST:
75 if sp_ID not in list_ID:
76 list_absent_ID.append(sp_ID)
77
78 for ID in SPECIES_ID_LIST:
79 if ID in list_absent_ID:
80 bash_concat[ID] = bash_concat[ID] + empty_seq
81 else:
82 fasta_name = bash_fastaName[ID]
83 seq = dico_seq[fasta_name]
84 bash_concat[ID] = bash_concat[ID] + seq
85
86 return(bash_concat, ln_concat, nb_locus, list_genes_position)
87 ####################################
88
89
90 ########################################
91 ##### DEF 12 : get codon position #####
92 ########################################
93 def get_codon_position(seq_inORF):
94
95 ln = len(seq_inORF)
96
97 i=0
98 seq_pos1=""
99 seq_pos2=""
100 seq_pos12=""
101 seq_pos3=""
102 while i<ln:
103 pos1 = seq_inORF[i]
104 pos2 = seq_inORF[i+1]
105 pos3 = seq_inORF[i+2]
106
107 seq_pos1 = seq_pos1 + pos1
108 seq_pos2 = seq_pos2 + pos2
109 seq_pos12 = seq_pos12 + pos1 + pos2
110 seq_pos3 = seq_pos3 + pos3
111
112 i = i+3
113
114 return(seq_pos1, seq_pos2, seq_pos12, seq_pos3)
115 ###############################################################################
116
117
118
119 #######################
120 ##### RUN RUN RUN #####
121 #######################
122 import string, os, time, re, sys, itertools
123
124 list_species = []
125 SPECIES_ID_LIST = []
126 fasta = "^.*fasta$"
127 i=3
128
129 ## Arguments
130 infiles_filter_assemblies = sys.argv[1]
131 format_run = sys.argv[2]
132
133 ## add file to list_species
134 list_species = str.split(infiles_filter_assemblies,",")
135
136 ## in SPECIES_ID_LIST, only the 2 first letters of name of species
137 for name in list_species :
138 name = name[:2]
139 SPECIES_ID_LIST.append(name)
140
141 ## add alignment files to L_IN
142 list_files = []
143 with open(sys.argv[3], 'r') as f:
144 for line in f.readlines():
145 list_files.append(line.strip('\n'))
146
147 L_IN = []
148 for file in list_files:
149 L_IN.append(file)
150
151 #L_IN = str.split(input_alignments,",")
152 print(L_IN)
153
154 ### 1 ### Proteic
155 if format_run == "proteic" :
156
157 OUT1 = open("02_Concatenation_aa.fas", "w")
158 OUT2 = open("02_Concatenation_aa.phy", "w")
159 OUT3 = open("02_Concatenation_aa.nex", "w")
160 OUT_PARTITION_gene_AA = open("06_partitions_gene_AA","w")
161
162
163 ## Get bash with concatenation
164 bash_concatenation, ln, nb_locus,list_genes_position= concatenate(L_IN, SPECIES_ID_LIST) ### DEF 11 ##
165
166 ## Write gene AA partition file for RAxML
167 for sublist in list_genes_position:
168 name = sublist[0]
169 positions=sublist[1]
170 OUT_PARTITION_gene_AA.write("DNA,%s=%s\n"%(name,positions))
171 OUT_PARTITION_gene_AA.close()
172
173 ## Get "ntax" for NEXUS HEADER
174 nb_taxa = len(bash_concatenation.keys())
175
176 print "******************** CONCATENATION ********************\n"
177 print "Process amino-acid concatenation:"
178 print "\tNumber of taxa aligned = %d" %nb_taxa
179 print "\tNumber of loci concatenated = %d\n" %nb_locus
180 print "\tTotal length of the concatenated sequences = %d" %ln
181
182
183 ## Print NEXUS HEADER:
184 OUT3.write("#NEXUS\n\n")
185 OUT3.write("Begin data;\n")
186 OUT3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln))
187 OUT3.write("\tFormat datatype=aa gap=-;\n")
188 OUT3.write("\tMatrix\n")
189
190 ## Print PHYLIP HEADER:
191 OUT2.write(" %d %d\n" %(nb_taxa, ln))
192
193 ## 3.5 ## Print outputs
194 for seq_name in bash_concatenation.keys():
195 seq = bash_concatenation[seq_name]
196
197 ## Filtering the sequence in case of remaining "?"
198 seq = string.replace(seq, "?", "-")
199
200 #print seq FASTA FORMAT
201 OUT1.write(">%s\n" %seq_name)
202 OUT1.write("%s\n" %seq)
203
204 #print seq PHYLIP FORMAT
205 OUT2.write("%s\n" %seq_name)
206 OUT2.write("%s\n" %seq)
207
208 #print seq NEXUS FORMAT
209 OUT3.write("%s" %seq_name)
210 OUT3.write(" %s\n" %seq)
211
212 OUT3.write("\t;\n")
213 OUT3.write("End;\n")
214 OUT1.close()
215 OUT2.close()
216 OUT2.close()
217
218
219 ### 2 ### Nucleic
220 elif format_run == "nucleic" :
221
222 OUT1 = open("03_Concatenation_nuc.fas", "w")
223 OUT2 = open("03_Concatenation_nuc.phy", "w")
224 OUT3 = open("03_Concatenation_nuc.nex", "w")
225
226 OUT1_pos12 = open("03_Concatenation_pos12_nuc.fas", "w")
227 OUT2_pos12 = open("03_Concatenation_pos12_nuc.phy", "w")
228 OUT3_pos12 = open("03_Concatenation_pos12_nuc.nex", "w")
229
230 OUT1_pos3 = open("03_Concatenation_pos3_nuc.fas", "w")
231 OUT2_pos3 = open("03_Concatenation_pos3_nuc.phy", "w")
232 OUT3_pos3 = open("03_Concatenation_pos3_nuc.nex", "w")
233
234 OUT_PARTITION_codon_12_3 = open("05_partitions_codon12_3","w")
235 OUT_PARTITION_gene_NUC = open("05_partitions_gene_NUC","w")
236 OUT_PARTITION_gene_PLUS_codon_12_3 = open("05_partitions_gene_PLUS_codon12_3","w")
237
238 ## Get bash with concatenation
239 bash_concatenation, ln, nb_locus, list_genes_position = concatenate(L_IN, SPECIES_ID_LIST) ### DEF 11 ##
240 ln_12 = ln/3*2 ### length of the alignment when only the 2 first codon position
241 ln_3 = ln/3 ### length of the alignment when only the third codon position
242
243 ## Write partition files for RAxML subsequent runs
244 # a # Codon partition
245 OUT_PARTITION_codon_12_3.write("DNA, p1=1-%d\\3,2-%d\\3\n" %(ln, ln))
246 OUT_PARTITION_codon_12_3.write("DNA, p2=3-%d\\3\n" %(ln))
247 OUT_PARTITION_codon_12_3.close()
248
249 # b # Gene partition
250 for sublist in list_genes_position:
251 name=sublist[0]
252 positions=sublist[1]
253 OUT_PARTITION_gene_NUC.write("DNA,%s=%s\n"%(name,positions))
254 OUT_PARTITION_gene_NUC.close()
255
256 # c # Mixed partition (codon + gene)
257 for sublist in list_genes_position:
258 name = sublist[0]
259 positions = sublist[1]
260 S1 = string.split(positions, "-")
261 pos_start1 = string.atoi(S1[0])
262 pos_end = string.atoi(S1[1])
263 pos_start2=pos_start1+1
264 pos_start3=pos_start2+1
265 partition1 = "DNA, %s_1=%d-%d\\3,%d-%d\\3\n" %(name,pos_start1, pos_end, pos_start2, pos_end)
266 partition2 = "DNA, %s_2=%d-%d\\3\n" %(name,pos_start3, pos_end)
267 OUT_PARTITION_gene_PLUS_codon_12_3.write(partition1)
268 OUT_PARTITION_gene_PLUS_codon_12_3.write(partition2)
269
270 OUT_PARTITION_gene_PLUS_codon_12_3.close()
271
272
273 ## Get "ntax" for NEXUS HEADER
274 nb_taxa = len(bash_concatenation.keys())
275
276 print "******************** CONCATENATION ********************\n"
277 print "Process nucleotides concatenation:"
278 print "\tNumber of taxa aligned = %d" %nb_taxa
279 print "\tNumber of loci concatenated = %d\n" %nb_locus
280 print "\tTotal length of the concatenated sequences [All codon positions] = %d" %ln
281 print "\t\tTotal length of the concatenated sequences [Codon positions 1 & 2] = %d" %ln_12
282 print "\t\tTotal length of the concatenated sequences [Codon position 3] = %d" %ln_3
283
284
285 ## Print NEXUS HEADER:
286 OUT3.write("#NEXUS\n\n")
287 OUT3.write("Begin data;\n")
288 OUT3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln))
289 OUT3.write("\tFormat datatype=dna gap=-;\n")
290 OUT3.write("\tMatrix\n")
291
292 OUT3_pos12.write("#NEXUS\n\n")
293 OUT3_pos12.write("Begin data;\n")
294 OUT3_pos12.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln_12))
295 OUT3_pos12.write("\tFormat datatype=dna gap=-;\n")
296 OUT3_pos12.write("\tMatrix\n")
297
298 OUT3_pos3.write("#NEXUS\n\n")
299 OUT3_pos3.write("Begin data;\n")
300 OUT3_pos3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln_3))
301 OUT3_pos3.write("\tFormat datatype=dna gap=-;\n")
302 OUT3_pos3.write("\tMatrix\n")
303
304 ## Print PHYLIP HEADER:
305 OUT2.write(" %d %d\n" %(nb_taxa, ln))
306 OUT2_pos12.write(" %d %d\n" %(nb_taxa, ln_12))
307 OUT2_pos3.write(" %d %d\n" %(nb_taxa, ln_3))
308
309 ## Print outputs
310 for seq_name in bash_concatenation.keys():
311 seq = bash_concatenation[seq_name]
312
313 ## Filtering the sequence in case of remaining "?"
314 seq = string.replace(seq, "?", "-")
315
316 ## Get the differentes codons partitions
317 seq_pos1, seq_pos2, seq_pos12, seq_pos3 = get_codon_position(seq) ### DEF 12 ###
318
319 #print seq FASTA FORMAT
320 OUT1.write(">%s\n" %seq_name)
321 OUT1.write("%s\n" %seq)
322 OUT1_pos12.write(">%s\n" %seq_name)
323 OUT1_pos12.write("%s\n" %seq_pos12)
324 OUT1_pos3.write(">%s\n" %seq_name)
325 OUT1_pos3.write("%s\n" %seq_pos3)
326
327 #print seq PHYLIP FORMAT
328 OUT2.write("%s\n" %seq_name)
329 OUT2.write("%s\n" %seq)
330 OUT2_pos12.write("%s\n" %seq_name)
331 OUT2_pos12.write("%s\n" %seq_pos12)
332 OUT2_pos3.write("%s\n" %seq_name)
333 OUT2_pos3.write("%s\n" %seq_pos3)
334
335 #print seq NEXUS FORMAT
336 OUT3.write("%s" %seq_name)
337 OUT3.write(" %s\n" %seq)
338 OUT3_pos12.write("%s" %seq_name)
339 OUT3_pos12.write(" %s\n" %seq_pos12)
340 OUT3_pos3.write("%s" %seq_name)
341 OUT3_pos3.write(" %s\n" %seq_pos3)
342
343
344 OUT3.write("\t;\n")
345 OUT3.write("End;\n")
346 OUT3_pos12.write("\t;\n")
347 OUT3_pos12.write("End;\n")
348 OUT3_pos3.write("\t;\n")
349 OUT3_pos3.write("End;\n")
350
351 OUT1.close()
352 OUT2.close()
353 OUT3.close()
354 OUT1_pos12.close()
355 OUT2_pos12.close()
356 OUT3_pos12.close()
357 OUT1_pos3.close()
358 OUT2_pos3.close()
359 OUT3_pos3.close()
360
361 print "\n\n\n******************** RAxML RUN ********************\n"