Mercurial > repos > izsam > phylogeny_converter
comparison converter.py @ 0:37392af48c37 draft default tip
Uploaded
author | izsam |
---|---|
date | Thu, 19 Mar 2015 11:46:50 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:37392af48c37 |
---|---|
1 | |
2 #!/usr/bin/env python | |
3 | |
4 import sys | |
5 import string | |
6 import os | |
7 | |
class convertitori(object):
    """Convert sequence data between FASTA, PHYLIP, NEXUS and GenBank.

    ``input`` is the list of lines of the source file and ``output`` is a
    writable file-like object.  Each public method is named after the
    conversion it performs, using the one-letter codes f (FASTA),
    p (PHYLIP), n (NEXUS) and g (GenBank): ``fp`` reads FASTA and writes
    PHYLIP, ``gf`` reads GenBank and writes FASTA, and so on.

    ``inouttype`` and ``type`` are stored but not used by the converters.
    Invalid input terminates the program through ``sys.exit`` with a
    user-facing message.
    """

    _NEXUS_HEADER = ("#NEXUS\n\nBEGIN DATA;\nDIMENSIONS NTAX=%s NCHAR=%s;\n"
                     "FORMAT DATATYPE=DNA INTERLEAVE MISSING=-;\n\nMATRIX\n")
    _ALIGN_ERROR = ("The input file does not contains a multiple alignment "
                    "in fasta format. Please ensure that all the sequences "
                    "have the same length")
    _PHYLIP_ERROR = ("The input file is not in the proper format. Please "
                     "check that your file is in Phylip standard interleaved "
                     "(or sequential) format ")
    _NEXUS_ERROR = "The input file is not in Nexus format. "

    def __init__(self, input, inouttype, type, output):
        self.input = input
        self.inouttype = inouttype
        self.type = type
        self.output = output

    # ------------------------------------------------------------------
    # small formatting helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _group(text, size):
        """Return *text* with a space inserted after every *size* characters.

        A chunk whose length is an exact multiple of *size* ends with a
        space, which is what the character-by-character writer this
        replaces produced.
        """
        pieces = [text[i:i + size] for i in range(0, len(text), size)]
        grouped = " ".join(pieces)
        if text and len(text) % size == 0:
            grouped += " "
        return grouped

    @staticmethod
    def _fixed_name(name):
        """Cut *name* to ten characters, or pad it with '_' up to ten."""
        return name[:10] + "_" * (10 - len(name))

    @staticmethod
    def _split_phylip_line(line):
        """Split a PHYLIP data line into ``(name, sequence)``.

        A tab within the first eleven characters is treated as the
        name/sequence separator (the layout ``fp`` and ``np`` write);
        otherwise the name is the first ten columns.  All whitespace is
        removed from the sequence.
        """
        line = line.rstrip("\r\n")
        tab = line.find("\t")
        if 0 < tab <= 10:
            name, data = line[:tab], line[tab + 1:]
        else:
            name, data = line[:10], line[10:]
        return name, "".join(data.split())

    # ------------------------------------------------------------------
    # readers: every reader returns a list of (name, sequence) tuples
    # ------------------------------------------------------------------
    def _read_fasta(self):
        """Parse ``self.input`` as FASTA; spaces in names become '_'.

        Lines that precede the first header are ignored.
        """
        records = []
        name = None
        chunks = []
        for line in self.input:
            if line.startswith(">"):
                if name is not None:
                    records.append((name, "".join(chunks)))
                name = line[1:].rstrip("\r\n").replace(" ", "_")
                chunks = []
            elif name is not None:
                chunks.append(line.strip())
        if name is not None:
            records.append((name, "".join(chunks)))
        return records

    def _aligned_fasta(self):
        """Read FASTA with ten-character names; exit unless it is aligned."""
        records = [(self._fixed_name(name), seq)
                   for name, seq in self._read_fasta()]
        if not records:
            sys.exit(self._ALIGN_ERROR)
        length = len(records[0][1])
        for _, seq in records:
            if len(seq) != length:
                sys.exit(self._ALIGN_ERROR)
        return records

    def _read_phylip(self):
        """Parse ``self.input`` as PHYLIP and return ``(ntax, records)``.

        Three layouts are recognised, tested in this order:

        * sequential, one line per taxon (exactly ``ntax`` data lines);
        * names on their own line (the first data line is at most eleven
          characters long), alternating with sequence lines;
        * interleaved: the first ``ntax`` lines carry name and data, and
          the remaining non-blank lines continue the taxa in the same
          order.

        Raises ``ValueError`` or ``IndexError`` on a malformed header or a
        file without data lines; callers turn that into a ``sys.exit``.
        """
        ntax = int(self.input[0].split()[0])
        body = self.input[1:]
        if ntax < 1 or not body:
            raise ValueError("empty PHYLIP file")

        if len(body) == ntax:
            return ntax, [self._split_phylip_line(line) for line in body]

        filled = [line for line in body if line.strip()]
        if len(body[0]) <= 11:
            records = []
            for name_line, seq_line in zip(filled[0::2], filled[1::2]):
                records.append((name_line.rstrip("\r\n")[:10],
                                "".join(seq_line.split())))
            return ntax, records

        names = []
        chunks = []
        row = 0
        # Positions are tracked by counting lines, never by list.index():
        # identical lines (very common in alignments) must still go to
        # different taxa.
        for line in filled:
            if len(names) < ntax:
                name, data = self._split_phylip_line(line)
                names.append(name)
                chunks.append([data])
            else:
                chunks[row % ntax].append("".join(line.split()))
                row += 1
        return ntax, [(name, "".join(parts))
                      for name, parts in zip(names, chunks)]

    def _read_nexus(self):
        """Parse the MATRIX of a NEXUS file, keeping taxa in file order.

        Data lines are ``name chunk chunk ...``; lines that repeat a name
        (interleaved blocks) are concatenated.  Reading stops at the ';'
        that terminates the matrix or at an ``END;`` line.
        """
        names = []
        data = {}
        in_matrix = False
        for line in self.input:
            text = line.strip()
            if not in_matrix:
                if text.upper().startswith("MATRIX"):
                    in_matrix = True
                continue
            if not text:
                continue
            if text.startswith(";"):
                break
            if text.upper().rstrip(";").strip() in ("END", "ENDBLOCK"):
                break
            last_line = text.endswith(";")
            fields = text.rstrip(";").split()
            if fields:
                name = fields[0]
                if name not in data:
                    names.append(name)
                    data[name] = []
                data[name].extend(fields[1:])
            if last_line:
                break
        return [(name, "".join(data[name])) for name in names]

    # ------------------------------------------------------------------
    # writers
    # ------------------------------------------------------------------
    def _write_phylip(self, records):
        """Write ``count<TAB>length`` and then one ``name<TAB>seq`` line each."""
        write = self.output.write
        write("%d\t%d\n" % (len(records), len(records[0][1])))
        for name, seq in records:
            write(name + "\t" + seq + "\n")

    def _write_nexus(self, records, ntax, nchar):
        """Write an interleaved NEXUS DATA block.

        Sequences are cut into blocks of 100 characters, shown in groups
        of 20.  Blocks are separated by three blank lines.  A final
        partial block is written only when some sequence still has data,
        and the matrix is closed with ``;`` / ``END;``.
        """
        write = self.output.write
        write(self._NEXUS_HEADER % (ntax, nchar))
        blocks = int(nchar) // 100
        for block in range(blocks):
            start = block * 100
            for name, seq in records:
                write(name + "\t"
                      + self._group(seq[start:start + 100], 20) + "\n")
            write("\n\n\n")
        tail = blocks * 100
        if any(len(seq) > tail for _, seq in records):
            for name, seq in records:
                write(name + "\t" + self._group(seq[tail:], 20) + "\n")
        write(";\nEND;\n")

    def _write_fasta(self, records):
        """Write each record as a ``>name`` line plus 80-column sequence lines."""
        write = self.output.write
        for name, seq in records:
            write(">" + name + "\n")
            for start in range(0, len(seq), 80):
                write(seq[start:start + 80] + "\n")

    # ------------------------------------------------------------------
    # public conversions
    # ------------------------------------------------------------------
    def fp(self):
        """FASTA alignment -> PHYLIP (sequential, tab-separated)."""
        self._write_phylip(self._aligned_fasta())

    def fn(self):
        """FASTA alignment -> interleaved NEXUS."""
        records = self._aligned_fasta()
        self._write_nexus(records, len(records), len(records[0][1]))

    def pn(self):
        """PHYLIP -> interleaved NEXUS.

        NTAX and NCHAR are copied from the PHYLIP header line.
        """
        try:
            ntax, records = self._read_phylip()
            nchar = int(float(self.input[0].split()[1]))
        except (IndexError, ValueError):
            sys.exit(self._PHYLIP_ERROR)
        self._write_nexus(records, ntax, nchar)

    def pf(self):
        """PHYLIP -> FASTA."""
        try:
            _, records = self._read_phylip()
        except (IndexError, ValueError):
            sys.exit(self._PHYLIP_ERROR)
        self._write_fasta(records)

    def nf(self):
        """NEXUS -> FASTA."""
        records = self._read_nexus()
        if not records:
            sys.exit(self._NEXUS_ERROR)
        self._write_fasta(records)

    def np(self):
        """NEXUS -> PHYLIP; names are cut or '_'-padded to ten characters."""
        records = self._read_nexus()
        if not records:
            sys.exit(self._NEXUS_ERROR)
        self._write_phylip([(self._fixed_name(name), seq)
                            for name, seq in records])

    def fg(self):
        """FASTA -> minimal GenBank records (LOCUS, ORIGIN, sequence, //).

        Sequence lines hold 60 characters in groups of 10 and start with
        the 1-based position of their first character.
        """
        write = self.output.write
        for name, seq in self._read_fasta():
            write("LOCUS\t%s\t%s bp\nORIGIN\n" % (name, len(seq)))
            for start in range(0, len(seq), 60):
                write(str(start + 1) + "\t"
                      + self._group(seq[start:start + 60], 10) + "\n")
            write("//\n\n")

    def gf(self):
        """GenBank -> FASTA.

        The FASTA header is the LOCUS name, a tab, and the words of the
        DEFINITION line each followed by a space.  Keywords are matched
        against the first token of a line, so text such as a URL that
        merely contains ``//`` does not end a record.
        """
        name = None
        pieces = []
        in_origin = False
        for line in self.input:
            fields = line.split()
            keyword = fields[0] if fields else ""
            if keyword == "LOCUS":
                name = (fields[1] if len(fields) > 1 else "") + "\t"
                pieces = []
                in_origin = False
            elif keyword == "DEFINITION":
                if name is not None:
                    name += "".join(word + " " for word in fields[1:])
            elif keyword == "ORIGIN":
                in_origin = True
            elif keyword == "//":
                if name is not None:
                    self._write_fasta([(name, "".join(pieces))])
                name = None
                in_origin = False
            elif in_origin:
                # The first token of a sequence line is its position number.
                pieces.extend(fields[1:])
394 | |
class check_fileformat(object):
    """Validate the input lines before a conversion is attempted.

    ``inouttype`` is a conversion code such as ``"f-g"``; only its first
    character (the input format: f, p, n or g) is used.  ``input`` is the
    list of lines of the file.  ``single()`` and ``multi()`` return the
    string ``"ok"`` when the file looks valid, terminate the program with
    ``sys.exit`` and a user-facing message when it does not, and return
    ``None`` for an input format they do not handle.
    """

    _FASTA_ERROR = ("The input file is not in fasta format. Please check "
                    "that the first row starts with > and that the sequence "
                    "starts from the second line")
    _GENBANK_ERROR = ("The input file is not in GenBank format. Please make "
                      "sure that the file contains at least the LOCUS and "
                      "ORIGIN fields. The file must also ends with //")
    _PHYLIP_ERROR = "the input file is not in phylip format."
    _NEXUS_ERROR = "the input file is not in nexus format."
    _ONE_SEQUENCE = "There is only one sequence in the file"

    def __init__(self, inouttype, input):
        self.intype = inouttype[0]
        self.infile = input

    def _fasta_headers(self):
        """Count the lines that start with '>'."""
        return sum(1 for line in self.infile if line.startswith(">"))

    def _check_genbank(self):
        """Return "ok" for a GenBank-looking file, otherwise exit.

        The file must contain LOCUS, ORIGIN and //, and at least one line
        after ORIGIN with seven or more whitespace-separated fields (a
        position number followed by six sequence groups).
        """
        locus = origin = end = full_line = False
        for line in self.infile:
            if "LOCUS" in line:
                locus = True
            if "ORIGIN" in line:
                origin = True
            elif origin and len(line.split()) >= 7:
                full_line = True
            if "//" in line:
                end = True
        if locus and origin and end and full_line:
            return "ok"
        sys.exit(self._GENBANK_ERROR)

    def single(self):
        """Validate a single-sequence file (FASTA or GenBank)."""
        if self.intype == "f":
            count = self._fasta_headers()
            if count > 1:
                sys.exit("The input file is a multi-fasta file. Please "
                         "resubmit the job using the 'multi sequence' option")
            if count == 0 or len(self.infile) < 2:
                sys.exit(self._FASTA_ERROR)
            return "ok"
        if self.intype == "g":
            return self._check_genbank()

    def multi(self):
        """Validate a multi-sequence file (PHYLIP, NEXUS, FASTA or GenBank)."""
        if self.intype == "p":
            header = self.infile[0].split() if self.infile else []
            if len(header) not in (2, 3):
                sys.exit(self._PHYLIP_ERROR)
            try:
                ntax = int(header[0])
            except ValueError:
                # A non-numeric count is a format error, not a crash.
                sys.exit(self._PHYLIP_ERROR)
            if ntax > 1:
                return "ok"
            sys.exit(self._ONE_SEQUENCE)

        if self.intype == "n":
            import re

            if not self.infile or "#NEXUS" not in self.infile[0]:
                sys.exit(self._NEXUS_ERROR)
            begin = matrix = False
            ntax = None
            for line in self.infile:
                lowered = line.lower()
                if "begin data;" in lowered:
                    begin = True
                if "matrix" in lowered:
                    matrix = True
                found = re.search(r"ntax\s*=\s*(\d+)", lowered)
                if found:
                    ntax = int(found.group(1))
            if not (begin and matrix):
                sys.exit(self._NEXUS_ERROR)
            # The taxon count is tested before "ok" is returned; it is
            # skipped when the file declares no readable NTAX value.
            if ntax is not None and ntax <= 1:
                sys.exit(self._ONE_SEQUENCE)
            return "ok"

        if self.intype == "f":
            count = self._fasta_headers()
            if count == 1:
                sys.exit("The input file is a single-fasta file. Please "
                         "resubmit the job using the 'single sequence' option")
            if count == 0 or len(self.infile) < 4:
                sys.exit(self._FASTA_ERROR)
            return "ok"

        if self.intype == "g":
            return self._check_genbank()
496 | |
497 | |
def main(input, output, inouttype, type):
    """Validate *input* and run the conversion named by *inouttype*.

    input     -- list of lines of the source file
    output    -- writable file object; it is always closed before returning,
                 including when validation or conversion exits
    inouttype -- conversion code "<in>-<out>" such as "f-p" or "g-f"
    type      -- "single" (only "f-g" and "g-f" are available) or "multi"

    Nothing is written when *type* or *inouttype* is not recognised or the
    checker does not return "ok".
    """
    single_codes = ("f-g", "g-f")
    multi_codes = single_codes + ("f-p", "f-n", "p-f", "p-n", "n-p", "n-f")
    try:
        check = check_fileformat(inouttype, input)
        if type == "single":
            verdict, allowed = check.single(), single_codes
        elif type == "multi":
            verdict, allowed = check.multi(), multi_codes
        else:
            verdict, allowed = None, ()
        if verdict == "ok" and inouttype in allowed:
            conv = convertitori(input, inouttype, type, output)
            # "f-p" is handled by the method fp, "g-f" by gf, and so on.
            getattr(conv, inouttype.replace("-", ""))()
    finally:
        output.close()
529 | |
if __name__ == "__main__":
    # Command line: converter.py <input file> <output file> <in-out code> <single|multi>
    if len(sys.argv) < 5:
        sys.exit("usage: converter.py INPUT OUTPUT INOUTTYPE single|multi")
    with open(sys.argv[1], "r") as source:
        lines = source.readlines()
    # The output is opened in append mode, as before; main() closes it.
    destination = open(sys.argv[2], "a")
    main(lines, destination, sys.argv[3], sys.argv[4])