comparison converter.py @ 0:37392af48c37 draft default tip

Uploaded
author izsam
date Thu, 19 Mar 2015 11:46:50 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:37392af48c37
1
2 #!/usr/bin/env python
3
4 import sys
5 import string
6 import os
7
class convertitori(object):
    """Convert sequence data between FASTA, PHYLIP, NEXUS and GenBank.

    ``input`` is the list of text lines of the source file and ``output`` is
    a writable text stream.  Each public method is named
    ``<source><target>`` (``f`` = FASTA, ``p`` = PHYLIP, ``n`` = NEXUS,
    ``g`` = GenBank), reads ``self.input`` and writes the converted records
    to ``self.output``.  Input that cannot be parsed terminates the program
    through ``sys.exit`` with an explanatory message.

    The code runs unchanged on Python 2 and Python 3 (no true-division or
    dict-view assumptions, no str/int comparisons).
    """

    _NEXUS_HEADER = ("#NEXUS\n\nBEGIN DATA;\nDIMENSIONS NTAX=%s NCHAR=%s;\n"
                     "FORMAT DATATYPE=DNA INTERLEAVE MISSING=-;\n\nMATRIX\n")
    _NOT_ALIGNED = ("The input file does not contains a multiple alignment in "
                    "fasta format. Please ensure that all the sequences have "
                    "the same length")
    _BAD_PHYLIP = ("The input file is not in the proper format. Please check "
                   "that your file is in Phylip standard interleaved (or "
                   "sequential) format ")
    _BAD_NEXUS = "The input file is not in Nexus format. "

    _NAME_WIDTH = 10      # width of a PHYLIP taxon name
    _NEXUS_BLOCK = 100    # alignment columns per interleaved NEXUS block
    _NEXUS_GROUP = 20     # columns between spaces inside a NEXUS row
    _FASTA_WIDTH = 80     # residues per FASTA sequence line
    _GENBANK_LINE = 60    # residues per GenBank ORIGIN line
    _GENBANK_GROUP = 10   # residues between spaces inside an ORIGIN line

    def __init__(self, input, inouttype, type, output):
        self.input = input
        self.inouttype = inouttype
        self.type = type
        self.output = output

    # ------------------------------------------------------------------
    # Formatting helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _group(chunk, size):
        """Return ``chunk`` with a space after every complete ``size`` characters."""
        pieces = []
        for start in range(0, len(chunk), size):
            piece = chunk[start:start + size]
            pieces.append(piece)
            if len(piece) == size:
                pieces.append(" ")
        return "".join(pieces)

    @staticmethod
    def _fixed_name(name):
        """Truncate or underscore-pad ``name`` to exactly ten characters."""
        width = convertitori._NAME_WIDTH
        return name[:width].ljust(width, "_")

    def _write_fasta(self, name, sequence):
        """Write one FASTA record, wrapping the sequence at 80 columns."""
        out = self.output
        out.write(">" + name + "\n")
        for start in range(0, len(sequence), self._FASTA_WIDTH):
            out.write(sequence[start:start + self._FASTA_WIDTH] + "\n")

    def _write_nexus(self, records, ntax, nchar):
        """Write ``records`` as an interleaved NEXUS DATA block.

        ``records`` is a non-empty list of ``(name, sequence)`` pairs.  Rows
        are ``name<TAB>columns`` with a space after every 20 columns; every
        complete 100-column block is followed by three blank lines.  The
        matrix is closed with ``;`` and ``END;`` so the result is a complete
        NEXUS block.
        """
        out = self.output
        out.write(self._NEXUS_HEADER % (ntax, nchar))
        longest = max(len(sequence) for _, sequence in records)
        for start in range(0, longest, self._NEXUS_BLOCK):
            stop = start + self._NEXUS_BLOCK
            for name, sequence in records:
                row = self._group(sequence[start:stop], self._NEXUS_GROUP)
                out.write(name + "\t" + row + "\n")
            if stop <= longest:
                out.write("\n\n\n")
        out.write(";\nEND;\n")

    # ------------------------------------------------------------------
    # Parsing helpers
    # ------------------------------------------------------------------

    def _read_fasta(self):
        """Return ``[(name, sequence)]`` parsed from FASTA lines.

        A record starts at a line beginning with ``>``; the name is the rest
        of that line with surrounding whitespace removed and inner spaces
        replaced by underscores.  Lines before the first header are ignored.
        """
        records = []
        for line in self.input:
            if line.startswith(">"):
                records.append((line[1:].strip().replace(" ", "_"), []))
            elif records:
                records[-1][1].append(line.strip())
        return [(name, "".join(parts)) for name, parts in records]

    def _read_alignment(self):
        """Return FASTA records with ten-character names, all of equal length.

        Exits with the "not a multiple alignment" message when there are no
        records or when the sequences differ in length.
        """
        records = [(self._fixed_name(name), sequence)
                   for name, sequence in self._read_fasta()]
        if not records:
            sys.exit(self._NOT_ALIGNED)
        width = len(records[0][1])
        for _, sequence in records:
            if len(sequence) != width:
                sys.exit(self._NOT_ALIGNED)
        return records

    @staticmethod
    def _split_phylip_row(line):
        """Split a first-block PHYLIP row into ``(name, residues)``.

        A tab, when present, separates the name from the data; otherwise the
        name occupies the first ten columns.  All whitespace is removed from
        the residues and the name is stripped of padding.
        """
        row = line.rstrip("\r\n")
        if "\t" in row:
            name, data = row.split("\t", 1)
        else:
            name = row[:convertitori._NAME_WIDTH]
            data = row[convertitori._NAME_WIDTH:]
        return name.strip(), "".join(data.split())

    def _phylip_rows(self, body, ntax):
        """Parse sequential (one row per taxon) or interleaved PHYLIP lines.

        The first ``ntax`` non-blank lines carry name and data.  Every later
        non-blank line is appended to the taxon at the same position within
        its block; a blank line starts a new block.  Rows are matched by
        position, so identical lines are handled correctly.  An
        ``IndexError`` is raised when a block has more than ``ntax`` rows.
        """
        names = []
        chunks = []
        slot = 0
        for line in body:
            if not line.strip():
                slot = 0
                continue
            if len(names) < ntax:
                name, data = self._split_phylip_row(line)
                names.append(name)
                chunks.append([data])
            else:
                chunks[slot].append("".join(line.split()))
                slot += 1
        return [(name, "".join(parts)) for name, parts in zip(names, chunks)]

    def _phylip_name_lines(self, body):
        """Parse PHYLIP lines where names and sequences alternate.

        Blank lines are skipped.  Raises ``ValueError`` when the last name
        has no sequence line.
        """
        records = []
        name = None
        for line in body:
            text = line.strip()
            if not text:
                continue
            if name is None:
                name = text[:self._NAME_WIDTH]
            else:
                records.append((name, "".join(text.split())))
                name = None
        if name is not None:
            raise ValueError("taxon name without a sequence line")
        return records

    def _read_phylip(self):
        """Return ``(ntax, [(name, sequence)])`` parsed from PHYLIP lines.

        The taxon count comes from the first header field.  The layout is
        chosen as before: when the body does not have exactly ``ntax`` lines
        and its first line is at most ten characters long, names and
        sequences are taken from alternating lines; otherwise the rows are
        read as sequential/interleaved.  Exits with the PHYLIP format
        message on any parsing failure.
        """
        try:
            ntax = int(self.input[0].split()[0])
            body = self.input[1:]
            if len(body) != ntax and len(body[0]) <= self._NAME_WIDTH + 1:
                records = self._phylip_name_lines(body)
            else:
                records = self._phylip_rows(body, ntax)
        except (IndexError, ValueError):
            sys.exit(self._BAD_PHYLIP)
        if not records:
            sys.exit(self._BAD_PHYLIP)
        return ntax, records

    def _read_nexus(self):
        """Return ``[(name, sequence)]`` from the MATRIX of a NEXUS file.

        Rows are collected in order of first appearance and every row,
        including the first one of each taxon, contributes its data.  Reading
        stops at the ``;`` that terminates the matrix; lines starting with
        ``[`` are treated as comments.  Exits with the NEXUS format message
        when no row is found.
        """
        order = []
        chunks = {}
        in_matrix = False
        for line in self.input:
            tokens = line.split()
            if not in_matrix:
                # Accept the historical case-sensitive substring test as well
                # as a MATRIX keyword written in any case.
                if "MATRIX" in line or (tokens and
                                        tokens[0].upper() == "MATRIX"):
                    in_matrix = True
                continue
            finished = ";" in line
            if finished:
                tokens = line.split(";", 1)[0].split()
                if len(tokens) < 2:
                    # A bare ";" or "END;" carries no data row.
                    tokens = []
            if tokens and not tokens[0].startswith("["):
                name = tokens[0]
                if name not in chunks:
                    chunks[name] = []
                    order.append(name)
                chunks[name].extend(tokens[1:])
            if finished:
                break
        if not order:
            sys.exit(self._BAD_NEXUS)
        return [(name, "".join(chunks[name])) for name in order]

    # ------------------------------------------------------------------
    # Conversions
    # ------------------------------------------------------------------

    def fp(self):
        """FASTA alignment -> PHYLIP (one ``name<TAB>sequence`` row per taxon)."""
        records = self._read_alignment()
        out = self.output
        out.write(str(len(records)) + "\t" + str(len(records[0][1])) + "\n")
        for name, sequence in records:
            out.write(name + "\t" + sequence + "\n")

    def fn(self):
        """FASTA alignment -> interleaved NEXUS."""
        records = self._read_alignment()
        self._write_nexus(records, len(records), len(records[0][1]))

    def pn(self):
        """PHYLIP -> interleaved NEXUS.

        NTAX and NCHAR are copied from the PHYLIP header (NCHAR is written as
        an integer).  Spaces inside taxon names become underscores because
        NEXUS names cannot contain unquoted whitespace.
        """
        ntax, records = self._read_phylip()
        try:
            nchar = int(float(self.input[0].split()[1]))
        except (IndexError, ValueError, OverflowError):
            sys.exit(self._BAD_PHYLIP)
        records = [(name.replace(" ", "_"), sequence)
                   for name, sequence in records]
        self._write_nexus(records, ntax, nchar)

    def pf(self):
        """PHYLIP -> FASTA, one record per taxon."""
        _, records = self._read_phylip()
        for name, sequence in records:
            self._write_fasta(name, sequence)

    def nf(self):
        """NEXUS -> FASTA, one record per taxon in matrix order."""
        for name, sequence in self._read_nexus():
            self._write_fasta(name, sequence)

    def np(self):
        """NEXUS -> PHYLIP (one ``name<TAB>sequence`` row per taxon).

        The header reports the number of taxa and the length of the first
        sequence; names are truncated or underscore-padded to ten characters.
        """
        records = self._read_nexus()
        out = self.output
        out.write(str(len(records)) + "\t" + str(len(records[0][1])) + "\n")
        for name, sequence in records:
            out.write(self._fixed_name(name) + "\t" + sequence + "\n")

    def fg(self):
        """FASTA -> minimal GenBank records (LOCUS, ORIGIN, ``//``).

        Each ORIGIN line is ``position<TAB>residues`` with 60 residues per
        line and a space after every 10 residues.
        """
        out = self.output
        for name, sequence in self._read_fasta():
            out.write("LOCUS\t%s\t%s bp\nORIGIN\n" % (name, len(sequence)))
            for start in range(0, len(sequence), self._GENBANK_LINE):
                row = self._group(sequence[start:start + self._GENBANK_LINE],
                                  self._GENBANK_GROUP)
                out.write(str(start + 1) + "\t" + row + "\n")
            out.write("//\n\n")

    def _write_genbank_as_fasta(self, name, words, pieces):
        """Write one GenBank record as FASTA (``name<TAB>definition`` header)."""
        header = name + ("\t" + " ".join(words) if words else "")
        self._write_fasta(header, "".join(pieces))

    def gf(self):
        """GenBank -> FASTA.

        Keywords are recognised only as the first token of a line, so text
        such as ``http://`` inside a comment does not end a record.  The
        header is the LOCUS name, followed by a tab and the words of the
        DEFINITION line when there is one.  After ORIGIN the first token of
        each line (the position) is dropped and the rest is sequence.  A
        record is written at ``//``; a record that already has sequence data
        is also written when a new LOCUS or the end of input is reached.
        """
        name = None          # None while no LOCUS record is open
        words = []
        pieces = []
        in_origin = False
        for line in self.input:
            tokens = line.split()
            if not tokens:
                continue
            keyword = tokens[0]
            if keyword == "LOCUS":
                if name is not None and pieces:
                    self._write_genbank_as_fasta(name, words, pieces)
                name = tokens[1] if len(tokens) > 1 else ""
                words, pieces, in_origin = [], [], False
            elif name is None:
                continue
            elif keyword == "DEFINITION":
                words.extend(tokens[1:])
            elif keyword == "ORIGIN":
                in_origin = True
            elif keyword == "//":
                self._write_genbank_as_fasta(name, words, pieces)
                name = None
            elif in_origin:
                pieces.extend(tokens[1:])
        if name is not None and pieces:
            self._write_genbank_as_fasta(name, words, pieces)
394
class check_fileformat(object):
    """Check that the input lines look like the declared source format.

    ``inouttype`` is the conversion code (for example ``"f-p"``); only its
    first letter, the source format, is used: ``f`` = FASTA, ``g`` = GenBank,
    ``p`` = PHYLIP, ``n`` = NEXUS.  ``input`` is the list of lines of the
    file.  ``single()`` and ``multi()`` return ``"ok"`` when the file passes,
    terminate the program with ``sys.exit(message)`` when it does not, and
    return ``None`` when the source format is not handled by that mode.
    """

    _BAD_FASTA = ("The input file is not in fasta format. Please check that "
                  "the first row starts with > and that the sequence starts "
                  "from the second line")
    _BAD_GENBANK = ("The input file is not in GenBank format. Please make "
                    "sure that the file contains at least the LOCUS and "
                    "ORIGIN fields. The file must also ends with //")
    _BAD_PHYLIP = "the input file is not in phylip format."
    _BAD_NEXUS = "the input file is not in nexus format."
    _ONE_SEQUENCE = "There is only one sequence in the file"
    _IS_MULTI = ("The input file is a multi-fasta file. Please resubmit the "
                 "job using the 'multi sequence' option")
    _IS_SINGLE = ("The input file is a single-fasta file. Please resubmit "
                  "the job using the 'single sequence' option")

    def __init__(self, inouttype, input):
        # Slicing keeps an empty conversion code from raising IndexError.
        self.intype = inouttype[:1]
        self.infile = input

    def _count_fasta_headers(self):
        """Return the number of lines that start with ``>``."""
        return sum(1 for line in self.infile if line.startswith(">"))

    def _check_genbank(self):
        """Return ``"ok"`` for a plausible GenBank file, exit otherwise.

        The file must contain LOCUS, ORIGIN and ``//`` and, after ORIGIN, at
        least one sequence line: either a line of seven or more fields (the
        historical rule) or a line that starts with a position number and
        has at least one residue group, so records shorter than 60 bp are
        accepted too.
        """
        locus = origin = end = has_sequence = False
        for line in self.infile:
            fields = line.split()
            if "LOCUS" in line:
                locus = True
            if "ORIGIN" in line:
                origin = True
            elif origin and (len(fields) >= 7 or
                             (len(fields) >= 2 and fields[0].isdigit())):
                has_sequence = True
            if "//" in line:
                end = True
        if not (locus and origin and end and has_sequence):
            sys.exit(self._BAD_GENBANK)
        return "ok"

    @staticmethod
    def _declared_ntax(lowered):
        """Return the integer following ``ntax =`` in a lower-cased line, or None."""
        position = lowered.find("ntax")
        if position < 0:
            return None
        rest = lowered[position + 4:].lstrip()
        if not rest.startswith("="):
            return None
        digits = ""
        for char in rest[1:].lstrip():
            if not char.isdigit():
                break
            digits += char
        return int(digits) if digits else None

    def single(self):
        """Validate a single-sequence FASTA or GenBank file."""
        if self.intype == "f":
            headers = self._count_fasta_headers()
            if headers > 1:
                sys.exit(self._IS_MULTI)
            if headers == 0 or len(self.infile) < 2:
                sys.exit(self._BAD_FASTA)
            return "ok"
        if self.intype == "g":
            return self._check_genbank()
        return None

    def multi(self):
        """Validate a multi-sequence PHYLIP, NEXUS, FASTA or GenBank file."""
        if self.intype == "p":
            header = self.infile[0].split() if self.infile else []
            if len(header) not in (2, 3):
                sys.exit(self._BAD_PHYLIP)
            try:
                taxa = int(header[0])
            except ValueError:
                sys.exit(self._BAD_PHYLIP)
            if taxa <= 1:
                sys.exit(self._ONE_SEQUENCE)
            return "ok"
        if self.intype == "n":
            if not self.infile or "#NEXUS" not in self.infile[0]:
                sys.exit(self._BAD_NEXUS)
            begin = matrix = False
            ntax = None
            for line in self.infile:
                lowered = line.lower()
                if "begin data;" in lowered:
                    begin = True
                if "matrix" in lowered:
                    matrix = True
                found = self._declared_ntax(lowered)
                if found is not None:
                    ntax = found
            if not (begin and matrix):
                sys.exit(self._BAD_NEXUS)
            # Reject only when a taxon count was actually declared; files
            # without a readable NTAX keep passing as they always did.
            if ntax is not None and ntax <= 1:
                sys.exit(self._ONE_SEQUENCE)
            return "ok"
        if self.intype == "f":
            headers = self._count_fasta_headers()
            if headers == 1:
                sys.exit(self._IS_SINGLE)
            if headers == 0 or len(self.infile) < 4:
                sys.exit(self._BAD_FASTA)
            return "ok"
        if self.intype == "g":
            return self._check_genbank()
        return None
496
497
def main(input, output, inouttype, type):
    """Validate ``input`` and run the requested conversion into ``output``.

    ``input`` is the list of lines of the source file, ``output`` a writable
    stream, ``inouttype`` the conversion code ``"<source>-<target>"`` (for
    example ``"f-p"``) and ``type`` either ``"single"`` or ``"multi"``.

    ``output`` is always closed, including when validation or conversion
    terminates through ``sys.exit``.  An unknown ``type`` or a conversion
    that is not available for that ``type`` exits with a message instead of
    silently producing an empty file.
    """
    # Conversion code -> name of the converter method, per job type.
    routes = {
        "single": {"f-g": "fg", "g-f": "gf"},
        "multi": {
            "f-g": "fg", "g-f": "gf",
            "f-p": "fp", "f-n": "fn",
            "p-f": "pf", "p-n": "pn",
            "n-p": "np", "n-f": "nf",
        },
    }
    try:
        if type not in routes:
            sys.exit("Unknown sequence type '%s': expected 'single' or "
                     "'multi'" % type)
        check = check_fileformat(inouttype, input)
        verdict = check.single() if type == "single" else check.multi()
        method = routes[type].get(inouttype)
        if verdict != "ok" or method is None:
            sys.exit("Unsupported conversion '%s' for %s sequence input"
                     % (inouttype, type))
        conv = convertitori(input, inouttype, type, output)
        getattr(conv, method)()
    finally:
        output.close()
529
if __name__ == "__main__":
    # Command line: <input file> <output file> <conversion code> <single|multi>
    if len(sys.argv) < 5:
        sys.exit("Usage: %s <input file> <output file> "
                 "<conversion, e.g. f-p> <single|multi>" % sys.argv[0])
    # Read the whole input up front and release the handle immediately.
    with open(sys.argv[1], "r") as source:
        lines = source.readlines()
    # Append mode is kept from the original script; main() closes the stream.
    sink = open(sys.argv[2], "a")
    main(lines, sink, sys.argv[3], sys.argv[4])