Mercurial > repos > drosofff > msp_fasta_tabular_converter
annotate fasta_tabular_converter.py @ 0:951cb6b3979b draft
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
author | drosofff |
---|---|
date | Sun, 21 Jun 2015 14:28:49 -0400 |
parents | |
children | 2f7278120be9 |
rev | line source |
---|---|
0
951cb6b3979b
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
1 #!/usr/bin/python |
951cb6b3979b
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
2 # |
951cb6b3979b
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
3 import sys |
951cb6b3979b
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
4 from collections import defaultdict |
951cb6b3979b
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
5 |
951cb6b3979b
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
def readfasta_writetabular(fasta, tabular):
    """Collapse a FASTA file into a tab-separated "sequence<TAB>count" table.

    Every line of *fasta* that does not start with ">" is treated as one
    sequence; identical sequences are tallied and written to *tabular*,
    most frequent first.

    Args:
        fasta: path of the FASTA file to read.
        tabular: path of the tabular file to write (overwritten).
    """
    # A local tally keeps repeated calls independent instead of accumulating
    # into shared module-level state.
    counts = defaultdict(int)
    with open(fasta, "r") as handle:
        for line in handle:
            if line.startswith(">"):
                continue
            # Strip only the line terminator: slicing off the last character
            # would lose a base when the final line has no trailing newline.
            sequence = line.rstrip("\r\n")
            if not sequence:
                # Blank lines are not reads; counting them would emit a row
                # with an empty first column.
                continue
            counts[sequence] += 1
    with open(tabular, "w") as out:
        for sequence in sorted(counts, key=counts.get, reverse=True):
            # write() rather than the print statement so the code runs on
            # both Python 2 and Python 3.
            out.write("%s\t%s\n" % (sequence, counts[sequence]))
951cb6b3979b
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
17 |
951cb6b3979b
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
18 |
951cb6b3979b
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
def readtabular_writefasta(tabular, fasta):
    """Expand a "sequence count" table into a FASTA file, one record per read.

    Each row of *tabular* is split on whitespace; the first field is the
    sequence and the second its integer count.  The sequence is written
    ``count`` times, and records are named with a running number starting
    at 1 (">1", ">2", ...).

    Args:
        tabular: path of the tabular file to read.
        fasta: path of the FASTA file to write (overwritten).

    Raises:
        IndexError: if a non-blank row has fewer than two fields.
        ValueError: if the second field is not an integer.
    """
    # The context manager closes both handles even when a row fails to parse.
    with open(tabular, "r") as source, open(fasta, "w") as out:
        counter = 0
        for line in source:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing empty line) instead
                # of failing on the missing count column.
                continue
            sequence = fields[0]
            copies = int(fields[1])
            for _ in range(copies):
                counter += 1
                out.write(">%s\n%s\n" % (counter, sequence))
951cb6b3979b
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
30 |
951cb6b3979b
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
def readtabular_writefastaweighted(tabular, fasta):
    """Convert a "sequence count" table into a weighted FASTA file.

    Each row of *tabular* becomes one record whose header is
    ">{n}_{count}", where ``n`` is a running number starting at 1 and
    ``count`` is the second field of the row copied verbatim; the first
    field is written as the sequence.

    Args:
        tabular: path of the tabular file to read.
        fasta: path of the FASTA file to write (overwritten).

    Raises:
        IndexError: if a non-blank row has fewer than two fields.
    """
    with open(tabular, "r") as source, open(fasta, "w") as out:
        counter = 0
        for line in source:
            # split() already discards the line terminator; slicing off the
            # last character first would truncate the count on a final row
            # that has no trailing newline.
            fields = line.split()
            if not fields:
                # Blank rows are skipped and do not consume a record number.
                continue
            counter += 1
            out.write(">%s_%s\n%s\n" % (counter, fields[1], fields[0]))
951cb6b3979b
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
41 |
951cb6b3979b
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
def readfastaeighted_writefastaweighted(fastaweigthed_input, fastaweigthed_reparsed):
    """Re-collapse a weighted FASTA file, merging identical sequences.

    Header lines look like ">{name}_{weight}"; the integer after the last
    underscore is the weight of the sequence line(s) that follow.  Weights
    of identical sequences are summed and the result is written as a new
    weighted FASTA, heaviest first, with headers ">{rank}_{total}".  The
    sum of all header weights is reported on standard output.

    Args:
        fastaweigthed_input: path of the weighted FASTA file to read.
        fastaweigthed_reparsed: path of the weighted FASTA file to write
            (overwritten).

    Raises:
        ValueError: if a header does not end in "_<integer>", or if a
            sequence line appears before any header.
    """
    # A local tally keeps repeated calls independent instead of accumulating
    # into shared module-level state.
    totals = defaultdict(int)
    number_reads = 0
    weight = None
    with open(fastaweigthed_input, "r") as source:
        for line in source:
            # Strip only the terminator; dropping the last character blindly
            # would corrupt an unterminated final line.
            text = line.rstrip("\r\n")
            if not text:
                continue
            if text.startswith(">"):
                weight = int(text[1:].split("_")[-1])
                number_reads += weight
            else:
                if weight is None:
                    raise ValueError(
                        "sequence line found before any FASTA header in %s"
                        % fastaweigthed_input
                    )
                totals[text] += weight
    with open(fastaweigthed_reparsed, "w") as out:
        ranked = sorted(totals, key=totals.get, reverse=True)
        for rank, sequence in enumerate(ranked, 1):
            out.write(">%s_%s\n%s\n" % (rank, totals[sequence], sequence))
    # Parenthesised single argument prints identically on Python 2 and 3.
    print("%s reads collapsed" % number_reads)
951cb6b3979b
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
59 |
951cb6b3979b
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
def readfastaeighted_writefasta(fastaweigthed, fasta):
    """Expand a weighted FASTA file into a plain FASTA file.

    Header lines look like ">{name}_{weight}"; the integer after the last
    underscore says how many times each following sequence line is
    written.  Output records are named with a running number starting at 1.

    Args:
        fastaweigthed: path of the weighted FASTA file to read.
        fasta: path of the FASTA file to write (overwritten).

    Raises:
        ValueError: if a header does not end in "_<integer>", or if a
            sequence line appears before any header.
    """
    with open(fastaweigthed, "r") as source, open(fasta, "w") as out:
        counter = 0
        weight = None
        for line in source:
            # Strip only the terminator so an unterminated last line keeps
            # its final character.
            text = line.rstrip("\r\n")
            if not text:
                # Blank lines carry no sequence; do not emit empty records.
                continue
            if text.startswith(">"):
                weight = int(text[1:].split("_")[-1])
                continue
            if weight is None:
                raise ValueError(
                    "sequence line found before any FASTA header in %s"
                    % fastaweigthed
                )
            for _ in range(weight):
                counter += 1
                out.write(">%s\n%s\n" % (counter, text))
951cb6b3979b
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
74 |
951cb6b3979b
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
75 |
951cb6b3979b
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
# Module-level tally; kept here because converter functions in this file may
# look it up as a global.
seqdic = defaultdict(int)

if __name__ == "__main__":
    # Command line: <input path> <output path> <conversion option>
    if len(sys.argv) < 4:
        sys.exit("usage: %s <input> <output> <option>" % sys.argv[0])

    # Maps each accepted option string to the converter that implements it.
    converters = {
        "fasta2tabular": readfasta_writetabular,
        "tabular2fasta": readtabular_writefasta,
        "tabular2fastaweight": readtabular_writefastaweighted,
        "fastaweight2fastaweight": readfastaeighted_writefastaweighted,
        "fastaweight2fasta": readfastaeighted_writefasta,
    }

    option = sys.argv[3]
    if option not in converters:
        # Fail loudly: silently doing nothing would leave the caller without
        # an output file and with a success exit status.
        sys.exit(
            "unknown option %r; expected one of: %s"
            % (option, ", ".join(sorted(converters)))
        )
    converters[option](sys.argv[1], sys.argv[2])