annotate fasta_tabular_converter.py @ 2:330dd8a8c31a draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 030207144f0811822dbdda9a10e036ff8e794d7c
author drosofff
date Fri, 25 Mar 2016 19:29:40 -0400
parents 2f7278120be9
children 36388b666bfc
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
1 #!/usr/bin/python
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
2 #
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
3 import sys
1
2f7278120be9 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 6a93f2809e2939f9d847c3238bfbff8ead367d9f
drosofff
parents: 0
diff changeset
4 import string
2f7278120be9 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 6a93f2809e2939f9d847c3238bfbff8ead367d9f
drosofff
parents: 0
diff changeset
5 import argparse
0
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
6 from collections import defaultdict
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
7
1
2f7278120be9 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 6a93f2809e2939f9d847c3238bfbff8ead367d9f
drosofff
parents: 0
diff changeset
def Parser(argv=None):
    """Parse the converter's command-line options.

    Args:
        argv: optional list of argument strings. When None (the default)
            argparse falls back to ``sys.argv[1:]``, which is what the
            script entry point relies on.

    Returns:
        An ``argparse.Namespace`` exposing ``input``, ``output`` and
        ``type``; each attribute is None when its option is omitted.
    """
    the_parser = argparse.ArgumentParser()
    the_parser.add_argument(
        '--input', action="store", type=str, help="input file")
    the_parser.add_argument(
        '--output', action="store", type=str, help="output converted file")
    the_parser.add_argument(
        '--type', action="store", type=str, help="type of conversion")
    return the_parser.parse_args(argv)
2f7278120be9 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 6a93f2809e2939f9d847c3238bfbff8ead367d9f
drosofff
parents: 0
diff changeset
18
2f7278120be9 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 6a93f2809e2939f9d847c3238bfbff8ead367d9f
drosofff
parents: 0
diff changeset
def readfasta_writetabular(fasta, tabular, mode="oneline"):
    """Collapse a fasta file into a ``sequence<TAB>count`` table.

    Sequence lines belonging to one record are concatenated, identical
    sequences are counted, and the table is written by decreasing count.

    Args:
        fasta: path of the fasta file to read.
        tabular: path of the tabular file to write.
        mode: accepted for backward compatibility; not used.

    Raises:
        ValueError: if non-blank content appears before the first
            ``>`` header line.
    """
    # Local counter: the function no longer depends on a module-level
    # ``seqdic`` that only exists when the file is run as a script.
    counts = defaultdict(int)
    chunks = None  # stays None until the first header has been seen
    with open(fasta, "r") as handle:
        for line in handle:
            if line.startswith(">"):
                if chunks is not None:
                    # A new header closes the previous record.
                    counts["".join(chunks)] += 1
                chunks = []
                continue
            if chunks is None:
                if line.strip():
                    raise ValueError(
                        "sequence data found before the first fasta header")
                continue
            # Strip only the line terminator so that a last line without
            # a trailing newline keeps its final base.
            chunks.append(line.rstrip("\r\n"))
    if chunks is not None:
        # Flush the last record; skipped when the input had no header.
        counts["".join(chunks)] += 1
    with open(tabular, "w") as out:
        for seq in sorted(counts, key=counts.get, reverse=True):
            out.write("%s\t%s\n" % (seq, counts[seq]))
0
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
37
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
38
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
def readtabular_writefasta(tabular, fasta):
    """Expand a ``sequence count`` table into a plain fasta file.

    Each row yields ``count`` fasta records carrying the same sequence;
    records are named with a running integer starting at 1.

    Args:
        tabular: path of the whitespace-separated table to read
            (sequence in the first column, count in the second).
        fasta: path of the fasta file to write.

    Raises:
        IndexError: if a non-blank row has fewer than two columns.
        ValueError: if the count column is not an integer.
    """
    counter = 0
    # ``with`` guarantees both handles are closed even if a row is malformed.
    with open(tabular, "r") as table, open(fasta, "w") as out:
        for line in table:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing empty line) carry no record.
                continue
            sequence = fields[0]
            copies = int(fields[1])
            for _ in range(copies):
                counter += 1
                out.write(">%s\n%s\n" % (counter, sequence))
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
50
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
def readtabular_writefastaweighted(tabular, fasta):
    """Convert a ``sequence count`` table into a weighted fasta file.

    Each non-blank row becomes one record whose header is
    ``>{rank}_{count}``, where rank is the 1-based row number among
    non-blank rows.

    Args:
        tabular: path of the whitespace-separated table to read
            (sequence in the first column, count in the second).
        fasta: path of the weighted fasta file to write.

    Raises:
        IndexError: if a non-blank row has fewer than two columns.
    """
    counter = 0
    with open(tabular, "r") as table, open(fasta, "w") as out:
        for line in table:
            # split() already discards the line terminator; slicing off
            # the last character would truncate the count on a final
            # line that lacks a trailing newline.
            fields = line.split()
            if not fields:
                continue
            counter += 1
            out.write(">%s_%s\n%s\n" % (counter, fields[1], fields[0]))
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
61
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
def readfastaeighted_writefastaweighted(fastaweigthed_input, fastaweigthed_reparsed):
    """Re-collapse a weighted fasta file, merging identical sequences.

    Headers are expected to end in ``_<weight>``; the weight is the
    integer after the last underscore. Weights of identical sequence
    lines are summed and the records are rewritten by decreasing total
    weight with headers ``>{rank}_{total}``. The sum of all header
    weights is reported on standard output.

    Args:
        fastaweigthed_input: path of the weighted fasta file to read.
        fastaweigthed_reparsed: path of the weighted fasta file to write.

    Raises:
        ValueError: if a sequence line precedes the first header, or a
            header does not end in an integer weight.
    """
    # Local accumulator instead of a module-level ``seqdic`` so the
    # function also works when the module is imported.
    totals = defaultdict(int)
    number_reads = 0
    weight = None  # weight of the record currently being read
    with open(fastaweigthed_input, "r") as handle:
        for raw in handle:
            # Remove only the terminator: slicing the last character off
            # would corrupt a final line without a trailing newline.
            line = raw.rstrip("\r\n")
            if not line:
                # Blank lines must not be counted as an empty sequence.
                continue
            if line.startswith(">"):
                weight = int(line[1:].split("_")[-1])
                number_reads += weight
                continue
            if weight is None:
                raise ValueError(
                    "sequence data found before the first fasta header")
            totals[line] += weight
    with open(fastaweigthed_reparsed, "w") as out:
        ranked = sorted(totals, key=totals.get, reverse=True)
        for rank, seq in enumerate(ranked, 1):
            out.write(">%s_%s\n%s\n" % (rank, totals[seq], seq))
    # Single-argument print() behaves identically on Python 2 and 3.
    print("%s reads collapsed" % number_reads)
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
79
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
def readfastaeighted_writefasta(fastaweigthed, fasta):
    """Expand a weighted fasta file into a plain fasta file.

    Headers are expected to end in ``_<weight>``; every sequence line is
    written ``weight`` times, each copy under a header made of a running
    integer starting at 1.

    Args:
        fastaweigthed: path of the weighted fasta file to read.
        fasta: path of the plain fasta file to write.

    Raises:
        ValueError: if a sequence line precedes the first header, or a
            header does not end in an integer weight.
    """
    counter = 0
    weight = None  # weight announced by the most recent header
    with open(fastaweigthed, "r") as handle, open(fasta, "w") as out:
        for raw in handle:
            # Strip the terminator only, so a last line without a
            # trailing newline keeps its final character.
            line = raw.rstrip("\r\n")
            if not line:
                # Blank lines would otherwise be emitted as empty reads.
                continue
            if line.startswith(">"):
                weight = int(line[1:].split("_")[-1])
                continue
            if weight is None:
                raise ValueError(
                    "sequence data found before the first fasta header")
            for _ in range(weight):
                counter += 1
                out.write(">%s\n%s\n" % (counter, line))
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
94
1
2f7278120be9 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 6a93f2809e2939f9d847c3238bfbff8ead367d9f
drosofff
parents: 0
diff changeset
def main(input, output, type):
    """Run the conversion selected by ``type`` from ``input`` to ``output``.

    The parameter names shadow builtins but are kept unchanged so that
    existing keyword callers continue to work.

    Args:
        input: path of the file to convert.
        output: path of the converted file to write.
        type: one of ``fasta2tabular``, ``tabular2fasta``,
            ``tabular2fastaweight``, ``fastaweight2fastaweight`` or
            ``fastaweight2fasta``.

    Raises:
        ValueError: if ``type`` is not one of the supported conversions
            (previously such a call silently produced no output).
    """
    if type == "fasta2tabular":
        readfasta_writetabular(input, output)
    elif type == "tabular2fasta":
        readtabular_writefasta(input, output)
    elif type == "tabular2fastaweight":
        readtabular_writefastaweighted(input, output)
    elif type == "fastaweight2fastaweight":
        readfastaeighted_writefastaweighted(input, output)
    elif type == "fastaweight2fasta":
        readfastaeighted_writefasta(input, output)
    else:
        raise ValueError("unknown conversion type: %r" % (type,))
0
951cb6b3979b planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
106
1
2f7278120be9 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit 6a93f2809e2939f9d847c3238bfbff8ead367d9f
drosofff
parents: 0
diff changeset
if __name__ == "__main__":
    # Script entry point: parse the command line and run the conversion.
    # ``seqdic`` is created at module level because converter functions
    # in this file may look the name up as a global counter.
    seqdic = defaultdict(int)
    args = Parser()
    main(args.input, args.output, args.type)