Mercurial > repos > drosofff > msp_fasta_tabular_converter
comparison fasta_tabular_converter.py @ 3:36388b666bfc draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_fasta_tabular_converter commit b6de14061c479f0418cd89e26d6f5ac26e565a07
author | drosofff |
---|---|
date | Wed, 09 Nov 2016 11:24:13 -0500 |
parents | 330dd8a8c31a |
children |
comparison
equal
deleted
inserted
replaced
2:330dd8a8c31a | 3:36388b666bfc |
---|---|
1 #!/usr/bin/python | 1 #!/usr/bin/env python |
2 # | 2 # |
3 import argparse | |
4 import logging | |
3 import sys | 5 import sys |
4 import string | |
5 import argparse | |
6 from collections import defaultdict | 6 from collections import defaultdict |
7 | |
7 | 8 |
8 def Parser(): | 9 def Parser(): |
9 the_parser = argparse.ArgumentParser() | 10 the_parser = argparse.ArgumentParser() |
10 the_parser.add_argument( | 11 the_parser.add_argument( |
11 '--input', action="store", type=str, help="input file") | 12 '--input', action="store", type=str, help="input file") |
14 the_parser.add_argument( | 15 the_parser.add_argument( |
15 '--type', action="store", type=str, help="type of convertion") | 16 '--type', action="store", type=str, help="type of convertion") |
16 args = the_parser.parse_args() | 17 args = the_parser.parse_args() |
17 return args | 18 return args |
18 | 19 |
20 | |
19 def readfasta_writetabular(fasta, tabular, mode="oneline"): | 21 def readfasta_writetabular(fasta, tabular, mode="oneline"): |
20 F = open(fasta, "r") | 22 for line in fasta: |
21 for line in F: | |
22 if line[0] == ">": | 23 if line[0] == ">": |
23 try: | 24 try: |
24 seqdic["".join(stringlist)] += 1 # to dump the sequence of the previous item - try because of first missing stringlist variable | 25 seqdic["".join(stringlist)] += 1 # to dump the sequence of the previous item - try because of first missing stringlist variable |
25 except: pass | 26 except NameError: |
26 stringlist=[] | 27 pass |
28 stringlist = [] | |
27 else: | 29 else: |
28 stringlist.append(line[:-1]) | 30 try: |
31 stringlist.append(line[:-1]) | |
32 except UnboundLocalError: # if file went through filter and contains only empty lines | |
33 logging.info("first line is empty.") | |
29 try: | 34 try: |
30 seqdic["".join(stringlist)] += 1 # for the last sequence | 35 seqdic["".join(stringlist)] += 1 # for the last sequence |
31 except: pass # in case file to convert is empty | 36 except NameError: |
32 F.close() | 37 logging.info("input file has not fasta sequences.") |
33 F = open(tabular, "w") | |
34 for seq in sorted(seqdic, key=seqdic.get, reverse=True): | 38 for seq in sorted(seqdic, key=seqdic.get, reverse=True): |
35 print >> F, "%s\t%s" % (seq, seqdic[seq]) | 39 tabular.write("%s\t%s\n" % (seq, seqdic[seq])) |
36 F.close() | 40 |
37 | 41 |
38 | |
39 def readtabular_writefasta(tabular, fasta): | 42 def readtabular_writefasta(tabular, fasta): |
40 F = open(tabular, "r") | 43 counter = 0 |
41 Fw = open(fasta, "w") | 44 for line in tabular: |
42 counter = 0 | 45 fields = line.split() |
43 for line in F: | 46 for i in range(int(fields[1])): |
44 fields = line.split() | 47 counter += 1 |
45 for i in range(int(fields[1])): | 48 fasta.write(">%s\n%s\n" % (counter, fields[0])) |
46 counter += 1 | |
47 print >> Fw, ">%s\n%s" % (counter, fields[0]) | |
48 F.close() | |
49 Fw.close() | |
50 | 49 |
51 def readtabular_writefastaweighted (tabular, fasta): | |
52 F = open(tabular, "r") | |
53 Fw = open(fasta, "w") | |
54 counter = 0 | |
55 for line in F: | |
56 counter += 1 | |
57 fields = line[:-1].split() | |
58 print >> Fw, ">%s_%s\n%s" % (counter, fields[1], fields[0]) | |
59 F.close() | |
60 Fw.close() | |
61 | 50 |
62 def readfastaeighted_writefastaweighted(fastaweigthed_input, fastaweigthed_reparsed): | 51 def readtabular_writefastaweighted(tabular, fasta): |
63 F = open(fastaweigthed_input, "r") | 52 counter = 0 |
64 number_reads = 0 | 53 for line in tabular: |
65 for line in F: | 54 counter += 1 |
66 if line[0] == ">": | 55 fields = line[:-1].split() |
67 weigth = int(line[1:-1].split("_")[-1]) | 56 fasta.write(">%s_%s\n%s\n" % (counter, fields[1], fields[0])) |
68 number_reads += weigth | |
69 else: | |
70 seqdic[line[:-1]] += weigth | |
71 F.close() | |
72 F = open(fastaweigthed_reparsed, "w") | |
73 n=0 | |
74 for seq in sorted(seqdic, key=seqdic.get, reverse=True): | |
75 n += 1 | |
76 print >> F, ">%s_%s\n%s" % (n, seqdic[seq], seq) | |
77 F.close() | |
78 print "%s reads collapsed" % number_reads | |
79 | 57 |
80 def readfastaeighted_writefasta(fastaweigthed, fasta): | 58 |
81 F = open(fastaweigthed, "r") | 59 def readfastaweighted_writefastaweighted(fastaweigthed_input, fastaweigthed_reparsed): |
82 Fw = open(fasta, "w") | 60 number_reads = 0 |
83 counter = 0 | 61 for line in fastaweigthed_input: |
84 for line in F: | 62 if line[0] == ">": |
85 if line[0] == ">": | 63 weigth = int(line[1:-1].split("_")[-1]) |
86 weigth = int(line[1:-1].split("_")[-1]) | 64 number_reads += weigth |
87 else: | 65 else: |
88 seq = line[:-1] | 66 seqdic[line[:-1]] += weigth |
89 for i in range (weigth): | 67 n = 0 |
90 counter += 1 | 68 for seq in sorted(seqdic, key=seqdic.get, reverse=True): |
91 print >> Fw, ">%s\n%s" % (counter, seq) | 69 n += 1 |
92 F.close() | 70 fastaweigthed_reparsed.write(">%s_%s\n%s\n" % (n, seqdic[seq], seq)) |
93 Fw.close() | 71 log.info("%s reads collapsed" % number_reads) |
72 | |
73 | |
74 def readfastaweighted_writefasta(fastaweigthed, fasta): | |
75 counter = 0 | |
76 for line in fastaweigthed: | |
77 if line[0] == ">": | |
78 weigth = int(line[1:-1].split("_")[-1]) | |
79 else: | |
80 seq = line[:-1] | |
81 for i in range(weigth): | |
82 counter += 1 | |
83 fasta.write(">%s\n%s\n" % (counter, seq)) | |
84 | |
94 | 85 |
95 def main(input, output, type): | 86 def main(input, output, type): |
96 if type == "fasta2tabular": | 87 with open(input, "r") as input: |
97 readfasta_writetabular(input, output) | 88 with open(output, "w") as output: |
98 elif type == "tabular2fasta": | 89 if type == "fasta2tabular": |
99 readtabular_writefasta(input, output) | 90 readfasta_writetabular(input, output) |
100 elif type == "tabular2fastaweight": | 91 elif type == "tabular2fasta": |
101 readtabular_writefastaweighted (input, output) | 92 readtabular_writefasta(input, output) |
102 elif type == "fastaweight2fastaweight": | 93 elif type == "tabular2fastaweight": |
103 readfastaeighted_writefastaweighted(input, output) | 94 readtabular_writefastaweighted(input, output) |
104 elif type == "fastaweight2fasta": | 95 elif type == "fastaweight2fastaweight": |
105 readfastaeighted_writefasta(input, output) | 96 readfastaweighted_writefastaweighted(input, output) |
97 elif type == "fastaweight2fasta": | |
98 readfastaweighted_writefasta(input, output) | |
99 | |
106 | 100 |
107 if __name__ == "__main__": | 101 if __name__ == "__main__": |
108 seqdic = defaultdict(int) | 102 seqdic = defaultdict(int) |
109 args = Parser() | 103 args = Parser() |
110 main (args.input, args.output, args.type) | 104 log = logging.getLogger(__name__) |
105 logging.basicConfig(stream=sys.stdout, level=logging.INFO) | |
106 main(args.input, args.output, args.type) |