14
|
1 #!/usr/bin/env python3
|
|
2 '''
|
|
3 parse .aln file - output from cap3 program. Output is fasta file and
|
|
4 profile file
|
|
5 '''
|
|
6 import argparse
|
15
|
7 import re
|
14
|
8
|
|
9
|
|
10 def parse_args():
|
|
11 '''Argument parsin'''
|
|
12 description = """
|
|
13 parsing cap3 assembly aln output
|
|
14 """
|
15
|
15
|
|
16 parser = argparse.ArgumentParser(
|
|
17 description=description,
|
|
18 formatter_class=argparse.RawTextHelpFormatter)
|
|
19 parser.add_argument('-a',
|
|
20 '--aln_file',
|
|
21 default=None,
|
|
22 required=True,
|
|
23 help="Aln file input",
|
|
24 type=str,
|
|
25 action='store')
|
|
26 parser.add_argument('-f',
|
|
27 '--fasta',
|
|
28 default=None,
|
|
29 required=True,
|
|
30 help="fasta output file name",
|
|
31 type=str,
|
|
32 action='store')
|
|
33 parser.add_argument('-p',
|
|
34 '--profile',
|
|
35 default=None,
|
|
36 required=True,
|
|
37 help="output file for coverage profile",
|
|
38 type=str,
|
|
39 action="store")
|
14
|
40 return parser.parse_args()
|
|
41
|
|
42
|
15
|
43 def get_header(f):
|
16
|
44 aln_header = " . : . : . : . : . : . :"
|
15
|
45 contig_lead = "******************"
|
|
46 aln_start = -1
|
|
47 while True:
|
|
48 line = f.readline()
|
|
49 if not line:
|
|
50 return None, None
|
|
51 if line[0:18] == contig_lead:
|
|
52 line2 = f.readline()
|
|
53 else:
|
|
54 continue
|
|
55 if aln_header in line2:
|
|
56 aln_start = line2.index(aln_header)
|
|
57 break
|
|
58 contig_name = line.split()[1] + line.split()[2]
|
|
59 return contig_name, aln_start
|
|
60
|
|
61
|
|
62 def segment_start(f):
|
|
63 pos = f.tell()
|
|
64 line = f.readline()
|
|
65 # detect next contig or end of file
|
16
|
66 if "********" in line or line == "" or "Number of segment pairs = " in line:
|
15
|
67 segment = False
|
|
68 else:
|
|
69 segment = True
|
|
70 f.seek(pos)
|
|
71 return segment
|
|
72
|
|
73
|
|
74 def get_segment(f, seq_start):
|
|
75 if not segment_start(f):
|
|
76 return None, None
|
|
77 aln = []
|
|
78 while True:
|
|
79 line = f.readline()
|
|
80 if ". : . :" in line:
|
|
81 continue
|
|
82 if "__________" in line:
|
|
83 consensus = f.readline().rstrip('\n')[seq_start:]
|
|
84 f.readline() # empty line
|
|
85 break
|
|
86 else:
|
|
87 aln.append(line.rstrip('\n')[seq_start:])
|
|
88 return aln, consensus
|
|
89
|
|
90
|
|
91 def aln2coverage(aln):
|
|
92 coverage = [0] * len(aln[0])
|
|
93 for a in aln:
|
|
94 for i, c in enumerate(a):
|
|
95 if c not in " -":
|
|
96 coverage[i] += 1
|
|
97 return coverage
|
|
98
|
|
99
|
|
100 def read_contig(f, seq_start):
|
|
101 contig = ""
|
|
102 coverage = []
|
|
103 while True:
|
|
104 aln, consensus = get_segment(f, seq_start)
|
|
105 if aln:
|
|
106 contig += consensus
|
|
107 coverage += aln2coverage(aln)
|
|
108 else:
|
|
109 break
|
|
110 return contig, coverage
|
|
111
|
|
112 def remove_gaps(consensus, coverage):
|
|
113 if "-" not in consensus:
|
|
114 return consensus, coverage
|
|
115 new_coverage = [cov for cons, cov in zip(consensus, coverage)
|
|
116 if cons != "-"]
|
|
117 new_consensus = consensus.replace("-", "")
|
|
118 return new_consensus, new_coverage
|
|
119
|
|
120 def main():
|
|
121 args = parse_args()
|
|
122 with open(args.aln_file, 'r') as f1, open(args.fasta, 'w') as ffasta, open(args.profile, 'w') as fprofile:
|
|
123 while True:
|
|
124 contig_name, seq_start = get_header(f1)
|
|
125 if contig_name:
|
|
126 consensus, coverage = remove_gaps(*read_contig(f1, seq_start))
|
|
127 ffasta.write(">{}\n".format(contig_name))
|
|
128 ffasta.write("{}\n".format(consensus))
|
|
129 fprofile.write(">{}\n".format(contig_name))
|
|
130 fprofile.write("{}\n".format(" ".join([str(i) for i in coverage])))
|
|
131 else:
|
|
132 break
|
|
133
|
|
134
|
14
|
135 if __name__ == "__main__":
|
|
136
|
15
|
137 main()
|