annotate parse_aln.py @ 22:1eabd42e00ef draft

Uploaded
author petr-novak
date Fri, 03 Apr 2020 07:27:59 -0400
parents 0e820310d4dc
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
14
a6c55d1bdb6c Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python3
a6c55d1bdb6c Uploaded
petr-novak
parents:
diff changeset
2 '''
a6c55d1bdb6c Uploaded
petr-novak
parents:
diff changeset
3 parse .aln file - output from cap3 program. Output is fasta file and
a6c55d1bdb6c Uploaded
petr-novak
parents:
diff changeset
4 profile file
a6c55d1bdb6c Uploaded
petr-novak
parents:
diff changeset
5 '''
a6c55d1bdb6c Uploaded
petr-novak
parents:
diff changeset
6 import argparse
15
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
7 import re
14
a6c55d1bdb6c Uploaded
petr-novak
parents:
diff changeset
8
a6c55d1bdb6c Uploaded
petr-novak
parents:
diff changeset
9
a6c55d1bdb6c Uploaded
petr-novak
parents:
diff changeset
10 def parse_args():
a6c55d1bdb6c Uploaded
petr-novak
parents:
diff changeset
11 '''Argument parsin'''
a6c55d1bdb6c Uploaded
petr-novak
parents:
diff changeset
12 description = """
a6c55d1bdb6c Uploaded
petr-novak
parents:
diff changeset
13 parsing cap3 assembly aln output
a6c55d1bdb6c Uploaded
petr-novak
parents:
diff changeset
14 """
15
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
15
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
16 parser = argparse.ArgumentParser(
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
17 description=description,
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
18 formatter_class=argparse.RawTextHelpFormatter)
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
19 parser.add_argument('-a',
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
20 '--aln_file',
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
21 default=None,
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
22 required=True,
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
23 help="Aln file input",
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
24 type=str,
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
25 action='store')
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
26 parser.add_argument('-f',
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
27 '--fasta',
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
28 default=None,
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
29 required=True,
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
30 help="fasta output file name",
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
31 type=str,
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
32 action='store')
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
33 parser.add_argument('-p',
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
34 '--profile',
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
35 default=None,
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
36 required=True,
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
37 help="output file for coverage profile",
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
38 type=str,
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
39 action="store")
14
a6c55d1bdb6c Uploaded
petr-novak
parents:
diff changeset
40 return parser.parse_args()
a6c55d1bdb6c Uploaded
petr-novak
parents:
diff changeset
41
a6c55d1bdb6c Uploaded
petr-novak
parents:
diff changeset
42
15
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
43 def get_header(f):
16
0e820310d4dc Uploaded
petr-novak
parents: 15
diff changeset
44 aln_header = " . : . : . : . : . : . :"
15
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
45 contig_lead = "******************"
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
46 aln_start = -1
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
47 while True:
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
48 line = f.readline()
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
49 if not line:
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
50 return None, None
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
51 if line[0:18] == contig_lead:
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
52 line2 = f.readline()
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
53 else:
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
54 continue
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
55 if aln_header in line2:
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
56 aln_start = line2.index(aln_header)
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
57 break
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
58 contig_name = line.split()[1] + line.split()[2]
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
59 return contig_name, aln_start
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
60
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
61
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
62 def segment_start(f):
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
63 pos = f.tell()
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
64 line = f.readline()
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
65 # detect next contig or end of file
16
0e820310d4dc Uploaded
petr-novak
parents: 15
diff changeset
66 if "********" in line or line == "" or "Number of segment pairs = " in line:
15
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
67 segment = False
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
68 else:
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
69 segment = True
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
70 f.seek(pos)
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
71 return segment
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
72
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
73
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
74 def get_segment(f, seq_start):
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
75 if not segment_start(f):
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
76 return None, None
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
77 aln = []
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
78 while True:
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
79 line = f.readline()
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
80 if ". : . :" in line:
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
81 continue
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
82 if "__________" in line:
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
83 consensus = f.readline().rstrip('\n')[seq_start:]
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
84 f.readline() # empty line
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
85 break
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
86 else:
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
87 aln.append(line.rstrip('\n')[seq_start:])
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
88 return aln, consensus
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
89
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
90
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
91 def aln2coverage(aln):
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
92 coverage = [0] * len(aln[0])
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
93 for a in aln:
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
94 for i, c in enumerate(a):
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
95 if c not in " -":
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
96 coverage[i] += 1
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
97 return coverage
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
98
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
99
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
100 def read_contig(f, seq_start):
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
101 contig = ""
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
102 coverage = []
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
103 while True:
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
104 aln, consensus = get_segment(f, seq_start)
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
105 if aln:
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
106 contig += consensus
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
107 coverage += aln2coverage(aln)
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
108 else:
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
109 break
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
110 return contig, coverage
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
111
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
112 def remove_gaps(consensus, coverage):
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
113 if "-" not in consensus:
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
114 return consensus, coverage
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
115 new_coverage = [cov for cons, cov in zip(consensus, coverage)
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
116 if cons != "-"]
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
117 new_consensus = consensus.replace("-", "")
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
118 return new_consensus, new_coverage
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
119
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
120 def main():
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
121 args = parse_args()
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
122 with open(args.aln_file, 'r') as f1, open(args.fasta, 'w') as ffasta, open(args.profile, 'w') as fprofile:
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
123 while True:
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
124 contig_name, seq_start = get_header(f1)
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
125 if contig_name:
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
126 consensus, coverage = remove_gaps(*read_contig(f1, seq_start))
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
127 ffasta.write(">{}\n".format(contig_name))
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
128 ffasta.write("{}\n".format(consensus))
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
129 fprofile.write(">{}\n".format(contig_name))
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
130 fprofile.write("{}\n".format(" ".join([str(i) for i in coverage])))
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
131 else:
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
132 break
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
133
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
134
14
a6c55d1bdb6c Uploaded
petr-novak
parents:
diff changeset
135 if __name__ == "__main__":
a6c55d1bdb6c Uploaded
petr-novak
parents:
diff changeset
136
15
3151a72a6671 Uploaded
petr-novak
parents: 14
diff changeset
137 main()