0
|
1 #!/usr/bin/env python
|
|
2 '''
|
|
3 parse .aln file - output from cap3 program. Output is fasta file and
|
|
4 profile file
|
|
5 '''
|
|
6 import argparse
|
|
7 import re
|
|
8 import zipfile
|
|
9 import tempfile
|
|
10 import textwrap
|
1
|
11 import os
|
0
|
12
|
|
13 def parse_args():
|
|
14 '''Argument parsin'''
|
|
15 description = """
|
|
16 parsing cap3 assembly aln output
|
|
17 """
|
|
18
|
|
19 parser = argparse.ArgumentParser(
|
|
20 description=description, formatter_class=argparse.RawTextHelpFormatter)
|
|
21 parser.add_argument('-re',
|
|
22 '--re_file',
|
|
23 default=None,
|
|
24 required=True,
|
|
25 help="RepeatExlorer archive or directory",
|
|
26 type=str,
|
|
27 action='store')
|
|
28 parser.add_argument('-f',
|
|
29 '--fasta',
|
|
30 default=None,
|
|
31 required=True,
|
|
32 help="fasta output file name",
|
|
33 type=str,
|
|
34 action='store')
|
|
35 parser.add_argument('-m',
|
|
36 '--min_coverage',
|
|
37 default=5,
|
|
38 required=False,
|
|
39 help="minimum contig coverage",
|
|
40 type=int,
|
|
41 action="store")
|
|
42 parser.add_argument('-L',
|
|
43 '--min_contig_length',
|
|
44 default=50,
|
|
45 required=False,
|
|
46 help="minimum contig length",
|
|
47 type=int,
|
|
48 action="store")
|
|
49 return parser.parse_args()
|
|
50
|
|
51
|
|
52 def get_header(f):
|
|
53 aln_header = " . : . : . : . : . : . :"
|
|
54 contig_lead = "******************"
|
|
55 aln_start = -1
|
|
56 while True:
|
|
57 line = f.readline()
|
|
58 if not line:
|
|
59 return None, None
|
|
60 if line[0:18] == contig_lead:
|
|
61 line2 = f.readline()
|
|
62 else:
|
|
63 continue
|
|
64 if aln_header in line2:
|
|
65 aln_start = line2.index(aln_header)
|
|
66 break
|
|
67 contig_name = line.split()[1] + line.split()[2]
|
|
68 return contig_name, aln_start
|
|
69
|
|
70
|
|
71 def segment_start(f):
|
|
72 pos = f.tell()
|
|
73 line = f.readline()
|
|
74 # detect next contig or end of file
|
|
75 if "********" in line or line == "" or "Number of segment pairs = " in line:
|
|
76 segment = False
|
|
77 else:
|
|
78 segment = True
|
|
79 f.seek(pos)
|
|
80 return segment
|
|
81
|
|
82
|
|
83 def get_segment(f, seq_start):
|
|
84 if not segment_start(f):
|
|
85 return None, None
|
|
86 aln = []
|
|
87 while True:
|
|
88 line = f.readline()
|
|
89 if ". : . :" in line:
|
|
90 continue
|
|
91 if "__________" in line:
|
|
92 consensus = f.readline().rstrip('\n')[seq_start:]
|
|
93 f.readline() # empty line
|
|
94 break
|
|
95 else:
|
|
96 aln.append(line.rstrip('\n')[seq_start:])
|
|
97 return aln, consensus
|
|
98
|
|
99
|
|
100 def aln2coverage(aln):
|
|
101 coverage = [0] * len(aln[0])
|
|
102 for a in aln:
|
|
103 for i, c in enumerate(a):
|
|
104 if c not in " -":
|
|
105 coverage[i] += 1
|
|
106 return coverage
|
|
107
|
|
108
|
|
109 def read_contig(f, seq_start):
|
|
110 contig = ""
|
|
111 coverage = []
|
|
112 while True:
|
|
113 aln, consensus = get_segment(f, seq_start)
|
|
114 if aln:
|
|
115 contig += consensus
|
|
116 coverage += aln2coverage(aln)
|
|
117 else:
|
|
118 break
|
|
119 return contig, coverage
|
|
120
|
|
121
|
|
122 def remove_gaps(consensus, coverage):
|
|
123 if "-" not in consensus:
|
|
124 return consensus, coverage
|
|
125 new_coverage = [
|
|
126 cov for cons, cov in zip(consensus, coverage) if cons != "-"
|
|
127 ]
|
|
128 new_consensus = consensus.replace("-", "")
|
|
129 return new_consensus, new_coverage
|
|
130
|
|
131
|
|
132 def extract_contigs_from_re_archive(archive, aln_output):
|
|
133 with zipfile.ZipFile(archive, 'r') as zip_object, open(aln_output,
|
|
134 'w') as fout:
|
|
135 flist = zip_object.infolist()
|
|
136 for fn in flist:
|
|
137 if re.match('seqclust.+[.]aln$', fn.filename):
|
|
138 with zip_object.open(fn.filename) as aln:
|
|
139 for l in aln:
|
|
140 fout.write(l.decode('utf-8'))
|
|
141 return aln_output
|
|
142
|
|
143 def read_tarean_fasta(fobj):
|
|
144 ids = []
|
|
145 s = []
|
|
146 for i in fobj:
|
1
|
147 if isinstance(i, str):
|
|
148 ii = i
|
|
149 else:
|
|
150 ii = i.decode('utf-8')
|
0
|
151 if ii[0] == ">":
|
|
152 ids.append(ii)
|
|
153 s.append("")
|
|
154 else:
|
|
155 s[-1] = s[-1] + ii.strip()
|
|
156 return ids, s
|
|
157
|
1
|
158
|
|
159 def extract_contigs_from_re_directory(re_dir, aln_output):
|
|
160 with open(aln_output, 'w') as fout:
|
|
161 for subdir, dirs, files in os.walk(re_dir):
|
|
162 for fn in files:
|
|
163 fn_full = subdir + os.sep + fn
|
|
164 if re.match('^.+seqclust.+[.]aln$', fn_full):
|
|
165 print(fn_full)
|
|
166 with open(fn_full) as aln:
|
|
167 for l in aln:
|
|
168 fout.write(l)
|
|
169 return aln_output
|
|
170
|
|
171
|
|
172
|
0
|
173 def extract_tarean_contigs_from_re_archive(archive):
|
|
174 with zipfile.ZipFile(archive, 'r') as zip_object:
|
|
175 flist = zip_object.infolist()
|
|
176 seqs_all = []
|
|
177 ids_all = []
|
|
178 for fn in flist:
|
|
179 if re.match("seqclust.+dir_CL[0-9]+[/]tarean_contigs.fasta", fn.filename):
|
|
180 print(fn.filename)
|
|
181 with zip_object.open(fn.filename) as fobj:
|
|
182 ids, seqs = read_tarean_fasta(fobj)
|
|
183 # wrap sequences
|
|
184 seqs = ["\n".join(textwrap.wrap(s + s, 80)) for s in seqs]
|
|
185 seqs_all += seqs
|
|
186 ids_all += ids
|
|
187 return ids_all, seqs_all
|
|
188
|
|
189
|
1
|
190 def extract_tarean_contigs_from_re_directory(re_dir):
|
|
191 seqs_all = []
|
|
192 ids_all = []
|
|
193 for subdir, dirs, files in os.walk(re_dir):
|
|
194 for fn in files:
|
|
195 fn_full = subdir + os.sep + fn
|
|
196 if re.match("^.+seqclust.+dir_CL[0-9]+[/]tarean_contigs.fasta", fn_full):
|
|
197 print (fn_full)
|
|
198 with open(fn_full) as fobj:
|
|
199 ids, seqs = read_tarean_fasta(fobj)
|
|
200 # wrap sequences
|
|
201 seqs = ["\n".join(textwrap.wrap(s + s, 80)) for s in seqs]
|
|
202 seqs_all += seqs
|
|
203 ids_all += ids
|
|
204 return ids_all, seqs_all
|
0
|
205
|
|
206 def filter_contigs(consensus, coverage, min_coverage=5):
|
|
207 x = "".join([
|
|
208 s if cov >= min_coverage else " "
|
|
209 for s, cov in zip(consensus, coverage)
|
|
210 ]).strip()
|
|
211 consensus_N = "\n".join(textwrap.wrap(x.replace(" ", "N"),80))
|
|
212 return consensus_N
|
|
213
|
|
214
|
|
215 def main():
|
|
216 args = parse_args()
|
1
|
217 if os.path.isdir(args.re_file):
|
|
218 # extract from directory
|
|
219 ids, seqs = extract_tarean_contigs_from_re_directory(args.re_file)
|
|
220 aln_file = extract_contigs_from_re_directory(
|
|
221 args.re_file,
|
|
222 tempfile.NamedTemporaryFile().name)
|
|
223 else:
|
|
224 # extract aln from archive
|
|
225 ids, seqs = extract_tarean_contigs_from_re_archive(args.re_file)
|
|
226 aln_file = extract_contigs_from_re_archive(
|
|
227 args.re_file,
|
|
228 tempfile.NamedTemporaryFile().name)
|
|
229
|
0
|
230 with open(aln_file, 'r') as f1, open(args.fasta, 'w') as ffasta:
|
|
231 while True:
|
|
232 contig_name, seq_start = get_header(f1)
|
|
233 if contig_name:
|
|
234 consensus, coverage = remove_gaps(*read_contig(f1, seq_start))
|
|
235 clean_consensus = filter_contigs(consensus, coverage,
|
|
236 args.min_coverage)
|
|
237 if len(clean_consensus) >= args.min_contig_length:
|
|
238 ffasta.write(">{}\n".format(contig_name))
|
|
239 ffasta.write("{}\n".format(clean_consensus))
|
|
240 else:
|
|
241 break
|
|
242
|
|
243 # write tarean sequences:
|
|
244 for i, s in zip(ids, seqs):
|
|
245 ffasta.write(i)
|
|
246 ffasta.write(s + "\n")
|
|
247
|
|
248
|
|
249
|
|
250 if __name__ == "__main__":
|
|
251
|
|
252 main()
|