0
|
1 #!/usr/bin/env python
|
|
2 # -*- coding: utf-8 -*-
|
|
3 # vim: set fileencoding=utf-8
|
|
4 import os
|
|
5 import sys
|
|
6 import json
|
|
7 import math
|
|
8 import numpy
|
|
9 import argparse
|
|
10 import itertools
|
|
11 import logging
|
|
12 from gff3 import (
|
|
13 feature_lambda,
|
|
14 coding_genes,
|
|
15 genes,
|
|
16 get_gff3_id,
|
|
17 feature_test_location,
|
|
18 get_rbs_from,
|
|
19 nice_name,
|
|
20 )
|
|
21 from shinefind import NaiveSDCaller
|
|
22 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
|
|
23 from Bio import SeqIO
|
|
24 from Bio.SeqRecord import SeqRecord
|
|
25 from Bio.SeqFeature import SeqFeature, FeatureLocation
|
|
26 from jinja2 import Environment, FileSystemLoader
|
|
27 from cpt import MGAFinder
|
|
28
|
|
29 logging.basicConfig(level=logging.DEBUG)
|
|
30 log = logging.getLogger(name="pav")
|
|
31
|
|
32 # Path to script, required because of Galaxy.
|
|
33 SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))
|
|
34 # Path to the HTML template for the report
|
|
35
|
|
36 ENCOURAGEMENT = (
|
|
37 (100, "Perfection itself!"),
|
|
38 (90, "Amazing!"),
|
|
39 (80, "Not too bad, a few minor things to fix..."),
|
|
40 (70, "Some issues to address"),
|
|
41 (
|
|
42 50,
|
|
43 """Issues detected! </p><p class="text-muted">Have you heard of the
|
|
44 <a href="https://cpt.tamu.edu">CPT</a>\'s Automated Phage Annotation
|
|
45 Pipeline?""",
|
|
46 ),
|
|
47 (
|
|
48 0,
|
|
49 """<b>MAJOR</b> issues detected! Please consider using the
|
|
50 <a href="https://cpt.tamu.edu">CPT</a>\'s Automated Phage Annotation Pipeline""",
|
|
51 ),
|
|
52 )
|
|
53
|
|
54
|
|
55 def gen_qc_feature(start, end, message, strand=0, id_src=None, type_src="gene"):
|
|
56 kwargs = {"qualifiers": {"note": [message]}}
|
|
57 kwargs["type"] = type_src
|
|
58 kwargs["strand"] = strand
|
|
59 kwargs["phase"]=0
|
|
60 kwargs["score"]=0.0
|
|
61 kwargs["source"]="feature"
|
|
62 if id_src is not None:
|
|
63 kwargs["id"] = id_src.id
|
|
64 kwargs["qualifiers"]["ID"] = [id_src.id]
|
|
65 kwargs["qualifiers"]["Name"] = id_src.qualifiers.get("Name", [])
|
|
66
|
|
67
|
|
68 if end >= start:
|
|
69 return gffSeqFeature(FeatureLocation(start, end, strand=strand), **kwargs)
|
|
70 else:
|
|
71 return gffSeqFeature(FeatureLocation(end, start, strand=strand), **kwargs)
|
|
72
|
|
73
|
|
74 def __ensure_location_in_bounds(start=0, end=0, parent_length=0):
|
|
75 # This prevents frameshift errors
|
|
76 while start < 0:
|
|
77 start += 3
|
|
78 while end < 0:
|
|
79 end += 3
|
|
80 while start > parent_length:
|
|
81 start -= 3
|
|
82 while end > parent_length:
|
|
83 end -= 3
|
|
84 return (start, end)
|
|
85
|
|
86
|
|
87 def missing_rbs(record, lookahead_min=5, lookahead_max=15):
|
|
88 """
|
|
89 Identify gene features with missing RBSs
|
|
90
|
|
91 This "looks ahead" 5-15 bases ahead of each gene feature, and checks if
|
|
92 there's an RBS feature in those bounds.
|
|
93
|
|
94 The returned data is a set of genes with the RBS sequence in the __upstream
|
|
95 attribute, and a message in the __message attribute.
|
|
96 """
|
|
97 results = []
|
|
98 good = 0
|
|
99 bad = 0
|
|
100 qc_features = []
|
|
101 sd_finder = NaiveSDCaller()
|
|
102
|
|
103 any_rbss = False
|
|
104
|
|
105 for gene in coding_genes(record.features):
|
|
106 # Check if there are RBSs, TODO: make this recursive. Each feature in
|
|
107 # gene.sub_features can also have sub_features.
|
|
108 rbss = get_rbs_from(gene)
|
|
109 # No RBS found
|
|
110 if len(rbss) == 0:
|
|
111 # Get the sequence lookahead_min to lookahead_max upstream
|
|
112 if gene.strand > 0:
|
|
113 start = gene.location.start - lookahead_max
|
|
114 end = gene.location.start - lookahead_min
|
|
115 else:
|
|
116 start = gene.location.end + lookahead_min
|
|
117 end = gene.location.end + lookahead_max
|
|
118 # We have to ensure the feature is ON the genome, otherwise we may
|
|
119 # be trying to access a location outside of the length of the
|
|
120 # genome, which would be bad.
|
|
121 (start, end) = __ensure_location_in_bounds(
|
|
122 start=start, end=end, parent_length=len(record)
|
|
123 )
|
|
124 # Temporary feature to extract sequence
|
|
125 tmp = gffSeqFeature(
|
|
126 FeatureLocation(start, end, strand=gene.strand), type="domain"
|
|
127 )
|
|
128 # Get the sequence
|
|
129 seq = str(tmp.extract(record.seq))
|
|
130 # Set the default properties
|
|
131 gene.__upstream = seq.lower()
|
|
132 gene.__message = "No RBS annotated, None found"
|
|
133
|
|
134 # Try and do an automated shinefind call
|
|
135 sds = sd_finder.list_sds(seq)
|
|
136 if len(sds) > 0:
|
|
137 sd = sds[0]
|
|
138 gene.__upstream = sd_finder.highlight_sd(
|
|
139 seq.lower(), sd["start"], sd["end"]
|
|
140 )
|
|
141 gene.__message = "Unannotated but valid RBS"
|
|
142
|
|
143 qc_features.append(
|
|
144 gen_qc_feature(
|
|
145 start, end, "Missing RBS", strand=gene.strand, id_src=gene, type_src="gene"
|
|
146 )
|
|
147 )
|
|
148
|
|
149 bad += 1
|
|
150 results.append(gene)
|
|
151 results[-1].location = FeatureLocation(results[-1].location.start + 1, results[-1].location.end, results[-1].location.strand)
|
|
152 else:
|
|
153 if len(rbss) > 1:
|
|
154 log.warn("%s RBSs found for gene %s", rbss[0].id, get_gff3_id(gene))
|
|
155 any_rbss = True
|
|
156 # get first RBS/CDS
|
|
157 cds = list(genes(gene.sub_features, feature_type="CDS"))[0]
|
|
158 rbs = rbss[0]
|
|
159
|
|
160 # Get the distance between the two
|
|
161 if gene.strand > 0:
|
|
162 distance = cds.location.start - rbs.location.end
|
|
163 else:
|
|
164 distance = rbs.location.start - cds.location.end
|
|
165
|
|
166 # If the RBS is too far away, annotate that
|
|
167 if distance > lookahead_max:
|
|
168 gene.__message = "RBS too far away (%s nt)" % distance
|
|
169
|
|
170 qc_features.append(
|
|
171 gen_qc_feature(
|
|
172 rbs.location.start,
|
|
173 rbs.location.end,
|
|
174 gene.__message,
|
|
175 strand=gene.strand,
|
|
176 id_src=gene,
|
|
177 type_src="gene"
|
|
178 )
|
|
179 )
|
|
180
|
|
181 bad += 1
|
|
182 results.append(gene)
|
|
183 results[-1].location = FeatureLocation(results[-1].location.start + 1, results[-1].location.end, results[-1].location.strand)
|
|
184 else:
|
|
185 good += 1
|
|
186
|
|
187 return good, bad, results, qc_features, any_rbss
|
|
188
|
|
189
|
|
190 # modified from get_orfs_or_cdss.py
|
|
191 # -----------------------------------------------------------
|
|
192
|
|
193
|
|
194 def require_sd(data, record, chrom_start, sd_min, sd_max):
|
|
195 sd_finder = NaiveSDCaller()
|
|
196 for putative_gene in data:
|
|
197 if putative_gene[2] > 0: # strand
|
|
198 start = chrom_start + putative_gene[0] - sd_max
|
|
199 end = chrom_start + putative_gene[0] - sd_min
|
|
200 else:
|
|
201 start = chrom_start + putative_gene[1] + sd_min
|
|
202 end = chrom_start + putative_gene[1] + sd_max
|
|
203
|
|
204 (start, end) = __ensure_location_in_bounds(
|
|
205 start=start, end=end, parent_length=len(record)
|
|
206 )
|
|
207 tmp = gffSeqFeature(
|
|
208 FeatureLocation(start, end, strand=putative_gene[2]), type="domain"
|
|
209 )
|
|
210 # Get the sequence
|
|
211 seq = str(tmp.extract(record.seq))
|
|
212 sds = sd_finder.list_sds(seq)
|
|
213 if len(sds) > 0:
|
|
214 yield putative_gene + (start, end)
|
|
215
|
|
216
|
|
217 def excessive_gap(
|
|
218 record,
|
|
219 excess=50,
|
|
220 excess_divergent=200,
|
|
221 min_gene=30,
|
|
222 slop=30,
|
|
223 lookahead_min=5,
|
|
224 lookahead_max=15,
|
|
225 ):
|
|
226 """
|
|
227 Identify excessive gaps between gene features.
|
|
228
|
|
229 Default "excessive" gap size is 10, but that should likely be larger.
|
|
230 """
|
|
231 results = []
|
|
232 good = 0
|
|
233 bad = 0
|
|
234
|
|
235 contiguous_regions = []
|
|
236
|
|
237 sorted_genes = sorted(
|
|
238 genes(record.features), key=lambda feature: feature.location.start
|
|
239 )
|
|
240 if len(sorted_genes) == 0:
|
|
241 log.warn("NO GENES FOUND")
|
|
242 return good, bad, results, []
|
|
243
|
|
244 current_gene = None
|
|
245 for gene in sorted_genes:
|
|
246 # If the gene's start is contiguous to the "current_gene", then we
|
|
247 # extend current_gene
|
|
248 for cds in genes(gene.sub_features, feature_type="CDS"):
|
|
249 if current_gene is None:
|
|
250 current_gene = [int(cds.location.start), int(cds.location.end)]
|
|
251
|
|
252 if cds.location.start <= current_gene[1] + excess:
|
|
253 # Don't want to decrease size
|
|
254 if int(cds.location.end) >= current_gene[1]:
|
|
255 current_gene[1] = int(cds.location.end)
|
|
256 else:
|
|
257 # If it's discontiguous, we append the region and clear.
|
|
258 contiguous_regions.append(current_gene)
|
|
259 current_gene = [int(cds.location.start), int(cds.location.end)]
|
|
260
|
|
261 # This generally expected that annotations would NOT continue unto the end
|
|
262 # of the genome, however that's a bug, and we can make it here with an
|
|
263 # empty contiguous_regions list
|
|
264 contiguous_regions.append(current_gene)
|
|
265
|
|
266 for i in range(len(contiguous_regions) + 1):
|
|
267 if i == 0:
|
|
268 a = (1, 1)
|
|
269 b = contiguous_regions[i]
|
|
270 elif i >= len(contiguous_regions):
|
|
271 a = contiguous_regions[i - 1]
|
|
272 b = (len(record.seq), None)
|
|
273 else:
|
|
274 a = contiguous_regions[i - 1]
|
|
275 b = contiguous_regions[i]
|
|
276
|
|
277 gap_size = abs(b[0] - a[1])
|
|
278
|
|
279 if gap_size > min(excess, excess_divergent):
|
|
280 a_feat_l = itertools.islice(
|
|
281 feature_lambda(
|
|
282 sorted_genes,
|
|
283 feature_test_location,
|
|
284 {"loc": a[1]},
|
|
285 subfeatures=False,
|
|
286 ),
|
|
287 1,
|
|
288 )
|
|
289 b_feat_l = itertools.islice(
|
|
290 feature_lambda(
|
|
291 sorted_genes,
|
|
292 feature_test_location,
|
|
293 {"loc": b[0]},
|
|
294 subfeatures=False,
|
|
295 ),
|
|
296 1,
|
|
297 )
|
|
298
|
|
299 try:
|
|
300 a_feat = next(a_feat_l)
|
|
301 except StopIteration:
|
|
302 # Triggers on end of genome
|
|
303 a_feat = None
|
|
304 try:
|
|
305 b_feat = next(b_feat_l)
|
|
306 except StopIteration:
|
|
307 # Triggers on end of genome
|
|
308 b_feat = None
|
|
309
|
|
310 result_obj = [
|
|
311 a[1],
|
|
312 b[0],
|
|
313 None if not a_feat else a_feat.location.strand,
|
|
314 None if not b_feat else b_feat.location.strand,
|
|
315 ]
|
|
316
|
|
317 if a_feat is None or b_feat is None:
|
|
318 if gap_size > excess_divergent:
|
|
319 results.append(result_obj)
|
|
320 else:
|
|
321 if (
|
|
322 a_feat.location.strand == b_feat.location.strand
|
|
323 and gap_size > excess
|
|
324 ):
|
|
325 results.append(result_obj)
|
|
326 elif (
|
|
327 a_feat.location.strand != b_feat.location.strand
|
|
328 and gap_size > excess_divergent
|
|
329 ):
|
|
330 results.append(result_obj)
|
|
331
|
|
332 better_results = []
|
|
333 qc_features = []
|
|
334 of = MGAFinder(11, "CDS", "closed", min_gene)
|
|
335 # of = OrfFinder(11, 'CDS', 'closed', min_gene)
|
|
336
|
|
337 for result_obj in results:
|
|
338 start = result_obj[0]
|
|
339 end = result_obj[1]
|
|
340 f = gen_qc_feature(start, end, "Excessive gap, %s bases" % abs(end - start), type_src="gene")
|
|
341 qc_features.append(f)
|
|
342 putative_genes = of.putative_genes_in_sequence(
|
|
343 str(record[start - slop : end + slop].seq)
|
|
344 )
|
|
345 putative_genes = list(
|
|
346 require_sd(putative_genes, record, start, lookahead_min, lookahead_max)
|
|
347 )
|
|
348 for putative_gene in putative_genes:
|
|
349 # (0, 33, 1, 'ATTATTTTATCAAAACGCTTTACAATCTTTTAG', 'MILSKRFTIF', 123123, 124324)
|
|
350 possible_gene_start = start + putative_gene[0]
|
|
351 possible_gene_end = start + putative_gene[1]
|
|
352
|
|
353 if possible_gene_start <= possible_gene_end:
|
|
354 possible_cds = gffSeqFeature(
|
|
355 FeatureLocation(
|
|
356 possible_gene_start, possible_gene_end, strand=putative_gene[2]
|
|
357 ),
|
|
358 type="CDS",
|
|
359 )
|
|
360 else:
|
|
361 possible_cds = gffSeqFeature(
|
|
362 FeatureLocation(
|
|
363 possible_gene_end, possible_gene_start, strand=putative_gene[2],
|
|
364 ),
|
|
365 type="CDS",
|
|
366 )
|
|
367
|
|
368 # Now we adjust our boundaries for the RBS that's required
|
|
369 # There are only two cases, the rbs is upstream of it, or downstream
|
|
370 if putative_gene[5] < possible_gene_start:
|
|
371 possible_gene_start = putative_gene[5]
|
|
372 else:
|
|
373 possible_gene_end = putative_gene[6]
|
|
374
|
|
375 if putative_gene[5] <= putative_gene[6]:
|
|
376 possible_rbs = gffSeqFeature(
|
|
377 FeatureLocation(
|
|
378 putative_gene[5], putative_gene[6], strand=putative_gene[2]
|
|
379 ),
|
|
380 type="Shine_Dalgarno_sequence",
|
|
381 )
|
|
382 else:
|
|
383 possible_rbs = gffSeqFeature(
|
|
384 FeatureLocation(
|
|
385 putative_gene[6], putative_gene[5], strand=putative_gene[2],
|
|
386 ),
|
|
387 type="Shine_Dalgarno_sequence",
|
|
388 )
|
|
389
|
|
390 if possible_gene_start <= possible_gene_end:
|
|
391 possible_gene = gffSeqFeature(
|
|
392 FeatureLocation(
|
|
393 possible_gene_start, possible_gene_end, strand=putative_gene[2]
|
|
394 ),
|
|
395 type="gene",
|
|
396 qualifiers={"note": ["Possible gene"]},
|
|
397 )
|
|
398 else:
|
|
399 possible_gene = gffSeqFeature(
|
|
400 FeatureLocation(
|
|
401 possible_gene_end, possible_gene_start, strand=putative_gene[2],
|
|
402 ),
|
|
403 type="gene",
|
|
404 qualifiers={"note": ["Possible gene"]},
|
|
405 )
|
|
406 possible_gene.sub_features = [possible_rbs, possible_cds]
|
|
407 qc_features.append(possible_gene)
|
|
408
|
|
409 better_results.append(result_obj + [len(putative_genes)])
|
|
410
|
|
411 # Bad gaps are those with more than zero possible genes found
|
|
412 bad = len([x for x in better_results if x[2] > 0])
|
|
413 # Generally taking "good" here as every possible gap in the genome
|
|
414 # Thus, good is TOTAL - gaps
|
|
415 good = len(sorted_genes) + 1 - bad
|
|
416 # and bad is just gaps
|
|
417 return good, bad, better_results, qc_features
|
|
418
|
|
419
|
|
420 def phi(x):
|
|
421 """Standard phi function used in calculation of normal distribution"""
|
|
422 return math.exp(-1 * math.pi * x * x)
|
|
423
|
|
424
|
|
425 def norm(x, mean=0, sd=1):
|
|
426 """
|
|
427 Normal distribution. Given an x position, a mean, and a standard
|
|
428 deviation, calculate the "y" value. Useful for score scaling
|
|
429
|
|
430 Modified to multiply by SD. This means even at sd=5, norm(x, mean) where x = mean => 1, rather than 1/5.
|
|
431 """
|
|
432 return (1 / float(sd)) * phi(float(x - mean) / float(sd)) * sd
|
|
433
|
|
434
|
|
435 def coding_density(record, mean=92.5, sd=20):
|
|
436 """
|
|
437 Find coding density in the genome
|
|
438 """
|
|
439 feature_lengths = 0
|
|
440
|
|
441 for gene_a in coding_genes(record.features):
|
|
442 feature_lengths += sum(
|
|
443 [len(x) for x in genes(gene_a.sub_features, feature_type="CDS")]
|
|
444 )
|
|
445
|
|
446 avgFeatLen = float(feature_lengths) / float(len(record.seq))
|
|
447 return int(norm(100 * avgFeatLen, mean=mean, sd=sd) * 100), int(100 * avgFeatLen)
|
|
448
|
|
449
|
|
450 def exact_coding_density(record, mean=92.5, sd=20):
|
|
451 """
|
|
452 Find exact coding density in the genome
|
|
453 """
|
|
454 data = numpy.zeros(len(record.seq))
|
|
455
|
|
456 for gene_a in coding_genes(record.features):
|
|
457 for cds in genes(gene_a.sub_features, feature_type="CDS"):
|
|
458 for i in range(cds.location.start, cds.location.end + 1):
|
|
459 data[i - 1] = 1
|
|
460
|
|
461 return float(sum(data)) / len(data)
|
|
462
|
|
463
|
|
464 def excessive_overlap(record, excess=15, excess_divergent=30):
|
|
465 """
|
|
466 Find excessive overlaps in the genome, where excessive is defined as 15
|
|
467 bases for same strand, and 30 for divergent translation.
|
|
468
|
|
469 Does a product of all the top-level features in the genome, and calculates
|
|
470 gaps.
|
|
471 """
|
|
472 results = []
|
|
473 bad = 0
|
|
474 qc_features = []
|
|
475
|
|
476 for (gene_a, gene_b) in itertools.combinations(coding_genes(record.features), 2):
|
|
477 # Get the CDS from the subfeature list.
|
|
478 # TODO: not recursive.
|
|
479 cds_a = [x for x in genes(gene_a.sub_features, feature_type="CDS")]
|
|
480 cds_b = [x for x in genes(gene_b.sub_features, feature_type="CDS")]
|
|
481
|
|
482 if len(cds_a) == 0:
|
|
483 log.warn("Gene missing subfeatures; %s", get_gff3_id(gene_a))
|
|
484 continue
|
|
485
|
|
486 if len(cds_b) == 0:
|
|
487 log.warn("Gene missing subfeatures; %s", get_gff3_id(gene_b))
|
|
488 continue
|
|
489
|
|
490 cds_a = cds_a[0]
|
|
491 cds_b = cds_b[0]
|
|
492
|
|
493 # Set of locations that are included in the CDS of A and the
|
|
494 # CDS of B
|
|
495 cas = set(range(cds_a.location.start, cds_a.location.end))
|
|
496 cbs = set(range(cds_b.location.start, cds_b.location.end))
|
|
497
|
|
498 # Here we calculate the intersection between the two sets, and
|
|
499 # if it's larger than our excessive size, we know that they're
|
|
500 # overlapped
|
|
501 ix = cas.intersection(cbs)
|
|
502
|
|
503 if (cds_a.location.strand == cds_b.location.strand and len(ix) >= excess) or (
|
|
504 cds_a.location.strand != cds_b.location.strand
|
|
505 and len(ix) >= excess_divergent
|
|
506 ):
|
|
507 bad += float(len(ix)) / float(min(excess, excess_divergent))
|
|
508 qc_features.append(
|
|
509 gen_qc_feature(min(ix), max(ix), "Excessive Overlap", id_src=gene_a, type_src="gene")
|
|
510 )
|
|
511 results.append((gene_a, gene_b, min(ix), max(ix)))
|
|
512
|
|
513 # Good isn't accurate here. It's a triangle number and just ugly, but we
|
|
514 # don't care enough to fix it.
|
|
515 good = len(list(coding_genes(record.features)))
|
|
516 good = int(good - bad)
|
|
517 if good < 0:
|
|
518 good = 0
|
|
519 return good, int(bad), results, qc_features
|
|
520
|
|
521
|
|
522 def get_encouragement(score):
|
|
523 """Some text telling the user how they did
|
|
524 """
|
|
525 for encouragement in ENCOURAGEMENT:
|
|
526 if score > encouragement[0]:
|
|
527 return encouragement[1]
|
|
528 return ENCOURAGEMENT[-1][1]
|
|
529
|
|
530
|
|
531 def genome_overview(record):
|
|
532 """Genome overview
|
|
533 """
|
|
534 data = {
|
|
535 "genes": {
|
|
536 "count": 0,
|
|
537 "bases": len(record.seq),
|
|
538 "density": 0, # genes / kb
|
|
539 "avg_len": [],
|
|
540 "comp": {"A": 0, "C": 0, "G": 0, "T": 0},
|
|
541 },
|
|
542 "overall": {
|
|
543 "comp": {
|
|
544 "A": record.seq.count("A") + record.seq.count("a"),
|
|
545 "C": record.seq.count("C") + record.seq.count("c"),
|
|
546 "G": record.seq.count("G") + record.seq.count("g"),
|
|
547 "T": record.seq.count("T") + record.seq.count("t"),
|
|
548 },
|
|
549 "gc": 0,
|
|
550 },
|
|
551 }
|
|
552 gene_features = list(coding_genes(record.features))
|
|
553 data["genes"]["count"] = len(gene_features)
|
|
554
|
|
555 for feat in gene_features:
|
|
556 data["genes"]["comp"]["A"] += feat.extract(record).seq.count("A") + feat.extract(record).seq.count("a")
|
|
557 data["genes"]["comp"]["C"] += feat.extract(record).seq.count("C") + feat.extract(record).seq.count("c")
|
|
558 data["genes"]["comp"]["T"] += feat.extract(record).seq.count("T") + feat.extract(record).seq.count("t")
|
|
559 data["genes"]["comp"]["G"] += feat.extract(record).seq.count("G") + feat.extract(record).seq.count("g")
|
|
560 #data["genes"]["bases"] += len(feat)
|
|
561 data["genes"]["avg_len"].append(len(feat))
|
|
562
|
|
563 data["genes"]["avg_len"] = float(sum(data["genes"]["avg_len"])) / len(gene_features)
|
|
564 data["overall"]["gc"] = float(
|
|
565 data["overall"]["comp"]["G"] + data["overall"]["comp"]["C"]
|
|
566 ) / len(record.seq)
|
|
567 return data
|
|
568
|
|
569
|
|
570 def find_morons(record):
|
|
571 """Locate morons in the genome
|
|
572
|
|
573 Don't even know why...
|
|
574
|
|
575 TODO: remove? Idk.
|
|
576 """
|
|
577 results = []
|
|
578 good = 0
|
|
579 bad = 0
|
|
580
|
|
581 gene_features = list(coding_genes(record.features))
|
|
582 for i, gene in enumerate(gene_features):
|
|
583 two_left = gene_features[i - 2 : i]
|
|
584 two_right = gene_features[i + 1 : i + 1 + 2]
|
|
585 strands = [x.strand for x in two_left] + [x.strand for x in two_right]
|
|
586 anticon = [x for x in strands if x != gene.strand]
|
|
587
|
|
588 if len(anticon) == 4:
|
|
589 has_rbs = [x.type == "Shine_Dalgarno_sequence" for x in gene.sub_features]
|
|
590 if any(has_rbs):
|
|
591 rbs = [
|
|
592 x for x in gene.sub_features if x.type == "Shine_Dalgarno_sequence"
|
|
593 ][0]
|
|
594 rbs_msg = str(rbs.extract(record.seq))
|
|
595 else:
|
|
596 rbs_msg = "No RBS Available"
|
|
597 results.append((gene, two_left, two_right, rbs_msg))
|
|
598 bad += 1
|
|
599 else:
|
|
600 good += 1
|
|
601 return good, bad, results, []
|
|
602
|
|
603
|
|
604 def bad_gene_model(record):
|
|
605 """Find features without product
|
|
606 """
|
|
607 results = []
|
|
608 good = 0
|
|
609 bad = 0
|
|
610 qc_features = []
|
|
611
|
|
612 for gene in coding_genes(record.features):
|
|
613 exons = [
|
|
614 x for x in genes(gene.sub_features, feature_type="exon") if len(x) > 10
|
|
615 ]
|
|
616 CDSs = [x for x in genes(gene.sub_features, feature_type="CDS")]
|
|
617 if len(exons) >= 1 and len(CDSs) >= 1:
|
|
618 if len(exons) != len(CDSs):
|
|
619 results.append(
|
|
620 (
|
|
621 get_gff3_id(gene),
|
|
622 None,
|
|
623 None,
|
|
624 "Mismatched number of exons and CDSs in gff3 representation",
|
|
625 )
|
|
626 )
|
|
627 qc_features.append(
|
|
628 gen_qc_feature(
|
|
629 gene.location.start,
|
|
630 gene.location.end,
|
|
631 "Mismatched number of exons and CDSs in gff3 representation",
|
|
632 strand=gene.strand,
|
|
633 id_src=gene,
|
|
634 type_src="gene"
|
|
635 )
|
|
636 )
|
|
637 bad += 1
|
|
638 else:
|
|
639 for (exon, cds) in zip(
|
|
640 sorted(exons, key=lambda x: x.location.start),
|
|
641 sorted(CDSs, key=lambda x: x.location.start),
|
|
642 ):
|
|
643 if len(exon) != len(cds):
|
|
644 results.append(
|
|
645 (
|
|
646 get_gff3_id(gene),
|
|
647 exon,
|
|
648 cds,
|
|
649 "CDS does not extend to full length of gene",
|
|
650 )
|
|
651 )
|
|
652 qc_features.append(
|
|
653 gen_qc_feature(
|
|
654 exon.location.start,
|
|
655 exon.location.end,
|
|
656 "CDS does not extend to full length of gene",
|
|
657 strand=exon.strand,
|
|
658 id_src=gene,
|
|
659 type_src="CDS"
|
|
660 )
|
|
661 )
|
|
662 bad += 1
|
|
663 else:
|
|
664 good += 1
|
|
665 else:
|
|
666 log.warn("Could not handle %s, %s", exons, CDSs)
|
|
667 results.append(
|
|
668 (
|
|
669 get_gff3_id(gene),
|
|
670 None,
|
|
671 None,
|
|
672 "{0} exons, {1} CDSs".format(len(exons), len(CDSs)),
|
|
673 )
|
|
674 )
|
|
675
|
|
676 return good, len(results) + bad, results, qc_features
|
|
677
|
|
678
|
|
679 def weird_starts(record):
|
|
680 """Find features without product
|
|
681 """
|
|
682 good = 0
|
|
683 bad = 0
|
|
684 qc_features = []
|
|
685 results = []
|
|
686
|
|
687 overall = {}
|
|
688 for gene in coding_genes(record.features):
|
|
689 seq = [x for x in genes(gene.sub_features, feature_type="CDS")]
|
|
690 if len(seq) == 0:
|
|
691 log.warn("No CDS for gene %s", get_gff3_id(gene))
|
|
692 continue
|
|
693 else:
|
|
694 seq = seq[0]
|
|
695
|
|
696 seq_str = str(seq.extract(record.seq))
|
|
697 start_codon = seq_str[0:3]
|
|
698 if len(seq_str) < 3:
|
|
699 sys.stderr.write("Fatal Error: CDS of length less than 3 at " + str(seq.location) + '\n')
|
|
700 exit(2)
|
|
701 # if len(seq_str) % 3 != 0:
|
|
702 # if len(seq_str) < 3:
|
|
703 # stop_codon = seq_str[-(len(seq_str))]
|
|
704 # else:
|
|
705 # stop_codon = seq_str[-3]
|
|
706 #
|
|
707 # log.warn("CDS at %s length is not a multiple of three (Length = %d)", get_gff3_id(gene), len(seq_str))
|
|
708 # seq.__error = "Bad CDS Length"
|
|
709 # results.append(seq)
|
|
710 # qc_features.append(
|
|
711 # gen_qc_feature(
|
|
712 # s, e, "Bad Length", strand=seq.strand, id_src=gene
|
|
713 # )
|
|
714 # )
|
|
715 # bad += 1
|
|
716 # seq.__start = start_codon
|
|
717 # seq.__stop = stop_codon
|
|
718 # continue
|
|
719
|
|
720 stop_codon = seq_str[-3]
|
|
721 seq.__start = start_codon
|
|
722 seq.__stop = stop_codon
|
|
723 if start_codon not in overall:
|
|
724 overall[start_codon] = 1
|
|
725 else:
|
|
726 overall[start_codon] += 1
|
|
727
|
|
728 if start_codon not in ("ATG", "TTG", "GTG"):
|
|
729 log.warn("Weird start codon (%s) on %s", start_codon, get_gff3_id(gene))
|
|
730 seq.__error = "Unusual start codon %s" % start_codon
|
|
731
|
|
732 s = 0
|
|
733 e = 0
|
|
734 if seq.strand > 0:
|
|
735 s = seq.location.start
|
|
736 e = seq.location.start + 3
|
|
737 else:
|
|
738 s = seq.location.end
|
|
739 e = seq.location.end - 3
|
|
740
|
|
741 results.append(seq)
|
|
742 results[-1].location = FeatureLocation(results[-1].location.start + 1, results[-1].location.end, results[-1].location.strand)
|
|
743 qc_features.append(
|
|
744 gen_qc_feature(
|
|
745 s, e, "Weird start codon", strand=seq.strand, id_src=gene, type_src="gene"
|
|
746 )
|
|
747 )
|
|
748 bad += 1
|
|
749 else:
|
|
750 good += 1
|
|
751
|
|
752 return good, bad, results, qc_features, overall
|
|
753
|
|
754
|
|
755 def missing_genes(record):
|
|
756 """Find features without product
|
|
757 """
|
|
758 results = []
|
|
759 good = 0
|
|
760 bad = 0
|
|
761 qc_features = []
|
|
762
|
|
763 for gene in coding_genes(record.features):
|
|
764 if gene.qualifiers.get("cpt_source", [None])[0] == "CPT_GENE_MODEL_CORRECTION":
|
|
765 results.append(gene)
|
|
766 bad += 1
|
|
767 else:
|
|
768 good += 1
|
|
769
|
|
770 return good, bad, results, qc_features
|
|
771
|
|
772
|
|
773 def gene_model_correction_issues(record):
|
|
774 """Find features that have issues from the gene model correction step.
|
|
775 These have qualifiers beginning with CPT_GMS
|
|
776 """
|
|
777 results = []
|
|
778 good = 0
|
|
779 bad = 0
|
|
780 qc_features = []
|
|
781
|
|
782 # For each gene
|
|
783 for gene in coding_genes(record.features):
|
|
784 # Get the list of child CDSs
|
|
785 cdss = [x for x in genes(gene.sub_features, feature_type="CDS")]
|
|
786 # And our matching qualifiers
|
|
787 gene_data = [(k, v) for (k, v) in gene.qualifiers.items() if k == "cpt_gmc"]
|
|
788 # If there are problems with ONLY the parent, let's complain
|
|
789 local_results = []
|
|
790 local_qc_features = []
|
|
791 for x in gene_data:
|
|
792 if "Missing Locus Tag" in x[1]:
|
|
793 # Missing locus tag is an either or thing, if it hits here
|
|
794 # there shouldn't be anything else wrong with it.
|
|
795
|
|
796 # Obviously missing so we remove it
|
|
797 gene.qualifiers["locus_tag"] = [""]
|
|
798 # Translation from bp_genbank2gff3.py
|
|
799 cdss[0].qualifiers["locus_tag"] = cdss[0].qualifiers["Name"]
|
|
800 # Append our results
|
|
801 local_results.append((gene, cdss[0], "Gene is missing a locus_tag"))
|
|
802 local_qc_features.append(
|
|
803 gen_qc_feature(
|
|
804 gene.location.start,
|
|
805 gene.location.end,
|
|
806 "Gene is missing a locus_tag",
|
|
807 strand=gene.strand,
|
|
808 type_src="gene"
|
|
809 )
|
|
810 )
|
|
811
|
|
812 # We need to alert on any child issues as well.
|
|
813 for cds in cdss:
|
|
814 cds_data = [
|
|
815 (k, v[0]) for (k, v) in cds.qualifiers.items() if k == "cpt_gmc"
|
|
816 ]
|
|
817 if len(gene_data) == 0 and len(cds_data) == 0:
|
|
818 # Alles gut
|
|
819 pass
|
|
820 else:
|
|
821 for _, problem in cds_data:
|
|
822 if problem == "BOTH Missing Locus Tag":
|
|
823 gene.qualifiers["locus_tag"] = [""]
|
|
824 cds.qualifiers["locus_tag"] = [""]
|
|
825 local_results.append(
|
|
826 (gene, cds, "Both gene and CDS are missing locus tags")
|
|
827 )
|
|
828 local_qc_features.append(
|
|
829 gen_qc_feature(
|
|
830 cds.location.start,
|
|
831 cds.location.end,
|
|
832 "CDS is missing a locus_tag",
|
|
833 strand=cds.strand,
|
|
834 type_src="CDS"
|
|
835 )
|
|
836 )
|
|
837 local_qc_features.append(
|
|
838 gen_qc_feature(
|
|
839 gene.location.start,
|
|
840 gene.location.end,
|
|
841 "Gene is missing a locus_tag",
|
|
842 strand=gene.strand,
|
|
843 type_src="gene"
|
|
844 )
|
|
845 )
|
|
846 elif problem == "Different locus tag from associated gene.":
|
|
847 gene.qualifiers["locus_tag"] = gene.qualifiers["Name"]
|
|
848 cds.qualifiers["locus_tag"] = cds.qualifiers["cpt_gmc_locus"]
|
|
849 local_results.append(
|
|
850 (gene, cds, "Gene and CDS have differing locus tags")
|
|
851 )
|
|
852 local_qc_features.append(
|
|
853 gen_qc_feature(
|
|
854 gene.location.start,
|
|
855 gene.location.end,
|
|
856 "Gene and CDS have differing locus tags",
|
|
857 strand=gene.strand,
|
|
858 type_src="gene"
|
|
859 )
|
|
860 )
|
|
861 elif problem == "Missing Locus Tag":
|
|
862 # Copy this over
|
|
863 gene.qualifiers["locus_tag"] = gene.qualifiers["Name"]
|
|
864 # This one is missing
|
|
865 cds.qualifiers["locus_tag"] = [""]
|
|
866 local_results.append((gene, cds, "CDS is missing a locus_tag"))
|
|
867 local_qc_features.append(
|
|
868 gen_qc_feature(
|
|
869 cds.location.start,
|
|
870 cds.location.end,
|
|
871 "CDS is missing a locus_tag",
|
|
872 strand=cds.strand,
|
|
873 type_src="CDS"
|
|
874 )
|
|
875 )
|
|
876 else:
|
|
877 log.warn("Cannot handle %s", problem)
|
|
878
|
|
879 if len(local_results) > 0:
|
|
880 bad += 1
|
|
881 else:
|
|
882 good += 1
|
|
883
|
|
884 qc_features.extend(local_qc_features)
|
|
885 results.extend(local_results)
|
|
886 return good, bad, results, qc_features
|
|
887
|
|
888
|
|
889 def missing_tags(record):
|
|
890 """Find features without product
|
|
891 """
|
|
892 results = []
|
|
893 good = 0
|
|
894 bad = 0
|
|
895 qc_features = []
|
|
896
|
|
897 for gene in coding_genes(record.features):
|
|
898 cds = [x for x in genes(gene.sub_features, feature_type="CDS")]
|
|
899 if len(cds) == 0:
|
|
900 log.warn("Gene missing CDS subfeature %s", get_gff3_id(gene))
|
|
901 continue
|
|
902
|
|
903 cds = cds[0]
|
|
904
|
|
905 if "product" not in cds.qualifiers:
|
|
906 log.info("Missing product tag on %s", get_gff3_id(gene))
|
|
907 qc_features.append(
|
|
908 gen_qc_feature(
|
|
909 cds.location.start,
|
|
910 cds.location.end,
|
|
911 "Missing product tag",
|
|
912 strand=cds.strand,
|
|
913 type_src="CDS"
|
|
914 )
|
|
915 )
|
|
916 results.append(cds)
|
|
917 bad += 1
|
|
918 else:
|
|
919 good += 1
|
|
920
|
|
921 return good, bad, results, qc_features
|
|
922
|
|
923
|
|
924 def evaluate_and_report(
|
|
925 annotations,
|
|
926 genome,
|
|
927 gff3=None,
|
|
928 tbl=None,
|
|
929 sd_min=5,
|
|
930 sd_max=15,
|
|
931 min_gene_length=30,
|
|
932 excessive_gap_dist=50,
|
|
933 excessive_gap_divergent_dist=200,
|
|
934 excessive_overlap_dist=25,
|
|
935 excessive_overlap_divergent_dist=50,
|
|
936 reportTemplateName="phage_annotation_validator.html",
|
|
937 ):
|
|
938 """
|
|
939 Generate our HTML evaluation of the genome
|
|
940 """
|
|
941 # Get features from GFF file
|
|
942 seq_dict = SeqIO.to_dict(SeqIO.parse(genome, "fasta"))
|
|
943 # Get the first GFF3 record
|
|
944 # TODO: support multiple GFF3 files.
|
|
945 mostFeat = 0
|
|
946 for rec in list(gffParse(annotations, base_dict=seq_dict)):
|
|
947 if len(rec.features) > mostFeat:
|
|
948 mostFeat = len(rec.features)
|
|
949 record = rec
|
|
950
|
|
951 gff3_qc_record = SeqRecord(record.id, id=record.id)
|
|
952 gff3_qc_record.features = []
|
|
953 gff3_qc_features = []
|
|
954
|
|
955 log.info("Locating missing RBSs")
|
|
956 # mb_any = "did they annotate ANY rbss? if so, take off from score."
|
|
957 mb_good, mb_bad, mb_results, mb_annotations, mb_any = missing_rbs(
|
|
958 record, lookahead_min=sd_min, lookahead_max=sd_max
|
|
959 )
|
|
960 gff3_qc_features += mb_annotations
|
|
961
|
|
962 log.info("Locating excessive gaps")
|
|
963 eg_good, eg_bad, eg_results, eg_annotations = excessive_gap(
|
|
964 record,
|
|
965 excess=excessive_gap_dist,
|
|
966 excess_divergent=excessive_gap_divergent_dist,
|
|
967 min_gene=min_gene_length,
|
|
968 slop=excessive_overlap_dist,
|
|
969 lookahead_min=sd_min,
|
|
970 lookahead_max=sd_max,
|
|
971 )
|
|
972 gff3_qc_features += eg_annotations
|
|
973
|
|
974 log.info("Locating excessive overlaps")
|
|
975 eo_good, eo_bad, eo_results, eo_annotations = excessive_overlap(
|
|
976 record,
|
|
977 excess=excessive_overlap_dist,
|
|
978 excess_divergent=excessive_overlap_divergent_dist,
|
|
979 )
|
|
980 gff3_qc_features += eo_annotations
|
|
981
|
|
982 log.info("Locating morons")
|
|
983 mo_good, mo_bad, mo_results, mo_annotations = find_morons(record)
|
|
984 gff3_qc_features += mo_annotations
|
|
985
|
|
986 log.info("Locating missing tags")
|
|
987 mt_good, mt_bad, mt_results, mt_annotations = missing_tags(record)
|
|
988 gff3_qc_features += mt_annotations
|
|
989
|
|
990 log.info("Locating missing gene features")
|
|
991 mg_good, mg_bad, mg_results, mg_annotations = missing_genes(record)
|
|
992 gff3_qc_features += mg_annotations
|
|
993
|
|
994 log.info("Determining coding density")
|
|
995 cd, cd_real = coding_density(record)
|
|
996
|
|
997 log.info("Locating weird starts")
|
|
998 ws_good, ws_bad, ws_results, ws_annotations, ws_overall = weird_starts(record)
|
|
999 gff3_qc_features += ws_annotations
|
|
1000
|
|
1001 log.info("Locating bad gene models")
|
|
1002 gm_good, gm_bad, gm_results, gm_annotations = bad_gene_model(record)
|
|
1003 if gm_good + gm_bad == 0:
|
|
1004 gm_bad = 1
|
|
1005
|
|
1006 log.info("Locating more bad gene models")
|
|
1007 gmc_good, gmc_bad, gmc_results, gmc_annotations = gene_model_correction_issues(
|
|
1008 record
|
|
1009 )
|
|
1010 if gmc_good + gmc_bad == 0:
|
|
1011 gmc_bad = 1
|
|
1012
|
|
1013 good_scores = [eg_good, eo_good, mt_good, ws_good, gm_good, gmc_good]
|
|
1014 bad_scores = [eg_bad, eo_bad, mt_bad, ws_bad, gm_bad, gmc_bad]
|
|
1015
|
|
1016 # Only if they tried to annotate RBSs do we consider them.
|
|
1017 if mb_any:
|
|
1018 good_scores.append(mb_good)
|
|
1019 bad_scores.append(mb_bad)
|
|
1020 subscores = []
|
|
1021
|
|
1022 for (g, b) in zip(good_scores, bad_scores):
|
|
1023 if g + b == 0:
|
|
1024 s = 0
|
|
1025 else:
|
|
1026 s = int(100 * float(g) / (float(b) + float(g)))
|
|
1027 subscores.append(s)
|
|
1028 subscores.append(cd)
|
|
1029
|
|
1030 score = int(float(sum(subscores)) / float(len(subscores)))
|
|
1031
|
|
1032 # This is data that will go into our HTML template
|
|
1033 kwargs = {
|
|
1034 "upstream_min": sd_min,
|
|
1035 "upstream_max": sd_max,
|
|
1036 "record_name": record.id,
|
|
1037 "record_nice_name": nice_name(record),
|
|
1038 "params": {
|
|
1039 "sd_min": sd_min,
|
|
1040 "sd_max": sd_max,
|
|
1041 "min_gene_length": min_gene_length,
|
|
1042 "excessive_gap_dist": excessive_gap_dist,
|
|
1043 "excessive_gap_divergent_dist": excessive_gap_divergent_dist,
|
|
1044 "excessive_overlap_dist": excessive_overlap_dist,
|
|
1045 "excessive_overlap_divergent_dist": excessive_overlap_divergent_dist,
|
|
1046 },
|
|
1047 "score": score,
|
|
1048 "encouragement": get_encouragement(score),
|
|
1049 "genome_overview": genome_overview(record),
|
|
1050 "rbss_annotated": mb_any,
|
|
1051 "missing_rbs": mb_results,
|
|
1052 "missing_rbs_good": mb_good,
|
|
1053 "missing_rbs_bad": mb_bad,
|
|
1054 "missing_rbs_score": 0
|
|
1055 if mb_good + mb_bad == 0
|
|
1056 else (100 * mb_good / (mb_good + mb_bad)),
|
|
1057 "excessive_gap": eg_results,
|
|
1058 "excessive_gap_good": eg_good,
|
|
1059 "excessive_gap_bad": eg_bad,
|
|
1060 "excessive_gap_score": 0
|
|
1061 if eo_good + eo_bad == 0
|
|
1062 else (100 * eo_good / (eo_good + eo_bad)),
|
|
1063 "excessive_overlap": eo_results,
|
|
1064 "excessive_overlap_good": eo_good,
|
|
1065 "excessive_overlap_bad": eo_bad,
|
|
1066 "excessive_overlap_score": 0
|
|
1067 if eo_good + eo_bad == 0
|
|
1068 else (100 * eo_good / (eo_good + eo_bad)),
|
|
1069 "morons": mo_results,
|
|
1070 "morons_good": mo_good,
|
|
1071 "morons_bad": mo_bad,
|
|
1072 "morons_score": 0
|
|
1073 if mo_good + mo_bad == 0
|
|
1074 else (100 * mo_good / (mo_good + mo_bad)),
|
|
1075 "missing_tags": mt_results,
|
|
1076 "missing_tags_good": mt_good,
|
|
1077 "missing_tags_bad": mt_bad,
|
|
1078 "missing_tags_score": 0
|
|
1079 if mt_good + mt_bad == 0
|
|
1080 else (100 * mt_good / (mt_good + mt_bad)),
|
|
1081 "missing_genes": mg_results,
|
|
1082 "missing_genes_good": mg_good,
|
|
1083 "missing_genes_bad": mg_bad,
|
|
1084 "missing_genes_score": 0
|
|
1085 if mg_good + mg_bad == 0
|
|
1086 else (100 * mg_good / (mg_good + mg_bad)),
|
|
1087 "weird_starts": ws_results,
|
|
1088 "weird_starts_good": ws_good,
|
|
1089 "weird_starts_bad": ws_bad,
|
|
1090 "weird_starts_overall": ws_overall,
|
|
1091 "weird_starts_overall_sorted_keys": sorted(
|
|
1092 ws_overall, reverse=True, key=lambda x: ws_overall[x]
|
|
1093 ),
|
|
1094 "weird_starts_score": 0
|
|
1095 if ws_good + ws_bad == 0
|
|
1096 else (100 * ws_good / (ws_good + ws_bad)),
|
|
1097 "gene_model": gm_results,
|
|
1098 "gene_model_good": gm_good,
|
|
1099 "gene_model_bad": gm_bad,
|
|
1100 "gene_model_score": 0
|
|
1101 if gm_good + gm_bad == 0
|
|
1102 else (100 * gm_good / (gm_good + gm_bad)),
|
|
1103 "gene_model_correction": gmc_results,
|
|
1104 "gene_model_correction_good": gmc_good,
|
|
1105 "gene_model_correction_bad": gmc_bad,
|
|
1106 "gene_model_correction_score": 0
|
|
1107 if gmc_good + gmc_bad == 0
|
|
1108 else (100 * gmc_good / (gmc_good + gmc_bad)),
|
|
1109 "coding_density": cd,
|
|
1110 "coding_density_exact": exact_coding_density(record),
|
|
1111 "coding_density_real": cd_real,
|
|
1112 "coding_density_score": cd,
|
|
1113 }
|
|
1114
|
|
1115 with open(tbl, "w") as handle:
|
|
1116 kw_subset = {}
|
|
1117 for key in kwargs:
|
|
1118 if (
|
|
1119 key in ("score", "record_name")
|
|
1120 or "_good" in key
|
|
1121 or "_bad" in key
|
|
1122 or "_overall" in key
|
|
1123 ):
|
|
1124 kw_subset[key] = kwargs[key]
|
|
1125 json.dump(kw_subset, handle)
|
|
1126
|
|
1127 with open(gff3, "w") as handle:
|
|
1128 gff3_qc_record.features = gff3_qc_features
|
|
1129 gff3_qc_record.annotations = {}
|
|
1130 gffWrite([gff3_qc_record], handle)
|
|
1131
|
|
1132 def nice_strand(direction):
|
|
1133 # It is somehow possible for whole gffSeqFeature objects to end up in here, apparently at the gene level
|
|
1134 if "SeqFeature" in str(type(direction)):
|
|
1135 direction = direction.location.strand
|
|
1136 if direction > 0:
|
|
1137 return "→"#.decode("utf-8")
|
|
1138 else:
|
|
1139 return "←"#.decode("utf-8")
|
|
1140
|
|
1141 def nice_strand_tex(direction):
|
|
1142 if "SeqFeature" in str(type(direction)):
|
|
1143 direction = direction.location.strand
|
|
1144 if direction > 0:
|
|
1145 return "$\\rightarrow$"
|
|
1146 else:
|
|
1147 return "$\\leftarrow$"
|
|
1148
|
|
1149 def texify(data):
|
|
1150 return data.replace("_", "\\_").replace("$", "\\$")
|
|
1151
|
|
1152 def length(data):
|
|
1153 return len(data)
|
|
1154
|
|
1155 def my_encode(data):
|
|
1156 return str(data)#.encode("utf-8")
|
|
1157
|
|
1158 def my_decode(data):
|
|
1159 # For production
|
|
1160 return str(data)#.decode("utf-8")
|
|
1161 # For local testing. No, I do not understand.
|
|
1162 return str(data)#.encode("utf-8")).decode("utf-8")
|
|
1163
|
|
1164 env = Environment(
|
|
1165 loader=FileSystemLoader(SCRIPT_PATH), trim_blocks=True, lstrip_blocks=True
|
|
1166 )
|
|
1167 env.filters.update(
|
|
1168 {
|
|
1169 "nice_id": get_gff3_id,
|
|
1170 "nice_strand": nice_strand,
|
|
1171 "nice_strand_tex": nice_strand_tex,
|
|
1172 "texify": texify,
|
|
1173 "length": length,
|
|
1174 "encode": my_encode,
|
|
1175 "decode": my_decode,
|
|
1176 }
|
|
1177 )
|
|
1178 tpl = env.get_template(reportTemplateName)
|
|
1179 return tpl.render(**kwargs)#.encode("utf-8")
|
|
1180
|
|
1181
|
|
1182 if __name__ == "__main__":
|
|
1183 parser = argparse.ArgumentParser(
|
|
1184 description="rebase gff3 features against parent locations", epilog=""
|
|
1185 )
|
|
1186 parser.add_argument(
|
|
1187 "annotations", type=argparse.FileType("r"), help="Parent GFF3 annotations"
|
|
1188 )
|
|
1189 parser.add_argument("genome", type=argparse.FileType("r"), help="Genome Sequence")
|
|
1190 parser.add_argument(
|
|
1191 "--gff3", type=str, help="GFF3 Annotations", default="qc_annotations.gff3"
|
|
1192 )
|
|
1193 parser.add_argument(
|
|
1194 "--tbl",
|
|
1195 type=str,
|
|
1196 help="Table for noninteractive parsing",
|
|
1197 default="qc_results.json",
|
|
1198 )
|
|
1199
|
|
1200 parser.add_argument(
|
|
1201 "--sd_min",
|
|
1202 type=int,
|
|
1203 help="Minimum distance from gene start for an SD to be",
|
|
1204 default=5,
|
|
1205 )
|
|
1206 parser.add_argument(
|
|
1207 "--sd_max",
|
|
1208 type=int,
|
|
1209 help="Maximum distance from gene start for an SD to be",
|
|
1210 default=15,
|
|
1211 )
|
|
1212
|
|
1213 parser.add_argument(
|
|
1214 "--min_gene_length",
|
|
1215 type=int,
|
|
1216 help="Minimum length for a putative gene call (AAs)",
|
|
1217 default=30,
|
|
1218 )
|
|
1219
|
|
1220 parser.add_argument(
|
|
1221 "--excessive_overlap_dist",
|
|
1222 type=int,
|
|
1223 help="Excessive overlap for genes in same direction",
|
|
1224 default=25,
|
|
1225 )
|
|
1226 parser.add_argument(
|
|
1227 "--excessive_overlap_divergent_dist",
|
|
1228 type=int,
|
|
1229 help="Excessive overlap for genes in diff directions",
|
|
1230 default=50,
|
|
1231 )
|
|
1232
|
|
1233 parser.add_argument(
|
|
1234 "--excessive_gap_dist",
|
|
1235 type=int,
|
|
1236 help="Maximum distance between two genes",
|
|
1237 default=40,
|
|
1238 )
|
|
1239 parser.add_argument(
|
|
1240 "--excessive_gap_divergent_dist",
|
|
1241 type=int,
|
|
1242 help="Maximum distance between two divergent genes",
|
|
1243 default=200,
|
|
1244 )
|
|
1245
|
|
1246 parser.add_argument(
|
|
1247 "--reportTemplateName",
|
|
1248 help="Report template file name",
|
|
1249 default="phageqc_report_full.html",
|
|
1250 )
|
|
1251
|
|
1252 args = parser.parse_args()
|
|
1253
|
|
1254 sys.stdout.write(evaluate_and_report(**vars(args)))
|