# HG changeset patch # User cpt # Date 1655470850 0 # Node ID c3140b08d703e19f56ca95e2bc6becea01da49b3 Uploaded diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/cpt-macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_phageqc_annotation/cpt-macros.xml Fri Jun 17 13:00:50 2022 +0000 @@ -0,0 +1,115 @@ + + + + + python + biopython + requests + + + + + + + + 10.1371/journal.pcbi.1008214 + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + + + 10.1371/journal.pcbi.1008214 + + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + + + + + 10.1371/journal.pcbi.1008214 + + @unpublished{galaxyTools, + author = {C. Ross}, + title = {CPT Galaxy Tools}, + year = {2020-}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + + + + + 10.1371/journal.pcbi.1008214 + + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + + + + + 10.1371/journal.pcbi.1008214 + + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + + + + + 10.1371/journal.pcbi.1008214 + + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + + + + + @unpublished{galaxyTools, + author = {C. 
# Patterns used by phage_name_parser. Each one exposes a "phage" named group,
# and the two "*_IN_MIDDLE" patterns additionally expose a "host" group; the
# parser reads the captures back by those names, so the group names are part
# of the contract. Raw strings keep "\s" from being treated as a string escape.
PHAGE_IN_MIDDLE = re.compile(r"^(?P<host>.*)\s*phage (?P<phage>.*)$")
BACTERIOPHAGE_IN_MIDDLE = re.compile(r"^(?P<host>.*)\s*bacteriophage (?P<phage>.*)$")
STARTS_WITH_PHAGE = re.compile(
    r"^(bacterio|vibrio|Bacterio|Vibrio|)?[Pp]hage (?P<phage>.*)$"
)
NEW_STYLE_NAMES = re.compile(r"(?P<phage>v[A-Z]_[A-Z][a-z]{2}_.*)")


def phage_name_parser(name):
    """Split a sequence description into ``(host, phage)``.

    A trailing ", complete genome" (with or without a final period) is removed
    first.  The patterns are then tried in a fixed order and the first match
    wins:

    1. ``<host> bacteriophage <phage>``
    2. ``<host> phage <phage>``
    3. ``[Pp]hage <phage>`` (optionally prefixed with bacterio/vibrio)
    4. new-style names such as ``vB_Eco_...``

    :param name: description string to parse.
    :return: tuple ``(host, phage)``.  ``host`` is ``None`` for patterns that
        do not capture a host, and both are ``None`` when nothing matches.
        The host capture is returned exactly as matched (the greedy ``.*``
        keeps any whitespace that precedes "phage"/"bacteriophage").
    """
    name = name.replace(", complete genome.", "")
    name = name.replace(", complete genome", "")

    patterns = (
        BACTERIOPHAGE_IN_MIDDLE,
        PHAGE_IN_MIDDLE,
        STARTS_WITH_PHAGE,
        NEW_STYLE_NAMES,
    )
    for pattern in patterns:
        m = pattern.match(name)
        if m:
            captured = m.groupdict()
            # Only the "*_IN_MIDDLE" patterns define a host group.
            return (captured.get("host"), captured["phage"])

    return (None, None)
def start_chop_and_trans(self, s, strict=True):
    """Yield ``(offset, trimmed nucleotides, protein)`` per in-frame start codon.

    :param s: nucleotide string whose length is a multiple of three.
    :param strict: when true, ``s`` must end in one of ``self.stops`` and the
        whole trimmed sequence is translated; when false the first codon is
        emitted as "M" and the remainder is translated up to the first stop.
    :raises AssertionError: if the length/stop-codon expectations are not met.
    """
    if strict:
        assert s[-3:] in self.stops, s
    assert len(s) % 3 == 0
    for hit in self.re_starts.finditer(s, overlapped=True):
        offset = hit.start()
        if offset % 3:
            # Start codon is not in this reading frame; ignore it.
            continue
        nuc = s[offset:]
        assert len(nuc) % 3 == 0, "%s is len %i" % (nuc, len(nuc))
        if strict:
            protein = translate(nuc, self.table)
        else:
            # Used when the stop codon is missing.
            protein = "M" + translate(nuc[3:], self.table, to_stop=True)
        yield offset, nuc, protein
def putative_genes_in_sequence(self, nuc_seq):
    """Return a sorted list of ``(start, end, strand, nucleotides, protein)``.

    Co-ordinates are Python style zero-based.  All three frames of the
    upper-cased sequence are scanned with ``break_up_frame``, then all three
    frames of its reverse complement.  Reverse-strand entries are recorded
    with ``start`` greater than ``end`` (``end = start - len(nucleotides)``).
    """
    nuc_seq = nuc_seq.upper()
    full_len = len(nuc_seq)
    found = []

    # Forward strand.
    for frame in (0, 1, 2):
        for offset, nuc, protein in self.break_up_frame(nuc_seq[frame:]):
            begin = frame + offset  # zero based
            found.append((begin, begin + len(nuc), +1, nuc, protein))

    # Reverse strand: offsets are measured from the far end of the sequence.
    rc = reverse_complement(nuc_seq)
    for frame in (0, 1, 2):
        for offset, nuc, protein in self.break_up_frame(rc[frame:]):
            begin = full_len - frame - offset  # zero based
            found.append((begin, begin - len(nuc), -1, nuc, protein))

    found.sort()
    return found
+ """ + # Refactored into generator by CPT + full_len = len(nuc_seq) + if self.strand != "reverse": + for frame in range(0, 3): + for offset, n, t in self.break_up_frame(nuc_seq[frame:]): + start = frame + offset # zero based + yield (start, start + len(n), +1, n, t) + if self.strand != "forward": + rc = reverse_complement(nuc_seq) + for frame in range(0, 3): + for offset, n, t in self.break_up_frame(rc[frame:]): + start = full_len - frame - offset # zero based + yield (start - len(n), start, -1, n, t) + + +class MGAFinder(object): + def __init__(self, table, ftype, ends, min_len): + self.table = table + self.table_obj = CodonTable.ambiguous_generic_by_id[table] + self.ends = ends + self.ftype = ftype + self.min_len = min_len + self.starts = sorted(self.table_obj.start_codons) + self.stops = sorted(self.table_obj.stop_codons) + self.re_starts = re.compile("|".join(self.starts)) + self.re_stops = re.compile("|".join(self.stops)) + + def locate(self, fasta_file, out_nuc, out_prot, out_bed, out_gff3): + seq_format = "fasta" + log.debug("Genetic code table %i" % self.table) + log.debug("Minimum length %i aa" % self.min_len) + + out_count = 0 + + out_gff3.write("##gff-version 3\n") + + for idx, record in enumerate(SeqIO.parse(fasta_file, seq_format)): + for i, (f_start, f_end, f_strand, n, t) in enumerate( + self.get_all_peptides(str(record.seq).upper()) + ): + out_count += 1 + + descr = "length %i aa, %i bp, from %s..%s[%s] of %s" % ( + len(t), + len(n), + f_start, + f_end, + f_strand, + record.description, + ) + fid = record.id + "|%s%i" % (self.ftype, i + 1) + + r = SeqRecord(Seq(n), id=fid, name="", description=descr) + t = SeqRecord(Seq(t), id=fid, name="", description=descr) + + SeqIO.write(r, out_nuc, "fasta") + SeqIO.write(t, out_prot, "fasta") + + nice_strand = "+" if f_strand == +1 else "-" + + out_bed.write( + "\t".join( + map(str, [record.id, f_start, f_end, fid, 0, nice_strand]) + ) + + "\n" + ) + + out_gff3.write( + "\t".join( + map( + str, + [ + 
def break_up_frame(self, s):
    """Yield ``(offset, nucleotides, protein)`` for candidates in one frame.

    ``s`` is cut at every in-frame stop codon; each stop-terminated segment is
    handed to ``start_chop_and_trans`` and only candidates whose protein has
    at least ``self.min_len`` residues are yielded.  Offsets are relative to
    the start of ``s``.
    """
    segment_start = 0
    for stop_hit in self.re_stops.finditer(s, overlapped=True):
        segment_end = stop_hit.start() + 3
        if segment_end % 3 == 0:
            segment = s[segment_start:segment_end]
            for offset, nuc, protein in self.start_chop_and_trans(segment):
                if nuc and len(protein) >= self.min_len:
                    yield segment_start + offset, nuc, protein
            # The next segment begins right after this in-frame stop.
            segment_start = segment_end
+ + Co-ordinates are Python style zero-based. + """ + # Refactored into generator by CPT + + full_len = len(nuc_seq) + for frame in range(0, 3): + for offset, n, t in self.break_up_frame(nuc_seq[frame:]): + start = frame + offset # zero based + yield (start, start + len(n), +1, n, t) + rc = reverse_complement(nuc_seq) + for frame in range(0, 3): + for offset, n, t in self.break_up_frame(rc[frame:]): + start = full_len - frame - offset # zero based + yield (start - len(n), start, -1, n, t) diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/gff3.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_phageqc_annotation/gff3.py Fri Jun 17 13:00:50 2022 +0000 @@ -0,0 +1,346 @@ +import copy +import logging + +log = logging.getLogger() +log.setLevel(logging.WARN) + + +def feature_lambda( + feature_list, + test, + test_kwargs, + subfeatures=True, + parent=None, + invert=False, + recurse=True, +): + """Recursively search through features, testing each with a test function, yielding matches. + + GFF3 is a hierachical data structure, so we need to be able to recursively + search through features. E.g. if you're looking for a feature with + ID='bob.42', you can't just do a simple list comprehension with a test + case. You don't know how deeply burried bob.42 will be in the feature tree. This is where feature_lambda steps in. + + :type feature_list: list + :param feature_list: an iterable of features + + :type test: function reference + :param test: a closure with the method signature (feature, **kwargs) where + the kwargs are those passed in the next argument. This + function should return True or False, True if the feature is + to be yielded as part of the main feature_lambda function, or + False if it is to be ignored. This function CAN mutate the + features passed to it (think "apply"). + + :type test_kwargs: dictionary + :param test_kwargs: kwargs to pass to your closure when it is called. 
def fetchParent(feature):
    """Return the top-most ancestor of ``feature`` by following ``_parent``.

    A feature is treated as the root when it has no ``_parent`` attribute,
    when ``_parent`` is ``None``, or when ``_parent`` is the feature itself.
    The last case matters because ``feature_lambda`` marks top-level features
    by pointing ``_parent`` back at the feature; following that link
    recursively would never terminate.

    :param feature: any object, optionally carrying a ``_parent`` attribute.
    :return: the root feature (possibly ``feature`` itself).
    """
    current = feature
    while True:
        parent = getattr(current, "_parent", None)
        if parent is None or parent is current:
            return current
        current = parent
def feature_test_quals(feature, **kwargs):
    """Test that a feature carries every requested qualifier.

    Each keyword names a qualifier key.  A value of ``None`` only requires the
    key to be present; otherwise the value is an iterable of strings and at
    least one of them must be a substring of at least one of the feature's
    values for that key.  All keywords must be satisfied.

    Example::

        a = Feature(qualifiers={'Note': ['Some notes', 'Aasdf']})

        # Check if a contains a Note
        feature_test_quals(a, **{'Note': None})  # Returns True
        feature_test_quals(a, **{'Product': None})  # Returns False

        # Check if a contains a note with specific value
        feature_test_quals(a, **{'Note': ['ome']})  # Returns True

        # Check if a contains a note with specific value
        feature_test_quals(a, **{'Note': ['other']})  # Returns False

    :param feature: object with a ``qualifiers`` mapping of key -> list of str.
    :return: ``True`` if every keyword is satisfied (or none were given).
    """
    for key, wanted in kwargs.items():
        if key not in feature.qualifiers:
            return False

        # Key is present and no value was specified: this key is satisfied,
        # but the remaining keys still have to be checked.
        if wanted is None:
            continue

        # Otherwise at least one wanted string must occur in one of the values.
        if not any(
            needle in value
            for value in feature.qualifiers[key]
            for needle in wanted
        ):
            return False

    return True
def ensure_location_in_bounds(start=0, end=0, parent_length=0):
    """Move ``start`` and ``end`` into ``[0, parent_length]`` in steps of three.

    Stepping by whole codons keeps the reading frame intact, which prevents
    frameshift errors.

    :return: tuple ``(start, end)`` after adjustment.
    """

    def _shift_by_codons(position):
        # First raise negative positions, then lower positions past the end.
        while position < 0:
            position += 3
        while position > parent_length:
            position -= 3
        return position

    return (_shift_by_codons(start), _shift_by_codons(end))
def is_uuid(name):
    """Return ``True`` when ``name`` is 36 characters long with four hyphens.

    This is a shape check only; the characters between the hyphens are not
    inspected.
    """
    has_uuid_length = len(name) == 36
    return has_uuid_length and name.count("-") == 4
def fsort(it):
    """Yield the features of ``it`` ordered by integer start position.

    :param it: iterable of objects exposing ``location.start``.
    """

    def _start_position(feature):
        return int(feature.location.start)

    yield from sorted(it, key=_start_position)

def gen_qc_feature(start, end, message, strand=0, id_src=None, type_src="gene"):
    """Build a ``gffSeqFeature`` that flags a QC finding on a region.

    :param start: one boundary of the region.
    :param end: the other boundary; may be smaller than ``start``.
    :param message: text stored as the feature's single ``note`` qualifier.
    :param strand: strand passed to both the location and the feature.
    :param id_src: optional feature whose ``id`` and ``Name`` qualifier are
        copied onto the new feature.
    :param type_src: feature type of the new feature.
    :return: the new ``gffSeqFeature``.
    """
    qualifiers = {"note": [message]}
    feature_args = {
        "qualifiers": qualifiers,
        "type": type_src,
        "strand": strand,
        "phase": 0,
        "score": 0.0,
        "source": "feature",
    }
    if id_src is not None:
        feature_args["id"] = id_src.id
        qualifiers["ID"] = [id_src.id]
        qualifiers["Name"] = id_src.qualifiers.get("Name", [])

    # Pass the smaller coordinate first, whichever way round they arrived.
    low, high = (start, end) if end >= start else (end, start)
    return gffSeqFeature(FeatureLocation(low, high, strand=strand), **feature_args)
+ rbss = get_rbs_from(gene) + # No RBS found + if len(rbss) == 0: + # Get the sequence lookahead_min to lookahead_max upstream + if gene.strand > 0: + start = gene.location.start - lookahead_max + end = gene.location.start - lookahead_min + else: + start = gene.location.end + lookahead_min + end = gene.location.end + lookahead_max + # We have to ensure the feature is ON the genome, otherwise we may + # be trying to access a location outside of the length of the + # genome, which would be bad. + (start, end) = __ensure_location_in_bounds( + start=start, end=end, parent_length=len(record) + ) + # Temporary feature to extract sequence + tmp = gffSeqFeature( + FeatureLocation(start, end, strand=gene.strand), type="domain" + ) + # Get the sequence + seq = str(tmp.extract(record.seq)) + # Set the default properties + gene.__upstream = seq.lower() + gene.__message = "No RBS annotated, None found" + + # Try and do an automated shinefind call + sds = sd_finder.list_sds(seq) + if len(sds) > 0: + sd = sds[0] + gene.__upstream = sd_finder.highlight_sd( + seq.lower(), sd["start"], sd["end"] + ) + gene.__message = "Unannotated but valid RBS" + + qc_features.append( + gen_qc_feature( + start, end, "Missing RBS", strand=gene.strand, id_src=gene, type_src="gene" + ) + ) + + bad += 1 + results.append(gene) + results[-1].location = FeatureLocation(results[-1].location.start + 1, results[-1].location.end, results[-1].location.strand) + else: + if len(rbss) > 1: + log.warn("%s RBSs found for gene %s", rbss[0].id, get_gff3_id(gene)) + any_rbss = True + # get first RBS/CDS + cds = list(genes(gene.sub_features, feature_type="CDS"))[0] + rbs = rbss[0] + + # Get the distance between the two + if gene.strand > 0: + distance = cds.location.start - rbs.location.end + else: + distance = rbs.location.start - cds.location.end + + # If the RBS is too far away, annotate that + if distance > lookahead_max: + gene.__message = "RBS too far away (%s nt)" % distance + + qc_features.append( + 
def _gap_span_feature(first, second, strand, **feature_kwargs):
    """Build a gffSeqFeature over two coordinates, smaller coordinate first."""
    low, high = (first, second) if first <= second else (second, first)
    return gffSeqFeature(FeatureLocation(low, high, strand=strand), **feature_kwargs)


def excessive_gap(
    record,
    excess=50,
    excess_divergent=200,
    min_gene=30,
    slop=30,
    lookahead_min=5,
    lookahead_max=15,
):
    """
    Identify excessive gaps between gene features.

    CDS features are merged into contiguous regions (a CDS starting within
    ``excess`` bases of the current region extends it).  The space before the
    first region, between regions, and after the last region is then checked:
    a gap is reported when it is larger than ``excess`` (flanking genes on the
    same strand) or ``excess_divergent`` (different strands, or no flanking
    gene found).  Each reported gap is scanned for putative genes that have a
    Shine-Dalgarno sequence upstream.

    :param record: sequence record with ``features`` and ``seq``.
    :param excess: gap threshold for same-strand neighbours (default 50).
    :param excess_divergent: gap threshold for divergent neighbours or genome
        ends (default 200).
    :param min_gene: minimum protein length passed to ``MGAFinder``.
    :param slop: extra bases included on each side when scanning a gap.
    :param lookahead_min: minimum RBS distance passed to ``require_sd``.
    :param lookahead_max: maximum RBS distance passed to ``require_sd``.
    :return: ``(good, bad, results, qc_features)`` where each result is
        ``[gap_start, gap_end, upstream_strand, downstream_strand,
        putative_gene_count]``.
    """
    results = []
    good = 0
    bad = 0

    contiguous_regions = []

    sorted_genes = sorted(
        genes(record.features), key=lambda feature: feature.location.start
    )
    if len(sorted_genes) == 0:
        log.warning("NO GENES FOUND")
        return good, bad, results, []

    current_gene = None
    for gene in sorted_genes:
        # If the gene's start is contiguous to the "current_gene", then we
        # extend current_gene
        for cds in genes(gene.sub_features, feature_type="CDS"):
            if current_gene is None:
                current_gene = [int(cds.location.start), int(cds.location.end)]

            if cds.location.start <= current_gene[1] + excess:
                # Don't want to decrease size
                if int(cds.location.end) >= current_gene[1]:
                    current_gene[1] = int(cds.location.end)
            else:
                # If it's discontiguous, we append the region and clear.
                contiguous_regions.append(current_gene)
                current_gene = [int(cds.location.start), int(cds.location.end)]

    if current_gene is None:
        # Genes were found but none of them has a CDS sub-feature, so there is
        # no region to measure gaps against.
        log.warning("NO CDS FEATURES FOUND")
        return good, bad, results, []

    # The last region is still open when the loop ends (annotations may run
    # right up to the end of the genome), so close it here.
    contiguous_regions.append(current_gene)

    for i in range(len(contiguous_regions) + 1):
        if i == 0:
            # Gap between the start of the genome and the first region.
            a = (1, 1)
            b = contiguous_regions[i]
        elif i >= len(contiguous_regions):
            # Gap between the last region and the end of the genome.
            a = contiguous_regions[i - 1]
            b = (len(record.seq), None)
        else:
            a = contiguous_regions[i - 1]
            b = contiguous_regions[i]

        gap_size = abs(b[0] - a[1])

        if gap_size > min(excess, excess_divergent):
            # First gene overlapping each edge of the gap; None when no gene
            # overlaps it (e.g. at the ends of the genome).
            a_feat = next(
                feature_lambda(
                    sorted_genes,
                    feature_test_location,
                    {"loc": a[1]},
                    subfeatures=False,
                ),
                None,
            )
            b_feat = next(
                feature_lambda(
                    sorted_genes,
                    feature_test_location,
                    {"loc": b[0]},
                    subfeatures=False,
                ),
                None,
            )

            result_obj = [
                a[1],
                b[0],
                None if not a_feat else a_feat.location.strand,
                None if not b_feat else b_feat.location.strand,
            ]

            if a_feat is None or b_feat is None:
                if gap_size > excess_divergent:
                    results.append(result_obj)
            else:
                same_strand = a_feat.location.strand == b_feat.location.strand
                if same_strand and gap_size > excess:
                    results.append(result_obj)
                elif not same_strand and gap_size > excess_divergent:
                    results.append(result_obj)

    better_results = []
    qc_features = []
    of = MGAFinder(11, "CDS", "closed", min_gene)
    # of = OrfFinder(11, 'CDS', 'closed', min_gene)

    for result_obj in results:
        start = result_obj[0]
        end = result_obj[1]
        f = gen_qc_feature(
            start, end, "Excessive gap, %s bases" % abs(end - start), type_src="gene"
        )
        qc_features.append(f)
        # NOTE(review): the scanned slice begins at ``start - slop`` while the
        # coordinates below are offset by ``start`` only - confirm whether the
        # slop should also be subtracted.
        putative_genes = of.putative_genes_in_sequence(
            str(record[start - slop : end + slop].seq)
        )
        putative_genes = list(
            require_sd(putative_genes, record, start, lookahead_min, lookahead_max)
        )
        for putative_gene in putative_genes:
            # (0, 33, 1, 'ATTATTTTATCAAAACGCTTTACAATCTTTTAG', 'MILSKRFTIF', 123123, 124324)
            strand = putative_gene[2]
            rbs_start = putative_gene[5]
            rbs_end = putative_gene[6]
            possible_gene_start = start + putative_gene[0]
            possible_gene_end = start + putative_gene[1]

            possible_cds = _gap_span_feature(
                possible_gene_start, possible_gene_end, strand, type="CDS"
            )

            # Now we adjust our boundaries for the RBS that's required
            # There are only two cases, the rbs is upstream of it, or downstream
            if rbs_start < possible_gene_start:
                possible_gene_start = rbs_start
            else:
                possible_gene_end = rbs_end

            possible_rbs = _gap_span_feature(
                rbs_start, rbs_end, strand, type="Shine_Dalgarno_sequence"
            )
            possible_gene = _gap_span_feature(
                possible_gene_start,
                possible_gene_end,
                strand,
                type="gene",
                qualifiers={"note": ["Possible gene"]},
            )
            possible_gene.sub_features = [possible_rbs, possible_cds]
            qc_features.append(possible_gene)

        better_results.append(result_obj + [len(putative_genes)])

    # Bad gaps are those with more than zero possible genes found. The count
    # is the LAST element of each result; index 2 is the upstream strand.
    bad = len([x for x in better_results if x[-1] > 0])
    # Generally taking "good" here as every possible gap in the genome
    # Thus, good is TOTAL - gaps
    good = len(sorted_genes) + 1 - bad
    # and bad is just gaps
    return good, bad, better_results, qc_features
+ """ + return (1 / float(sd)) * phi(float(x - mean) / float(sd)) * sd + + +def coding_density(record, mean=92.5, sd=20): + """ + Find coding density in the genome + """ + feature_lengths = 0 + + for gene_a in coding_genes(record.features): + feature_lengths += sum( + [len(x) for x in genes(gene_a.sub_features, feature_type="CDS")] + ) + + avgFeatLen = float(feature_lengths) / float(len(record.seq)) + return int(norm(100 * avgFeatLen, mean=mean, sd=sd) * 100), int(100 * avgFeatLen) + + +def exact_coding_density(record, mean=92.5, sd=20): + """ + Find exact coding density in the genome + """ + data = numpy.zeros(len(record.seq)) + + for gene_a in coding_genes(record.features): + for cds in genes(gene_a.sub_features, feature_type="CDS"): + for i in range(cds.location.start, cds.location.end + 1): + data[i - 1] = 1 + + return float(sum(data)) / len(data) + + +def excessive_overlap(record, excess=15, excess_divergent=30): + """ + Find excessive overlaps in the genome, where excessive is defined as 15 + bases for same strand, and 30 for divergent translation. + + Does a product of all the top-level features in the genome, and calculates + gaps. + """ + results = [] + bad = 0 + qc_features = [] + + for (gene_a, gene_b) in itertools.combinations(coding_genes(record.features), 2): + # Get the CDS from the subfeature list. + # TODO: not recursive. 
+ cds_a = [x for x in genes(gene_a.sub_features, feature_type="CDS")] + cds_b = [x for x in genes(gene_b.sub_features, feature_type="CDS")] + + if len(cds_a) == 0: + log.warn("Gene missing subfeatures; %s", get_gff3_id(gene_a)) + continue + + if len(cds_b) == 0: + log.warn("Gene missing subfeatures; %s", get_gff3_id(gene_b)) + continue + + cds_a = cds_a[0] + cds_b = cds_b[0] + + # Set of locations that are included in the CDS of A and the + # CDS of B + cas = set(range(cds_a.location.start, cds_a.location.end)) + cbs = set(range(cds_b.location.start, cds_b.location.end)) + + # Here we calculate the intersection between the two sets, and + # if it's larger than our excessive size, we know that they're + # overlapped + ix = cas.intersection(cbs) + + if (cds_a.location.strand == cds_b.location.strand and len(ix) >= excess) or ( + cds_a.location.strand != cds_b.location.strand + and len(ix) >= excess_divergent + ): + bad += float(len(ix)) / float(min(excess, excess_divergent)) + qc_features.append( + gen_qc_feature(min(ix), max(ix), "Excessive Overlap", id_src=gene_a, type_src="gene") + ) + results.append((gene_a, gene_b, min(ix), max(ix))) + + # Good isn't accurate here. It's a triangle number and just ugly, but we + # don't care enough to fix it. 
+ good = len(list(coding_genes(record.features))) + good = int(good - bad) + if good < 0: + good = 0 + return good, int(bad), results, qc_features + + +def get_encouragement(score): + """Some text telling the user how they did + """ + for encouragement in ENCOURAGEMENT: + if score > encouragement[0]: + return encouragement[1] + return ENCOURAGEMENT[-1][1] + + +def genome_overview(record): + """Genome overview + """ + data = { + "genes": { + "count": 0, + "bases": len(record.seq), + "density": 0, # genes / kb + "avg_len": [], + "comp": {"A": 0, "C": 0, "G": 0, "T": 0}, + }, + "overall": { + "comp": { + "A": record.seq.count("A") + record.seq.count("a"), + "C": record.seq.count("C") + record.seq.count("c"), + "G": record.seq.count("G") + record.seq.count("g"), + "T": record.seq.count("T") + record.seq.count("t"), + }, + "gc": 0, + }, + } + gene_features = list(coding_genes(record.features)) + data["genes"]["count"] = len(gene_features) + + for feat in gene_features: + data["genes"]["comp"]["A"] += feat.extract(record).seq.count("A") + feat.extract(record).seq.count("a") + data["genes"]["comp"]["C"] += feat.extract(record).seq.count("C") + feat.extract(record).seq.count("c") + data["genes"]["comp"]["T"] += feat.extract(record).seq.count("T") + feat.extract(record).seq.count("t") + data["genes"]["comp"]["G"] += feat.extract(record).seq.count("G") + feat.extract(record).seq.count("g") + #data["genes"]["bases"] += len(feat) + data["genes"]["avg_len"].append(len(feat)) + + data["genes"]["avg_len"] = float(sum(data["genes"]["avg_len"])) / len(gene_features) + data["overall"]["gc"] = float( + data["overall"]["comp"]["G"] + data["overall"]["comp"]["C"] + ) / len(record.seq) + return data + + +def find_morons(record): + """Locate morons in the genome + + Don't even know why... + + TODO: remove? Idk. 
+ """ + results = [] + good = 0 + bad = 0 + + gene_features = list(coding_genes(record.features)) + for i, gene in enumerate(gene_features): + two_left = gene_features[i - 2 : i] + two_right = gene_features[i + 1 : i + 1 + 2] + strands = [x.strand for x in two_left] + [x.strand for x in two_right] + anticon = [x for x in strands if x != gene.strand] + + if len(anticon) == 4: + has_rbs = [x.type == "Shine_Dalgarno_sequence" for x in gene.sub_features] + if any(has_rbs): + rbs = [ + x for x in gene.sub_features if x.type == "Shine_Dalgarno_sequence" + ][0] + rbs_msg = str(rbs.extract(record.seq)) + else: + rbs_msg = "No RBS Available" + results.append((gene, two_left, two_right, rbs_msg)) + bad += 1 + else: + good += 1 + return good, bad, results, [] + + +def bad_gene_model(record): + """Find features without product + """ + results = [] + good = 0 + bad = 0 + qc_features = [] + + for gene in coding_genes(record.features): + exons = [ + x for x in genes(gene.sub_features, feature_type="exon") if len(x) > 10 + ] + CDSs = [x for x in genes(gene.sub_features, feature_type="CDS")] + if len(exons) >= 1 and len(CDSs) >= 1: + if len(exons) != len(CDSs): + results.append( + ( + get_gff3_id(gene), + None, + None, + "Mismatched number of exons and CDSs in gff3 representation", + ) + ) + qc_features.append( + gen_qc_feature( + gene.location.start, + gene.location.end, + "Mismatched number of exons and CDSs in gff3 representation", + strand=gene.strand, + id_src=gene, + type_src="gene" + ) + ) + bad += 1 + else: + for (exon, cds) in zip( + sorted(exons, key=lambda x: x.location.start), + sorted(CDSs, key=lambda x: x.location.start), + ): + if len(exon) != len(cds): + results.append( + ( + get_gff3_id(gene), + exon, + cds, + "CDS does not extend to full length of gene", + ) + ) + qc_features.append( + gen_qc_feature( + exon.location.start, + exon.location.end, + "CDS does not extend to full length of gene", + strand=exon.strand, + id_src=gene, + type_src="CDS" + ) + ) + bad += 1 
+ else: + good += 1 + else: + log.warn("Could not handle %s, %s", exons, CDSs) + results.append( + ( + get_gff3_id(gene), + None, + None, + "{0} exons, {1} CDSs".format(len(exons), len(CDSs)), + ) + ) + + return good, len(results) + bad, results, qc_features + + +def weird_starts(record): + """Find features without product + """ + good = 0 + bad = 0 + qc_features = [] + results = [] + + overall = {} + for gene in coding_genes(record.features): + seq = [x for x in genes(gene.sub_features, feature_type="CDS")] + if len(seq) == 0: + log.warn("No CDS for gene %s", get_gff3_id(gene)) + continue + else: + seq = seq[0] + + seq_str = str(seq.extract(record.seq)) + start_codon = seq_str[0:3] + if len(seq_str) < 3: + sys.stderr.write("Fatal Error: CDS of length less than 3 at " + str(seq.location) + '\n') + exit(2) +# if len(seq_str) % 3 != 0: +# if len(seq_str) < 3: +# stop_codon = seq_str[-(len(seq_str))] +# else: +# stop_codon = seq_str[-3] +# +# log.warn("CDS at %s length is not a multiple of three (Length = %d)", get_gff3_id(gene), len(seq_str)) +# seq.__error = "Bad CDS Length" +# results.append(seq) +# qc_features.append( +# gen_qc_feature( +# s, e, "Bad Length", strand=seq.strand, id_src=gene +# ) +# ) +# bad += 1 +# seq.__start = start_codon +# seq.__stop = stop_codon +# continue + + stop_codon = seq_str[-3] + seq.__start = start_codon + seq.__stop = stop_codon + if start_codon not in overall: + overall[start_codon] = 1 + else: + overall[start_codon] += 1 + + if start_codon not in ("ATG", "TTG", "GTG"): + log.warn("Weird start codon (%s) on %s", start_codon, get_gff3_id(gene)) + seq.__error = "Unusual start codon %s" % start_codon + + s = 0 + e = 0 + if seq.strand > 0: + s = seq.location.start + e = seq.location.start + 3 + else: + s = seq.location.end + e = seq.location.end - 3 + + results.append(seq) + results[-1].location = FeatureLocation(results[-1].location.start + 1, results[-1].location.end, results[-1].location.strand) + qc_features.append( + 
gen_qc_feature( + s, e, "Weird start codon", strand=seq.strand, id_src=gene, type_src="gene" + ) + ) + bad += 1 + else: + good += 1 + + return good, bad, results, qc_features, overall + + +def missing_genes(record): + """Find features without product + """ + results = [] + good = 0 + bad = 0 + qc_features = [] + + for gene in coding_genes(record.features): + if gene.qualifiers.get("cpt_source", [None])[0] == "CPT_GENE_MODEL_CORRECTION": + results.append(gene) + bad += 1 + else: + good += 1 + + return good, bad, results, qc_features + + +def gene_model_correction_issues(record): + """Find features that have issues from the gene model correction step. + These have qualifiers beginning with CPT_GMS + """ + results = [] + good = 0 + bad = 0 + qc_features = [] + + # For each gene + for gene in coding_genes(record.features): + # Get the list of child CDSs + cdss = [x for x in genes(gene.sub_features, feature_type="CDS")] + # And our matching qualifiers + gene_data = [(k, v) for (k, v) in gene.qualifiers.items() if k == "cpt_gmc"] + # If there are problems with ONLY the parent, let's complain + local_results = [] + local_qc_features = [] + for x in gene_data: + if "Missing Locus Tag" in x[1]: + # Missing locus tag is an either or thing, if it hits here + # there shouldn't be anything else wrong with it. + + # Obviously missing so we remove it + gene.qualifiers["locus_tag"] = [""] + # Translation from bp_genbank2gff3.py + cdss[0].qualifiers["locus_tag"] = cdss[0].qualifiers["Name"] + # Append our results + local_results.append((gene, cdss[0], "Gene is missing a locus_tag")) + local_qc_features.append( + gen_qc_feature( + gene.location.start, + gene.location.end, + "Gene is missing a locus_tag", + strand=gene.strand, + type_src="gene" + ) + ) + + # We need to alert on any child issues as well. 
+ for cds in cdss: + cds_data = [ + (k, v[0]) for (k, v) in cds.qualifiers.items() if k == "cpt_gmc" + ] + if len(gene_data) == 0 and len(cds_data) == 0: + # Alles gut + pass + else: + for _, problem in cds_data: + if problem == "BOTH Missing Locus Tag": + gene.qualifiers["locus_tag"] = [""] + cds.qualifiers["locus_tag"] = [""] + local_results.append( + (gene, cds, "Both gene and CDS are missing locus tags") + ) + local_qc_features.append( + gen_qc_feature( + cds.location.start, + cds.location.end, + "CDS is missing a locus_tag", + strand=cds.strand, + type_src="CDS" + ) + ) + local_qc_features.append( + gen_qc_feature( + gene.location.start, + gene.location.end, + "Gene is missing a locus_tag", + strand=gene.strand, + type_src="gene" + ) + ) + elif problem == "Different locus tag from associated gene.": + gene.qualifiers["locus_tag"] = gene.qualifiers["Name"] + cds.qualifiers["locus_tag"] = cds.qualifiers["cpt_gmc_locus"] + local_results.append( + (gene, cds, "Gene and CDS have differing locus tags") + ) + local_qc_features.append( + gen_qc_feature( + gene.location.start, + gene.location.end, + "Gene and CDS have differing locus tags", + strand=gene.strand, + type_src="gene" + ) + ) + elif problem == "Missing Locus Tag": + # Copy this over + gene.qualifiers["locus_tag"] = gene.qualifiers["Name"] + # This one is missing + cds.qualifiers["locus_tag"] = [""] + local_results.append((gene, cds, "CDS is missing a locus_tag")) + local_qc_features.append( + gen_qc_feature( + cds.location.start, + cds.location.end, + "CDS is missing a locus_tag", + strand=cds.strand, + type_src="CDS" + ) + ) + else: + log.warn("Cannot handle %s", problem) + + if len(local_results) > 0: + bad += 1 + else: + good += 1 + + qc_features.extend(local_qc_features) + results.extend(local_results) + return good, bad, results, qc_features + + +def missing_tags(record): + """Find features without product + """ + results = [] + good = 0 + bad = 0 + qc_features = [] + + for gene in 
coding_genes(record.features): + cds = [x for x in genes(gene.sub_features, feature_type="CDS")] + if len(cds) == 0: + log.warn("Gene missing CDS subfeature %s", get_gff3_id(gene)) + continue + + cds = cds[0] + + if "product" not in cds.qualifiers: + log.info("Missing product tag on %s", get_gff3_id(gene)) + qc_features.append( + gen_qc_feature( + cds.location.start, + cds.location.end, + "Missing product tag", + strand=cds.strand, + type_src="CDS" + ) + ) + results.append(cds) + bad += 1 + else: + good += 1 + + return good, bad, results, qc_features + + +def evaluate_and_report( + annotations, + genome, + gff3=None, + tbl=None, + sd_min=5, + sd_max=15, + min_gene_length=30, + excessive_gap_dist=50, + excessive_gap_divergent_dist=200, + excessive_overlap_dist=25, + excessive_overlap_divergent_dist=50, + reportTemplateName="phage_annotation_validator.html", +): + """ + Generate our HTML evaluation of the genome + """ + # Get features from GFF file + seq_dict = SeqIO.to_dict(SeqIO.parse(genome, "fasta")) + # Get the first GFF3 record + # TODO: support multiple GFF3 files. + mostFeat = 0 + for rec in list(gffParse(annotations, base_dict=seq_dict)): + if len(rec.features) > mostFeat: + mostFeat = len(rec.features) + record = rec + + gff3_qc_record = SeqRecord(record.id, id=record.id) + gff3_qc_record.features = [] + gff3_qc_features = [] + + log.info("Locating missing RBSs") + # mb_any = "did they annotate ANY rbss? if so, take off from score." 
+ mb_good, mb_bad, mb_results, mb_annotations, mb_any = missing_rbs( + record, lookahead_min=sd_min, lookahead_max=sd_max + ) + gff3_qc_features += mb_annotations + + log.info("Locating excessive gaps") + eg_good, eg_bad, eg_results, eg_annotations = excessive_gap( + record, + excess=excessive_gap_dist, + excess_divergent=excessive_gap_divergent_dist, + min_gene=min_gene_length, + slop=excessive_overlap_dist, + lookahead_min=sd_min, + lookahead_max=sd_max, + ) + gff3_qc_features += eg_annotations + + log.info("Locating excessive overlaps") + eo_good, eo_bad, eo_results, eo_annotations = excessive_overlap( + record, + excess=excessive_overlap_dist, + excess_divergent=excessive_overlap_divergent_dist, + ) + gff3_qc_features += eo_annotations + + log.info("Locating morons") + mo_good, mo_bad, mo_results, mo_annotations = find_morons(record) + gff3_qc_features += mo_annotations + + log.info("Locating missing tags") + mt_good, mt_bad, mt_results, mt_annotations = missing_tags(record) + gff3_qc_features += mt_annotations + + log.info("Locating missing gene features") + mg_good, mg_bad, mg_results, mg_annotations = missing_genes(record) + gff3_qc_features += mg_annotations + + log.info("Determining coding density") + cd, cd_real = coding_density(record) + + log.info("Locating weird starts") + ws_good, ws_bad, ws_results, ws_annotations, ws_overall = weird_starts(record) + gff3_qc_features += ws_annotations + + log.info("Locating bad gene models") + gm_good, gm_bad, gm_results, gm_annotations = bad_gene_model(record) + if gm_good + gm_bad == 0: + gm_bad = 1 + + log.info("Locating more bad gene models") + gmc_good, gmc_bad, gmc_results, gmc_annotations = gene_model_correction_issues( + record + ) + if gmc_good + gmc_bad == 0: + gmc_bad = 1 + + good_scores = [eg_good, eo_good, mt_good, ws_good, gm_good, gmc_good] + bad_scores = [eg_bad, eo_bad, mt_bad, ws_bad, gm_bad, gmc_bad] + + # Only if they tried to annotate RBSs do we consider them. 
+ if mb_any: + good_scores.append(mb_good) + bad_scores.append(mb_bad) + subscores = [] + + for (g, b) in zip(good_scores, bad_scores): + if g + b == 0: + s = 0 + else: + s = int(100 * float(g) / (float(b) + float(g))) + subscores.append(s) + subscores.append(cd) + + score = int(float(sum(subscores)) / float(len(subscores))) + + # This is data that will go into our HTML template + kwargs = { + "upstream_min": sd_min, + "upstream_max": sd_max, + "record_name": record.id, + "record_nice_name": nice_name(record), + "params": { + "sd_min": sd_min, + "sd_max": sd_max, + "min_gene_length": min_gene_length, + "excessive_gap_dist": excessive_gap_dist, + "excessive_gap_divergent_dist": excessive_gap_divergent_dist, + "excessive_overlap_dist": excessive_overlap_dist, + "excessive_overlap_divergent_dist": excessive_overlap_divergent_dist, + }, + "score": score, + "encouragement": get_encouragement(score), + "genome_overview": genome_overview(record), + "rbss_annotated": mb_any, + "missing_rbs": mb_results, + "missing_rbs_good": mb_good, + "missing_rbs_bad": mb_bad, + "missing_rbs_score": 0 + if mb_good + mb_bad == 0 + else (100 * mb_good / (mb_good + mb_bad)), + "excessive_gap": eg_results, + "excessive_gap_good": eg_good, + "excessive_gap_bad": eg_bad, + "excessive_gap_score": 0 + if eo_good + eo_bad == 0 + else (100 * eo_good / (eo_good + eo_bad)), + "excessive_overlap": eo_results, + "excessive_overlap_good": eo_good, + "excessive_overlap_bad": eo_bad, + "excessive_overlap_score": 0 + if eo_good + eo_bad == 0 + else (100 * eo_good / (eo_good + eo_bad)), + "morons": mo_results, + "morons_good": mo_good, + "morons_bad": mo_bad, + "morons_score": 0 + if mo_good + mo_bad == 0 + else (100 * mo_good / (mo_good + mo_bad)), + "missing_tags": mt_results, + "missing_tags_good": mt_good, + "missing_tags_bad": mt_bad, + "missing_tags_score": 0 + if mt_good + mt_bad == 0 + else (100 * mt_good / (mt_good + mt_bad)), + "missing_genes": mg_results, + "missing_genes_good": mg_good, + 
"missing_genes_bad": mg_bad, + "missing_genes_score": 0 + if mg_good + mg_bad == 0 + else (100 * mg_good / (mg_good + mg_bad)), + "weird_starts": ws_results, + "weird_starts_good": ws_good, + "weird_starts_bad": ws_bad, + "weird_starts_overall": ws_overall, + "weird_starts_overall_sorted_keys": sorted( + ws_overall, reverse=True, key=lambda x: ws_overall[x] + ), + "weird_starts_score": 0 + if ws_good + ws_bad == 0 + else (100 * ws_good / (ws_good + ws_bad)), + "gene_model": gm_results, + "gene_model_good": gm_good, + "gene_model_bad": gm_bad, + "gene_model_score": 0 + if gm_good + gm_bad == 0 + else (100 * gm_good / (gm_good + gm_bad)), + "gene_model_correction": gmc_results, + "gene_model_correction_good": gmc_good, + "gene_model_correction_bad": gmc_bad, + "gene_model_correction_score": 0 + if gmc_good + gmc_bad == 0 + else (100 * gmc_good / (gmc_good + gmc_bad)), + "coding_density": cd, + "coding_density_exact": exact_coding_density(record), + "coding_density_real": cd_real, + "coding_density_score": cd, + } + + with open(tbl, "w") as handle: + kw_subset = {} + for key in kwargs: + if ( + key in ("score", "record_name") + or "_good" in key + or "_bad" in key + or "_overall" in key + ): + kw_subset[key] = kwargs[key] + json.dump(kw_subset, handle) + + with open(gff3, "w") as handle: + gff3_qc_record.features = gff3_qc_features + gff3_qc_record.annotations = {} + gffWrite([gff3_qc_record], handle) + + def nice_strand(direction): + # It is somehow possible for whole gffSeqFeature objects to end up in here, apparently at the gene level + if "SeqFeature" in str(type(direction)): + direction = direction.location.strand + if direction > 0: + return "→"#.decode("utf-8") + else: + return "←"#.decode("utf-8") + + def nice_strand_tex(direction): + if "SeqFeature" in str(type(direction)): + direction = direction.location.strand + if direction > 0: + return "$\\rightarrow$" + else: + return "$\\leftarrow$" + + def texify(data): + return data.replace("_", "\\_").replace("$", 
"\\$") + + def length(data): + return len(data) + + def my_encode(data): + return str(data)#.encode("utf-8") + + def my_decode(data): + # For production + return str(data)#.decode("utf-8") + # For local testing. No, I do not understand. + return str(data)#.encode("utf-8")).decode("utf-8") + + env = Environment( + loader=FileSystemLoader(SCRIPT_PATH), trim_blocks=True, lstrip_blocks=True + ) + env.filters.update( + { + "nice_id": get_gff3_id, + "nice_strand": nice_strand, + "nice_strand_tex": nice_strand_tex, + "texify": texify, + "length": length, + "encode": my_encode, + "decode": my_decode, + } + ) + tpl = env.get_template(reportTemplateName) + return tpl.render(**kwargs)#.encode("utf-8") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="rebase gff3 features against parent locations", epilog="" + ) + parser.add_argument( + "annotations", type=argparse.FileType("r"), help="Parent GFF3 annotations" + ) + parser.add_argument("genome", type=argparse.FileType("r"), help="Genome Sequence") + parser.add_argument( + "--gff3", type=str, help="GFF3 Annotations", default="qc_annotations.gff3" + ) + parser.add_argument( + "--tbl", + type=str, + help="Table for noninteractive parsing", + default="qc_results.json", + ) + + parser.add_argument( + "--sd_min", + type=int, + help="Minimum distance from gene start for an SD to be", + default=5, + ) + parser.add_argument( + "--sd_max", + type=int, + help="Maximum distance from gene start for an SD to be", + default=15, + ) + + parser.add_argument( + "--min_gene_length", + type=int, + help="Minimum length for a putative gene call (AAs)", + default=30, + ) + + parser.add_argument( + "--excessive_overlap_dist", + type=int, + help="Excessive overlap for genes in same direction", + default=25, + ) + parser.add_argument( + "--excessive_overlap_divergent_dist", + type=int, + help="Excessive overlap for genes in diff directions", + default=50, + ) + + parser.add_argument( + "--excessive_gap_dist", + 
type=int, + help="Maximum distance between two genes", + default=40, + ) + parser.add_argument( + "--excessive_gap_divergent_dist", + type=int, + help="Maximum distance between two divergent genes", + default=200, + ) + + parser.add_argument( + "--reportTemplateName", + help="Report template file name", + default="phageqc_report_full.html", + ) + + args = parser.parse_args() + + sys.stdout.write(evaluate_and_report(**vars(args))) diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/phage_annotation_validator.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_phageqc_annotation/phage_annotation_validator.xml Fri Jun 17 13:00:50 2022 +0000 @@ -0,0 +1,102 @@ + + + validate phage annotations + + macros.xml + cpt-macros.xml + + + python + biopython + cpt_gffparser + python-levenshtein + regex + metagene_annotator + jinja2 + numpy + + $output; + +#if ".tex" in str($report_format): + mv $output tmp.tex; + docker run --rm -i --user="1002:1002" --net=none -v \$PWD:/data blang/latex pdflatex tmp.tex && + docker run --rm -i --user="1002:1002" --net=none -v \$PWD:/data blang/latex pdflatex tmp.tex && + mv tmp.pdf $output; +#end if +]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/phageqc_report_464.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_phageqc_annotation/phageqc_report_464.html Fri Jun 17 13:00:50 2022 +0000 @@ -0,0 +1,419 @@ + + + + + + + + + + [BICH464] Phage QC on {{record_name}} - {{score}} + + + + + + + + + +

+ +
+
+ +
+
+
+
+

Phage {{record_name}}

+ +
+ +
+
+

Genome Overview

+

Genes

+
    +
  • Count: {{ genome_overview.genes.count }}
  • +
  • Bases: {{ genome_overview.genes.bases }}
  • +
  • Average Length: {{ genome_overview.genes.avg_len | round | int}}
  • +
  • Coding Density: {{ '%0.2f' % (100 * coding_density_exact) }}%
  • +
  • Composition +
      +
    • A {{ genome_overview.genes.comp.A }}
    • +
    • C {{ genome_overview.genes.comp.C }}
    • +
    • T {{ genome_overview.genes.comp.T }}
    • +
    • G {{ genome_overview.genes.comp.G }}
    • +
    +
  • +
+

Overall

+
    +
  • %GC: {{ '%0.2f' % (100 * genome_overview.overall.gc) }}%
  • +
  • Composition +
      +
    • A {{ genome_overview.overall.comp.A }}
    • +
    • C {{ genome_overview.overall.comp.C }}
    • +
    • T {{ genome_overview.overall.comp.T }}
    • +
    • G {{ genome_overview.overall.comp.G }}
    • +
    +
  • +
+
+
+ +

Gene Starts

+

Genes Missing RBS {{missing_rbs_good}} / {{missing_rbs_good + missing_rbs_bad}}

+

The following genes have issues with their RBS.

+ {% if not rbss_annotated %} +

+ Since you have not annotated any possible RBSs, this does not count against your overall score. +

+ {% endif %} +
+ + + + + + + + + + + {% for row in missing_rbs %} + {% if "None found" in row.__message %} + + + + + + + {% endif %} + {% endfor %} + +
IDLocationErrorUpstream (-{{upstream_max}} .. -{{upstream_min}})
{{row | nice_id | decode}}{{row.location.start}}..{{row.location.end}} [{{row.strand}}]None found{{row.__upstream }}
+
+ +

Start Codon Usage

+

This section covers genes with unusual start codons.

+
+ + + + + + + + + {% for codon_key in weird_starts_overall_sorted_keys %} + + {% endfor %} + +
Start CodonCount
{{ codon_key }}{{ weird_starts_overall[codon_key] }}
+
+ +
+ + + + + + + + + + {% for row in weird_starts %} + + + + + + {% endfor %} + +
IDLocationError
{{row | nice_id| decode}}{{row.location.start}}..{{row.location.end}} [{{row.strand}}]{{row.qualifiers.get('note', [])}}
+
+ +

Intergenic Gaps

+

Phage genomes are under pressure to maintain high coding density. Large intergenic gaps may be a sign of incorrect gene starts or missing genes.

+
+ + + + + + + + + + + {% for row in excessive_gap %} + + + + + + + {% endfor %} + +
RegionSizeBounding Gene Transcription DirectionMessage
{{row[0]}} .. {{row[1]}}{{row[1] - row[0]}}{{row[2] | nice_strand}} {{row[3] | nice_strand}} + {% if row[4] == 0 %} + {% else %} + {{row[4]}} possible genes found in this region + {% endif %} +
+
+ +

Overlapping Genes

+

Large gene overlaps may indicate an incorrect gene start or miscalled gene.

+
+ + + + + + + + + + + {% for row in excessive_overlap %} + + + + + + + {% endfor %} + +
Feature AFeature BShared RegionOverlap Length
{{row[0] | nice_id | decode}} ({{row[0].location}}){{row[1] | nice_id | decode}} ({{row[1].location}}){{row[2]}}..{{row[3]}}{{row[3] - row[2]}}bp
+
+ + +

Gene Model Issues

+

These issues are mostly derived from how Apollo handles the gene model.

+
+ + + + + + + + + + + {% for row in gene_model %} + + + + + + + {% endfor %} + +
IDExonCDSMessage
{{row[0]}}{{row[1].location}}{{row[2].location}}{{row[3]}}
+
+ +
+
+
+ + + + + + diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/phageqc_report_annotation_table.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_phageqc_annotation/phageqc_report_annotation_table.html Fri Jun 17 13:00:50 2022 +0000 @@ -0,0 +1,244 @@ + + + + + + + + + + Annotation Table + + + + + + + + + + + + + + + +
+
+
+ + + +
+
+ Data on each organism will be accessible from the tabs above. +
+ {% for (record, data) in annotation_table_data %} +
+ + + + {% for col in annotation_table_col_names %} + + {% endfor %} + + + + {% for row in data %} + + {% for col in row %} + + {% endfor %} + + {% endfor %} + +
{{ col[0] }}
{% if col is not string %}
    {% for val in col %}
  • {{ val }}
  • {% endfor %}
{% else %}{{ col }}{% endif %}
+
+ {% endfor %} +
+
+
+
+ + + + diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/phageqc_report_full.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_phageqc_annotation/phageqc_report_full.html Fri Jun 17 13:00:50 2022 +0000 @@ -0,0 +1,412 @@ + + + + + + + + + + Phage QC on {{record_name}} - {{score}} + + + + + + + + + + + +
+
+ +
+
+
+
+

Phage {{record_name}}

+ +
+ +
+
+ +

Gene Starts

+

Genes missing RBS {{missing_rbs_good}} / {{missing_rbs_good + missing_rbs_bad}}

+

The following genes have issues with their RBS.

+ {% if not rbss_annotated %} +

+ Since you have not annotated any possible RBSs, this does not count against your overall score. +

+ {% endif %} +
+ + + + + + + + + + + + {% for row in missing_rbs %} + + + + + + + + {% endfor %} + +
Feature TypeIDLocationErrorUpstream (-{{upstream_max}} .. -{{upstream_min}})
{{row.type}}{{row.id}}{{row.location.start}}..{{row.location.end}} [{{row.strand}}]{{row.__message}}{{row.__upstream}}
+
+ +

Start Codon Usage

+

This section covers genes with unusual start codons.

+
+ + + + + + + + + {% for codon_key in weird_starts_overall_sorted_keys %} + + {% endfor %} + +
Start CodonCount
{{ codon_key }}{{ weird_starts_overall[codon_key] }}
+
+ +
+ + + + + + + + + + + {% for row in weird_starts %} + + + + + + + {% endfor %} + +
Feature TypeIDLocationError
{{row.type}}{{row.id}}{{row.location.start}}..{{row.location.end}} [{{row.strand}}]{{row.__error}}
+
+ +

Intergenic Gaps

+

Phage genomes are under pressure to maintain high coding density. Large intergenic gaps may be a sign of incorrect gene starts or missing genes.

+
+ + + + + + + + + + + {% for row in excessive_gap %} + + + + + + + {% endfor %} + +
RegionSizeBounding Gene Transcription DirectionMessage
{{row[0]}} .. {{row[1]}}{{row[1] - row[0]}}{{row[2] | nice_strand}} {{row[3] | nice_strand}} + {% if row[4] == 0 %} + {% else %} + {{row[4]}} ORFs found in this region + {% endif %} +
+
+ +

Overlapping Genes

+

Large gene overlaps may indicate an incorrect gene start or miscalled gene.

+
+ + + + + + + + + + + {% for row in excessive_overlap %} + + + + + + + {% endfor %} + +
Feature AFeature BShared RegionOverlap Length
{{row[0].id}} ({{row[0].location}}){{row[1].id}} ({{row[1].location}}){{row[2]}}..{{row[3]}}{{row[3] - row[2]}} bp
+
+ + +

Antisense Genes

+

Possible Morons {{morons_good}} / {{morons_good + morons_bad}} (Doesn't count towards score)

+
+ + + + + + + + + + {% for row in morons %} + + + + + + {% endfor %} + +
FeatureRBSSurrounding Features
{{row[0].id}}{{row[3]}} + {% for x in row[1] %} + {{ x | nice_strand }} + {% endfor %} + {{ row[0].strand | nice_strand }} + {% for x in row[2] %} + {{ x | nice_strand }} + {% endfor %} + +
+
+ + +

Annotation Issues

+

Missing Product Tags {{missing_tags_good}} / {{missing_tags_good + missing_tags_bad}}

+
+ + + + + + + + + {% for row in missing_tags %} + + + + + {% endfor %} + +
FeatureQualifiers
{{row.id}} + {% for key in row.qualifiers %} + {{ key }} +
    + {% for value in row.qualifiers[key] %} +
  • {{value}}
  • + {% endfor %} +
+ {% endfor %} +
+
+ + + + + +
+
+
+ + + + + + diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/phageqc_report_genomea.tex --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_phageqc_annotation/phageqc_report_genomea.tex Fri Jun 17 13:00:50 2022 +0000 @@ -0,0 +1,235 @@ +\documentclass[]{article} +\usepackage{lmodern} +\usepackage{amssymb,amsmath} +\usepackage{ifxetex,ifluatex} +\usepackage{fixltx2e} % provides \textsubscript +\usepackage[T1]{fontenc} +\usepackage[utf8]{inputenc} + +\addtolength{\oddsidemargin}{-.875in} +\addtolength{\evensidemargin}{-.875in} +\addtolength{\textwidth}{1.75in} + +\addtolength{\topmargin}{-.875in} +\addtolength{\textheight}{1.75in} + +\usepackage{fancyhdr} +\pagestyle{fancy} +\lhead{GenomeA Compliance Report} +\chead{} +\rhead{ {{record_name | texify}} } +\lfoot{} +\cfoot{\thepage} +\rfoot{} + + + +\usepackage{microtype} +\usepackage{hyperref} +\hypersetup{unicode=true, + pdfborder={0 0 0}, + breaklinks=true} +\urlstyle{same} % don't use monospace font for urls +\usepackage{longtable,booktabs} +\date{Compiled \today} +\title{GenomeA Compliance Report for {{record_nice_name | texify}}} + +\begin{document} +%\pagestyle{plain} +\maketitle +This report details possible issues with your submitted genome annotations. + +\section{Required Changes} + +The changes detailed in this section are required for acceptance of your +submission. + +\subsection{Missing Gene Features} + +These coding sequences (``CDS'' in your GenBank file) are missing the +associated gene feature (``gene''). This is required for validation by NCBI's +rules which are encoded in the sequin and tbl2asn programs. +{%if missing_genes_bad > 0 %} + +{{ missing_genes_bad }} out of {{ missing_genes_good + missing_genes_bad +}} features are lacking their associated gene feature. 
+ +\begin{longtable}{ll} +\hline +Feature ID & Location\\ +\hline +\endhead +{% for row in missing_genes %} +{{ row.id | texify }} & \texttt{{'{'}}{{row.location}}{{'}'}}\tabularnewline +{% endfor %} +\end{longtable} +{% else %} +You are not missing any gene features +{% endif %} + +\subsection{Missing Product Tags}\label{missing-product-tags} + +{{missing_tags_good}} out of {{missing_tags_good + missing_tags_bad}} features have product tags (\texttt{/product="..."}). +{% if missing_tags_bad > 0 %} +The following features are missing product tags +\begin{longtable}{ll} +\hline +Feature & Location\\ +\hline +\endhead +{% for row in missing_tags %} +{{ row.id | texify }} & \texttt{{'{'}}{{row.location}}{{'}'}}\tabularnewline +{% endfor %} +\end{longtable} +{% endif %} + +\subsection{Missing Locus Tags}\label{missing-locus-tags} + +{{gene_model_correction_good}} out of {{gene_model_correction_good + gene_model_correction_bad}} features have valid locus tags (\texttt{/locus\_tag="..."}). +{% if gene_model_correction_bad > 0 %} +The following features have issues with their locus tags +\begin{longtable}{lllll} +\hline +ID & Location & Gene Locus Tag & CDS Locus Tag & Issue \\ +\hline +\endhead +{% for row in gene_model_correction %} +{{ row[0].id | texify }} & \texttt{{'{'}}{{row[1].location}}{{'}'}} & {{ row[0].qualifiers['locus_tag'][0] | texify }} & {{ row[1].qualifiers['locus_tag'][0] | texify }} & {{ row[2] | texify }}\tabularnewline +{% endfor %} +\end{longtable} +{% endif %} + + +\section{Suggested Changes}\label{suggested-changes} + +These changes are not required, but are strongly encouraged in order to +provide a uniform genome annotation within the phage community. + +\subsection{Start Codons}\label{start-codons} +Nearly all phage genes use ATG, GTG or TTG as start codons. 
The start codon distribution is as +follows: + + +\begin{longtable}{lll} +\hline +Start Codon & Count\\ +\hline +\endhead +{% for codon_key in weird_starts_overall_sorted_keys %} +{{ codon_key }} & {{ weird_starts_overall[codon_key] }} \\ +{% endfor %} +\end{longtable} + +{% if weird_starts_bad != 0 %} +There are {{weird_starts_bad }} unusual start codons in the genome, these +should be carefully justified. If there is evidence for these starts, the +GenomeA text should note this. + +\begin{longtable}{lll} +\hline +Feature ID & Location & Start Codon\\ +\hline +\endhead +{% for row in weird_starts %} +{{ row.id | texify }} & \texttt{{'{'}}{{row.location}}{{'}'}} & {{row.__start}} \\ +{% endfor %} +\end{longtable} + +{% endif %} + +\subsection{Unannotated RBSs}\label{unannotated-rbss} + +The following CDSs either do not have a detectable ribosome binding site (RBS; +Shine-Dalgarno sequence), in which case there is a strong possibility that +this is not the correct start, or there is one but it is not annotated. +Annotating the RBS as part of the gene feature is the best practice. + +\begin{longtable}{lllll} +\hline +ID & Location & Error & Upstream (-{{upstream_max}} .. -{{upstream_min}})\\ +\hline +\endhead +{% for row in missing_rbs %} +{% if 'Unannotated' not in row.__message%} +{{ row.id | texify }} & \texttt{{'{'}}{{row.location}}{{'}'}} & {{row.__message | texify}} & \texttt{{'{'}}{{row.__upstream}}{{'}'}} \\ +{% endif %} +{% endfor %} +{% for row in missing_rbs %} +{% if 'Unannotated' in row.__message%} +{{ row.id | texify }} & \texttt{{'{'}}{{row.location}}{{'}'}} & {{row.__message | texify}} & \texttt{{'{'}}{{row.__upstream}}{{'}'}} \\ +{% endif %} +{% endfor %} +\end{longtable} + +\section{Areas for Further Examination}\label{notes} + +These areas may be indicative of a problem, or may simply be +informational. You should examine the areas mentioned in detail to ensure +that the annotations are valid and that no genes are missed. 
+ + + + + +\subsection{Unusual Gaps}\label{excessive-gaps} + +{% if excessive_gap | length == 0 %} +No gaps over {{ params['excessive_gap_dist'] }} nt (for genes on the same +strand) or {{ params['excessive_gap_divergent_dist'] }} (for genes on +opposite strands) were found. +{% else %} +Gaps over {{ params['excessive_gap_dist'] }} nt (for genes on the same +strand) or {{ params['excessive_gap_divergent_dist'] }} (for genes on +opposite strands) were found. + +\begin{longtable}{llll} +\hline +Region & Size & Surroundings & Messages\\ +\hline +\endhead +{% for row in excessive_gap %} +\texttt{{'{'}}{{row[0]}}..{{row[1]}}{{'}'}} & {{row[1] - row[0]}} & {{row[2] | nice_strand_tex}} {{row[3] | nice_strand_tex}} & {% if row[4] != 0 %}{{row[4]}} ORFs found in this region{% endif %} \\ + +{% endfor %} +\end{longtable} +{% endif %} + + + + +\subsection{Unusual Overlaps}\label{excessive-overlaps} + +{% if excessive_overlap | length == 0 %} +No overlaps over {{ params['excessive_overlap_dist'] }} nt (for genes on the same +strand) or {{ params['excessive_overlap_divergent_dist'] }} (for genes on +opposite strands) were found. +{% else %} +Overlaps over {{ params['excessive_overlap_dist'] }} nt (for genes on the same +strand) or {{ params['excessive_overlap_divergent_dist'] }} (for genes on +opposite strands) were found. +\begin{longtable}{llllll} +\hline +\multicolumn{2}{l}{Feature A} & \multicolumn{2}{l}{Feature B} & & \\ +ID & Location & ID & Location & Region & Length\\ +\hline +\endhead +{% for row in excessive_overlap %} +{{row[0].id | texify}} & \texttt{{'{'}}{{row[0].location}}{{'}'}} & {{row[1].id | texify}} & \texttt{{'{'}}{{row[1].location}}{{'}'}} & {{row[2]}}..{{row[3]}} & {{row[3] - row[2]}} \\ +{% endfor %} +\end{longtable} +{% endif %} + +\subsection{Coding Density}\label{coding-density} + +You have a coding density of {{ coding_density_real }}\% which scores +{{ coding_density }} / 100 on our scale. 
Most genomes should be in the 90\% to 100\% +coding density range + + + + + + + + +\end{document} diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/shinefind.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_phageqc_annotation/shinefind.py Fri Jun 17 13:00:50 2022 +0000 @@ -0,0 +1,420 @@ +#!/usr/bin/env python +import re +import sys +import argparse +import logging +from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +from Bio.SeqFeature import FeatureLocation +from gff3 import ( + feature_lambda, + feature_test_type, + feature_test_true, + feature_test_quals, + get_id, + ensure_location_in_bounds, +) + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger() + + +class NaiveSDCaller(object): + + # TODO May make switch for different sequence sets + SD_SEQUENCES = ( + "AGGAGGT", + "GGAGGT", + "AGGAGG", + "GGGGGG", + "AGGAG", + "GAGGT", + "GGAGG", + "GGGGG", + "AGGT", + "GGGT", + "GAGG", + "GGGG", + "AGGA", + "GGAG", + "GGA", + "GAG", + "AGG", + "GGT", + "GGG", + ) + + def __init__(self): + self.sd_reg = [re.compile(x, re.IGNORECASE) for x in self.SD_SEQUENCES] + + def list_sds(self, sequence, sd_min=3, sd_max=17): + hits = [] + for regex in self.sd_reg: + for match in regex.finditer(sequence): + spacing = len(sequence) - len(match.group()) - match.start() + if sd_max >= spacing+sd_min and spacing+sd_min >= sd_min: + #if the spacing is within gap limits, add + #(search space is [sd_max+7 .. 
sd_min] so actual gap is spacing+sd_min) + #print('min %d max %d - adding SD with gap %d' % (sd_min, sd_max, spacing+sd_min)) + hits.append( + { + "spacing": spacing, + "hit": match.group(), + "start": match.start(), + "end": match.end(), + "len": len(match.group()), + } + ) + hits = sorted(hits, key= lambda x: (-x['len'],x['spacing'])) + return hits + + @classmethod + def highlight_sd(cls, sequence, start, end): + return " ".join( + [ + sequence[0:start].lower(), + sequence[start:end].upper(), + sequence[end:].lower(), + ] + ) + + @classmethod + def to_features(cls, hits, strand, parent_start, parent_end, feature_id=None, sd_min=3, sd_max=17): + results = [] + for idx, hit in enumerate(hits): + # gene complement(124..486) + # -1 491 501 0 5 5 + # -1 491 501 0 4 5 + # -1 491 501 1 4 5 + # -1 491 501 2 3 5 + # -1 491 501 1 3 5 + # -1 491 501 0 3 5 + + qualifiers = { + "source": "CPT_ShineFind", + "ID": "%s.rbs-%s" % (feature_id, idx), + } + + if strand > 0: + start = parent_end - hit["spacing"] - hit["len"] + end = parent_end - hit["spacing"] + else: + start = parent_start + hit["spacing"] + end = parent_start + hit["spacing"] + hit["len"] + # check that the END of the SD sequence is within the given min/max of parent start/end + + # gap is either the sd_start-cds_end (neg strand) or the sd_end-cds_start (pos strand) + # minimum absolute value of these two will be the proper gap regardless of strand + tmp = gffSeqFeature( + FeatureLocation(min(start, end), max(start, end), strand=strand), + #FeatureLocation(min(start, end), max(start, end), strand=strand), + type="Shine_Dalgarno_sequence", + qualifiers=qualifiers, + ) + results.append(tmp) + return results + + def testFeatureUpstream(self, feature, record, sd_min=3, sd_max=17): + # Strand information necessary to getting correct upstream sequence + strand = feature.location.strand + + # n_bases_upstream (plus/minus 7 upstream to make the min/max define the possible gap position) + if strand > 0: + start = 
feature.location.start - sd_max - 7 + end = feature.location.start - sd_min + else: + start = feature.location.end + sd_min + end = feature.location.end + sd_max + 7 + + (start, end) = ensure_location_in_bounds( + start=start, end=end, parent_length=len(record) + ) + + # Create our temp feature used to obtain correct portion of + # genome + tmp = gffSeqFeature(FeatureLocation(min(start, end), max(start, end), strand=strand), type="domain") + seq = str(tmp.extract(record.seq)) + return self.list_sds(seq, sd_min, sd_max), start, end, seq + + def hasSd(self, feature, record, sd_min=3, sd_max=17): + sds, start, end, seq = self.testFeatureUpstream( + feature, record, sd_min=sd_min, sd_max=sd_max + ) + return len(sds) > 0 + + +# Cycle through subfeatures, set feature's location to be equal +# to the smallest start and largest end. +# Remove pending bugfix for feature display in Apollo +def fminmax(feature): + fmin = None + fmax = None + for sf in feature_lambda([feature], feature_test_true, {}, subfeatures=True): + if fmin is None: + fmin = sf.location.start + fmax = sf.location.end + if sf.location.start < fmin: + fmin = sf.location.start + if sf.location.end > fmax: + fmax = sf.location.end + return fmin, fmax + + +def fix_gene_boundaries(feature): + # There is a bug in Apollo whereby we have created gene + # features which are larger than expected, but we cannot see this. + # We only see a perfect sized gene + SD together. + # + # So, we clamp the location of the gene feature to the + # contained mRNAs. Will remove pending Apollo upgrade. 
+ fmin, fmax = fminmax(feature) + if feature.location.strand > 0: + feature.location = FeatureLocation(fmin, fmax, strand=1) + else: + feature.location = FeatureLocation(fmin, fmax, strand=-1) + return feature + +def shinefind( + fasta, + gff3, + gff3_output=None, + table_output=None, + lookahead_min=3, + lookahead_max=17, + top_only=False, + add=False, +): + table_output.write( + "\t".join( + [ + "ID", + "Name", + "Terminus", + "Terminus", + "Strand", + "Upstream Sequence", + "SD", + "Spacing", + ] + ) + + "\n" + ) + + sd_finder = NaiveSDCaller() + # Load up sequence(s) for GFF3 data + seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta")) + # Parse GFF3 records + for record in gffParse(gff3, base_dict=seq_dict): + # Shinefind's gff3_output. + gff3_output_record = SeqRecord(record.seq, record.id) + # Filter out just coding sequences + ignored_features = [] + for x in record.features: + # If feature X does NOT contain a CDS, add to ignored_features + # list. This means if we have a top level gene feature with or + # without a CDS subfeature, we're catch it appropriately here. + if ( + len( + list( + feature_lambda( + [x], feature_test_type, {"type": "CDS"}, subfeatures=True + ) + ) + ) + == 0 + ): + ignored_features.append(x) + + # Loop over all gene features + for gene in feature_lambda( + record.features, feature_test_type, {"type": "gene"}, subfeatures=True + ): + + # Get the CDS from this gene. + feature = sorted( + list( + feature_lambda( + gene.sub_features, + feature_test_type, + {"type": "CDS"}, + subfeatures=True, + ) + ), + key=lambda x: x.location.start, + ) + # If no CDSs are in this gene feature, then quit + if len(feature) == 0: + # We've already caught these above in our ignored_features + # list, so we skip out on the rest of this for loop + continue + else: + # Otherwise pull the first on the strand. + feature = feature[0] + + # Three different ways RBSs can be stored that we expect. 
+ rbs_rbs = list( + feature_lambda( + gene.sub_features, + feature_test_type, + {"type": "RBS"}, + subfeatures=False, + ) + ) + rbs_sds = list( + feature_lambda( + gene.sub_features, + feature_test_type, + {"type": "Shine_Dalgarno_sequence"}, + subfeatures=False, + ) + ) + regulatory_elements = list( + feature_lambda( + gene.sub_features, + feature_test_type, + {"type": "regulatory"}, + subfeatures=False, + ) + ) + rbs_regulatory = list( + feature_lambda( + regulatory_elements, + feature_test_quals, + {"regulatory_class": ["ribosome_binding_site"]}, + subfeatures=False, + ) + ) + rbss = rbs_rbs + rbs_sds + rbs_regulatory + + # If someone has already annotated an RBS, we move to the next gene + if len(rbss) > 0: + log.debug("Has %s RBSs", len(rbss)) + ignored_features.append(gene) + continue + + sds, start, end, seq = sd_finder.testFeatureUpstream( + feature, record, sd_min=lookahead_min, sd_max=lookahead_max + ) + + feature_id = get_id(feature) + sd_features = sd_finder.to_features( + sds, feature.location.strand, start, end, feature_id=feature.id + ) + + human_strand = "+" if feature.location.strand == 1 else "-" + + # http://book.pythontips.com/en/latest/for_-_else.html + log.debug("Found %s SDs", len(sds)) + for (sd, sd_feature) in zip(sds, sd_features): + # If we only want the top feature, after the bulk of the + # forloop executes once, we append the top feature, and fake a + # break, because an actual break triggers the else: block + table_output.write( + "\t".join( + map( + str, + [ + feature.id, + feature_id, + feature.location.start, + feature.location.end, + human_strand, + sd_finder.highlight_sd(seq, sd["start"], sd["end"]), + sd["hit"], + int(sd["spacing"]) + lookahead_min, + ], + ) + ) + + "\n" + ) + + if add: + # Append the top RBS to the gene feature + gene.sub_features.append(sd_feature) + # Pick out start/end locations for all sub_features + locations = [x.location.start for x in gene.sub_features] + [ + x.location.end for x in gene.sub_features + 
] + # Update gene's start/end to be inclusive + gene.location._start = min(locations) + gene.location._end = max(locations) + # Also register the feature with the separate GFF3 output + sd_feature = fix_gene_boundaries(sd_feature) + gff3_output_record.features.append(sd_feature) + + if top_only or sd == (sds[-1]): + break + else: + table_output.write( + "\t".join( + map( + str, + [ + feature.id, + feature_id, + feature.location.start, + feature.location.end, + human_strand, + seq, + None, + -1, + ], + ) + ) + + "\n" + ) + + record.annotations = {} + gffWrite([record], sys.stdout) + + gff3_output_record.features = sorted( + gff3_output_record.features, key=lambda x: x.location.start + ) + gff3_output_record.annotations = {} + gffWrite([gff3_output_record], gff3_output) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Identify shine-dalgarno sequences") + parser.add_argument("fasta", type=argparse.FileType("r"), help="Fasta Genome") + parser.add_argument("gff3", type=argparse.FileType("r"), help="GFF3 annotations") + + parser.add_argument( + "--gff3_output", + type=argparse.FileType("w"), + help="GFF3 Output", + default="shinefind.gff3", + ) + parser.add_argument( + "--table_output", + type=argparse.FileType("w"), + help="Tabular Output", + default="shinefind.tbl", + ) + + parser.add_argument( + "--lookahead_min", + nargs="?", + type=int, + help="Number of bases upstream of CDSs to end search", + default=3, + ) + parser.add_argument( + "--lookahead_max", + nargs="?", + type=int, + help="Number of bases upstream of CDSs to begin search", + default=17, + ) + + parser.add_argument("--top_only", action="store_true", help="Only report best hits") + parser.add_argument( + "--add", + action="store_true", + help='Function in "addition" mode whereby the ' + + "RBSs are added directly to the gene model.", + ) + + args = parser.parse_args() + shinefind(**vars(args)) diff -r 000000000000 -r c3140b08d703 
cpt_phageqc_annotation/test-data/AY216660.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_phageqc_annotation/test-data/AY216660.fasta Fri Jun 17 13:00:50 2022 +0000 @@ -0,0 +1,815 @@ +>AY216660.2 Enterobacteria phage T1, complete genome +GCCTGCAATATGGTAAAATAGCACTAAATGTTAAACAAAGAGGATGTATTTATGAGTGAA +CCTAAGAACGCTCCCGTAGTCCAGGGTGGTAATTTCAAAGAGCTATACAAGAAAAAGTTT +GGCACTGTACTCGCGAAAAACCGGGCTATGACGCCAGAGCAACTATTCGATCTGTCAGTG +AAGTATTTCGAATGGGCCGAGGACAATGCGATCAAGGCGTCAGAATCAGCCAGCTTTCAG +GGTGGCGTTTATGAGTCGCTTGTCCATAAGCCGCGCGTCTTCACCTGGACCGGATACCGA +CTATTCATCGGTGCAAGCGAGGCTGCAATCATTAAGTGGAAGCGAGAGGAAGAATACAGC +GAGGTTATGGAGTTTGTGGAATCGGTAATCAACGAGCAAAAATTCCAGCTTGCCGCCAAC +GGTGTTATTAATGCCTCCTTTATCGGTAAGGATCTCGGAATCGATAAGCCAGCCTCAATC +AATATCGAAAACTCGTCAGCTTCCGCATCGACAGTAGTCGCCACTACTGAGGATGCGATG +AAAGAGGCGGTAAACAGCATTCTTGATATGCTTTAACTTTAGGGGCGCGAGAGCGCCCAC +ATGGGAGACTTAATCATGATTCAATGGGAAGACCTTAACGCAACGCAGAAGTTAGCGATC +AAGAAAATGAGCGAGGCCAATTTCGAAAAAATGATTCGGATCTGGTTCCAACTTATGCAG +GCGCAGCAGTTCCAGCCTAACTGGCATCACCTTTACCTATGTCACGAAGTGGAGGAAATT +ATTGCAGGGCGGCGAGGGAATACAATCTTTAACGTCACACCAGGTTCCGGTAAAACTGAA +GTGTTCTCAATTCACCTTCCGGTGTACGCAATGCTTAAGTGTAAGAAGGTGCGAAACCTT +AACGTGTCGTTTGCTGACAGCCTGGTTAAGCGTAACAGTAAGCGCGTCCGTGAGATTATC +AGCAGCAACGAATTTCAAGAGCTATGGCCTTGCAAGTTCGGTACATCGAAAGATGAGGAG +ATGCAGGTTCTTAACGAAGATGGAAAGGTTTGGTTTGAGTTGATATCCGCAGCGGCTGGC +GGTCGTATTACTGGTTCGCGTGGTGGCTACATGACGCCGGGATTCTCGGGGATGGTAATG +CTAGACGATATCGACAAGCCTGATGATATGTTCTCAAAGGTTAAGCGTGAGCGTACGCAC +ATGCTATTGAAAAACACCATCCGTTCCCGTCGTATGCATAACGAGACGCCTATTATTGCA +ATTCAGCAGCGACTACATGCTCAGGATTCAACCTGGTTCATGATGAATGGCGGTATGGGT +ATTGAGTTTGACCAAATCTCAATACCGGCGCTGGTGACGGAAGAATACGGAAAGACACTT +CCTGATTGGTTGCAGCCTTACTTTGAGCGTGATGTTTTATCGTCTGAGTATGTAGAATTG +GATGGCGTTAAGCATTACTCTTTTTGGCCAAGCAAGGAAAGCGTTCACGACCTGTTGGCG +TTACGAGAAGCAGATCAGTATACCTTTGATTCTCAGTATCAGCAGAAACCGATCGCGCTG +GGTGGCTCCGTGTTTAACTCAGAGTGGTGGACTTATTACGGCAGCAGTCTTGACGCTGAC +GAGCCAGATCCGGGTAAATATGATTACCGATTCATCACTGCCGATACCGCTCAGAAGACA 
+GGCGAGCTAAACGACTACACGGTATTTTGTTTGTGGGGCAAGAAGAATGATAAGGTTTAC +TTTATCGACGGCATTCGCGGAAAGTGGGAAGCGCCGGATATGGAGAGGCAATTTACAGCT +TTCGTCAATCAGGCATGGAGGCACAATAAATCAATGGGGGTACTTCGTAAAATTTATGTG +GAAGATAAGGCGAGCGGTACGGGCTTAATCCAGAACCTCAGGAAAAAGACCCCGATCTCA +ATCACTCCATTGCAGCGTAACAAAGACAAAGTTACCCGAGCTATGGATGCTCAGCCAGTT +ATTAAAGCCGGGCGCGTGGTTCTGCCAGAAGAGCACCCTATGCTTGCTGAAATTATCGCT +GAACACAGTGCCTTCACTTACGATGACACCCATCCGCATGATGATATCGTCGATAACTTC +ATGGATGCGGCGAACATCGAATTGCTGACCATTGATGATCCTATCGAGAGAATGAAGCGA +CTCGCCGGGATGGTTAAGCGGTAATAAATGAGATATAATTAGGGCTGTCAATTGACGGCC +CTTTTTATTGGAGGAAACATGAAAATTGTTAAGCATGATGGATATAACGATATCTTTAAC +GGCGGCGCGGACGGATCGCCTAAGCCATTCTTTATGTCTGATGCATCATATCACGTCGGT +TCTTTCTACAACGACAACGCAACCGCGAAGCGAATTGTGGATGTTATCCCGGAAGAGATG +GTGACGGCTGGTTTTAAAATGTCAGGCGTTAAGGATGAAAAAGAGTTCAAGTCTTTATGG +GATAGCTACAAACTTGATTCAAGTCTGGTGGATCTTCTTTGTTGGGCACGACTTTACGGT +GGCGCGGCGATGGTGGCAATCATCAAAGACAACCGGATGTTAACCAGTCAGGCAAAGCCT +GGAGCTAAACTTGAAGGCGTCCGAGTTTATGATCGATTTGCTATCACTGTTGAAAAGCGA +GTCACCAATGCAAGATCTCCTCGCTATGGTGAGCCTGAAATTTACAAGGTATCCCCTGGC +GACAATATGCAGCCGTACCTGATTCATCACTCAAGAGTCTTTATTGCTGATGGTGAACGA +GTGGCGCAACAGGCAAGAAAGCAGAATCAAGGATGGGGAGCTTCGGTATTGAATAAGTCA +CTGATTGATGCAATCTGTGACTATGATTACTGTGAATCTCTGGCTACTCAGATCTTGCGA +CGTAAGCAACAGGCTGTATGGAAGGTCAAAGGTCTTGCCGAAATGTGTGATGATGATGAT +GCTCAGTATGCCGCGCGCCTGCGACTTGCTCAGGTAGATGATAACTCCGGCGTAGGCCGT +GCGATCGGTATCGATGCTGAGACTGAGGAATATGACGTTCTCAACTCTGATATCAGCGGA +GTCCCTGAGTTCTTATCAAGCAAGATGGACCGCATCGTCTCCCTATCCGGGATTCATGAG +ATTATCATCAAGAATAAGAACGTAGGCGGCGTATCAGCGAGCCAAAACACAGCGCTTGAG +ACTTTCTATAAGCTAGTCGATCGCAAGCGCGAGGAAGATTACAGGCCGCTTCTTGAGTTC +TTGTTGCCGTTCATTGTTGATGAGGAAGAGTGGTCGATCGAGTTTGAGCCTTTGTCTGTT +CCGAGTAAGAAAGAGGAATCAGAGATCACGAAGAATAACGTTGAGTCAGTCACGAAGGCT +ATCACTGAGCAAATCATCGATCTGGAAGAAGCTCGCGACACGTTGCGATCCATTGCCCCT +GAGTTCAAACTCAAGGATGGTAATAACATCAACATTCGCGAACCGGAAGAAACAACCGAA +CCGGAGCCGGGATTAGGGGAGAAGTTAGAAGATGAAAATTAATGGCGTTGCAACACAGTG +GCGCTATCCTGAAATGAGCGAGCGCGCAATGTCGCGCTCCCTACAGGATGTTGCAGCCAA 
+ACTAACTGAAAAAATGCGTGACGAATTAAAGCCGATGAAATTTGACGCTACCGACGAAGA +GATAGATCAGACAGAGAGGTCATTGCTTGATTACGTCGAATCACTCATCGCTCCGATTAT +TGGTTCTCTATCATCCGTTGCGCTCACGATCTATAAATTCAACTCTAAGCAGTGGCTGCG +CATCGCTCGCAATGCTGGAGGTAAGAAGAATCAAGCCGTGATGCTACTTGCCCTGATTGG +TCCTACCGCTGCCGAAAGCTGGTACTCAGGACAATATAATCTGTGGCGATCGCAGGTTGC +TACTTCTATCAGGAAATTTGCCGCCAACATGGTTACTGATTTCACTGATAAACTTCGTGC +GGCATCCGGTCAGGGTAAAAGCAAGGATTTTGTTGTTGAACTTGCTAAGGAGCGATTTGG +TATTTACCGGAACTGGGCCAAAAATAGAGCGTCGGGAATTGTCGGAACCTGGAACAGTAG +ACTGATGCGTCAGCGCATAAAAGACGCTGGTGTCTCTTACTATTTCTGGCGCGGGGTGAT +GGATTTACGCGAACGTGAAAAACATGTAAGATGGGAAGGTAAGCGCATAGCGGTAGATTC +CGATCATGTATTCCCTGGTGAGGAATACAACTGCCGCTGTTGGGCTGTTCCAGACTTTTC +TACAGGAGATTAAAAATGAAGGCAAAGCAAAGATTCGATTCAGTAAAAATCAAGGCGCAC +TTTGATGATAACGGTTTTTTAGTTGACCGCCCAATCGTGGCGCGAATCGGCGCTCAGGTT +TACAAAACGCCGCACGGCGATCGAGTTGAGTTCCGTCCGGCGTCCGAAGTTTTCAAGCAA +GACTCCTTGCAAAGTTTTGCGGGTAAGCCAATTACTGTCGGTCACGTAACGGTAACTCCG +CAGAATGCTAAGGACGTTGTTGTCGGATCGTGTGCTGGCGCTGGTATTGCTTCAGGGGTT +GGCGTTGAAGTTCCTTTGAGTATTTACAGCGACTACGCGATCAGCAAGGCTAAAGCAAAA +GAAGCAGGTGAATTATCTGTTGGTTATACTTCGGTTGATATTGATAAGCCTGGTTGGGGT +TCAAATGAGACTGGAGAATATATCTTCGAAGAGGATATGAAACAGGACGAAGCGCCGCCT +GAAGGTTGGGTGAGATTCGACGCGGTACAAACTAATATCAAGGTCAACCATATTGCCCTA +GTTTTTAAAGGTCGTGCGGGAATTGCTAAATTAAATCTTGATGCCGAACAGGAGTTCCCG +TATGATAATAACGTTCAATTAACTAACGAGGACAAGCAAATGAAAAAAATTAAGATCGAC +TCAGTTGATGTGGAAGTAACCGAAGACGTTGCGAACCATATCGAAAAATTAACCGCGCAG +ATTGCCACCATTCAGGGGAAAGCTGATGGCTTCGAAGCTGAGCGCGATGCGCTGAAGGTT +AAGGTTGACTCTCTGCCGGAACTTGTGAAGGCCGAGGTAGAGAAGCAAAAAGCCGATGCC +GCCGCACGCGCAGAAGTTACCGCAGTAGCAGAAACCGCAGGCGTCAAACATGATGGTCTT +GATATCAAAGACGTCAAGATTGCCGTAGTTAAAGCCATGCTTGATAAAGATGTTAGTGAA +AAATCAGACGCATATATCGACGCTATGTTTGATGTTGCTAAAGATTCTGATATCATGGCT +ATTCAGCGTAAAGCAGTAAAAGGCGACTCTATCGAAGGCGGTAAGCCGGAAGAGAAAAAC +GACGCCGCGCCTGTTACGCCAAATTCACGTTTAAGCAAAGTAATGTAAGGGGAAATATCA +TGGCACAAATTAATGCATCTTATCAGCGAGATATGGCGATTGCGCTTCCGGGTATGGTTG +CGGATACTTCAAAGTACAATATTGACGGCGCTTGTGTCGTTAATGAAGGTGATGTTCTTG 
+TTGGCGCTGCCGTACAAGTTGTTCAAGCTCAGGCGGTTGATGGTCATAAGTTGGTTAAGG +CTCTTACTACCGGAACCACTCCTTACGGCGTGGCAATCCGATCTCACTGGCAGACTGTTA +ACGCTCAAAATCAGATGATTTACGAAGATGGCGGCGCTATCAACGTGATGACTTCAGGCC +GAGTATGGATGCTTTCCAAATCCACCGAAGCGCCAACTTTCGGCTCTGCCGTTAAACTTG +ATGTTGATGGTCAGGAAAAATCTGATGGCACGATCGAAACAACCTGGACCTACGCTGGCG +GTTGGACTAAATACAAAGATATTCAGCTTGTTGAGGTTCAGTTGCATCAACTGTAATTAG +CGTTTAATATGGGGACTATCCTTTTTTGGATAGTCCTTTTTTTATGGAGAAATCATTATG +GCTTACGAAAATTTAATGTTGCGCCCGGCGTGTCCGGGAAATCTTTCTGATACTTCAACC +TACAATATTGATGGCGCTTGCGTGGCTCAAGGTGACATTGAGTTCGGCTCAGCGGTTCAG +GTTGTCGGCATCGTTGATGGTGTAAAAGTTGTTACGGCGCTTTCTGATGGTGGAACTCCT +TACGGTATCGCTTTCCGTTCCCAATATGAACACCTGAGCGGTAAAATCCTCGACGGTGAA +GTGTGCAACGTCGTTTCTCACGGTCGCGTGTGGGCGCTTACTTCTCTTGATGAGGCTCCC +AGCTTGTTCTCAAAGTTGCAGTTTGGATCTGGTGGGGTTGTTACTGGTGGATCTGGTTAC +GCAGGATGGACCTTTGCTGGCGGCTTTGTTAAGCACGAAGATGGCTACATTATTGAGGTT +CGGGTGAAACAAAATGCTTTCATCGTTCCACCGCCGCCGCCCCCTGTCGTTCTTGTTGAA +TCCGCTACAATCACCACTGACAAGGAAAGCCCTCAGCCAAACAACGTTACGATCCAGTGT +GTAGCTAATGCTCTTCCGGCTAATGCAACTGATAAGACTGGCAAATGGTCAATCGACGCT +ACCAATATCGCCACTGTCAATCCGGACTCAGGTCTTGTAACTCCTGTTGGTGGAGAGGTA +GTCGGTGATTTCAATATTACCTGGACGGCTAACGATGCCAGCAAGACGACGGCAACCATT +GCTTATCGCGTAGAAGCAGTGCCAACGCCAGAGGTTGATGTATAACATAAAAACACTTTG +ACGCTTTAACAAAAAGTGCTATTATTGAAGCCGTGAACATAATCACGGTTTTTTATTAAC +TATGGAGAAGTAATCATGACTACTAAAAAATTTGATGAAGCAGATAAAAGCAATGTTGAA +ATGTATCTGATCCAGGCTGGCGTAAAACAGGATGCGGCCGCAACGATGGGTATCTGGACC +GCTCAGGAACTACACCGCATCAAAAGCCAGTCCTATGAAGAAGACTACCCGGTCGGCTCA +GCTTTACGCGTATTCCCGGTTACAACCGAGCTTTCTCCGACCGACAAGACGTTTGAGTAT +ATGACCTTTGATAAGGTTGGTACGGCTCAGATTATCGCAGACTACACCGACGATCTTCCG +CTGGTTGATGCCCTGGGTACTTCTGAATTTGGCAAGGTGTTCCGTCTTGGTAACGCGTAC +CTGATCTCAATCGACGAAATCAAAGCGGGTCAGGCAACTGGTCGCCCACTGTCAACCCGT +AAGGCGAGTGCGTGCCAGTTGGCGCATGATCAGCTTGTTAACCGCCTGGTGTTCAAAGGT +TCCGCGCCGCACAAGATTGTGTCCGTGTTCAACCATCCGAATATCACCAAAATTACCTCT +GGTAAGTGGATTGATGTATCTACTATGAAGCCGGAAACTGCGGAAGCTGAGCTAACTCAA +GCGATCGAAACCATCGAGACGATTACTCGTGGTCAGCACCGCGCAACCAACATCCTGATC 
+CCGCCTTCCATGCGTAAGGTTTTGGCGATTCGTATGCCTGAGACAACCATGTCTTACCTG +GACTATTTTAAGTCTCAGAACTCCGGTATCGAAATCGACTCTATCGCAGAGCTTGAGGAT +ATTGACGGCGCAGGCACCAAAGGCGTACTGGTGTACGAAAAGAATCCGATGAACATGTCC +ATCGAGATCCCGGAAGCATTTAACATGCTGCCAGCACAACCGAAAGACTTGCACTTTAAA +GTGCCTTGCACCTCTAAGTGTACTGGTCTTACAATTTATCGCCCGATGACTATTGTCTTA +ATCACTGGCGTGTAATATTATAGGGGCTAACTTAGTTAGTCCCTTTTTTTATTGGAGAAA +TCAAAATGGCTAAAGAAAAAACTGTTGTTATCGTAAACGTTGGTGTAGCTCTTCAGATGT +TCCGTCTTGAAGATGGTTCCTTTGCTAAAGTTCTTCCAGATGAAGAGGTCACGCTTCCGG +CGTCCGTTCTTGATTTACCTGGTCTGCGTTGCTTAATTGCTCGCGAAGAAATCGAAGTTA +AAGACGACAGTGCAACCAACCGCAAAATCCGCGCTGAAATGGCAAAGATCACGAAGCCAG +ATCCGTGGGATAAAATGAGCGTAAAAGAGCTTGAAGACGGCGGCGAATATTAATCATCAA +GGCGCTCATGTAGCGCCTTTTTTTATGGTGGTAAATTATGAATCAAGAAACTTTAATTGC +AGTTGTTGAGCAAATGCGAAAGCTGGTTCCGGCACTTCGTAAGGTTCCAGACGAAACGCT +TTATGCGTGGGTAGAAATGGCTGAGCTTTTTGTATGCCAGAAGACCTTTAAAGACGCATA +CGTCAAAGCGCTCGCTCTTTATGCATTGCACCTTGCTTTCCTTGACGGGGCGCTAAAAGG +TGAAGATGAGGATCTGGAATCGTACTCACGACGAGTTACGTCATTCTCCCTGAGCGGTGA +ATTTAGCCAGACTTTCGGAGAGGTTACAAAGAACCAGTCAGGAGACATGATGCTTTCGAC +GCCGTGGGGTAAGATGTTCGAACAGCTTAAAGCGCGACGCCGTGGTCGATTCGCATTAAT +GACAGGACTCCGTGGAGGATGCCACTAATGAACTACTCACAGATTGAAAGGATGGCTCGC +AAAGGTGTGGCTTTCTTCACCGATCCGTCAAGACCTATGAACCTGATAAAGCAAGGTGAA +TACGGATATGATGAAAACGGATTCGAGATCCCACCGATGGAACAGGTTATTCCAATATCC +GGCGCGACGCGAAGACCGAACGCGCGTGAGATTGACGGGGAAACCATCCGCGCCTCAGAT +ATTTTGGGGATCTTCAATAATGATCATGAAATAAACGAAGGTGACTATATAGAGATTGAT +GGCATTCGTCATGTTGTCGTTGATGCTCGCCCGGTTCAGGCGTCACTGGAACCAGTTGCC +TATCGTCCAGTATTGCGGAGGGTATCAGTCGGTGGCTAATTATCAGATTCGTAGATTTCA +AGGCGAGATTGATGCGTGGATTAATGCCGCTGAAAGCACGTTAGAACATGCTATTGAGAT +ATTCGTAAGGGATGTTCACGACGCTCTTGTTAGCCGCTCCCCTGTTGATACAGGTCGATT +CAAGGGTAACTGGCAGATAACTTTTAACGAAATCCCTAACCACGCATTAAACCGATACGA +TAAAACTGGCGGTGTCGTCAGGGGTGAGGAACAGGCAAAAACTTATGGCATGTTCAGCCG +TGGCGGCGCGATAACATCCGTTCACTTTTCAAACATGTTGATTTATGCAAACGCTCTTGA +GTACGGTCATTCACAGCAAGCACCGAGCGGCGTTGTCGGTCTTGTGGCGTTAAGGCTTAG +ATCATATATGGCTGACGCAATCAAGCAGGCAAGGAGACAGCAAAATGCACTATGAGTTAT 
+CAGCGGCGGCGCGAGCCGCTTTTCTATCAAAGTACAGAGACTTTCCTCACTACATGGAAA +ACAGAAATTTCACACCGCCGAAGGATGGCGGGATGTGGCTGAGGTTCAACTACATTGAAG +GGGATACGCTTTATCTATCCATTGACAGAAAGTGTAAATCTTACATCGCAATCGTTCAGA +TCGGCGTAGTGTTCCCTCCAGGCTCCGGCGTTGACGAAGCAAGATTGAAAGCAAAAGAGA +TTGCTGATTTTTTCAAAGATGGTAAAATGCTTAACGTTGGTTATATTTTCGAGGGTGCAA +TCGTGCATCAAATTGTTAAACATGAAAGCGGGTGGATGATTCCGGTTCGCTTTACAGTAC +GAGTAGACACAAAGGAGACTTAATTATGCACTTACCAAATGGCGCACAAATTTTCGTGGA +AACCTCTCGCGGGGTAGAGGTTGAGGCAACCGCTATCACTAACGCAGAAAATCCTGTTGC +TACAGTTGCATCTAAGGGTGACTTGGCAAAAGGTGATTACGTTATTGTAACTCAGTCAAC +TTGGGCAAAGATGGTTAGTCGAGTGCTAATTGTTACTGACGCTCAGGAAACAAGTATCAC +TCTTGCTGGAATTGACACCTCCGATACTCTTGTTTTCCCGGCTGGCGGCACGATGAGCTT +TGCAAAAATTACTGGCTGGACTGAGATCCCTTGCGTACAGGAGATTGGTCAGGACGGCGG +CGAGCAGCAGTATTACACTTATCAGTGTTTGTCCGACGATAAAGAGCAGCAGATCCCAAC +GTTTAAATCTGCGGTCTCGCTAACTTACACCTTCGCGCACGAATTTGATAACCCGATCTA +CCCGATTCTGCGCAAGCTGGATTCGTCTGGTCAGGTAACAGCGGTTCGAATGTACGTTCC +GAAAGCGAGCGAAATGCGCATGTGGGCTGGCATCTTGTCGTTTAACGATATCCCATCCAC +GCAGGTTAACGAAATGGAAACGGTGGAACTCGCCGTATCCCTGAAAGGTGACTTTACTTT +CATCTCATCCACTCTGGCATCGCCTGGTGCTTAAATACCATCCACAGGGGGCTTGCACCC +CCTTATTCATTTCTGTAAAATCATCTTATCAACTTTATTCGATTAACTTTTAACAAAAAG +TGCTATCAACCAATCAGGAGAAACATCATGGCTAAATTCAATTTCGTGTTGGGCCAGCTT +CCAGACTTCAAACTTCCGGTGACGTTCACCATGCCAAACGGCGAGGATGCGACTATTATT +TTTACAGTACGCCACCTTTCCAGTAAAGAAGTGCAAGATATGTATGCGAAGCAGGGCGAA +ATGAATGATAGCGATTTCATCACTAAGATAGCATCAGGATGGAATCTGGAAGAAGAATTT +AACGAAGAGAATACGCGTAAGCTGGTACAGTATTATCCTTCCGCAGCGTACAATCTGACG +GCAACTTACATCAAGGCGCTCGCCGGACACCGCGCAAAAAACTAAAAAGGGCGGTTTATC +TGTTATATCAGAAACCGCCAACAGAAGAGCAATTAAGATCGGTTGGCCTCAGTCTTTCTG +ACTATGAAGACGAGGAACCGGAAACGATAATCGGCGATGCTGAAATGGTGAAGGCGTGGA +ATGTTTTTACGTCAATGCTCACTCAGTGGAGAAGTTCAGGCGCTGGAGCTTATGGTCTTG +ACTATAATGTTTTGCCTATGTTGTTCAAAATCTATAAAATAGAAGATGAAGAACTGGCAT +TGCAGGACGTTAGGATCATGGAAGCGAAAGCGCTTGAAATGATTGCTAAGCAAAACAACT +AAGCCGCCGTTTGGCGGTTTTTTCGTATATAGGGGGGGTTATATGGTTGATAAGGTAGCA +GGTCTATCTCTTGACGTTGACGTGTCAACAGTTCAGCGCGCTGTCAAGTCACTGAAAGAG 
+TTTTCAAAGGCCAACGATCAGGCCGCTGATTCTATGGGTTCTTTAATCAATGAGTCAGAG +GTTGCAAAACAGAAGGCCAAAGAACACGCCGAACAACTCAGGCGCCAGAGAAAAGAGTAT +GAGGCCGTGGAGAAGGCAATCGATCCTACAGTATCAAAAATGGAAAGGTTGAAGATTGCA +TCTCAGCAGCTTGATAAACTCTGGCAGCAGGGAGTCGTTCCAGATGAGACTTTTTTCCGT +TTGGGTGAAATGCTGGATCTGCAAAACGCAAAACTTGCTCGCAGCCGGGCCATGCTGACA +GAAGAAGGGCAGGCAGCATTGCAAGAGGCGAAAGCAAAAGAGCAGGCGGCAGTGCGTAGC +AAGGCGTTCATGGATGCCCTGAATGGTCAAGTTAACGCGATCGGTAAGACTCATGCTGAA +TTGATGGAACTGAAAGCGGCTGAGCTTGGTTTATCGAAAGAAGCAGCACCACTAATCGCA +AAACTTAAAGATCAAGGCCGGGCTATGAATGCAGCAGGTATTAGCGCCGGGGAATACAGA +CAGGCAATGCGAATGCTTCCTGCGCAGATCACAGATGTCGTAACATCTCTTGCATCCGGT +ATGCCAGTATGGATGGTTGCTATCCAACAGGGCGGTCAGATTAAGGATTCGTTCGGCGGG +ATCGGTAACACGTTTAAAGTGTTGCTGAGTTATATTAATCCGGTCACGGCAGGTGTTGGC +GTTCTTGTTGGTTCGTTAGGTATTCTAGCGAAAGCTGGTTATGACTCTTACAAATCAATA +ACTGATATTCAGAATGCGCTTATTGAGACTGGAGGTTATGCAGGTGTTACGGCTGAAGAG +CTTGATTCAGTGTCTAAAAAGATCGCGCAGACAAGCAACTCAACCATTGGGAGTATTCGC +GAGATTGTAACGGAGTTGGCGAGTTCTGGTAAGTACACCCGCGAGCAGATCCAGAACATC +ACTAAGGCTACCGCAGAGTGGTCAGCGTCAACAGGAAAATCAGCAAGTCAAATTATTTCT +GAGTTCGAAAAAATAGCAAGCGATCCGGTAAAAGGACTGAAGAAGTTAAACGAGCAATAT +AATTTCCTTGAAAAAGGGCAGCTTACCTATATCGATACATTAAGCCGGACGAAAGGAGAA +ACTGAGGCTGTATCAGAGGCTACAAAACTATTCGCAGACGTAATGGAAAAGCGAATGAAG +TCGATCGCGGATAACGCTACTCCTCTGGAAAAGATGTGGAGCGATATTAAACAATGGGCT +TCGGACGCGTGGGGATGGGTTGGTGATCATACACTCGGGGCACTAAACCTGATTATCGAC +GTTGTTCAGGGTACAGTGATTCAGGTTAAAATGATTCTTGCGAAGGGTGATGAATACATC +TCAAACTTTATCGCCTCAGCCATAAAGGCAACTCAGTCACTGCCTGGAATGAGTGACTTC +GGCGCTGATGTACTGAAGGAGCAGGAGAATATTGTAAAAAGTTCTCGCGACAACTACGAT +CAGTTAGCTTCAGATCTTGACGCTATTAACGCTCGTGTAGAAAAAGGCGAGATGGGATAC +ATTGAAGCAATGAGGCAGCGCCGCACCCTTGAAAAGCAGTACAGTGAGGAAACTAAGGAG +GCAATAAGGAAAGAAGCAGAAGAGATCGAGAAGCGAAACCGAGAACGAAATAAGCAGTCG +AAAATTGTACGATCACCGACAGAGCAATTCGACAAGGAGTTAATTTCACTCAGGGCTCAA +CTTAAGGTATTGCAGGAGCATAAGGAGATCGGTCAAAAACTATCAGCACAGAGAAAGGCG +CTGTTTACAACTGAGGCTACGATCGCTGTTCTTCGCGAAGCTAGTTCTAAGCGCCAGTTG +TCTGCGGAAGAAAAAGCGTTGCTGGCAAGTCAGGAGAGAGTTATTGAGCTTGCGAAACAG 
+AAGGCCGAGATTGGCGATCAGATTGTTAAGCAACAGCAGTTGAATGATCTTACCGATAAA +TCTCTGAAGTTTGTCAATGAGATGACGGCGGCGACGGAACAGCTTAACGCGTCACGCGGT +CTTAGTACTCGCGACATGGAACGACAAGCTGAACTAGCTAAAATCACCACTGATTACATC +AACTCCGGCGGCAGCGAAGGAGACGAGAAACTTCAGAACATGATTAAGGCGCAAAATGAT +TACTACGCTGCGGAAGATGCCAAGCGAGCTGACTGGTTGGCAGGTGCTGAAAGTGCTTTT +GCTGATTACGGTGATGCAGCAATGGATATGTACGGCAATGTTAACGAGATCGCGTCAAGT +GCCCTTAACGGAATGTCAGATATGATGGTTCAATTTCTGACCACAGGAAAAGCGAACTTT +GAGGACTTTGCGAAAAATATCATCGGCATGATTATAAAGATGATTGCTCAGATGGTAATC +TTCAATACGATCTCAGGCATGATGGGCGGTAAGACGTGGAGCTTTGCTGGAGGGGCGTCG +TCTGGTGCTTCTGCGGCATCACAGGCAACCCCTACACCTGCTGCTTCTGTTTTTAGATCT +GTATCTTCCGGCGGGGCCGCTGTATCACTTGCTGCCGCAGCGGGTAGCGTGGCAACCTCT +GGATTCAACGCATCAAACTCGGCGCCAAAGGTGGTAAACCATTCAGGAGGTGGAACGGTC +GTTGACGTTAGCGGAATGGAGGTGAAAGTTGACAACGGTTCAGATCCGAGGGGGATTTCT +CAGGGCGTGGAAATGATGTTCAAAAAAATGATTCGTGAGTCTTGTTCGCAGGGCGGCGAG +GTTTATAATTACATTCAGGAAAAAACAGGAGGCTAATAATGGCGACACTTGACACTTTTG +GTTGGTGTACGCAGGTTCAAGGGGGCGGTGGCTCCCTTACCACTACCAACAGCGACCGCT +CTATTCAGTTCGGTAACGGGTACATGCAACTTGCATCATCTGGATTTAACACCACGCGGC +GTGAATATTCAGTCGTCTATGCCGGGGAAGATTTCATGGCTGTTTACGACTTCTGCAACT +CTCACCGCATTAAGCCGTTCGCATGGACGCCGCCGGACGGTAAGATCGGGATATGGGTAG +TAAAGCCTAACAGTTTGGGAGCGAAGCCAGTATCGCGCGACGTGATGGAGATTAACGTCA +CGTTTATGGAGCAATTTACATCTATGGAGTAACGCCATATAACAAAAGCCCGCCTTGCGC +GGGTTTTTTTGTAGCTGTAGAATGGTTGCAGGTAAACAAGAGGAAAAATCAATGAGCGAA +AACAAAAAACTTTATGATGAAGAAAGCGGAAAGAGCCTGTTTCACAACTGCCTTCAATCA +CTATATCCGGGAGAGATAATCACTCTCATCGAAGTTGATGGTAGTAAGTTCGGCGCTCAG +GTGTACCGATTCCACGGTGAGAATATCCAGTACACTCCAGAAGAAATCATGCAGGCCCAG +CAAACTGGAACGCTACCGCCGAAGGAAATTACATTCCGTGGCGAGAAATACGGGGCGCGA +CCGTTCGGTATATCCGGGATCTCGTTTGACAGTTCCGGGAAGGCAACAAAACCACAATTA +ACGGTGGCAAACATTGATAGTCGCGTATCTGCGATGATTCGTGCATATAACGGACTAATG +CAAGCTAAGGTGACTATCTGGATAACTCAGCGTGAGCTTATTAACTCCGATGGCTCAATC +GCTGATGGAGCTTACCGTAAACTGGTATACTATATCGAGCGTCCGAACTATGTTGATAAA +AGCGTTGCGCGGTTCGATCTCACATCACCTTACGATATGGACGGCATAATGATCCCGTCT +CGACTCACGCAAAGCGTATGCTATTTTGCACAACGAGGGTGGTATAAAACAGGGAAAGGC 
+TGCGGATACAACGGGCAAAATGGTTACTTCGATAAAGACAATAATCCTGTAGACGATCCG +TCGCTGGATTTTTGCCCGGGAACGGTAACGGCCTGCCGCCTGAGATTCGGCGCAAACAAT +GAATTGGATTTTGGCGGTTGCGCTGTCGCTTCATTACAGAGGAAAAATCAATGATTAGTG +CAAAAATTAAACTTGAAATTATGACTCACGCTCAGGAAGAATACCCCCGCGAATGCTGCG +GGGTAGTCACCCAAAAGGGCCGCGTGCAAAAATACCATCGCATTGATAATGTGCATCGTG +ATCCCGAGAATCATTTCATGATGGATGCTGTACAATACGCTTGCATTGAGGACGATGCGG +AATCAACAACAATAGCAATTGTTCACAGCCACACAGGAGACGGGGCTACAACTCTACCAA +GCGCTCACGATACGTGCATGTGCAACGAGATGGAAGTTACCTGGATTATTGTTAGCGTGC +CGGAAGGGGATATGCGATTTGTGAAGCCGGAGAAATTGCCTCTGATTGGTCGTCCGTGGT +CATTAGGATCATTCGACTGTTACGGTCTTGTTATGGCGTGGCACAAAGAGCACGGCGTAG +AATTGCGCGATCGCCGATTGAATTTTGAATGGTGGAAACCTGAGTACGGAATTAATCTCT +ATCAGGATTATTACAAGCAGGATGGCTTCGTTGAGATTCCAGATCAGAATAATCCGTCAT +TCGGTGATATGGTAATCATGCAGATAGGGCAAAACGTTCCGGTATGGAACCATGCAGGGA +TTTACCTGGGAGATAATCAGATCTTGCATCATGCCTTCGGCAAGCTATCTCGTCGTGATA +TTTATTCCGGATGGTATCAGGATCATACTGTTTTAATCGTTCGCCATAAGGATCTTAAAT +TATGAATGATGTAAAAGTAATTAAATTGTCAGGTTCACTTGGGAGACGCTTCGGCGTCTT +TCACCGTTACGCTGTTGACTCTTACCCGGAAGCCATACGGGCGCTATCCAGTCAGGTTGA +CGGATTTAAAGAATACATGCAAAGCGAGGTAGGATCTCGTAGCAAGTTTGCAATATTTGT +GGATGGCGTTAACGTGGGACACCATGAAGAGGAAAAATTCAAGTGCGCGAAAGAGATAAG +AATCGTACCGATCCCTACTGGCTCTAAGACAGGAGGTCTATTTCAGGTTGTATTGGGCGC +GGCAATAATGGTTGCAGCATTCTATACTGGCGGCGCGTCTCTGGCTTTAATGGGCACAAT +GTCCTCGTCTCTGTTTATGATGGGCGGCGCTATGGTGCTGGGCGGCGTGATGCAGATGAT +TTCACCGCAGCCGGGTGGCGCAAACTTTGAAGTTCAATCAAGCAAGAATAAACCTTCGTA +CGCGTTCGGCGGTGCTGTCAATACGACGGCGGCGGGATACCCTCTCCCGGTCCCGTATGG +ATATCGCGCCGGAGGTGGGGCAACTTTCTCAGCAGGTTCTTATGCCGAGGATATGAGTTA +AAATTAACCCGCCTTGCGCGGGTTTTTTTTCGCCTGTATAATGAGTCCACCGATAAATAG +CACAAAAAGGTAAACATCATGATTCAAAAAGTGATAAGCGGATCTAAAGGTGGGTCACAG +AAGCCTCATAACCCAGTTGAGATGGAGGACAATCTAATCTCAATCAACAAAATCAAGATC +CTGTTAGCTGTATCTGATGGTGAAATTGACGAAACATTCAGCCTGAAGCAGTTGATGTTT +AACTCAGTCCCGGTGCAAAACGAGGATGGCTCATTTAACTTCGAGGGAGTAAAGGCAGAG +TTCAGACCGGGGACGCAGACTCAGGAATATATCAAGGGAATGGAAGATAGCTCTAGTGAG +GTAACTGTAAATCGTGAGGTTACTACCGATAACCCATACACAATCTCAGTAACCAACAAA 
+ACGCTGTCGGCAATCCGTATCAAAATGTTCATGCCTCGCGGCGTACGAATTGAAAGTAAC +GGTGATAAAAATGGCGTAAGAGTTGAGTATGAGGTGCAACAAGCTGTTGATGGCGGCTCG +TTTGAGACGGTGCTCACCGATGTAATCGAAGGCAAAACAATGTCAGGTTACGATCGAAGC +AGACGTGTAAACCTACCTAACTTCAACAATCAGGTGATATTCAGAGTGGTTCGGAAAACT +CCAGACTCTAACGACTCGAACGTTGTTGACGCGATTCAGGTAAAGAGCTATGCCGAGGTG +ATTGATGCCAAATTCCGTTATCCGCTGACTGGTCTTCTTTTTGTCGAGTTTGATTCGAAG +ATGTTCCCAAACCAGTTACCTACGATCTCAATTCGTAAGCGCTGGAAGATTGTAAACGTT +CCGTCAAACTATGATCCAGAATCACGAACTTATAACGGAAATTGGGATGGAACTTTTAAG +AAGGCATGGACGAATAATCCGGCCTGGGTGCTTTATGACCTGATGATTAATCAGCGTTAT +GGCTTGGATCAGAAAGAGCTTGGTATCGCTGTAGATAAATGGGCGCTCTACGAGGCTGCG +CAATATTGCGATCAGATGGTTCCTGATGGGAAGGGCGGGACGGAACCTCGATACCTTTGC +GACGTGATAATCCAGTCTCAGACTGACGCTTACAAGGTTATCCGAGATATTTGCTCAATC +TTTCGTGGTATGAGCTTTTGGAATGGTGAGAGCATTTCGGTAATCATCGACAGGCCGCGT +GAACCTGCGTACATCTTCACTAACGACAACGTTGTTAATGGTGACTTCTCCTACACGTTC +GCAAGCGAAAAGAGCATGTACACGACGTGTAATGTGATGTTTGATGATGAACAAAACATG +TATCAGCAGGACGTTGAGCCAGTATTCGATCGTGAGGCTACTCTACGGTTTGGGAACAAC +GTTACGAGCATTACAGCGATCGGTTGCACACGTCGAAGCGAGGCCAACCGACGCGGGAGA +TGGATTCTGAAAACTAACCTCCGCAGCACTACGGTAAACTTCGCTACCGGGCTTGAGGGC +ATGATCCCGACAATCGGAGATGTTGTGGCAATAGCTGATAACTTCTGGTCAAGTAACTTG +ACAATGAACCTGTCAGGGCGTTTGCTCGAAGTGTCTGGAAGTCAGATTTTCTTGCCGTTC +CGGGTGGATGCACGCGCTGGTGACTTTATTATCGTAAATAAGCCAGATGGCAAGCCTGTG +AAGCGCACAATCTCAAGTGTTAGTGCGGATGGTAAGACTATAGAGGTTAACATTGGCTTT +GGCTTTCCTGTGAAGCCTAACACGGTATTCGCTATCGACCGCACCGACATTGCGTTACAG +CAGTACGTCGTGACAAAAATCGATAAGGGCGATGATGATGAGGAATTTACCTACAAAATA +ACGGCGGTGGAGTACGATCCTAACAAGTATGATGAGATTGATTACGGAGTTAACATCGAC +GACCGACCGACGAGCATCGTTGAACCAGATCAGATCCCTAGACCGAAAAATGTGCAAGTA +TCCTCAGAGTCGAGAATCGTCCAGGGGATGAGCGTAGAAACGATGATTGTTAGCTGGGAT +AAAGTTCCGTACGCTGTTTTCTATGACGTCCAGTGGCGAAAGGATAACGGCAACTGGCAA +AATGTACCGCAGACAGCAAACAAAGAGGTATACGTTGAAGGTATTTACGCTGGCAACTAT +CAGGTTCGCGTTCGCTCAGTCGCTGGTTCGGGCACGACTTCAGGCTGGTCAAATATCGTC +GCGGCAACGTTGACGGGTAAACAGGGTGAACCGGGCCGACCGATTAACCTCACAGCTACG +GATGATGTTGTTTTTGGTATCCGTACAAAATGGGGGTTCTCTGATGGTTCTGGAGATACA 
+GCCTATACAGAGTTGCAACAGTCACCGGATGGAACAGTGGATAACGCAAGTTTGCTTTCT +TTGATTCCGTATCCGCAGCATGAGTATTATCACTCACCAATGCCTGGAGGGAATATTGTT +TGGTATCGGGTAAGGACGGTTGACAGGATCGGTAACGTGTCTCAGTGGACTGATTTTGTC +AGAGGTATGGCATCAACAAACGTTGACGATATCATTGGGGAGATTTCTGTTGATATCGAA +AACTCACCTGGTTACGAGTGGCTTGTTGATAACGCAACAGACAACGCGGCGCAGAACTCA +GCTAACGCAGAGGCAGCAATAGAAAACGCGCTCGCCAATGACAAAGACGCGATCTACATG +AAGAAGGAGAACGGAAAACGAAAAGCTGAGTACACGAAATCACTGAAACTTATTGCTGAT +GAGACGCAGGCACGAGTGACGGCGATCGAGCAATTGAAGGCAAGTTTTGGCGATCAGATT +AGCGCTAGCAACAGCGAGCTGCGTGAGGTTATCGCAACCGAGACTGAAGCACTATCGCGT +GAGATTGACCAGCTTAAGGCTCAGATTGGTGACGATATTCAGGCAAGTCTGACTGATATT +CGGGAGGTTATCGCAACCGAGACTGAAGCACTATCGCGTGAGATTGACCAGCTTAAGGCT +CAGATTGGTGACGATATTCAGGCAAGTCTGACTGATATTCGGGAGGCTATCGCGAACGAG +ACTGAGGCTAGAACGCAAGCTGACTTAACACTTAGCGCGCGGCTTGGAAATAACGAGGCG +GCACTTGCTCAAAAACTAGACTCGTGGAGTAACGCGGATTCTACTGGTGCAATGTACGGT +GTCAAGTTGGGTCTGAAATACAACGGCCAGGAATACAGTGCAGGCATGGCTATGTCTCTA +GTTGGTTCCGGAGCTGCGGTTAAGGCGCAGATTTTGTTTGAGGCGTCACGATTTGCCATC +ATGACTGGAATGAATGGTCAGACTCAGTACCCCTTCGTTGTTGAGAATGGTCAGGTTATT +TTAAGTAGCGCGATTATCAAGAACGGATTCATCACCAACGCAATGATTGGAAACTTCATC +CAGTCGAATAACTATGTATTTAACCAGTCCGGATGGAGGCTTGACAAGGGTGGAACATTC +GAAAATTACGGAAGTGACGGTGAGGGTGCAATGAAGCAAACTAATACCACAATATCTGTT +AGGGATGCGAGTGGTCGCCTGAGGGTTCAGATTGGCAGGTTGACTGGCTCATGGTAATAT +CAAGGGCATCGAGAGATGCCCTTTTCTTTTGGAGGATTTATTATGGCGTACGGTATATCA +ACTTGGGACGCAAATGGCGTTTATAATAACTATGGAATTAAGCCTATTACGGTTGTTGGT +TGGAACTTTTTGTCAGCAGGCCAGAATTCAGCATCGTTCAGCTATCAAGTGCCTCCTGGT +ATGCATGTGAACTACGTTATAAGCCTTGACGATGGCGCCATTAGTGGGCCTGGCAGGAAA +ATTATTGCTAGCGGTAATACGATAACAGTAACGCCAACAAACTCACCTGGGCCAAACGTG +TACCCATCATCAAACTGTTACTTAATAGCATATCTGGAGAATGATTAATGTCATACGGTG +CTTTTATAGATGTAAACGGAAACCCATTCATAACCCCGTTATCCACGCCATTCGCTTTAT +ATGCGAGAGGGGAAATTCAATCAGTAAATGTTAGTGGTTCACAGGTTGCGGAGAGATACG +TTCGGATACCTACAGGTGTTCCGGTTATAGCTTTTTGCAAAACAACAAACACGCAGCAGG +GGACCGCGCTTTCAGCCTTTACTTTCAGAAGCGGACCCAATGTTGGAACTGTTTATATAA +GGGGGACAAATCCAGCAAACCAATCATACACGCTAACATACTACATATTTGCCATATTTG 
+AGCAGTCACTACCGAGATGGGGTATGGCAATATGGGATGCGTCAGGAAAGCTAGTGCTGA +CAAATGAGACAAAAGTCCTTAGTGATTTGGTTACAATCGGCACTCCTGGATACGCTGGCG +GTGGATTAAACATAGACACAACACTCAGCGGAAGCTACGCGGTTGTTCCAACTATACTTG +GCAACTATCAAGTTGTTATTGGAAGGTTGCCAACTGGGCAACCAATAATAGGAAACTCAA +CAGCAGGCAGTTCATGCAGGTACAACGGGAGCACAACGAGAATAAATGCAGCAGCAACCA +CTGCGGCAGGTCAGATAATGAACACAACGAATAATGGAAATATTATAACAGCAATCAAAA +CGGCAGCATACGACTAAGCCCCTTGCGGGGCTTTTATTTTATATCGAGCAATCGTGAGAT +TTGAAGTTTTTCTCTGATACATAGTTGAAGGAGAATGGGTATCCAGCACGCAATACCATC +TCCTTTCCGCGCATCTTAGATCCAAAAACGTAAACCGAGTACTCCGCGCCGCCTGATTCA +TAAATTGCCGTGCAAGTGCGCTCAGGCATCGATGAGCAACCAGTCAGGATGAATGCCGCA +GCGATAATGGTGATTAACTTTTTCATTTGTATGTCCTCGTCGTTAGTGTGATTGCATTGT +ATGTCGCATTTACTTTTATTGCAATAGAGCGATTACAATTTTTTTCGTGTAACAGGCGTA +TATTTTTGTAACCGGAATGGGTGTTACAAAATCGCCTCCATCCGACCGCAGGGAGATATG +ATAAAAACTCTATATAATATATATAGATAGATAATATTTAATTTTAGCTTTATATATATA +ATTATTGTTGTGTAACAGTTGTATATCGTGTAACAGGTGATTTGATTGATTCGTCAAATT +TCTATCATGTATGTTCAAAATTTAATCAATCTGGATTCTATTTGTAGGTATCTCTGTATT +TCTAATAAAAAGCGGTTACAAGTGTTACGCAATAGACAGCGCATAAAAATCTACTTAAGC +CATTGATTCTGTTGATGCTGGTTGTAACTTGAGCAATATAGACACGCAATTACACACTGA +TTACATGTATTCGATTGACTAAACGCTGTTAATGGCTATAATGGATTCATCGTAAACGAA +GGAGATAAACGCAATGTTCCAGGTATTCACATCAAGCCAGCTTTCTAACGACGAGTATCA +TAGAAACGAAGGTTGGGCGTCAGAGTATGTAAGCGGATCGAGTCTTGCAGAAATTTATCA +GACCTGCCCTGCTAACTGGAGATTCAAGAAGAACGAGACAACGAAAGCTCTGGAGTTCGG +TACTCAGTCGCACACCAACTTTGAGAGTCGAGATCTGTTTACTGCAACGTATGCTAGATG +CCCTGCTCCGTCAGAGTTTAAGGATCTGATTACTTCGCAGGCGGCGCTGGCAGCAAAATT +AAAATCATTCGGCCTGAAAGGTACATCCGGTAAGCAGTACCCGGACCTCATCAAAATGAT +GGTTGATTGTGGTGAAGAACTCAACGTTCAATACCTGATTGAACTGATCGCAGAAGCTGA +GGCCCGTGCTGAAGGAAAACAACTTGTTGACGCGGACAAATACGACGCTTGCATGAAGAT +GAGAGCCATCCTTGAGCAGAATCCCGATCATGAAGCGTGCATCAACAGTGAAACGGCGCA +GCGTGAGATTTCAATCTTCGGTGAGATATCCGGCGTAAAAGTTAAGGTTCGACTTGACCA +TCTGGACTACAAAGAGAATGTTCCAGGTCGTGTCCTGACTGGTTATGATGAGAATGGCGA +TCCGGTATTTGAAGACGTAATTTTCCCGGAAGCACTGATTATCACAGATTTCAAAACTAC +GATGAGCGCCAACCCGTTAGAGTTCCCGAGACTGGCATACAATCACGGCTATTACCTGAA 
+GATGGCATTGCAGCATGACCTGCTACGACGCGCAATCCAGGCTGGAGCTTTTGAAGGTAA +CTTCCCGGAAGACATTCCGATCGTGGTTCGATTGCTTGCGCAGGAGAAAAAAGAGCCTTA +TATCGCACTGGCTTACCGTATGACTATGGAGCAAATCAGGATAGGTCGTAACCAGTACAT +TAGCGTAGTCCACACTTACAAGGCTTGCTCTGAAATGGATGTTTGGCCTGGGTACGCTGG +CGACGCAAGCGAGATCGAACTTGAAACGCCATCATGGGTGCGTTACCAAAATAAGTAAAC +GGCATAAATAGCTAAACAAATAATTAATGAGGTGTTATAATGCACCTCATACACCAATCA +GGAGAAGTTAAGATGCAATTATCACCAGAAACAAACGAAATCCTTCCCGCACTGTTCAAT +GCTCGCAATAAATTTGCTAAAGCAAAGAAGGACGCAAAAAACAATCACCTGAAAAATTCA +TACGCAACTCTTGATGCAATGATGGCTGCGGTTAGTCCGGCGCTAACCGACAACGATATT +ATGATCCTGCAATCAATGCTGGACACCAGCACTGAAACAACCTTCCATCTTGAAACGATG +CTGATTCACAAATCCGGGCAGTGGGCCAAATTCTTCATGATGATGCCGATTGCAAAGCGC +GATCCGCAAGGCGTAGGTTCTGCAATGACGTATGCTCGACGTTACTCATTAGCCGCAGCG +CTGGGGATTAGCCAGAGTGATGACGATGCTCAGCTTGCAGTGAAATCCGTCAAGGACTGG +AAAAAAGAACTTGATGCGTGTGAAGACATCGAGTCACTGAAAGATGTATGGGCCAACGCT +TACCGCCAGACTGACACGGCGAGCAAGTCAATCATTCAGGATCACTACAACGCATTGAAG +GCTAAATTTGAGATCGGTAAAGCTCGCGGCATTCGCCCGGCGCAACCGGAACAGAAAAAA +CAGGTTGAAGCAACAAGCGCGAAGCCTGTACAATCCCAATCAATCACCAACTTCGAATAA +TCATCAGGGCGGCTTAGGTCGCCCATAAAATTTAGGAGAGAAAAACATGCATATTATCAC +TGGCGAGATCCGCAAAGAACCAAAGATTCTTGAACGTAACGGCGGCAATACTTATATTAT +CGAACTGGCAGAAAGCTATAAGCCTCGTGATGGCGATCGCGAATACACCAACTACACGTT +CTTTTTTAGCGACGGTGGGAAGCCAGGCCTTGCTGACTGGTATCGTGAAGCGTTCCAAGT +TGGTCGAGTTATCTCAGTATCGTGCGAGACGTTGAAGATCTCATCACGCGAACACAACGG +AATGATTTACAATTCATTGCAGGCGGCTGACTTCCCTAAACTAGTATTTAGTCAACGAGG +TCAAAGCAACCAGCAACAACGAGCGCCTCAGCAACAACAGCGTTCTCAGCAGCAATCACA +ACCACAACCAAATCAACAATCAACATTTGACGACGATATTCCATTCTAAAGAGAAGCCCC +GCATTGCGGGGCTTTTTATTACTTCATCTCAATTGCCTTCGGAAATGAGTCTACGTATTT +TTTAAGCTTTCAACAAACTCTCCAGATACCATTCTTTCATGATTTCTTTTGCACTCTTCA +ACCTTCTTGTTGTAAATCGAGTCATTAGGCATCTCAACTCGAACGCTAACAAAAACATCA +CTCGGAATGTCAATAGGATCTCCGTCGCTATACCCATCCTTATGATTTCTTGCGAATGAA +GGCGCGTTATCATGAGTTCTATGATAAGTCTTGATAACCAAAGACCCATCAGGGTTAACA +TCATAATCTACCCACAATATAGGCTGTCCGTTAATGTCTTTTGGTATCTCTATACCACCA +TTAACACCACCCCACGCTAGATCAGAATTTAGTGATAAACACCCCCTTATCAAATACTCG 
+CCAGTAGAAATCCTCTCAACTGTACAACCTTCGCTCTCGTCGTTTGTTCTATGGCTCCCG +TTGCCAAAAATATCAACGATGGGCGAGGCTCTTTTTATAAATCCGTTAGAGTCAACGGTT +GTATTGCCTGTATTCCATATAACCTGGGTAGATTGCCATTGCCCCGCTCCTGTCTGTTTT +CTGAGCCTCACATCCTTATAGTGTGGAGAGGTCATTGATGCGTCTTCAATAACAAAACCG +AAAGCACTACCATCTCCTTGACCGTTTCTATGACGAACAGATATAGCTGTGACCCATCCA +ACAGAATTAACATTGTAACTTGCAAGGCTGCTAGGCTGAGATGATAAGGCGTTAACATAA +TCTACCCAGCTTGATTTCTCGTTAGATGCCGGACTTGATACAACGTTACCGGAAGAACTT +GCCTGAAACGGTATCCACGGAGAAAACTTTGCATTATTGCCGTTTTTCATCCTTATGTAA +AAAGTGCCGCGACCCGCTCTTGCGTATTCGTTTTGAGAATTTCCAGATTCTGGGGCAGTA +CCCAAATAAGTAAAAGGCGTAAATCTCTGAATGCAAGCGCTTGCACCGTTAGCTCCATTA +GGAAGAACCTCAAGCATACCTGCCGTAGCCTCAGGATAGCCCCTACTAGCGGTAGCTAAA +GCACTTGATGATTGCATATATATTCCAGGGCCATCACCTTCACCAGTTATTGAATCAAGA +TGTTTTTCCCCTAAGTTAGTTCTATCAAGTTTGTAAACCTTTAGGCTTGTCCTTGCCTGC +TCCTCGGTAGTTCCACCAGTGCCACCAAAATTAACCCCTAGAGCCTTGTTTTCACTTCCA +TCATGAGTCATGGCGCCCCAGTCGCCGCTATCCTTGACGATCAGCTTAAATCCGGTCCTA +TCGTCATTTGAATAGATTTCCGTTAAATCTCCGCGAGGTTGATTTAATCTATTTACTGCT +AGATTTTTCTTTGATTGCTCCTTGTCAGCAAGGCCAGAAAGATTTCCGTCCTTAGTTAAA +AGATTGTCAGCATTAACGCTATTTGCTGCGTCCTGAGCTTCCTGAGCACTTACCGCTGCC +GCGTCTTTAGCGCTAACCGCCGCATCGCGTGCTGCCTGGGCATCGTTTTTTGCTGAGGTT +GCAGTTTGCGCCGCAGTCTGAGCATCATTAACATAGCCAGATAGATCGCCTTTAGCGTCA +TTGATTGCCTGAATGGCCGCCGCCTCTTCAGAGTTAATATGCGTAACGGCTGAATTTTCT +TTTTGCTGCACATTAGTGATAGCCTTGTCTTTTGCTGCATTAATGCTACCAATAGCGCTA +TCTGAGATTTGCTGCGTCTGGTTTTTTATTGAGATAGACTCGTCACGCGCTGAATTAGCT +GAGTCGCGAGCAAGCTCCGCTTGATTCTGCGCAGCTTCTGCGGCTGATTTGTTTGACTTA +ATTTCTTGAACGATATTGTTCAGGTTGTCCATATCAAGATCGGCAATAATATCCAATGCC +GATGCGATCTCAGTCTCTTTGCTCTGGTAGTACCGAAGCGTCTCAGCGACGTTTTGCGCA +AGACCGTTAACGGTCAATGAATCGTTAAGCAAGATCACATACTTACCGTCAGCGGCAGTC +TGACCATCCGTAGAGATAGCCTTTAGCTCTGTGTCGCTCACAATGTCGCTGATTACCGCC +AGTTTAATTGGTTGCTCCAGGAATACGATAGTTGCACCGACTCGAATCAGAGCAAGCTGA +TCTTTCCATTTTGTATCGGTTCCGTGAACCGTACCGTCTGCATCCATTGATGCAGTACCG +CGTCTATATAAAGCCATAGTATAAACTCCTTAAGTAAATAGCACGAATTGCTAAGCAATG +ATTTTATCATTGTGATACCTGCCTTGCAATGGGCAATAAAAAACCGCCCGAAGGCGGTTA 
+GAGTAGGTCACTTGCTGAGAATAATTTGGCTTGCATTCCTGAGCGGTATCCTCCGGGATT +CGGTATGATTATTTTTAATTGTCTGTCGTCTGCCATGTACAAGGTATCCTTGTCTGAAGG +TTCACACGCCACCCTTACCTCTCGATCACCTTTATACCTGTATGCAACCATTTCGAGGTT +TTCAGGGGTGAAGCAGGCCCATACCTCTTGCTTTGTGTGAAATGCAATATGCTGAGCATC +AATCCAGTTAAGGCATAAGTAGATCGCCTTTTCAGTTTTACCAGTCACGGCAACCGAGCA +ACTCGTGTATTTCTTCGCATAAAATGACTCTCTTCCTTCTTCATCAATAATCAAAATATT +GCAAAACTCATCATCAAGCCCATCCTCGTGCACAAGTTGACACGGTATAGTGTGAAATAC +GCTCTCTCTTCCATCCTCGCTACGCTTTATACCGACATCGAATGATTCGGTGGGTAGTGA +CTCAAACACGCTCAACGGCGTATTTACACGTTTCTCTGTGCGCTCCATAACCTTCATCAC +CGCCTCATGTTCAGCCATCATGACATTGACACCTGACACCGGAGTCCGGCGCGCTTTCTT +GTTTGCCTTGACGATGTATTCCTGCGGAACCTTTCCGAGAAATCTTCCCAGGATGTTTAC +GCATTCGCTGTACGGCATACCAGTCAACTTCATCAGCCAGCCAATACCAGAGTCGTTACC +GCATGAGTTGCAGATCGCACCGCCGTCGCCCGGGGTGTTCAGGTTATCAGTCCAGCGAAA +TCGGTCTTTACCGCCGCAGTTGGGGCAGGGTTGGTGCTTCTTATTAAAAACATTATTCGG +CAATCCGCAGATTGATTGGAAAGCCTCGCGCCATAACCCCTTCATGTACGGTAAAACGTC +CTCTTTCTGAAACATCATAAATTCTTCGTTCACTTCCAGATCTCCAAAATAAAAAACGCG +TAGAAGGATGTTAACCCGCTACGCGTCGTTTGTTTTAACTAAAAATGCTATTGGTCGTTC +CGATATCTTCCCTCTTCACGACGCGCATAAATTTCTTGTTCTTGCATCGCTTCTCAAGGC +ACTTGCCGTTACCGTCAAACTTCAGGTCGAATCGCAACCATGCGGCCCTAAACCCTTTGC +ACCCCTGTCGGCGGTAAGCACGATGCGCAGCTTCAGCGCCTTGCCATGAAATCATATTGC +GTTCTTTCCAGCCTTGCACCGTCTGATTGCTAACCTTCAGAGCCTTTGCGCAGGCCGCCG +GGCCGCCGTAATATTCGATAAGGGCATCAAGTCGCGCTCGCAGTCCGGCGCGCGTTTCTT +CTTTATGAATATAGAACCCGCAACGCTGGCGAGGCTTTTTATCTTTACCGCGCCGTGTTC +CGTTGTTACCATTGATGTGACGTTTATCGATTTCACCAGTTGACTCTGCGATACGTTGAA +TACTCATCTTGATTCCCCTATAGCACTTTTTGCTAAAAACGTTTACTTTATGCCGTGTAT +TATAGCGTAAACGTTACAACGATTCAAAGGATTAATAGCCGTGACAATGAACATTAAAAA +ACAGATTGCATTACTTGGCGATGACTATATAAAGAGAACTCAGGAGCGATTTACTGTTGG +TGAGGTTGTTCCTTATCCGTACCAGGTTGTTGCTTATGCCGAGATCGCGAAACGCCTATC +AAATTACGAGCATCCATTCTTCGTTAAAGCGTCTGTATCCGCAGGTAAGACAATCATCTT +CGCTATGGTGGCAAAGCAGTGTCAGAAAATGGGCTTAAAAATGCTTGTCCTGGCTCGTCA +GGGTGAGATTGTCGATCAGGATAGCGAAGAGATCGACAACTTCGGGGTAACGAACTCCAT +CTTCTCAGCGTCACTTGGAATCAAGTCCTGCTACTTCCCGATCGTGGTTGGCTCAGAGGG 
+TACTGTTGCAAATGGCCTCGACAATGAGTTAGCTGATTTCGTCCCGCATGTAATTGGGAT +CGACGAATGTCACCAGGTGGATTGGGAAGACCTTGCGCAAGCCATCGAGGGTAAGGAAAC +AATGGAACAAATGAGGGGCGAGAAAGGGAAAATTATCATGGACGGAGATATTCCCCTGAT +TGGTAATGATGGAAAGCCTTTGCTTGGAACTAAGCGCAGTCAGTACACGATCGTAATCAT +GGAAATGATGCGGCGCTGTAAAAAGGTTCACGGTCACGATCTCAGAATATTTGGTATGAC +TGGATCTGAATTTCGTGGCGTAGTTCCTATTCTGGTAGAGAATCCGAAAGCATTGGGATT +CTGGCGTGAGCGAGTAACTGATATCGACACAAACTATCTGATTGAGTTCGGCTCTGTCGT +TCCGACTATATTCGGATCAACAGACGGAGTTCATTACGATCTTGATAAGTTCAAGGCGTC +TAGCGAGGACGGAGTGCAGGACTTTACAGAGAAAGACATGAAGGCTATGGAAGATGAGAT +CCTTCATGATAAATCTCTGACTCAGCGAATCATGCAAATGGTCGCCAAAAAGGCAGAAGA +ACGCAATGCGGTCCTGATTACATGTGCTGGTGTGCGCCACTGCAAAGAGGCAGCGGCAGC +ACTTCCTCCGGGAAGCACCTATGCAATTATTACTGGCGACACAGACAACAAAGCGCGCAA +GAAGATTCTTGACGATGTAAGGGCCGGAAAAATTAAATACACCTTTCAGGTAATGGCGCT +CACTACTGGCGTTAACGTTCCAAATTGGGATTTCAGTGTCATACTCCGCAAGATAGGATC +GCTCACTCTGTTGATTCAACTTTTGGGTAGGGGTATGCGACTGCTTAAATCCTGGCAGGT +TGCTGAAGGAATGGTTAAGCAGGACCATCTGGTATGGGATTTTGCAGGTACGATGGATGA +GCTGGGTCAGCTTTATTTCGATCCGATACTTGAGCAGGCGCAATTCCAGAAGCGTTTTGA +AAACGGCAAAGATCCGAAAACATGTCCGAAATGCGGTTGCGTAAATAGCTTCTATGCTCG +ACGATGCGTTAATGTCATTGATGGTGAGCGTTGCGATCATTTCTGGACTTCTCAGATTTG +TGAGGACCAGGTTGACGAGCGAACCGGGAAAATCCTTGTTAAAGGATGCGGTGCAGAGAA +TGACGTTGTTGCGCGAGTCTGTCGTTGTTGTGATGCTTCTCTTGTCGATCCTAACCTGAA +GTTATCCGGTAAGGCGTACACCAAGAATGACTGGTATGAAGTAAAGAATTTTGAGGTTAC +GCTAACCAAAAACCAGAAAGGCATAATATACAAATACACTCTGATTAACGACGATGGTGA +TGAGTTCAAGGCGTATGAAAAATTCTTCCCCGAGTCTGACTCTAAGATTTGCGGTACGCT +ATGGAAAACTAAAGGTGTACTTCCTCATGTGTCAGATCCTAAAATGCGCCGCTACTTTAT +CGGAATGAAGAACGCCATCAAGATTTTGCAATACTCACATCATATTGCTCACCCGGTGCG +CGTAACTCATCGTCGCAACCAGAAGAAAGAAGATATCATCTCACGCAAAGACTTCGGTAT +GGAGGATATCCCGGAATGATTACAGACAAAGGTGATTATTTAGAATTTTACGAGAGAGAC +ACAAGCGACACTCGAAAGGAGGATGCTCATCAGGTGGATTGTGTATCTTGGCTGAAATAC +AATTTTCCTCACCTTCTATTTTGGCACACTGTCAATGAAGGTGAAAAAACAATCACATCG +GCGCTCAGGGATGAGCAGGCAGGATTACTTAAAGGCGTGTCAGACTTCGTTATCCTGATT +GGTGTTAACTCACGATACCCGTTTGCAGCAATCGAACTTAAGCGGGTTAATAAGTCAGGC 
+AAAGGAAAGGCGTCACCAGTCAGCGACAAGCAAAGGGAATTTCTCCAAAAGGTCCGGGAG +CGTGGCGGCTTCTCTGCCGTCGCATACGGATTCGGGCAATTCAAGATCGCAATTTACGAA +ATGATGAAATAGCACTTTTTGTTAAAACTGCCGGGATGGAATCTGGCATTATTATCTCAC +CAAAACGAGAGGAATAAAAATGAAAGACTTTAATGATATCGAAACTATCGACTTTGCAGA +AACTGGTTGCTCATTCACTCGCGAAGCAATAGCATCAGGCGGTTATTATCAGGCATTGAA +AACGCCAACCTGTAAAGAGATTTCAGGGCGTCGATACAAGGGGACAAATACCCCTGACGC +TGTTCGTGATTTATGGTCAACTCCGCGAGAGGTTATTGCATACCTTGAGGGTCGTTATGG +GAAATATGATCTCGACGCTGCGGCAAGCGAAGAAAATAAAGTTTGCGAGAAGTTTTACTC +TCAGGAAACAAACTGCTTAAAACGTTGGTGGGGAAAGAATAAGCACGTTTGGTTAAATCC +TCCTTATAGCCGACCTGATATATTTGTCAAGAAGGCCATTGAGCAAATGGAGCACAACAA +TCAGATCGATATGCTTTTACCTGCAGATAACTCTACTGCGTGGTTTACTGAAGCGCGGCA +GAACGCAGCTGAAATAATCTGGATTGAAGCGGACTTGACTGAGGATATTGACGGCAATGA +ATACGCACGATCCGGTCGCCTGGCTTTCATATCCGGTGAAACTGGAAAGGCCGTAGACGG +TAATAACAAAGGTTCGGTAATTTTTATTATGCGCGAACTTAAAGAAGGTGAGGTGCAACA +GACTCACTACATCCCAATCACAAGCATTTGCCCTTCGGTGAAAAACAAACGAGCAAAGGT +GAGGAAAGTATGATGAGCGAAAAAATGGTTCCTGTTAAATTAACTGAGCAAGGTTTATGG +CTACTTTATCGAGCTACGTGCTGCGAAATTATGGAGCGAAACGGATTGACTCAGGATGTT +ATTGGTTGCGATCTGTGGGAGTTCACTAGTTCTCTTGATATGTCTTTCGATGAGATAAAA +AATGAATACATAGAGAACTGGCCTTCAATCATACAGAAAGACGTGGAAGAACTTAAAGCT +GATACAATCGTACAGCACTAATTGCTAAAACTACCCGGCGAAAGTCGGGTATAGTTATTT +CATAGAAACGAAATGAGGAATCAGAAGATGGCACGCATTAACGCAAACTTTTTCAATATC +GCTCAGCAGTCCGCAAAAATGGCTGTTCATATTACGAACAAGCAAGGCGGCAACTTCGAT +TGGGATATTGCTATGAACTTCCTTAAAATGTCTTATTACCGTTGCTCAGTTGAAGAAGTC +GAAGGCTTCATCTCTGACGTGGAGAAATTAACTAATGCTGATAAAAAAGCAAGGTAAGCG +CGAAGTGTGGGAGCACGCAAAGGAATGCGGCATCTCAGACGATATAGCATTAATTGCTAA +ATACTTTGATATAAAGGATGTTAGCATTATATCAAACGGCAAGATTTCATTTATGGAAGG +TATGCCGAGAAAAATGCAAAGAGTTCCAGCCACTCCATCACTTGAGTTTTACCGCGAAGA +GGGAAAGAGAATTGAGCGAGAAAGAAAATCCACAAAAAACGGCAAGTCTTCCCGGCTTAA +ATATTAATGCGGACGAATACCAGGCAATATGGATCGGGAAAAAGCAGGTTAAGCAAATCC +CTTTCTCTGACTGGTTGCCACCTGACTTTGTTAATGTGCTTTGCACTATCGGTATTGAGC +AGGAGTTGCATATAGGTTACTACTCACCTGGCCGAAACAGTATGATGCTTGAGGTTGACG +GAAAGCTCGTTGAGTTTAAATCTTCAGATCTAGGATTCTGGTTAAAGGCTGTGGCATGAA 
+ACTTTATTTTGCTGTAGTATTAACACCGCTAATTTCATTTTCAGTAATGTATTTCATTAT +CATGTAAGGATTAAAATATGTCACAAGCTAAAATCACTACCGAGCAACTTATCGAAGAGC +GCATGAGCGGCCTGACACTTCGCGAGATCGCGGAAAAGTACGGTATGCACATTCGCACTG +TCGAGGCGCGTCACGCAAAACTTGCAAAAGAAGGCCACTTCCACGGCAACGAGCATGTTG +CTAAGATGGTTCCGGAAGGCTTCATGGTAAAAGGCACGTCAACCATGATTGACGCGGAAG +GTAACGAGAAGATTCGTTGGGTTAAGACATCAGTTGATAATGAGCGCCTTGAGGTTCTAA +TGGAAAAAGCGCGTGAAGCATTCTGTTCAGAGTTGCCTAAGGCTATTCCATCTGAATCAC +CTGACGTTAGTTTTGATGAAGACACGCTTGCGATGTATCCAGTTTTTGATTTGCACATTG +GTGCTCTTGCTCACAAACATGAGTGCGGCGAAAACTACGACACAGCGACAGCAGAGAAGG +TTATGAATGGGTTCTTTGACTACGCTGTAGATAAGGCGCCAAACTCAAAGAATGCCGTAT +TGGTATTGGGTGGCGATTTCCTACATTACGACTCTTTGGAGTCTAAGACTCCAGCGTCAG +GCCATTACTTAGATTCTGACAGTCGTTACGCTAAGCTTGTTTATGTCGCAATCCGATCAG +TACGACGCGCAGTCTCTCGAATGCTGGAGAAGCACCAAGTTATTGATATTAAAGCAATAA +GTGGGAATCACGACGAATCAGGGATGGTTTGGTTGCGCGCTGCGCTTGCTGCATTTTATG +AAGATGAGCCGCGCGTAAATGTTGATGTTAGCCCTGCCGCAATGATGATGACCAGCTTTG +GTAAGACCCTTATTGGATACACTCACGGGCATCAAATGCGAAAAGCAGATACTCGACTAA +GTGTTATGGCAACTGATTTTCGTAAGTTGTTTGGTCAAAGTGATTACGTTTACACGCATA +GCGGTCACTGGCACAGTCAAAAGATTACAGAAACAAACTTGGGTATTGATGAGGTTCATG +GTCAGCTTGGAAGTCCTGACGCATACTCTGCCAATGGCGGTTGGAGGTCTCAGCGTCAAG +CTGCTGTGATTGTCTATCACAAGGAATTTGGTGAGGTTGGACGATTCATTTGTCGACCTG +AAATGTTCTAAATAGCACCTTTTGTTAAAACAGTACCCGCGAAAGCGGGTATTATTGTTT +TATAGAAACAAGAGGAGATTGCAATGAACTGGCACGAGCATTACGAATATAGGGATGGTG +TTCTATATCACAAGGTAAAGCCATGCAGAAGGCATGATGTAAATATTGGGGATGTTGCTG +GAAGGGTTGCCAAAAACGGCTATCACTATGTTGTTCACAAGAACAGGCCGTATAAGAGAT +CTCGAGTTATATGGGAGATGTTTAATGGTGAGATACCAGATGGTTTTGTTATAGATCATC +TGAATCACAATGCCACCGATGATAGGATCGATAACCTTGAGTGTAAGCCAAGAAGAGAGA +ATATGGTTAATGTTAAGTTAAGGATTGATAGCACGACCGGAGTAACTGGCGTATCAAGAA +AGAGGGATAACAAGTGGAGGGCGTACATAACAATTATGGGTAAGCAGAAGTGCAAGAGCT +TTGACACGTTTGAGGAAGCTTGCGCGCAGAGGATTGAATGGTCAGTAACTCATGATTTTC +ACCCAAATCACGGTGGAACATACTAATAGCACCTTTTACCTAACCCGCGCCACAGAAGTG +CGGCATAGTAACCACATCGAAAACAGAGATGCTATATCATGAAGATAGTCAAGTGCATCC +GAAATGACTCCAAAACACTTCCATTCCGTGTAAATCAGATCTATAGTGTTGGTTATGATT 
+TCGGTGGGGGATTATTTGAGATTTACGACGGGCGAGGTTCAGCAATCCAGACTCCTCTGA +ACGGTCACTACCTGGAATTTATTGAGATAGATTAACAATAGCATTCATCACCTTACAGGC +TGGCATGATTTACATGCTGGCCTTTTTGCGTTGTGTCAAATAAATTTGAAGGTTAAAATC +GACTCACTTGTTCAAAAAATATATGGTGAGATTATGAAAGAGTTTTTAACGGCTGCTACG +TCAAGCACTGGCGGTGCTTCGTTGGTAGGGGCGGCGACAGGGCAACTTTATATTGCTGGC +GCTACATTCATTTGCTTTCTGCTTTTTGGTGCCTGGGGAGCGTACTGGAAGTATCGTGAT +AGCAAGGCAATTCAGGAAGCGTTAAACGATGGCGATCTAAATAAGGCGCTTAAGATCAGG +GGGAGATAATGAGTTTAAAAAATAACGTTATAGGCGCATCAATCGGGGCCGCTTTGACGT +TGACACCTACCCTACTGGAACGGATCGAAGGTATAGAATACGAGGTTTATTACGATATCG +CCGGAGTCCCTACCGTATGCAGCGGAATAACCGGGCCTGACGTCATACCTGGTAAGAAAT +ACACTAAGCGAGAATGCGATGCATTGCTGATAAAACATATCGGCGTCGCTCAGCGATACG +TTGACAAGAAGGTTAAGGTTGACATTCCGGTAACTATGCGCGCATCACTGTATAGCTTCA +CTTTCAACGTTGGGACTGGCGCTTTCGGATCGTCTACAATGCTTAAGCTAATCAATCAGC +GCAAGCACAAAGAAGCGTGTAATCAGTTATGGCGATGGGTATACTACTACAACCCAAAAA +CCAAAAAGCGCGAAGTGTCGAGAGGGATCAAGAATCGGCGCGCTGAAGAATACGCATATT +GCGTTAAGGAACTATAATGAAACTTAAGAAAACGTGCATTGCAATTACGGTTGCTGTTGG +TGTGATTTCTCTATCCGGTTGTTCGACGGCATCTGCTCTGAGTGGTTTACTTTCTGACTC +CCCGGATGTTACGGCGCAGGTTGGCGCTGAGAACACAAAACAACTAGCAGGAGTAACAGC +AAAGGCGGATGATAAGCGAGAAGTGAAGGTGAGTGATTCAAATATTGGCAAGATTGACTC +ATCCGTCAAGAAGTCCGTGGAGGTGTCAACCATTCAGGCCAACACGGTTAACGCTGAAAG +CATCACAGTAACCAAATCTGGAAGCTGGTACGATCCTGTGGTTTGCTGGATTCTCGTTTT +TATTGTCCTGTTGCTGTTTTATTTTTTAATTCGTAAGCACGAAAAAAAGGAGGCGTAAGC +CTCCTTTCTTATTTGTACCTTTTGACGTGAAGTAGCAACTCCCCATCCTGATCGCAAAGA +TTGTGCTCACCGTCGTTATTGGCCCTCATCGAAGTGAGCAGTAGGCTAAGTAATCCTTGC +TCAAATTCTTCTTTCGTAAGCTGGAGCCTTGCGCACAATTCGACGTTCCGATCTATCAAT +GTTTCTACGTTCGCCAAACAAGTCTTCATCACTCATTTCTCCAATGTGCATCATTTCCCA +CGTATATCGATTGTTGTAGCCATCAATACACATCAGCTTCATCATTACCGGGCGCTTGAT +CTTTCCCTTGCACCAGTAAAATCCACCCTTGCAATCAAGATAACCCTCAGTCACGCAGCG +TGCGCAAAACTCCTTAGACAGCGCGCTTGTAAATTCACGGCGAGTCATTCCGGCGGCCTT +AGCAAATCGCTCACTTTCCTTGTGGGCGTATATAAATTTCGCTATGTGCTGTCGAGTGTA +TTTGTCGTAACCTTCGCAGAACCTGAACAGATCTAAAAGAAGAAGCATATTATTAACCCA +TCAGTCGAGGATTGATGAAAACAATATCATCAATCCGGCAAGTGTAGTTCATCTCTTCGA 
+GCGTGATCAGCAGGCTGTCGATTCGCTCAGCTACTTTTTGTTGACCGTTGAACGGCGTAA +CGTTACGGCACTTTGCAACAATGCTATGAATAGGTGCGCGACCTTTGTTCTTCTTTGCGA +TCTCAGTGATAACATCAATCAGCTTACGAGATTCAGCCTCATCACCAGCATACCCGGCGG +CGCTGGCAGACGACAGATAGGTTCTGGAAAGCTCATTGAAAATCATGATCGCTTCCTGCA +TTGTTTCAAGGTCAATCTCACGGTTGGAACGGTTCGGTGATTCACCCTCCCAGTTCTTGA +TTGTGTGAAGAACTGAAGCAATGCGCAAAGCGTGCTTATCGAACTTGCCGAGATGACCGC +GTAGCATTGAGTGAGAGTATTTGCCCCCGGCTGCGAAATCCGGCTCCATAGCCTGGCGAG +CAAGGTTTAATTCACGCATAGCATTACGGCTTACAGAGAGAACAACGTTGTCCTCCTTCA +TAATGTTGTGCACAAGTCGATAATATTTACTCACCAATCCTCGATCGACTTCCTTATACA +GTGCATCACCATTTTCATCGCAAAGAATACGAGTGCCTAAAAGAGGTTCCTCGCGAACCA +ATAGGAAACGCTCAGATACACCGATACCGCGCTGGCCTGCGTCCATGATACCCTTGATTG +TTTCATCCTGCGCAATGACGCAGATCGAACCGACCGGGCAAAGAGATAAGTTATTGTCCT +GATTTGAACGCGCAACCTCCATATGGTTTTTATCCCACGCCTTGAGGATAAGCTCGCTGT +TTGATTTCTTATCAGAACCGCCATAAGTCAGGCCAAGCAATGTATTTATTGCCGTTGCCT +CATCAGAGATTACGGAAAAGTGGCCTTGAACAGCAGCTACTTTCGCAAGACCTTCCGGTG +TAGGATCTGATACCGCGAAAACAATATCAGCCATCTTCTTGATCTTCTCTTCCAGTTTTT +CCTTGTCCTCGTACAGCGCCGCCGTCGTGTTACCCTTCGGATCGTTTTTGATTTCCTTCT +CGATCTGACGTAGCTGACTGGTTAAACGGATACGTTCCTTTTTGCGCTCTTCATTCAGTC +TCTGAATCTCTGCGCGCATAGGTGTAATCGCCGCTGAGTTAATCGCGGATTTACCTGTTG +ATGGTGGCTGGCTAATCACCATATAAAGAGCGGTCGGTTGTTCTTCTCCGTGATATTGCA +CCCAAAACTTCCCTAGCATCGCGGCTGAGATGCACCCAATGAAATGAGCGTACGCAGACG +AAACAGGAAACTGTACAGACTCAGCTTTTGCTTTTGCATATTCGAATACCAGGTTATCGC +CACCTAACGAAATCAGCGGGAACTTATCGTTTCCACTGTTGATATCGATCGGGTCTTGCC +AAAACGAAACTGAATCCCCGTAGCTGTTTTCACGAATTGCGATCGCCACCGGATTAACTC +CAGTGCTATTTGCGATCTCAATAATCTGTTGGTAATTCAGTTTTGGCTTAATATTAAACA +TCACAATAAACTCCTTAGTTGACGGCGTGAATGATACACCGTCAATGGTACACGCGTTTT +GCAAAAAGTGCTATTCGATCCGTGATTCTCATGAAAACCGTATAGCCTTTCCGCTTCCTT +CCTTGCTTTAGCCGCTGCATCCAGCGTCATGAAGGTCCCTAAGTGTTTTGTCTTTTTATT +TATGGTTATGTTTGCCGTGTATCTATTTGTTTTTTTATTCCAATACACTCCCATAACTCC +TGTGTTTGATGACACTACTCCCTTGTTTCTTAGGTTGTCTTGCCTAGTTACTAGCCTGAG +ATTCTCAATTCTGTTATCGTCTCTTTCATGGTTTATGTGGTCTACATCCATTCCTTCTGG +TATGTTTCCGTAGTGTATTTTCCATACTATTCGATGGGCGTATTCAAATACACCTCCTGG 
+AAAACATATTGACCGATAACCTTTTTTATTAACAGTTCCGGCAAGTAGATTTCGTCTTCT +TCCGATTCTCCACTCTTTCCAGTACAGTCTCCCGCTAACGTATTTCAGTTCATCTTTCAT +AAGTATTTACTCTCAAAGGTTGTGCCGTCTGCGATACTAAAACCAACCTCTTCGCGGAAT +AAGGTCCAGCGGCAACCGTCCTCATCAAAGATGTAACCAGCTACGCCTCCAAGAGCGCGA +CCGCTTTCTACCTGGTAACGCTTTCCAACCTTGAATGATTTTTTCATTGGGTTGCGATGG +TCAAGTCCGGTGCACTTGAGCGTCTTTGTTTTCAACTCGGTGAACTTCGCGAAGAAGATT +TCATCAGATCCGTGAATATGTAATTCATCATGCTTTTCAAGTTTTAAATACTTCCCGCAT +TTTAACTTTACTTCACGAGTATCATCATAGCGCTCACGACCTTTGTACAGATTGTTTACT +TCGAATCCTGTGATTTTGTCTGCGGAAGTGCATTTAAGTTTGATTGATTTCATTGTGTTC +GCTCCTGATTGGTTATCTTGAATAAGGCCACTTTATCAAATGACCTTACGGCAATATTAA +CAAATCGTGCTATTTACCAGGGAATATAATCGTCAACGTCCTCATTAATATCATCACCGG +GTAACTCAAAGTTAGGATCGTATTCGCTTTCAGCATCCATATCCATATCACCAAGAGCCT +CGTCAAGTGTGATTTCTTTATATGCTACCTTGATGCACCATTCGGAACTAAGTCCGGCAT +CAAGCGCTGCAAAATAACGAGTCCAGAAATTATCTTGTTCCATTAATTAGCCCCACGAAT +TAATGTAAATCGAGTTAGCCTCAAGCGTTGCACGAACATCCGCATCTGTTGCATTTATCA +GTCGAGAGCCTGGCACGCTTCCGATAACACTATTTCCGTTACGCGTCTTAGTTACCGTCA +TTGAAATAAAGCCTGAAGACTTATCCATCTTGATAAATACACGGCCTTTTGCGTTCAGGT +GCTTGATGATATTCTCAACTTTAATGCTCATTTTTGATTCCTTTGTTTTGTTTGGTATGG +GAGTAATATACCTTACTCCCCGATGTGTGTCTTTAGCAATTTGTGCTATCAGCACTCAGA +AATTACTTTACACTCTTTGATCGTGCCGCCGAGAACTACCTTTTGCATCATTGCTTTTTG +GCGACTATCGTAAACGCGAACATTTTTTACGCTGTTAAACTTTCCTGTATAGAAAGTGTG +AACATAAATCATTTTTTACCGTCCCATGTTTGATATGCCGTAACCAGGTTAATGCTATGG +TCGAACTCGTCAACGCTAAGCGATCCGATTCTAACAAGCTCTTTGTTGTGCTTAATGTCA +ACCATCCGGCTATATGGATGACCTGGCGCGTATTTAAATAGACGCTTGCTGTCGGCAATG +TCTGCGTTAATTCCGTAGAATCGGCCTACTGCCGCCTTAACGCGCTCCATGATGTTTTTT +TCGTGGATTGCCGCCGCGTTATTTAGTGCCGCAACCGTACTTCCGTTAATCTTCGGTGAC +TTCGTTGTGATTTTGTAAGTGTGGTTCATTATTCCCCCTAACCAGTCAGGATAGCTTTCT +AGCTTCATCAATGATAACAGGTTCGAGACGCTCAAGTCGAACGCGGAACGTGTTCATAAA +TCCCTCGTTATTCTTCAATAACTCGAAGGCGTCCGTAGCTTCAATGGCGCTACTACCCAT +GTAAGCGATTACGCGCTTCTGTGTTTTTCGTGAGATCGCCACTACGCGGTAAATATTATC +ACTCATACAATGCCTCCTTAACACGATCCGCTACGAATGAGTTAACGGTGATTGAATAGT +CAGCCACGAAGCAAGGAGCAACACTGCCTGGCTCAGTGCGCATGAATGGTTTATTGAACT 
+GAGTGATTACCTCGCCAGTATCGTTATCAATTAAGCGAGACGCGCCTAAGCGGTCCACCA +CGCGCGTTACGCGGTCCACGCACTCCATTACATCACTTACGTCATAATAGGCGGTTAGAG +CGGCTATCATTGCCTTCTCTTGCTTGTTGTGAATCTTACGCGCAATGTTCTCTATCGCAA +CGTGCACTTTCTCGTCAGTCTGGATTACCGAGCTGGCTGAGAAGTCCAGATCTGTGAACT +GCTTAAACATAATACTTTCCTCGTTTACTGTTGATGTGATGAATCATACCCAGTAGACAT +GGATAAGTCGTTAGCAAAAAATGCTATTCCAATAATTGCTCATTATTCCATCAATACTGG +AATATCTGGAATAATCACGATCATGATTGATCTCTAATGATGAGATGTGATTGTTGCATG +GTGTGCAACTGTTGATGTGATTGTTGCTTAGAATGCAATGATTGTGAGAGGGGGGATCTA +GTGTTACCAGGTTCGCCTGGTAGTCATCTCCATTTTTAGCAAAAAGTGCTATCGATTACG +ATTACGCTTGATTGCGTGTTAATCATTGTATATGATTTGCCTAAATCGCAATCGTATACA +AAATTACGCCAAATTACGCCTTATAATATACATATAGATATATGTATATATATAATATAT +AAGTGTTTTTTTTATTTATATATATAGATACTATTATTTGTAATATGGTTGTATAGCGTG +TCTTTTATGTTAATCGTTGGATGCTCTGATTATGATGCTCATGTTTATATATACAGTAGT +TTTATCGGCGCATATTTATTTAGGGAAATCGATTAACACGATTAACACGATTAACCACAC +TTAAAATCGCCCCGCAGACCTTGACACATGCGGGTCGTCGTGTAACACTACGATTAGAGA +CACGGTGTTACATGTTAATCATGGTGTATATTGAAACTAAGGAGAAAGCTATGAGTTCTT +ACCAGTCAGACGCAGTACAGGCAGCAATCAAGGCAGCTTACGAGAAGGCCGGAGTAACGG +TTGAGCAGCGACCGGAGGCAAAAGTGACCGATGTTATCCGGGCCGCCTGCGATCAGCTTT +ATGGTGATGGCGAGAATACCGAGTTCACATTCGACGCGAATAAGATGGCTGAGGCCGCAG +CAAGAAAGTCGATGCCAGACGCTGACGAACATGATGTTGCCAAAGGCGCCGAGTCCTGGT +TGCTCGGGAAGACGGATGAGATTAACGAGAAGTTTAAATCCTCATTCATCACCCCGATCG +TTTCTCGACACTTCTCCAAGATCGGCAAGTCGGTCAAGGTGAGCGTGACCATGAACGATG +AGAAGTTGCGAGTCGTTACTATCTCAGTGAGTGACGAAGAAGTTCCGGTGAAGAAGCGCC +GCAGCCGGAAAAAAGTCAGCCTGGCTGATTGTCTGGATTCGTTTGTTCCTGATGTTGATG +ATCTTGAGAAAGGCGACGTTACTGTAAGCACCGTGCGCGACCTGGTTCGCCAGATGAAAG +CGCATATCGAAAAATGTGGACTGTAAGGAGAAGTAATTATGTTTAATATCAAACCATTAA +CAGAAGCAGAGAAACAGGCTCAGGCCAAGCAAACCGAAAACATCCAAGTGATCGCTGATG +CGCTGATTGGTAAGAGGTCAATCAAAATAAACCTCGACACTGTTGGTCAGTCATTTTTTA +CTAAAGGTTTGGATAAGTACGTTATAAATGTGAAGGCGAGAGACCTGGTGGCGAGAATTC +AAAAGCTAAACAATCAAAAGCTAAAGCTCATCAAGGTCGAAGGCAACATGTGCGAAATTG +AGAACCTCAGCGCACCAGACCCGAATAAGTGGGAAATCACCGATGTCGAGTTTATCGTAG +AATAGCACTTTTTGTTAAAACCGGATCGGGGTATCTTGCTATAGTTACCCCATCAAAACG 
+AGATACCAATCAGAGGAATCACCATGTCAATCGTCAAGAACCAGCAAGCCATCGATTCAA +CCAATAACAACCGCTTTGCTATTTTCATCACTCGCGACAACAAGCGCTTTGCAGTAAAGG +CCGTACCAGGTGGATACAAAACCTACATGGAAGATAACGGAAAGTGGGTGCGGTGCGACA +ACCTCGCAAACTTCTTGGTCTGGAACGCAGACCTGCAGGGATTCGATGACATCAGCACTT +TAATTGAGGAGTAATAATCATGCCACGTTACAGCAACCTAACTCAACTAACCCGCGTCAA +CGGGCACATGATCCCGGCAAAATCCACTCACTACGCAATGGGAGCAAAGCACGGATTGTA +TTTCAAATGGCGCGGTCAATGGAACTTCACGGTTATTCGTAATTTCTACATTAGAGTTAC +AGGTGATGACCCGCAATCGGTCGTAGAGAACTCAATCGGCGACAACAAGATCGAGGTGCT +GAAATGAACTTCAACATAATTGCTTTCTGGTCTGCCGTATGGTTCTTTTGCGTAGGTCAT +GTTGTGGTTGGAATCGTAATCATGTTGCTACTGTGTGCGGGAGCGTTCGAATGATGCGGA +TTCTGATTTGCATGATGGCGGCGGTCGCCATGGCTATCCTGGTAGTGTCCGGCTGCGGCG +AGGCCAGGGATAGCTGTCATGAAACCGGGAGCCAGGTTACTACTTTCGTGATGGTTGGCA +ACGTATTGCTACCAATAACATCAAATGAAATCACTTGCGAATAGCACTTTTTGTTAAAAC +TCAACTCCGGGGTTGCGATATAGTAACCCCATCGACAACGAACGAGGACGCAAACATGAA +AATTAAATTACTTAGCAATGGCGGTTACAAGGGATTCACCCGCGACCTGGAAGCTGACCC +TATCGTGGTTGACGCGGTTAAGTGCGACTCAAGTACTGGAGGCTACCGCGTTAAGGTTGA +TGACCTTGTAAAAGCTGGCGTGTACGATCTTGATTATGGCCTGTCGGTTAGCCCGGTATT +TGGCCCAGCTGACTTCAACGAGAAAGACGGAACGATGTTCTTTTTTGATTGGGAAGTGAA +GGCAAACATCAAGCCGCGCAAGGTTCGTCTTCTCAGCAATGGCGGCTACCCGATGCGACC +AGGTTATGAGAATCGCACGTTCCCGGTTATCGTTGACTTCATTGGGACAACTGACAACTT +GGTATACGTTAGCCATGAGCAACTTAAGGCAGTTGGATTCGTTGGCGGTATGAATAAAGA +AGCGCTTTGCTTCTTCCATCGATGTCCAGAGCCGATCGGTATTGAGTGCGAGTTAGTATA +CTAAGCACGAATTGTTAAAAGGGGATTTGGCCTGACTGGTATAATCCCCACATCAACCAC +TAATAGGAAAGCATCATGTTAAAATTAAAAGATATTCAGTTCCCTGTAGTATTTAACACT +ATTAGCTGCGGTAAAATAACCTGCCACAGCAAAGATCGCGCAACAGATTCATCATTCAAT +GAGTGCCACCCGTCTATTGTTGGTAATCTTATTGAGCTTCATAACAATCACAATCCTGAT +AACATCCCATCTCTTCCATATTATGTTGAGGGAGTCGGGCCTGGTTGGAAGGTTGGTCGC +TCCATCTTCCATGCAGCAAAGCCAGAAATCAAGCCAGCGCTACAATGCACTCAGATCGAG +AACATGCCATTGAGCGCGACACTAAAAGGTGTCCAACTTGATAGCGAATCCTGGATCGAG +ATTACCGCCACGCCTAAAACTATTGAGGTACATGATGATGTGGTGATTCTCCTGTTGCAT +TACGGCAGCTTTAAGCACAAGACGGTATCAGGTGAAATCAGCATTAAGCGCGGAACTCTT +GTCCGCTACGAGGTGAAATAATGACTGCATGGGTCTTGATTATCTTGATGAGCAAAGGTC 
+CGGATCACGTATACATGGAAAGTCAACAATCATGCAACAAGGCACGGGAAGTTATCGCAG +AGAACAAGCCGTTCGGATATGAAGTAAAAACTATGTGCGTTAAACGATAGCACGAATTGC +TAAACCTTCCGCAAGGCCATTTGATATAGTGGCCTTATTGAAGCACGACAACCAACTGGA +GGTAAAATTTATGAAATTCGAATGTATCAGCGATAACACCAAAAAATTTACTGTTGGCAA +AATTTACGATGTTCCGACTGAGCACGCAGAGCAAACCGTAGCTCTGACCGACGACACGGG +CCGCAACCGAATTGCAACCGTAACTCACAACGGTGAAGGTCTTCGCTGGAATAGCGGCGG +CACTAAGTTCGCAACGTTCGGCAAGAAGCGCAAGCGCACCTTCCGCGTCAACGGCAATGT +TGCAGCTAACAAGATCCATAACGTCAAGCCGTCGGAAGTTGACCGCAAGCCAGCGCTGAA +GTTTAAAGAGAAGGTGGATTTATTCAATCTTGCCGCCTCGCTTGTTCTCCTGGTCGCTGC +TATTTCGCTGCTTTCCATCATGTAATGCTAATGGGGAATCGACTCTGAACGGTTCCCCTT +TCTTTTGGAGAAAACACTATGCCAGACTTTTCTAACTGGAATAACGAGCCGCCATCATTT +CAGGAGTTGCTATTCTGCCTCCTGGTCCTGACATTATCTCTTAAGGGTGTTTTATGGCTA +CTATCATGACAGTAGAAGATGCAGCACGCGATGCAGTGGAAGGAATGCGCCCAAATACCT +CCAGAATAGCACACTACTACAAATCTGAGGTGTCGGCAGTGCAATTGGTCCACGAAATTT +TAAGGCTCCCACAAGTCGATTCAGCGCGCGTTGTGACGTGCTTAAAAAATTATTTTTGCA +TCACTATTAAAACGAATAGCACGAATTGCTAAAACCTATCAAGGGGAATACGCTATGATT +CCCCTACACCAACAAACGAGGAAGCGATCATGAAACACTTAATCTGCATTGAAGCGCCTA +ACGATCAATACACCCTGCATGGACTTGGTGTGTTCAAAGGTCACTACATTACCGCAGGAA +CTTACGATGCTCGTCGCGGCGATGGCGACCTAATGATTACGTCAAAAGAAGTAAATCCGT +ACATCATGCAGAATCTTGGCAATAACGAATATATGGCCTATGGCTGCAACGCGGTGTACA +AGCACGTTAAGATCCGCAAGCGTGTTGTGCGTGCATTCAAGAAGATTGCAATTAAATACT +GGAAAATGAGCAAGAAAGATGCCGGACGTTGGGCGCGCAACGTTGCAGATTCATACTTCT +ATCGTAACGGCGAATCCTGCTACTTCCTGATCGATGAACTTATGGAAAACTACGGTGGCG +ACTTCAGCCAGGGTAGCTTTGATGACTGGGCCAACTATGAGATCAGTTGCTGGTAATAGC +ACGAATTGCTAAAACTTGCTCAAGGGCATTTGTTAGAATGCCCTTCGTTGAGTTAAGCAA +CCAATCAGAGGAATAAATCATGGATAAAATCACAATTTGGGGCCAGACAATCAACCTGTT +TCTCGGCACGCGCCGAGTGGCAATCTTTGACTTTGACGGGACACTTAGCGATGGATCTGG +TCGACTTCACCTGCTGCCAACAAAGGATTTGCACTTGACTGAAAGCTGGTCTGAGTTTAA +CCGAGCGGCAATATTTGATAACCCAATCCAGAGCACGATCGATGTGATGAACTCTATGTT +TGCCGCTGGGTATCATGTGATCATTTTAACCGGGCGAAGTGATGAGGTGCGTTACGCATC +TGAGTTATGGCTTAAGCATCACGGCGCTCGATATGATTACTTAGTCATGCGACCGCATAC +CGACAACCGCAAAGACACGGTAATGAAAGAAGAGGCAGTGCGCGCTATCGGCATTGATAA 
+CATTCTTGCGGCTTGGGATGACTCAGTGAATATAATAAAAAAATTCAGAGATCTAGGGAT +AACCACATATCAGGTTTGTGAATATGCCTGTGATAGTCGAGAGGATTTAAATAGTCATGG +TGTCGATTGATAACAAATCAATGGTAAGAGAGTTATTTACTTATTCTGACGGCGTTCTGT +ATTGGAAGGCCAAATCATCTAAATACAGTAGAGCTAAAATAGGAGGCGCGGCAGGAAGCA +AGGATAAAGACGGATACATAATAATCAGAGTAAGAAACGAAACTAGAGGCGCTCACAGGC +TTGTATGGATATACCATAATGGCAAGATACCTGATGGAATGGAGGTAGACCATATGGATG +GAGACATAACAAACAATAGAATAGAAAACCTAAGATTGGTAACGAGAACCATAAATAACA +GGAATCAAAAAAAGAGATCTGATAACACAACCGGAGTATCCGGTGTAACTTTCATGAAAG +ATAGAGGAAAGTATAGGGCGCAAGTTAGAAACAAGAGACTCGGGCAGTTCGACACAATAG +AAGAGGCCGCCAAAGCAGTAAAGGATGAGCGGGATAGATTAGGTTTATTCACAAAAAGAC +ACGGGGTGTAAACATGAAAACAGCTATCATTTTAAACGGCGCACCTGGTGCGGGAAAGGA +CACTATCGGATGCATCCTGGCTGACACTTACGATCATGTAGCGCTACGCAGCTTCAAAGC +GCCAATGTTTGAGATTGCCCGAGCAATCCTGGGTGAGACTAATTTCGAGTATTTCATGTT +CTTGTATGAGGACCGTCGCTATAAAGAAGAGCCAGCGTCAATCCTGAACGGTAAAAGCCC +GCGCCAGTTTATGATCTGGATTAGCGAGGAGGTCATCAAGCCGCAGTTCGGAAATCGCTT +CTTCGGTATGCGAGCGGAAAGTAAGGTGAAAGAGTCGCATTCACTTTCGGTATTTACTGA +CGGTGGATTCAAAGACGAGATCTTGCAGATGATTGAAGGTGACATCCAGGTCAAGCTGTG +TCGAATCCATCGCAACGGTTGCAACTTTGACAACGACAGTCGCGACTATATCTATCTTGA +CGATATGATCGGGGTCAACGGTTATCAGGAGTGTGACTTCTTTTCTGTCGAAGGCCATCC +AGAAATTACCGCTCAGCACATAGCCGCCACGTTCATCAATAAATAGCACGAATTGCTAAA +ACGTCGGTGTGGTGATTTGATATAGTTACCTCATCGACAACGAAGAGAGAAAATCGAAAT +GATGGTATCAACTGATAAGTTTTTCACTTGCACCAAAACTTCTGAGGTATTCGAGCTGGT +TCATACTGACAACGGTGATTTCATGCATGACGGTTGCGACGCTTTCATTGAAGTGAAAGA +AAGCGACTATGACGACGGAGTTTATTACAACCCTGCGGTTAACACGCAGTTTTTTACCCC +GATCGAAGAGGAAGGAGAAGAGGCATGATCACGATTAACCTGTCAGATAAACAAGCGCGT +GAAATACTAGACACTATCGGAGAACAGCTTCACGTAAAAGGCGATACCGCTGAGATTCTT +AACCAGATCGAAAGACAGCTAACCCCTGTGTCGACGAATCAAGCTGAGTTCGCAGCATGG +AAAAGCGAACGCATCCTGCCAAATATCATCAAGGCATGGAAGCGCAAGCATAAAAAAGAA +ATCAACGTTGAGGATTTATTTACCGATGAATTAAGTCCTTCAAATGTTGCTCAATACCAG +TTGCGATACATGGAGTCGGTTTGCAATCAGGTTTTAGGTGTAAGTTTTTCATTCAAAGGT +GATAAATAATGTTCGGTTTAAGCGAAGCGGAGTGGAATGTTGTAAAGCGTGCCGCGAAAG +AATTAAACAAATTCGTCAGCGGAATGAAGAAAGAAGATCGGAAAAACGACAAGATTATGA 
+TTGACGTAATTTCGACTCACCATAAAAAGGTCGAACTACTCATTGACCGCTACAAATTTG +TCTGGACTGCCGGGTATATTGCAGGGCGCGTAGGTAACAAAGAGGGGGATTATGAATAAT +GGCCAATTTACCAAAGAAAGGCGATCAGGTTCGATGTGTCACTTCACGCAATGGTAATGC +TTTATCGGCGGGGTGCTTGTACGACGTAGAAAAAGTCAGTAAGTCAAAGAGGCTTGTATT +CGTGTACGGCGACGATGGAAATCTGCATGAGATTGATTACCCGCAGGATGTAACTAATGG +TCAATTCGAAATTAATGATTGACCTAAATCCCTGAGCGGTGATAGTATTAATCCCGTAGA +CAGACGAGGCGCAACTAAGCGCAACGCGTGAGACGATTCTCACACTTCCAGCTAACAAGC +TCGGTTGCATAGTGGTTAAGCAACGCCGCAGACCCGTAAGCGGCAACAATTCAAGAGGAT +TGCATAATGCAAAAAACTAAAGACGAATCAGTCAAAATTGAAATTAAAGTAACTCGCAAC +GGTGAAACCACTCGTTATAAAAAACGATTAAATCCTGGCGAGGCTGTTATTGGTCGCATT +GCTGGCGTTATGATTAAGGCGCAGGAAGATGAAGCGATTCAAAGTTAAATTAATTATTCG +AAAGATGGGAATGTTTTGCCAGTCGTGCAAGCAATCTTTCGAAGCTGAATTATCAGCAAC +CAGTCAGGATGAAGCCATCACGAAAGCAAAAAAACTTTCCGGCGCTAACCTTGACACTCA +CAAAATAAATATTGAATTAATCAAGGAGATTTAACATGACAATTTTTTTATTAATTATCG +CTGGTGTCATTATTTTTGGTGCTGGTTTGTTTGCTGGCTTCGCACTTGTGGCGGCAGCAA +TTGCGATGGACGCGAAGGATAAAACTGGTGTATGGCTGACCTACTCACCTAAGAAGGACC +AATGGGAAATGACTGGCGACCTTGCTCACTGCTATTCTAAAGCTAAGACCCACCCTAAAG +GCATTAAACGACGATTGTCGTGATGAACACTAACCCGCTCCGGCGGGTTTTTTTAT diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/test-data/AY216660.gff3 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_phageqc_annotation/test-data/AY216660.gff3 Fri Jun 17 13:00:50 2022 +0000 @@ -0,0 +1,315 @@ +##gff-version 3 +##sequence-region AY216660.2 1 48836 +AY216660.2 GbkToGff gene 40 576 . + . locus_tag=CPT-T1_001;ID=CPT-T1_001.gene; +AY216660.2 GbkToGff mRNA 40 576 . + . locus_tag=CPT-T1_001;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_001.mRNA;Parent=CPT-T1_001.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 40 43 . + . locus_tag=CPT-T1_001;regulatory_class=ribosome_binding_site;ID=CPT-T1_001.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_001.mRNA; +AY216660.2 GbkToGff CDS 52 576 . 
+ 0 locus_tag=CPT-T1_001;codon_start=1;transl_table=11;product=terminase small subunit;translation=MSEPKNAPVVQGGNFKELYKKKFGTVLAKNRAMTPEQLFDLSVKYFEWAEDNAIKASESASFQGGVYESLVHKPRVFTWTGYRLFIGASEAAIIKWKREEEYSEVMEFVESVINEQKFQLAANGVINASFIGKDLGIDKPASINIENSSASASTVVATTEDAMKEAVNSILDML;note=Orf no. 54 see PMID: 14972552;ID=CPT-T1_001.CDS.1;Parent=CPT-T1_001.mRNA; +### +AY216660.2 GbkToGff gene 589 2184 . + . locus_tag=CPT-T1_002;ID=CPT-T1_002.gene; +AY216660.2 GbkToGff mRNA 589 2184 . + . locus_tag=CPT-T1_002;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_002.mRNA;Parent=CPT-T1_002.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 589 592 . + . locus_tag=CPT-T1_002;regulatory_class=ribosome_binding_site;ID=CPT-T1_002.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_002.mRNA; +AY216660.2 GbkToGff CDS 601 2184 . + 0 locus_tag=CPT-T1_002;codon_start=1;transl_table=11;product=terminase large subunit;translation=MGDLIMIQWEDLNATQKLAIKKMSEANFEKMIRIWFQLMQAQQFQPNWHHLYLCHEVEEIIAGRRGNTIFNVTPGSGKTEVFSIHLPVYAMLKCKKVRNLNVSFADSLVKRNSKRVREIISSNEFQELWPCKFGTSKDEEMQVLNEDGKVWFELISAAAGGRITGSRGGYMTPGFSGMVMLDDIDKPDDMFSKVKRERTHMLLKNTIRSRRMHNETPIIAIQQRLHAQDSTWFMMNGGMGIEFDQISIPALVTEEYGKTLPDWLQPYFERDVLSSEYVELDGVKHYSFWPSKESVHDLLALREADQYTFDSQYQQKPIALGGSVFNSEWWTYYGSSLDADEPDPGKYDYRFITADTAQKTGELNDYTVFCLWGKKNDKVYFIDGIRGKWEAPDMERQFTAFVNQAWRHNKSMGVLRKIYVEDKASGTGLIQNLRKKTPISITPLQRNKDKVTRAMDAQPVIKAGRVVLPEEHPMLAEIIAEHSAFTYDDTHPHDDIVDNFMDAANIELLTIDDPIERMKRLAGMVKR;note=Orf no. 53 see PMID: 14972552;ID=CPT-T1_002.CDS.1;Parent=CPT-T1_002.mRNA; +### +AY216660.2 GbkToGff gene 2230 3522 . + . locus_tag=CPT-T1_003;ID=CPT-T1_003.gene; +AY216660.2 GbkToGff mRNA 2230 23794 . + . locus_tag=CPT-T1_003;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_003.mRNA;Parent=CPT-T1_003.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 2230 2233 . + . 
locus_tag=CPT-T1_003;regulatory_class=ribosome_binding_site;ID=CPT-T1_003.Shine_Dalgarno_seqeunce.2;Parent=CPT-T1_003.mRNA; +AY216660.2 GbkToGff CDS 2239 3522 . + 0 locus_tag=CPT-T1_003;note=HHPred predicted structural similarity at 99%25 probability to phage T4 portal protein gp20 Protein Data Bank entry 3JA7 over most of protein%3B Orf no. 52 see PMID: 14972552;codon_start=1;transl_table=11;product=portal protein;translation=MKIVKHDGYNDIFNGGADGSPKPFFMSDASYHVGSFYNDNATAKRIVDVIPEEMVTAGFKMSGVKDEKEFKSLWDSYKLDSSLVDLLCWARLYGGAAMVAIIKDNRMLTSQAKPGAKLEGVRVYDRFAITVEKRVTNARSPRYGEPEIYKVSPGDNMQPYLIHHSRVFIADGERVAQQARKQNQGWGASVLNKSLIDAICDYDYCESLATQILRRKQQAVWKVKGLAEMCDDDDAQYAARLRLAQVDDNSGVGRAIGIDAETEEYDVLNSDISGVPEFLSSKMDRIVSLSGIHEIIIKNKNVGGVSASQNTALETFYKLVDRKREEDYRPLLEFLLPFIVDEEEWSIEFEPLSVPSKKEESEITKNNVESVTKAITEQIIDLEEARDTLRSIAPEFKLKDGNNINIREPEETTEPEPGLGEKLEDEN;ID=CPT-T1_003.CDS.1;Parent=CPT-T1_003.mRNA; +AY216660.2 GbkToGff gene 3496 4273 . + . locus_tag=CPT-T1_004;ID=CPT-T1_004.gene; +AY216660.2 GbkToGff mRNA 3496 4273 . + . locus_tag=CPT-T1_004;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_004.mRNA;Parent=CPT-T1_004.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 3496 3499 . + . locus_tag=CPT-T1_004;regulatory_class=ribosome_binding_site;ID=CPT-T1_004.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_004.mRNA; +AY216660.2 GbkToGff CDS 3512 4273 . + 0 locus_tag=CPT-T1_004;note=InterPro domain IPR006528,Orf no. 51 see PMID: 14972552;codon_start=1;transl_table=11;product=capsid morphogenesis protein;translation=MKINGVATQWRYPEMSERAMSRSLQDVAAKLTEKMRDELKPMKFDATDEEIDQTERSLLDYVESLIAPIIGSLSSVALTIYKFNSKQWLRIARNAGGKKNQAVMLLALIGPTAAESWYSGQYNLWRSQVATSIRKFAANMVTDFTDKLRAASGQGKSKDFVVELAKERFGIYRNWAKNRASGIVGTWNSRLMRQRIKDAGVSYYFWRGVMDLREREKHVRWEGKRIAVDSDHVFPGEEYNCRCWAVPDFSTGD;ID=CPT-T1_004.CDS.1;Parent=CPT-T1_004.mRNA; +AY216660.2 GbkToGff gene 4264 5388 . + . locus_tag=CPT-T1_005;ID=CPT-T1_005.gene; +AY216660.2 GbkToGff mRNA 4264 40736 . + . 
locus_tag=CPT-T1_005;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_005.mRNA;Parent=CPT-T1_005.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 4264 4268 . + . locus_tag=CPT-T1_005;regulatory_class=ribosome_binding_site;ID=CPT-T1_005.Shine_Dalgarno_seqeunce.2;Parent=CPT-T1_005.mRNA; +AY216660.2 GbkToGff CDS 4276 5388 . + 0 locus_tag=CPT-T1_005;note=HHPred predicted structural similarity at 72%25 probability to phage T4 prohead core protein protease gp21 Protein Data Bank entry 5JBL over predicted catalytic third of protein%3B Orf no. 50 see PMID: 14972552;codon_start=1;transl_table=11;product=capsid maturation protease;translation=MKAKQRFDSVKIKAHFDDNGFLVDRPIVARIGAQVYKTPHGDRVEFRPASEVFKQDSLQSFAGKPITVGHVTVTPQNAKDVVVGSCAGAGIASGVGVEVPLSIYSDYAISKAKAKEAGELSVGYTSVDIDKPGWGSNETGEYIFEEDMKQDEAPPEGWVRFDAVQTNIKVNHIALVFKGRAGIAKLNLDAEQEFPYDNNVQLTNEDKQMKKIKIDSVDVEVTEDVANHIEKLTAQIATIQGKADGFEAERDALKVKVDSLPELVKAEVEKQKADAAARAEVTAVAETAGVKHDGLDIKDVKIAVVKAMLDKDVSEKSDAYIDAMFDVAKDSDIMAIQRKAVKGDSIEGGKPEEKNDAAPVTPNSRLSKVM;ID=CPT-T1_005.CDS.1;Parent=CPT-T1_005.mRNA; +AY216660.2 GbkToGff gene 5389 5876 . + . locus_tag=CPT-T1_006;ID=CPT-T1_006.gene; +AY216660.2 GbkToGff mRNA 5389 5876 . + . locus_tag=CPT-T1_006;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_006.mRNA;Parent=CPT-T1_006.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 5389 5392 . + . locus_tag=CPT-T1_006;regulatory_class=ribosome_binding_site;ID=CPT-T1_006.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_006.mRNA; +AY216660.2 GbkToGff CDS 5400 5876 . + 0 locus_tag=CPT-T1_006;note=HHPred predicted structural similarity at 92%25 probability to phage TW1 Protein Data Bank Entry 5WK1 capsid stabilizing protein%2C equivalent to phage lambda gpD dec protein%2C over most of protein%3B Orf no. 
49 see PMID: 14972552;codon_start=1;transl_table=11;product=capsid decoration protein;translation=MAQINASYQRDMAIALPGMVADTSKYNIDGACVVNEGDVLVGAAVQVVQAQAVDGHKLVKALTTGTTPYGVAIRSHWQTVNAQNQMIYEDGGAINVMTSGRVWMLSKSTEAPTFGSAVKLDVDGQEKSDGTIETTWTYAGGWTKYKDIQLVEVQLHQL;ID=CPT-T1_006.CDS.1;Parent=CPT-T1_006.mRNA; +AY216660.2 GbkToGff gene 5926 6705 . + . locus_tag=CPT-T1_007;ID=CPT-T1_007.gene; +AY216660.2 GbkToGff mRNA 5926 6705 . + . locus_tag=CPT-T1_007;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_007.mRNA;Parent=CPT-T1_007.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 5926 5929 . + . locus_tag=CPT-T1_007;regulatory_class=ribosome_binding_site;ID=CPT-T1_007.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_007.mRNA; +AY216660.2 GbkToGff CDS 5938 6705 . + 0 locus_tag=CPT-T1_007;note=InterPro domains IPR008964 and IPR003343%2C invasin/intimin cell-adhesion fragments superfamily%3B bacterial Ig-like domain-containing protein%3B Orf no. 48 see PMID: 14972552;codon_start=1;transl_table=11;product=hypothetical protein;translation=MAYENLMLRPACPGNLSDTSTYNIDGACVAQGDIEFGSAVQVVGIVDGVKVVTALSDGGTPYGIAFRSQYEHLSGKILDGEVCNVVSHGRVWALTSLDEAPSLFSKLQFGSGGVVTGGSGYAGWTFAGGFVKHEDGYIIEVRVKQNAFIVPPPPPPVVLVESATITTDKESPQPNNVTIQCVANALPANATDKTGKWSIDATNIATVNPDSGLVTPVGGEVVGDFNITWTANDASKTTATIAYRVEAVPTPEVDV;ID=CPT-T1_007.CDS.1;Parent=CPT-T1_007.mRNA; +AY216660.2 GbkToGff gene 6784 7755 . + . locus_tag=CPT-T1_008;ID=CPT-T1_008.gene; +AY216660.2 GbkToGff mRNA 6784 7755 . + . locus_tag=CPT-T1_008;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_008.mRNA;Parent=CPT-T1_008.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 6784 6787 . + . locus_tag=CPT-T1_008;regulatory_class=ribosome_binding_site;ID=CPT-T1_008.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_008.mRNA; +AY216660.2 GbkToGff CDS 6796 7755 . 
+ 0 locus_tag=CPT-T1_008;codon_start=1;transl_table=11;product=major capsid protein;translation=MTTKKFDEADKSNVEMYLIQAGVKQDAAATMGIWTAQELHRIKSQSYEEDYPVGSALRVFPVTTELSPTDKTFEYMTFDKVGTAQIIADYTDDLPLVDALGTSEFGKVFRLGNAYLISIDEIKAGQATGRPLSTRKASACQLAHDQLVNRLVFKGSAPHKIVSVFNHPNITKITSGKWIDVSTMKPETAEAELTQAIETIETITRGQHRATNILIPPSMRKVLAIRMPETTMSYLDYFKSQNSGIEIDSIAELEDIDGAGTKGVLVYEKNPMNMSIEIPEAFNMLPAQPKDLHFKVPCTSKCTGLTIYRPMTIVLITGV;note=Orf no. 47 see PMID: 14972552;ID=CPT-T1_008.CDS.1;Parent=CPT-T1_008.mRNA; +AY216660.2 GbkToGff gene 7794 8093 . + . locus_tag=CPT-T1_009;ID=CPT-T1_009.gene; +AY216660.2 GbkToGff mRNA 7794 8093 . + . locus_tag=CPT-T1_009;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_009.mRNA;Parent=CPT-T1_009.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 7794 7797 . + . locus_tag=CPT-T1_009;regulatory_class=ribosome_binding_site;ID=CPT-T1_009.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_009.mRNA; +AY216660.2 GbkToGff CDS 7806 8093 . + 0 locus_tag=CPT-T1_009;codon_start=1;transl_table=11;product=hypothetical protein;translation=MAKEKTVVIVNVGVALQMFRLEDGSFAKVLPDEEVTLPASVLDLPGLRCLIAREEIEVKDDSATNRKIRAEMAKITKPDPWDKMSVKELEDGGEY;note=Orf no. 46 see PMID: 14972552;ID=CPT-T1_009.CDS.1;Parent=CPT-T1_009.mRNA; +AY216660.2 GbkToGff gene 8127 8548 . + . locus_tag=CPT-T1_010;ID=CPT-T1_010.gene; +AY216660.2 GbkToGff mRNA 8127 8548 . + . locus_tag=CPT-T1_010;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_010.mRNA;Parent=CPT-T1_010.gene; +AY216660.2 GbkToGff CDS 8138 8548 . + 0 locus_tag=CPT-T1_010;note=HHPred predicted structural similarity at 96%25 probability to Bsubtilis yqbG (myophage protein%2C see PMID 29279385) Protein Data Bank entry 1ZTS over most of protein%3B Orf no. 
45 see PMID: 14972552;codon_start=1;transl_table=11;product=head-to-tail connector complex protein;translation=MNQETLIAVVEQMRKLVPALRKVPDETLYAWVEMAELFVCQKTFKDAYVKALALYALHLAFLDGALKGEDEDLESYSRRVTSFSLSGEFSQTFGEVTKNQSGDMMLSTPWGKMFEQLKARRRGRFALMTGLRGGCH;ID=CPT-T1_010.CDS.1;Parent=CPT-T1_010.mRNA; +AY216660.2 GbkToGff gene 8534 8919 . + . locus_tag=CPT-T1_011;ID=CPT-T1_011.gene; +AY216660.2 GbkToGff mRNA 8534 8919 . + . locus_tag=CPT-T1_011;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_011.mRNA;Parent=CPT-T1_011.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 8534 8538 . + . locus_tag=CPT-T1_011;regulatory_class=ribosome_binding_site;ID=CPT-T1_011.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_011.mRNA; +AY216660.2 GbkToGff CDS 8548 8919 . + 0 locus_tag=CPT-T1_011;note=HHPred predicted structural similarity at 96%25 probability to phage SPP1 gp15 Protein Data Bank entry 5A21 over most of protein%3B Orf no. 44 see PMID: 14972552;codon_start=1;transl_table=11;product=head-to-tail connector complex protein;translation=MNYSQIERMARKGVAFFTDPSRPMNLIKQGEYGYDENGFEIPPMEQVIPISGATRRPNAREIDGETIRASDILGIFNNDHEINEGDYIEIDGIRHVVVDARPVQASLEPVAYRPVLRRVSVGG;ID=CPT-T1_011.CDS.1;Parent=CPT-T1_011.mRNA; +AY216660.2 GbkToGff gene 8897 9355 . + . locus_tag=CPT-T1_012;ID=CPT-T1_012.gene; +AY216660.2 GbkToGff mRNA 8897 9355 . + . locus_tag=CPT-T1_012;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_012.mRNA;Parent=CPT-T1_012.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 8897 8901 . + . locus_tag=CPT-T1_012;regulatory_class=ribosome_binding_site;ID=CPT-T1_012.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_012.mRNA; +AY216660.2 GbkToGff CDS 8912 9355 . + 0 locus_tag=CPT-T1_012;codon_start=1;transl_table=11;product=hypothetical protein;translation=MANYQIRRFQGEIDAWINAAESTLEHAIEIFVRDVHDALVSRSPVDTGRFKGNWQITFNEIPNHALNRYDKTGGVVRGEEQAKTYGMFSRGGAITSVHFSNMLIYANALEYGHSQQAPSGVVGLVALRLRSYMADAIKQARRQQNAL;note=Orf no. 
43 see PMID: 14972552;ID=CPT-T1_012.CDS.1;Parent=CPT-T1_012.mRNA; +AY216660.2 GbkToGff gene 9332 9743 . + . locus_tag=CPT-T1_013;ID=CPT-T1_013.gene; +AY216660.2 GbkToGff mRNA 9332 9743 . + . locus_tag=CPT-T1_013;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_013.mRNA;Parent=CPT-T1_013.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 9332 9336 . + . locus_tag=CPT-T1_013;regulatory_class=ribosome_binding_site;ID=CPT-T1_013.Shine_Dalgarno_seqeunce.2;Parent=CPT-T1_013.mRNA; +AY216660.2 GbkToGff CDS 9345 9743 . + 0 locus_tag=CPT-T1_013;note=HHPred predicted structural similarity at 96%25 probability to phage lambda minor tail protein U Protein Data Bank entry 3FZ2 over most of protein%3B Orf no. 42 see PMID: 14972552;codon_start=1;transl_table=11;product=minor tail protein;translation=MHYELSAAARAAFLSKYRDFPHYMENRNFTPPKDGGMWLRFNYIEGDTLYLSIDRKCKSYIAIVQIGVVFPPGSGVDEARLKAKEIADFFKDGKMLNVGYIFEGAIVHQIVKHESGWMIPVRFTVRVDTKET;ID=CPT-T1_013.CDS.1;Parent=CPT-T1_013.mRNA; +AY216660.2 GbkToGff gene 9733 10414 . + . locus_tag=CPT-T1_014;ID=CPT-T1_014.gene; +AY216660.2 GbkToGff mRNA 9733 10414 . + . locus_tag=CPT-T1_014;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_014.mRNA;Parent=CPT-T1_014.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 9733 9737 . + . locus_tag=CPT-T1_014;regulatory_class=ribosome_binding_site;ID=CPT-T1_014.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_014.mRNA; +AY216660.2 GbkToGff CDS 9746 10414 . + 0 locus_tag=CPT-T1_014;note=HHPred predicted structural similarity at 99%25 probability to phage lambda major tail protein V Protein Data Bank entry 2K4 over half of protein%3B Orf no. 
41 see PMID: 14972552;codon_start=1;transl_table=11;product=major tail protein;translation=MHLPNGAQIFVETSRGVEVEATAITNAENPVATVASKGDLAKGDYVIVTQSTWAKMVSRVLIVTDAQETSITLAGIDTSDTLVFPAGGTMSFAKITGWTEIPCVQEIGQDGGEQQYYTYQCLSDDKEQQIPTFKSAVSLTYTFAHEFDNPIYPILRKLDSSGQVTAVRMYVPKASEMRMWAGILSFNDIPSTQVNEMETVELAVSLKGDFTFISSTLASPGA;ID=CPT-T1_014.CDS.1;Parent=CPT-T1_014.mRNA; +AY216660.2 GbkToGff gene 10516 10845 . + . locus_tag=CPT-T1_015;ID=CPT-T1_015.gene; +AY216660.2 GbkToGff mRNA 10516 10845 . + . locus_tag=CPT-T1_015;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_015.mRNA;Parent=CPT-T1_015.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 10516 10520 . + . locus_tag=CPT-T1_015;regulatory_class=ribosome_binding_site;ID=CPT-T1_015.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_015.mRNA; +AY216660.2 GbkToGff CDS 10528 10845 . + 0 locus_tag=CPT-T1_015;codon_start=1;transl_table=11;product=tape measure chaperone frameshift product;translation=MAKFNFVLGQLPDFKLPVTFTMPNGEDATIIFTVRHLSSKEVQDMYAKQGEMNDSDFITKIASGWNLEEEFNEENTRKLVQYYPSAAYNLTATYIKALAGHRAKN;ID=CPT-T1_015.CDS.1;Parent=CPT-T1_015.mRNA; +AY216660.2 GbkToGff gene 10516 11162 . + . locus_tag=CPT-T1_016;ID=CPT-T1_016.gene; +AY216660.2 GbkToGff mRNA 10516 11162 . + . locus_tag=CPT-T1_016;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_016.mRNA;Parent=CPT-T1_016.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 10516 10520 . + . locus_tag=CPT-T1_016;regulatory_class=ribosome_binding_site;ID=CPT-T1_016.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_016.mRNA; +AY216660.2 GbkToGff CDS 10528 10839 . + 0 locus_tag=CPT-T1_016;codon_start=1;transl_table=11;product=tape measure chaperone frameshift product;translation=MAKFNFVLGQLPDFKLPVTFTMPNGEDATIIFTVRHLSSKEVQDMYAKQGEMNDSDFITKIASGWNLEEEFNEENTRKLVQYYPSAAYNLTATYIKALAGHRAKKLKRAVYLLYQKPPTEEQLRSVGLSLSDYEDEEPETIIGDAEMVKAWNVFTSMLTQWRSSGAGAYGLDYNVLPMLFKIYKIEDEELALQDVRIMEAKALEMIAKQNN;note=Orf no. 
40 see PMID: 14972552;ID=CPT-T1_016.CDS.1;Parent=CPT-T1_016.mRNA; +AY216660.2 GbkToGff CDS 10839 11162 . + 0 locus_tag=CPT-T1_016;codon_start=1;transl_table=11;product=tape measure chaperone frameshift product;translation=MAKFNFVLGQLPDFKLPVTFTMPNGEDATIIFTVRHLSSKEVQDMYAKQGEMNDSDFITKIASGWNLEEEFNEENTRKLVQYYPSAAYNLTATYIKALAGHRAKKLKRAVYLLYQKPPTEEQLRSVGLSLSDYEDEEPETIIGDAEMVKAWNVFTSMLTQWRSSGAGAYGLDYNVLPMLFKIYKIEDEELALQDVRIMEAKALEMIAKQNN;note=Orf no. 40 see PMID: 14972552;ID=CPT-T1_016.CDS.1;Parent=CPT-T1_016.mRNA; +AY216660.2 GbkToGff gene 11192 14076 . + . locus_tag=CPT-T1_017;ID=CPT-T1_017.gene; +AY216660.2 GbkToGff mRNA 11192 14076 . + . locus_tag=CPT-T1_017;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_017.mRNA;Parent=CPT-T1_017.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 11192 11197 . + . locus_tag=CPT-T1_017;regulatory_class=ribosome_binding_site;ID=CPT-T1_017.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_017.mRNA; +AY216660.2 GbkToGff CDS 11203 14076 . + 0 locus_tag=CPT-T1_017;codon_start=1;transl_table=11;product=tape measure 
protein;translation=MVDKVAGLSLDVDVSTVQRAVKSLKEFSKANDQAADSMGSLINESEVAKQKAKEHAEQLRRQRKEYEAVEKAIDPTVSKMERLKIASQQLDKLWQQGVVPDETFFRLGEMLDLQNAKLARSRAMLTEEGQAALQEAKAKEQAAVRSKAFMDALNGQVNAIGKTHAELMELKAAELGLSKEAAPLIAKLKDQGRAMNAAGISAGEYRQAMRMLPAQITDVVTSLASGMPVWMVAIQQGGQIKDSFGGIGNTFKVLLSYINPVTAGVGVLVGSLGILAKAGYDSYKSITDIQNALIETGGYAGVTAEELDSVSKKIAQTSNSTIGSIREIVTELASSGKYTREQIQNITKATAEWSASTGKSASQIISEFEKIASDPVKGLKKLNEQYNFLEKGQLTYIDTLSRTKGETEAVSEATKLFADVMEKRMKSIADNATPLEKMWSDIKQWASDAWGWVGDHTLGALNLIIDVVQGTVIQVKMILAKGDEYISNFIASAIKATQSLPGMSDFGADVLKEQENIVKSSRDNYDQLASDLDAINARVEKGEMGYIEAMRQRRTLEKQYSEETKEAIRKEAEEIEKRNRERNKQSKIVRSPTEQFDKELISLRAQLKVLQEHKEIGQKLSAQRKALFTTEATIAVLREASSKRQLSAEEKALLASQERVIELAKQKAEIGDQIVKQQQLNDLTDKSLKFVNEMTAATEQLNASRGLSTRDMERQAELAKITTDYINSGGSEGDEKLQNMIKAQNDYYAAEDAKRADWLAGAESAFADYGDAAMDMYGNVNEIASSALNGMSDMMVQFLTTGKANFEDFAKNIIGMIIKMIAQMVIFNTISGMMGGKTWSFAGGASSGASAASQATPTPAASVFRSVSSGGAAVSLAAAAGSVATSGFNASNSAPKVVNHSGGGTVVDVSGMEVKVDNGSDPRGISQGVEMMFKKMIRESCSQGGEVYNYIQEKTGG;note=Orf no. 38 see PMID: 14972552;ID=CPT-T1_017.CDS.1;Parent=CPT-T1_017.mRNA; +AY216660.2 GbkToGff gene 14067 14432 . + . locus_tag=CPT-T1_018;ID=CPT-T1_018.gene; +AY216660.2 GbkToGff mRNA 14067 14432 . + . locus_tag=CPT-T1_018;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_018.mRNA;Parent=CPT-T1_018.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 14067 14072 . + . locus_tag=CPT-T1_018;regulatory_class=ribosome_binding_site;ID=CPT-T1_018.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_018.mRNA; +AY216660.2 GbkToGff CDS 14079 14432 . + 0 locus_tag=CPT-T1_018;note=similar to lambda tail tip protein M UniProt ID P03737%3B Orf no. 37 see PMID: 14972552;codon_start=1;transl_table=11;product=tail tip protein;translation=MATLDTFGWCTQVQGGGGSLTTTNSDRSIQFGNGYMQLASSGFNTTRREYSVVYAGEDFMAVYDFCNSHRIKPFAWTPPDGKIGIWVVKPNSLGAKPVSRDVMEINVTFMEQFTSME;ID=CPT-T1_018.CDS.1;Parent=CPT-T1_018.mRNA; +AY216660.2 GbkToGff gene 14500 15294 . + . 
locus_tag=CPT-T1_019;ID=CPT-T1_019.gene; +AY216660.2 GbkToGff mRNA 14500 15294 . + . locus_tag=CPT-T1_019;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_019.mRNA;Parent=CPT-T1_019.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 14500 14503 . + . locus_tag=CPT-T1_019;regulatory_class=ribosome_binding_site;ID=CPT-T1_019.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_019.mRNA; +AY216660.2 GbkToGff CDS 14512 15294 . + 0 locus_tag=CPT-T1_019;note=similar to lambda tail tip protein L UniProt ID P03738%3B Orf no. 36 see PMID: 14972552;codon_start=1;transl_table=11;product=tail tip protein;translation=MSENKKLYDEESGKSLFHNCLQSLYPGEIITLIEVDGSKFGAQVYRFHGENIQYTPEEIMQAQQTGTLPPKEITFRGEKYGARPFGISGISFDSSGKATKPQLTVANIDSRVSAMIRAYNGLMQAKVTIWITQRELINSDGSIADGAYRKLVYYIERPNYVDKSVARFDLTSPYDMDGIMIPSRLTQSVCYFAQRGWYKTGKGCGYNGQNGYFDKDNNPVDDPSLDFCPGTVTACRLRFGANNELDFGGCAVASLQRKNQ;ID=CPT-T1_019.CDS.1;Parent=CPT-T1_019.mRNA; +AY216660.2 GbkToGff gene 15279 16025 . + . locus_tag=CPT-T1_020;ID=CPT-T1_020.gene; +AY216660.2 GbkToGff mRNA 15279 16025 . + . locus_tag=CPT-T1_020;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_020.mRNA;Parent=CPT-T1_020.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 15279 15282 . + . locus_tag=CPT-T1_020;regulatory_class=ribosome_binding_site;ID=CPT-T1_020.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_020.mRNA; +AY216660.2 GbkToGff CDS 15291 16025 . + 0 locus_tag=CPT-T1_020;note=similar to lambda tail tip protein K UniProt ID P03729%3B Orf no. 35 see PMID: 14972552;codon_start=1;transl_table=11;product=tail tip protein;translation=MISAKIKLEIMTHAQEEYPRECCGVVTQKGRVQKYHRIDNVHRDPENHFMMDAVQYACIEDDAESTTIAIVHSHTGDGATTLPSAHDTCMCNEMEVTWIIVSVPEGDMRFVKPEKLPLIGRPWSLGSFDCYGLVMAWHKEHGVELRDRRLNFEWWKPEYGINLYQDYYKQDGFVEIPDQNNPSFGDMVIMQIGQNVPVWNHAGIYLGDNQILHHAFGKLSRRDIYSGWYQDHTVLIVRHKDLKL;ID=CPT-T1_020.CDS.1;Parent=CPT-T1_020.mRNA; +AY216660.2 GbkToGff gene 16009 16621 . + . 
locus_tag=CPT-T1_021;ID=CPT-T1_021.gene; +AY216660.2 GbkToGff mRNA 16009 16621 . + . locus_tag=CPT-T1_021;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_021.mRNA;Parent=CPT-T1_021.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 16009 16012 . + . locus_tag=CPT-T1_021;regulatory_class=ribosome_binding_site;ID=CPT-T1_021.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_021.mRNA; +AY216660.2 GbkToGff CDS 16022 16621 . + 0 locus_tag=CPT-T1_021;note=single transmembrane domain predicted N-in and C-out%3B similar to lambda tail tip assembly protein I UniProt ID P03730%3B Orf no. 34 see PMID: 14972552;codon_start=1;transl_table=11;product=tail assembly protein;translation=MNDVKVIKLSGSLGRRFGVFHRYAVDSYPEAIRALSSQVDGFKEYMQSEVGSRSKFAIFVDGVNVGHHEEEKFKCAKEIRIVPIPTGSKTGGLFQVVLGAAIMVAAFYTGGASLALMGTMSSSLFMMGGAMVLGGVMQMISPQPGGANFEVQSSKNKPSYAFGGAVNTTAAGYPLPVPYGYRAGGGATFSAGSYAEDMS;ID=CPT-T1_021.CDS.1;Parent=CPT-T1_021.mRNA; +AY216660.2 GbkToGff gene 16688 20217 . + . locus_tag=CPT-T1_022;ID=CPT-T1_022.gene; +AY216660.2 GbkToGff mRNA 16688 20217 . + . locus_tag=CPT-T1_022;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_022.mRNA;Parent=CPT-T1_022.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 16688 16691 . + . locus_tag=CPT-T1_022;regulatory_class=ribosome_binding_site;ID=CPT-T1_022.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_022.mRNA; +AY216660.2 GbkToGff CDS 16699 20217 . + 0 locus_tag=CPT-T1_022;note=similar to phage lambda tip attachment protein J UniProt ID P03749%3B Orf no. 
33 see PMID: 14972552;codon_start=1;transl_table=11;product=tail fiber;translation=MIQKVISGSKGGSQKPHNPVEMEDNLISINKIKILLAVSDGEIDETFSLKQLMFNSVPVQNEDGSFNFEGVKAEFRPGTQTQEYIKGMEDSSSEVTVNREVTTDNPYTISVTNKTLSAIRIKMFMPRGVRIESNGDKNGVRVEYEVQQAVDGGSFETVLTDVIEGKTMSGYDRSRRVNLPNFNNQVIFRVVRKTPDSNDSNVVDAIQVKSYAEVIDAKFRYPLTGLLFVEFDSKMFPNQLPTISIRKRWKIVNVPSNYDPESRTYNGNWDGTFKKAWTNNPAWVLYDLMINQRYGLDQKELGIAVDKWALYEAAQYCDQMVPDGKGGTEPRYLCDVIIQSQTDAYKVIRDICSIFRGMSFWNGESISVIIDRPREPAYIFTNDNVVNGDFSYTFASEKSMYTTCNVMFDDEQNMYQQDVEPVFDREATLRFGNNVTSITAIGCTRRSEANRRGRWILKTNLRSTTVNFATGLEGMIPTIGDVVAIADNFWSSNLTMNLSGRLLEVSGSQIFLPFRVDARAGDFIIVNKPDGKPVKRTISSVSADGKTIEVNIGFGFPVKPNTVFAIDRTDIALQQYVVTKIDKGDDDEEFTYKITAVEYDPNKYDEIDYGVNIDDRPTSIVEPDQIPRPKNVQVSSESRIVQGMSVETMIVSWDKVPYAVFYDVQWRKDNGNWQNVPQTANKEVYVEGIYAGNYQVRVRSVAGSGTTSGWSNIVAATLTGKQGEPGRPINLTATDDVVFGIRTKWGFSDGSGDTAYTELQQSPDGTVDNASLLSLIPYPQHEYYHSPMPGGNIVWYRVRTVDRIGNVSQWTDFVRGMASTNVDDIIGEISVDIENSPGYEWLVDNATDNAAQNSANAEAAIENALANDKDAIYMKKENGKRKAEYTKSLKLIADETQARVTAIEQLKASFGDQISASNSELREVIATETEALSREIDQLKAQIGDDIQASLTDIREVIATETEALSREIDQLKAQIGDDIQASLTDIREAIANETEARTQADLTLSARLGNNEAALAQKLDSWSNADSTGAMYGVKLGLKYNGQEYSAGMAMSLVGSGAAVKAQILFEASRFAIMTGMNGQTQYPFVVENGQVILSSAIIKNGFITNAMIGNFIQSNNYVFNQSGWRLDKGGTFENYGSDGEGAMKQTNTTISVRDASGRLRVQIGRLTGSW;ID=CPT-T1_022.CDS.1;Parent=CPT-T1_022.mRNA; +AY216660.2 GbkToGff gene 20251 20568 . + . locus_tag=CPT-T1_023;ID=CPT-T1_023.gene; +AY216660.2 GbkToGff mRNA 20251 20568 . + . locus_tag=CPT-T1_023;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_023.mRNA;Parent=CPT-T1_023.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 20251 20255 . + . locus_tag=CPT-T1_023;regulatory_class=ribosome_binding_site;ID=CPT-T1_023.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_023.mRNA; +AY216660.2 GbkToGff CDS 20263 20568 . + 0 locus_tag=CPT-T1_023;codon_start=1;transl_table=11;product=hypothetical protein;translation=MAYGISTWDANGVYNNYGIKPITVVGWNFLSAGQNSASFSYQVPPGMHVNYVISLDDGAISGPGRKIIASGNTITVTPTNSPGPNVYPSSNCYLIAYLEND;note=Orf no. 
32 see PMID: 14972552;ID=CPT-T1_023.CDS.1;Parent=CPT-T1_023.mRNA; +AY216660.2 GbkToGff gene 20556 21257 . + . locus_tag=CPT-T1_024;ID=CPT-T1_024.gene; +AY216660.2 GbkToGff mRNA 20556 21257 . + . locus_tag=CPT-T1_024;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_024.mRNA;Parent=CPT-T1_024.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 20556 20559 . + . locus_tag=CPT-T1_024;regulatory_class=ribosome_binding_site;ID=CPT-T1_024.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_024.mRNA; +AY216660.2 GbkToGff CDS 20568 21257 . + 0 locus_tag=CPT-T1_024;note=InterPro domain IPR013750%3B GHMP kinase domain- containing protein%3B Orf no. 31 see PMID: 14972552;codon_start=1;transl_table=11;product=hypothetical protein;translation=MSYGAFIDVNGNPFITPLSTPFALYARGEIQSVNVSGSQVAERYVRIPTGVPVIAFCKTTNTQQGTALSAFTFRSGPNVGTVYIRGTNPANQSYTLTYYIFAIFEQSLPRWGMAIWDASGKLVLTNETKVLSDLVTIGTPGYAGGGLNIDTTLSGSYAVVPTILGNYQVVIGRLPTGQPIIGNSTAGSSCRYNGSTTRINAAATTAAGQIMNTTNNGNIITAIKTAAYD;ID=CPT-T1_024.CDS.1;Parent=CPT-T1_024.mRNA; +AY216660.2 GbkToGff gene 21279 21518 . - . locus_tag=CPT-T1_025;ID=CPT-T1_025.gene; +AY216660.2 GbkToGff mRNA 21279 21518 . - . locus_tag=CPT-T1_025;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_025.mRNA;Parent=CPT-T1_025.gene; +AY216660.2 GbkToGff CDS 21279 21506 . - 0 locus_tag=CPT-T1_025;codon_start=1;transl_table=11;product=hypothetical protein;translation=MKKLITIIAAAFILTGCSSMPERTCTAIYESGGAEYSVYVFGSKMRGKEMVLRAGYPFSFNYVSEKNFKSHDCSI;note=Orf no. 30 see PMID: 14972552;ID=CPT-T1_025.CDS.1;Parent=CPT-T1_025.mRNA; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 21515 21518 . - . locus_tag=CPT-T1_025;regulatory_class=ribosome_binding_site;ID=CPT-T1_025.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_025.mRNA; +AY216660.2 GbkToGff gene 21755 21906 . + . locus_tag=CPT-T1_026;ID=CPT-T1_026.gene; +AY216660.2 GbkToGff mRNA 21755 21906 . + . 
locus_tag=CPT-T1_026;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_026.mRNA;Parent=CPT-T1_026.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 21755 21758 . + . locus_tag=CPT-T1_026;regulatory_class=ribosome_binding_site;ID=CPT-T1_026.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_026.mRNA; +AY216660.2 GbkToGff CDS 21766 21906 . + 0 locus_tag=CPT-T1_026;codon_start=1;transl_table=11;product=hypothetical protein;translation=MIRQISIMYVQNLINLDSICRYLCISNKKRLQVLRNRQRIKIYLSH;ID=CPT-T1_026.CDS.1;Parent=CPT-T1_026.mRNA; +AY216660.2 GbkToGff gene 22020 23098 . + . locus_tag=CPT-T1_027;ID=CPT-T1_027.gene; +AY216660.2 GbkToGff mRNA 22020 23098 . + . locus_tag=CPT-T1_027;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_027.mRNA;Parent=CPT-T1_027.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 22020 22024 . + . locus_tag=CPT-T1_027;regulatory_class=ribosome_binding_site;ID=CPT-T1_027.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_027.mRNA; +AY216660.2 GbkToGff CDS 22034 23098 . + 0 locus_tag=CPT-T1_027;codon_start=1;transl_table=11;product=exodeoxyribonuclease VIII;translation=MFQVFTSSQLSNDEYHRNEGWASEYVSGSSLAEIYQTCPANWRFKKNETTKALEFGTQSHTNFESRDLFTATYARCPAPSEFKDLITSQAALAAKLKSFGLKGTSGKQYPDLIKMMVDCGEELNVQYLIELIAEAEARAEGKQLVDADKYDACMKMRAILEQNPDHEACINSETAQREISIFGEISGVKVKVRLDHLDYKENVPGRVLTGYDENGDPVFEDVIFPEALIITDFKTTMSANPLEFPRLAYNHGYYLKMALQHDLLRRAIQAGAFEGNFPEDIPIVVRLLAQEKKEPYIALAYRMTMEQIRIGRNQYISVVHTYKACSEMDVWPGYAGDASEIELETPSWVRYQNK;note=Orf no. 29 see PMID: 14972552;ID=CPT-T1_027.CDS.1;Parent=CPT-T1_027.mRNA; +AY216660.2 GbkToGff gene 23129 23820 . + . locus_tag=CPT-T1_028;ID=CPT-T1_028.gene; +AY216660.2 GbkToGff mRNA 23129 23820 . + . locus_tag=CPT-T1_028;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_028.mRNA;Parent=CPT-T1_028.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 23129 23133 . + . 
locus_tag=CPT-T1_028;regulatory_class=ribosome_binding_site;ID=CPT-T1_028.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_028.mRNA; +AY216660.2 GbkToGff CDS 23140 23820 . + 0 locus_tag=CPT-T1_028;note=InterPro domain IPR007499%3B Orf no. 28 see PMID: 14972552;codon_start=1;transl_table=11;product=recombinase;translation=MHLIHQSGEVKMQLSPETNEILPALFNARNKFAKAKKDAKNNHLKNSYATLDAMMAAVSPALTDNDIMILQSMLDTSTETTFHLETMLIHKSGQWAKFFMMMPIAKRDPQGVGSAMTYARRYSLAAALGISQSDDDAQLAVKSVKDWKKELDACEDIESLKDVWANAYRQTDTASKSIIQDHYNALKAKFEIGKARGIRPAQPEQKKQVEATSAKPVQSQSITNFE;ID=CPT-T1_028.CDS.1;Parent=CPT-T1_028.mRNA; +AY216660.2 GbkToGff gene 23854 24289 . + . locus_tag=CPT-T1_029;ID=CPT-T1_029.gene; +AY216660.2 GbkToGff mRNA 23854 24289 . + . locus_tag=CPT-T1_029;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_029.mRNA;Parent=CPT-T1_029.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 23854 23858 . + . locus_tag=CPT-T1_029;regulatory_class=ribosome_binding_site;ID=CPT-T1_029.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_029.mRNA; +AY216660.2 GbkToGff CDS 23867 24289 . + 0 locus_tag=CPT-T1_029;note=InterPro domain IPR012340,Orf no. 27 see PMID: 14972552;codon_start=1;transl_table=11;product=single-stranded DNA-binding protein;translation=MHIITGEIRKEPKILERNGGNTYIIELAESYKPRDGDREYTNYTFFFSDGGKPGLADWYREAFQVGRVISVSCETLKISSREHNGMIYNSLQAADFPKLVFSQRGQSNQQQRAPQQQQRSQQQSQPQPNQQSTFDDDIPF;ID=CPT-T1_029.CDS.1;Parent=CPT-T1_029.mRNA; +AY216660.2 GbkToGff gene 24351 26550 . - . locus_tag=CPT-T1_030;ID=CPT-T1_030.gene; +AY216660.2 GbkToGff mRNA 24351 26550 . - . locus_tag=CPT-T1_030;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_030.mRNA;Parent=CPT-T1_030.gene; +AY216660.2 GbkToGff CDS 24351 26537 . 
- 0 locus_tag=CPT-T1_030;codon_start=1;transl_table=11;product=hypothetical protein;translation=MALYRRGTASMDADGTVHGTDTKWKDQLALIRVGATIVFLEQPIKLAVISDIVSDTELKAISTDGQTAADGKYVILLNDSLTVNGLAQNVAETLRYYQSKETEIASALDIIADLDMDNLNNIVQEIKSNKSAAEAAQNQAELARDSANSARDESISIKNQTQQISDSAIGSINAAKDKAITNVQQKENSAVTHINSEEAAAIQAINDAKGDLSGYVNDAQTAAQTATSAKNDAQAARDAAVSAKDAAAVSAQEAQDAANSVNADNLLTKDGNLSGLADKEQSKKNLAVNRLNQPRGDLTEIYSNDDRTGFKLIVKDSGDWGAMTHDGSENKALGVNFGGTGGTTEEQARTSLKVYKLDRTNLGEKHLDSITGEGDGPGIYMQSSSALATASRGYPEATAGMLEVLPNGANGASACIQRFTPFTYLGTAPESGNSQNEYARAGRGTFYIRMKNGNNAKFSPWIPFQASSSGNVVSSPASNEKSSWVDYVNALSSQPSSLASYNVNSVGWVTAISVRHRNGQGDGSAFGFVIEDASMTSPHYKDVRLRKQTGAGQWQSTQVIWNTGNTTVDSNGFIKRASPIVDIFGNGSHRTNDESEGCTVERISTGEYLIRGCLSLNSDLAWGGVNGGIEIPKDINGQPILWVDYDVNPDGSLVIKTYHRTHDNAPSFARNHKDGYSDGDPIDIPSDVFVSVRVEMPNDSIYNKKVEECKRNHERMVSGEFVESLKNT;note=Orf no. 26 see PMID: 14972552;ID=CPT-T1_030.CDS.1;Parent=CPT-T1_030.mRNA; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 26546 26550 . - . locus_tag=CPT-T1_030;regulatory_class=ribosome_binding_site;ID=CPT-T1_030.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_030.mRNA; +AY216660.2 GbkToGff gene 26447 26600 . - . locus_tag=CPT-T1_031;ID=CPT-T1_031.gene; +AY216660.2 GbkToGff mRNA 26447 26600 . - . locus_tag=CPT-T1_031;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_031.mRNA;Parent=CPT-T1_031.gene; +AY216660.2 GbkToGff CDS 26447 26590 . - 0 locus_tag=CPT-T1_031;codon_start=1;transl_table=11;product=hypothetical protein;translation=MIKSLLSNSCYLLKEFILWLYIDAVLHQWMQTVRFTEPIQNGKISLL;note=Orf no. 25 see PMID: 14972552;ID=CPT-T1_031.CDS.1;Parent=CPT-T1_031.mRNA; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 26597 26600 . - . locus_tag=CPT-T1_031;regulatory_class=ribosome_binding_site;ID=CPT-T1_031.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_031.mRNA; +AY216660.2 GbkToGff gene 26638 27585 . - . locus_tag=CPT-T1_032;ID=CPT-T1_032.gene; +AY216660.2 GbkToGff mRNA 26638 27585 . - . 
locus_tag=CPT-T1_032;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_032.mRNA;Parent=CPT-T1_032.gene; +AY216660.2 GbkToGff CDS 26638 27573 . - 0 locus_tag=CPT-T1_032;codon_start=1;transl_table=11;product=DNA primase;translation=MNEEFMMFQKEDVLPYMKGLWREAFQSICGLPNNVFNKKHQPCPNCGGKDRFRWTDNLNTPGDGGAICNSCGNDSGIGWLMKLTGMPYSECVNILGRFLGKVPQEYIVKANKKARRTPVSGVNVMMAEHEAVMKVMERTEKRVNTPLSVFESLPTESFDVGIKRSEDGRESVFHTIPCQLVHEDGLDDEFCNILIIDEEGRESFYAKKYTSCSVAVTGKTEKAIYLCLNWIDAQHIAFHTKQEVWACFTPENLEMVAYRYKGDREVRVACEPSDKDTLYMADDRQLKIIIPNPGGYRSGMQAKLFSASDLL;note=alternative start codon to Orf no. 24 see PMID: 14972552;ID=CPT-T1_032.CDS.1;Parent=CPT-T1_032.mRNA; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 27582 27585 . - . locus_tag=CPT-T1_032;regulatory_class=ribosome_binding_site;ID=CPT-T1_032.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_032.mRNA; +AY216660.2 GbkToGff gene 27635 28098 . - . locus_tag=CPT-T1_033;ID=CPT-T1_033.gene; +AY216660.2 GbkToGff mRNA 27635 28098 . - . locus_tag=CPT-T1_033;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_033.mRNA;Parent=CPT-T1_033.gene; +AY216660.2 GbkToGff CDS 27635 28087 . - 0 locus_tag=CPT-T1_033;note=HHPred predicted structural similarity at 97%25 probability to phage P22 repression protein C2 Protein Data Bank entry 2R1J over most of protein and phage lambda repressor entry 3BDN%3BOrf no. 23 see PMID: 14972552;codon_start=1;transl_table=11;product=transcriptional regulator;translation=MSIQRIAESTGEIDKRHINGNNGTRRGKDKKPRQRCGFYIHKEETRAGLRARLDALIEYYGGPAACAKALKVSNQTVQGWKERNMISWQGAEAAHRAYRRQGCKGFRAAWLRFDLKFDGNGKCLEKRCKNKKFMRVVKREDIGTTNSIFS;ID=CPT-T1_033.CDS.1;Parent=CPT-T1_033.mRNA; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 28095 28098 . - . locus_tag=CPT-T1_033;regulatory_class=ribosome_binding_site;ID=CPT-T1_033.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_033.mRNA; +AY216660.2 GbkToGff gene 28168 30199 . + . locus_tag=CPT-T1_034;ID=CPT-T1_034.gene; +AY216660.2 GbkToGff mRNA 28168 30199 . + . 
locus_tag=CPT-T1_034;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_034.mRNA;Parent=CPT-T1_034.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 28168 28171 . + . locus_tag=CPT-T1_034;regulatory_class=ribosome_binding_site;ID=CPT-T1_034.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_034.mRNA; +AY216660.2 GbkToGff CDS 28181 30199 . + 0 locus_tag=CPT-T1_034;codon_start=1;transl_table=11;product=DNA helicase;translation=MTMNIKKQIALLGDDYIKRTQERFTVGEVVPYPYQVVAYAEIAKRLSNYEHPFFVKASVSAGKTIIFAMVAKQCQKMGLKMLVLARQGEIVDQDSEEIDNFGVTNSIFSASLGIKSCYFPIVVGSEGTVANGLDNELADFVPHVIGIDECHQVDWEDLAQAIEGKETMEQMRGEKGKIIMDGDIPLIGNDGKPLLGTKRSQYTIVIMEMMRRCKKVHGHDLRIFGMTGSEFRGVVPILVENPKALGFWRERVTDIDTNYLIEFGSVVPTIFGSTDGVHYDLDKFKASSEDGVQDFTEKDMKAMEDEILHDKSLTQRIMQMVAKKAEERNAVLITCAGVRHCKEAAAALPPGSTYAIITGDTDNKARKKILDDVRAGKIKYTFQVMALTTGVNVPNWDFSVILRKIGSLTLLIQLLGRGMRLLKSWQVAEGMVKQDHLVWDFAGTMDELGQLYFDPILEQAQFQKRFENGKDPKTCPKCGCVNSFYARRCVNVIDGERCDHFWTSQICEDQVDERTGKILVKGCGAENDVVARVCRCCDASLVDPNLKLSGKAYTKNDWYEVKNFEVTLTKNQKGIIYKYTLINDDGDEFKAYEKFFPESDSKICGTLWKTKGVLPHVSDPKMRRYFIGMKNAIKILQYSHHIAHPVRVTHRRNQKKEDIISRKDFGMEDIPE;note=Orf no. 22 see PMID: 14972552;ID=CPT-T1_034.CDS.1;Parent=CPT-T1_034.mRNA; +AY216660.2 GbkToGff gene 30181 30612 . + . locus_tag=CPT-T1_035;ID=CPT-T1_035.gene; +AY216660.2 GbkToGff mRNA 30181 30612 . + . locus_tag=CPT-T1_035;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_035.mRNA;Parent=CPT-T1_035.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 30181 30185 . + . locus_tag=CPT-T1_035;regulatory_class=ribosome_binding_site;ID=CPT-T1_035.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_035.mRNA; +AY216660.2 GbkToGff CDS 30196 30612 . + 0 locus_tag=CPT-T1_035;note=HHPred predicted structural similarity at 97%25 probability to B. subtilis recombination protein U/resolvase Protein Data Bank entry 1ZP7 over most of protein%3B Orf no. 
21 see PMID: 14972552;codon_start=1;transl_table=11;product=Holliday junction resolvase;translation=MITDKGDYLEFYERDTSDTRKEDAHQVDCVSWLKYNFPHLLFWHTVNEGEKTITSALRDEQAGLLKGVSDFVILIGVNSRYPFAAIELKRVNKSGKGKASPVSDKQREFLQKVRERGGFSAVAYGFGQFKIAIYEMMK;ID=CPT-T1_035.CDS.1;Parent=CPT-T1_035.mRNA; +AY216660.2 GbkToGff gene 30669 31393 . + . locus_tag=CPT-T1_036;ID=CPT-T1_036.gene; +AY216660.2 GbkToGff mRNA 30669 31393 . + . locus_tag=CPT-T1_036;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_036.mRNA;Parent=CPT-T1_036.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 30669 30672 . + . locus_tag=CPT-T1_036;regulatory_class=ribosome_binding_site;ID=CPT-T1_036.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_036.mRNA; +AY216660.2 GbkToGff CDS 30680 31393 . + 0 locus_tag=CPT-T1_036;codon_start=1;transl_table=11;product=DNA adenine methyltransferase;translation=MKDFNDIETIDFAETGCSFTREAIASGGYYQALKTPTCKEISGRRYKGTNTPDAVRDLWSTPREVIAYLEGRYGKYDLDAAASEENKVCEKFYSQETNCLKRWWGKNKHVWLNPPYSRPDIFVKKAIEQMEHNNQIDMLLPADNSTAWFTEARQNAAEIIWIEADLTEDIDGNEYARSGRLAFISGETGKAVDGNNKGSVIFIMRELKEGEVQQTHYIPITSICPSVKNKRAKVRKV;note=Orf no. 20 see PMID: 14972552;ID=CPT-T1_036.CDS.1;Parent=CPT-T1_036.mRNA; +AY216660.2 GbkToGff gene 31377 31641 . + . locus_tag=CPT-T1_037;ID=CPT-T1_037.gene; +AY216660.2 GbkToGff mRNA 31377 31641 . + . locus_tag=CPT-T1_037;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_037.mRNA;Parent=CPT-T1_037.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 31377 31380 . + . locus_tag=CPT-T1_037;regulatory_class=ribosome_binding_site;ID=CPT-T1_037.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_037.mRNA; +AY216660.2 GbkToGff CDS 31393 31641 . + 0 locus_tag=CPT-T1_037;codon_start=1;transl_table=11;product=hypothetical protein;translation=MSEKMVPVKLTEQGLWLLYRATCCEIMERNGLTQDVIGCDLWEFTSSLDMSFDEIKNEYIENWPSIIQKDVEELKADTIVQH;note=alternative start codon to Orf no. 
19 see PMID: 14972552;ID=CPT-T1_037.CDS.1;Parent=CPT-T1_037.mRNA; +AY216660.2 GbkToGff gene 31695 31917 . + . locus_tag=CPT-T1_038;ID=CPT-T1_038.gene; +AY216660.2 GbkToGff mRNA 31695 31917 . + . locus_tag=CPT-T1_038;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_038.mRNA;Parent=CPT-T1_038.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 31695 31698 . + . locus_tag=CPT-T1_038;regulatory_class=ribosome_binding_site;ID=CPT-T1_038.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_038.mRNA; +AY216660.2 GbkToGff CDS 31708 31917 . + 0 locus_tag=CPT-T1_038;codon_start=1;transl_table=11;product=hypothetical protein;translation=MARINANFFNIAQQSAKMAVHITNKQGGNFDWDIAMNFLKMSYYRCSVEEVEGFISDVEKLTNADKKAR;note=Orf no. 18 see PMID: 14972552;ID=CPT-T1_038.CDS.1;Parent=CPT-T1_038.mRNA; +AY216660.2 GbkToGff gene 31881 32167 . + . locus_tag=CPT-T1_039;ID=CPT-T1_039.gene; +AY216660.2 GbkToGff mRNA 31881 32167 . + . locus_tag=CPT-T1_039;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_039.mRNA;Parent=CPT-T1_039.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 31881 31884 . + . locus_tag=CPT-T1_039;regulatory_class=ribosome_binding_site;ID=CPT-T1_039.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_039.mRNA; +AY216660.2 GbkToGff CDS 31895 32167 . + 0 locus_tag=CPT-T1_039;codon_start=1;transl_table=11;product=hypothetical protein;translation=MLIKKQGKREVWEHAKECGISDDIALIAKYFDIKDVSIISNGKISFMEGMPRKMQRVPATPSLEFYREEGKRIERERKSTKNGKSSRLKY;ID=CPT-T1_039.CDS.1;Parent=CPT-T1_039.mRNA; +AY216660.2 GbkToGff gene 32099 32399 . + . locus_tag=CPT-T1_040;ID=CPT-T1_040.gene; +AY216660.2 GbkToGff mRNA 32099 32399 . + . locus_tag=CPT-T1_040;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_040.mRNA;Parent=CPT-T1_040.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 32099 32102 . + . 
locus_tag=CPT-T1_040;regulatory_class=ribosome_binding_site;ID=CPT-T1_040.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_040.mRNA; +AY216660.2 GbkToGff CDS 32112 32399 . + 0 locus_tag=CPT-T1_040;codon_start=1;transl_table=11;product=hypothetical protein;translation=MSEKENPQKTASLPGLNINADEYQAIWIGKKQVKQIPFSDWLPPDFVNVLCTIGIEQELHIGYYSPGRNSMMLEVDGKLVEFKSSDLGFWLKAVA;note=Orf no. 17 see PMID: 14972552;ID=CPT-T1_040.CDS.1;Parent=CPT-T1_040.mRNA; +AY216660.2 GbkToGff gene 32467 33611 . + . locus_tag=CPT-T1_041;ID=CPT-T1_041.gene; +AY216660.2 GbkToGff mRNA 32467 33611 . + . locus_tag=CPT-T1_041;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_041.mRNA;Parent=CPT-T1_041.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 32467 32470 . + . locus_tag=CPT-T1_041;regulatory_class=ribosome_binding_site;ID=CPT-T1_041.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_041.mRNA; +AY216660.2 GbkToGff CDS 32478 33611 . + 0 locus_tag=CPT-T1_041;note=HHPred predicted structural similarity at 99%25 probability to E. coli exonuclease SbcD Protein Data Bank entry 4LTY over two thirds of protein%3B Orf no. 16 see PMID: 14972552;codon_start=1;transl_table=11;product=exonuclease;translation=MSQAKITTEQLIEERMSGLTLREIAEKYGMHIRTVEARHAKLAKEGHFHGNEHVAKMVPEGFMVKGTSTMIDAEGNEKIRWVKTSVDNERLEVLMEKAREAFCSELPKAIPSESPDVSFDEDTLAMYPVFDLHIGALAHKHECGENYDTATAEKVMNGFFDYAVDKAPNSKNAVLVLGGDFLHYDSLESKTPASGHYLDSDSRYAKLVYVAIRSVRRAVSRMLEKHQVIDIKAISGNHDESGMVWLRAALAAFYEDEPRVNVDVSPAAMMMTSFGKTLIGYTHGHQMRKADTRLSVMATDFRKLFGQSDYVYTHSGHWHSQKITETNLGIDEVHGQLGSPDAYSANGGWRSQRQAAVIVYHKEFGEVGRFICRPEMF;ID=CPT-T1_041.CDS.1;Parent=CPT-T1_041.mRNA; +AY216660.2 GbkToGff gene 33673 34166 . + . locus_tag=CPT-T1_042;ID=CPT-T1_042.gene; +AY216660.2 GbkToGff mRNA 33673 34166 . + . locus_tag=CPT-T1_042;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_042.mRNA;Parent=CPT-T1_042.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 33673 33677 . + . 
locus_tag=CPT-T1_042;regulatory_class=ribosome_binding_site;ID=CPT-T1_042.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_042.mRNA; +AY216660.2 GbkToGff CDS 33684 34166 . + 0 locus_tag=CPT-T1_042;note=similar to phage T7 protein 3.8%3B InterPro domain IPR003615%3B Orf no. 15 see PMID: 14972552;codon_start=1;transl_table=11;product=HNH endonuclease;translation=MNWHEHYEYRDGVLYHKVKPCRRHDVNIGDVAGRVAKNGYHYVVHKNRPYKRSRVIWEMFNGEIPDGFVIDHLNHNATDDRIDNLECKPRRENMVNVKLRIDSTTGVTGVSRKRDNKWRAYITIMGKQKCKSFDTFEEACAQRIEWSVTHDFHPNHGGTY;ID=CPT-T1_042.CDS.1;Parent=CPT-T1_042.mRNA; +AY216660.2 GbkToGff gene 34226 34415 . + . locus_tag=CPT-T1_043;ID=CPT-T1_043.gene; +AY216660.2 GbkToGff mRNA 34226 34415 . + . locus_tag=CPT-T1_043;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_043.mRNA;Parent=CPT-T1_043.gene; +AY216660.2 GbkToGff CDS 34239 34415 . + 0 locus_tag=CPT-T1_043;codon_start=1;transl_table=11;product=hypothetical protein;translation=MKIVKCIRNDSKTLPFRVNQIYSVGYDFGGGLFEIYDGRGSAIQTPLNGHYLEFIEID;note=Orf no. 14 see PMID: 14972552;ID=CPT-T1_043.CDS.1;Parent=CPT-T1_043.mRNA; +AY216660.2 GbkToGff gene 34525 34749 . + . locus_tag=CPT-T1_044;ID=CPT-T1_044.gene; +AY216660.2 GbkToGff mRNA 34525 34749 . + . locus_tag=CPT-T1_044;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_044.mRNA;Parent=CPT-T1_044.gene; +AY216660.2 GbkToGff CDS 34534 34749 . + 0 locus_tag=CPT-T1_044;note=1 transmembrane domain%2C predicted N-out and C-in%3B Orf no. 13 see PMID: 14972552;codon_start=1;transl_table=11;product=pinholin class 2;translation=MKEFLTAATSSTGGASLVGAATGQLYIAGATFICFLLFGAWGAYWKYRDSKAIQEALNDGDLNKALKIRGR;ID=CPT-T1_044.CDS.1;Parent=CPT-T1_044.mRNA; +AY216660.2 GbkToGff gene 34739 35237 . + . locus_tag=CPT-T1_045;ID=CPT-T1_045.gene; +AY216660.2 GbkToGff mRNA 34739 35237 . + . 
locus_tag=CPT-T1_045;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_045.mRNA;Parent=CPT-T1_045.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 34739 34743 . + . locus_tag=CPT-T1_045;regulatory_class=ribosome_binding_site;ID=CPT-T1_045.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_045.mRNA; +AY216660.2 GbkToGff CDS 34749 35237 . + 0 locus_tag=CPT-T1_045;codon_start=1;transl_table=11;product=SAR endolysin;translation=MSLKNNVIGASIGAALTLTPTLLERIEGIEYEVYYDIAGVPTVCSGITGPDVIPGKKYTKRECDALLIKHIGVAQRYVDKKVKVDIPVTMRASLYSFTFNVGTGAFGSSTMLKLINQRKHKEACNQLWRWVYYYNPKTKKREVSRGIKNRRAEEYAYCVKEL;note=Orf no. 12 see PMID: 14972552;ID=CPT-T1_045.CDS.1;Parent=CPT-T1_045.mRNA; +AY216660.2 GbkToGff gene 35227 35638 . + . locus_tag=CPT-T1_046;ID=CPT-T1_046.gene; +AY216660.2 GbkToGff mRNA 35227 35638 . + . locus_tag=CPT-T1_046;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_046.mRNA;Parent=CPT-T1_046.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 35227 35230 . + . locus_tag=CPT-T1_046;regulatory_class=ribosome_binding_site;ID=CPT-T1_046.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_046.mRNA; +AY216660.2 GbkToGff CDS 35237 35638 . + 0 locus_tag=CPT-T1_046;note=PMID 30135120 shows molecular function of unimolecular spanin%3B Orf no. 11 see PMID: 14972552;codon_start=1;transl_table=11;product=u-spanin;translation=MKLKKTCIAITVAVGVISLSGCSTASALSGLLSDSPDVTAQVGAENTKQLAGVTAKADDKREVKVSDSNIGKIDSSVKKSVEVSTIQANTVNAESITVTKSGSWYDPVVCWILVFIVLLLFYFLIRKHEKKEA;ID=CPT-T1_046.CDS.1;Parent=CPT-T1_046.mRNA; +AY216660.2 GbkToGff gene 35761 36179 . - . locus_tag=CPT-T1_047;ID=CPT-T1_047.gene; +AY216660.2 GbkToGff mRNA 35761 36179 . - . locus_tag=CPT-T1_047;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_047.mRNA;Parent=CPT-T1_047.gene; +AY216660.2 GbkToGff CDS 35761 36168 . - 0 locus_tag=CPT-T1_047;note=HHPred predicted structural similarity at 88%25 probability to S. 
epidermidis RipR transcriptional regulator Protein Data Bank entry 3IWF over most of protein%3B Orf no. 10 see PMID: 14972552;codon_start=1;transl_table=11;product=HTH domain-containing protein;translation=MLLLLDLFRFCEGYDKYTRQHIAKFIYAHKESERFAKAAGMTRREFTSALSKEFCARCVTEGYLDCKGGFYWCKGKIKRPVMMKLMCIDGYNNRYTWEMMHIGEMSDEDLFGERRNIDRSERRIVRKAPAYERRI;ID=CPT-T1_047.CDS.1;Parent=CPT-T1_047.mRNA; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 36176 36179 . - . locus_tag=CPT-T1_047;regulatory_class=ribosome_binding_site;ID=CPT-T1_047.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_047.mRNA; +AY216660.2 GbkToGff gene 36173 37755 . - . locus_tag=CPT-T1_048;ID=CPT-T1_048.gene; +AY216660.2 GbkToGff mRNA 36173 37755 . - . locus_tag=CPT-T1_048;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_048.mRNA;Parent=CPT-T1_048.gene; +AY216660.2 GbkToGff CDS 36173 37741 . - 0 locus_tag=CPT-T1_048;note=HHPred predicted structural similarity at 95%25 probability to E. coli AAA ATPase ravA Protein Data Bank entry 3NBX over AAA domain of protein%3B Orf no. 9 see PMID: 14972552;codon_start=1;transl_table=11;product=putative ATPase;translation=MFNIKPKLNYQQIIEIANSTGVNPVAIAIRENSYGDSVSFWQDPIDINSGNDKFPLISLGGDNLVFEYAKAKAESVQFPVSSAYAHFIGCISAAMLGKFWVQYHGEEQPTALYMVISQPPSTGKSAINSAAITPMRAEIQRLNEERKKERIRLTSQLRQIEKEIKNDPKGNTTAALYEDKEKLEEKIKKMADIVFAVSDPTPEGLAKVAAVQGHFSVISDEATAINTLLGLTYGGSDKKSNSELILKAWDKNHMEVARSNQDNNLSLCPVGSICVIAQDETIKGIMDAGQRGIGVSERFLLVREEPLLGTRILCDENGDALYKEVDRGLVSKYYRLVHNIMKEDNVVLSVSRNAMRELNLARQAMEPDFAAGGKYSHSMLRGHLGKFDKHALRIASVLHTIKNWEGESPNRSNREIDLETMQEAIMIFNELSRTYLSSASAAGYAGDEAESRKLIDVITEIAKKNKGRAPIHSIVAKCRNVTPFNGQQKVAERIDSLLITLEEMNYTCRIDDIVFINPRLMG;ID=CPT-T1_048.CDS.1;Parent=CPT-T1_048.mRNA; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 37751 37755 . - . locus_tag=CPT-T1_048;regulatory_class=ribosome_binding_site;ID=CPT-T1_048.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_048.mRNA; +AY216660.2 GbkToGff gene 37741 38294 . - . 
locus_tag=CPT-T1_049;ID=CPT-T1_049.gene; +AY216660.2 GbkToGff mRNA 37741 38294 . - . locus_tag=CPT-T1_049;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_049.mRNA;Parent=CPT-T1_049.gene; +AY216660.2 GbkToGff CDS 37741 38280 . - 0 locus_tag=CPT-T1_049;note=InterPro domain IPR003615%3B similar to phage T7 protein 3.8%3B Orf no. 8 see PMID: 14972552;codon_start=1;transl_table=11;product=HNH endonuclease;translation=MKDELKYVSGRLYWKEWRIGRRRNLLAGTVNKKGYRSICFPGGVFEYAHRIVWKIHYGNIPEGMDVDHINHERDDNRIENLRLVTRQDNLRNKGVVSSNTGVMGVYWNKKTNRYTANITINKKTKHLGTFMTLDAAAKARKEAERLYGFHENHGSNSTFCKTRVPLTVYHSRRQLRSLL;ID=CPT-T1_049.CDS.1;Parent=CPT-T1_049.mRNA; +AY216660.2 GbkToGff gene 38277 38706 . - . locus_tag=CPT-T1_050;ID=CPT-T1_050.gene; +AY216660.2 GbkToGff mRNA 38277 38706 . - . locus_tag=CPT-T1_050;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_050.mRNA;Parent=CPT-T1_050.gene; +AY216660.2 GbkToGff CDS 38277 38693 . - 0 locus_tag=CPT-T1_050;note=HHPred predicted structural similarity at 88%25 probability to S. epidermidis RipR transcriptional regulator Protein Data Bank entry 3IWF over most of protein%3B Orf no. 7 see PMID: 14972552;codon_start=1;transl_table=11;product=hypothetical protein;translation=MKSIKLKCTSADKITGFEVNNLYKGRERYDDTREVKLKCGKYLKLEKHDELHIHGSDEIFFAKFTELKTKTLKCTGLDHRNPMKKSFKVGKRYQVESGRALGGVAGYIFDEDGCRWTLFREEVGFSIADGTTFESKYL;ID=CPT-T1_050.CDS.1;Parent=CPT-T1_050.mRNA; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 38702 38706 . - . locus_tag=CPT-T1_050;regulatory_class=ribosome_binding_site;ID=CPT-T1_050.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_050.mRNA; +AY216660.2 GbkToGff gene 38774 38994 . - . locus_tag=CPT-T1_051;ID=CPT-T1_051.gene; +AY216660.2 GbkToGff mRNA 38774 38994 . - . locus_tag=CPT-T1_051;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_051.mRNA;Parent=CPT-T1_051.gene; +AY216660.2 GbkToGff CDS 38774 38983 . 
- 0 locus_tag=CPT-T1_051;codon_start=1;transl_table=11;product=hypothetical protein;translation=MEQDNFWTRYFAALDAGLSSEWCIKVAYKEITLDEALGDMDMDAESEYDPNFELPGDDINEDVDDYIPW;note=Orf no. 6 see PMID: 14972552;ID=CPT-T1_051.CDS.1;Parent=CPT-T1_051.mRNA; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 38991 38994 . - . locus_tag=CPT-T1_051;regulatory_class=ribosome_binding_site;ID=CPT-T1_051.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_051.mRNA; +AY216660.2 GbkToGff gene 38987 39222 . - . locus_tag=CPT-T1_052;ID=CPT-T1_052.gene; +AY216660.2 GbkToGff mRNA 38987 39222 . - . locus_tag=CPT-T1_052;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_052.mRNA;Parent=CPT-T1_052.gene; +AY216660.2 GbkToGff CDS 38987 39211 . - 0 locus_tag=CPT-T1_052;codon_start=1;transl_table=11;product=hypothetical protein;translation=MSIKVENIIKHLNAKGRVFIKMDKSSGFISMTVTKTRNGNSVIGSVPGSRLINATDADVRATLEANSIYINSWG;note=Orf no. 5 see PMID: 14972552;ID=CPT-T1_052.CDS.1;Parent=CPT-T1_052.mRNA; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 39219 39222 . - . locus_tag=CPT-T1_052;regulatory_class=ribosome_binding_site;ID=CPT-T1_052.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_052.mRNA; +AY216660.2 GbkToGff gene 39289 39444 . - . locus_tag=CPT-T1_053;ID=CPT-T1_053.gene; +AY216660.2 GbkToGff mRNA 39289 39444 . - . locus_tag=CPT-T1_053;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_053.mRNA;Parent=CPT-T1_053.gene; +AY216660.2 GbkToGff CDS 39289 39432 . - 0 locus_tag=CPT-T1_053;codon_start=1;transl_table=11;product=hypothetical protein;translation=MIYVHTFYTGKFNSVKNVRVYDSRQKAMMQKVVLGGTIKECKVISEC;note=Orf no. 4 see PMID: 14972552;ID=CPT-T1_053.CDS.1;Parent=CPT-T1_053.mRNA; +AY216660.2 GbkToGff gene 39429 39758 . - . locus_tag=CPT-T1_054;ID=CPT-T1_054.gene; +AY216660.2 GbkToGff mRNA 39429 39758 . - . locus_tag=CPT-T1_054;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_054.mRNA;Parent=CPT-T1_054.gene; +AY216660.2 GbkToGff CDS 39429 39749 . 
- 0 locus_tag=CPT-T1_054;codon_start=1;transl_table=11;product=hypothetical protein;translation=MNHTYKITTKSPKINGSTVAALNNAAAIHEKNIMERVKAAVGRFYGINADIADSKRLFKYAPGHPYSRMVDIKHNKELVRIGSLSVDEFDHSINLVTAYQTWDGKK;note=Orf no. 3 see PMID: 14972552;ID=CPT-T1_054.CDS.1;Parent=CPT-T1_054.mRNA; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 39755 39758 . - . locus_tag=CPT-T1_054;regulatory_class=ribosome_binding_site;ID=CPT-T1_054.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_054.mRNA; +AY216660.2 GbkToGff gene 39766 39978 . - . locus_tag=CPT-T1_055;ID=CPT-T1_055.gene; +AY216660.2 GbkToGff mRNA 39766 39978 . - . locus_tag=CPT-T1_055;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_055.mRNA;Parent=CPT-T1_055.gene; +AY216660.2 GbkToGff CDS 39766 39966 . - 0 locus_tag=CPT-T1_055;codon_start=1;transl_table=11;product=hypothetical protein;translation=MSDNIYRVVAISRKTQKRVIAYMGSSAIEATDAFELLKNNEGFMNTFRVRLERLEPVIIDEARKLS;note=Orf no. 2 see PMID: 14972552;ID=CPT-T1_055.CDS.1;Parent=CPT-T1_055.mRNA; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 39973 39978 . - . locus_tag=CPT-T1_055;regulatory_class=ribosome_binding_site;ID=CPT-T1_055.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_055.mRNA; +AY216660.2 GbkToGff gene 39959 40342 . - . locus_tag=CPT-T1_056;ID=CPT-T1_056.gene; +AY216660.2 GbkToGff mRNA 39959 40342 . - . locus_tag=CPT-T1_056;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_056.mRNA;Parent=CPT-T1_056.gene; +AY216660.2 GbkToGff CDS 39959 40330 . - 0 locus_tag=CPT-T1_056;codon_start=1;transl_table=11;product=hypothetical protein;translation=MFKQFTDLDFSASSVIQTDEKVHVAIENIARKIHNKQEKAMIAALTAYYDVSDVMECVDRVTRVVDRLGASRLIDNDTGEVITQFNKPFMRTEPGSVAPCFVADYSITVNSFVADRVKEALYE;note=Orf no. 1 see PMID: 14972552;ID=CPT-T1_056.CDS.1;Parent=CPT-T1_056.mRNA; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 40339 40342 . - . 
locus_tag=CPT-T1_056;regulatory_class=ribosome_binding_site;ID=CPT-T1_056.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_056.mRNA; +AY216660.2 GbkToGff gene 41020 41606 . + . locus_tag=CPT-T1_057;ID=CPT-T1_057.gene; +AY216660.2 GbkToGff mRNA 41020 41606 . + . locus_tag=CPT-T1_057;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_057.mRNA;Parent=CPT-T1_057.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 41020 41024 . + . locus_tag=CPT-T1_057;regulatory_class=ribosome_binding_site;ID=CPT-T1_057.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_057.mRNA; +AY216660.2 GbkToGff CDS 41031 41606 . + 0 locus_tag=CPT-T1_057;codon_start=1;transl_table=11;product=hypothetical protein;translation=MSSYQSDAVQAAIKAAYEKAGVTVEQRPEAKVTDVIRAACDQLYGDGENTEFTFDANKMAEAAARKSMPDADEHDVAKGAESWLLGKTDEINEKFKSSFITPIVSRHFSKIGKSVKVSVTMNDEKLRVVTISVSDEEVPVKKRRSRKKVSLADCLDSFVPDVDDLEKGDVTVSTVRDLVRQMKAHIEKCGL;note=Orf no. 77 see PMID: 14972552;ID=CPT-T1_057.CDS.1;Parent=CPT-T1_057.mRNA; +AY216660.2 GbkToGff gene 41606 41945 . + . locus_tag=CPT-T1_058;ID=CPT-T1_058.gene; +AY216660.2 GbkToGff mRNA 41606 41945 . + . locus_tag=CPT-T1_058;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_058.mRNA;Parent=CPT-T1_058.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 41606 41610 . + . locus_tag=CPT-T1_058;regulatory_class=ribosome_binding_site;ID=CPT-T1_058.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_058.mRNA; +AY216660.2 GbkToGff CDS 41619 41945 . + 0 locus_tag=CPT-T1_058;codon_start=1;transl_table=11;product=hypothetical protein;translation=MFNIKPLTEAEKQAQAKQTENIQVIADALIGKRSIKINLDTVGQSFFTKGLDKYVINVKARDLVARIQKLNNQKLKLIKVEGNMCEIENLSAPDPNKWEITDVEFIVE;note=Orf no. 76 see PMID: 14972552;ID=CPT-T1_058.CDS.1;Parent=CPT-T1_058.mRNA; +AY216660.2 GbkToGff gene 42013 42254 . + . locus_tag=CPT-T1_059;ID=CPT-T1_059.gene; +AY216660.2 GbkToGff mRNA 42013 42254 . + . 
locus_tag=CPT-T1_059;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_059.mRNA;Parent=CPT-T1_059.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 42013 42016 . + . locus_tag=CPT-T1_059;regulatory_class=ribosome_binding_site;ID=CPT-T1_059.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_059.mRNA; +AY216660.2 GbkToGff CDS 42024 42254 . + 0 locus_tag=CPT-T1_059;codon_start=1;transl_table=11;product=hypothetical protein;translation=MSIVKNQQAIDSTNNNRFAIFITRDNKRFAVKAVPGGYKTYMEDNGKWVRCDNLANFLVWNADLQGFDDISTLIEE;note=Orf no. 75 see PMID: 14972552;ID=CPT-T1_059.CDS.1;Parent=CPT-T1_059.mRNA; +AY216660.2 GbkToGff gene 42247 42487 . + . locus_tag=CPT-T1_060;ID=CPT-T1_060.gene; +AY216660.2 GbkToGff mRNA 42247 42487 . + . locus_tag=CPT-T1_060;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_060.mRNA;Parent=CPT-T1_060.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 42247 42251 . + . locus_tag=CPT-T1_060;regulatory_class=ribosome_binding_site;ID=CPT-T1_060.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_060.mRNA; +AY216660.2 GbkToGff CDS 42260 42487 . + 0 locus_tag=CPT-T1_060;codon_start=1;transl_table=11;product=hypothetical protein;translation=MPRYSNLTQLTRVNGHMIPAKSTHYAMGAKHGLYFKWRGQWNFTVIRNFYIRVTGDDPQSVVENSIGDNKIEVLK;note=Orf no. 74 see PMID: 14972552;ID=CPT-T1_060.CDS.1;Parent=CPT-T1_060.mRNA; +AY216660.2 GbkToGff gene 42473 42594 . + . locus_tag=CPT-T1_061;ID=CPT-T1_061.gene; +AY216660.2 GbkToGff mRNA 42473 42594 . + . locus_tag=CPT-T1_061;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_061.mRNA;Parent=CPT-T1_061.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 42473 42477 . + . locus_tag=CPT-T1_061;regulatory_class=ribosome_binding_site;ID=CPT-T1_061.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_061.mRNA; +AY216660.2 GbkToGff CDS 42484 42594 . 
+ 0 locus_tag=CPT-T1_061;codon_start=1;transl_table=11;product=hypothetical protein;translation=MNFNIIAFWSAVWFFCVGHVVVGIVIMLLLCAGAFE;note=single transmembrane domain predicted N-in and C-out%3BOrf no. 73 see PMID: 14972552;ID=CPT-T1_061.CDS.1;Parent=CPT-T1_061.mRNA; +AY216660.2 GbkToGff gene 42580 42764 . + . locus_tag=CPT-T1_062;ID=CPT-T1_062.gene; +AY216660.2 GbkToGff mRNA 42580 42764 . + . locus_tag=CPT-T1_062;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_062.mRNA;Parent=CPT-T1_062.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 42580 42583 . + . locus_tag=CPT-T1_062;regulatory_class=ribosome_binding_site;ID=CPT-T1_062.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_062.mRNA; +AY216660.2 GbkToGff CDS 42591 42764 . + 0 locus_tag=CPT-T1_062;codon_start=1;transl_table=11;product=hypothetical protein;translation=MMRILICMMAAVAMAILVVSGCGEARDSCHETGSQVTTFVMVGNVLLPITSNEITCE;note=Orf no. 72 see PMID: 14972552;ID=CPT-T1_062.CDS.1;Parent=CPT-T1_062.mRNA; +AY216660.2 GbkToGff gene 42824 43324 . + . locus_tag=CPT-T1_063;ID=CPT-T1_063.gene; +AY216660.2 GbkToGff mRNA 42824 43324 . + . locus_tag=CPT-T1_063;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_063.mRNA;Parent=CPT-T1_063.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 42824 42827 . + . locus_tag=CPT-T1_063;regulatory_class=ribosome_binding_site;ID=CPT-T1_063.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_063.mRNA; +AY216660.2 GbkToGff CDS 42836 43324 . + 0 locus_tag=CPT-T1_063;codon_start=1;transl_table=11;product=hypothetical protein;translation=MKIKLLSNGGYKGFTRDLEADPIVVDAVKCDSSTGGYRVKVDDLVKAGVYDLDYGLSVSPVFGPADFNEKDGTMFFFDWEVKANIKPRKVRLLSNGGYPMRPGYENRTFPVIVDFIGTTDNLVYVSHEQLKAVGFVGGMNKEALCFFHRCPEPIGIECELVY;note=Orf no. 71 see PMID: 14972552;ID=CPT-T1_063.CDS.1;Parent=CPT-T1_063.mRNA; +AY216660.2 GbkToGff gene 43385 43881 . + . locus_tag=CPT-T1_064;ID=CPT-T1_064.gene; +AY216660.2 GbkToGff mRNA 43385 43881 . + . 
locus_tag=CPT-T1_064;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_064.mRNA;Parent=CPT-T1_064.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 43385 43388 . + . locus_tag=CPT-T1_064;regulatory_class=ribosome_binding_site;ID=CPT-T1_064.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_064.mRNA; +AY216660.2 GbkToGff CDS 43396 43881 . + 0 locus_tag=CPT-T1_064;codon_start=1;transl_table=11;product=hypothetical protein;translation=MLKLKDIQFPVVFNTISCGKITCHSKDRATDSSFNECHPSIVGNLIELHNNHNPDNIPSLPYYVEGVGPGWKVGRSIFHAAKPEIKPALQCTQIENMPLSATLKGVQLDSESWIEITATPKTIEVHDDVVILLLHYGSFKHKTVSGEISIKRGTLVRYEVK;note=Orf no. 70 see PMID: 14972552;ID=CPT-T1_064.CDS.1;Parent=CPT-T1_064.mRNA; +AY216660.2 GbkToGff gene 43870 44030 . + . locus_tag=CPT-T1_065;ID=CPT-T1_065.gene; +AY216660.2 GbkToGff mRNA 43870 44030 . + . locus_tag=CPT-T1_065;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_065.mRNA;Parent=CPT-T1_065.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 43870 43874 . + . locus_tag=CPT-T1_065;regulatory_class=ribosome_binding_site;ID=CPT-T1_065.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_065.mRNA; +AY216660.2 GbkToGff CDS 43881 44030 . + 0 locus_tag=CPT-T1_065;codon_start=1;transl_table=11;product=hypothetical protein;translation=MTAWVLIILMSKGPDHVYMESQQSCNKAREVIAENKPFGYEVKTMCVKR;note=Orf no. 69 see PMID: 14972552;ID=CPT-T1_065.CDS.1;Parent=CPT-T1_065.mRNA; +AY216660.2 GbkToGff gene 44098 44485 . + . locus_tag=CPT-T1_066;ID=CPT-T1_066.gene; +AY216660.2 GbkToGff mRNA 44098 44485 . + . locus_tag=CPT-T1_066;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_066.mRNA;Parent=CPT-T1_066.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 44098 44103 . + . locus_tag=CPT-T1_066;regulatory_class=ribosome_binding_site;ID=CPT-T1_066.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_066.mRNA; +AY216660.2 GbkToGff CDS 44111 44485 . 
+ 0 locus_tag=CPT-T1_066;codon_start=1;transl_table=11;product=hypothetical protein;translation=MKFECISDNTKKFTVGKIYDVPTEHAEQTVALTDDTGRNRIATVTHNGEGLRWNSGGTKFATFGKKRKRTFRVNGNVAANKIHNVKPSEVDRKPALKFKEKVDLFNLAASLVLLVAAISLLSIM;note=single transmembrane domain predicted N-in and C-out%3B Orf no. 68 see PMID: 14972552;ID=CPT-T1_066.CDS.1;Parent=CPT-T1_066.mRNA; +AY216660.2 GbkToGff regulatory 44493 44526 . + . regulatory_class=terminator%2C rho-independent;ID=AY216660.2.regulatory.16; +AY216660.2 GbkToGff gene 44527 44649 . + . locus_tag=CPT-T1_067;ID=CPT-T1_067.gene; +AY216660.2 GbkToGff mRNA 44527 44649 . + . locus_tag=CPT-T1_067;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_067.mRNA;Parent=CPT-T1_067.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 44527 44530 . + . locus_tag=CPT-T1_067;regulatory_class=ribosome_binding_site;ID=CPT-T1_067.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_067.mRNA; +AY216660.2 GbkToGff CDS 44539 44649 . + 0 locus_tag=CPT-T1_067;codon_start=1;transl_table=11;product=hypothetical protein;translation=MPDFSNWNNEPPSFQELLFCLLVLTLSLKGVLWLLS;note=single transmembrane domain predicted N-out and C-in%3B Orf no. 67 see PMID: 14972552;ID=CPT-T1_067.CDS.1;Parent=CPT-T1_067.mRNA; +AY216660.2 GbkToGff gene 44625 44852 . + . locus_tag=CPT-T1_068;ID=CPT-T1_068.gene; +AY216660.2 GbkToGff mRNA 44625 44852 . + . locus_tag=CPT-T1_068;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_068.mRNA;Parent=CPT-T1_068.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 44625 44628 . + . locus_tag=CPT-T1_068;regulatory_class=ribosome_binding_site;ID=CPT-T1_068.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_068.mRNA; +AY216660.2 GbkToGff CDS 44634 44852 . + 0 locus_tag=CPT-T1_068;codon_start=1;transl_table=11;product=hypothetical protein;translation=MATIMTVEDAARDAVEGMRPNTSRIAHYYKSEVSAVQLVHEILRLPQVDSARVVTCLKNYFCITIKTNSTNC;note=Orf no. 
66 see PMID: 14972552;ID=CPT-T1_068.CDS.1;Parent=CPT-T1_068.mRNA; +AY216660.2 GbkToGff gene 44898 45356 . + . locus_tag=CPT-T1_069;ID=CPT-T1_069.gene; +AY216660.2 GbkToGff mRNA 44898 45356 . + . locus_tag=CPT-T1_069;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_069.mRNA;Parent=CPT-T1_069.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 44898 44901 . + . locus_tag=CPT-T1_069;regulatory_class=ribosome_binding_site;ID=CPT-T1_069.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_069.mRNA; +AY216660.2 GbkToGff CDS 44910 45356 . + 0 locus_tag=CPT-T1_069;codon_start=1;transl_table=11;product=hypothetical protein;translation=MKHLICIEAPNDQYTLHGLGVFKGHYITAGTYDARRGDGDLMITSKEVNPYIMQNLGNNEYMAYGCNAVYKHVKIRKRVVRAFKKIAIKYWKMSKKDAGRWARNVADSYFYRNGESCYFLIDELMENYGGDFSQGSFDDWANYEISCW;note=Orf no. 65 see PMID: 14972552;ID=CPT-T1_069.CDS.1;Parent=CPT-T1_069.mRNA; +AY216660.2 GbkToGff gene 45428 45970 . + . locus_tag=CPT-T1_070;ID=CPT-T1_070.gene; +AY216660.2 GbkToGff mRNA 45428 45970 . + . locus_tag=CPT-T1_070;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_070.mRNA;Parent=CPT-T1_070.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 45428 45431 . + . locus_tag=CPT-T1_070;regulatory_class=ribosome_binding_site;ID=CPT-T1_070.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_070.mRNA; +AY216660.2 GbkToGff CDS 45440 45970 . + 0 locus_tag=CPT-T1_070;note=HHPred 99 probability structural alignment to phage T4 polynucleotide kinase Protein Data bank entry 5UJ0 over most of protein%3B Orf no. 64 see PMID: 14972552%3B InterPro domain IPR023214;codon_start=1;transl_table=11;product=polynucleotide kinase PnkP;translation=MDKITIWGQTINLFLGTRRVAIFDFDGTLSDGSGRLHLLPTKDLHLTESWSEFNRAAIFDNPIQSTIDVMNSMFAAGYHVIILTGRSDEVRYASELWLKHHGARYDYLVMRPHTDNRKDTVMKEEAVRAIGIDNILAAWDDSVNIIKKFRDLGITTYQVCEYACDSREDLNSHGVD;ID=CPT-T1_070.CDS.1;Parent=CPT-T1_070.mRNA; +AY216660.2 GbkToGff gene 45941 46451 . + . 
locus_tag=CPT-T1_071;ID=CPT-T1_071.gene; +AY216660.2 GbkToGff mRNA 45941 46451 . + . locus_tag=CPT-T1_071;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_071.mRNA;Parent=CPT-T1_071.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 45941 45944 . + . locus_tag=CPT-T1_071;regulatory_class=ribosome_binding_site;ID=CPT-T1_071.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_071.mRNA; +AY216660.2 GbkToGff CDS 45957 46451 . + 0 locus_tag=CPT-T1_071;note=InterPro domains IPR001471%2C IPR003615%2C and IPR016177%3B similar to phage T7 protein 3.8%3B alternative in-frame start site proposed as T1 p63a%3B Orf no. 63 see PMID: 14972552;codon_start=1;transl_table=11;product=HNH endonuclease;translation=MVSIDNKSMVRELFTYSDGVLYWKAKSSKYSRAKIGGAAGSKDKDGYIIIRVRNETRGAHRLVWIYHNGKIPDGMEVDHMDGDITNNRIENLRLVTRTINNRNQKKRSDNTTGVSGVTFMKDRGKYRAQVRNKRLGQFDTIEEAAKAVKDERDRLGLFTKRHGV;ID=CPT-T1_071.CDS.1;Parent=CPT-T1_071.mRNA; +AY216660.2 GbkToGff gene 46444 47026 . + . locus_tag=CPT-T1_072;ID=CPT-T1_072.gene; +AY216660.2 GbkToGff mRNA 46444 47026 . + . locus_tag=CPT-T1_072;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_072.mRNA;Parent=CPT-T1_072.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 46444 46447 . + . locus_tag=CPT-T1_072;regulatory_class=ribosome_binding_site;ID=CPT-T1_072.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_072.mRNA; +AY216660.2 GbkToGff CDS 46454 47026 . + 0 locus_tag=CPT-T1_072;note=InterPro domain IPR027417%3B HHPred predicted structural similarity at 99%25 probability to phage T4 DNK Protein Data Bank Entry 1DEK%3B Orf no. 62 see PMID: 14972552;codon_start=1;transl_table=11;product=deoxynucleotide kinase;translation=MKTAIILNGAPGAGKDTIGCILADTYDHVALRSFKAPMFEIARAILGETNFEYFMFLYEDRRYKEEPASILNGKSPRQFMIWISEEVIKPQFGNRFFGMRAESKVKESHSLSVFTDGGFKDEILQMIEGDIQVKLCRIHRNGCNFDNDSRDYIYLDDMIGVNGYQECDFFSVEGHPEITAQHIAATFINK;ID=CPT-T1_072.CDS.1;Parent=CPT-T1_072.mRNA; +AY216660.2 GbkToGff gene 47085 47308 . + . 
locus_tag=CPT-T1_073;ID=CPT-T1_073.gene; +AY216660.2 GbkToGff mRNA 47085 47308 . + . locus_tag=CPT-T1_073;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_073.mRNA;Parent=CPT-T1_073.gene; +AY216660.2 GbkToGff CDS 47099 47308 . + 0 locus_tag=CPT-T1_073;codon_start=1;transl_table=11;product=hypothetical protein;translation=MMVSTDKFFTCTKTSEVFELVHTDNGDFMHDGCDAFIEVKESDYDDGVYYNPAVNTQFFTPIEEEGEEA;note=Orf no. 61 see PMID: 14972552;ID=CPT-T1_073.CDS.1;Parent=CPT-T1_073.mRNA; +AY216660.2 GbkToGff gene 47293 47649 . + . locus_tag=CPT-T1_074;ID=CPT-T1_074.gene; +AY216660.2 GbkToGff mRNA 47293 47649 . + . locus_tag=CPT-T1_074;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_074.mRNA;Parent=CPT-T1_074.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 47293 47297 . + . locus_tag=CPT-T1_074;regulatory_class=ribosome_binding_site;ID=CPT-T1_074.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_074.mRNA; +AY216660.2 GbkToGff CDS 47305 47649 . + 0 locus_tag=CPT-T1_074;codon_start=1;transl_table=11;product=hypothetical protein;translation=MITINLSDKQAREILDTIGEQLHVKGDTAEILNQIERQLTPVSTNQAEFAAWKSERILPNIIKAWKRKHKKEINVEDLFTDELSPSNVAQYQLRYMESVCNQVLGVSFSFKGDK;note=Orf no. 60 see PMID: 14972552;ID=CPT-T1_074.CDS.1;Parent=CPT-T1_074.mRNA; +AY216660.2 GbkToGff gene 47637 47879 . + . locus_tag=CPT-T1_075;ID=CPT-T1_075.gene; +AY216660.2 GbkToGff mRNA 47637 47879 . + . locus_tag=CPT-T1_075;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_075.mRNA;Parent=CPT-T1_075.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 47637 47640 . + . locus_tag=CPT-T1_075;regulatory_class=ribosome_binding_site;ID=CPT-T1_075.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_075.mRNA; +AY216660.2 GbkToGff CDS 47649 47879 . + 0 locus_tag=CPT-T1_075;codon_start=1;transl_table=11;product=hypothetical protein;translation=MFGLSEAEWNVVKRAAKELNKFVSGMKKEDRKNDKIMIDVISTHHKKVELLIDRYKFVWTAGYIAGRVGNKEGDYE;note=Orf no. 
59 see PMID: 14972552;ID=CPT-T1_075.CDS.1;Parent=CPT-T1_075.mRNA; +AY216660.2 GbkToGff gene 47864 48082 . + . locus_tag=CPT-T1_076;ID=CPT-T1_076.gene; +AY216660.2 GbkToGff mRNA 47864 48082 . + . locus_tag=CPT-T1_076;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_076.mRNA;Parent=CPT-T1_076.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 47864 47868 . + . locus_tag=CPT-T1_076;regulatory_class=ribosome_binding_site;ID=CPT-T1_076.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_076.mRNA; +AY216660.2 GbkToGff CDS 47879 48082 . + 0 locus_tag=CPT-T1_076;codon_start=1;transl_table=11;product=hypothetical protein;translation=MANLPKKGDQVRCVTSRNGNALSAGCLYDVEKVSKSKRLVFVYGDDGNLHEIDYPQDVTNGQFEIND;note=Orf no. 58 see PMID: 14972552;ID=CPT-T1_076.CDS.1;Parent=CPT-T1_076.mRNA; +AY216660.2 GbkToGff gene 48235 48408 . + . locus_tag=CPT-T1_077;ID=CPT-T1_077.gene; +AY216660.2 GbkToGff mRNA 48235 48408 . + . locus_tag=CPT-T1_077;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_077.mRNA;Parent=CPT-T1_077.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 48235 48238 . + . locus_tag=CPT-T1_077;regulatory_class=ribosome_binding_site;ID=CPT-T1_077.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_077.mRNA; +AY216660.2 GbkToGff CDS 48247 48408 . + 0 locus_tag=CPT-T1_077;codon_start=1;transl_table=11;product=hypothetical protein;translation=MQKTKDESVKIEIKVTRNGETTRYKKRLNPGEAVIGRIAGVMIKAQEDEAIQS;note=Orf no. 57 see PMID: 14972552;ID=CPT-T1_077.CDS.1;Parent=CPT-T1_077.mRNA; +AY216660.2 GbkToGff gene 48377 48574 . + . locus_tag=CPT-T1_078;ID=CPT-T1_078.gene; +AY216660.2 GbkToGff mRNA 48377 48574 . + . locus_tag=CPT-T1_078;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_078.mRNA;Parent=CPT-T1_078.gene; +AY216660.2 GbkToGff CDS 48389 48574 . 
+ 0 locus_tag=CPT-T1_078;codon_start=1;transl_table=11;product=hypothetical protein;translation=MKRFKVKLIIRKMGMFCQSCKQSFEAELSATSQDEAITKAKKLSGANLDTHKINIELIKEI;note=alternative start codon to Orf no. 56 see PMID: 14972552;ID=CPT-T1_078.CDS.1;Parent=CPT-T1_078.mRNA; +AY216660.2 GbkToGff gene 48564 48803 . + . locus_tag=CPT-T1_079;ID=CPT-T1_079.gene; +AY216660.2 GbkToGff mRNA 48564 48803 . + . locus_tag=CPT-T1_079;Notes=mRNA feature automatically generated by Gbk to GFF conversion;ID=CPT-T1_079.mRNA;Parent=CPT-T1_079.gene; +AY216660.2 GbkToGff Shine_Dalgarno_seqeunce 48564 48568 . + . locus_tag=CPT-T1_079;regulatory_class=ribosome_binding_site;ID=CPT-T1_079.Shine_Dalgarno_seqeunce.1;Parent=CPT-T1_079.mRNA; +AY216660.2 GbkToGff CDS 48576 48803 . + 0 locus_tag=CPT-T1_079;codon_start=1;transl_table=11;product=hypothetical protein;translation=MTIFLLIIAGVIIFGAGLFAGFALVAAAIAMDAKDKTGVWLTYSPKKDQWEMTGDLAHCYSKAKTHPKGIKRRLS;note=single transmembrane domain predicted N-out and C-in%3B Orf no. 55 see PMID: 14972552;ID=CPT-T1_079.CDS.1;Parent=CPT-T1_079.mRNA; diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/test-data/PhageQC_Out.gff3 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_phageqc_annotation/test-data/PhageQC_Out.gff3 Fri Jun 17 13:00:50 2022 +0000 @@ -0,0 +1,185 @@ +##gff-version 3 +AY216660.2 feature gene 25 34 . + . note=Missing RBS;ID=CPT-T1_001.gene;Name= +AY216660.2 feature gene 574 583 . + . note=Missing RBS;ID=CPT-T1_002.gene;Name= +AY216660.2 feature gene 2215 2224 . + . note=Missing RBS;ID=CPT-T1_003.gene;Name= +AY216660.2 feature gene 3481 3490 . + . note=Missing RBS;ID=CPT-T1_004.gene;Name= +AY216660.2 feature gene 4249 4258 . + . note=Missing RBS;ID=CPT-T1_005.gene;Name= +AY216660.2 feature gene 5374 5383 . + . note=Missing RBS;ID=CPT-T1_006.gene;Name= +AY216660.2 feature gene 5911 5920 . + . note=Missing RBS;ID=CPT-T1_007.gene;Name= +AY216660.2 feature gene 6769 6778 . + . 
note=Missing RBS;ID=CPT-T1_008.gene;Name= +AY216660.2 feature gene 7779 7788 . + . note=Missing RBS;ID=CPT-T1_009.gene;Name= +AY216660.2 feature gene 8112 8121 . + . note=Missing RBS;ID=CPT-T1_010.gene;Name= +AY216660.2 feature gene 8519 8528 . + . note=Missing RBS;ID=CPT-T1_011.gene;Name= +AY216660.2 feature gene 8882 8891 . + . note=Missing RBS;ID=CPT-T1_012.gene;Name= +AY216660.2 feature gene 9317 9326 . + . note=Missing RBS;ID=CPT-T1_013.gene;Name= +AY216660.2 feature gene 9718 9727 . + . note=Missing RBS;ID=CPT-T1_014.gene;Name= +AY216660.2 feature gene 10501 10510 . + . note=Missing RBS;ID=CPT-T1_015.gene;Name= +AY216660.2 feature gene 10501 10510 . + . note=Missing RBS;ID=CPT-T1_016.gene;Name= +AY216660.2 feature gene 11177 11186 . + . note=Missing RBS;ID=CPT-T1_017.gene;Name= +AY216660.2 feature gene 14052 14061 . + . note=Missing RBS;ID=CPT-T1_018.gene;Name= +AY216660.2 feature gene 14485 14494 . + . note=Missing RBS;ID=CPT-T1_019.gene;Name= +AY216660.2 feature gene 15264 15273 . + . note=Missing RBS;ID=CPT-T1_020.gene;Name= +AY216660.2 feature gene 15994 16003 . + . note=Missing RBS;ID=CPT-T1_021.gene;Name= +AY216660.2 feature gene 16673 16682 . + . note=Missing RBS;ID=CPT-T1_022.gene;Name= +AY216660.2 feature gene 20236 20245 . + . note=Missing RBS;ID=CPT-T1_023.gene;Name= +AY216660.2 feature gene 20541 20550 . + . note=Missing RBS;ID=CPT-T1_024.gene;Name= +AY216660.2 feature gene 21524 21533 . - . note=Missing RBS;ID=CPT-T1_025.gene;Name= +AY216660.2 feature gene 21740 21749 . + . note=Missing RBS;ID=CPT-T1_026.gene;Name= +AY216660.2 feature gene 22005 22014 . + . note=Missing RBS;ID=CPT-T1_027.gene;Name= +AY216660.2 feature gene 23114 23123 . + . note=Missing RBS;ID=CPT-T1_028.gene;Name= +AY216660.2 feature gene 23839 23848 . + . note=Missing RBS;ID=CPT-T1_029.gene;Name= +AY216660.2 feature gene 26556 26565 . - . note=Missing RBS;ID=CPT-T1_030.gene;Name= +AY216660.2 feature gene 26606 26615 . - . 
note=Missing RBS;ID=CPT-T1_031.gene;Name= +AY216660.2 feature gene 27591 27600 . - . note=Missing RBS;ID=CPT-T1_032.gene;Name= +AY216660.2 feature gene 28104 28113 . - . note=Missing RBS;ID=CPT-T1_033.gene;Name= +AY216660.2 feature gene 28153 28162 . + . note=Missing RBS;ID=CPT-T1_034.gene;Name= +AY216660.2 feature gene 30166 30175 . + . note=Missing RBS;ID=CPT-T1_035.gene;Name= +AY216660.2 feature gene 30654 30663 . + . note=Missing RBS;ID=CPT-T1_036.gene;Name= +AY216660.2 feature gene 31362 31371 . + . note=Missing RBS;ID=CPT-T1_037.gene;Name= +AY216660.2 feature gene 31680 31689 . + . note=Missing RBS;ID=CPT-T1_038.gene;Name= +AY216660.2 feature gene 31866 31875 . + . note=Missing RBS;ID=CPT-T1_039.gene;Name= +AY216660.2 feature gene 32084 32093 . + . note=Missing RBS;ID=CPT-T1_040.gene;Name= +AY216660.2 feature gene 32452 32461 . + . note=Missing RBS;ID=CPT-T1_041.gene;Name= +AY216660.2 feature gene 33658 33667 . + . note=Missing RBS;ID=CPT-T1_042.gene;Name= +AY216660.2 feature gene 34211 34220 . + . note=Missing RBS;ID=CPT-T1_043.gene;Name= +AY216660.2 feature gene 34510 34519 . + . note=Missing RBS;ID=CPT-T1_044.gene;Name= +AY216660.2 feature gene 34724 34733 . + . note=Missing RBS;ID=CPT-T1_045.gene;Name= +AY216660.2 feature gene 35212 35221 . + . note=Missing RBS;ID=CPT-T1_046.gene;Name= +AY216660.2 feature gene 36185 36194 . - . note=Missing RBS;ID=CPT-T1_047.gene;Name= +AY216660.2 feature gene 37761 37770 . - . note=Missing RBS;ID=CPT-T1_048.gene;Name= +AY216660.2 feature gene 38300 38309 . - . note=Missing RBS;ID=CPT-T1_049.gene;Name= +AY216660.2 feature gene 38712 38721 . - . note=Missing RBS;ID=CPT-T1_050.gene;Name= +AY216660.2 feature gene 39000 39009 . - . note=Missing RBS;ID=CPT-T1_051.gene;Name= +AY216660.2 feature gene 39228 39237 . - . note=Missing RBS;ID=CPT-T1_052.gene;Name= +AY216660.2 feature gene 39450 39459 . - . note=Missing RBS;ID=CPT-T1_053.gene;Name= +AY216660.2 feature gene 39764 39773 . - . 
note=Missing RBS;ID=CPT-T1_054.gene;Name= +AY216660.2 feature gene 39984 39993 . - . note=Missing RBS;ID=CPT-T1_055.gene;Name= +AY216660.2 feature gene 40348 40357 . - . note=Missing RBS;ID=CPT-T1_056.gene;Name= +AY216660.2 feature gene 41005 41014 . + . note=Missing RBS;ID=CPT-T1_057.gene;Name= +AY216660.2 feature gene 41591 41600 . + . note=Missing RBS;ID=CPT-T1_058.gene;Name= +AY216660.2 feature gene 41998 42007 . + . note=Missing RBS;ID=CPT-T1_059.gene;Name= +AY216660.2 feature gene 42232 42241 . + . note=Missing RBS;ID=CPT-T1_060.gene;Name= +AY216660.2 feature gene 42458 42467 . + . note=Missing RBS;ID=CPT-T1_061.gene;Name= +AY216660.2 feature gene 42565 42574 . + . note=Missing RBS;ID=CPT-T1_062.gene;Name= +AY216660.2 feature gene 42809 42818 . + . note=Missing RBS;ID=CPT-T1_063.gene;Name= +AY216660.2 feature gene 43370 43379 . + . note=Missing RBS;ID=CPT-T1_064.gene;Name= +AY216660.2 feature gene 43855 43864 . + . note=Missing RBS;ID=CPT-T1_065.gene;Name= +AY216660.2 feature gene 44083 44092 . + . note=Missing RBS;ID=CPT-T1_066.gene;Name= +AY216660.2 feature gene 44512 44521 . + . note=Missing RBS;ID=CPT-T1_067.gene;Name= +AY216660.2 feature gene 44610 44619 . + . note=Missing RBS;ID=CPT-T1_068.gene;Name= +AY216660.2 feature gene 44883 44892 . + . note=Missing RBS;ID=CPT-T1_069.gene;Name= +AY216660.2 feature gene 45413 45422 . + . note=Missing RBS;ID=CPT-T1_070.gene;Name= +AY216660.2 feature gene 45926 45935 . + . note=Missing RBS;ID=CPT-T1_071.gene;Name= +AY216660.2 feature gene 46429 46438 . + . note=Missing RBS;ID=CPT-T1_072.gene;Name= +AY216660.2 feature gene 47070 47079 . + . note=Missing RBS;ID=CPT-T1_073.gene;Name= +AY216660.2 feature gene 47278 47287 . + . note=Missing RBS;ID=CPT-T1_074.gene;Name= +AY216660.2 feature gene 47622 47631 . + . note=Missing RBS;ID=CPT-T1_075.gene;Name= +AY216660.2 feature gene 47849 47858 . + . note=Missing RBS;ID=CPT-T1_076.gene;Name= +AY216660.2 feature gene 48220 48229 . + . 
note=Missing RBS;ID=CPT-T1_077.gene;Name= +AY216660.2 feature gene 48362 48371 . + . note=Missing RBS;ID=CPT-T1_078.gene;Name= +AY216660.2 feature gene 48549 48558 . + . note=Missing RBS;ID=CPT-T1_079.gene;Name= +AY216660.2 feature gene 2185 2238 . . . note=Excessive gap%2C 54 bases; +AY216660.2 feature gene 5877 5937 . . . note=Excessive gap%2C 61 bases; +AY216660.2 feature gene 6706 6795 . . . note=Excessive gap%2C 90 bases; +AY216660.2 feature gene 10415 10527 . . . note=Excessive gap%2C 113 bases; +AY216660.2 feature gene 14433 14511 . . . note=Excessive gap%2C 79 bases; +AY216660.2 feature gene 14424 14540 . + . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 14424 14433 . + . Parent=; +AY216660.2 feature CDS 14439 14540 . + 0 Parent=; +AY216660.2 feature gene 14447 14451 . - . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 14452 14461 . - . Parent=; +AY216660.2 feature CDS 14447 14524 . - 0 Parent=; +AY216660.2 feature gene 16622 16698 . . . note=Excessive gap%2C 77 bases; +AY216660.2 feature gene 21507 21765 . . . note=Excessive gap%2C 259 bases; +AY216660.2 feature gene 21907 22033 . . . note=Excessive gap%2C 127 bases; +AY216660.2 feature gene 24290 24350 . . . note=Excessive gap%2C 61 bases; +AY216660.2 feature gene 24286 24390 . + . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 24286 24295 . + . Parent=; +AY216660.2 feature CDS 24301 24390 . + 0 Parent=; +AY216660.2 feature gene 27574 27634 . . . note=Excessive gap%2C 61 bases; +AY216660.2 feature gene 28088 28180 . . . note=Excessive gap%2C 93 bases; +AY216660.2 feature gene 30613 30679 . . . note=Excessive gap%2C 67 bases; +AY216660.2 feature gene 30629 30718 . + . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 30629 30638 . + . Parent=; +AY216660.2 feature CDS 30644 30718 . + 0 Parent=; +AY216660.2 feature gene 31642 31707 . . . note=Excessive gap%2C 66 bases; +AY216660.2 feature gene 32400 32477 . . . 
note=Excessive gap%2C 78 bases; +AY216660.2 feature gene 32385 32492 . + . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 32385 32394 . + . Parent=; +AY216660.2 feature CDS 32400 32492 . + 0 Parent=; +AY216660.2 feature gene 33612 33683 . . . note=Excessive gap%2C 72 bases; +AY216660.2 feature gene 34167 34238 . . . note=Excessive gap%2C 72 bases; +AY216660.2 feature gene 34416 34533 . . . note=Excessive gap%2C 118 bases; +AY216660.2 feature gene 35639 35760 . . . note=Excessive gap%2C 122 bases; +AY216660.2 feature gene 35674 35678 . - . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 35679 35688 . - . Parent=; +AY216660.2 feature CDS 35674 35754 . - 0 Parent=; +AY216660.2 feature gene 35674 35678 . - . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 35679 35688 . - . Parent=; +AY216660.2 feature CDS 35674 35766 . - 0 Parent=; +AY216660.2 feature gene 38694 38773 . . . note=Excessive gap%2C 80 bases; +AY216660.2 feature gene 38719 38814 . + . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 38719 38728 . + . Parent=; +AY216660.2 feature CDS 38734 38814 . + 0 Parent=; +AY216660.2 feature gene 38725 38814 . + . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 38725 38734 . + . Parent=; +AY216660.2 feature CDS 38740 38814 . + 0 Parent=; +AY216660.2 feature gene 39212 39288 . . . note=Excessive gap%2C 77 bases; +AY216660.2 feature gene 40331 41030 . . . note=Excessive gap%2C 700 bases; +AY216660.2 feature gene 40416 40420 . - . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 40421 40430 . - . Parent=; +AY216660.2 feature CDS 40416 40496 . - 0 Parent=; +AY216660.2 feature gene 40416 40420 . - . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 40421 40430 . - . Parent=; +AY216660.2 feature CDS 40416 40511 . - 0 Parent=; +AY216660.2 feature gene 40416 40420 . - . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 40421 40430 . - . 
Parent=; +AY216660.2 feature CDS 40416 40541 . - 0 Parent=; +AY216660.2 feature gene 40416 40420 . - . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 40421 40430 . - . Parent=; +AY216660.2 feature CDS 40416 40559 . - 0 Parent=; +AY216660.2 feature gene 40416 40420 . - . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 40421 40430 . - . Parent=; +AY216660.2 feature CDS 40416 40595 . - 0 Parent=; +AY216660.2 feature gene 40416 40420 . - . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 40421 40430 . - . Parent=; +AY216660.2 feature CDS 40416 40637 . - 0 Parent=; +AY216660.2 feature gene 40817 40921 . + . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 40817 40826 . + . Parent=; +AY216660.2 feature CDS 40832 40921 . + 0 Parent=; +AY216660.2 feature gene 41946 42023 . . . note=Excessive gap%2C 78 bases; +AY216660.2 feature gene 42765 42835 . . . note=Excessive gap%2C 71 bases; +AY216660.2 feature gene 43325 43395 . . . note=Excessive gap%2C 71 bases; +AY216660.2 feature gene 44031 44110 . . . note=Excessive gap%2C 80 bases; +AY216660.2 feature gene 44486 44538 . . . note=Excessive gap%2C 53 bases; +AY216660.2 feature gene 44853 44909 . . . note=Excessive gap%2C 57 bases; +AY216660.2 feature gene 45357 45439 . . . note=Excessive gap%2C 83 bases; +AY216660.2 feature gene 45368 45372 . - . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 45373 45382 . - . Parent=; +AY216660.2 feature CDS 45368 45445 . - 0 Parent=; +AY216660.2 feature gene 45368 45372 . - . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 45373 45382 . - . Parent=; +AY216660.2 feature CDS 45368 45463 . - 0 Parent=; +AY216660.2 feature gene 45368 45372 . - . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 45373 45382 . - . Parent=; +AY216660.2 feature CDS 45368 45466 . - 0 Parent=; +AY216660.2 feature gene 45368 45372 . - . 
note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 45373 45382 . - . Parent=; +AY216660.2 feature CDS 45368 45475 . - 0 Parent=; +AY216660.2 feature gene 45368 45372 . - . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 45373 45382 . - . Parent=; +AY216660.2 feature CDS 45368 45478 . - 0 Parent=; +AY216660.2 feature gene 45368 45372 . - . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 45373 45382 . - . Parent=; +AY216660.2 feature CDS 45368 45481 . - 0 Parent=; +AY216660.2 feature gene 47027 47098 . . . note=Excessive gap%2C 72 bases; +AY216660.2 feature gene 48083 48246 . . . note=Excessive gap%2C 164 bases; +AY216660.2 feature gene 48086 48090 . - . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 48091 48100 . - . Parent=; +AY216660.2 feature CDS 48086 48169 . - 0 Parent=; +AY216660.2 feature gene 48160 48164 . - . note=Possible gene; +AY216660.2 feature Shine_Dalgarno_sequence 48165 48174 . - . Parent=; +AY216660.2 feature CDS 48160 48255 . - 0 Parent=; +AY216660.2 feature gene 10528 10844 . . . note=Excessive Overlap;ID=CPT-T1_015.gene;Name= +AY216660.2 feature gene 26447 26536 . . . note=Excessive Overlap;ID=CPT-T1_030.gene;Name= +AY216660.2 feature gene 32112 32166 . . . note=Excessive Overlap;ID=CPT-T1_039.gene;Name= diff -r 000000000000 -r c3140b08d703 cpt_phageqc_annotation/test-data/PhageQC_Out.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_phageqc_annotation/test-data/PhageQC_Out.html Fri Jun 17 13:00:50 2022 +0000 @@ -0,0 +1,1142 @@ + + + + + + + + + + Phage QC on AY216660.2 - 76 + + + + + + + + + + + +
+
+ +
+
+
+
+

Phage AY216660.2

+ +
+ +
+
+ +

Gene Starts

+

Genes missing RBS 0 / 79

+

The following genes have issues with their RBS.

+

+ Since you have not annotated any possible RBSs, this does not count off from your overall score. +

+

Feature TypeIDLocationErrorUpstream (-15 .. -5)
geneCPT-T1_001.gene40..576 [1]No RBS annotated, None foundtaaatgttaa
geneCPT-T1_002.gene589..2184 [1]Unannotated but valid RBStaacttt AGG
geneCPT-T1_003.gene2230..3522 [1]No RBS annotated, None foundacggcccttt
geneCPT-T1_004.gene3496..4273 [1]Unannotated but valid RBScc GGAG ccgg
geneCPT-T1_005.gene4264..5388 [1]No RBS annotated, None foundtccagacttt
geneCPT-T1_006.gene5389..5876 [1]No RBS annotated, None foundagcaaagtaa
geneCPT-T1_007.gene5926..6705 [1]No RBS annotated, None foundtagtcctttt
geneCPT-T1_008.gene6784..7755 [1]No RBS annotated, None foundttttttatta
geneCPT-T1_009.gene7794..8093 [1]No RBS annotated, None foundgtcccttttt
geneCPT-T1_010.gene8127..8548 [1]No RBS annotated, None foundagcgcctttt
geneCPT-T1_011.gene8534..8919 [1]Unannotated but valid RBSatgac AGGA c
geneCPT-T1_012.gene8897..9355 [1]No RBS annotated, None foundatcgtccagt
geneCPT-T1_013.gene9332..9743 [1]No RBS annotated, None foundgcaatcaagc
geneCPT-T1_014.gene9733..10414 [1]Unannotated but valid RBStac GAG taga
geneCPT-T1_015.gene10516..10845 [1]No RBS annotated, None foundtgctatcaac
geneCPT-T1_016.gene10516..11162 [1]No RBS annotated, None foundtgctatcaac
geneCPT-T1_017.gene11192..14076 [1]No RBS annotated, None foundgttttttcgt
geneCPT-T1_018.gene14067..14432 [1]Unannotated but valid RBScattc AGGA a
geneCPT-T1_019.gene14500..15294 [1]Unannotated but valid RBSgttgc AGGT a
geneCPT-T1_020.gene15279..16025 [1]No RBS annotated, None foundtgtcgcttca
geneCPT-T1_021.gene16009..16621 [1]No RBS annotated, None foundtaatcgttcg
geneCPT-T1_022.gene16688..20217 [1]No RBS annotated, None foundataaatagca
geneCPT-T1_023.gene20251..20568 [1]No RBS annotated, None foundatgccctttt
geneCPT-T1_024.gene20556..21257 [1]No RBS annotated, None foundcttaatagca
geneCPT-T1_025.gene21279..21518 [-1]No RBS annotated, None foundaatcacacta
geneCPT-T1_026.gene21755..21906 [1]No RBS annotated, None foundtgtatatcgt
geneCPT-T1_027.gene22020..23098 [1]No RBS annotated, None foundattcatcgta
geneCPT-T1_028.gene23129..23820 [1]No RBS annotated, None foundaaacaaataa
geneCPT-T1_029.gene23854..24289 [1]No RBS annotated, None foundtcgcccataa
geneCPT-T1_030.gene24351..26550 [-1]No RBS annotated, None foundcgtgctattt
geneCPT-T1_031.gene26447..26600 [-1]No RBS annotated, None foundtgcccattgc
geneCPT-T1_032.gene26638..27585 [-1]No RBS annotated, None foundcgcgtttttt
geneCPT-T1_033.gene27635..28098 [-1]No RBS annotated, None foundgcaaaaagtg
geneCPT-T1_034.gene28168..30199 [1]No RBS annotated, None foundgttacaacga
geneCPT-T1_035.gene30181..30612 [1]No RBS annotated, None foundcaaagacttc
geneCPT-T1_036.gene30669..31393 [1]No RBS annotated, None foundatctcaccaa
geneCPT-T1_037.gene31377..31641 [1]No RBS annotated, None foundaaaacaaacg
geneCPT-T1_038.gene31695..31917 [1]No RBS annotated, None foundtcatagaaac
geneCPT-T1_039.gene31881..32167 [1]No RBS annotated, None foundcttcatctct
geneCPT-T1_040.gene32099..32399 [1]Unannotated but valid RBS GAG ttttacc
geneCPT-T1_041.gene32467..33611 [1]No RBS annotated, None foundtttcattatc
geneCPT-T1_042.gene33673..34166 [1]No RBS annotated, None foundttttatagaa
geneCPT-T1_043.gene34226..34415 [1]No RBS annotated, None foundaccacatcga
geneCPT-T1_044.gene34525..34749 [1]No RBS annotated, None foundgttcaaaaaa
geneCPT-T1_045.gene34739..35237 [1]Unannotated but valid RBS AGG cgcttaa
geneCPT-T1_046.gene35227..35638 [1]No RBS annotated, None foundacgcatattg
geneCPT-T1_047.gene35761..36179 [-1]No RBS annotated, None foundcaatcctcga
geneCPT-T1_048.gene36173..37755 [-1]No RBS annotated, None foundtcacgccgtc
geneCPT-T1_049.gene37741..38294 [-1]No RBS annotated, None foundgacggcacaa
geneCPT-T1_050.gene38277..38706 [-1]No RBS annotated, None foundtcaagataac
geneCPT-T1_051.gene38774..38994 [-1]No RBS annotated, None foundtttacattaa
geneCPT-T1_052.gene38987..39222 [-1]No RBS annotated, None foundtaccaaacaa
geneCPT-T1_053.gene39289..39444 [-1]No RBS annotated, None foundggcatatcaa
geneCPT-T1_054.gene39429..39758 [-1]No RBS annotated, None foundtatcctgact
geneCPT-T1_055.gene39766..39978 [-1]Unannotated but valid RBStagc GGA tcg
geneCPT-T1_056.gene39959..40342 [-1]No RBS annotated, None foundacatcaacag
geneCPT-T1_057.gene41020..41606 [1]No RBS annotated, None foundtgtatattga
geneCPT-T1_058.gene41606..41945 [1]Unannotated but valid RBSaaaatgt GGA
geneCPT-T1_059.gene42013..42254 [1]Unannotated but valid RBSac GAG atacc
geneCPT-T1_060.gene42247..42487 [1]No RBS annotated, None foundtcagcacttt
geneCPT-T1_061.gene42473..42594 [1]No RBS annotated, None foundggcgacaaca
geneCPT-T1_062.gene42580..42764 [1]No RBS annotated, None foundttgctactgt
geneCPT-T1_063.gene42824..43324 [1]No RBS annotated, None foundccatcgacaa
geneCPT-T1_064.gene43385..43881 [1]No RBS annotated, None foundacatcaacca
geneCPT-T1_065.gene43870..44030 [1]No RBS annotated, None foundactcttgtcc
geneCPT-T1_066.gene44098..44485 [1]No RBS annotated, None foundgcacgacaac
geneCPT-T1_067.gene44527..44649 [1]No RBS annotated, None foundgttccccttt
geneCPT-T1_068.gene44625..44852 [1]No RBS annotated, None foundgacattatct
geneCPT-T1_069.gene44898..45356 [1]No RBS annotated, None foundcctacaccaa
geneCPT-T1_070.gene45428..45970 [1]No RBS annotated, None foundttaagcaacc
geneCPT-T1_071.gene45941..46451 [1]No RBS annotated, None foundgcctgtgata
geneCPT-T1_072.gene46444..47026 [1]No RBS annotated, None foundtcacaaaaag
geneCPT-T1_073.gene47085..47308 [1]No RBS annotated, None foundctcatcgaca
geneCPT-T1_074.gene47293..47649 [1]No RBS annotated, None foundcccgatcgaa
geneCPT-T1_075.gene47637..47879 [1]No RBS annotated, None foundaagtttttca
geneCPT-T1_076.gene47864..48082 [1]Unannotated but valid RBScgt AGGT aac
geneCPT-T1_077.gene48235..48408 [1]No RBS annotated, None foundgcggcaacaa
geneCPT-T1_078.gene48377..48574 [1]No RBS annotated, None foundctggcgttat
geneCPT-T1_079.gene48564..48803 [1]No RBS annotated, None foundatattgaatt
+
+ +

Start Codon Usage

+

This section covers genes with unusual start codons

+
+ + + + + + + + + + + + +
Start CodonCount
ATG74
GTG3
TTG2
+
+ +
+ + + + + + + + + + + +
Feature TypeIDLocationError
+
+ +

Intergenic Gaps

+

Phage genomes are under pressure to maintain high coding density. Large intergenic gaps may be a sign of incorrect gene starts or missing genes.

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
RegionSizeBounding Gene Transcription DirectionMessage
2184 .. 223854→ → +
5876 .. 593761→ → +
6705 .. 679590→ → +
10414 .. 10527113→ → +
14432 .. 1451179→ → + 2 ORFs found in this region +
16621 .. 1669877→ → +
21506 .. 21765259→ → +
21906 .. 22033127→ → +
24289 .. 2435061→ → + 1 ORFs found in this region +
27573 .. 2763461→ → +
28087 .. 2818093→ → +
30612 .. 3067967→ → + 1 ORFs found in this region +
31641 .. 3170766→ → +
32399 .. 3247778→ → + 1 ORFs found in this region +
33611 .. 3368372→ → +
34166 .. 3423872→ → +
34415 .. 34533118→ → +
35638 .. 35760122→ → + 2 ORFs found in this region +
38693 .. 3877380→ → + 2 ORFs found in this region +
39211 .. 3928877→ → +
40330 .. 41030700→ → + 7 ORFs found in this region +
41945 .. 4202378→ → +
42764 .. 4283571→ → +
43324 .. 4339571→ → +
44030 .. 4411080→ → +
44485 .. 4453853→ → +
44852 .. 4490957→ → +
45356 .. 4543983→ → + 6 ORFs found in this region +
47026 .. 4709872→ → +
48082 .. 48246164→ → + 2 ORFs found in this region +
+
+ +

Overlapping Genes

+

Large gene overlaps may indicate an incorrect gene start or miscalled gene.

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Feature AFeature BShared RegionOverlap Length
CPT-T1_015.gene ([10516:10845](+))CPT-T1_016.gene ([10516:11162](+))10527..10844317 bp
CPT-T1_030.gene ([24351:26550](-))CPT-T1_031.gene ([26447:26600](-))26446..2653690 bp
CPT-T1_039.gene ([31881:32167](+))CPT-T1_040.gene ([32099:32399](+))32111..3216655 bp
+
+ + +

Antisense Genes

+

Possible Morons 78 / 79 (Doesn't count towards score)

+
+ + + + + + + + + + + + + + + +
FeatureRBSSurrounding Features
CPT-T1_025.geneNo RBS Available + → + → + + → + → + +
+
+ + +

Annotation Issues

+

Missing Product Tags 79 / 79

+
+ + + + + + + + + +
FeatureQualifiers
+
+ + + + + +
+
+
+ + + + +