# HG changeset patch # User cpt # Date 1685933052 0 # Node ID 4f4b413056f6182237d83faa9a9aba78cc043111 # Parent 6e7e20cb1fc757c94092ef6394043068654e4191 planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c diff -r 6e7e20cb1fc7 -r 4f4b413056f6 cpt-macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt-macros.xml Mon Jun 05 02:44:12 2023 +0000 @@ -0,0 +1,115 @@ + + + + python + biopython + requests + cpt_gffparser + + + + + + + + 10.1371/journal.pcbi.1008214 + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + + + 10.1371/journal.pcbi.1008214 + + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + + + + + 10.1371/journal.pcbi.1008214 + + @unpublished{galaxyTools, + author = {C. Ross}, + title = {CPT Galaxy Tools}, + year = {2020-}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + + + + + 10.1371/journal.pcbi.1008214 + + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + + + + + 10.1371/journal.pcbi.1008214 + + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + + + + + 10.1371/journal.pcbi.1008214 + + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + + + + + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + + + + diff -r 6e7e20cb1fc7 -r 4f4b413056f6 cpt_gff_rebase/cpt-macros.xml --- a/cpt_gff_rebase/cpt-macros.xml Fri Jun 17 04:00:49 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,115 +0,0 @@ - - - - - python - biopython - requests - - - - - - - - 10.1371/journal.pcbi.1008214 - @unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - - - - - 10.1371/journal.pcbi.1008214 - - @unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - - - - - - - 10.1371/journal.pcbi.1008214 - - @unpublished{galaxyTools, - author = {C. Ross}, - title = {CPT Galaxy Tools}, - year = {2020-}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - - - - - - - 10.1371/journal.pcbi.1008214 - - @unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - - - @unpublished{galaxyTools, - author = {A. Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - - - - - - - 10.1371/journal.pcbi.1008214 - - @unpublished{galaxyTools, - author = {A. Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - - - - - - - 10.1371/journal.pcbi.1008214 - - @unpublished{galaxyTools, - author = {C. Maughmer}, - title = {CPT Galaxy Tools}, - year = {2017-2020}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - - - - - - - @unpublished{galaxyTools, - author = {C. Maughmer}, - title = {CPT Galaxy Tools}, - year = {2017-2020}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - - - - diff -r 6e7e20cb1fc7 -r 4f4b413056f6 cpt_gff_rebase/gff3.py --- a/cpt_gff_rebase/gff3.py Fri Jun 17 04:00:49 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,346 +0,0 @@ -import copy -import logging - -log = logging.getLogger() -log.setLevel(logging.WARN) - - -def feature_lambda( - feature_list, - test, - test_kwargs, - subfeatures=True, - parent=None, - invert=False, - recurse=True, -): - """Recursively search through features, testing each with a test function, yielding matches. - - GFF3 is a hierachical data structure, so we need to be able to recursively - search through features. E.g. if you're looking for a feature with - ID='bob.42', you can't just do a simple list comprehension with a test - case. You don't know how deeply burried bob.42 will be in the feature tree. This is where feature_lambda steps in. - - :type feature_list: list - :param feature_list: an iterable of features - - :type test: function reference - :param test: a closure with the method signature (feature, **kwargs) where - the kwargs are those passed in the next argument. This - function should return True or False, True if the feature is - to be yielded as part of the main feature_lambda function, or - False if it is to be ignored. This function CAN mutate the - features passed to it (think "apply"). - - :type test_kwargs: dictionary - :param test_kwargs: kwargs to pass to your closure when it is called. - - :type subfeatures: boolean - :param subfeatures: when a feature is matched, should just that feature be - yielded to the caller, or should the entire sub_feature - tree for that feature be included? subfeatures=True is - useful in cases such as searching for a gene feature, - and wanting to know what RBS/Shine_Dalgarno_sequences - are in the sub_feature tree (which can be accomplished - with two feature_lambda calls). subfeatures=False is - useful in cases when you want to process (and possibly - return) the entire feature tree, such as applying a - qualifier to every single feature. - - :type invert: boolean - :param invert: Negate/invert the result of the filter. - - :rtype: yielded list - :return: Yields a list of matching features. - """ - # Either the top level set of [features] or the subfeature attribute - for feature in feature_list: - feature._parent = parent - if not parent: - # Set to self so we cannot go above root. - feature._parent = feature - test_result = test(feature, **test_kwargs) - # if (not invert and test_result) or (invert and not test_result): - if invert ^ test_result: - if not subfeatures: - feature_copy = copy.deepcopy(feature) - feature_copy.sub_features = list() - yield feature_copy - else: - yield feature - - if recurse and hasattr(feature, "sub_features"): - for x in feature_lambda( - feature.sub_features, - test, - test_kwargs, - subfeatures=subfeatures, - parent=feature, - invert=invert, - recurse=recurse, - ): - yield x - - -def fetchParent(feature): - if not hasattr(feature, "_parent") or feature._parent is None: - return feature - else: - return fetchParent(feature._parent) - - -def feature_test_true(feature, **kwargs): - return True - - -def feature_test_type(feature, **kwargs): - if "type" in kwargs: - return str(feature.type).upper() == str(kwargs["type"]).upper() - elif "types" in kwargs: - for x in kwargs["types"]: - if str(feature.type).upper() == str(x).upper(): - return True - return False - raise Exception("Incorrect feature_test_type call, need type or types") - - -def feature_test_qual_value(feature, **kwargs): - """Test qualifier values. - - For every feature, check that at least one value in - feature.quailfiers(kwargs['qualifier']) is in kwargs['attribute_list'] - """ - if isinstance(kwargs["qualifier"], list): - for qualifier in kwargs["qualifier"]: - for attribute_value in feature.qualifiers.get(qualifier, []): - if attribute_value in kwargs["attribute_list"]: - return True - else: - for attribute_value in feature.qualifiers.get(kwargs["qualifier"], []): - if attribute_value in kwargs["attribute_list"]: - return True - return False - - -def feature_test_location(feature, **kwargs): - if "strand" in kwargs: - if feature.location.strand != kwargs["strand"]: - return False - - return feature.location.start <= kwargs["loc"] <= feature.location.end - - -def feature_test_quals(feature, **kwargs): - """ - Example:: - - a = Feature(qualifiers={'Note': ['Some notes', 'Aasdf']}) - - # Check if a contains a Note - feature_test_quals(a, {'Note': None}) # Returns True - feature_test_quals(a, {'Product': None}) # Returns False - - # Check if a contains a note with specific value - feature_test_quals(a, {'Note': ['ome']}) # Returns True - - # Check if a contains a note with specific value - feature_test_quals(a, {'Note': ['other']}) # Returns False - """ - for key in kwargs: - if key not in feature.qualifiers: - return False - - # Key is present, no value specified - if kwargs[key] is None: - return True - - # Otherwise there is a key value we're looking for. - # so we make a list of matches - matches = [] - # And check all of the feature qualifier valuse - for value in feature.qualifiers[key]: - # For that kwargs[key] value - for x in kwargs[key]: - matches.append(x in value) - - # If none matched, then we return false. - if not any(matches): - return False - - return True - - -def feature_test_contains(feature, **kwargs): - if "index" in kwargs: - return feature.location.start < kwargs["index"] < feature.location.end - elif "range" in kwargs: - return ( - feature.location.start < kwargs["range"]["start"] < feature.location.end - and feature.location.start < kwargs["range"]["end"] < feature.location.end - ) - else: - raise RuntimeError("Must use index or range keyword") - - -def get_id(feature=None, parent_prefix=None): - result = "" - if parent_prefix is not None: - result += parent_prefix + "|" - if "locus_tag" in feature.qualifiers: - result += feature.qualifiers["locus_tag"][0] - elif "gene" in feature.qualifiers: - result += feature.qualifiers["gene"][0] - elif "Gene" in feature.qualifiers: - result += feature.qualifiers["Gene"][0] - elif "product" in feature.qualifiers: - result += feature.qualifiers["product"][0] - elif "Product" in feature.qualifiers: - result += feature.qualifiers["Product"][0] - elif "Name" in feature.qualifiers: - result += feature.qualifiers["Name"][0] - else: - return feature.id - # Leaving in case bad things happen. - # result += '%s_%s_%s_%s' % ( - # feature.id, - # feature.location.start, - # feature.location.end, - # feature.location.strand - # ) - return result - - -def get_gff3_id(gene): - return gene.qualifiers.get("Name", [gene.id])[0] - - -def ensure_location_in_bounds(start=0, end=0, parent_length=0): - # This prevents frameshift errors - while start < 0: - start += 3 - while end < 0: - end += 3 - while start > parent_length: - start -= 3 - while end > parent_length: - end -= 3 - return (start, end) - - -def coding_genes(feature_list): - for x in genes(feature_list): - if ( - len( - list( - feature_lambda( - x.sub_features, - feature_test_type, - {"type": "CDS"}, - subfeatures=False, - ) - ) - ) - > 0 - ): - yield x - - -def genes(feature_list, feature_type="gene", sort=False): - """ - Simple filter to extract gene features from the feature set. - """ - - if not sort: - for x in feature_lambda( - feature_list, feature_test_type, {"type": feature_type}, subfeatures=True - ): - yield x - else: - data = list(genes(feature_list, feature_type=feature_type, sort=False)) - data = sorted(data, key=lambda feature: feature.location.start) - for x in data: - yield x - - -def wa_unified_product_name(feature): - """ - Try and figure out a name. We gave conflicting instructions, so - this isn't as trivial as it should be. Sometimes it will be in - 'product' or 'Product', othertimes in 'Name' - """ - # Manually applied tags. - protein_product = feature.qualifiers.get( - "product", feature.qualifiers.get("Product", [None]) - )[0] - - # If neither of those are available ... - if protein_product is None: - # And there's a name... - if "Name" in feature.qualifiers: - if not is_uuid(feature.qualifiers["Name"][0]): - protein_product = feature.qualifiers["Name"][0] - - return protein_product - - -def is_uuid(name): - return name.count("-") == 4 and len(name) == 36 - - -def get_rbs_from(gene): - # Normal RBS annotation types - rbs_rbs = list( - feature_lambda( - gene.sub_features, feature_test_type, {"type": "RBS"}, subfeatures=False - ) - ) - rbs_sds = list( - feature_lambda( - gene.sub_features, - feature_test_type, - {"type": "Shine_Dalgarno_sequence"}, - subfeatures=False, - ) - ) - # Fraking apollo - apollo_exons = list( - feature_lambda( - gene.sub_features, feature_test_type, {"type": "exon"}, subfeatures=False - ) - ) - apollo_exons = [x for x in apollo_exons if len(x) < 10] - # These are more NCBI's style - regulatory_elements = list( - feature_lambda( - gene.sub_features, - feature_test_type, - {"type": "regulatory"}, - subfeatures=False, - ) - ) - rbs_regulatory = list( - feature_lambda( - regulatory_elements, - feature_test_quals, - {"regulatory_class": ["ribosome_binding_site"]}, - subfeatures=False, - ) - ) - # Here's hoping you find just one ;) - return rbs_rbs + rbs_sds + rbs_regulatory + apollo_exons - - -def nice_name(record): - """ - get the real name rather than NCBI IDs and so on. If fails, will return record.id - """ - name = record.id - likely_parental_contig = list(genes(record.features, feature_type="contig")) - if len(likely_parental_contig) == 1: - name = likely_parental_contig[0].qualifiers.get("organism", [name])[0] - return name - - -def fsort(it): - for i in sorted(it, key=lambda x: int(x.location.start)): - yield i diff -r 6e7e20cb1fc7 -r 4f4b413056f6 cpt_gff_rebase/gff3_rebase.py --- a/cpt_gff_rebase/gff3_rebase.py Fri Jun 17 04:00:49 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,131 +0,0 @@ -#!/usr/bin/env python -import sys -import logging -import argparse -from gff3 import feature_lambda, feature_test_qual_value -from CPT_GFFParser import gffParse, gffWrite -from Bio.SeqFeature import FeatureLocation - -log = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) - - -def __get_features(child, interpro=False): - child_features = {} - for rec in gffParse(child): - log.info("Parsing %s", rec.id) - # Only top level - for feature in rec.features: - # Get the record id as parent_feature_id (since this is how it will be during remapping) - parent_feature_id = rec.id - # If it's an interpro specific gff3 file - if interpro: - # Then we ignore polypeptide features as they're useless - if feature.type == "polypeptide": - continue - - try: - child_features[parent_feature_id].append(feature) - except KeyError: - child_features[parent_feature_id] = [feature] - # Keep a list of feature objects keyed by parent record id - return child_features - - -def __update_feature_location(feature, parent, protein2dna): - start = feature.location.start - end = feature.location.end - if protein2dna: - start *= 3 - end *= 3 - - if parent.location.strand >= 0: - ns = parent.location.start + start - ne = parent.location.start + end - st = +1 - else: - ns = parent.location.end - end - ne = parent.location.end - start - st = -1 - - # Don't let start/stops be less than zero. - # - # Instead, we'll replace with %3 to try and keep it in the same reading - # frame that it should be in. - - if ns < 0: - ns %= 3 - if ne < 0: - ne %= 3 - - feature.location = FeatureLocation(ns, ne, strand=st) - - if hasattr(feature, "sub_features"): - for subfeature in feature.sub_features: - __update_feature_location(subfeature, parent, protein2dna) - - -def rebase(parent, child, interpro=False, protein2dna=False, map_by="ID"): - # get all of the features we will be re-mapping in a dictionary, keyed by parent feature ID - child_features = __get_features(child, interpro=interpro) - - for rec in gffParse(parent): - replacement_features = [] - # Horrifically slow I believe - for feature in feature_lambda( - rec.features, - # Filter features in the parent genome by those that are - # "interesting", i.e. have results in child_features array. - # Probably an unnecessary optimisation. - feature_test_qual_value, - {"qualifier": map_by, "attribute_list": child_features.keys()}, - subfeatures=False, - ): - - # Features which will be re-mapped - to_remap = child_features[feature.id] - - fixed_features = [] - for x in to_remap: - # Then update the location of the actual feature - __update_feature_location(x, feature, protein2dna) - - if interpro: - for y in ("status", "Target"): - try: - del x.qualifiers[y] - except: - pass - - fixed_features.append(x) - replacement_features.extend(fixed_features) - # We do this so we don't include the original set of features that we - # were rebasing against in our result. - rec.features = replacement_features - rec.annotations = {} - gffWrite([rec], sys.stdout) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="rebase gff3 features against parent locations", epilog="" - ) - parser.add_argument( - "parent", type=argparse.FileType("r"), help="Parent GFF3 annotations" - ) - parser.add_argument( - "child", - type=argparse.FileType("r"), - help="Child GFF3 annotations to rebase against parent", - ) - parser.add_argument( - "--interpro", action="store_true", help="Interpro specific modifications" - ) - parser.add_argument( - "--protein2dna", - action="store_true", - help="Map protein translated results to original DNA data", - ) - parser.add_argument("--map_by", help="Map by key", default="ID") - args = parser.parse_args() - rebase(**vars(args)) diff -r 6e7e20cb1fc7 -r 4f4b413056f6 cpt_gff_rebase/gff3_rebase.xml --- a/cpt_gff_rebase/gff3_rebase.xml Fri Jun 17 04:00:49 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,102 +0,0 @@ - - against parent features - - macros.xml - cpt-macros.xml - - - $default]]> - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - cds42 - MRTNASC - -Then analyzed that feature, producing the *child* annotation file:: - - #gff-version 3 - cds42 blastp match_part 1 50 1e-40 . . ID=m00001;Notes=RNAse A Protein - -This tool will then localize the results properly against the parent and permit -proper visualization of the results in the correct location:: - - #gff-version 3 - PhageBob blastp match_part 300 449 1e-40 + . ID=m00001;Notes=RNAse A Protein - -**Options** - -The **Interpro specific modifications** option selectively ignores *features* (*i.e.* polypeptide) and -qualifiers (status, Target) not needed in the output. - -The **Map protein translated results to original DNA data** option indicates that the DNA sequences were translated into -protein sequence during the genomic export process. When this option is selected, -the tool will multiply the bases by three to obtain the correct DNA locations. - -]]> - - diff -r 6e7e20cb1fc7 -r 4f4b413056f6 cpt_gff_rebase/macros.xml --- a/cpt_gff_rebase/macros.xml Fri Jun 17 04:00:49 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,85 +0,0 @@ - - - - - python - biopython - cpt_gffparser - - - - - "$blast_tsv" - - - - - - - "$blast_xml" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - "$gff3_data" - - -#if str($reference_genome.reference_genome_source) == 'cached': - "${reference_genome.fasta_indexes.fields.path}" -#else if str($reference_genome.reference_genome_source) == 'history': - genomeref.fa -#end if - - -#if $reference_genome.reference_genome_source == 'history': - ln -s $reference_genome.genome_fasta genomeref.fa; -#end if - - -#if str($reference_genome.reference_genome_source) == 'cached': - "${reference_genome.fasta_indexes.fields.path}" -#else if str($reference_genome.reference_genome_source) == 'history': - genomeref.fa -#end if - - - - - - - "$sequences" - - - - - diff -r 6e7e20cb1fc7 -r 4f4b413056f6 cpt_gff_rebase/test-data/T7_CLEAN.gff3 --- a/cpt_gff_rebase/test-data/T7_CLEAN.gff3 Fri Jun 17 04:00:49 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,171 +0,0 @@ -##gff-version 3 -NC_001604 GenBank contig 1 39937 . + 1 ID=NC_001604;Dbxref=BioProject:PRJNA485481,taxon:10760;Name=NC_001604;Note=Enterobacteria phage T7%2C complete genome.,VALIDATED REFSEQ: This record has undergone validation or preliminary review. The reference sequence was derived from V01146. The sequence was submitted by the authors [1] on magnetic tape and revised according to [3],[4],and [5]. [3] made changes at 8 positions in gene 1 without affecting the size of the total sequence but changing gene 1 amino acids 443,474,and 388 to 424. [4] inserted a T at nucleotide 17511,increasing the total sequence to 39937 bp. This change,originally found in T3 DNA [8],revealed gene 5.9 and shortened gene 6. [5] changed the nucleotides at 11061 and 11062 from GT to TG,changing amino acid 119 of T7 lysozyme (gene 3.5) from glycine to valine. Features have been extracted from [1] unless otherwise noted. The sequence shown is that of the l strand,which corresponds to the sequence of all mRNAs of known functional significance. Early mRNAs are produced by three major promoters for E. coli RNA polymerase A1,A2,and A3,located near the left end of the DNA. A fourth major E. coli promoter,A0 (also called D),that would direct transcription leftward,and several minor E. coli promoters (see Table 6 in [1]) function in vitro but have no known in vivo function. Late mRNAs are produced by 15 promoters for T7 RNA polymerase distributed across the right-most 85%25 of the DNA,and named e.g. phi10,for the first gene downstream of the promoter. There are also two T7 promoters,phiOL and phiOR,associated with possible origins of replication at the left and right ends of T7 DNA. The 23 base-pair consensus sequence for T7 promoters stretches from -17 to +6,where the initiating nucleotide is at +1. T7 DNA also contains a 160 base-pair terminal repetition. The beginning and end of RNAs are determined by the promoters,by a terminator for E. coli RNA polymerase,TE,located at the end of the early region,a terminator for T7 RNA polymerase,Tphi,located just downstream of gene 10,and a series of RNase III cleavage sites. Early mRNAs made by E. coli RNA polymerase are listed in Features. The many RNAs predicted to be made by T7 RNA polymerase are not listed but can be deduced from the position of the transcription signals (see Tables 8 and 9 in [1]). Promoters are listed in Features by the known or predicted first nucleotide of the RNA,terminators by the last nucleotide of the RNA,and RNase III sites by the nucleotide 5' of the position of cleavage. Genes are numbered 0.3 to 19.5 in order of their left-to-right position on the genome. Proteins are named by the gene number,e.g.,the gene 1 protein,or by a functional name,e.g.,T7 RNA polymerase. There is now genetic or biochemical evidence that proteins are produced from at least 52 of the 56 T7 genes. Gene 4 produces two proteins,4A and 4B,by initiating translation at two different sites in the same reading frame. Gene 10 produces two proteins,10A and 10B,by frameshifting during translation. Genes 0.6 and 5.5 probably also make two proteins by translational frameshifting,the gene 5.5 frameshift producing a gene 5.5-5.7 fusion protein. COMPLETENESS: full length. ;comment1=VALIDATED REFSEQ: This record has undergone validation or preliminary review. The reference sequence was derived from V01146. The sequence was submitted by the authors [1] on magnetic tape and revised according to [3]%2C [4]%2C and [5]. [3] made changes at 8 positions in gene 1 without affecting the size of the total sequence but changing gene 1 amino acids 443%2C 474%2C and 388 to 424. [4] inserted a T at nucleotide 17511%2C increasing the total sequence to 39937 bp. This change%2C originally found in T3 DNA [8]%2C revealed gene 5.9 and shortened gene 6. [5] changed the nucleotides at 11061 and 11062 from GT to TG%2C changing amino acid 119 of T7 lysozyme (gene 3.5) from glycine to valine. Features have been extracted from [1] unless otherwise noted. The sequence shown is that of the l strand%2C which corresponds to the sequence of all mRNAs of known functional significance. Early mRNAs are produced by three major promoters for E. coli RNA polymerase A1%2C A2%2C and A3%2C located near the left end of the DNA. A fourth major E. coli promoter%2C A0 (also called D)%2C that would direct transcription leftward%2C and several minor E. coli promoters (see Table 6 in [1]) function in vitro but have no known in vivo function. Late mRNAs are produced by 15 promoters for T7 RNA polymerase distributed across the right-most 85%25 of the DNA%2C and named e.g. phi10%2C for the first gene downstream of the promoter. There are also two T7 promoters%2C phiOL and phiOR%2C associated with possible origins of replication at the left and right ends of T7 DNA. The 23 base-pair consensus sequence for T7 promoters stretches from -17 to +6%2C where the initiating nucleotide is at +1. T7 DNA also contains a 160 base-pair terminal repetition. The beginning and end of RNAs are determined by the promoters%3B by a terminator for E. coli RNA polymerase%2C TE%2C located at the end of the early region%3B a terminator for T7 RNA polymerase%2C Tphi%2C located just downstream of gene 10%3B and a series of RNase III cleavage sites. Early mRNAs made by E. coli RNA polymerase are listed in Features. The many RNAs predicted to be made by T7 RNA polymerase are not listed but can be deduced from the position of the transcription signals (see Tables 8 and 9 in [1]). Promoters are listed in Features by the known or predicted first nucleotide of the RNA%2C terminators by the last nucleotide of the RNA%2C and RNase III sites by the nucleotide 5' of the position of cleavage. Genes are numbered 0.3 to 19.5 in order of their left-to-right position on the genome. Proteins are named by the gene number%2C e.g.%2C the gene 1 protein%2C or by a functional name%2C e.g.%2C T7 RNA polymerase. There is now genetic or biochemical evidence that proteins are produced from at least 52 of the 56 T7 genes. Gene 4 produces two proteins%2C 4A and 4B%2C by initiating translation at two different sites in the same reading frame. Gene 10 produces two proteins%2C 10A and 10B%2C by frameshifting during translation. Genes 0.6 and 5.5 probably also make two proteins by translational frameshifting%2C the gene 5.5 frameshift producing a gene 5.5-5.7 fusion protein. COMPLETENESS: full length. ;date=13-AUG-2018;host=Escherichia coli;mol_type=genomic DNA;organism=Escherichia phage T7; -NC_001604 GenBank regulatory 224 224 . + 1 ID=GenBank:regulatory:NC_001604:224:224;Note=E. coli promoter A0 (leftward);regulatory_class=promoter; -NC_001604 GenBank regulatory 405 405 . + 1 ID=GenBank:regulatory:NC_001604:405:405;Note=T7 promoter phiOL;regulatory_class=promoter; -NC_001604 GenBank regulatory 498 498 . + 1 ID=GenBank:regulatory:NC_001604:498:498;Note=E. coli promoter A1;regulatory_class=promoter; -NC_001604 GenBank regulatory 626 626 . + 1 ID=GenBank:regulatory:NC_001604:626:626;Note=E. coli promoter A2;regulatory_class=promoter; -NC_001604 GenBank regulatory 750 750 . + 1 ID=GenBank:regulatory:NC_001604:750:750;Note=E. coli promoter A3;regulatory_class=promoter; -NC_001604 GenBank sequence_secondary_structure 890 890 . + 1 ID=GenBank:sequence_secondary_structure:NC_001604:890:890;Note=RNase III site R0.3; -NC_001604 GenBank gene 891 1468 . + 1 ID=T7p01;Dbxref=GeneID:1261063;Name=T7p01;Note=gene 0.3; -NC_001604 GenBank mRNA 891 1468 . + 1 ID=T7p01.t01;Parent=T7p01;Dbxref=GeneID:1261063;Name=T7p01;product=0.3 mRNA; -NC_001604 GenBank CDS 925 1278 . + 1 ID=T7p01.p01;Parent=T7p01.t01;Dbxref=UniProtKB/Swiss-Prot:P03775,GeneID:1261063;Name=T7p01;Note=gene 0.3%2C inhibits EcoB and EcoK host restriction;codon_start=1;product=hypothetical protein;protein_id=NP_041954.1;transl_table=11;translation=length.117; -NC_001604 GenBank exon 891 1468 . + 1 Parent=T7p01.t01;Name=T7p01; -NC_001604 GenBank sequence_secondary_structure 1468 1468 . + 1 Parent=T7p01;Name=T7p01;Note=RNase III site R0.5; -NC_001604 GenBank CDS 1278 1433 . + 1 ID=T7p02;Dbxref=UniProtKB/Swiss-Prot:P03776,GeneID:1261053;Name=T7p02;Note=gene 0.4;codon_start=1;product=hypothetical protein;protein_id=NP_041955.1;transl_table=11;translation=length.51; -NC_001604 GenBank gene 1278 1433 . + 1 ID=T7p02.gene;Alias=T7p02;Dbxref=GeneID:1261053;Name=T7p02;Note=gene 0.4; -NC_001604 GenBank gene 1469 3138 . + 1 ID=T7p03;Dbxref=GeneID:1261070;Name=T7p03;Note=gene 0.7; -NC_001604 GenBank mRNA 1469 3138 . + 1 ID=T7p03.t01;Parent=T7p03;Dbxref=GeneID:1261070;Name=T7p03;product=0.7 mRNA; -NC_001604 GenBank CDS 2021 3100 . + 1 ID=T7p03.p01;Parent=T7p03.t01;Dbxref=GOA:P00513,UniProtKB/Swiss-Prot:P00513,GeneID:1261070;Name=T7p03;Note=The T7 seryl-threonyl protein kinase gp0.7 is involved in host transcription shutoff and Col Ib exclusion. Phosphorylates E. coli RNA polymerase. Other names: gp0.7.;codon_start=1;product=protein kinase;protein_id=NP_041959.1;transl_table=11;translation=length.359; -NC_001604 GenBank exon 1469 3138 . + 1 Parent=T7p03.t01;Name=T7p03; -NC_001604 GenBank regulatory 3113 3113 . + 1 Parent=T7p03;Name=T7p03;Note=E. coli C promoter;regulatory_class=promoter; -NC_001604 GenBank sequence_secondary_structure 3138 3138 . + 1 Parent=T7p03;Name=T7p03;Note=RNase III site R1; -NC_001604 GenBank CDS 1496 1639 . + 1 ID=T7p04;Dbxref=UniProtKB/Swiss-Prot:P03777,GeneID:1261051;Name=T7p04;Note=gene 0.5;codon_start=1;product=hypothetical protein;protein_id=NP_041956.1;transl_table=11;translation=length.47; -NC_001604 GenBank regulatory 1514 1514 . + 1 Parent=T7p04;Name=T7p04;Note=E. coli B promoter;regulatory_class=promoter; -NC_001604 GenBank gene 1496 1639 . + 1 ID=T7p04.gene;Alias=T7p04;Dbxref=GeneID:1261051;Name=T7p04;Note=gene 0.5; -NC_001604 GenBank CDS 1636 1794 . + 1 ID=T7p05.cds1;Dbxref=UniProtKB/Swiss-Prot:P03778,GeneID:1261061;Name=T7p05;Note=possible gene 0.6B;codon_start=1;product=hypothetical protein;protein_id=NP_041957.1;transl_table=11;translation=length.111; -NC_001604 GenBank CDS 1796 1972 . + 1 ID=T7p05.cds2;Dbxref=UniProtKB/Swiss-Prot:P03778,GeneID:1261061;Name=T7p05;Note=possible gene 0.6B;codon_start=1;product=hypothetical protein;protein_id=NP_041957.1;transl_table=11;translation=length.111; -NC_001604 GenBank gene 1636 1972 . + 1 ID=T7p05.gene;Alias=T7p05;Dbxref=GeneID:1261061;Name=T7p05;Note=possible gene 0.6B; -NC_001604 GenBank CDS 1636 1797 . + 1 ID=T7p06;Dbxref=UniProtKB/Swiss-Prot:P03778,GeneID:1261071;Name=T7p06;Note=gene 0.6A;codon_start=1;product=hypothetical protein;protein_id=NP_041958.1;transl_table=11;translation=length.53; -NC_001604 GenBank gene 1636 1797 . + 1 ID=T7p06.gene;Alias=T7p06;Dbxref=GeneID:1261071;Name=T7p06;Note=gene 0.6A; -NC_001604 GenBank gene 3139 5887 . + 1 ID=T7p07;Dbxref=GeneID:1261050;Name=T7p07;Note=gene 1; -NC_001604 GenBank mRNA 3139 5887 . + 1 ID=T7p07.t01;Parent=T7p07;Dbxref=GeneID:1261050;Name=T7p07;product=1 mRNA; -NC_001604 GenBank CDS 3171 5822 . + 1 ID=T7p07.p01;Parent=T7p07.t01;Dbxref=GOA:P00573,UniProtKB/Swiss-Prot:P00573,GeneID:1261050;Name=T7p07;Note=A family of single subunit RNA polymerases.;codon_start=1;product=T3/T7-like RNA polymerase;protein_id=NP_041960.1;transl_table=11;translation=length.883; -NC_001604 GenBank exon 3139 5887 . + 1 Parent=T7p07.t01;Name=T7p07; -NC_001604 GenBank regulatory 5848 5848 . + 1 Parent=T7p07;Name=T7p07;Note=T7 promoter phi1.1A;regulatory_class=promoter; -NC_001604 GenBank sequence_secondary_structure 5887 5887 . + 1 Parent=T7p07;Name=T7p07;Note=RNase III site R1.1; -NC_001604 GenBank gene 5888 6448 . + 1 ID=T7p08;Dbxref=GeneID:1261049;Name=T7p08;Note=gene 1.2; -NC_001604 GenBank mRNA 5888 6448 . + 1 ID=T7p08.t01;Parent=T7p08;Dbxref=GeneID:1261049;Name=T7p08;product=1.1 mRNA; -NC_001604 GenBank CDS 6137 6394 . + 1 ID=T7p08.p01;Parent=T7p08.t01;Dbxref=GOA:P03780,UniProtKB/Swiss-Prot:P03780,GeneID:1261049;Name=T7p08;Note=inhibits activity of the host dGTPase [dgt]. Essential only in strains that overexpress dGTPase [optA1 mutation]. In T7%2C gp1.2 also causes F plasmid exclusion. In T3%2C however%2C gp1.2 overcomes the exclusion system. Other names: dGTP triphosphohydrolase inhibitor%3B gp1.2.;codon_start=1;product=host dGTPase inhibitor;protein_id=NP_041962.1;transl_table=11;translation=length.85; -NC_001604 GenBank exon 5888 6448 . + 1 Parent=T7p08.t01;Name=T7p08; -NC_001604 GenBank regulatory 5923 5923 . + 1 Parent=T7p08;Name=T7p08;Note=T7 promoter phi1.1B;regulatory_class=promoter; -NC_001604 GenBank regulatory 6409 6409 . + 1 Parent=T7p08;Name=T7p08;Note=T7 promoter phi1.3;regulatory_class=promoter; -NC_001604 GenBank sequence_secondary_structure 6448 6448 . + 1 Parent=T7p08;Name=T7p08;Note=RNase III site R1.3; -NC_001604 GenBank CDS 6007 6135 . + 1 ID=T7p09;Dbxref=UniProtKB/Swiss-Prot:P03779,GeneID:1261072;Name=T7p09;Note=other names: gp1.1;codon_start=1;product=hypothetical protein;protein_id=NP_041961.1;transl_table=11;translation=length.42; -NC_001604 GenBank gene 6007 6135 . + 1 ID=T7p09.gene;Alias=T7p09;Dbxref=GeneID:1261072;Name=T7p09;Note=gene 1.1; -NC_001604 GenBank gene 6449 7588 . + 1 ID=T7p10;Dbxref=GeneID:1261055;Name=T7p10;Note=gene 1.3; -NC_001604 GenBank mRNA 6449 7588 . + 1 ID=T7p10.t01;Parent=T7p10;Dbxref=GeneID:1261055;Name=T7p10;product=1.3 mRNA; -NC_001604 GenBank CDS 6475 7554 . + 1 ID=T7p10.p01;Parent=T7p10.t01;Dbxref=GOA:P00969,UniProtKB/Swiss-Prot:P00969,GeneID:1261055;Name=T7p10;Note=Catalyzes the ATP-dependent formation of a phosphodiester bond at the site of single-stranded breaks in double-stranded DNA. T7 ligase is essential in ligase-deficient hosts only.;codon_start=1;product=ATP-dependent DNA ligase;protein_id=NP_041963.1;transl_table=11;translation=length.359; -NC_001604 GenBank exon 6449 7588 . + 1 Parent=T7p10.t01;Name=T7p10; -NC_001604 GenBank regulatory 7588 7588 . + 1 Parent=T7p10;Name=T7p10;Note=E. coli transcription terminator TE;regulatory_class=terminator; -NC_001604 GenBank CDS 7608 7763 . + 1 ID=T7p11;Dbxref=UniProtKB/Swiss-Prot:P03791,GeneID:1261075;Name=T7p11;Note=gene 1.4;codon_start=1;product=hypothetical protein;protein_id=NP_041964.1;transl_table=11;translation=length.51; -NC_001604 GenBank regulatory 7778 7778 . + 1 ID=GenBank:regulatory:NC_001604:7778:7778;Note=T7 promoter phi1.5;regulatory_class=promoter; -NC_001604 GenBank gene 7608 7763 . + 1 ID=T7p11.gene;Alias=T7p11;Dbxref=GeneID:1261075;Name=T7p11;Note=gene 1.4; -NC_001604 GenBank CDS 7791 7880 . + 1 ID=T7p12;Dbxref=UniProtKB/Swiss-Prot:P03792,GeneID:1261074;Name=T7p12;Note=gene 1.5;codon_start=1;product=hypothetical protein;protein_id=NP_041965.1;transl_table=11;translation=length.29; -NC_001604 GenBank regulatory 7895 7895 . + 1 ID=GenBank:regulatory:NC_001604:7895:7895;Note=T7 promoter phi1.6;regulatory_class=promoter; -NC_001604 GenBank gene 7791 7880 . + 1 ID=T7p12.gene;Alias=T7p12;Dbxref=GeneID:1261074;Name=T7p12;Note=gene 1.5; -NC_001604 GenBank CDS 7906 8166 . + 1 ID=T7p13;Dbxref=UniProtKB/Swiss-Prot:P03793,GeneID:1261076;Name=T7p13;Note=gene 1.6;codon_start=1;product=hypothetical protein;protein_id=NP_041966.1;transl_table=11;translation=length.86; -NC_001604 GenBank gene 7906 8166 . + 1 ID=T7p13.gene;Alias=T7p13;Dbxref=GeneID:1261076;Name=T7p13;Note=gene 1.6; -NC_001604 GenBank CDS 8166 8756 . + 1 ID=T7p14;Dbxref=UniProtKB/Swiss-Prot:P03781,GeneID:1261060;Name=T7p14;Note=gene 1.7;codon_start=1;product=hypothetical protein;protein_id=NP_041967.1;transl_table=11;translation=length.196; -NC_001604 GenBank gene 8166 8756 . + 1 ID=T7p14.gene;Alias=T7p14;Dbxref=GeneID:1261060;Name=T7p14;Note=gene 1.7; -NC_001604 GenBank CDS 8749 8895 . + 1 ID=T7p15;Dbxref=UniProtKB/Swiss-Prot:P03794,GeneID:1261054;Name=T7p15;Note=not essential in T7. Other names: gp1.8;codon_start=1;product=hypothetical protein;protein_id=NP_041968.1;transl_table=11;translation=length.48; -NC_001604 GenBank gene 8749 8895 . + 1 ID=T7p15.gene;Alias=T7p15;Dbxref=GeneID:1261054;Name=T7p15;Note=gene 1.8; -NC_001604 GenBank CDS 8898 9092 . + 1 ID=T7p16;Dbxref=UniProtKB/Swiss-Prot:P03704,GeneID:1261073;Name=T7p16;Note=T7 RNA polymerase inhibitor binds to host RNA pol and suppresses its activity on a subset of promoters. gp2 deficient T7 display reduced DNA replication and premature breakdown of replicating DNA%2C specifically at the left end of the genome%2C along with the presence of empty proheads. Rifampin can compensate for the missing gp2 function. Other names: gp2.;codon_start=1;product=inhibitor of host bacterial RNA polymerase;protein_id=NP_041969.1;transl_table=11;translation=length.64; -NC_001604 GenBank regulatory 9107 9107 . + 1 ID=GenBank:regulatory:NC_001604:9107:9107;Note=T7 promoter phi2.5;regulatory_class=promoter; -NC_001604 GenBank gene 8898 9092 . + 1 ID=T7p16.gene;Alias=T7p16;Dbxref=GeneID:1261073;Name=T7p16;Note=gene 2; -NC_001604 GenBank CDS 9158 9856 . + 1 ID=T7p17;Dbxref=GOA:P03696,UniProtKB/Swiss-Prot:P03696,GeneID:1261080;Name=T7p17;Note=binds single-stranded DNA. In phage T7 gp2.5 is essential for DNA replication and recombination. Other names: gp2.5%3B SSB.;codon_start=1;product=single-stranded DNA-binding protein;protein_id=NP_041970.1;transl_table=11;translation=length.232; -NC_001604 GenBank gene 9158 9856 . + 1 ID=T7p17.gene;Alias=T7p17;Dbxref=GeneID:1261080;Name=T7p17;Note=gene 2.5; -NC_001604 GenBank CDS 9857 10276 . + 1 ID=T7p18;Dbxref=GOA:P03795,UniProtKB/Swiss-Prot:P03795,GeneID:1261078;Name=T7p18;Note=gene 2.8;codon_start=1;product=hypothetical protein;protein_id=NP_041971.1;transl_table=11;translation=length.139; -NC_001604 GenBank gene 9857 10276 . + 1 ID=T7p18.gene;Alias=T7p18;Dbxref=GeneID:1261078;Name=T7p18;Note=gene 2.8; -NC_001604 GenBank CDS 10257 10706 . + 1 ID=T7p19;Dbxref=GOA:P00641,UniProtKB/Swiss-Prot:P00641,GeneID:1261079;Name=T7p19;Note=T7 endonuclease I is a Holliday junction resolvase encoded by T7 gene 3. Mutants in gene 3 are defective in recombination and accumulate branched DNA. Endonuclease I may also play a role in the degradation of the host genome following infection with T7.;codon_start=1;product=endonuclease I;protein_id=NP_041972.1;transl_table=11;translation=length.149; -NC_001604 GenBank gene 10257 10706 . + 1 ID=T7p19.gene;Alias=T7p19;Dbxref=GeneID:1261079;Name=T7p19;Note=gene 3; -NC_001604 GenBank CDS 10706 11161 . + 1 ID=T7p20;Dbxref=GOA:P00806,UniProtKB/Swiss-Prot:P00806,GeneID:1261077;Name=T7p20;Note=T7 lysozyme hydrolyzes an amide bond in the host cell wall following its release from the cytoplasm. In addition%2C T7 lysozyme inhibits T7 RNA polymerase initiation. This inhibition is greater for class II promoters than class III promoters and therefore may aid in temporal regulation of transcription and the switch to particle assembly. In T7%2C lysozyme%2C unlike the T7 holin%2C is expressed with and lies in the same region as the replication genes. Lack of gp3.5 reduces replication and burst size and delays%2C but does not completely prevent lysis. Mutations in the muralytic domain of gene 16%2C an inner capsid protein%2C can partially compensate for a deletion of gp3.5. Other names: gp3.5%3B amidase%3B N-acetylmuramoyl-L-alanine amidase;codon_start=1;product=lysozyme;protein_id=NP_041973.1;transl_table=11;translation=length.151; -NC_001604 GenBank regulatory 11180 11180 . + 1 ID=GenBank:regulatory:NC_001604:11180:11180;Note=T7 promoter phi3.8;regulatory_class=promoter; -NC_001604 GenBank sequence_secondary_structure 11203 11203 . + 1 ID=GenBank:sequence_secondary_structure:NC_001604:11203:11203;Note=possible RNase III site R3.8; -NC_001604 GenBank gene 10706 11161 . + 1 ID=T7p20.gene;Alias=T7p20;Dbxref=GeneID:1261077;Name=T7p20;Note=gene 3.5; -NC_001604 GenBank CDS 11225 11590 . + 1 ID=T7p21;Dbxref=GOA:P03797,UniProtKB/Swiss-Prot:P03797,GeneID:1261065;Name=T7p21;Note=gene 3.8;codon_start=1;product=putative NHN endonuclease;protein_id=NP_041974.1;transl_table=11;translation=length.121; -NC_001604 GenBank gene 11225 11590 . + 1 ID=T7p21.gene;Alias=T7p21;Dbxref=GeneID:1261065;Name=T7p21;Note=gene 3.8; -NC_001604 GenBank CDS 11565 13265 . + 1 ID=T7p22;Dbxref=GOA:P03692,UniProtKB/Swiss-Prot:P03692,GeneID:1261046;Name=T7p22;Note=gene 4A%2C primase/helicase [14%2C15];codon_start=1;product=DNA primase/helicase;protein_id=NP_041975.1;transl_table=11;translation=length.566; -NC_001604 GenBank gene 11565 13265 . + 1 ID=T7p22.gene;Alias=T7p22;Dbxref=GeneID:1261046;Name=T7p22;Note=gene 4A; -NC_001604 GenBank CDS 11635 11757 . + 1 ID=T7p23;Dbxref=UniProtKB/Swiss-Prot:P03782,GeneID:1261047;Name=T7p23;Note=gene 4.1;codon_start=1;product=hypothetical protein;protein_id=NP_041976.1;transl_table=11;translation=length.40; -NC_001604 GenBank gene 11635 11757 . + 1 ID=T7p23.gene;Alias=T7p23;Dbxref=GeneID:1261047;Name=T7p23;Note=gene 4.1; -NC_001604 GenBank CDS 11754 13265 . + 1 ID=T7p24;Dbxref=GOA:P03692,UniProtKB/Swiss-Prot:P03692,GeneID:1261048;Name=T7p24;Note=gene 4B/helicase [14%2C15];codon_start=1;product=helicase;protein_id=NP_041977.1;transl_table=11;translation=length.503; -NC_001604 GenBank regulatory 12671 12671 . + 1 Parent=T7p24;Name=T7p24;Note=T7 promoter phi4c;regulatory_class=promoter; -NC_001604 GenBank gene 11754 13265 . + 1 ID=T7p24.gene;Alias=T7p24;Dbxref=GeneID:1261048;Name=T7p24;Note=gene 4B; -NC_001604 GenBank CDS 12988 13326 . + 1 ID=T7p25;Dbxref=UniProtKB/Swiss-Prot:P03783,GeneID:1261021;Name=T7p25;Note=gene 4.2;codon_start=1;product=hypothetical protein;protein_id=NP_041978.1;transl_table=11;translation=length.112; -NC_001604 GenBank regulatory 13341 13341 . + 1 ID=GenBank:regulatory:NC_001604:13341:13341;Note=T7 promoter phi4.3;regulatory_class=promoter; -NC_001604 GenBank gene 12988 13326 . + 1 ID=T7p25.gene;Alias=T7p25;Dbxref=GeneID:1261021;Name=T7p25;Note=gene 4.2; -NC_001604 GenBank CDS 13352 13564 . + 1 ID=T7p26;Dbxref=UniProtKB/Swiss-Prot:P03784,GeneID:1261069;Name=T7p26;Note=not essential in T7%3B Other names: gp4.3.;codon_start=1;product=hypothetical protein;protein_id=NP_041979.1;transl_table=11;translation=length.70; -NC_001604 GenBank gene 13352 13564 . + 1 ID=T7p26.gene;Alias=T7p26;Dbxref=GeneID:1261069;Name=T7p26;Note=gene 4.3; -NC_001604 GenBank CDS 13584 13853 . + 1 ID=T7p27;Dbxref=UniProtKB/Swiss-Prot:P03785,GeneID:1261059;Name=T7p27;Note=not essential in T7. Other names: gp4.5.;codon_start=1;product=hypothetical protein;protein_id=NP_041980.1;transl_table=11;translation=length.89; -NC_001604 GenBank sequence_secondary_structure 13892 13892 . + 1 ID=GenBank:sequence_secondary_structure:NC_001604:13892:13892;Note=RNase III site R4.7; -NC_001604 GenBank regulatory 13915 13915 . + 1 ID=GenBank:regulatory:NC_001604:13915:13915;Note=T7 promoter phi4.7;regulatory_class=promoter; -NC_001604 GenBank gene 13584 13853 . + 1 ID=T7p27.gene;Alias=T7p27;Dbxref=GeneID:1261059;Name=T7p27;Note=gene 4.5; -NC_001604 GenBank CDS 13927 14334 . + 1 ID=T7p28;Dbxref=UniProtKB/Swiss-Prot:P03786,GeneID:1261043;Name=T7p28;Note=gene 4.7;codon_start=1;product=hypothetical protein;protein_id=NP_041981.1;transl_table=11;translation=length.135; -NC_001604 GenBank gene 13927 14334 . + 1 ID=T7p28.gene;Alias=T7p28;Dbxref=GeneID:1261043;Name=T7p28;Note=gene 4.7; -NC_001604 GenBank CDS 14353 16467 . + 1 ID=T7p29;Dbxref=GOA:P00581,UniProtKB/Swiss-Prot:P00581,GeneID:1261044;Name=T7p29;Note=gene 5;codon_start=1;product=DNA polymerase;protein_id=NP_041982.1;transl_table=11;translation=length.704; -NC_001604 GenBank gene 14353 16467 . + 1 ID=T7p29.gene;Alias=T7p29;Dbxref=GeneID:1261044;Name=T7p29;Note=gene 5; -NC_001604 GenBank CDS 16483 16839 . + 1 ID=T7p30;Dbxref=UniProtKB/Swiss-Prot:P03798,GeneID:1261045;Name=T7p30;Note=gene 5.3;codon_start=1;product=hypothetical protein;protein_id=NP_041983.1;transl_table=11;translation=length.118; -NC_001604 GenBank gene 16483 16839 . + 1 ID=T7p30.gene;Alias=T7p30;Dbxref=GeneID:1261045;Name=T7p30;Note=gene 5.3; -NC_001604 GenBank CDS 16851 17147 . + 1 ID=T7p31.cds1;Dbxref=GOA:P03787,UniProtKB/Swiss-Prot:P03787,GeneID:1261041;Name=T7p31;Note=possible gene 5.5-5.7;codon_start=1;product=hypothetical protein;protein_id=NP_041984.1;transl_table=11;translation=length.169; -NC_001604 GenBank CDS 17147 17359 . + 1 ID=T7p31.cds2;Dbxref=GOA:P03787,UniProtKB/Swiss-Prot:P03787,GeneID:1261041;Name=T7p31;Note=possible gene 5.5-5.7;codon_start=1;product=hypothetical protein;protein_id=NP_041984.1;transl_table=11;translation=length.169; -NC_001604 GenBank gene 16851 17359 . + 1 ID=T7p31.gene;Alias=T7p31;Dbxref=GeneID:1261041;Name=T7p31;Note=possible gene 5.5-5.7; -NC_001604 GenBank CDS 16851 17150 . + 1 ID=T7p32;Dbxref=GOA:P03787,UniProtKB/Swiss-Prot:P03787,GeneID:1261038;Name=T7p32;Note=in Enterobacteria phage T7%2C gp5.5 abolishes E. coli nucleoid protein H-NS-mediated inhibition of transcription by T7 RNA polymerases in vitro. Not essential%2C but mutants have lower burst size. Mutants in this gene are not capable of replicating in phage lambda lysogens. Other names: gp5.5;codon_start=1;product=host protein H-NS-interacting protein;protein_id=NP_041985.1;transl_table=11;translation=length.99; -NC_001604 GenBank gene 16851 17150 . + 1 ID=T7p32.gene;Alias=T7p32;Dbxref=GeneID:1261038;Name=T7p32;Note=gene 5.5; -NC_001604 GenBank CDS 17150 17359 . + 1 ID=T7p33;Dbxref=GOA:P03787,UniProtKB/Swiss-Prot:P03787,GeneID:1261040;Name=T7p33;Note=gene 5.7;codon_start=1;product=hypothetical protein;protein_id=NP_041986.1;transl_table=11;translation=length.69; -NC_001604 GenBank gene 17150 17359 . + 1 ID=T7p33.gene;Alias=T7p33;Dbxref=GeneID:1261040;Name=T7p33;Note=gene 5.7; -NC_001604 GenBank CDS 17359 17517 . + 1 ID=T7p34;Dbxref=UniProtKB/Swiss-Prot:P20406,GeneID:1261037;Name=T7p34;Note=not essential. Other names: gp5.9%3B exonuclease V inhibitor;codon_start=1;product=host recBCD nuclease inhibitor;protein_id=NP_041987.1;transl_table=11;translation=length.52; -NC_001604 GenBank gene 17359 17517 . + 1 ID=T7p34.gene;Alias=T7p34;Dbxref=GeneID:1261037;Name=T7p34;Note=gene 5.9; -NC_001604 GenBank CDS 17504 18406 . + 1 ID=T7p35;Dbxref=GOA:P00638,UniProtKB/Swiss-Prot:P00638,GeneID:1261052;Name=T7p35;Note=The T7 exonuclease encoded by gene 6 is required for (a) recombination and (b) for the degradation of host chromosomal DNA. The latter process provides nucleotides for phage DNA replication. Both processes are carried out together with the T7 gene 3-encoded endonuclease/Holliday junction resolvase. In addition%2C the exonuclease also functions as an RNase H that removes RNA primers during DNA replication and promotes concatemer formation.;codon_start=1;product=exonuclease;protein_id=NP_041988.1;transl_table=11;translation=length.300; -NC_001604 GenBank gene 17504 18406 . + 1 ID=T7p35.gene;Alias=T7p35;Dbxref=GeneID:1261052;Name=T7p35;Note=gene 6; -NC_001604 GenBank CDS 18394 18507 . + 1 ID=T7p36;Dbxref=UniProtKB/Swiss-Prot:P03799,GeneID:1261058;Name=T7p36;Note=gene 6.3;codon_start=1;product=hypothetical protein;protein_id=NP_041989.1;transl_table=11;translation=length.37; -NC_001604 GenBank regulatory 18545 18545 . + 1 ID=GenBank:regulatory:NC_001604:18545:18545;Note=T7 promoter phi6.5;regulatory_class=promoter; -NC_001604 GenBank sequence_secondary_structure 18563 18563 . + 1 ID=GenBank:sequence_secondary_structure:NC_001604:18563:18563;Note=RNase III site R6.5; -NC_001604 GenBank gene 18394 18507 . + 1 ID=T7p36.gene;Alias=T7p36;Dbxref=GeneID:1261058;Name=T7p36;Note=gene 6.3; -NC_001604 GenBank CDS 18605 18859 . + 1 ID=T7p37;Dbxref=UniProtKB/Swiss-Prot:P03800,GeneID:1261036;Name=T7p37;Note=gene 6.5;codon_start=1;product=hypothetical protein;protein_id=NP_041990.1;transl_table=11;translation=length.84; -NC_001604 GenBank gene 18605 18859 . + 1 ID=T7p37.gene;Alias=T7p37;Dbxref=GeneID:1261036;Name=T7p37;Note=gene 6.5; -NC_001604 GenBank CDS 18864 19130 . + 1 ID=T7p38;Dbxref=UniProtKB/Swiss-Prot:P03801,GeneID:1261039;Name=T7p38;Note=may be involved in virion morphogenesis and is injected from virion into host cell. Other names: gp6.7;codon_start=1;product=hypothetical protein;protein_id=NP_041991.1;transl_table=11;translation=length.88; -NC_001604 GenBank gene 18864 19130 . + 1 ID=T7p38.gene;Alias=T7p38;Dbxref=GeneID:1261039;Name=T7p38;Note=gene 6.7; -NC_001604 GenBank CDS 19130 19531 . + 1 ID=T7p39;Dbxref=UniProtKB/Swiss-Prot:P03750,GeneID:1261056;Name=T7p39;Note=gene 7%2C host range;codon_start=1;product=hypothetical protein;protein_id=NP_041992.1;transl_table=11;translation=length.133; -NC_001604 GenBank gene 19130 19531 . + 1 ID=T7p39.gene;Alias=T7p39;Dbxref=GeneID:1261056;Name=T7p39;Note=gene 7; -NC_001604 GenBank CDS 19535 19834 . + 1 ID=T7p40;Dbxref=UniProtKB/Swiss-Prot:P03751,GeneID:1261035;Name=T7p40;Note=required for virion infectivity but not morphogenesis. In T7%2C gp 7.3 appears to be required for the assembly of tail fibers on capsids. Other names: gp7.3;codon_start=1;product=tail assembly protein;protein_id=NP_041993.1;transl_table=11;translation=length.99; -NC_001604 GenBank gene 19535 19834 . + 1 ID=T7p40.gene;Alias=T7p40;Dbxref=GeneID:1261035;Name=T7p40;Note=gene 7.3; -NC_001604 GenBank CDS 19848 20240 . + 1 ID=T7p41;Dbxref=GOA:P03796,UniProtKB/Swiss-Prot:P03796,GeneID:1261028;Name=T7p41;Note=gene 7.7;codon_start=1;product=hypothetical protein;protein_id=NP_041994.1;transl_table=11;translation=length.130; -NC_001604 GenBank gene 19848 20240 . + 1 ID=T7p41.gene;Alias=T7p41;Dbxref=GeneID:1261028;Name=T7p41;Note=gene 7.7; -NC_001604 GenBank CDS 20240 21850 . + 1 ID=T7p42;Dbxref=GOA:P03728,UniProtKB/Swiss-Prot:P03728,GeneID:1261033;Name=T7p42;Note=gene 8;codon_start=1;product=head-tail connector protein;protein_id=NP_041995.1;transl_table=11;translation=length.536; -NC_001604 GenBank regulatory 21865 21865 . + 1 ID=GenBank:regulatory:NC_001604:21865:21865;Note=T7 promoter phi9;regulatory_class=promoter; -NC_001604 GenBank gene 20240 21850 . + 1 ID=T7p42.gene;Alias=T7p42;Dbxref=GeneID:1261033;Name=T7p42;Note=gene 8; -NC_001604 GenBank CDS 21950 22873 . + 1 ID=T7p43;Dbxref=GOA:P03716,UniProtKB/Swiss-Prot:P03716,GeneID:1261027;Name=T7p43;Note=Phage T7-like scaffolding protein. The protein is encoded by gene 9 in T7 (gp9) and is required for the formation of pro-capsids.;codon_start=1;product=capsid assembly protein;protein_id=NP_041996.1;transl_table=11;translation=length.307; -NC_001604 GenBank regulatory 22904 22904 . + 1 ID=GenBank:regulatory:NC_001604:22904:22904;Note=T7 promoter phi10;regulatory_class=promoter; -NC_001604 GenBank gene 21950 22873 . + 1 ID=T7p43.gene;Alias=T7p43;Dbxref=GeneID:1261027;Name=T7p43;Note=gene 9; -NC_001604 GenBank CDS 22967 23989 . + 1 ID=T7p44.cds1;Dbxref=GOA:P19727,UniProtKB/Swiss-Prot:P19727,GeneID:1261029;Name=T7p44;Note=major capsid protein. Involved in F-exclusion of wt T7 phage. A minor capsid protein (gp10B) is produced from gene 10 by a -1 frameshift towards the end of 10A%2C resulting in a slightly larger protein. Other names: gp10A.;codon_start=1;product=major capsid protein;protein_id=NP_041997.1;transl_table=11;translation=length.398; -NC_001604 GenBank CDS 23989 24162 . + 1 ID=T7p44.cds2;Dbxref=GOA:P19727,UniProtKB/Swiss-Prot:P19727,GeneID:1261029;Name=T7p44;Note=major capsid protein. Involved in F-exclusion of wt T7 phage. A minor capsid protein (gp10B) is produced from gene 10 by a -1 frameshift towards the end of 10A%2C resulting in a slightly larger protein. Other names: gp10A.;codon_start=1;product=major capsid protein;protein_id=NP_041997.1;transl_table=11;translation=length.398; -NC_001604 GenBank gene 22967 24162 . + 1 ID=T7p44.gene;Alias=T7p44;Dbxref=GeneID:1261029;Name=T7p44;Note=gene 10B; -NC_001604 GenBank CDS 22967 24004 . + 1 ID=T7p45;Dbxref=GOA:P19726,UniProtKB/Swiss-Prot:P19726,GeneID:1261026;Name=T7p45;Note=major capsid protein. Involved in F-exclusion of wt T7 phage. A minor capsid protein (gp10B) is produced from gene 10 by a -1 frameshift towards the end of 10A%2C resulting in a slightly larger protein. Other names: gp10A.;codon_start=1;product=major capsid protein;protein_id=NP_041998.1;transl_table=11;translation=length.345; -NC_001604 GenBank regulatory 24210 24210 . + 1 ID=GenBank:regulatory:NC_001604:24210:24210;Note=T7 transcription terminator Tphi;regulatory_class=terminator; -NC_001604 GenBank gene 22967 24004 . + 1 ID=T7p45.gene;Alias=T7p45;Dbxref=GeneID:1261026;Name=T7p45;Note=gene 10A; -NC_001604 GenBank CDS 24228 24818 . + 1 ID=T7p46;Dbxref=UniProtKB/Swiss-Prot:P03746,GeneID:1261030;Name=T7p46;Note=Tail tubular proteins A and B are required for assembly of tails of T7-like phages.;codon_start=1;product=tail tubular protein A;protein_id=NP_041999.1;transl_table=11;translation=length.196; -NC_001604 GenBank gene 24228 24818 . + 1 ID=T7p46.gene;Alias=T7p46;Dbxref=GeneID:1261030;Name=T7p46;Note=gene 11; -NC_001604 GenBank CDS 24842 27226 . + 1 ID=T7p47;Dbxref=UniProtKB/Swiss-Prot:P03747,GeneID:1261024;Name=T7p47;Note=gene 12;codon_start=1;product=tail tubular protein B;protein_id=NP_042000.1;transl_table=11;translation=length.794; -NC_001604 GenBank regulatory 27274 27274 . + 1 ID=GenBank:regulatory:NC_001604:27274:27274;Note=T7 promoter phi13;regulatory_class=promoter; -NC_001604 GenBank sequence_secondary_structure 27281 27281 . + 1 ID=GenBank:sequence_secondary_structure:NC_001604:27281:27281;Note=possible RNase III site R13; -NC_001604 GenBank gene 24842 27226 . + 1 ID=T7p47.gene;Alias=T7p47;Dbxref=GeneID:1261024;Name=T7p47;Note=gene 12; -NC_001604 GenBank CDS 27307 27723 . + 1 ID=T7p48;Dbxref=UniProtKB/Swiss-Prot:P03723,GeneID:1261025;Name=T7p48;Note=gene 13;codon_start=1;product=internal virion protein A;protein_id=NP_042001.1;transl_table=11;translation=length.138; -NC_001604 GenBank gene 27307 27723 . + 1 ID=T7p48.gene;Alias=T7p48;Dbxref=GeneID:1261025;Name=T7p48;Note=gene 13; -NC_001604 GenBank CDS 27728 28318 . + 1 ID=T7p49;Dbxref=UniProtKB/Swiss-Prot:P03724,GeneID:1261032;Name=T7p49;Note=Approximately 12 copies of the internal virion protein B encoded in phage T7 by gene 14 are part of the internal core of the T7 virion. Along with gp16 and gp15%2C the other internal core proteins%2C gp14 is ejected from the phage head and forms part of a putative channel that spans the entire host cell envelope and allows entry of DNA. gp14 appears to localize to the outer host membrane after ejection. Other names: gp14;codon_start=1;product=internal virion protein B;protein_id=NP_042002.1;transl_table=11;translation=length.196; -NC_001604 GenBank gene 27728 28318 . + 1 ID=T7p49.gene;Alias=T7p49;Dbxref=GeneID:1261032;Name=T7p49;Note=gene 14; -NC_001604 GenBank CDS 28325 30568 . + 1 ID=T7p50;Dbxref=UniProtKB/Swiss-Prot:P03725,GeneID:1261034;Name=T7p50;Note=Approximately 12 copies of the internal virion protein C encoded by phage T7 gene 15 (gp15) are part of the internal core of the T7 virion. Along with gp14 and gp16%2C the other internal core proteins%2C gp15 is ejected from the phage head and forms part of a putative channel that spans the entire host cell envelope and allows entry of DNA.;codon_start=1;product=internal virion protein C;protein_id=NP_042003.1;transl_table=11;translation=length.747; -NC_001604 GenBank gene 28325 30568 . + 1 ID=T7p50.gene;Alias=T7p50;Dbxref=GeneID:1261034;Name=T7p50;Note=gene 15; -NC_001604 GenBank CDS 30595 34551 . + 1 ID=T7p51;Dbxref=GOA:P03726,UniProtKB/Swiss-Prot:P03726,GeneID:1261031;Name=T7p51;Note=Approximately 3 copies of the internal virion protein D encoded by phage T7 gene 16 (gp16) are part of the internal core of the T7 virion. Along with gp14 and gp15%2C the other internal core proteins%2C gp16 is ejected from the phage head and forms part of a putative channel that spans the entire host cell envelope and allows entry of DNA. The N-terminus has similarity to a lytic transglycosylase and may help form a channel for phage DNA translocation through the crosslinked peptidoglycan layer of the host envelope.;codon_start=1;product=internal virion protein D;protein_id=NP_042004.1;transl_table=11;translation=length.1318; -NC_001604 GenBank regulatory 34566 34566 . + 1 ID=GenBank:regulatory:NC_001604:34566:34566;Note=T7 promoter phi17;regulatory_class=promoter; -NC_001604 GenBank gene 30595 34551 . + 1 ID=T7p51.gene;Alias=T7p51;Dbxref=GeneID:1261031;Name=T7p51;Note=gene 16; -NC_001604 GenBank CDS 34624 36285 . + 1 ID=T7p52;Dbxref=UniProtKB/Swiss-Prot:P03748,GeneID:1261023;Name=T7p52;Note=in phages T7 and T3 trimers of gp17 form each of the 6 kinked tail fibers. Other names: gp17.;codon_start=1;product=tail fiber protein;protein_id=NP_042005.1;transl_table=11;translation=length.553; -NC_001604 GenBank gene 34624 36285 . + 1 ID=T7p52.gene;Alias=T7p52;Dbxref=GeneID:1261023;Name=T7p52;Note=gene 17; -NC_001604 GenBank CDS 36344 36547 . + 1 ID=T7p53;Dbxref=GOA:P03802,UniProtKB/Swiss-Prot:P03802,GeneID:1261022;Name=T7p53;Note=Type II holins have two putative transmembrane domains and are thought to allow endolysins access to the cell wall at the optimal lysis time. However%2C in phage T7 the holin protein gp17.5 does not appear to be essential and gp17.5 mutants only show a minor delay in lysis. Other names: gp17.5%3B lysis protein;codon_start=1;product=type II holin;protein_id=NP_042006.1;transl_table=11;translation=length.67; -NC_001604 GenBank gene 36344 36547 . + 1 ID=T7p53.gene;Alias=T7p53;Dbxref=GeneID:1261022;Name=T7p53;Note=gene 17.5; -NC_001604 GenBank CDS 36553 36822 . + 1 ID=T7p54;Dbxref=GOA:P03693,UniProtKB/Swiss-Prot:P03693,GeneID:1261042;Name=T7p54;Note=involved in the packaging of genome monomers into a procapsid using head-to-tail concatemers of genomes. other names: DNA packaging protein A%3B DNA maturation protein A%3B terminase%2C small subunit;codon_start=1;product=DNA packaging protein%2C small subunit;protein_id=NP_042007.1;transl_table=11;translation=length.89; -NC_001604 GenBank regulatory 36836 36836 . + 1 ID=GenBank:regulatory:NC_001604:36836:36836;Note=E. coli promoter E[6];regulatory_class=promoter; -NC_001604 GenBank sequence_secondary_structure 36856 36856 . + 1 ID=GenBank:sequence_secondary_structure:NC_001604:36856:36856;Note=RNase III site R18.5; -NC_001604 GenBank gene 36553 36822 . + 1 ID=T7p54.gene;Alias=T7p54;Dbxref=GeneID:1261042;Name=T7p54;Note=gene 18; -NC_001604 GenBank CDS 36917 37348 . + 1 ID=T7p55;Dbxref=GOA:P03803,UniProtKB/Swiss-Prot:P03803,GeneID:1261067;Name=T7p55;Note=analog of phage lambda protein Rz%2C a cell lysis protein. Rz and gp18.5 share distant sequence similarity%2C similar function%2C and a similar genome neighborhood. In T7%2C gp18.5 interacts with gp18.7%2C a lambda RZ1-like lysis protein. Other names: gp18.5;codon_start=1;product=phage lambda Rz-like lysis protein;protein_id=NP_042008.1;transl_table=11;translation=length.143; -NC_001604 GenBank gene 36917 37348 . + 1 ID=T7p55.gene;Alias=T7p55;Dbxref=GeneID:1261067;Name=T7p55;Note=gene 18.5; -NC_001604 GenBank CDS 37032 37283 . + 1 ID=T7p56;Dbxref=UniProtKB/Swiss-Prot:P03788,GeneID:1261057;Name=T7p56;Note=in Enterobacteria phage T7%2C this protein interacts with gp18.5 and is expressed from the -1 frame of a gene completely overlapping gene 18.5. This suggests that it may be an analog of lambda lysis protein Rz1. Other names: gp18.7.;codon_start=1;product=phage lambda Rz1-like protein;protein_id=NP_042009.1;transl_table=11;translation=length.83; -NC_001604 GenBank gene 37032 37283 . + 1 ID=T7p56.gene;Alias=T7p56;Dbxref=GeneID:1261057;Name=T7p56;Note=gene 18.7; -NC_001604 GenBank CDS 37370 39130 . + 1 ID=T7p57;Dbxref=GOA:P03694,UniProtKB/Swiss-Prot:P03694,GeneID:1261062;Name=T7p57;Note=gene 19;codon_start=1;product=DNA maturation protein;protein_id=NP_042010.1;transl_table=11;translation=length.586; -NC_001604 GenBank gene 37370 39130 . + 1 ID=T7p57.gene;Alias=T7p57;Dbxref=GeneID:1261062;Name=T7p57;Note=gene 19; -NC_001604 GenBank CDS 38016 38273 . + 1 ID=T7p58;Dbxref=UniProtKB/Swiss-Prot:P03789,GeneID:1261064;Name=T7p58;Note=gene 19.2;codon_start=1;product=hypothetical protein;protein_id=NP_042011.1;transl_table=11;translation=length.85; -NC_001604 GenBank gene 38016 38273 . + 1 ID=T7p58.gene;Alias=T7p58;Dbxref=GeneID:1261064;Name=T7p58;Note=gene 19.2; -NC_001604 GenBank CDS 38553 38726 . + 1 ID=T7p59;Dbxref=UniProtKB/Swiss-Prot:P03790,GeneID:1261066;Name=T7p59;Note=gene 19.3;codon_start=1;product=hypothetical protein;protein_id=NP_042012.1;transl_table=11;translation=length.57; -NC_001604 GenBank regulatory 39229 39229 . + 1 ID=GenBank:regulatory:NC_001604:39229:39229;Note=T7 promoter phiOR;regulatory_class=promoter; -NC_001604 GenBank gene 38553 38726 . + 1 ID=T7p59.gene;Alias=T7p59;Dbxref=GeneID:1261066;Name=T7p59;Note=gene 19.3; -NC_001604 GenBank CDS 39389 39538 . + 1 ID=T7p60;Dbxref=UniProtKB/Swiss-Prot:P03804,GeneID:1261068;Name=T7p60;Note=gene 19.5;codon_start=1;product=hypothetical protein;protein_id=NP_042013.1;transl_table=11;translation=length.49; -NC_001604 GenBank gene 39389 39538 . + 1 ID=T7p60.gene;Alias=T7p60;Dbxref=GeneID:1261068;Name=T7p60;Note=gene 19.5; diff -r 6e7e20cb1fc7 -r 4f4b413056f6 cpt_gff_rebase/test-data/T7_TMHMM.gff3 --- a/cpt_gff_rebase/test-data/T7_TMHMM.gff3 Fri Jun 17 04:00:49 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,39 +0,0 @@ -##gff-version 3 -T7p04 feature Chain 2 47 . + . Description=Transmembrane protein;ID=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d;Note=Transmembrane protein - N out C in;Target=T7p04 -T7p04 TMHMM Topological domain 1 22 . + . Note=Extracellular;Parent=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d -T7p04 TMHMM Transmembrane 23 45 . + . Note=Helical;Parent=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d -T7p04 TMHMM Topological domain 46 47 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d -##gff-version 3 -T7p11 feature Chain 2 51 . + . Description=Transmembrane protein;ID=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc;Note=Transmembrane protein - N in C in;Target=T7p11 -T7p11 TMHMM Topological domain 1 4 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc -T7p11 TMHMM Transmembrane 5 24 . + . Note=Helical;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc -T7p11 TMHMM Topological domain 25 27 . + . Note=Extracellular;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc -T7p11 TMHMM Transmembrane 28 50 . + . Note=Helical;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc -T7p11 TMHMM Topological domain 51 51 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc -##gff-version 3 -T7p25 feature Chain 2 112 . + . Description=Transmembrane protein;ID=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9;Note=Transmembrane protein - N in C in;Target=T7p25 -T7p25 TMHMM Topological domain 1 6 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9 -T7p25 TMHMM Transmembrane 7 29 . + . Note=Helical;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9 -T7p25 TMHMM Topological domain 30 33 . + . Note=Extracellular;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9 -T7p25 TMHMM Transmembrane 34 56 . + . Note=Helical;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9 -T7p25 TMHMM Topological domain 57 112 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9 -##gff-version 3 -T7p36 feature Chain 2 37 . + . Description=Transmembrane protein;ID=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8;Note=Transmembrane protein - N out C in;Target=T7p36 -T7p36 TMHMM Topological domain 1 4 . + . Note=Extracellular;Parent=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8 -T7p36 TMHMM Transmembrane 5 24 . + . Note=Helical;Parent=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8 -T7p36 TMHMM Topological domain 25 37 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8 -##gff-version 3 -T7p53 feature Chain 2 67 . + . Description=Transmembrane protein;ID=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb;Note=Transmembrane protein - N out C in;Target=T7p53 -T7p53 TMHMM Topological domain 1 36 . + . Note=Extracellular;Parent=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb -T7p53 TMHMM Transmembrane 37 55 . + . Note=Helical;Parent=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb -T7p53 TMHMM Topological domain 56 67 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb -##gff-version 3 -T7p56 feature Chain 2 83 . + . Description=Transmembrane protein;ID=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e;Note=Transmembrane protein - N in C out;Target=T7p56 -T7p56 TMHMM Topological domain 1 27 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e -T7p56 TMHMM Transmembrane 28 50 . + . Note=Helical;Parent=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e -T7p56 TMHMM Topological domain 51 83 . + . Note=Extracellular;Parent=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e -##gff-version 3 -T7p60 feature Chain 2 49 . + . Description=Transmembrane protein;ID=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599;Note=Transmembrane protein - N in C out;Target=T7p60 -T7p60 TMHMM Topological domain 1 12 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599 -T7p60 TMHMM Transmembrane 13 30 . + . Note=Helical;Parent=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599 -T7p60 TMHMM Topological domain 31 49 . + . Note=Extracellular;Parent=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599 diff -r 6e7e20cb1fc7 -r 4f4b413056f6 cpt_gff_rebase/test-data/T7_TMHMM_REBASE.gff3 --- a/cpt_gff_rebase/test-data/T7_TMHMM_REBASE.gff3 Fri Jun 17 04:00:49 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -##gff-version 3 -NC_001604 feature Chain 1499 1636 . + . Description=Transmembrane protein;ID=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d;Note=Transmembrane protein - N out C in;Target=T7p04; -NC_001604 TMHMM Topological domain 1496 1561 . + . Note=Extracellular;Parent=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d; -NC_001604 TMHMM Transmembrane 1562 1630 . + . Note=Helical;Parent=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d; -NC_001604 TMHMM Topological domain 1631 1636 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d; -NC_001604 feature Chain 7611 7760 . + . Description=Transmembrane protein;ID=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc;Note=Transmembrane protein - N in C in;Target=T7p11; -NC_001604 TMHMM Topological domain 7608 7619 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc; -NC_001604 TMHMM Transmembrane 7620 7679 . + . Note=Helical;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc; -NC_001604 TMHMM Topological domain 7680 7688 . + . Note=Extracellular;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc; -NC_001604 TMHMM Transmembrane 7689 7757 . + . Note=Helical;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc; -NC_001604 TMHMM Topological domain 7758 7760 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc; -NC_001604 feature Chain 12991 13323 . + . Description=Transmembrane protein;ID=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9;Note=Transmembrane protein - N in C in;Target=T7p25; -NC_001604 TMHMM Topological domain 12988 13005 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9; -NC_001604 TMHMM Transmembrane 13006 13074 . + . Note=Helical;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9; -NC_001604 TMHMM Topological domain 13075 13086 . + . Note=Extracellular;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9; -NC_001604 TMHMM Transmembrane 13087 13155 . + . Note=Helical;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9; -NC_001604 TMHMM Topological domain 13156 13323 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9; -NC_001604 feature Chain 18397 18504 . + . Description=Transmembrane protein;ID=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8;Note=Transmembrane protein - N out C in;Target=T7p36; -NC_001604 TMHMM Topological domain 18394 18405 . + . Note=Extracellular;Parent=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8; -NC_001604 TMHMM Transmembrane 18406 18465 . + . Note=Helical;Parent=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8; -NC_001604 TMHMM Topological domain 18466 18504 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8; -NC_001604 feature Chain 36347 36544 . + . Description=Transmembrane protein;ID=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb;Note=Transmembrane protein - N out C in;Target=T7p53; -NC_001604 TMHMM Topological domain 36344 36451 . + . Note=Extracellular;Parent=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb; -NC_001604 TMHMM Transmembrane 36452 36508 . + . Note=Helical;Parent=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb; -NC_001604 TMHMM Topological domain 36509 36544 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb; -NC_001604 feature Chain 37035 37280 . + . Description=Transmembrane protein;ID=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e;Note=Transmembrane protein - N in C out;Target=T7p56; -NC_001604 TMHMM Topological domain 37032 37112 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e; -NC_001604 TMHMM Transmembrane 37113 37181 . + . Note=Helical;Parent=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e; -NC_001604 TMHMM Topological domain 37182 37280 . + . Note=Extracellular;Parent=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e; -NC_001604 feature Chain 39392 39535 . + . Description=Transmembrane protein;ID=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599;Note=Transmembrane protein - N in C out;Target=T7p60; -NC_001604 TMHMM Topological domain 39389 39424 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599; -NC_001604 TMHMM Transmembrane 39425 39478 . + . Note=Helical;Parent=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599; -NC_001604 TMHMM Topological domain 39479 39535 . + . Note=Extracellular;Parent=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599; diff -r 6e7e20cb1fc7 -r 4f4b413056f6 cpt_gff_rebase/test-data/child.gff --- a/cpt_gff_rebase/test-data/child.gff Fri Jun 17 04:00:49 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -#gff-version 3 -cds42 blastp match_part 1 50 1e-40 . . ID=m00001;Notes=RNAse A Protein diff -r 6e7e20cb1fc7 -r 4f4b413056f6 cpt_gff_rebase/test-data/nonprotein.gff --- a/cpt_gff_rebase/test-data/nonprotein.gff Fri Jun 17 04:00:49 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -##gff-version 3 -PhageBob blastp match_part 300 349 1e-40 + . ID=m00001;Notes=RNAse A Protein; diff -r 6e7e20cb1fc7 -r 4f4b413056f6 cpt_gff_rebase/test-data/parent.gff --- a/cpt_gff_rebase/test-data/parent.gff Fri Jun 17 04:00:49 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3 +0,0 @@ -#gff-version 3 -PhageBob maker cds 300 500 . + . ID=gene42 -PhageBob maker cds 300 500 . + . Parent=gene42;ID=cds42 diff -r 6e7e20cb1fc7 -r 4f4b413056f6 cpt_gff_rebase/test-data/proteins.gff --- a/cpt_gff_rebase/test-data/proteins.gff Fri Jun 17 04:00:49 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -##gff-version 3 -PhageBob blastp match_part 300 449 1e-40 + . ID=m00001;Notes=RNAse A Protein; diff -r 6e7e20cb1fc7 -r 4f4b413056f6 gff3.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff3.py Mon Jun 05 02:44:12 2023 +0000 @@ -0,0 +1,346 @@ +import copy +import logging + +log = logging.getLogger() +log.setLevel(logging.WARN) + + +def feature_lambda( + feature_list, + test, + test_kwargs, + subfeatures=True, + parent=None, + invert=False, + recurse=True, +): + """Recursively search through features, testing each with a test function, yielding matches. + + GFF3 is a hierachical data structure, so we need to be able to recursively + search through features. E.g. if you're looking for a feature with + ID='bob.42', you can't just do a simple list comprehension with a test + case. You don't know how deeply burried bob.42 will be in the feature tree. This is where feature_lambda steps in. + + :type feature_list: list + :param feature_list: an iterable of features + + :type test: function reference + :param test: a closure with the method signature (feature, **kwargs) where + the kwargs are those passed in the next argument. This + function should return True or False, True if the feature is + to be yielded as part of the main feature_lambda function, or + False if it is to be ignored. This function CAN mutate the + features passed to it (think "apply"). + + :type test_kwargs: dictionary + :param test_kwargs: kwargs to pass to your closure when it is called. + + :type subfeatures: boolean + :param subfeatures: when a feature is matched, should just that feature be + yielded to the caller, or should the entire sub_feature + tree for that feature be included? subfeatures=True is + useful in cases such as searching for a gene feature, + and wanting to know what RBS/Shine_Dalgarno_sequences + are in the sub_feature tree (which can be accomplished + with two feature_lambda calls). subfeatures=False is + useful in cases when you want to process (and possibly + return) the entire feature tree, such as applying a + qualifier to every single feature. + + :type invert: boolean + :param invert: Negate/invert the result of the filter. + + :rtype: yielded list + :return: Yields a list of matching features. + """ + # Either the top level set of [features] or the subfeature attribute + for feature in feature_list: + feature._parent = parent + if not parent: + # Set to self so we cannot go above root. + feature._parent = feature + test_result = test(feature, **test_kwargs) + # if (not invert and test_result) or (invert and not test_result): + if invert ^ test_result: + if not subfeatures: + feature_copy = copy.deepcopy(feature) + feature_copy.sub_features = list() + yield feature_copy + else: + yield feature + + if recurse and hasattr(feature, "sub_features"): + for x in feature_lambda( + feature.sub_features, + test, + test_kwargs, + subfeatures=subfeatures, + parent=feature, + invert=invert, + recurse=recurse, + ): + yield x + + +def fetchParent(feature): + if not hasattr(feature, "_parent") or feature._parent is None: + return feature + else: + return fetchParent(feature._parent) + + +def feature_test_true(feature, **kwargs): + return True + + +def feature_test_type(feature, **kwargs): + if "type" in kwargs: + return str(feature.type).upper() == str(kwargs["type"]).upper() + elif "types" in kwargs: + for x in kwargs["types"]: + if str(feature.type).upper() == str(x).upper(): + return True + return False + raise Exception("Incorrect feature_test_type call, need type or types") + + +def feature_test_qual_value(feature, **kwargs): + """Test qualifier values. + + For every feature, check that at least one value in + feature.quailfiers(kwargs['qualifier']) is in kwargs['attribute_list'] + """ + if isinstance(kwargs["qualifier"], list): + for qualifier in kwargs["qualifier"]: + for attribute_value in feature.qualifiers.get(qualifier, []): + if attribute_value in kwargs["attribute_list"]: + return True + else: + for attribute_value in feature.qualifiers.get(kwargs["qualifier"], []): + if attribute_value in kwargs["attribute_list"]: + return True + return False + + +def feature_test_location(feature, **kwargs): + if "strand" in kwargs: + if feature.location.strand != kwargs["strand"]: + return False + + return feature.location.start <= kwargs["loc"] <= feature.location.end + + +def feature_test_quals(feature, **kwargs): + """ + Example:: + + a = Feature(qualifiers={'Note': ['Some notes', 'Aasdf']}) + + # Check if a contains a Note + feature_test_quals(a, {'Note': None}) # Returns True + feature_test_quals(a, {'Product': None}) # Returns False + + # Check if a contains a note with specific value + feature_test_quals(a, {'Note': ['ome']}) # Returns True + + # Check if a contains a note with specific value + feature_test_quals(a, {'Note': ['other']}) # Returns False + """ + for key in kwargs: + if key not in feature.qualifiers: + return False + + # Key is present, no value specified + if kwargs[key] is None: + return True + + # Otherwise there is a key value we're looking for. + # so we make a list of matches + matches = [] + # And check all of the feature qualifier valuse + for value in feature.qualifiers[key]: + # For that kwargs[key] value + for x in kwargs[key]: + matches.append(x in value) + + # If none matched, then we return false. + if not any(matches): + return False + + return True + + +def feature_test_contains(feature, **kwargs): + if "index" in kwargs: + return feature.location.start < kwargs["index"] < feature.location.end + elif "range" in kwargs: + return ( + feature.location.start < kwargs["range"]["start"] < feature.location.end + and feature.location.start < kwargs["range"]["end"] < feature.location.end + ) + else: + raise RuntimeError("Must use index or range keyword") + + +def get_id(feature=None, parent_prefix=None): + result = "" + if parent_prefix is not None: + result += parent_prefix + "|" + if "locus_tag" in feature.qualifiers: + result += feature.qualifiers["locus_tag"][0] + elif "gene" in feature.qualifiers: + result += feature.qualifiers["gene"][0] + elif "Gene" in feature.qualifiers: + result += feature.qualifiers["Gene"][0] + elif "product" in feature.qualifiers: + result += feature.qualifiers["product"][0] + elif "Product" in feature.qualifiers: + result += feature.qualifiers["Product"][0] + elif "Name" in feature.qualifiers: + result += feature.qualifiers["Name"][0] + else: + return feature.id + # Leaving in case bad things happen. + # result += '%s_%s_%s_%s' % ( + # feature.id, + # feature.location.start, + # feature.location.end, + # feature.location.strand + # ) + return result + + +def get_gff3_id(gene): + return gene.qualifiers.get("Name", [gene.id])[0] + + +def ensure_location_in_bounds(start=0, end=0, parent_length=0): + # This prevents frameshift errors + while start < 0: + start += 3 + while end < 0: + end += 3 + while start > parent_length: + start -= 3 + while end > parent_length: + end -= 3 + return (start, end) + + +def coding_genes(feature_list): + for x in genes(feature_list): + if ( + len( + list( + feature_lambda( + x.sub_features, + feature_test_type, + {"type": "CDS"}, + subfeatures=False, + ) + ) + ) + > 0 + ): + yield x + + +def genes(feature_list, feature_type="gene", sort=False): + """ + Simple filter to extract gene features from the feature set. + """ + + if not sort: + for x in feature_lambda( + feature_list, feature_test_type, {"type": feature_type}, subfeatures=True + ): + yield x + else: + data = list(genes(feature_list, feature_type=feature_type, sort=False)) + data = sorted(data, key=lambda feature: feature.location.start) + for x in data: + yield x + + +def wa_unified_product_name(feature): + """ + Try and figure out a name. We gave conflicting instructions, so + this isn't as trivial as it should be. Sometimes it will be in + 'product' or 'Product', othertimes in 'Name' + """ + # Manually applied tags. + protein_product = feature.qualifiers.get( + "product", feature.qualifiers.get("Product", [None]) + )[0] + + # If neither of those are available ... + if protein_product is None: + # And there's a name... + if "Name" in feature.qualifiers: + if not is_uuid(feature.qualifiers["Name"][0]): + protein_product = feature.qualifiers["Name"][0] + + return protein_product + + +def is_uuid(name): + return name.count("-") == 4 and len(name) == 36 + + +def get_rbs_from(gene): + # Normal RBS annotation types + rbs_rbs = list( + feature_lambda( + gene.sub_features, feature_test_type, {"type": "RBS"}, subfeatures=False + ) + ) + rbs_sds = list( + feature_lambda( + gene.sub_features, + feature_test_type, + {"type": "Shine_Dalgarno_sequence"}, + subfeatures=False, + ) + ) + # Fraking apollo + apollo_exons = list( + feature_lambda( + gene.sub_features, feature_test_type, {"type": "exon"}, subfeatures=False + ) + ) + apollo_exons = [x for x in apollo_exons if len(x) < 10] + # These are more NCBI's style + regulatory_elements = list( + feature_lambda( + gene.sub_features, + feature_test_type, + {"type": "regulatory"}, + subfeatures=False, + ) + ) + rbs_regulatory = list( + feature_lambda( + regulatory_elements, + feature_test_quals, + {"regulatory_class": ["ribosome_binding_site"]}, + subfeatures=False, + ) + ) + # Here's hoping you find just one ;) + return rbs_rbs + rbs_sds + rbs_regulatory + apollo_exons + + +def nice_name(record): + """ + get the real name rather than NCBI IDs and so on. If fails, will return record.id + """ + name = record.id + likely_parental_contig = list(genes(record.features, feature_type="contig")) + if len(likely_parental_contig) == 1: + name = likely_parental_contig[0].qualifiers.get("organism", [name])[0] + return name + + +def fsort(it): + for i in sorted(it, key=lambda x: int(x.location.start)): + yield i diff -r 6e7e20cb1fc7 -r 4f4b413056f6 gff3_rebase.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff3_rebase.py Mon Jun 05 02:44:12 2023 +0000 @@ -0,0 +1,131 @@ +#!/usr/bin/env python +import sys +import logging +import argparse +from gff3 import feature_lambda, feature_test_qual_value +from CPT_GFFParser import gffParse, gffWrite +from Bio.SeqFeature import FeatureLocation + +log = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +def __get_features(child, interpro=False): + child_features = {} + for rec in gffParse(child): + log.info("Parsing %s", rec.id) + # Only top level + for feature in rec.features: + # Get the record id as parent_feature_id (since this is how it will be during remapping) + parent_feature_id = rec.id + # If it's an interpro specific gff3 file + if interpro: + # Then we ignore polypeptide features as they're useless + if feature.type == "polypeptide": + continue + + try: + child_features[parent_feature_id].append(feature) + except KeyError: + child_features[parent_feature_id] = [feature] + # Keep a list of feature objects keyed by parent record id + return child_features + + +def __update_feature_location(feature, parent, protein2dna): + start = feature.location.start + end = feature.location.end + if protein2dna: + start *= 3 + end *= 3 + + if parent.location.strand >= 0: + ns = parent.location.start + start + ne = parent.location.start + end + st = +1 + else: + ns = parent.location.end - end + ne = parent.location.end - start + st = -1 + + # Don't let start/stops be less than zero. + # + # Instead, we'll replace with %3 to try and keep it in the same reading + # frame that it should be in. + + if ns < 0: + ns %= 3 + if ne < 0: + ne %= 3 + + feature.location = FeatureLocation(ns, ne, strand=st) + + if hasattr(feature, "sub_features"): + for subfeature in feature.sub_features: + __update_feature_location(subfeature, parent, protein2dna) + + +def rebase(parent, child, interpro=False, protein2dna=False, map_by="ID"): + # get all of the features we will be re-mapping in a dictionary, keyed by parent feature ID + child_features = __get_features(child, interpro=interpro) + + for rec in gffParse(parent): + replacement_features = [] + # Horrifically slow I believe + for feature in feature_lambda( + rec.features, + # Filter features in the parent genome by those that are + # "interesting", i.e. have results in child_features array. + # Probably an unnecessary optimisation. + feature_test_qual_value, + {"qualifier": map_by, "attribute_list": child_features.keys()}, + subfeatures=False, + ): + + # Features which will be re-mapped + to_remap = child_features[feature.id] + + fixed_features = [] + for x in to_remap: + # Then update the location of the actual feature + __update_feature_location(x, feature, protein2dna) + + if interpro: + for y in ("status", "Target"): + try: + del x.qualifiers[y] + except: + pass + + fixed_features.append(x) + replacement_features.extend(fixed_features) + # We do this so we don't include the original set of features that we + # were rebasing against in our result. + rec.features = replacement_features + rec.annotations = {} + gffWrite([rec], sys.stdout) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="rebase gff3 features against parent locations", epilog="" + ) + parser.add_argument( + "parent", type=argparse.FileType("r"), help="Parent GFF3 annotations" + ) + parser.add_argument( + "child", + type=argparse.FileType("r"), + help="Child GFF3 annotations to rebase against parent", + ) + parser.add_argument( + "--interpro", action="store_true", help="Interpro specific modifications" + ) + parser.add_argument( + "--protein2dna", + action="store_true", + help="Map protein translated results to original DNA data", + ) + parser.add_argument("--map_by", help="Map by key", default="ID") + args = parser.parse_args() + rebase(**vars(args)) diff -r 6e7e20cb1fc7 -r 4f4b413056f6 gff3_rebase.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff3_rebase.xml Mon Jun 05 02:44:12 2023 +0000 @@ -0,0 +1,100 @@ + + against parent features + + macros.xml + cpt-macros.xml + + + '$default']]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + cds42 + MRTNASC + +Then analyzed that feature, producing the *child* annotation file:: + + #gff-version 3 + cds42 blastp match_part 1 50 1e-40 . . ID=m00001;Notes=RNAse A Protein + +This tool will then localize the results properly against the parent and permit +proper visualization of the results in the correct location:: + + #gff-version 3 + PhageBob blastp match_part 300 449 1e-40 + . ID=m00001;Notes=RNAse A Protein + +**Options** + +The **Interpro specific modifications** option selectively ignores *features* (*i.e.* polypeptide) and +qualifiers (status, Target) not needed in the output. + +The **Map protein translated results to original DNA data** option indicates that the DNA sequences were translated into +protein sequence during the genomic export process. When this option is selected, +the tool will multiply the bases by three to obtain the correct DNA locations. + +]]> + + diff -r 6e7e20cb1fc7 -r 4f4b413056f6 macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Mon Jun 05 02:44:12 2023 +0000 @@ -0,0 +1,74 @@ + + + + progressivemauve + + bcbiogff + + + + 2.4.0 + + 10.1371/journal.pone.0011147 + + + 10.1093/bioinformatics/btm039 + + + '$xmfa' + + + + + + '$sequences' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + '$gff3_data' + + + #if str($reference_genome.reference_genome_source) == 'cached': + '${reference_genome.fasta_indexes.fields.path}' + #else if str($reference_genome.reference_genome_source) == 'history': + genomeref.fa + #end if + + + #if $reference_genome.reference_genome_source == 'history': + ln -s '$reference_genome.genome_fasta' genomeref.fa; + #end if + + + #if str($reference_genome.reference_genome_source) == 'cached': + '${reference_genome.fasta_indexes.fields.path}' + #else if str($reference_genome.reference_genome_source) == 'history': + genomeref.fa + #end if + + diff -r 6e7e20cb1fc7 -r 4f4b413056f6 test-data/T7_CLEAN.gff3 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/T7_CLEAN.gff3 Mon Jun 05 02:44:12 2023 +0000 @@ -0,0 +1,171 @@ +##gff-version 3 +NC_001604 GenBank contig 1 39937 . + 1 ID=NC_001604;Dbxref=BioProject:PRJNA485481,taxon:10760;Name=NC_001604;Note=Enterobacteria phage T7%2C complete genome.,VALIDATED REFSEQ: This record has undergone validation or preliminary review. The reference sequence was derived from V01146. The sequence was submitted by the authors [1] on magnetic tape and revised according to [3],[4],and [5]. [3] made changes at 8 positions in gene 1 without affecting the size of the total sequence but changing gene 1 amino acids 443,474,and 388 to 424. [4] inserted a T at nucleotide 17511,increasing the total sequence to 39937 bp. This change,originally found in T3 DNA [8],revealed gene 5.9 and shortened gene 6. [5] changed the nucleotides at 11061 and 11062 from GT to TG,changing amino acid 119 of T7 lysozyme (gene 3.5) from glycine to valine. Features have been extracted from [1] unless otherwise noted. The sequence shown is that of the l strand,which corresponds to the sequence of all mRNAs of known functional significance. Early mRNAs are produced by three major promoters for E. coli RNA polymerase A1,A2,and A3,located near the left end of the DNA. A fourth major E. coli promoter,A0 (also called D),that would direct transcription leftward,and several minor E. coli promoters (see Table 6 in [1]) function in vitro but have no known in vivo function. Late mRNAs are produced by 15 promoters for T7 RNA polymerase distributed across the right-most 85%25 of the DNA,and named e.g. phi10,for the first gene downstream of the promoter. There are also two T7 promoters,phiOL and phiOR,associated with possible origins of replication at the left and right ends of T7 DNA. The 23 base-pair consensus sequence for T7 promoters stretches from -17 to +6,where the initiating nucleotide is at +1. T7 DNA also contains a 160 base-pair terminal repetition. The beginning and end of RNAs are determined by the promoters,by a terminator for E. coli RNA polymerase,TE,located at the end of the early region,a terminator for T7 RNA polymerase,Tphi,located just downstream of gene 10,and a series of RNase III cleavage sites. Early mRNAs made by E. coli RNA polymerase are listed in Features. The many RNAs predicted to be made by T7 RNA polymerase are not listed but can be deduced from the position of the transcription signals (see Tables 8 and 9 in [1]). Promoters are listed in Features by the known or predicted first nucleotide of the RNA,terminators by the last nucleotide of the RNA,and RNase III sites by the nucleotide 5' of the position of cleavage. Genes are numbered 0.3 to 19.5 in order of their left-to-right position on the genome. Proteins are named by the gene number,e.g.,the gene 1 protein,or by a functional name,e.g.,T7 RNA polymerase. There is now genetic or biochemical evidence that proteins are produced from at least 52 of the 56 T7 genes. Gene 4 produces two proteins,4A and 4B,by initiating translation at two different sites in the same reading frame. Gene 10 produces two proteins,10A and 10B,by frameshifting during translation. Genes 0.6 and 5.5 probably also make two proteins by translational frameshifting,the gene 5.5 frameshift producing a gene 5.5-5.7 fusion protein. COMPLETENESS: full length. ;comment1=VALIDATED REFSEQ: This record has undergone validation or preliminary review. The reference sequence was derived from V01146. The sequence was submitted by the authors [1] on magnetic tape and revised according to [3]%2C [4]%2C and [5]. [3] made changes at 8 positions in gene 1 without affecting the size of the total sequence but changing gene 1 amino acids 443%2C 474%2C and 388 to 424. [4] inserted a T at nucleotide 17511%2C increasing the total sequence to 39937 bp. This change%2C originally found in T3 DNA [8]%2C revealed gene 5.9 and shortened gene 6. [5] changed the nucleotides at 11061 and 11062 from GT to TG%2C changing amino acid 119 of T7 lysozyme (gene 3.5) from glycine to valine. Features have been extracted from [1] unless otherwise noted. The sequence shown is that of the l strand%2C which corresponds to the sequence of all mRNAs of known functional significance. Early mRNAs are produced by three major promoters for E. coli RNA polymerase A1%2C A2%2C and A3%2C located near the left end of the DNA. A fourth major E. coli promoter%2C A0 (also called D)%2C that would direct transcription leftward%2C and several minor E. coli promoters (see Table 6 in [1]) function in vitro but have no known in vivo function. Late mRNAs are produced by 15 promoters for T7 RNA polymerase distributed across the right-most 85%25 of the DNA%2C and named e.g. phi10%2C for the first gene downstream of the promoter. There are also two T7 promoters%2C phiOL and phiOR%2C associated with possible origins of replication at the left and right ends of T7 DNA. The 23 base-pair consensus sequence for T7 promoters stretches from -17 to +6%2C where the initiating nucleotide is at +1. T7 DNA also contains a 160 base-pair terminal repetition. The beginning and end of RNAs are determined by the promoters%3B by a terminator for E. coli RNA polymerase%2C TE%2C located at the end of the early region%3B a terminator for T7 RNA polymerase%2C Tphi%2C located just downstream of gene 10%3B and a series of RNase III cleavage sites. Early mRNAs made by E. coli RNA polymerase are listed in Features. The many RNAs predicted to be made by T7 RNA polymerase are not listed but can be deduced from the position of the transcription signals (see Tables 8 and 9 in [1]). Promoters are listed in Features by the known or predicted first nucleotide of the RNA%2C terminators by the last nucleotide of the RNA%2C and RNase III sites by the nucleotide 5' of the position of cleavage. Genes are numbered 0.3 to 19.5 in order of their left-to-right position on the genome. Proteins are named by the gene number%2C e.g.%2C the gene 1 protein%2C or by a functional name%2C e.g.%2C T7 RNA polymerase. There is now genetic or biochemical evidence that proteins are produced from at least 52 of the 56 T7 genes. Gene 4 produces two proteins%2C 4A and 4B%2C by initiating translation at two different sites in the same reading frame. Gene 10 produces two proteins%2C 10A and 10B%2C by frameshifting during translation. Genes 0.6 and 5.5 probably also make two proteins by translational frameshifting%2C the gene 5.5 frameshift producing a gene 5.5-5.7 fusion protein. COMPLETENESS: full length. ;date=13-AUG-2018;host=Escherichia coli;mol_type=genomic DNA;organism=Escherichia phage T7; +NC_001604 GenBank regulatory 224 224 . + 1 ID=GenBank:regulatory:NC_001604:224:224;Note=E. coli promoter A0 (leftward);regulatory_class=promoter; +NC_001604 GenBank regulatory 405 405 . + 1 ID=GenBank:regulatory:NC_001604:405:405;Note=T7 promoter phiOL;regulatory_class=promoter; +NC_001604 GenBank regulatory 498 498 . + 1 ID=GenBank:regulatory:NC_001604:498:498;Note=E. coli promoter A1;regulatory_class=promoter; +NC_001604 GenBank regulatory 626 626 . + 1 ID=GenBank:regulatory:NC_001604:626:626;Note=E. coli promoter A2;regulatory_class=promoter; +NC_001604 GenBank regulatory 750 750 . + 1 ID=GenBank:regulatory:NC_001604:750:750;Note=E. coli promoter A3;regulatory_class=promoter; +NC_001604 GenBank sequence_secondary_structure 890 890 . + 1 ID=GenBank:sequence_secondary_structure:NC_001604:890:890;Note=RNase III site R0.3; +NC_001604 GenBank gene 891 1468 . + 1 ID=T7p01;Dbxref=GeneID:1261063;Name=T7p01;Note=gene 0.3; +NC_001604 GenBank mRNA 891 1468 . + 1 ID=T7p01.t01;Parent=T7p01;Dbxref=GeneID:1261063;Name=T7p01;product=0.3 mRNA; +NC_001604 GenBank CDS 925 1278 . + 1 ID=T7p01.p01;Parent=T7p01.t01;Dbxref=UniProtKB/Swiss-Prot:P03775,GeneID:1261063;Name=T7p01;Note=gene 0.3%2C inhibits EcoB and EcoK host restriction;codon_start=1;product=hypothetical protein;protein_id=NP_041954.1;transl_table=11;translation=length.117; +NC_001604 GenBank exon 891 1468 . + 1 Parent=T7p01.t01;Name=T7p01; +NC_001604 GenBank sequence_secondary_structure 1468 1468 . + 1 Parent=T7p01;Name=T7p01;Note=RNase III site R0.5; +NC_001604 GenBank CDS 1278 1433 . + 1 ID=T7p02;Dbxref=UniProtKB/Swiss-Prot:P03776,GeneID:1261053;Name=T7p02;Note=gene 0.4;codon_start=1;product=hypothetical protein;protein_id=NP_041955.1;transl_table=11;translation=length.51; +NC_001604 GenBank gene 1278 1433 . + 1 ID=T7p02.gene;Alias=T7p02;Dbxref=GeneID:1261053;Name=T7p02;Note=gene 0.4; +NC_001604 GenBank gene 1469 3138 . + 1 ID=T7p03;Dbxref=GeneID:1261070;Name=T7p03;Note=gene 0.7; +NC_001604 GenBank mRNA 1469 3138 . + 1 ID=T7p03.t01;Parent=T7p03;Dbxref=GeneID:1261070;Name=T7p03;product=0.7 mRNA; +NC_001604 GenBank CDS 2021 3100 . + 1 ID=T7p03.p01;Parent=T7p03.t01;Dbxref=GOA:P00513,UniProtKB/Swiss-Prot:P00513,GeneID:1261070;Name=T7p03;Note=The T7 seryl-threonyl protein kinase gp0.7 is involved in host transcription shutoff and Col Ib exclusion. Phosphorylates E. coli RNA polymerase. Other names: gp0.7.;codon_start=1;product=protein kinase;protein_id=NP_041959.1;transl_table=11;translation=length.359; +NC_001604 GenBank exon 1469 3138 . + 1 Parent=T7p03.t01;Name=T7p03; +NC_001604 GenBank regulatory 3113 3113 . + 1 Parent=T7p03;Name=T7p03;Note=E. coli C promoter;regulatory_class=promoter; +NC_001604 GenBank sequence_secondary_structure 3138 3138 . + 1 Parent=T7p03;Name=T7p03;Note=RNase III site R1; +NC_001604 GenBank CDS 1496 1639 . + 1 ID=T7p04;Dbxref=UniProtKB/Swiss-Prot:P03777,GeneID:1261051;Name=T7p04;Note=gene 0.5;codon_start=1;product=hypothetical protein;protein_id=NP_041956.1;transl_table=11;translation=length.47; +NC_001604 GenBank regulatory 1514 1514 . + 1 Parent=T7p04;Name=T7p04;Note=E. coli B promoter;regulatory_class=promoter; +NC_001604 GenBank gene 1496 1639 . + 1 ID=T7p04.gene;Alias=T7p04;Dbxref=GeneID:1261051;Name=T7p04;Note=gene 0.5; +NC_001604 GenBank CDS 1636 1794 . + 1 ID=T7p05.cds1;Dbxref=UniProtKB/Swiss-Prot:P03778,GeneID:1261061;Name=T7p05;Note=possible gene 0.6B;codon_start=1;product=hypothetical protein;protein_id=NP_041957.1;transl_table=11;translation=length.111; +NC_001604 GenBank CDS 1796 1972 . + 1 ID=T7p05.cds2;Dbxref=UniProtKB/Swiss-Prot:P03778,GeneID:1261061;Name=T7p05;Note=possible gene 0.6B;codon_start=1;product=hypothetical protein;protein_id=NP_041957.1;transl_table=11;translation=length.111; +NC_001604 GenBank gene 1636 1972 . + 1 ID=T7p05.gene;Alias=T7p05;Dbxref=GeneID:1261061;Name=T7p05;Note=possible gene 0.6B; +NC_001604 GenBank CDS 1636 1797 . + 1 ID=T7p06;Dbxref=UniProtKB/Swiss-Prot:P03778,GeneID:1261071;Name=T7p06;Note=gene 0.6A;codon_start=1;product=hypothetical protein;protein_id=NP_041958.1;transl_table=11;translation=length.53; +NC_001604 GenBank gene 1636 1797 . + 1 ID=T7p06.gene;Alias=T7p06;Dbxref=GeneID:1261071;Name=T7p06;Note=gene 0.6A; +NC_001604 GenBank gene 3139 5887 . + 1 ID=T7p07;Dbxref=GeneID:1261050;Name=T7p07;Note=gene 1; +NC_001604 GenBank mRNA 3139 5887 . + 1 ID=T7p07.t01;Parent=T7p07;Dbxref=GeneID:1261050;Name=T7p07;product=1 mRNA; +NC_001604 GenBank CDS 3171 5822 . + 1 ID=T7p07.p01;Parent=T7p07.t01;Dbxref=GOA:P00573,UniProtKB/Swiss-Prot:P00573,GeneID:1261050;Name=T7p07;Note=A family of single subunit RNA polymerases.;codon_start=1;product=T3/T7-like RNA polymerase;protein_id=NP_041960.1;transl_table=11;translation=length.883; +NC_001604 GenBank exon 3139 5887 . + 1 Parent=T7p07.t01;Name=T7p07; +NC_001604 GenBank regulatory 5848 5848 . + 1 Parent=T7p07;Name=T7p07;Note=T7 promoter phi1.1A;regulatory_class=promoter; +NC_001604 GenBank sequence_secondary_structure 5887 5887 . + 1 Parent=T7p07;Name=T7p07;Note=RNase III site R1.1; +NC_001604 GenBank gene 5888 6448 . + 1 ID=T7p08;Dbxref=GeneID:1261049;Name=T7p08;Note=gene 1.2; +NC_001604 GenBank mRNA 5888 6448 . + 1 ID=T7p08.t01;Parent=T7p08;Dbxref=GeneID:1261049;Name=T7p08;product=1.1 mRNA; +NC_001604 GenBank CDS 6137 6394 . + 1 ID=T7p08.p01;Parent=T7p08.t01;Dbxref=GOA:P03780,UniProtKB/Swiss-Prot:P03780,GeneID:1261049;Name=T7p08;Note=inhibits activity of the host dGTPase [dgt]. Essential only in strains that overexpress dGTPase [optA1 mutation]. In T7%2C gp1.2 also causes F plasmid exclusion. In T3%2C however%2C gp1.2 overcomes the exclusion system. Other names: dGTP triphosphohydrolase inhibitor%3B gp1.2.;codon_start=1;product=host dGTPase inhibitor;protein_id=NP_041962.1;transl_table=11;translation=length.85; +NC_001604 GenBank exon 5888 6448 . + 1 Parent=T7p08.t01;Name=T7p08; +NC_001604 GenBank regulatory 5923 5923 . + 1 Parent=T7p08;Name=T7p08;Note=T7 promoter phi1.1B;regulatory_class=promoter; +NC_001604 GenBank regulatory 6409 6409 . + 1 Parent=T7p08;Name=T7p08;Note=T7 promoter phi1.3;regulatory_class=promoter; +NC_001604 GenBank sequence_secondary_structure 6448 6448 . + 1 Parent=T7p08;Name=T7p08;Note=RNase III site R1.3; +NC_001604 GenBank CDS 6007 6135 . + 1 ID=T7p09;Dbxref=UniProtKB/Swiss-Prot:P03779,GeneID:1261072;Name=T7p09;Note=other names: gp1.1;codon_start=1;product=hypothetical protein;protein_id=NP_041961.1;transl_table=11;translation=length.42; +NC_001604 GenBank gene 6007 6135 . + 1 ID=T7p09.gene;Alias=T7p09;Dbxref=GeneID:1261072;Name=T7p09;Note=gene 1.1; +NC_001604 GenBank gene 6449 7588 . + 1 ID=T7p10;Dbxref=GeneID:1261055;Name=T7p10;Note=gene 1.3; +NC_001604 GenBank mRNA 6449 7588 . + 1 ID=T7p10.t01;Parent=T7p10;Dbxref=GeneID:1261055;Name=T7p10;product=1.3 mRNA; +NC_001604 GenBank CDS 6475 7554 . + 1 ID=T7p10.p01;Parent=T7p10.t01;Dbxref=GOA:P00969,UniProtKB/Swiss-Prot:P00969,GeneID:1261055;Name=T7p10;Note=Catalyzes the ATP-dependent formation of a phosphodiester bond at the site of single-stranded breaks in double-stranded DNA. T7 ligase is essential in ligase-deficient hosts only.;codon_start=1;product=ATP-dependent DNA ligase;protein_id=NP_041963.1;transl_table=11;translation=length.359; +NC_001604 GenBank exon 6449 7588 . + 1 Parent=T7p10.t01;Name=T7p10; +NC_001604 GenBank regulatory 7588 7588 . + 1 Parent=T7p10;Name=T7p10;Note=E. coli transcription terminator TE;regulatory_class=terminator; +NC_001604 GenBank CDS 7608 7763 . + 1 ID=T7p11;Dbxref=UniProtKB/Swiss-Prot:P03791,GeneID:1261075;Name=T7p11;Note=gene 1.4;codon_start=1;product=hypothetical protein;protein_id=NP_041964.1;transl_table=11;translation=length.51; +NC_001604 GenBank regulatory 7778 7778 . + 1 ID=GenBank:regulatory:NC_001604:7778:7778;Note=T7 promoter phi1.5;regulatory_class=promoter; +NC_001604 GenBank gene 7608 7763 . + 1 ID=T7p11.gene;Alias=T7p11;Dbxref=GeneID:1261075;Name=T7p11;Note=gene 1.4; +NC_001604 GenBank CDS 7791 7880 . + 1 ID=T7p12;Dbxref=UniProtKB/Swiss-Prot:P03792,GeneID:1261074;Name=T7p12;Note=gene 1.5;codon_start=1;product=hypothetical protein;protein_id=NP_041965.1;transl_table=11;translation=length.29; +NC_001604 GenBank regulatory 7895 7895 . + 1 ID=GenBank:regulatory:NC_001604:7895:7895;Note=T7 promoter phi1.6;regulatory_class=promoter; +NC_001604 GenBank gene 7791 7880 . + 1 ID=T7p12.gene;Alias=T7p12;Dbxref=GeneID:1261074;Name=T7p12;Note=gene 1.5; +NC_001604 GenBank CDS 7906 8166 . + 1 ID=T7p13;Dbxref=UniProtKB/Swiss-Prot:P03793,GeneID:1261076;Name=T7p13;Note=gene 1.6;codon_start=1;product=hypothetical protein;protein_id=NP_041966.1;transl_table=11;translation=length.86; +NC_001604 GenBank gene 7906 8166 . + 1 ID=T7p13.gene;Alias=T7p13;Dbxref=GeneID:1261076;Name=T7p13;Note=gene 1.6; +NC_001604 GenBank CDS 8166 8756 . + 1 ID=T7p14;Dbxref=UniProtKB/Swiss-Prot:P03781,GeneID:1261060;Name=T7p14;Note=gene 1.7;codon_start=1;product=hypothetical protein;protein_id=NP_041967.1;transl_table=11;translation=length.196; +NC_001604 GenBank gene 8166 8756 . + 1 ID=T7p14.gene;Alias=T7p14;Dbxref=GeneID:1261060;Name=T7p14;Note=gene 1.7; +NC_001604 GenBank CDS 8749 8895 . + 1 ID=T7p15;Dbxref=UniProtKB/Swiss-Prot:P03794,GeneID:1261054;Name=T7p15;Note=not essential in T7. Other names: gp1.8;codon_start=1;product=hypothetical protein;protein_id=NP_041968.1;transl_table=11;translation=length.48; +NC_001604 GenBank gene 8749 8895 . + 1 ID=T7p15.gene;Alias=T7p15;Dbxref=GeneID:1261054;Name=T7p15;Note=gene 1.8; +NC_001604 GenBank CDS 8898 9092 . + 1 ID=T7p16;Dbxref=UniProtKB/Swiss-Prot:P03704,GeneID:1261073;Name=T7p16;Note=T7 RNA polymerase inhibitor binds to host RNA pol and suppresses its activity on a subset of promoters. gp2 deficient T7 display reduced DNA replication and premature breakdown of replicating DNA%2C specifically at the left end of the genome%2C along with the presence of empty proheads. Rifampin can compensate for the missing gp2 function. Other names: gp2.;codon_start=1;product=inhibitor of host bacterial RNA polymerase;protein_id=NP_041969.1;transl_table=11;translation=length.64; +NC_001604 GenBank regulatory 9107 9107 . + 1 ID=GenBank:regulatory:NC_001604:9107:9107;Note=T7 promoter phi2.5;regulatory_class=promoter; +NC_001604 GenBank gene 8898 9092 . + 1 ID=T7p16.gene;Alias=T7p16;Dbxref=GeneID:1261073;Name=T7p16;Note=gene 2; +NC_001604 GenBank CDS 9158 9856 . + 1 ID=T7p17;Dbxref=GOA:P03696,UniProtKB/Swiss-Prot:P03696,GeneID:1261080;Name=T7p17;Note=binds single-stranded DNA. In phage T7 gp2.5 is essential for DNA replication and recombination. Other names: gp2.5%3B SSB.;codon_start=1;product=single-stranded DNA-binding protein;protein_id=NP_041970.1;transl_table=11;translation=length.232; +NC_001604 GenBank gene 9158 9856 . + 1 ID=T7p17.gene;Alias=T7p17;Dbxref=GeneID:1261080;Name=T7p17;Note=gene 2.5; +NC_001604 GenBank CDS 9857 10276 . + 1 ID=T7p18;Dbxref=GOA:P03795,UniProtKB/Swiss-Prot:P03795,GeneID:1261078;Name=T7p18;Note=gene 2.8;codon_start=1;product=hypothetical protein;protein_id=NP_041971.1;transl_table=11;translation=length.139; +NC_001604 GenBank gene 9857 10276 . + 1 ID=T7p18.gene;Alias=T7p18;Dbxref=GeneID:1261078;Name=T7p18;Note=gene 2.8; +NC_001604 GenBank CDS 10257 10706 . + 1 ID=T7p19;Dbxref=GOA:P00641,UniProtKB/Swiss-Prot:P00641,GeneID:1261079;Name=T7p19;Note=T7 endonuclease I is a Holliday junction resolvase encoded by T7 gene 3. Mutants in gene 3 are defective in recombination and accumulate branched DNA. Endonuclease I may also play a role in the degradation of the host genome following infection with T7.;codon_start=1;product=endonuclease I;protein_id=NP_041972.1;transl_table=11;translation=length.149; +NC_001604 GenBank gene 10257 10706 . + 1 ID=T7p19.gene;Alias=T7p19;Dbxref=GeneID:1261079;Name=T7p19;Note=gene 3; +NC_001604 GenBank CDS 10706 11161 . + 1 ID=T7p20;Dbxref=GOA:P00806,UniProtKB/Swiss-Prot:P00806,GeneID:1261077;Name=T7p20;Note=T7 lysozyme hydrolyzes an amide bond in the host cell wall following its release from the cytoplasm. In addition%2C T7 lysozyme inhibits T7 RNA polymerase initiation. This inhibition is greater for class II promoters than class III promoters and therefore may aid in temporal regulation of transcription and the switch to particle assembly. In T7%2C lysozyme%2C unlike the T7 holin%2C is expressed with and lies in the same region as the replication genes. Lack of gp3.5 reduces replication and burst size and delays%2C but does not completely prevent lysis. Mutations in the muralytic domain of gene 16%2C an inner capsid protein%2C can partially compensate for a deletion of gp3.5. Other names: gp3.5%3B amidase%3B N-acetylmuramoyl-L-alanine amidase;codon_start=1;product=lysozyme;protein_id=NP_041973.1;transl_table=11;translation=length.151; +NC_001604 GenBank regulatory 11180 11180 . + 1 ID=GenBank:regulatory:NC_001604:11180:11180;Note=T7 promoter phi3.8;regulatory_class=promoter; +NC_001604 GenBank sequence_secondary_structure 11203 11203 . + 1 ID=GenBank:sequence_secondary_structure:NC_001604:11203:11203;Note=possible RNase III site R3.8; +NC_001604 GenBank gene 10706 11161 . + 1 ID=T7p20.gene;Alias=T7p20;Dbxref=GeneID:1261077;Name=T7p20;Note=gene 3.5; +NC_001604 GenBank CDS 11225 11590 . + 1 ID=T7p21;Dbxref=GOA:P03797,UniProtKB/Swiss-Prot:P03797,GeneID:1261065;Name=T7p21;Note=gene 3.8;codon_start=1;product=putative NHN endonuclease;protein_id=NP_041974.1;transl_table=11;translation=length.121; +NC_001604 GenBank gene 11225 11590 . + 1 ID=T7p21.gene;Alias=T7p21;Dbxref=GeneID:1261065;Name=T7p21;Note=gene 3.8; +NC_001604 GenBank CDS 11565 13265 . + 1 ID=T7p22;Dbxref=GOA:P03692,UniProtKB/Swiss-Prot:P03692,GeneID:1261046;Name=T7p22;Note=gene 4A%2C primase/helicase [14%2C15];codon_start=1;product=DNA primase/helicase;protein_id=NP_041975.1;transl_table=11;translation=length.566; +NC_001604 GenBank gene 11565 13265 . + 1 ID=T7p22.gene;Alias=T7p22;Dbxref=GeneID:1261046;Name=T7p22;Note=gene 4A; +NC_001604 GenBank CDS 11635 11757 . + 1 ID=T7p23;Dbxref=UniProtKB/Swiss-Prot:P03782,GeneID:1261047;Name=T7p23;Note=gene 4.1;codon_start=1;product=hypothetical protein;protein_id=NP_041976.1;transl_table=11;translation=length.40; +NC_001604 GenBank gene 11635 11757 . + 1 ID=T7p23.gene;Alias=T7p23;Dbxref=GeneID:1261047;Name=T7p23;Note=gene 4.1; +NC_001604 GenBank CDS 11754 13265 . + 1 ID=T7p24;Dbxref=GOA:P03692,UniProtKB/Swiss-Prot:P03692,GeneID:1261048;Name=T7p24;Note=gene 4B/helicase [14%2C15];codon_start=1;product=helicase;protein_id=NP_041977.1;transl_table=11;translation=length.503; +NC_001604 GenBank regulatory 12671 12671 . + 1 Parent=T7p24;Name=T7p24;Note=T7 promoter phi4c;regulatory_class=promoter; +NC_001604 GenBank gene 11754 13265 . + 1 ID=T7p24.gene;Alias=T7p24;Dbxref=GeneID:1261048;Name=T7p24;Note=gene 4B; +NC_001604 GenBank CDS 12988 13326 . + 1 ID=T7p25;Dbxref=UniProtKB/Swiss-Prot:P03783,GeneID:1261021;Name=T7p25;Note=gene 4.2;codon_start=1;product=hypothetical protein;protein_id=NP_041978.1;transl_table=11;translation=length.112; +NC_001604 GenBank regulatory 13341 13341 . + 1 ID=GenBank:regulatory:NC_001604:13341:13341;Note=T7 promoter phi4.3;regulatory_class=promoter; +NC_001604 GenBank gene 12988 13326 . + 1 ID=T7p25.gene;Alias=T7p25;Dbxref=GeneID:1261021;Name=T7p25;Note=gene 4.2; +NC_001604 GenBank CDS 13352 13564 . + 1 ID=T7p26;Dbxref=UniProtKB/Swiss-Prot:P03784,GeneID:1261069;Name=T7p26;Note=not essential in T7%3B Other names: gp4.3.;codon_start=1;product=hypothetical protein;protein_id=NP_041979.1;transl_table=11;translation=length.70; +NC_001604 GenBank gene 13352 13564 . + 1 ID=T7p26.gene;Alias=T7p26;Dbxref=GeneID:1261069;Name=T7p26;Note=gene 4.3; +NC_001604 GenBank CDS 13584 13853 . + 1 ID=T7p27;Dbxref=UniProtKB/Swiss-Prot:P03785,GeneID:1261059;Name=T7p27;Note=not essential in T7. Other names: gp4.5.;codon_start=1;product=hypothetical protein;protein_id=NP_041980.1;transl_table=11;translation=length.89; +NC_001604 GenBank sequence_secondary_structure 13892 13892 . + 1 ID=GenBank:sequence_secondary_structure:NC_001604:13892:13892;Note=RNase III site R4.7; +NC_001604 GenBank regulatory 13915 13915 . + 1 ID=GenBank:regulatory:NC_001604:13915:13915;Note=T7 promoter phi4.7;regulatory_class=promoter; +NC_001604 GenBank gene 13584 13853 . + 1 ID=T7p27.gene;Alias=T7p27;Dbxref=GeneID:1261059;Name=T7p27;Note=gene 4.5; +NC_001604 GenBank CDS 13927 14334 . + 1 ID=T7p28;Dbxref=UniProtKB/Swiss-Prot:P03786,GeneID:1261043;Name=T7p28;Note=gene 4.7;codon_start=1;product=hypothetical protein;protein_id=NP_041981.1;transl_table=11;translation=length.135; +NC_001604 GenBank gene 13927 14334 . + 1 ID=T7p28.gene;Alias=T7p28;Dbxref=GeneID:1261043;Name=T7p28;Note=gene 4.7; +NC_001604 GenBank CDS 14353 16467 . + 1 ID=T7p29;Dbxref=GOA:P00581,UniProtKB/Swiss-Prot:P00581,GeneID:1261044;Name=T7p29;Note=gene 5;codon_start=1;product=DNA polymerase;protein_id=NP_041982.1;transl_table=11;translation=length.704; +NC_001604 GenBank gene 14353 16467 . + 1 ID=T7p29.gene;Alias=T7p29;Dbxref=GeneID:1261044;Name=T7p29;Note=gene 5; +NC_001604 GenBank CDS 16483 16839 . + 1 ID=T7p30;Dbxref=UniProtKB/Swiss-Prot:P03798,GeneID:1261045;Name=T7p30;Note=gene 5.3;codon_start=1;product=hypothetical protein;protein_id=NP_041983.1;transl_table=11;translation=length.118; +NC_001604 GenBank gene 16483 16839 . + 1 ID=T7p30.gene;Alias=T7p30;Dbxref=GeneID:1261045;Name=T7p30;Note=gene 5.3; +NC_001604 GenBank CDS 16851 17147 . + 1 ID=T7p31.cds1;Dbxref=GOA:P03787,UniProtKB/Swiss-Prot:P03787,GeneID:1261041;Name=T7p31;Note=possible gene 5.5-5.7;codon_start=1;product=hypothetical protein;protein_id=NP_041984.1;transl_table=11;translation=length.169; +NC_001604 GenBank CDS 17147 17359 . + 1 ID=T7p31.cds2;Dbxref=GOA:P03787,UniProtKB/Swiss-Prot:P03787,GeneID:1261041;Name=T7p31;Note=possible gene 5.5-5.7;codon_start=1;product=hypothetical protein;protein_id=NP_041984.1;transl_table=11;translation=length.169; +NC_001604 GenBank gene 16851 17359 . + 1 ID=T7p31.gene;Alias=T7p31;Dbxref=GeneID:1261041;Name=T7p31;Note=possible gene 5.5-5.7; +NC_001604 GenBank CDS 16851 17150 . + 1 ID=T7p32;Dbxref=GOA:P03787,UniProtKB/Swiss-Prot:P03787,GeneID:1261038;Name=T7p32;Note=in Enterobacteria phage T7%2C gp5.5 abolishes E. coli nucleoid protein H-NS-mediated inhibition of transcription by T7 RNA polymerases in vitro. Not essential%2C but mutants have lower burst size. Mutants in this gene are not capable of replicating in phage lambda lysogens. Other names: gp5.5;codon_start=1;product=host protein H-NS-interacting protein;protein_id=NP_041985.1;transl_table=11;translation=length.99; +NC_001604 GenBank gene 16851 17150 . + 1 ID=T7p32.gene;Alias=T7p32;Dbxref=GeneID:1261038;Name=T7p32;Note=gene 5.5; +NC_001604 GenBank CDS 17150 17359 . + 1 ID=T7p33;Dbxref=GOA:P03787,UniProtKB/Swiss-Prot:P03787,GeneID:1261040;Name=T7p33;Note=gene 5.7;codon_start=1;product=hypothetical protein;protein_id=NP_041986.1;transl_table=11;translation=length.69; +NC_001604 GenBank gene 17150 17359 . + 1 ID=T7p33.gene;Alias=T7p33;Dbxref=GeneID:1261040;Name=T7p33;Note=gene 5.7; +NC_001604 GenBank CDS 17359 17517 . + 1 ID=T7p34;Dbxref=UniProtKB/Swiss-Prot:P20406,GeneID:1261037;Name=T7p34;Note=not essential. Other names: gp5.9%3B exonuclease V inhibitor;codon_start=1;product=host recBCD nuclease inhibitor;protein_id=NP_041987.1;transl_table=11;translation=length.52; +NC_001604 GenBank gene 17359 17517 . + 1 ID=T7p34.gene;Alias=T7p34;Dbxref=GeneID:1261037;Name=T7p34;Note=gene 5.9; +NC_001604 GenBank CDS 17504 18406 . + 1 ID=T7p35;Dbxref=GOA:P00638,UniProtKB/Swiss-Prot:P00638,GeneID:1261052;Name=T7p35;Note=The T7 exonuclease encoded by gene 6 is required for (a) recombination and (b) for the degradation of host chromosomal DNA. The latter process provides nucleotides for phage DNA replication. Both processes are carried out together with the T7 gene 3-encoded endonuclease/Holliday junction resolvase. In addition%2C the exonuclease also functions as an RNase H that removes RNA primers during DNA replication and promotes concatemer formation.;codon_start=1;product=exonuclease;protein_id=NP_041988.1;transl_table=11;translation=length.300; +NC_001604 GenBank gene 17504 18406 . + 1 ID=T7p35.gene;Alias=T7p35;Dbxref=GeneID:1261052;Name=T7p35;Note=gene 6; +NC_001604 GenBank CDS 18394 18507 . + 1 ID=T7p36;Dbxref=UniProtKB/Swiss-Prot:P03799,GeneID:1261058;Name=T7p36;Note=gene 6.3;codon_start=1;product=hypothetical protein;protein_id=NP_041989.1;transl_table=11;translation=length.37; +NC_001604 GenBank regulatory 18545 18545 . + 1 ID=GenBank:regulatory:NC_001604:18545:18545;Note=T7 promoter phi6.5;regulatory_class=promoter; +NC_001604 GenBank sequence_secondary_structure 18563 18563 . + 1 ID=GenBank:sequence_secondary_structure:NC_001604:18563:18563;Note=RNase III site R6.5; +NC_001604 GenBank gene 18394 18507 . + 1 ID=T7p36.gene;Alias=T7p36;Dbxref=GeneID:1261058;Name=T7p36;Note=gene 6.3; +NC_001604 GenBank CDS 18605 18859 . + 1 ID=T7p37;Dbxref=UniProtKB/Swiss-Prot:P03800,GeneID:1261036;Name=T7p37;Note=gene 6.5;codon_start=1;product=hypothetical protein;protein_id=NP_041990.1;transl_table=11;translation=length.84; +NC_001604 GenBank gene 18605 18859 . + 1 ID=T7p37.gene;Alias=T7p37;Dbxref=GeneID:1261036;Name=T7p37;Note=gene 6.5; +NC_001604 GenBank CDS 18864 19130 . + 1 ID=T7p38;Dbxref=UniProtKB/Swiss-Prot:P03801,GeneID:1261039;Name=T7p38;Note=may be involved in virion morphogenesis and is injected from virion into host cell. Other names: gp6.7;codon_start=1;product=hypothetical protein;protein_id=NP_041991.1;transl_table=11;translation=length.88; +NC_001604 GenBank gene 18864 19130 . + 1 ID=T7p38.gene;Alias=T7p38;Dbxref=GeneID:1261039;Name=T7p38;Note=gene 6.7; +NC_001604 GenBank CDS 19130 19531 . + 1 ID=T7p39;Dbxref=UniProtKB/Swiss-Prot:P03750,GeneID:1261056;Name=T7p39;Note=gene 7%2C host range;codon_start=1;product=hypothetical protein;protein_id=NP_041992.1;transl_table=11;translation=length.133; +NC_001604 GenBank gene 19130 19531 . + 1 ID=T7p39.gene;Alias=T7p39;Dbxref=GeneID:1261056;Name=T7p39;Note=gene 7; +NC_001604 GenBank CDS 19535 19834 . + 1 ID=T7p40;Dbxref=UniProtKB/Swiss-Prot:P03751,GeneID:1261035;Name=T7p40;Note=required for virion infectivity but not morphogenesis. In T7%2C gp 7.3 appears to be required for the assembly of tail fibers on capsids. Other names: gp7.3;codon_start=1;product=tail assembly protein;protein_id=NP_041993.1;transl_table=11;translation=length.99; +NC_001604 GenBank gene 19535 19834 . + 1 ID=T7p40.gene;Alias=T7p40;Dbxref=GeneID:1261035;Name=T7p40;Note=gene 7.3; +NC_001604 GenBank CDS 19848 20240 . + 1 ID=T7p41;Dbxref=GOA:P03796,UniProtKB/Swiss-Prot:P03796,GeneID:1261028;Name=T7p41;Note=gene 7.7;codon_start=1;product=hypothetical protein;protein_id=NP_041994.1;transl_table=11;translation=length.130; +NC_001604 GenBank gene 19848 20240 . + 1 ID=T7p41.gene;Alias=T7p41;Dbxref=GeneID:1261028;Name=T7p41;Note=gene 7.7; +NC_001604 GenBank CDS 20240 21850 . + 1 ID=T7p42;Dbxref=GOA:P03728,UniProtKB/Swiss-Prot:P03728,GeneID:1261033;Name=T7p42;Note=gene 8;codon_start=1;product=head-tail connector protein;protein_id=NP_041995.1;transl_table=11;translation=length.536; +NC_001604 GenBank regulatory 21865 21865 . + 1 ID=GenBank:regulatory:NC_001604:21865:21865;Note=T7 promoter phi9;regulatory_class=promoter; +NC_001604 GenBank gene 20240 21850 . + 1 ID=T7p42.gene;Alias=T7p42;Dbxref=GeneID:1261033;Name=T7p42;Note=gene 8; +NC_001604 GenBank CDS 21950 22873 . + 1 ID=T7p43;Dbxref=GOA:P03716,UniProtKB/Swiss-Prot:P03716,GeneID:1261027;Name=T7p43;Note=Phage T7-like scaffolding protein. The protein is encoded by gene 9 in T7 (gp9) and is required for the formation of pro-capsids.;codon_start=1;product=capsid assembly protein;protein_id=NP_041996.1;transl_table=11;translation=length.307; +NC_001604 GenBank regulatory 22904 22904 . + 1 ID=GenBank:regulatory:NC_001604:22904:22904;Note=T7 promoter phi10;regulatory_class=promoter; +NC_001604 GenBank gene 21950 22873 . + 1 ID=T7p43.gene;Alias=T7p43;Dbxref=GeneID:1261027;Name=T7p43;Note=gene 9; +NC_001604 GenBank CDS 22967 23989 . + 1 ID=T7p44.cds1;Dbxref=GOA:P19727,UniProtKB/Swiss-Prot:P19727,GeneID:1261029;Name=T7p44;Note=major capsid protein. Involved in F-exclusion of wt T7 phage. A minor capsid protein (gp10B) is produced from gene 10 by a -1 frameshift towards the end of 10A%2C resulting in a slightly larger protein. Other names: gp10A.;codon_start=1;product=major capsid protein;protein_id=NP_041997.1;transl_table=11;translation=length.398; +NC_001604 GenBank CDS 23989 24162 . + 1 ID=T7p44.cds2;Dbxref=GOA:P19727,UniProtKB/Swiss-Prot:P19727,GeneID:1261029;Name=T7p44;Note=major capsid protein. Involved in F-exclusion of wt T7 phage. A minor capsid protein (gp10B) is produced from gene 10 by a -1 frameshift towards the end of 10A%2C resulting in a slightly larger protein. Other names: gp10A.;codon_start=1;product=major capsid protein;protein_id=NP_041997.1;transl_table=11;translation=length.398; +NC_001604 GenBank gene 22967 24162 . + 1 ID=T7p44.gene;Alias=T7p44;Dbxref=GeneID:1261029;Name=T7p44;Note=gene 10B; +NC_001604 GenBank CDS 22967 24004 . + 1 ID=T7p45;Dbxref=GOA:P19726,UniProtKB/Swiss-Prot:P19726,GeneID:1261026;Name=T7p45;Note=major capsid protein. Involved in F-exclusion of wt T7 phage. A minor capsid protein (gp10B) is produced from gene 10 by a -1 frameshift towards the end of 10A%2C resulting in a slightly larger protein. Other names: gp10A.;codon_start=1;product=major capsid protein;protein_id=NP_041998.1;transl_table=11;translation=length.345; +NC_001604 GenBank regulatory 24210 24210 . + 1 ID=GenBank:regulatory:NC_001604:24210:24210;Note=T7 transcription terminator Tphi;regulatory_class=terminator; +NC_001604 GenBank gene 22967 24004 . + 1 ID=T7p45.gene;Alias=T7p45;Dbxref=GeneID:1261026;Name=T7p45;Note=gene 10A; +NC_001604 GenBank CDS 24228 24818 . + 1 ID=T7p46;Dbxref=UniProtKB/Swiss-Prot:P03746,GeneID:1261030;Name=T7p46;Note=Tail tubular proteins A and B are required for assembly of tails of T7-like phages.;codon_start=1;product=tail tubular protein A;protein_id=NP_041999.1;transl_table=11;translation=length.196; +NC_001604 GenBank gene 24228 24818 . + 1 ID=T7p46.gene;Alias=T7p46;Dbxref=GeneID:1261030;Name=T7p46;Note=gene 11; +NC_001604 GenBank CDS 24842 27226 . + 1 ID=T7p47;Dbxref=UniProtKB/Swiss-Prot:P03747,GeneID:1261024;Name=T7p47;Note=gene 12;codon_start=1;product=tail tubular protein B;protein_id=NP_042000.1;transl_table=11;translation=length.794; +NC_001604 GenBank regulatory 27274 27274 . + 1 ID=GenBank:regulatory:NC_001604:27274:27274;Note=T7 promoter phi13;regulatory_class=promoter; +NC_001604 GenBank sequence_secondary_structure 27281 27281 . + 1 ID=GenBank:sequence_secondary_structure:NC_001604:27281:27281;Note=possible RNase III site R13; +NC_001604 GenBank gene 24842 27226 . + 1 ID=T7p47.gene;Alias=T7p47;Dbxref=GeneID:1261024;Name=T7p47;Note=gene 12; +NC_001604 GenBank CDS 27307 27723 . + 1 ID=T7p48;Dbxref=UniProtKB/Swiss-Prot:P03723,GeneID:1261025;Name=T7p48;Note=gene 13;codon_start=1;product=internal virion protein A;protein_id=NP_042001.1;transl_table=11;translation=length.138; +NC_001604 GenBank gene 27307 27723 . + 1 ID=T7p48.gene;Alias=T7p48;Dbxref=GeneID:1261025;Name=T7p48;Note=gene 13; +NC_001604 GenBank CDS 27728 28318 . + 1 ID=T7p49;Dbxref=UniProtKB/Swiss-Prot:P03724,GeneID:1261032;Name=T7p49;Note=Approximately 12 copies of the internal virion protein B encoded in phage T7 by gene 14 are part of the internal core of the T7 virion. Along with gp16 and gp15%2C the other internal core proteins%2C gp14 is ejected from the phage head and forms part of a putative channel that spans the entire host cell envelope and allows entry of DNA. gp14 appears to localize to the outer host membrane after ejection. Other names: gp14;codon_start=1;product=internal virion protein B;protein_id=NP_042002.1;transl_table=11;translation=length.196; +NC_001604 GenBank gene 27728 28318 . + 1 ID=T7p49.gene;Alias=T7p49;Dbxref=GeneID:1261032;Name=T7p49;Note=gene 14; +NC_001604 GenBank CDS 28325 30568 . + 1 ID=T7p50;Dbxref=UniProtKB/Swiss-Prot:P03725,GeneID:1261034;Name=T7p50;Note=Approximately 12 copies of the internal virion protein C encoded by phage T7 gene 15 (gp15) are part of the internal core of the T7 virion. Along with gp14 and gp16%2C the other internal core proteins%2C gp15 is ejected from the phage head and forms part of a putative channel that spans the entire host cell envelope and allows entry of DNA.;codon_start=1;product=internal virion protein C;protein_id=NP_042003.1;transl_table=11;translation=length.747; +NC_001604 GenBank gene 28325 30568 . + 1 ID=T7p50.gene;Alias=T7p50;Dbxref=GeneID:1261034;Name=T7p50;Note=gene 15; +NC_001604 GenBank CDS 30595 34551 . + 1 ID=T7p51;Dbxref=GOA:P03726,UniProtKB/Swiss-Prot:P03726,GeneID:1261031;Name=T7p51;Note=Approximately 3 copies of the internal virion protein D encoded by phage T7 gene 16 (gp16) are part of the internal core of the T7 virion. Along with gp14 and gp15%2C the other internal core proteins%2C gp16 is ejected from the phage head and forms part of a putative channel that spans the entire host cell envelope and allows entry of DNA. The N-terminus has similarity to a lytic transglycosylase and may help form a channel for phage DNA translocation through the crosslinked peptidoglycan layer of the host envelope.;codon_start=1;product=internal virion protein D;protein_id=NP_042004.1;transl_table=11;translation=length.1318; +NC_001604 GenBank regulatory 34566 34566 . + 1 ID=GenBank:regulatory:NC_001604:34566:34566;Note=T7 promoter phi17;regulatory_class=promoter; +NC_001604 GenBank gene 30595 34551 . + 1 ID=T7p51.gene;Alias=T7p51;Dbxref=GeneID:1261031;Name=T7p51;Note=gene 16; +NC_001604 GenBank CDS 34624 36285 . + 1 ID=T7p52;Dbxref=UniProtKB/Swiss-Prot:P03748,GeneID:1261023;Name=T7p52;Note=in phages T7 and T3 trimers of gp17 form each of the 6 kinked tail fibers. Other names: gp17.;codon_start=1;product=tail fiber protein;protein_id=NP_042005.1;transl_table=11;translation=length.553; +NC_001604 GenBank gene 34624 36285 . + 1 ID=T7p52.gene;Alias=T7p52;Dbxref=GeneID:1261023;Name=T7p52;Note=gene 17; +NC_001604 GenBank CDS 36344 36547 . + 1 ID=T7p53;Dbxref=GOA:P03802,UniProtKB/Swiss-Prot:P03802,GeneID:1261022;Name=T7p53;Note=Type II holins have two putative transmembrane domains and are thought to allow endolysins access to the cell wall at the optimal lysis time. However%2C in phage T7 the holin protein gp17.5 does not appear to be essential and gp17.5 mutants only show a minor delay in lysis. Other names: gp17.5%3B lysis protein;codon_start=1;product=type II holin;protein_id=NP_042006.1;transl_table=11;translation=length.67; +NC_001604 GenBank gene 36344 36547 . + 1 ID=T7p53.gene;Alias=T7p53;Dbxref=GeneID:1261022;Name=T7p53;Note=gene 17.5; +NC_001604 GenBank CDS 36553 36822 . + 1 ID=T7p54;Dbxref=GOA:P03693,UniProtKB/Swiss-Prot:P03693,GeneID:1261042;Name=T7p54;Note=involved in the packaging of genome monomers into a procapsid using head-to-tail concatemers of genomes. other names: DNA packaging protein A%3B DNA maturation protein A%3B terminase%2C small subunit;codon_start=1;product=DNA packaging protein%2C small subunit;protein_id=NP_042007.1;transl_table=11;translation=length.89; +NC_001604 GenBank regulatory 36836 36836 . + 1 ID=GenBank:regulatory:NC_001604:36836:36836;Note=E. coli promoter E[6];regulatory_class=promoter; +NC_001604 GenBank sequence_secondary_structure 36856 36856 . + 1 ID=GenBank:sequence_secondary_structure:NC_001604:36856:36856;Note=RNase III site R18.5; +NC_001604 GenBank gene 36553 36822 . + 1 ID=T7p54.gene;Alias=T7p54;Dbxref=GeneID:1261042;Name=T7p54;Note=gene 18; +NC_001604 GenBank CDS 36917 37348 . + 1 ID=T7p55;Dbxref=GOA:P03803,UniProtKB/Swiss-Prot:P03803,GeneID:1261067;Name=T7p55;Note=analog of phage lambda protein Rz%2C a cell lysis protein. Rz and gp18.5 share distant sequence similarity%2C similar function%2C and a similar genome neighborhood. In T7%2C gp18.5 interacts with gp18.7%2C a lambda RZ1-like lysis protein. Other names: gp18.5;codon_start=1;product=phage lambda Rz-like lysis protein;protein_id=NP_042008.1;transl_table=11;translation=length.143; +NC_001604 GenBank gene 36917 37348 . + 1 ID=T7p55.gene;Alias=T7p55;Dbxref=GeneID:1261067;Name=T7p55;Note=gene 18.5; +NC_001604 GenBank CDS 37032 37283 . + 1 ID=T7p56;Dbxref=UniProtKB/Swiss-Prot:P03788,GeneID:1261057;Name=T7p56;Note=in Enterobacteria phage T7%2C this protein interacts with gp18.5 and is expressed from the -1 frame of a gene completely overlapping gene 18.5. This suggests that it may be an analog of lambda lysis protein Rz1. Other names: gp18.7.;codon_start=1;product=phage lambda Rz1-like protein;protein_id=NP_042009.1;transl_table=11;translation=length.83; +NC_001604 GenBank gene 37032 37283 . + 1 ID=T7p56.gene;Alias=T7p56;Dbxref=GeneID:1261057;Name=T7p56;Note=gene 18.7; +NC_001604 GenBank CDS 37370 39130 . + 1 ID=T7p57;Dbxref=GOA:P03694,UniProtKB/Swiss-Prot:P03694,GeneID:1261062;Name=T7p57;Note=gene 19;codon_start=1;product=DNA maturation protein;protein_id=NP_042010.1;transl_table=11;translation=length.586; +NC_001604 GenBank gene 37370 39130 . + 1 ID=T7p57.gene;Alias=T7p57;Dbxref=GeneID:1261062;Name=T7p57;Note=gene 19; +NC_001604 GenBank CDS 38016 38273 . + 1 ID=T7p58;Dbxref=UniProtKB/Swiss-Prot:P03789,GeneID:1261064;Name=T7p58;Note=gene 19.2;codon_start=1;product=hypothetical protein;protein_id=NP_042011.1;transl_table=11;translation=length.85; +NC_001604 GenBank gene 38016 38273 . + 1 ID=T7p58.gene;Alias=T7p58;Dbxref=GeneID:1261064;Name=T7p58;Note=gene 19.2; +NC_001604 GenBank CDS 38553 38726 . + 1 ID=T7p59;Dbxref=UniProtKB/Swiss-Prot:P03790,GeneID:1261066;Name=T7p59;Note=gene 19.3;codon_start=1;product=hypothetical protein;protein_id=NP_042012.1;transl_table=11;translation=length.57; +NC_001604 GenBank regulatory 39229 39229 . + 1 ID=GenBank:regulatory:NC_001604:39229:39229;Note=T7 promoter phiOR;regulatory_class=promoter; +NC_001604 GenBank gene 38553 38726 . + 1 ID=T7p59.gene;Alias=T7p59;Dbxref=GeneID:1261066;Name=T7p59;Note=gene 19.3; +NC_001604 GenBank CDS 39389 39538 . + 1 ID=T7p60;Dbxref=UniProtKB/Swiss-Prot:P03804,GeneID:1261068;Name=T7p60;Note=gene 19.5;codon_start=1;product=hypothetical protein;protein_id=NP_042013.1;transl_table=11;translation=length.49; +NC_001604 GenBank gene 39389 39538 . + 1 ID=T7p60.gene;Alias=T7p60;Dbxref=GeneID:1261068;Name=T7p60;Note=gene 19.5; diff -r 6e7e20cb1fc7 -r 4f4b413056f6 test-data/T7_TMHMM.gff3 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/T7_TMHMM.gff3 Mon Jun 05 02:44:12 2023 +0000 @@ -0,0 +1,39 @@ +##gff-version 3 +T7p04 feature Chain 2 47 . + . Description=Transmembrane protein;ID=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d;Note=Transmembrane protein - N out C in;Target=T7p04 +T7p04 TMHMM Topological domain 1 22 . + . Note=Extracellular;Parent=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d +T7p04 TMHMM Transmembrane 23 45 . + . Note=Helical;Parent=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d +T7p04 TMHMM Topological domain 46 47 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d +##gff-version 3 +T7p11 feature Chain 2 51 . + . Description=Transmembrane protein;ID=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc;Note=Transmembrane protein - N in C in;Target=T7p11 +T7p11 TMHMM Topological domain 1 4 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc +T7p11 TMHMM Transmembrane 5 24 . + . Note=Helical;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc +T7p11 TMHMM Topological domain 25 27 . + . Note=Extracellular;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc +T7p11 TMHMM Transmembrane 28 50 . + . Note=Helical;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc +T7p11 TMHMM Topological domain 51 51 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc +##gff-version 3 +T7p25 feature Chain 2 112 . + . Description=Transmembrane protein;ID=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9;Note=Transmembrane protein - N in C in;Target=T7p25 +T7p25 TMHMM Topological domain 1 6 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9 +T7p25 TMHMM Transmembrane 7 29 . + . Note=Helical;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9 +T7p25 TMHMM Topological domain 30 33 . + . Note=Extracellular;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9 +T7p25 TMHMM Transmembrane 34 56 . + . Note=Helical;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9 +T7p25 TMHMM Topological domain 57 112 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9 +##gff-version 3 +T7p36 feature Chain 2 37 . + . Description=Transmembrane protein;ID=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8;Note=Transmembrane protein - N out C in;Target=T7p36 +T7p36 TMHMM Topological domain 1 4 . + . Note=Extracellular;Parent=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8 +T7p36 TMHMM Transmembrane 5 24 . + . Note=Helical;Parent=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8 +T7p36 TMHMM Topological domain 25 37 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8 +##gff-version 3 +T7p53 feature Chain 2 67 . + . Description=Transmembrane protein;ID=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb;Note=Transmembrane protein - N out C in;Target=T7p53 +T7p53 TMHMM Topological domain 1 36 . + . Note=Extracellular;Parent=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb +T7p53 TMHMM Transmembrane 37 55 . + . Note=Helical;Parent=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb +T7p53 TMHMM Topological domain 56 67 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb +##gff-version 3 +T7p56 feature Chain 2 83 . + . Description=Transmembrane protein;ID=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e;Note=Transmembrane protein - N in C out;Target=T7p56 +T7p56 TMHMM Topological domain 1 27 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e +T7p56 TMHMM Transmembrane 28 50 . + . Note=Helical;Parent=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e +T7p56 TMHMM Topological domain 51 83 . + . Note=Extracellular;Parent=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e +##gff-version 3 +T7p60 feature Chain 2 49 . + . Description=Transmembrane protein;ID=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599;Note=Transmembrane protein - N in C out;Target=T7p60 +T7p60 TMHMM Topological domain 1 12 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599 +T7p60 TMHMM Transmembrane 13 30 . + . Note=Helical;Parent=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599 +T7p60 TMHMM Topological domain 31 49 . + . Note=Extracellular;Parent=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599 diff -r 6e7e20cb1fc7 -r 4f4b413056f6 test-data/T7_TMHMM_REBASE.gff3 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/T7_TMHMM_REBASE.gff3 Mon Jun 05 02:44:12 2023 +0000 @@ -0,0 +1,33 @@ +##gff-version 3 +NC_001604 feature Chain 1499 1636 . + . Description=Transmembrane protein;ID=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d;Note=Transmembrane protein - N out C in;Target=T7p04; +NC_001604 TMHMM Topological domain 1496 1561 . + . Note=Extracellular;Parent=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d; +NC_001604 TMHMM Transmembrane 1562 1630 . + . Note=Helical;Parent=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d; +NC_001604 TMHMM Topological domain 1631 1636 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_1-0e35f4c8-7d2d-457e-bf36-a121fcede87d; +NC_001604 feature Chain 7611 7760 . + . Description=Transmembrane protein;ID=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc;Note=Transmembrane protein - N in C in;Target=T7p11; +NC_001604 TMHMM Topological domain 7608 7619 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc; +NC_001604 TMHMM Transmembrane 7620 7679 . + . Note=Helical;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc; +NC_001604 TMHMM Topological domain 7680 7688 . + . Note=Extracellular;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc; +NC_001604 TMHMM Transmembrane 7689 7757 . + . Note=Helical;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc; +NC_001604 TMHMM Topological domain 7758 7760 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_2-f3395d64-94df-47d0-b5cd-b098a8fc57fc; +NC_001604 feature Chain 12991 13323 . + . Description=Transmembrane protein;ID=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9;Note=Transmembrane protein - N in C in;Target=T7p25; +NC_001604 TMHMM Topological domain 12988 13005 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9; +NC_001604 TMHMM Transmembrane 13006 13074 . + . Note=Helical;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9; +NC_001604 TMHMM Topological domain 13075 13086 . + . Note=Extracellular;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9; +NC_001604 TMHMM Transmembrane 13087 13155 . + . Note=Helical;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9; +NC_001604 TMHMM Topological domain 13156 13323 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_3-bc20f884-acf4-4cc9-a845-838c08833ba9; +NC_001604 feature Chain 18397 18504 . + . Description=Transmembrane protein;ID=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8;Note=Transmembrane protein - N out C in;Target=T7p36; +NC_001604 TMHMM Topological domain 18394 18405 . + . Note=Extracellular;Parent=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8; +NC_001604 TMHMM Transmembrane 18406 18465 . + . Note=Helical;Parent=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8; +NC_001604 TMHMM Topological domain 18466 18504 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_4-4bb9c895-cf89-4510-b338-3e8027abfee8; +NC_001604 feature Chain 36347 36544 . + . Description=Transmembrane protein;ID=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb;Note=Transmembrane protein - N out C in;Target=T7p53; +NC_001604 TMHMM Topological domain 36344 36451 . + . Note=Extracellular;Parent=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb; +NC_001604 TMHMM Transmembrane 36452 36508 . + . Note=Helical;Parent=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb; +NC_001604 TMHMM Topological domain 36509 36544 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_5-481eeec9-f604-43c0-a99b-3891bd8881bb; +NC_001604 feature Chain 37035 37280 . + . Description=Transmembrane protein;ID=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e;Note=Transmembrane protein - N in C out;Target=T7p56; +NC_001604 TMHMM Topological domain 37032 37112 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e; +NC_001604 TMHMM Transmembrane 37113 37181 . + . Note=Helical;Parent=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e; +NC_001604 TMHMM Topological domain 37182 37280 . + . Note=Extracellular;Parent=tmhmm_tmd_6-32be98b3-e5bc-4ed2-b5ff-d031936e4a6e; +NC_001604 feature Chain 39392 39535 . + . Description=Transmembrane protein;ID=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599;Note=Transmembrane protein - N in C out;Target=T7p60; +NC_001604 TMHMM Topological domain 39389 39424 . + . Note=Cytoplasmic;Parent=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599; +NC_001604 TMHMM Transmembrane 39425 39478 . + . Note=Helical;Parent=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599; +NC_001604 TMHMM Topological domain 39479 39535 . + . Note=Extracellular;Parent=tmhmm_tmd_7-d18b863e-5930-430b-9a60-666e3a790599; diff -r 6e7e20cb1fc7 -r 4f4b413056f6 test-data/child.gff --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/child.gff Mon Jun 05 02:44:12 2023 +0000 @@ -0,0 +1,2 @@ +#gff-version 3 +cds42 blastp match_part 1 50 1e-40 . . ID=m00001;Notes=RNAse A Protein diff -r 6e7e20cb1fc7 -r 4f4b413056f6 test-data/nonprotein.gff --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/nonprotein.gff Mon Jun 05 02:44:12 2023 +0000 @@ -0,0 +1,2 @@ +##gff-version 3 +PhageBob blastp match_part 300 349 1e-40 + . ID=m00001;Notes=RNAse A Protein; diff -r 6e7e20cb1fc7 -r 4f4b413056f6 test-data/parent.gff --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/parent.gff Mon Jun 05 02:44:12 2023 +0000 @@ -0,0 +1,3 @@ +#gff-version 3 +PhageBob maker cds 300 500 . + . ID=gene42 +PhageBob maker cds 300 500 . + . Parent=gene42;ID=cds42 diff -r 6e7e20cb1fc7 -r 4f4b413056f6 test-data/proteins.gff --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/proteins.gff Mon Jun 05 02:44:12 2023 +0000 @@ -0,0 +1,2 @@ +##gff-version 3 +PhageBob blastp match_part 300 449 1e-40 + . ID=m00001;Notes=RNAse A Protein;