# HG changeset patch
# User cpt
# Date 1655469908 0
# Node ID 66143811fe8aad6d5ae9b6e7f9d8757fde277aa5
Uploaded
diff -r 000000000000 -r 66143811fe8a cpt_gbk_to_5col/BIO_FIX_TOPO.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gbk_to_5col/BIO_FIX_TOPO.py Fri Jun 17 12:45:08 2022 +0000
@@ -0,0 +1,85 @@
+import Bio.GenBank
+
+
+def record_end(self, content):
+    """Finish the record: fill in ``self.data.id`` and ``self.data.seq``.
+
+    ``content`` is unused. The alphabet/topology logic further down is kept
+    inside a bare string literal, so it never executes."""
+    from Bio.Seq import Seq, UnknownSeq
+
+    # Try and append the version number to the accession for the full id
+    if not self.data.id:
+        assert "accessions" not in self.data.annotations, self.data.annotations[
+            "accessions"
+        ]
+        self.data.id = self.data.name  # Good fall back?
+    elif self.data.id.count(".") == 0:
+        try:
+            self.data.id += ".%i" % self.data.annotations["sequence_version"]
+        except KeyError:
+            pass
+
+    # Build the sequence from the chunks collected while parsing.
+    # No alphabet is selected here: that logic is disabled (it sits in
+    # the string literal further down), so the Seq objects below are
+    # constructed from the raw data only.
+    #
+    # NOTE(review): UnknownSeq is deprecated in newer Biopython
+    # releases; confirm the biopython version this tool pins still
+    # provides it.
+    sequence = "".join(self._seq_data)
+
+    if (
+        self._expected_size is not None
+        and len(sequence) != 0
+        and self._expected_size != len(sequence)
+    ):
+        import warnings
+        from Bio import BiopythonParserWarning
+
+        warnings.warn(
+            "Expected sequence length %i, found %i (%s)."
+            % (self._expected_size, len(sequence), self.data.id),
+            BiopythonParserWarning,
+        )
+    """
+    if self._seq_type:
+        # mRNA is really also DNA, since it is actually cDNA
+        if "DNA" in self._seq_type.upper() or "MRNA" in self._seq_type.upper():
+            seq_alphabet = IUPAC.ambiguous_dna
+        # are there ever really RNA sequences in GenBank?
+        elif "RNA" in self._seq_type.upper():
+            # Even for data which was from RNA, the sequence string
+            # is usually given as DNA (T not U). Bug 2408
+            if "T" in sequence and "U" not in sequence:
+                seq_alphabet = IUPAC.ambiguous_dna
+            else:
+                seq_alphabet = IUPAC.ambiguous_rna
+        elif (
+            "PROTEIN" in self._seq_type.upper() or self._seq_type == "PRT"
+        ):  # PRT is used in EMBL-bank for patents
+            seq_alphabet = IUPAC.protein  # or extended protein?
+        # work around ugly GenBank records which have circular or
+        # linear but no indication of sequence type
+        elif self._seq_type in ["circular", "linear", "unspecified"]:
+            pass
+        # we have a bug if we get here
+        else:
+            raise ValueError(
+                "Could not determine alphabet for seq_type %s" % self._seq_type
+            )
+
+        # Also save the chomosome layout
+        if "circular" in self._seq_type.lower():
+            self.data.annotations["topology"] = "circular"
+        elif "linear" in self._seq_type.lower():
+            self.data.annotations["topology"] = "linear"
+    """
+    if not sequence and self._expected_size:  # length known, no residues read
+        self.data.seq = UnknownSeq(self._expected_size)
+    else:
+        self.data.seq = Seq(sequence)
+
+
+Bio.GenBank._FeatureConsumer.record_end = record_end  # monkey-patch the consumer's hook
diff -r 000000000000 -r 66143811fe8a cpt_gbk_to_5col/cpt-macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gbk_to_5col/cpt-macros.xml Fri Jun 17 12:45:08 2022 +0000
@@ -0,0 +1,115 @@
+
+
+
+
+ python
+ biopython
+ requests
+
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Ross},
+ title = {CPT Galaxy Tools},
+ year = {2020-},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {E. Mijalis, H. Rasche},
+ title = {CPT Galaxy Tools},
+ year = {2013-2017},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {A. Criscione},
+ title = {CPT Galaxy Tools},
+ year = {2019-2021},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ 10.1371/journal.pcbi.1008214
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
+
+
+ @unpublished{galaxyTools,
+ author = {C. Maughmer},
+ title = {CPT Galaxy Tools},
+ year = {2017-2020},
+ note = {https://github.com/tamu-cpt/galaxy-tools/}
+ }
+
+
+
+
diff -r 000000000000 -r 66143811fe8a cpt_gbk_to_5col/gbk_to_five_col.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gbk_to_5col/gbk_to_five_col.py Fri Jun 17 12:45:08 2022 +0000
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+import BIO_FIX_TOPO # NOQA
+import argparse
+import logging
+from Bio import SeqIO
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger()
+
+
+# Read in Genbank file and parse features
+# Output features into Five Column format
+
+"""
+>Feature SeqID
+Line 1:
+ Column 1: Start location (first nucleotide) of a feature
+ Column 2: Stop location (last nucleotide) of a feature
+ Column 3: Feature name (for example, 'CDS' or 'mRNA' or 'rRNA' or 'gene' or 'exon')
+Line 2:
+ Column 4: Qualifier name (for example, 'product' or 'number' or 'gene' or 'note')
+ Column 5: Qualifier value
+
+Repeat for each feature in a seq
+Repeat Line 2 for each qualifier in a feature
+"""
+
+
+def gbk_to_5col(genbank):
+    """Print every non-source feature of a GenBank file in five column format.
+
+    ``genbank`` is handed straight to ``SeqIO.parse``. The parsed start gets +1;
+    parts that are not on the forward strand are written high-to-low. A strand
+    of None (unstranded) is treated as forward instead of raising TypeError.
+    """
+    for record in SeqIO.parse(genbank, "genbank"):
+        print(">Feature %s" % record.id)
+        for feature in record.features:
+            if feature.type == "source":
+                continue
+            for index, part in enumerate(feature.location.parts):
+                low, high = int(part.start) + 1, int(part.end)
+                # None cannot be ordered against 0 in Python 3, so test it first.
+                forward = part.strand is None or part.strand > 0
+                start, end = (low, high) if forward else (high, low)
+                # The feature type is only written on the feature's first part.
+                suffix = "\t%s" % feature.type if index == 0 else ""
+                print("%d\t%d%s" % (start, end, suffix))
+            for qualifier, values in feature.qualifiers.items():
+                for value in values:
+                    print("\t\t\t%s\t%s" % (qualifier, value))
+
+
+if __name__ == "__main__":
+    # Command-line entry point: one positional argument, the GenBank file.
+    cli = argparse.ArgumentParser(
+        description="Convert a Genbank file into five column format"
+    )
+    cli.add_argument("genbank", type=argparse.FileType("r"), help="Genbank file")
+    # The parsed namespace maps directly onto gbk_to_5col's keyword arguments.
+    gbk_to_5col(**vars(cli.parse_args()))
diff -r 000000000000 -r 66143811fe8a cpt_gbk_to_5col/gbk_to_five_col.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gbk_to_5col/gbk_to_five_col.xml Fri Jun 17 12:45:08 2022 +0000
@@ -0,0 +1,63 @@
+
+
+
+
+ macros.xml
+ cpt-macros.xml
+
+
+ "$output"
+
+]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Genbank Format to Five Column Format
+====================================
+
+Output format is:
+
+>Feature ID
+Line 1:
+- Column 1: Start location (first nucleotide) of a feature
+- Column 2: Stop location (last nucleotide) of a feature
+- Column 3: Feature name (for example, 'CDS' or 'mRNA' or 'rRNA' or 'gene' or 'exon')
+
+Line 2:
+- Column 4: Qualifier name (for example, 'product' or 'number' or 'gene' or 'note')
+- Column 5: Qualifier value
+
+Example Output::
+
+ >Feature contig00077
+ 0 22956 source
+ mol_type genomic DNA
+ organism AU1189
+ 11652 11327 CDS
+ 11327 11159
+ note tapemeasure frameshift chaperone
+ product P2 E' tapemeasure frameshift chaperone
+ gene gp14
+ translation MNPIQSDAAAPDLQADAAAIATPAQDDPATHTLDTPLVRGTQTITSITLRKPKSGELRGVSLSDLVSLDVVALSKVLPRISSPMLTEADVASIDPADLVQLGGIFAGFFDAEGREIPTGLPDRVEDPMADIATVFGWTPPVMDAFSLAELMDWRERARVRAGAQ
+ 11900 11600 CDS
+ 11600 11409
+ 11910 11905 RBS
+
+
+
+
diff -r 000000000000 -r 66143811fe8a cpt_gbk_to_5col/macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gbk_to_5col/macros.xml Fri Jun 17 12:45:08 2022 +0000
@@ -0,0 +1,105 @@
+
+
+
+
+ python
+ biopython
+ cpt_gffparser
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r 66143811fe8a cpt_gbk_to_5col/test-data/complex_feature_locs.gbk
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gbk_to_5col/test-data/complex_feature_locs.gbk Fri Jun 17 12:45:08 2022 +0000
@@ -0,0 +1,38 @@
+LOCUS contig00077 300 bp DNA linear 15-MAR-2010
+DEFINITION '[length=22956]' '[numreads=4517 from AU1189;454 Data]'.
+ACCESSION
+VERSION
+KEYWORDS .
+SOURCE AU1189
+ ORGANISM AU1189
+ Unclassified.
+REFERENCE 1 (bases 1 to 22956)
+ AUTHORS Duarte,I.
+ TITLE contig77
+ JOURNAL Unpublished
+REFERENCE 2 (bases 1 to 22956)
+ AUTHORS Duarte,I.
+ TITLE Direct Submission
+ JOURNAL Submitted (15-MAR-2010) PLPM, Texas A&M University, 2132 TAMU,
+ College Station, TX 77840, USA
+FEATURES Location/Qualifiers
+ source 1..22956
+ /organism="AU1189"
+ /mol_type="genomic DNA"
+ CDS complement(join(11159..11327,11327..11652))
+ /note="tapemeasure frameshift chaperone"
+ /product="P2 E' tapemeasure frameshift chaperone"
+ /translation="MNPIQSDAAAPDLQADAAAIATPAQDDPATHTLDTPLVRGTQTITSITLRKPKSGELRGV
+ SLSDLVSLDVVALSKVLPRISSPMLTEADVASIDPADLVQLGGIFAGFFDAEGREIPTGL
+ PDRVEDPMADIATVFGWTPPVMDAFSLAELMDWRERARVRAGAQ"
+ /gene="gp14"
+ CDS complement(join(11409..11600,11600..11900))
+ RBS complement(11905..11910)
+BASE COUNT 3240 a 7606 c 8254 g 3856 t
+ORIGIN
+ 1 agccgggcgc gccaagcctg atcaggctct cagcggtttc ctcccatcgt cgtgcagtac
+ 61 cgttgcagct aaattgcagc cggaatcggc gcgggctcgg ccgtcagcgg cgcgacccat
+ 121 tgcgccagat gcgcggccga cagatgcgcg taccgctgca ccatttccat cgtctcccag
+ 181 ccgcccagct ccttcagcac ctgcagcggc gtgccgcgtt ggacgtgcca gctcgcccag
+ 241 gtgtggcgca ggtcgtgcca gcggaaatcg tgcaggccgg cgcgccgcag cgccttggcc
+//
diff -r 000000000000 -r 66143811fe8a cpt_gbk_to_5col/test-data/gbkto5col.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gbk_to_5col/test-data/gbkto5col.tsv Fri Jun 17 12:45:08 2022 +0000
@@ -0,0 +1,10 @@
+>Feature contig00077
+11652 11327 CDS
+11327 11159
+ note tapemeasure frameshift chaperone
+ product P2 E' tapemeasure frameshift chaperone
+ translation MNPIQSDAAAPDLQADAAAIATPAQDDPATHTLDTPLVRGTQTITSITLRKPKSGELRGVSLSDLVSLDVVALSKVLPRISSPMLTEADVASIDPADLVQLGGIFAGFFDAEGREIPTGLPDRVEDPMADIATVFGWTPPVMDAFSLAELMDWRERARVRAGAQ
+ gene gp14
+11900 11600 CDS
+11600 11409
+11910 11905 RBS