changeset 0:66143811fe8a draft

Uploaded
author cpt
date Fri, 17 Jun 2022 12:45:08 +0000
parents
children 1bdd481d5c25
files cpt_gbk_to_5col/BIO_FIX_TOPO.py cpt_gbk_to_5col/cpt-macros.xml cpt_gbk_to_5col/gbk_to_five_col.py cpt_gbk_to_5col/gbk_to_five_col.xml cpt_gbk_to_5col/macros.xml cpt_gbk_to_5col/test-data/complex_feature_locs.gbk cpt_gbk_to_5col/test-data/gbkto5col.tsv
diffstat 7 files changed, 477 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gbk_to_5col/BIO_FIX_TOPO.py	Fri Jun 17 12:45:08 2022 +0000
@@ -0,0 +1,85 @@
+import Bio.GenBank
+
+
+def record_end(self, content):
+    """Clean up when we've finished the record.
+    """
+    #from Bio import Alphabet
+    #from Bio.Alphabet import IUPAC
+    from Bio.Seq import Seq, UnknownSeq
+
+    # Try and append the version number to the accession for the full id
+    if not self.data.id:
+        assert "accessions" not in self.data.annotations, self.data.annotations[
+            "accessions"
+        ]
+        self.data.id = self.data.name  # Good fall back?
+    elif self.data.id.count(".") == 0:
+        try:
+            self.data.id += ".%i" % self.data.annotations["sequence_version"]
+        except KeyError:
+            pass
+
+    # add the sequence information
+    # first, determine the alphabet
+    # we default to an generic alphabet if we don't have a
+    # seq type or have strange sequence information.
+    
+    #seq_alphabet = Alphabet.generic_alphabet
+
+    # now set the sequence
+    sequence = "".join(self._seq_data)
+
+    if (
+        self._expected_size is not None
+        and len(sequence) != 0
+        and self._expected_size != len(sequence)
+    ):
+        import warnings
+        from Bio import BiopythonParserWarning
+
+        warnings.warn(
+            "Expected sequence length %i, found %i (%s)."
+            % (self._expected_size, len(sequence), self.data.id),
+            BiopythonParserWarning,
+        )
+    """
+    if self._seq_type:
+        # mRNA is really also DNA, since it is actually cDNA
+        if "DNA" in self._seq_type.upper() or "MRNA" in self._seq_type.upper():
+            seq_alphabet = IUPAC.ambiguous_dna
+        # are there ever really RNA sequences in GenBank?
+        elif "RNA" in self._seq_type.upper():
+            # Even for data which was from RNA, the sequence string
+            # is usually given as DNA (T not U).  Bug 2408
+            if "T" in sequence and "U" not in sequence:
+                seq_alphabet = IUPAC.ambiguous_dna
+            else:
+                seq_alphabet = IUPAC.ambiguous_rna
+        elif (
+            "PROTEIN" in self._seq_type.upper() or self._seq_type == "PRT"
+        ):  # PRT is used in EMBL-bank for patents
+            seq_alphabet = IUPAC.protein  # or extended protein?
+        # work around ugly GenBank records which have circular or
+        # linear but no indication of sequence type
+        elif self._seq_type in ["circular", "linear", "unspecified"]:
+            pass
+        # we have a bug if we get here
+        else:
+            raise ValueError(
+                "Could not determine alphabet for seq_type %s" % self._seq_type
+            )
+
+        # Also save the chomosome layout
+        if "circular" in self._seq_type.lower():
+            self.data.annotations["topology"] = "circular"
+        elif "linear" in self._seq_type.lower():
+            self.data.annotations["topology"] = "linear"
+    """
+    if not sequence and self.__expected_size:
+        self.data.seq = UnknownSeq(self._expected_size)#, seq_alphabet)
+    else:
+        self.data.seq = Seq(sequence)#, seq_alphabet)
+
+
+Bio.GenBank._FeatureConsumer.record_end = record_end  # monkey-patch applied when this module is imported
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gbk_to_5col/cpt-macros.xml	Fri Jun 17 12:45:08 2022 +0000
@@ -0,0 +1,115 @@
+<?xml version="1.0"?>
+<macros>
+	<xml name="gff_requirements">
+		<requirements>
+			<requirement type="package" version="2.7">python</requirement>
+			<requirement type="package" version="1.65">biopython</requirement>
+			<requirement type="package" version="2.12.1">requests</requirement>
+			<yield/>
+		</requirements>
+		<version_command>
+		<![CDATA[
+			cd $__tool_directory__ && git rev-parse HEAD
+		]]>
+		</version_command>
+	</xml>
+	<xml name="citation/mijalisrasche">
+		<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+		<citation type="bibtex">@unpublished{galaxyTools,
+		author = {E. Mijalis, H. Rasche},
+		title = {CPT Galaxy Tools},
+		year = {2013-2017},
+		note = {https://github.com/tamu-cpt/galaxy-tools/}
+		}
+		</citation>
+	</xml>
+	<xml name="citations">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis, H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation> 
+		<yield/>
+		</citations>
+	</xml>
+    	<xml name="citations-crr">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Ross},
+				title = {CPT Galaxy Tools},
+				year = {2020-},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+		<yield/>
+		</citations>
+	</xml>
+        <xml name="citations-2020">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis, H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+                        <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+                        </citation>
+                        <yield/>
+		</citations>
+	</xml>
+        <xml name="citations-2020-AJC-solo">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+                        <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+                        </citation>
+                        <yield/>
+		</citations>
+	</xml>
+        <xml name="citations-clm">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+                        <yield/>
+		</citations>
+	</xml>
+        <xml name="sl-citations-clm">
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+                        <yield/>
+	</xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gbk_to_5col/gbk_to_five_col.py	Fri Jun 17 12:45:08 2022 +0000
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+import BIO_FIX_TOPO  # NOQA
+import argparse
+import logging
+from Bio import SeqIO
+
+logging.basicConfig(level=logging.INFO)  # root logger reports INFO and above
+log = logging.getLogger()  # root logger; not referenced again in this script
+
+
+# Read in Genbank file and parse features
+# Output features into Five Column format
+
+"""
+>Feature SeqID
+Line 1
+    Column 1: Start location (first nucleotide) of a feature
+    Column 2: Stop location (last nucleotide) of a feature
+    Column 3: Feature name (for example, 'CDS' or 'mRNA' or 'rRNA' or 'gene' or 'exon')
+Line2:
+    Column 4: Qualifier name (for example, 'product' or 'number' or 'gene' or 'note')
+    Column 5: Qualifier value
+
+Repeat for each feature in a seq
+Repeat Line 2 for each qualifier in a feature
+"""
+
+
+def gbk_to_5col(genbank):
+    """Converts genbank to BankIt five column format"""
+    for record in SeqIO.parse(genbank, "genbank"):
+        print(">Feature %s" % record.id)
+        for feature in record.features:
+            if feature.type == "source":
+                continue
+            else:
+                for index, part in enumerate(feature.location.parts):
+                    if part.strand > 0:
+                        start = int(part.start) + 1
+                        end = int(part.end)
+                    else:
+                        start = int(part.end)
+                        end = int(part.start) + 1
+                    if index == 0:
+                        name = feature.type
+                        print("%d\t%d\t%s" % (start, end, name))
+                    else:
+                        print("%d\t%d" % (start, end))
+                for (qualifier, values) in feature.qualifiers.items():
+                    for value in values:
+                        print("\t\t\t%s\t%s" % (qualifier, value))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Convert a Genbank file into five column format"
+    )
+    parser.add_argument("genbank", type=argparse.FileType("r"), help="Genbank file")
+
+    args = vars(parser.parse_args())
+    gbk_to_5col(**args)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gbk_to_5col/gbk_to_five_col.xml	Fri Jun 17 12:45:08 2022 +0000
@@ -0,0 +1,63 @@
+<?xml version="1.0"?>
+<tool id="edu.tamu.cpt.genbank.GBKtoFiveCol" name="Genbank to Five Column Format" version="1.0">
+    <description></description>
+  <macros>
+    <import>macros.xml</import>
+		<import>cpt-macros.xml</import>
+  </macros>
+  <expand macro="requirements"/>
+  <command detect_errors="aggressive"><![CDATA[
+python $__tool_directory__/gbk_to_five_col.py
+  "$file"
+
+> "$output"
+
+]]></command>
+  <inputs>
+    <param label="GenBank file" name="file" type="data" format="genbank" />
+  </inputs>
+  <outputs>
+    <data format="tabular" name="output">
+    </data>
+  </outputs>
+  <tests>
+      <test>
+          <param name="file" value="complex_feature_locs.gbk" />
+          <output name="output" value="gbkto5col.tsv" />
+      </test>
+  </tests>
+  <help>
+Genbank Format to Five Column Format
+====================================
+
+Output format is:
+
+>Feature ID
+Line 1
+- Column 1: Start location (first nucleotide) of a feature
+- Column 2: Stop location (last nucleotide) of a feature
+- Column 3: Feature name (for example, 'CDS' or 'mRNA' or 'rRNA' or 'gene' or 'exon')
+
+Line2:
+- Column 4: Qualifier name (for example, 'product' or 'number' or 'gene' or 'note')
+- Column 5: Qualifier value
+
+Example Output::
+
+    >Feature contig00077
+    0	22956	source
+    			mol_type	genomic DNA
+    			organism	AU1189
+    11652	11326	CDS
+    11327	11158
+    			note	tapemeasure frameshift chaperone
+    			product	P2 E' tapemeasure frameshift chaperone
+    			gene	gp14
+    			translation	MNPIQSDAAAPDLQADAAAIATPAQDDPATHTLDTPLVRGTQTITSITLRKPKSGELRGVSLSDLVSLDVVALSKVLPRISSPMLTEADVASIDPADLVQLGGIFAGFFDAEGREIPTGLPDRVEDPMADIATVFGWTPPVMDAFSLAELMDWRERARVRAGAQ
+    11900	11599	CDS
+    11600	11408
+    11910	11904	RBS
+
+</help>
+		<expand macro="citations" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gbk_to_5col/macros.xml	Fri Jun 17 12:45:08 2022 +0000
@@ -0,0 +1,105 @@
+<?xml version="1.0"?>
+<macros>
+  <xml name="requirements">
+    <requirements>
+	<requirement type="package" version="3.8.13">python</requirement>
+	<requirement type="package" version="1.79">biopython</requirement>
+	<requirement type="package" version="1.2.2">cpt_gffparser</requirement>  
+	<yield/>
+    </requirements>
+  </xml>
+  <xml name="ldap_ref"
+    token_name="dn_ref"
+    token_label="Pick a DN"
+    token_fromfile="ldap_people.loc">
+        <repeat name="repeat_@NAME@" title="@LABEL@">
+          <param name="@NAME@" label="Select a @LABEL@" type="select">
+            <options from_file="@FROMFILE@">
+                <column name="name" index="0"/>
+                <column name="value" index="1"/>
+            </options>
+          </param>
+        </repeat>
+    </xml>
+  <xml name="ldap_ref_single"
+    token_name="dn_ref"
+    token_label="Pick a DN"
+    token_fromfile="ldap_people.loc">
+          <param name="@NAME@" label="Select a @LABEL@" type="select">
+            <options from_file="@FROMFILE@">
+                <column name="name" index="0"/>
+                <column name="value" index="1"/>
+            </options>
+          </param>
+    </xml>
+	<xml name="gbk_feature_type"
+		token_label="Feature type to remove"
+		token_multiple="True"
+		token_optional="False"
+		token_name="positional_2">
+    <param label="@LABEL@" optional="@OPTIONAL@" multiple="@MULTIPLE@" name="feature_type" type="select">
+      <option value="-10_signal">-10_signal</option>
+      <option value="-35_signal">-35_signal</option>
+      <option value="3'UTR">3'UTR</option>
+      <option value="5'UTR">5'UTR</option>
+      <option value="CAAT_signal">CAAT_signal</option>
+      <option selected="true" value="CDS">CDS</option>
+      <option value="C_region">C_region</option>
+      <option value="D-loop">D-loop</option>
+      <option value="D_segment">D_segment</option>
+      <option value="GC_signal">GC_signal</option>
+      <option value="J_segment">J_segment</option>
+      <option value="LTR">LTR</option>
+      <option value="N_region">N_region</option>
+      <option value="RBS">RBS</option>
+      <option value="STS">STS</option>
+      <option value="S_region">S_region</option>
+      <option value="TATA_signal">TATA_signal</option>
+      <option value="V_region">V_region</option>
+      <option value="V_segment">V_segment</option>
+      <option value="all">all</option>
+      <option value="assembly_gap">assembly_gap</option>
+      <option value="attenuator">attenuator</option>
+      <option value="enhancer">enhancer</option>
+      <option value="exon">exon</option>
+      <option value="gap">gap</option>
+      <option value="gene">gene</option>
+      <option value="iDNA">iDNA</option>
+      <option value="intron">intron</option>
+      <option value="mRNA">mRNA</option>
+      <option value="mat_peptide">mat_peptide</option>
+      <option value="misc_RNA">misc_RNA</option>
+      <option value="misc_binding">misc_binding</option>
+      <option value="misc_difference">misc_difference</option>
+      <option value="misc_feature">misc_feature</option>
+      <option value="misc_recomb">misc_recomb</option>
+      <option value="misc_signal">misc_signal</option>
+      <option value="misc_structure">misc_structure</option>
+      <option value="mobile_element">mobile_element</option>
+      <option value="modified_base">modified_base</option>
+      <option value="ncRNA">ncRNA</option>
+      <option value="old_sequence">old_sequence</option>
+      <option value="operon">operon</option>
+      <option value="oriT">oriT</option>
+      <option value="polyA_signal">polyA_signal</option>
+      <option value="polyA_site">polyA_site</option>
+      <option value="precursor_RNA">precursor_RNA</option>
+      <option value="prim_transcript">prim_transcript</option>
+      <option value="primer_bind">primer_bind</option>
+      <option value="promoter">promoter</option>
+      <option value="protein_bind">protein_bind</option>
+      <option value="rRNA">rRNA</option>
+      <option value="rep_origin">rep_origin</option>
+      <option value="repeat_region">repeat_region</option>
+      <option value="sig_peptide">sig_peptide</option>
+      <option value="source">source</option>
+      <option value="stem_loop">stem_loop</option>
+      <option value="tRNA">tRNA</option>
+      <option value="terminator">terminator</option>
+      <option value="tmRNA">tmRNA</option>
+      <option value="transit_peptide">transit_peptide</option>
+      <option value="unsure">unsure</option>
+      <option value="variation">variation</option>
+    </param>
+	</xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gbk_to_5col/test-data/complex_feature_locs.gbk	Fri Jun 17 12:45:08 2022 +0000
@@ -0,0 +1,38 @@
+LOCUS       contig00077              300 bp    DNA     linear       15-MAR-2010
+DEFINITION  '[length=22956]'   '[numreads=4517 from AU1189;454 Data]'.
+ACCESSION
+VERSION
+KEYWORDS    .
+SOURCE      AU1189
+  ORGANISM  AU1189
+            Unclassified.
+REFERENCE   1  (bases 1 to 22956)
+  AUTHORS   Duarte,I.
+  TITLE     contig77
+  JOURNAL   Unpublished
+REFERENCE   2  (bases 1 to 22956)
+  AUTHORS   Duarte,I.
+  TITLE     Direct Submission
+  JOURNAL   Submitted (15-MAR-2010) PLPM, Texas A&M University, 2132 TAMU,
+            College Station, TX 77840, USA
+FEATURES             Location/Qualifiers
+     source          1..22956
+                     /organism="AU1189"
+                     /mol_type="genomic DNA"
+     CDS             complement(join(11159..11327,11327..11652))
+                     /note="tapemeasure frameshift chaperone"
+                     /product="P2 E' tapemeasure frameshift chaperone"
+                     /translation="MNPIQSDAAAPDLQADAAAIATPAQDDPATHTLDTPLVRGTQTITSITLRKPKSGELRGV
+                     SLSDLVSLDVVALSKVLPRISSPMLTEADVASIDPADLVQLGGIFAGFFDAEGREIPTGL
+                     PDRVEDPMADIATVFGWTPPVMDAFSLAELMDWRERARVRAGAQ"
+                     /gene="gp14"
+     CDS             complement(join(11409..11600,11600..11900))
+     RBS             complement(11905..11910)
+BASE COUNT     3240 a   7606 c   8254 g   3856 t
+ORIGIN
+        1 agccgggcgc gccaagcctg atcaggctct cagcggtttc ctcccatcgt cgtgcagtac
+       61 cgttgcagct aaattgcagc cggaatcggc gcgggctcgg ccgtcagcgg cgcgacccat
+      121 tgcgccagat gcgcggccga cagatgcgcg taccgctgca ccatttccat cgtctcccag
+      181 ccgcccagct ccttcagcac ctgcagcggc gtgccgcgtt ggacgtgcca gctcgcccag
+      241 gtgtggcgca ggtcgtgcca gcggaaatcg tgcaggccgg cgcgccgcag cgccttggcc
+//
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_gbk_to_5col/test-data/gbkto5col.tsv	Fri Jun 17 12:45:08 2022 +0000
@@ -0,0 +1,10 @@
+>Feature contig00077
+11652	11327	CDS
+11327	11159
+			note	tapemeasure frameshift chaperone
+			product	P2 E' tapemeasure frameshift chaperone
+			translation	MNPIQSDAAAPDLQADAAAIATPAQDDPATHTLDTPLVRGTQTITSITLRKPKSGELRGVSLSDLVSLDVVALSKVLPRISSPMLTEADVASIDPADLVQLGGIFAGFFDAEGREIPTGLPDRVEDPMADIATVFGWTPPVMDAFSLAELMDWRERARVRAGAQ
+			gene	gp14
+11900	11600	CDS
+11600	11409
+11910	11905	RBS