Mercurial > repos > cpt > cpt_gbk_to_5col
changeset 0:66143811fe8a draft
Uploaded
author | cpt |
---|---|
date | Fri, 17 Jun 2022 12:45:08 +0000 |
parents | |
children | 1bdd481d5c25 |
files | cpt_gbk_to_5col/BIO_FIX_TOPO.py cpt_gbk_to_5col/cpt-macros.xml cpt_gbk_to_5col/gbk_to_five_col.py cpt_gbk_to_5col/gbk_to_five_col.xml cpt_gbk_to_5col/macros.xml cpt_gbk_to_5col/test-data/complex_feature_locs.gbk cpt_gbk_to_5col/test-data/gbkto5col.tsv |
diffstat | 7 files changed, 477 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# cpt_gbk_to_5col/BIO_FIX_TOPO.py
import Bio.GenBank


def record_end(self, content):
    """Clean up when we've finished the record.

    Installed at the bottom of this module as a replacement for
    ``Bio.GenBank._FeatureConsumer.record_end``.  It completes the record
    id, warns when the declared length disagrees with the parsed sequence,
    and stores the sequence on ``self.data.seq`` without selecting an
    alphabet.

    Args:
        self: The feature consumer; this function reads ``self.data``,
            ``self._seq_data`` and ``self._expected_size``.
        content: Unused; kept so the signature matches the replaced method.
    """
    from Bio.Seq import Seq, UnknownSeq

    # Try and append the version number to the accession for the full id
    if not self.data.id:
        assert "accessions" not in self.data.annotations, self.data.annotations[
            "accessions"
        ]
        self.data.id = self.data.name  # Good fall back?
    elif self.data.id.count(".") == 0:
        try:
            self.data.id += ".%i" % self.data.annotations["sequence_version"]
        except KeyError:
            # No version annotation available: keep the bare accession.
            pass

    # now set the sequence
    sequence = "".join(self._seq_data)

    if (
        self._expected_size is not None
        and len(sequence) != 0
        and self._expected_size != len(sequence)
    ):
        import warnings
        from Bio import BiopythonParserWarning

        warnings.warn(
            "Expected sequence length %i, found %i (%s)."
            % (self._expected_size, len(sequence), self.data.id),
            BiopythonParserWarning,
        )

    # NOTE: the alphabet selection that used to live here (it raised
    # ValueError "Could not determine alphabet for seq_type ..." for
    # unrecognised sequence types) and the topology annotation that followed
    # it are disabled in this replacement; sequences are stored without an
    # alphabet.

    # This is a plain module-level function, so a double-underscore attribute
    # name is NOT name-mangled here; ``self.__expected_size`` would raise
    # AttributeError for every record without sequence data.  Read the same
    # single-underscore attribute that the length check above uses.
    if not sequence and self._expected_size:
        self.data.seq = UnknownSeq(self._expected_size)
    else:
        self.data.seq = Seq(sequence)


Bio.GenBank._FeatureConsumer.record_end = record_end
<?xml version="1.0"?>
<!-- cpt_gbk_to_5col/cpt-macros.xml: shared requirement and citation macros. -->
<macros>
	<!-- Requirements block pinning python 2.7, biopython 1.65 and requests 2.12.1,
	     plus a version command that prints the git HEAD of the tool directory.
	     NOTE(review): gbk_to_five_col.xml expands "requirements" from macros.xml,
	     not this macro; confirm whether any tool still needs these pins, and
	     whether the tool directory is a git checkout when installed. -->
	<xml name="gff_requirements">
		<requirements>
			<requirement type="package" version="2.7">python</requirement>
			<requirement type="package" version="1.65">biopython</requirement>
			<requirement type="package" version="2.12.1">requests</requirement>
			<yield/>
		</requirements>
		<version_command>
		<![CDATA[
			cd $__tool_directory__ && git rev-parse HEAD
		]]>
		</version_command>
	</xml>
	<!-- Bare citation elements (no enclosing citations element) for Mijalis and Rasche. -->
	<xml name="citation/mijalisrasche">
		<citation type="doi">10.1371/journal.pcbi.1008214</citation>
		<citation type="bibtex">@unpublished{galaxyTools,
		author = {E. Mijalis, H. Rasche},
		title = {CPT Galaxy Tools},
		year = {2013-2017},
		note = {https://github.com/tamu-cpt/galaxy-tools/}
		}
		</citation>
	</xml>
	<!-- Default citations block: DOI plus the Mijalis/Rasche entry. -->
	<xml name="citations">
		<citations>
			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
			<citation type="bibtex">
			@unpublished{galaxyTools,
				author = {E. Mijalis, H. Rasche},
				title = {CPT Galaxy Tools},
				year = {2013-2017},
				note = {https://github.com/tamu-cpt/galaxy-tools/}
			}
			</citation>
			<yield/>
		</citations>
	</xml>
	<!-- Citations block: DOI plus the C. Ross entry. -->
	<xml name="citations-crr">
		<citations>
			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
			<citation type="bibtex">
			@unpublished{galaxyTools,
				author = {C. Ross},
				title = {CPT Galaxy Tools},
				year = {2020-},
				note = {https://github.com/tamu-cpt/galaxy-tools/}
			}
			</citation>
			<yield/>
		</citations>
	</xml>
	<!-- Citations block: DOI plus the Mijalis/Rasche and A. Criscione entries. -->
	<xml name="citations-2020">
		<citations>
			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
			<citation type="bibtex">
			@unpublished{galaxyTools,
				author = {E. Mijalis, H. Rasche},
				title = {CPT Galaxy Tools},
				year = {2013-2017},
				note = {https://github.com/tamu-cpt/galaxy-tools/}
			}
			</citation>
			<citation type="bibtex">
			@unpublished{galaxyTools,
				author = {A. Criscione},
				title = {CPT Galaxy Tools},
				year = {2019-2021},
				note = {https://github.com/tamu-cpt/galaxy-tools/}
			}
			</citation>
			<yield/>
		</citations>
	</xml>
	<!-- Citations block: DOI plus the A. Criscione entry only. -->
	<xml name="citations-2020-AJC-solo">
		<citations>
			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
			<citation type="bibtex">
			@unpublished{galaxyTools,
				author = {A. Criscione},
				title = {CPT Galaxy Tools},
				year = {2019-2021},
				note = {https://github.com/tamu-cpt/galaxy-tools/}
			}
			</citation>
			<yield/>
		</citations>
	</xml>
	<!-- Citations block: DOI plus the C. Maughmer entry. -->
	<xml name="citations-clm">
		<citations>
			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
			<citation type="bibtex">
			@unpublished{galaxyTools,
				author = {C. Maughmer},
				title = {CPT Galaxy Tools},
				year = {2017-2020},
				note = {https://github.com/tamu-cpt/galaxy-tools/}
			}
			</citation>
			<yield/>
		</citations>
	</xml>
	<!-- Single bare C. Maughmer citation element, without a citations wrapper or DOI. -->
	<xml name="sl-citations-clm">
		<citation type="bibtex">
		@unpublished{galaxyTools,
			author = {C. Maughmer},
			title = {CPT Galaxy Tools},
			year = {2017-2020},
			note = {https://github.com/tamu-cpt/galaxy-tools/}
		}
		</citation>
		<yield/>
	</xml>
</macros>
#!/usr/bin/env python
# cpt_gbk_to_5col/gbk_to_five_col.py
import BIO_FIX_TOPO  # NOQA
import argparse
import logging
from Bio import SeqIO

logging.basicConfig(level=logging.INFO)
log = logging.getLogger()


# Read in Genbank file and parse features
# Output features into Five Column format

"""
>Feature SeqID
Line 1
    Column 1: Start location (first nucleotide) of a feature
    Column 2: Stop location (last nucleotide) of a feature
    Column 3: Feature name (for example, 'CDS' or 'mRNA' or 'rRNA' or 'gene' or 'exon')
Line2:
    Column 4: Qualifier name (for example, 'product' or 'number' or 'gene' or 'note')
    Column 5: Qualifier value

Repeat for each feature in a seq
Repeat Line 2 for each qualifier in a feature
"""


def gbk_to_5col(genbank):
    """Convert a GenBank file to BankIt five column format on stdout.

    For every record a ``>Feature <record id>`` header is printed, followed
    by one line per location part of each feature and one line per qualifier
    value.  ``source`` features are skipped.

    Coordinates are printed 1-based and inclusive.  For parts on the
    reverse strand the larger coordinate is printed first, so the first
    column is always the first nucleotide of the feature part.

    Args:
        genbank: Path or open text handle accepted by ``SeqIO.parse``.
    """
    for record in SeqIO.parse(genbank, "genbank"):
        print(">Feature %s" % record.id)
        for feature in record.features:
            if feature.type == "source":
                continue

            for index, part in enumerate(feature.location.parts):
                # A part may carry no strand at all (``None``); comparing
                # None with 0 raises TypeError on Python 3, so treat it as
                # forward.  Every other strand value behaves as before.
                if part.strand is None or part.strand > 0:
                    start = int(part.start) + 1
                    end = int(part.end)
                else:
                    start = int(part.end)
                    end = int(part.start) + 1

                if index == 0:
                    # Only the first part of a feature carries its name.
                    print("%d\t%d\t%s" % (start, end, feature.type))
                else:
                    print("%d\t%d" % (start, end))

            for qualifier, values in feature.qualifiers.items():
                for value in values:
                    # Three leading tabs place the qualifier in column 4.
                    print("\t\t\t%s\t%s" % (qualifier, value))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Convert a Genbank file into five column format"
    )
    parser.add_argument("genbank", type=argparse.FileType("r"), help="Genbank file")

    args = vars(parser.parse_args())
    gbk_to_5col(**args)
<?xml version="1.0"?>
<!-- cpt_gbk_to_5col/gbk_to_five_col.xml: Galaxy wrapper for gbk_to_five_col.py. -->
<tool id="edu.tamu.cpt.genbank.GBKtoFiveCol" name="Genbank to Five Column Format" version="1.0">
	<description></description>
	<macros>
		<import>macros.xml</import>
		<import>cpt-macros.xml</import>
	</macros>
	<expand macro="requirements"/>
	<command detect_errors="aggressive"><![CDATA[
python $__tool_directory__/gbk_to_five_col.py
	"$file"

> "$output"

]]></command>
	<inputs>
		<param label="GenBank file" name="file" type="data" format="genbank" />
	</inputs>
	<outputs>
		<data format="tabular" name="output">
		</data>
	</outputs>
	<tests>
		<test>
			<param name="file" value="complex_feature_locs.gbk" />
			<output name="output" value="gbkto5col.tsv" />
		</test>
	</tests>
	<help>
Genbank Format to Five Column Format
====================================

Output format is:

>Feature ID
Line 1
- Column 1: Start location (first nucleotide) of a feature
- Column 2: Stop location (last nucleotide) of a feature
- Column 3: Feature name (for example, 'CDS' or 'mRNA' or 'rRNA' or 'gene' or 'exon')

Line2:
- Column 4: Qualifier name (for example, 'product' or 'number' or 'gene' or 'note')
- Column 5: Qualifier value

Coordinates are 1-based and inclusive. For features on the complement strand
the larger coordinate is listed first. ``source`` features are not written to
the output.

Example Output::

    >Feature contig00077
    11652	11327	CDS
    11327	11159
    			note	tapemeasure frameshift chaperone
    			product	P2 E' tapemeasure frameshift chaperone
    			translation	MNPIQSDAAAPDLQADAAAIATPAQDDPATHTLDTPLVRGTQTITSITLRKPKSGELRGVSLSDLVSLDVVALSKVLPRISSPMLTEADVASIDPADLVQLGGIFAGFFDAEGREIPTGLPDRVEDPMADIATVFGWTPPVMDAFSLAELMDWRERARVRAGAQ
    			gene	gp14
    11900	11600	CDS
    11600	11409
    11910	11905	RBS

</help>
	<expand macro="citations" />
</tool>
<?xml version="1.0"?>
<!-- cpt_gbk_to_5col/macros.xml: requirements and reusable parameter macros. -->
<macros>
	<!-- Runtime packages for the tool: python 3.8.13, biopython 1.79, cpt_gffparser 1.2.2. -->
	<xml name="requirements">
		<requirements>
			<requirement type="package" version="3.8.13">python</requirement>
			<requirement type="package" version="1.79">biopython</requirement>
			<requirement type="package" version="1.2.2">cpt_gffparser</requirement>
			<yield/>
		</requirements>
	</xml>
	<!-- Repeatable select whose options are read from a .loc file (two columns: name, value). -->
	<xml name="ldap_ref"
		token_name="dn_ref"
		token_label="Pick a DN"
		token_fromfile="ldap_people.loc">
		<repeat name="repeat_@NAME@" title="@LABEL@">
			<param name="@NAME@" label="Select a @LABEL@" type="select">
				<options from_file="@FROMFILE@">
					<column name="name" index="0"/>
					<column name="value" index="1"/>
				</options>
			</param>
		</repeat>
	</xml>
	<!-- Same select as ldap_ref, without the enclosing repeat. -->
	<xml name="ldap_ref_single"
		token_name="dn_ref"
		token_label="Pick a DN"
		token_fromfile="ldap_people.loc">
		<param name="@NAME@" label="Select a @LABEL@" type="select">
			<options from_file="@FROMFILE@">
				<column name="name" index="0"/>
				<column name="value" index="1"/>
			</options>
		</param>
	</xml>
	<!-- Select list of GenBank feature types; CDS is selected by default.
	     The "optional" attribute references the token defined by token_optional
	     (@OPTIONAL@); it was previously written as an unterminated, misnamed
	     token and was never substituted.
	     NOTE(review): the param name is hard-coded to "feature_type", so
	     token_name is currently unused; confirm before wiring it in. -->
	<xml name="gbk_feature_type"
		token_label="Feature type to remove"
		token_multiple="True"
		token_optional="False"
		token_name="positional_2">
		<param label="@LABEL@" optional="@OPTIONAL@" multiple="@MULTIPLE@" name="feature_type" type="select">
			<option value="-10_signal">-10_signal</option>
			<option value="-35_signal">-35_signal</option>
			<option value="3'UTR">3'UTR</option>
			<option value="5'UTR">5'UTR</option>
			<option value="CAAT_signal">CAAT_signal</option>
			<option selected="true" value="CDS">CDS</option>
			<option value="C_region">C_region</option>
			<option value="D-loop">D-loop</option>
			<option value="D_segment">D_segment</option>
			<option value="GC_signal">GC_signal</option>
			<option value="J_segment">J_segment</option>
			<option value="LTR">LTR</option>
			<option value="N_region">N_region</option>
			<option value="RBS">RBS</option>
			<option value="STS">STS</option>
			<option value="S_region">S_region</option>
			<option value="TATA_signal">TATA_signal</option>
			<option value="V_region">V_region</option>
			<option value="V_segment">V_segment</option>
			<option value="all">all</option>
			<option value="assembly_gap">assembly_gap</option>
			<option value="attenuator">attenuator</option>
			<option value="enhancer">enhancer</option>
			<option value="exon">exon</option>
			<option value="gap">gap</option>
			<option value="gene">gene</option>
			<option value="iDNA">iDNA</option>
			<option value="intron">intron</option>
			<option value="mRNA">mRNA</option>
			<option value="mat_peptide">mat_peptide</option>
			<option value="misc_RNA">misc_RNA</option>
			<option value="misc_binding">misc_binding</option>
			<option value="misc_difference">misc_difference</option>
			<option value="misc_feature">misc_feature</option>
			<option value="misc_recomb">misc_recomb</option>
			<option value="misc_signal">misc_signal</option>
			<option value="misc_structure">misc_structure</option>
			<option value="mobile_element">mobile_element</option>
			<option value="modified_base">modified_base</option>
			<option value="ncRNA">ncRNA</option>
			<option value="old_sequence">old_sequence</option>
			<option value="operon">operon</option>
			<option value="oriT">oriT</option>
			<option value="polyA_signal">polyA_signal</option>
			<option value="polyA_site">polyA_site</option>
			<option value="precursor_RNA">precursor_RNA</option>
			<option value="prim_transcript">prim_transcript</option>
			<option value="primer_bind">primer_bind</option>
			<option value="promoter">promoter</option>
			<option value="protein_bind">protein_bind</option>
			<option value="rRNA">rRNA</option>
			<option value="rep_origin">rep_origin</option>
			<option value="repeat_region">repeat_region</option>
			<option value="sig_peptide">sig_peptide</option>
			<option value="source">source</option>
			<option value="stem_loop">stem_loop</option>
			<option value="tRNA">tRNA</option>
			<option value="terminator">terminator</option>
			<option value="tmRNA">tmRNA</option>
			<option value="transit_peptide">transit_peptide</option>
			<option value="unsure">unsure</option>
			<option value="variation">variation</option>
		</param>
	</xml>
</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_gbk_to_5col/test-data/complex_feature_locs.gbk Fri Jun 17 12:45:08 2022 +0000 @@ -0,0 +1,38 @@ +LOCUS contig00077 300 bp DNA linear 15-MAR-2010 +DEFINITION '[length=22956]' '[numreads=4517 from AU1189;454 Data]'. +ACCESSION +VERSION +KEYWORDS . +SOURCE AU1189 + ORGANISM AU1189 + Unclassified. +REFERENCE 1 (bases 1 to 22956) + AUTHORS Duarte,I. + TITLE contig77 + JOURNAL Unpublished +REFERENCE 2 (bases 1 to 22956) + AUTHORS Duarte,I. + TITLE Direct Submission + JOURNAL Submitted (15-MAR-2010) PLPM, Texas A&M University, 2132 TAMU, + College Station, TX 77840, USA +FEATURES Location/Qualifiers + source 1..22956 + /organism="AU1189" + /mol_type="genomic DNA" + CDS complement(join(11159..11327,11327..11652)) + /note="tapemeasure frameshift chaperone" + /product="P2 E' tapemeasure frameshift chaperone" + /translation="MNPIQSDAAAPDLQADAAAIATPAQDDPATHTLDTPLVRGTQTITSITLRKPKSGELRGV + SLSDLVSLDVVALSKVLPRISSPMLTEADVASIDPADLVQLGGIFAGFFDAEGREIPTGL + PDRVEDPMADIATVFGWTPPVMDAFSLAELMDWRERARVRAGAQ" + /gene="gp14" + CDS complement(join(11409..11600,11600..11900)) + RBS complement(11905..11910) +BASE COUNT 3240 a 7606 c 8254 g 3856 t +ORIGIN + 1 agccgggcgc gccaagcctg atcaggctct cagcggtttc ctcccatcgt cgtgcagtac + 61 cgttgcagct aaattgcagc cggaatcggc gcgggctcgg ccgtcagcgg cgcgacccat + 121 tgcgccagat gcgcggccga cagatgcgcg taccgctgca ccatttccat cgtctcccag + 181 ccgcccagct ccttcagcac ctgcagcggc gtgccgcgtt ggacgtgcca gctcgcccag + 241 gtgtggcgca ggtcgtgcca gcggaaatcg tgcaggccgg cgcgccgcag cgccttggcc +//
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_gbk_to_5col/test-data/gbkto5col.tsv Fri Jun 17 12:45:08 2022 +0000 @@ -0,0 +1,10 @@ +>Feature contig00077 +11652 11327 CDS +11327 11159 + note tapemeasure frameshift chaperone + product P2 E' tapemeasure frameshift chaperone + translation MNPIQSDAAAPDLQADAAAIATPAQDDPATHTLDTPLVRGTQTITSITLRKPKSGELRGVSLSDLVSLDVVALSKVLPRISSPMLTEADVASIDPADLVQLGGIFAGFFDAEGREIPTGLPDRVEDPMADIATVFGWTPPVMDAFSLAELMDWRERARVRAGAQ + gene gp14 +11900 11600 CDS +11600 11409 +11910 11905 RBS