Mercurial > repos > cpt > cpt_disruptin_table
changeset 1:a99be535e99d draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:41:05 +0000 |
parents | f3fc78cc4c43 |
children | 6404df40e420 |
files | Disruptin_hydrophobicity_helicity_table_package.py Disruptin_hydrophobicity_helicity_table_package.xml cpt-macros.xml cpt_disruptin_table/Disruptin_hydrophobicity_helicity_table_package.py cpt_disruptin_table/Disruptin_hydrophobicity_helicity_table_package.xml cpt_disruptin_table/cpt-macros.xml cpt_disruptin_table/macros.xml macros.xml |
diffstat | 8 files changed, 343 insertions(+), 293 deletions(-) [+] |
line wrap: on
line diff
# File: Disruptin_hydrophobicity_helicity_table_package.py
"""
Build the per-residue output table for the disruptin finder workflow.

For every record of a FASTA file the script prints the record id followed by
five tab-separated rows: residue position, residue letter, residue class
symbol, Kyte-Doolittle hydrophobicity and predicted secondary structure.  The
secondary structure is read from a tab-separated file derived from garnier
output.
"""
import argparse
import csv
import sys

from Bio import SeqIO
from Bio.SeqUtils import ProtParamData
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# Width (in residues) of the sliding window used for the hydrophobicity
# profile.
_KD_WINDOW = 9

# A row of the garnier table is treated as a structure row when its first
# character is one of these letters.  A set (rather than the string "HETC")
# keeps an empty first field from matching.
_STRUCTURE_STATES = frozenset("HETC")

# Residue letters grouped by the symbol printed in the "Charge" row:
# "+" for R/K, "-" for D/E, and "N" / "P" for the two uncharged groups
# (presumably non-polar and polar -- confirm with the workflow authors).
_RESIDUE_CLASSES = (
    ("RK", "+"),
    ("DE", "-"),
    ("AVMILPWFG", "N"),
    ("HSYTCQN", "P"),
)
_RESIDUE_SYMBOL = {
    letter: symbol for letters, symbol in _RESIDUE_CLASSES for letter in letters
}

# Printed for any residue that is not one of the 20 letters listed above
# (e.g. X, B, Z, U or "*"), so such residues never inherit a neighbour's class.
_UNKNOWN_SYMBOL = "?"


def _read_secondary_structures(garnier_file):
    """Return the structure strings found in *garnier_file*, in file order.

    The file is read as tab-separated text.  Rows whose first character is
    H, E, T or C are kept, with all of their fields concatenated into one
    string; every other row (blank lines, "Sequence: " headers, ...) is
    skipped.  Line endings are stripped by the csv reader, so they never end
    up inside a structure string.
    """
    reader = csv.reader(garnier_file, delimiter="\t", quoting=csv.QUOTE_NONE)
    return [
        "".join(fields)
        for fields in reader
        if fields and fields[0][:1] in _STRUCTURE_STATES
    ]


def _hydrophobicity_profile(sequence):
    """Return one Kyte-Doolittle hydrophobicity value per residue.

    The scale is averaged over a window of ``_KD_WINDOW`` residues, which
    yields ``len(sequence) - _KD_WINDOW + 1`` values.  The result is padded
    with zeros on both ends so that each value lines up with the residue at
    the centre of its window.  Sequences shorter than the window get a row of
    zeros of their own length, so the row never has more columns than the
    sequence has residues.
    """
    if len(sequence) < _KD_WINDOW:
        return [0.0] * len(sequence)
    scores = ProteinAnalysis(sequence).protein_scale(ProtParamData.kd, _KD_WINDOW)
    padding = [0.0] * (_KD_WINDOW // 2)
    return padding + scores + padding


def disruptin_table(garnier_file, fasta_file):
    """Collect the per-residue table columns for every FASTA record.

    Structure rows and FASTA records are paired purely by order: the n-th
    structure row of *garnier_file* is used for the n-th record of
    *fasta_file*.

    Args:
        garnier_file: open text handle (or any iterable of lines) holding the
            tab-separated garnier table.
        fasta_file: FASTA input accepted by ``Bio.SeqIO.parse``.

    Returns:
        A 6-tuple of parallel lists with one entry per FASTA record:
        record ids, positions (1-based, as strings), residue letters, class
        symbols, hydrophobicity values and secondary-structure letters.
        Every entry except the id is itself a list with one item per residue.

    Raises:
        IndexError: if a record has no structure row, or its structure row is
            shorter than its sequence.  The exception type matches what the
            unchecked indexing raised before; only the message is new.
    """
    structures = _read_secondary_structures(garnier_file)

    record, p, r, c, h, s = [], [], [], [], [], []
    for record_number, rec in enumerate(SeqIO.parse(fasta_file, "fasta")):
        sequence = str(rec.seq)

        if record_number >= len(structures):
            raise IndexError(
                f"no secondary structure row for record {rec.id!r} "
                f"(FASTA record {record_number + 1}, "
                f"{len(structures)} structure rows found)"
            )
        structure = structures[record_number]
        if len(structure) < len(sequence):
            raise IndexError(
                f"secondary structure for record {rec.id!r} covers "
                f"{len(structure)} residues but the sequence has {len(sequence)}"
            )

        record.append(rec.id)
        p.append([str(number) for number in range(1, len(sequence) + 1)])
        r.append(list(sequence))
        # Classify case-insensitively so lower-case FASTA input is handled.
        c.append(
            [_RESIDUE_SYMBOL.get(aa.upper(), _UNKNOWN_SYMBOL) for aa in sequence]
        )
        h.append(_hydrophobicity_profile(sequence))
        # Structure letters beyond the end of the sequence are ignored.
        s.append(list(structure[: len(sequence)]))

    return record, p, r, c, h, s


def main():
    """Parse the command line and write the table to standard output."""
    parser = argparse.ArgumentParser(description="Disruptin Table Output")
    parser.add_argument(
        "garnier_file", type=argparse.FileType("r"), help="csv file from garnier reader"
    )
    parser.add_argument(
        "fasta_file",
        type=argparse.FileType("r"),
        help="fasta file of disruptin candidates",
    )
    args = parser.parse_args()

    # Close both input handles as soon as the table has been built.
    with args.garnier_file, args.fasta_file:
        iden, position, residue, charge, hydro, struct = disruptin_table(
            args.garnier_file, args.fasta_file
        )

    # NOTE: the row labels (including their irregular spacing around the
    # tab) are emitted unchanged because downstream consumers of this table
    # may match on them.
    lines = []
    for name, pos, res, chg, hyd, sec in zip(
        iden, position, residue, charge, hydro, struct
    ):
        lines.append(str(name))
        lines.append("Position \t " + "\t".join(pos))
        lines.append("Residue \t" + "\t".join(res))
        lines.append("Charge \t" + "\t".join(chg))
        lines.append("Hydrophobicity \t" + "\t".join(format(x, ".3f") for x in hyd))
        lines.append("Secondary Structure \t" + "\t".join(sec))

    if lines:
        sys.stdout.write("\n".join(lines) + "\n")


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Disruptin_hydrophobicity_helicity_table_package.xml Mon Jun 05 02:41:05 2023 +0000 @@ -0,0 +1,32 @@ +<tool id="edu.tamu.cpt2.phage.disruptin_table" name="Disruptin Table Output" version="1.0"> + <description>makes table of disruptin candidates</description> + <macros> + <import>macros.xml</import> + <import>cpt-macros.xml</import> + </macros> + <expand macro="requirements"/> + <command detect_errors="aggressive"><![CDATA[ +python '$__tool_directory__/Disruptin_hydrophobicity_helicity_table_package.py' +'$garnier_file' +'$fasta_file' + + + +>$output]]></command> + <inputs> + <param label="Garnier csv file" name="garnier_file" type="data" format="tabular"/> + <param label="Candidate fasta file" name="fasta_file" type="data" format="fasta"/> + </inputs> + <outputs> + <data format="tabular" name="output"/> + </outputs> + <help><![CDATA[ +**What it does** +This program takes the parsed output from the garnier tool and the fasta file with disruptin candidate sequences +and compiles information on each of the sequences into a table format. The table includes the sequence and the position for each residue +as well as the charge, hydrophobicity (based on the Kyte Doolittle scale), and secondary structure prediction from +the garnier tool. + + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt-macros.xml Mon Jun 05 02:41:05 2023 +0000 @@ -0,0 +1,115 @@ +<macros> + <xml name="gff_requirements"> + <requirements> + <requirement type="package" version="2.7">python</requirement> + <requirement type="package" version="1.65">biopython</requirement> + <requirement type="package" version="2.12.1">requests</requirement> + <requirement type="package" version="1.2.2">cpt_gffparser</requirement> + <yield/> + </requirements> + <version_command> + <![CDATA[ + cd '$__tool_directory__' && git rev-parse HEAD + ]]> + </version_command> + </xml> + <xml name="citation/mijalisrasche"> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex">@unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + </xml> + <xml name="citations"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-crr"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Ross}, + title = {CPT Galaxy Tools}, + year = {2020-}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. 
Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020-AJC-solo"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-clm"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="sl-citations-clm"> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </xml> +</macros>
--- a/cpt_disruptin_table/Disruptin_hydrophobicity_helicity_table_package.py Fri Jun 17 12:33:22 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,122 +0,0 @@ -""" -This program is intended to create the output table for the disruptin finder workflow -""" -from Bio import SeqIO -from Bio.SeqUtils.ProtParam import ProteinAnalysis -from Bio.SeqUtils import ProtParamData -import csv -import argparse -import sys - - -def disruptin_table(garnier_file, fasta_file): - # Iterable variables - position = 1 - net_charge = 0 - charge_res = 0 - record_number = 0 - - # loop structures - names = [] - sec_struct = [] - - # reading the lines from the garnier csv file -# with open(garnier_file,'r') as csvfile: -# garnierreader = csv.reader(csvfile) - for row in garnier_file: - if row[0] == 'Sequence: ': - names += [row[1]] - elif row[0] in 'HETC': - row = row.split('\t') - sec_struct += [''.join(row)] - - record = [] - p = [] - r = [] - c = [] - h = [] - s = [] - - # Parse the .fasta file and get the sequence - for rec in SeqIO.parse(fasta_file, "fasta"): - sequence = str(rec.seq) - - # Set up the information vectors: for position #, residue, hydrophobic/charge/polar/nonpolar, and secondary - # structure - record += [rec.id] - position_vec = [] - residue_vec = [] - charge_sym_vec = [] - sec_struct_vec = [] - - for aa in sequence: - position_vec += [str(position)] - residue_vec += [str(aa)] - sec_struct_vec += [str(sec_struct[record_number][position - 1])] - - # For R and K residues a positive charge is given - if aa in "RK": - symbol = "+" - # For D and E residues a negative charge is given - elif aa in "DE": - symbol = "-" - elif aa in "AVMILPWFG": - symbol = "N" - elif aa in "HSYTCQN": - symbol = "P" - charge_sym_vec += symbol - position += 1 - - # Calculating hyrophobicity based on Kyte and Doolittle scale. Using binning value of 9. 
Since the binning - # is 9, the first 4 residues and last 4 residues as set blank so as to center the values to their - # approximate position on the sequence. - prot_ana_seq = ProteinAnalysis(sequence) - hydro = [0] * 4 + prot_ana_seq.protein_scale(ProtParamData.kd, 9) + [0] * 4 - - record_number += 1 - position = 1 - - p += [position_vec] - r += [residue_vec] - c += [charge_sym_vec] - h += [hydro] - s += [sec_struct_vec] - - # returns values for name of the sequence - return record, p, r, c, h, s - - -if __name__ == "__main__": - # Grab all of the filters from our plugin loader - parser = argparse.ArgumentParser(description="Disruptin Table Output") - parser.add_argument( - "garnier_file", type=argparse.FileType("r"), help="csv file from garnier reader" - ) - parser.add_argument( - "fasta_file", - type=argparse.FileType("r"), - help="fasta file of disruptin candidates", - ) - args = parser.parse_args() - - # Set up output location -# f = open(sys.stdout, 'w', newline='') -# writer1 = csv.writer(f) - - iden, position, residue, charge, hydro, struct = disruptin_table(**vars(args)) - - for i in range(len(iden)): -# writer1.writerow(['Protein ID']+[iden[i]]) -# writer1.writerow(['Position'] + [format(x, 's') for x in position[i]]) -# writer1.writerow(['Residue'] + [format(x, 's') for x in residue[i]]) -# writer1.writerow(['Charge'] + [format(x, 's') for x in charge[i]]) -# writer1.writerow(['Hydrophobicity'] + [format(x, '.3f') for x in hydro[i]]) -# writer1.writerow(['Secondary Structure'] + [format(x, 's') for x in struct[i]]) -# writer1.writerow(['']) - - print(str(iden[i])) - print("Position \t " + "\t".join(position[i])) - print("Residue \t" + "\t".join(residue[i])) - print("Charge \t" + "\t".join(charge[i])) - print("Hydrophobicity \t" + "\t".join(format(x, ".3f") for x in hydro[i])) - print("Secondary Structure \t" + "\t".join(struct[i]))
--- a/cpt_disruptin_table/Disruptin_hydrophobicity_helicity_table_package.xml Fri Jun 17 12:33:22 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -<?xml version="1.0"?> -<tool id="edu.tamu.cpt2.phage.disruptin_table" name="Disruptin Table Output" version="1.0"> - <description>makes table of disruptin candidates</description> - <macros> - <import>macros.xml</import> - <import>cpt-macros.xml</import> - </macros> - <expand macro="requirements"/> - <command detect_errors="aggressive"><![CDATA[ -python $__tool_directory__/Disruptin_hydrophobicity_helicity_table_package.py -$garnier_file -$fasta_file - - - ->$output]]></command> - <inputs> - <param label="Garnier csv file" name="garnier_file" type="data" format="tabular" /> - <param label="Candidate fasta file" name="fasta_file" type="data" format="fasta" /> - </inputs> - <outputs> - <data format="tabular" name="output"/> - </outputs> - <help><![CDATA[ -**What it does** -This program takes the parsed output from the garnier tool and the fasta file with disruptin candidate sequences -and compiles information on each of the sequences into a table format. The table includes the sequence and the position for each residue -as well as the charge, hydrophobicity (based on the Kyte Doolittle scale), and secondary structure prediction from -the garnier tool. - - ]]></help> - <expand macro="citations" /> -</tool>
--- a/cpt_disruptin_table/cpt-macros.xml Fri Jun 17 12:33:22 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,115 +0,0 @@ -<?xml version="1.0"?> -<macros> - <xml name="gff_requirements"> - <requirements> - <requirement type="package" version="2.7">python</requirement> - <requirement type="package" version="1.65">biopython</requirement> - <requirement type="package" version="2.12.1">requests</requirement> - <yield/> - </requirements> - <version_command> - <![CDATA[ - cd $__tool_directory__ && git rev-parse HEAD - ]]> - </version_command> - </xml> - <xml name="citation/mijalisrasche"> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex">@unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - </xml> - <xml name="citations"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-crr"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Ross}, - title = {CPT Galaxy Tools}, - year = {2020-}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-2020"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {A. 
Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-2020-AJC-solo"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {A. Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-clm"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Maughmer}, - title = {CPT Galaxy Tools}, - year = {2017-2020}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="sl-citations-clm"> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Maughmer}, - title = {CPT Galaxy Tools}, - year = {2017-2020}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </xml> -</macros>
--- a/cpt_disruptin_table/macros.xml Fri Jun 17 12:33:22 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ -<?xml version="1.0"?> -<macros> - <xml name="requirements"> - <requirements> - <requirement type="package" version="3.8.13">python</requirement> - <requirement type="package" version="1.79">biopython</requirement> - <requirement type="package" version="1.2.2">cpt_gffparser</requirement> - <yield/> - </requirements> - </xml> - <xml name="genome_selector"> - <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/> - </xml> - <xml name="gff3_input"> - <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/> - </xml> - <token name="@GENOME_SELECTOR_PRE@"> - ln -s $genome_fasta genomeref.fa; - </token> - <token name="@GENOME_SELECTOR@"> - genomeref.fa - </token> -</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Mon Jun 05 02:41:05 2023 +0000 @@ -0,0 +1,74 @@ +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package">progressivemauve</requirement> + <!--<requirement type="package" version="2.7">python</requirement>--> + <requirement type="package" version="0.6.4">bcbiogff</requirement> + <yield/> + </requirements> + </xml> + <token name="@WRAPPER_VERSION@">2.4.0</token> + <xml name="citation/progressive_mauve"> + <citation type="doi">10.1371/journal.pone.0011147</citation> + </xml> + <xml name="citation/gepard"> + <citation type="doi">10.1093/bioinformatics/btm039</citation> + </xml> + <token name="@XMFA_INPUT@"> + '$xmfa' + </token> + <xml name="xmfa_input" token_formats="xmfa"> + <param type="data" format="@FORMATS@" name="xmfa" label="XMFA MSA"/> + </xml> + <token name="@XMFA_FA_INPUT@"> + '$sequences' + </token> + <xml name="xmfa_fa_input"> + <param type="data" format="fasta" name="sequences" label="Sequences in alignment" help="These sequences should be the SAME DATASET that was used in the progressiveMauve run. 
Failing that, they should be provided in the same order as in original progressiveMauve run"/> + </xml> + <xml name="genome_selector"> + <conditional name="reference_genome"> + <param name="reference_genome_source" type="select" label="Reference Genome"> + <option value="history" selected="True">From History</option> + <option value="cached">Locally Cached</option> + </param> + <when value="cached"> + <param name="fasta_indexes" type="select" label="Source FASTA Sequence"> + <options from_data_table="all_fasta"/> + </param> + </when> + <when value="history"> + <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/> + </when> + </conditional> + </xml> + <xml name="gff3_input"> + <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/> + </xml> + <xml name="input/gff3+fasta"> + <expand macro="gff3_input"/> + <expand macro="genome_selector"/> + </xml> + <token name="@INPUT_GFF@"> + '$gff3_data' + </token> + <token name="@INPUT_FASTA@"> + #if str($reference_genome.reference_genome_source) == 'cached': + '${reference_genome.fasta_indexes.fields.path}' + #else if str($reference_genome.reference_genome_source) == 'history': + genomeref.fa + #end if + </token> + <token name="@GENOME_SELECTOR_PRE@"> + #if $reference_genome.reference_genome_source == 'history': + ln -s '$reference_genome.genome_fasta' genomeref.fa; + #end if + </token> + <token name="@GENOME_SELECTOR@"> + #if str($reference_genome.reference_genome_source) == 'cached': + '${reference_genome.fasta_indexes.fields.path}' + #else if str($reference_genome.reference_genome_source) == 'history': + genomeref.fa + #end if + </token> +</macros>