view plotly_tabular_tool/plotlytabular.xml @ 4:e2d2b080bae3 draft default tip

Now refuses to create html for >5k rows. 10k rows will always crash firefox so seems wise to be restrictive. Advises to redo job and select png output...
author fubar
date Mon, 07 Aug 2023 06:06:35 +0000
parents 51a0c2e0fbdf
children
line wrap: on
line source

<tool name="plotlytabular" id="plotlytabular" version="3.0">
  <!--Source in git at: https://github.com/fubar2/galaxy_tf_overlay-->
  <!--Created by toolfactory@galaxy.org at 07/08/2023 15:43:09 using the Galaxy Tool Factory.-->
  <description>Plotly plot generator for Galaxy tabular data.</description>
  <requirements>
    <requirement version="1.5.3" type="package">pandas</requirement>
    <requirement version="5.9.0" type="package">plotly</requirement>
    <requirement version="0.2.1" type="package">python-kaleido</requirement>
  </requirements>
  <stdio>
    <exit_code range="1:" level="fatal"/>
  </stdio>
  <version_command><![CDATA[echo "3.0"]]></version_command>
  <!-- Runs the "runme" configfile script (defined below) with python. Each form
       parameter is passed as a named option; the text and select values are
       wrapped in double quotes, the two dataset paths are not. -->
  <command><![CDATA[python
$runme
--input_tab
$input_tab
--htmlout
$htmlout
--xcol
"$xcol"
--ycol
"$ycol"
--colourcol
"$colourcol"
--hovercol
"$hovercol"
--title
"$title"
--header
"$header"
--image_type
"$outputimagetype"]]></command>
  <configfiles>
    <configfile name="runme"><![CDATA[#raw

import argparse
import shutil
import sys
import math
import plotly.express as px
import pandas as pd
# Ross Lazarus July 2023
# based on various plotly tutorials
MAXHTMLROWS = 5000 # empirically, browsers die at 10k so stop here for safety and give png
MAXLEN = 35 # x/y values whose string form is longer than this are cut and '..' is appended

# Command line interface - one option per Galaxy form parameter.
parser = argparse.ArgumentParser()
a = parser.add_argument
a('--input_tab',default='')
a('--header',default='')
a('--htmlout',default="test_run.html")
a('--xcol',default='')
a('--ycol',default='')
a('--colourcol',default='')
a('--hovercol',default='')
a('--title',default='Default plot title')
a('--image_type',default='short_html')
args = parser.parse_args()

# Colour and hover columns are optional - an empty or blank value means "not used".
isColour = len(args.colourcol.strip()) > 0
isHover = len(args.hovercol.strip()) > 0

# read_csv is called with its default header handling, so the first row of the
# file always becomes the initial column names (testcols below).
df = pd.read_csv(args.input_tab, sep='\t')
NCOLS = df.columns.size
NROWS = len(df.index)
defaultcols = ['col%d' % (x+1) for x in range(NCOLS)]
testcols = df.columns

if args.image_type in ['short_html', 'long_html']: # refuse to create browser crashing gob stopper html
    if NROWS > MAXHTMLROWS:
        sys.stderr.write('#### CRITICAL: As advised on the tool form, 5k+ rows (you supplied %d) in html breaks browsers, so redo the job and change to png format output.' % NROWS)
        sys.exit(6)

# Decide which column names to use. Order of preference:
#   1. the comma delimited --header parameter, if supplied
#   2. the header row found in the file, if xcol and ycol are both in it
#   3. automatic col1...colN names, if xcol and ycol are both among them
if len(args.header.strip()) > 0:
    newcols = args.header.split(',')
    if len(newcols) == NCOLS:
        if (args.xcol in newcols) and (args.ycol in newcols):
            df.columns = newcols
        else:
            # the original format string had no placeholders for its three
            # arguments, so this path raised TypeError instead of reporting
            sys.stderr.write('#### xcol %s and ycol %s not found in supplied header parameter %s' % (args.xcol, args.ycol, args.header))
            sys.exit(4)
    else:
        sys.stderr.write('#### Supplied header %s has %d comma delimited header names - does not match the input tabular file %d columns' % (args.header, len(newcols), NCOLS))
        sys.exit(5)
else: # no header supplied - check for a real one that matches the x and y axis column names
    colsok = (args.xcol in testcols) and (args.ycol in testcols) # if they match, probably ok...should use more code and logic..
    if not colsok:
        # the file header already in place is kept when it matches; otherwise try col1...colN
        colsok = (args.xcol in defaultcols) and (args.ycol in defaultcols)
        if colsok:
            sys.stderr.write('replacing first row of data derived header %s with %s' % (testcols, defaultcols))
            df.columns = defaultcols
        else:
            sys.stderr.write('#### xcol %s and ycol %s do not match anything in the file header, supplied header parameter or automatic default column names' % (args.xcol, args.ycol))
            # stop here - without this the script carried on into px.scatter
            # and failed there with a much less helpful traceback
            sys.exit(3)

# The optional columns must exist under the names now in use, otherwise
# px.scatter fails; report it in the same style as the x/y checks above.
for optname, wanted, colname in (('colourcol', isColour, args.colourcol), ('hovercol', isHover, args.hovercol)):
    if wanted and colname not in df.columns:
        sys.stderr.write('#### %s %s does not match any of the column names in use: %s' % (optname, colname, ','.join(str(c) for c in df.columns)))
        sys.exit(7)

# Only pass colour and hover to plotly when the user asked for them.
plotargs = dict(x=args.xcol, y=args.ycol)
if isColour:
    plotargs['color'] = args.colourcol
if isHover:
    plotargs['hover_name'] = args.hovercol
fig = px.scatter(df, **plotargs)

if args.title:
    ftitle=dict(text=args.title, font=dict(size=50))
    fig.update_layout(title=ftitle)

# Truncate very long x and y values so the tick labels do not squash the plot.
for scatter in fig.data:
    scatter['x'] = [str(x)[:MAXLEN] + '..' if len(str(x)) > MAXLEN else x for x in scatter['x']]
    scatter['y'] = [str(x)[:MAXLEN] + '..' if len(str(x)) > MAXLEN else x for x in scatter['y']]
    # NOTE(review): this only runs when NO colour column is given; confirm
    # whether it was meant to shorten legend groups when one IS given.
    if not isColour:
        sl = str(scatter['legendgroup'])
        if len(sl) > MAXLEN:
            scatter['legendgroup'] = sl[:MAXLEN]

if args.image_type == "short_html":
    # fragment only, plotly.js is fetched from the CDN when viewed
    fig.write_html(args.htmlout, full_html=False, include_plotlyjs='cdn')
elif args.image_type == "long_html":
    fig.write_html(args.htmlout)
else:
    # small_png is 1024x768; anything else is rendered as the large 1920x1200 png
    if args.image_type == "small_png":
        wdth, ht = 1024, 768
    else:
        wdth, ht = 1920, 1200
    # written to a .png name first and then copied - presumably so the image
    # format can be inferred from the file extension (TODO confirm)
    fig.write_image('plotly.png', height=ht, width=wdth)
    shutil.copyfile('plotly.png', args.htmlout)



#end raw]]></configfile>
  </configfiles>
  <inputs>
    <param name="input_tab" type="data" optional="false" label="Tabular input file to plot" help="If 5000+ rows, html output will fail, but png will work." format="tabular" multiple="false"/>
    <param name="xcol" type="text" value="sepal_length" label="x axis for plot" help="Use a column name from the header if the file has one, or use one from the list supplied below, or use col1....colN otherwise to select the correct column"/>
    <param name="ycol" type="text" value="sepal_width" label="y axis for plot" help="Use a column name from the header if the file has one, or use one from the list supplied below, or use col1....colN otherwise to select the correct column"/>
    <param name="colourcol" type="text" value="petal_width" label="column containing a groupable variable for colour. Default none." help="Adds a legend so choose wisely "/>
    <param name="hovercol" type="text" value="species" label="column name for hover string" help="Use a column name from the header if the file has one, or use one from the list supplied below, or use col1....colN otherwise to select the correct column"/>
    <param name="title" type="text" value="Iris data" label="Title for the plot" help="Special characters will probably be escaped so do not use them"/>
    <param name="header" type="text" value="" label="Use this comma delimited list of column header names for this tabular file. Default is None when col1...coln will be used" help="The column names supplied for xcol, ycol, hover and colour MUST match either this supplied list, or if none, col1...coln."/>
    <param name="outputimagetype" type="select" label="Select the output format for this plot image. If over 5000 rows of data, HTML breaks browsers, so your job will fail. Use png only if more than 5k rows." help="Small and large png are not interactive but best for many (+10k) points. Stand-alone HTML includes 3MB of javascript. Short form HTML gets it the usual way so can be cut and paste into documents.">
      <option value="short_html">Short HTML interactive format</option>
      <option value="long_html">Long HTML for stand-alone viewing where network access to libraries is not available.</option>
      <option value="large_png">Large (1920x1200) png image - not interactive so hover column ignored</option>
      <option value="small_png">small (1024x768) png image - not interactive so hover column ignored</option>
    </param>
  </inputs>
  <!-- Single output dataset. Its format is html by default and is switched to
       png when outputimagetype is small_png or large_png. -->
  <outputs>
    <data name="htmlout" format="html" label="Plotlytabular $title on $input_tab.element_identifier" hidden="false">
      <change_format>
        <when input="outputimagetype" format="png" value="small_png"/>
        <when input="outputimagetype" format="png" value="large_png"/>
      </change_format>
    </data>
  </outputs>
  <!-- One functional test: short_html output for the sample input, using the
       form defaults. The result is compared to the stored sample by size only
       (sim_size) with a delta of 5000. -->
  <tests>
    <test>
      <output name="htmlout" value="htmlout_sample" compare="sim_size" delta="5000"/>
      <param name="input_tab" value="input_tab_sample"/>
      <param name="xcol" value="sepal_length"/>
      <param name="ycol" value="sepal_width"/>
      <param name="colourcol" value="petal_width"/>
      <param name="hovercol" value="species"/>
      <param name="title" value="Iris data"/>
      <param name="header" value=""/>
      <param name="outputimagetype" value="short_html"/>
    </test>
  </tests>
  <help><![CDATA[

This is a generic version of the plotlyblast specific blastn Galaxy search output file plotter.

PNG images are not interactive but best for very large numbers of data points. Hover column will be ignored.

HTML interactive plots are best for a few thousand data points at most, because the hover information becomes uncontrollable with very dense points.

Using the shorter format HTML relies on internet access when viewed, and saves 3MB of javascript being embedded.

The long format is useful if potentially viewed offline.



.. class:: warningmark

Long strings in x and y tickmarks WILL BE TRUNCATED if they are too long - ".." is added to indicate truncation - otherwise some plots are squished.



.. class:: warningmark

Columns with very small scientific notation floats will need to be pre-scaled in a way that doesn't confuse plotly.express with their values.



----



This tool can plot an interactive scatter plot with a hover text column specified, that appears when hovering over each data point, to supply useful additional information. 

It is only useful with a relatively small number of points, when they can be distinguished. With many thousands of points, the density makes the hover text relatively useless, so use png output and forget the hover text.



Column names are auto-generated as col1,...coln *unless* a comma separated list of column names is supplied as the header parameter, *or* pandas can find the values supplied as parameters by the user in the first row of data. This sounds more complex than it is.



For example, using a Galaxy blastn output with 25 columns, the following comma delimited string supplied as the "header" parameter will match the names of each column.

   qaccver,saccver,piden,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,sallseqid,score,nident,positive,gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles



When a header is supplied, the xcol and other column names must match one of those supplied column names.

So for example, xcol = "qaccver" for the blastn header example rather than xcol = "col1" when no header is supplied.



Relies on Plotly python code released under the MIT licence: https://github.com/plotly/plotly.py/blob/master/LICENSE.txt



 

------


Script::

    import argparse
    import shutil
    import sys
    import math
    import plotly.express as px
    import pandas as pd
    # Ross Lazarus July 2023
    # based on various plotly tutorials
    MAXHTMLROWS = 5000 # empirically, browsers die at 10k so stop here for safety and give png
    parser = argparse.ArgumentParser()
    a = parser.add_argument
    a('--input_tab',default='')
    a('--header',default='')
    a('--htmlout',default="test_run.html")
    a('--xcol',default='')
    a('--ycol',default='')
    a('--colourcol',default='')
    a('--hovercol',default='')
    a('--title',default='Default plot title')
    a('--image_type',default='short_html')
    args = parser.parse_args()
    isColour = False
    isHover = False
    if len(args.colourcol.strip()) > 0:
        isColour = True
    if len(args.hovercol.strip()) > 0:
        isHover = True
    df = pd.read_csv(args.input_tab, sep='\t')
    MAXLEN=35
    NCOLS = df.columns.size
    NROWS = len(df.index)
    defaultcols = ['col%d' % (x+1) for x in range(NCOLS)]
    testcols = df.columns
    if args.image_type in ['short_html', 'long_html']: # refuse to create browser crashing gob stopper html
        if NROWS > MAXHTMLROWS:
            sys.stderr.write('#### CRITICAL: As advised on the tool form, 5k+ rows (you supplied %d) in html breaks browsers, so redo the job and change to png format output.' % NROWS)
            sys.exit(6)
    if len(args.header.strip()) > 0:
        newcols = args.header.split(',')
        if len(newcols) == NCOLS:
            if (args.xcol in newcols) and (args.ycol in newcols):
                df.columns = newcols
            else:
                sys.stderr.write('#### xcol and ycol not found in supplied header parameter' % (args.xcol, args.ycol, args.header))
                sys.exit(4)
        else:
            sys.stderr.write('#### Supplied header %s has %d comma delimited header names - does not match the input tabular file %d columns' % (args.header, len(newcols), NCOLS))
            sys.exit(5)
    else: # no header supplied - check for a real one that matches the x and y axis column names
        colsok = (args.xcol in testcols) and (args.ycol in testcols) # if they match, probably ok...should use more code and logic..
        if colsok:
            df.columns = testcols # use actual header
        else:
            colsok = (args.xcol in defaultcols) and (args.ycol in defaultcols)
            if colsok:
                sys.stderr.write('replacing first row of data derived header %s with %s' % (testcols, defaultcols))
                df.columns = defaultcols
            else:
                sys.stderr.write('#### xcol %s and ycol %s do not match anything in the file header, supplied header parameter or automatic default column names' % (args.xcol, args.ycol))
    if isHover and isColour:
        fig = px.scatter(df, x=args.xcol, y=args.ycol, color=args.colourcol, hover_name=args.hovercol)
    elif isHover:
        fig = px.scatter(df, x=args.xcol, y=args.ycol, hover_name=args.hovercol)
    elif isColour:
        fig = px.scatter(df, x=args.xcol, y=args.ycol, color=args.colourcol)
    else:
        fig = px.scatter(df, x=args.xcol, y=args.ycol)
    if args.title:
        ftitle=dict(text=args.title, font=dict(size=50))
        fig.update_layout(title=ftitle)
    for scatter in fig.data:
        scatter['x'] = [str(x)[:MAXLEN] + '..' if len(str(x)) > MAXLEN else x for x in scatter['x']]
        scatter['y'] = [str(x)[:MAXLEN] + '..' if len(str(x)) > MAXLEN else x for x in scatter['y']]
        if len(args.colourcol.strip()) == 0:
            sl = str(scatter['legendgroup'])
            if len(sl) > MAXLEN:
                scatter['legendgroup'] = sl[:MAXLEN]
    if args.image_type == "short_html":
        fig.write_html(args.htmlout, full_html=False, include_plotlyjs='cdn')
    elif args.image_type == "long_html":
        fig.write_html(args.htmlout)
    elif args.image_type == "small_png":
        ht = 768
        wdth = 1024
        fig.write_image('plotly.png', height=ht, width=wdth)
        shutil.copyfile('plotly.png', args.htmlout)
    else:
        ht = 1200
        wdth = 1920
        fig.write_image('plotly.png', height=ht, width=wdth)
        shutil.copyfile('plotly.png', args.htmlout)

]]></help>
  <citations>
    <citation type="doi">10.1093/bioinformatics/bts573</citation>
  </citations>
</tool>