changeset 0:8d2c4c11fd8f draft default tip

Uploaded repo.tar.gz
author boris
date Mon, 03 Feb 2014 13:15:10 -0500
parents
children
files hetbox.py hetbox.xml tool_dependencies.xml
diffstat 3 files changed, 238 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hetbox.py	Mon Feb 03 13:15:10 2014 -0500
@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+
+# Code by Boris Rebolledo-Jaramillo
+# (boris-at-bx.psu.edu)
+# Edited by Nick Stoler
+# (nick-at-bx.psu.edu)
+# New in this version:
+# - Add in proper header line if not present
+
+
+import os
+import sys
+import array
+import numpy
+from rpy2.robjects import Formula
+from rpy2.robjects.packages import importr
+from rpy2 import robjects
+
+def fail(message):
+  sys.stderr.write(message+'\n')
+  sys.exit(1)
+
+COLUMN_LABELS = ['SAMPLE', 'CHR',  'POS', 'A', 'C', 'G', 'T', 'CVRG',
+                 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS']
+
+COLUMN_LABELS_STRANDED= ['SAMPLE', 'CHR',  'POS', '+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T',
+                         'CVRG','ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.']
+
+args = sys.argv[1:]
+if len(args) >= 1:
+  infile = args[0]
+else:
+  fail('Error: No input filename provided (as argument 1).')
+if len(args) >= 2:
+  outfile = args[1]
+else:
+  fail('Error: No output filename provided (as argument 2).')
+if len(args) >= 3:
+  report = args[2]
+else:
+  report = ''
+
+# Check input file
+add_header = False
+if not os.path.exists(infile):
+  fail('Error: Input file '+infile+' could not be found.')
+with open(infile, 'r') as lines:
+  line = lines.readline()
+  if not line:
+    fail('Error: Input file seems to be empty')
+  line = line.strip().lstrip('#') # rm whitespace, comment chars
+  labels = line.split("\t")
+  if 'SAMPLE' not in labels:
+    sys.stderr.write("Error: Input file does not seem to have a proper header "
+      +"line.\nAdding an artificial header..")
+    add_header = True
+
+r = robjects.r
+base = importr('base')
+utils = importr('utils')
+stats = importr('stats')
+rprint = robjects.globalenv.get("print")
+graphics = importr('graphics')
+grdevices = importr('grDevices')
+grdevices.png(file=outfile, width=1024, height=768, type="cairo")
+
+# Read file into a data frame
+if add_header:
+    # add header line manually if not present
+    DATA = utils.read_delim(infile, header=False)
+    labels = robjects.r.names(DATA)
+    for i in range(len(labels)):
+        try:
+            labels[i] = COLUMN_LABELS[i]
+        except IndexError, e:
+            try:
+                labels[i] = COLUMN_LABELS_EXTENDED[i]
+            except:
+                fail("Error in input file: Too many columns (does not match hardcoded "
+                +"column labels).")
+else:
+  DATA = utils.read_delim(infile)
+  # Remove comment from header, if present
+  labels = robjects.r.names(DATA)
+  if labels[0][0:2] == 'X.':
+    labels[0] = labels[0][2:]
+
+# Multiply minor allele frequencies by 100 to get percentage
+#  .rx2() looks up a column by its label and returns it as a vector
+#  .ro turns the returned object into one that can be operated on per-element
+minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100
+samples    = DATA.rx2('SAMPLE')
+
+# Formula() creates a Python object representing the R object returned by x ~ y
+formula = Formula('minor_freq ~ samples')
+# The "environment" in .getenvironment() is the entire R workspace in which the
+# Formula object exists. The R workspace meaning all the defined variables.
+# Here, the .getenvironment() method is being used to set some variables in the
+# R workspace
+
+formula.getenvironment()['minor_freq'] = minor_freq
+formula.getenvironment()['samples']    = samples
+
+
+r.par(oma=array.array('i', [0,0,0,0]))
+r.par(mar=array.array('i', [10,4,4,2]))
+ylimit = array.array('i',[-5,50])
+
+# create boxplot - fill kwargs1 with the options for the boxplot function
+kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n",
+           'outpch':"*",'main':"Distribution of minor allele frequencies",
+           'cex.lab':"1.5"}
+p = graphics.boxplot(formula, axes=0,ylim=ylimit, lty=1,**kwargs1)
+
+table = base.table(DATA.rx2('SAMPLE'))
+graphics.text(0, -1, 'N:', font=2)
+for i in range(1, base.length(table)[0]+1, 1):
+    graphics.text(i, -1, table[i-1], font=2)
+
+graphlabels = base.names(table)
+kwargs3 = {'pos':"-2", 'las':"2", 'cex.axis':"1"}
+graphics.axis(1, at=range(1, len(graphlabels)+1, 1),labels=graphlabels, **kwargs3)
+graphics.axis(2,at=(range(0,60,10)),pos=0,font=2)
+grdevices.dev_off()
+
+if not report:
+    sys.exit(0)
+
+
+SAMPLES=[]
+for i in range(len(table)):
+    SAMPLES.append(base.names(table)[i])
+
+def boxstats(data,sample):
+    VALUES = [100*float(x.strip().split('\t')[11]) for x in list(open(data)) if x.strip().split('\t')[0]==sample]
+    NoHET  = len(VALUES)
+    MEDIAN = numpy.median(VALUES)
+    MAD    = numpy.median([abs(i - MEDIAN) for i in VALUES]) # Median absolute distance (robust spread statistic)
+    return [NoHET,MEDIAN, MAD]
+
+boxreport = open(report, "w+")
+boxreport.write("#sample\tNo.sites\tmedian.freq\tMAD.freq\n")
+
+for sample in SAMPLES:
+    ENTRY = [sample] + boxstats(infile,sample)
+    boxreport.write ('%s\t%d\t%.1f\t%.1f\n' % tuple([ENTRY[i] for i in [0,1,2,3]]))
+boxreport.close()
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hetbox.xml	Mon Feb 03 13:15:10 2014 -0500
@@ -0,0 +1,75 @@
+<tool id="hetbox" version="1.0" name="MAF boxplot">
+  <description>Minor Allele Frequency Boxplot</description>
+  <requirements>
+    <requirement type="package" version="2.15.0">R</requirement>
+    <requirement type="package" version="2.2.6">rpy2</requirement>
+    <requirement type="package" version="1.7.1">numpy</requirement>
+  </requirements>
+  <command interpreter="python">hetbox.py $input $outplot $outreport
+  </command>
+  <inputs>
+    <param name="input" type="data" format="tabular" label="Input allele counts table"/>
+    <param name="report" type="boolean" truevalue="yes" falsevalue="no" checked="True" label="Also generate a report on data spread" />
+  </inputs>
+  <outputs>
+    <data name="outplot" format="png" label="${tool.name} on ${on_string}: boxplot"/>
+    <data name="outreport" format="tabular" label="${tool.name} on ${on_string}: report">
+      <filter>report is True</filter>
+    </data>
+  </outputs>
+  <stdio>
+    <exit_code range="1:" err_level="fatal"/>
+    <exit_code range=":-1" err_level="fatal"/>
+  </stdio>
+
+  <help>
+
+.. class:: infomark
+
+**What it does**
+
+The MAF Boxplot tool takes a table listing heteroplasmic sites per sample and their corresponding minor allele frequency (MAF).
+It generates a boxplot of the minor allele frequencies per sample by default. The number of heteroplasmic sites is displayed under each box.
+Optionally, it can generate  a report that includes the total number of heteroplasmic sites, the median and the median absolute deviation (MAD) of the minor allele frequencies per sample.
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+Please, follow the format described below for the input file:
+
+-----
+
+.. class:: infomark
+
+**Formats**
+
+**Variant Annotator tool output format**
+
+Columns::
+
+    1.  sample id
+    2.  chromosome
+    3.  position
+    4   counts for A's
+    5.  counts for C's
+    6.  counts for G's
+    7.  counts for T's
+    8.  Coverage
+    9.  Number of alleles passing frequency threshold
+    10. Major allele
+    11. Minor allele
+    12. Minor allele frequency in position
+
+-----
+
+**Citation**
+
+If you use this tool, please cite Dickins B, Rebolledo-Jaramillo B, et al. *In preparation.*
+(boris-at-bx.psu.edu)
+
+
+  </help>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Mon Feb 03 13:15:10 2014 -0500
@@ -0,0 +1,12 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="R" version="2.15.0">
+        <repository changeset_revision="7fb1a5b1b6ba" name="package_r_2_15_0" owner="boris" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="rpy2" version="2.2.6">
+        <repository changeset_revision="31bb8a2b748e" name="package_rpy2_2_2" owner="boris" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="numpy" version="1.7.1">
+        <repository changeset_revision="ef12a3a11d5b" name="package_numpy_1_7" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>