Mercurial > repos > boris > hetbox
changeset 0:8d2c4c11fd8f draft default tip
Uploaded repo.tar.gz
author | boris |
---|---|
date | Mon, 03 Feb 2014 13:15:10 -0500 |
parents | |
children | |
files | hetbox.py hetbox.xml tool_dependencies.xml |
diffstat | 3 files changed, 238 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hetbox.py Mon Feb 03 13:15:10 2014 -0500 @@ -0,0 +1,151 @@ +#!/usr/bin/env python + +# Code by Boris Rebolledo-Jaramillo +# (boris-at-bx.psu.edu) +# Edited by Nick Stoler +# (nick-at-bx.psu.edu) +# New in this version: +# - Add in proper header line if not present + + +import os +import sys +import array +import numpy +from rpy2.robjects import Formula +from rpy2.robjects.packages import importr +from rpy2 import robjects + +def fail(message): + sys.stderr.write(message+'\n') + sys.exit(1) + +COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', + 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS'] + +COLUMN_LABELS_STRANDED= ['SAMPLE', 'CHR', 'POS', '+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T', + 'CVRG','ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] + +args = sys.argv[1:] +if len(args) >= 1: + infile = args[0] +else: + fail('Error: No input filename provided (as argument 1).') +if len(args) >= 2: + outfile = args[1] +else: + fail('Error: No output filename provided (as argument 2).') +if len(args) >= 3: + report = args[2] +else: + report = '' + +# Check input file +add_header = False +if not os.path.exists(infile): + fail('Error: Input file '+infile+' could not be found.') +with open(infile, 'r') as lines: + line = lines.readline() + if not line: + fail('Error: Input file seems to be empty') + line = line.strip().lstrip('#') # rm whitespace, comment chars + labels = line.split("\t") + if 'SAMPLE' not in labels: + sys.stderr.write("Error: Input file does not seem to have a proper header " + +"line.\nAdding an artificial header..") + add_header = True + +r = robjects.r +base = importr('base') +utils = importr('utils') +stats = importr('stats') +rprint = robjects.globalenv.get("print") +graphics = importr('graphics') +grdevices = importr('grDevices') +grdevices.png(file=outfile, width=1024, height=768, type="cairo") + +# Read file into a data frame +if add_header: + # add header line manually if not present + DATA = utils.read_delim(infile, header=False) + labels = robjects.r.names(DATA) + for i in range(len(labels)): + try: + labels[i] = COLUMN_LABELS[i] + except IndexError, e: + try: + labels[i] = COLUMN_LABELS_EXTENDED[i] + except: + fail("Error in input file: Too many columns (does not match hardcoded " + +"column labels).") +else: + DATA = utils.read_delim(infile) + # Remove comment from header, if present + labels = robjects.r.names(DATA) + if labels[0][0:2] == 'X.': + labels[0] = labels[0][2:] + +# Multiply minor allele frequencies by 100 to get percentage +# .rx2() looks up a column by its label and returns it as a vector +# .ro turns the returned object into one that can be operated on per-element +minor_freq = DATA.rx2('MINOR.FREQ.PERC.').ro * 100 +samples = DATA.rx2('SAMPLE') + +# Formula() creates a Python object representing the R object returned by x ~ y +formula = Formula('minor_freq ~ samples') +# The "environment" in .getenvironment() is the entire R workspace in which the +# Formula object exists. The R workspace meaning all the defined variables. +# Here, the .getenvironment() method is being used to set some variables in the +# R workspace + +formula.getenvironment()['minor_freq'] = minor_freq +formula.getenvironment()['samples'] = samples + + +r.par(oma=array.array('i', [0,0,0,0])) +r.par(mar=array.array('i', [10,4,4,2])) +ylimit = array.array('i',[-5,50]) + +# create boxplot - fill kwargs1 with the options for the boxplot function +kwargs1 = {'ylab':"Minor allele frequency (%)", 'col':"gray", 'xaxt':"n", + 'outpch':"*",'main':"Distribution of minor allele frequencies", + 'cex.lab':"1.5"} +p = graphics.boxplot(formula, axes=0,ylim=ylimit, lty=1,**kwargs1) + +table = base.table(DATA.rx2('SAMPLE')) +graphics.text(0, -1, 'N:', font=2) +for i in range(1, base.length(table)[0]+1, 1): + graphics.text(i, -1, table[i-1], font=2) + +graphlabels = base.names(table) +kwargs3 = {'pos':"-2", 'las':"2", 'cex.axis':"1"} +graphics.axis(1, at=range(1, len(graphlabels)+1, 1),labels=graphlabels, **kwargs3) +graphics.axis(2,at=(range(0,60,10)),pos=0,font=2) +grdevices.dev_off() + +if not report: + sys.exit(0) + + +SAMPLES=[] +for i in range(len(table)): + SAMPLES.append(base.names(table)[i]) + +def boxstats(data,sample): + VALUES = [100*float(x.strip().split('\t')[11]) for x in list(open(data)) if x.strip().split('\t')[0]==sample] + NoHET = len(VALUES) + MEDIAN = numpy.median(VALUES) + MAD = numpy.median([abs(i - MEDIAN) for i in VALUES]) # Median absolute distance (robust spread statistic) + return [NoHET,MEDIAN, MAD] + +boxreport = open(report, "w+") +boxreport.write("#sample\tNo.sites\tmedian.freq\tMAD.freq\n") + +for sample in SAMPLES: + ENTRY = [sample] + boxstats(infile,sample) + boxreport.write ('%s\t%d\t%.1f\t%.1f\n' % tuple([ENTRY[i] for i in [0,1,2,3]])) +boxreport.close() + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hetbox.xml Mon Feb 03 13:15:10 2014 -0500 @@ -0,0 +1,75 @@ +<tool id="hetbox" version="1.0" name="MAF boxplot"> + <description>Minor Allele Frequency Boxplot</description> + <requirements> + <requirement type="package" version="2.15.0">R</requirement> + <requirement type="package" version="2.2.6">rpy2</requirement> + <requirement type="package" version="1.7.1">numpy</requirement> + </requirements> + <command interpreter="python">hetbox.py $input $outplot $outreport + </command> + <inputs> + <param name="input" type="data" format="tabular" label="Input allele counts table"/> + <param name="report" type="boolean" truevalue="yes" falsevalue="no" checked="True" label="Also generate a report on data spread" /> + </inputs> + <outputs> + <data name="outplot" format="png" label="${tool.name} on ${on_string}: boxplot"/> + <data name="outreport" format="tabular" label="${tool.name} on ${on_string}: report"> + <filter>report is True</filter> + </data> + </outputs> + <stdio> + <exit_code range="1:" err_level="fatal"/> + <exit_code range=":-1" err_level="fatal"/> + </stdio> + + <help> + +.. class:: infomark + +**What it does** + +The MAF Boxplot tool takes a table listing heteroplasmic sites per sample and their corresponding minor allele frequency (MAF). +It generates a boxplot of the minor allele frequencies per sample by default. The number of heteroplasmic sites is displayed under each box. +Optionally, it can generate a report that includes the total number of heteroplasmic sites, the median and the median absolute deviation (MAD) of the minor allele frequencies per sample. + +----- + +.. class:: warningmark + +**Note** + +Please, follow the format described below for the input file: + +----- + +.. class:: infomark + +**Formats** + +**Variant Annotator tool output format** + +Columns:: + + 1. sample id + 2. chromosome + 3. position + 4 counts for A's + 5. counts for C's + 6. counts for G's + 7. counts for T's + 8. Coverage + 9. Number of alleles passing frequency threshold + 10. Major allele + 11. Minor allele + 12. Minor allele frequency in position + +----- + +**Citation** + +If you use this tool, please cite Dickins B, Rebolledo-Jaramillo B, et al. *In preparation.* +(boris-at-bx.psu.edu) + + + </help> +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon Feb 03 13:15:10 2014 -0500 @@ -0,0 +1,12 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="R" version="2.15.0"> + <repository changeset_revision="7fb1a5b1b6ba" name="package_r_2_15_0" owner="boris" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> + <package name="rpy2" version="2.2.6"> + <repository changeset_revision="31bb8a2b748e" name="package_rpy2_2_2" owner="boris" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> + <package name="numpy" version="1.7.1"> + <repository changeset_revision="ef12a3a11d5b" name="package_numpy_1_7" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>