Mercurial > repos > alenail > pieplot_macs
changeset 0:84b62ddfec66 draft
Uploaded
author | alenail |
---|---|
date | Thu, 14 Apr 2016 11:36:26 -0400 |
parents | |
children | c35147e7f9ef |
files | pieplot_macs/pieplots_macs.py pieplot_macs/pieplots_macs.xml pieplot_macs/tool_dependencies.xml |
diffstat | 3 files changed, 158 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pieplot_macs/pieplots_macs.py Thu Apr 14 11:36:26 2016 -0400 @@ -0,0 +1,127 @@ +''' + NAME + + pieplots_macs.py + + SYNOPSIS + +python pieplots_macs.py --genefile MACSoutfile_genes.txt --peakfile MACSoutfile_peaks.bed --outfile MACSdirectory/piechart.pdf + + + DESCRIPTION + +Peaks are assigned to the closest gene and then categorized according to their location at different genomic regions (promoter, intron, exon, or after the gene). Sites >10kb away from any gene are considered intergenic. (from Pamela) + +''' + +__author__='Renan Escalante' +__email__='renanec@mit.edu' + +import pandas as pd +import matplotlib +matplotlib.use('pdf') +from matplotlib import pyplot as plt +matplotlib.rcParams['pdf.fonttype']=42 +matplotlib.rcParams['font.size']=14 +import sys +from argparse import ArgumentParser + +def map_peaks(gene,peak,outfile,macsFlag): + genefile = open(gene, 'r') + peakfile = open(peak, 'r') + + types = {'promoter':0, 'after':0, 'intron':0, 'exon': 0} + + #read mapped gene file, store closest map for each peak + peaks={} #{chrom:{peakStart:[dist, type]}} + for line in genefile: + words = line.strip().split('\t') + #ignore first line + if words[0] == 'knownGeneID': continue + chrom = words[2] + + + if not macsFlag: + try: + start = int(words[3]) + dist = abs(int(words[15])) + maptype = words[16] + if maptype == 'gene': + maptype = words[17] + except: + pass + + else: + start = int(words[3])-1 + dist = abs(int(words[12])) + maptype = words[14] + if maptype == 'gene': + maptype = words[15] + + + if chrom not in peaks: + #new chrom + peaks[chrom] = {start:[dist,maptype]} + else: + if start in peaks[chrom].keys(): + #account for duplicate entries - choose closest gene and store type + if dist < peaks[chrom][start][0]: + #closer gene + peaks[chrom][start] = [dist, maptype] + else: peaks[chrom][start] = [dist, maptype] + + #count types - 1 per peak in peak file + types = {'promoter':0, 'after':0, 'intron':0, 'exon': 0, 'inter': 0} + totalpks = 0 + #Read peak file in bed format + for line in peakfile: + words = line.strip().split('\t') + chrom = words[0] + start = int(words[1]) + if chrom in peaks: + if start in peaks[chrom]: + types[peaks[chrom][start][1]] += 1 + else: + types['inter'] += 1 + else: + types['inter'] += 1 + totalpks += 1 + + + #-------------------------------------------- + # make a square figure and axes + #-------------------------------------------- + + fig = plt.figure(figsize=(6,6)) + pie_ax = fig.add_axes((0.3,0.2,0.4,0.4)) + + # The slices will be ordered and plotted counter-clockwise. + labels = ['exon: %i'%types['exon'],'intron: %i'%types['intron'],'promoter: %i'%types['promoter'],'intergenic: %i'%types['inter'], 'after: %i'%types['after']] + fracs = [types['exon'], types['intron'], types['promoter'], types['inter'], types['after']] + + plt.pie(fracs, labels=labels) #, autopct='%1.1f%%') + + #Export data frame with all the counts + indexDataFrame = ['exon','intron','promoter','intergenic','after'] + df = pd.DataFrame(data=fracs, index=indexDataFrame) + dfFileName = outfile.replace("pdf","csv") + df.to_csv(dfFileName, sep=',') + #plt.title('MACS peaks in %s'%(name)) + plt.figtext(.5, .1, 'Total: %i'%totalpks, ha='center') + fig.savefig(outfile) + +def main(): + usage = "usage: %prog --genefile MACSoutfile_genes.txt --peakfile MACSoutfile_peaks.bed --outfile MACSdirectory/piechart.pdf" + parser = ArgumentParser(usage) + parser.add_argument("--genefile", dest="genefile", help="Path to file MACS_mfold10,30_pval1e-5_genes.txt") + parser.add_argument("--peakfile", dest="peakfile", help="Path to file MACS_mfold10,30_pval1e-5_peaks.bed") + parser.add_argument("--outfile", dest="outfile", default="MACS_piechart.pdf", help="Path to pdf file where you want to store the piechart") + parser.add_argument('--MACS',action='store_true',default=False,help='Set this value if you have MACS peaks') + + args=parser.parse_args() + + map_peaks(args.genefile, args.peakfile, args.outfile, args.MACS) + + +if __name__=='__main__': + main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pieplot_macs/pieplots_macs.xml Thu Apr 14 11:36:26 2016 -0400 @@ -0,0 +1,22 @@ +<tool id="pieplots_macs" name="Pieplots MACS" version="0.1"> + <description> + Peaks are assigned to the closest gene and then categorized according to their location at different genomic regions (promoter, intron, exon, or after the gene). Sites >10kb away from any gene are considered intergenic. + </description> + <parallelism method="basic"></parallelism> + <requirements> + <requirement type="package" version="0.14.1">pandas</requirement> + <requirement type="package" version="1.4">matplotlib</requirement> + </requirements> + <command interpreter="python"> + pieplots_macs.py --genefile $genefile --peakfile $peakfile $MACS --outfile $out + </command> + <inputs> + <param name="genefile" type="data" label="Gene file (map_peaks_to_genes)" help="" optional="false" /> + <param name="peakfile" type="data" label="Peaks file (narrowpeaks)" help="" optional="false" /> + <param name="MACS" checked="false" label="Set this value if you have MACS peaks" type="boolean" truevalue="--MACS" falsevalue="" help="" /> + </inputs> + <outputs> + <data name="out" format="pdf" hidden="false" /> + </outputs> + <help></help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pieplot_macs/tool_dependencies.xml Thu Apr 14 11:36:26 2016 -0400 @@ -0,0 +1,9 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="pandas" version="0.14.1"> + <repository changeset_revision="ac9f317487a9" name="package_pandas_0_14" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu/" /> + </package> + <package name="matplotlib" version="1.4"> + <repository changeset_revision="f7424e1cf115" name="package_python_2_7_matplotlib_1_4" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu/" /> + </package> +</tool_dependency>