annotate pieplot_macs/pieplots_macs.py @ 8:37b6e6c43ef8 draft

Uploaded
author alenail
date Thu, 14 Apr 2016 19:05:32 -0400
parents b9b2be216c05
children 4bbea0c6efd5
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
1 '''
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
2 NAME
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
3
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
4 pieplots_macs.py
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
5
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
6 SYNOPSIS
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
7
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
8 python pieplots_macs.py --genefile MACSoutfile_genes.txt --peakfile MACSoutfile_peaks.bed --outfile MACSdirectory/piechart.pdf
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
9
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
10
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
11 DESCRIPTION
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
12
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
13 Peaks are assigned to the closest gene and then categorized according to their location at different genomic regions (promoter, intron, exon, or after the gene). Sites >10kb away from any gene are considered intergenic. (from Pamela)
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
14
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
15 '''
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
16
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
17 __author__='Renan Escalante'
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
18 __email__='renanec@mit.edu'
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
19
2
039eab1177fc Uploaded
alenail
parents: 0
diff changeset
20 import sys
0
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
21 import matplotlib
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
22 matplotlib.use('pdf')
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
23 from matplotlib import pyplot as plt
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
24 matplotlib.rcParams['pdf.fonttype']=42
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
25 matplotlib.rcParams['font.size']=14
5
01748da366c3 Uploaded
alenail
parents: 2
diff changeset
26 # import pandas as pd
0
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
27 from argparse import ArgumentParser
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
28
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
29 def map_peaks(gene,peak,outfile,macsFlag):
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
30 genefile = open(gene, 'r')
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
31 peakfile = open(peak, 'r')
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
32
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
33 types = {'promoter':0, 'after':0, 'intron':0, 'exon': 0}
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
34
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
35 #read mapped gene file, store closest map for each peak
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
36 peaks={} #{chrom:{peakStart:[dist, type]}}
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
37 for line in genefile:
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
38 words = line.strip().split('\t')
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
39 #ignore first line
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
40 if words[0] == 'knownGeneID': continue
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
41 chrom = words[2]
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
42
6
d7baa700a930 Uploaded
alenail
parents: 5
diff changeset
43 if macsFlag:
7
b9b2be216c05 Uploaded
alenail
parents: 6
diff changeset
44 print "macsflag on"
6
d7baa700a930 Uploaded
alenail
parents: 5
diff changeset
45 start = int(words[3])-1
d7baa700a930 Uploaded
alenail
parents: 5
diff changeset
46 dist = abs(int(words[12]))
7
b9b2be216c05 Uploaded
alenail
parents: 6
diff changeset
47 print dist
6
d7baa700a930 Uploaded
alenail
parents: 5
diff changeset
48 maptype = words[14]
d7baa700a930 Uploaded
alenail
parents: 5
diff changeset
49 if maptype == 'gene':
d7baa700a930 Uploaded
alenail
parents: 5
diff changeset
50 maptype = words[15]
d7baa700a930 Uploaded
alenail
parents: 5
diff changeset
51
d7baa700a930 Uploaded
alenail
parents: 5
diff changeset
52 else:
7
b9b2be216c05 Uploaded
alenail
parents: 6
diff changeset
53 print "macsflag off"
0
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
54 try:
8
37b6e6c43ef8 Uploaded
alenail
parents: 7
diff changeset
55 print words
0
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
56 start = int(words[3])
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
57 dist = abs(int(words[15]))
7
b9b2be216c05 Uploaded
alenail
parents: 6
diff changeset
58 print dist
0
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
59 maptype = words[16]
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
60 if maptype == 'gene':
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
61 maptype = words[17]
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
62 except:
7
b9b2be216c05 Uploaded
alenail
parents: 6
diff changeset
63 print 'trigger except'
0
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
64 pass
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
65
6
d7baa700a930 Uploaded
alenail
parents: 5
diff changeset
66 print "dist:"
d7baa700a930 Uploaded
alenail
parents: 5
diff changeset
67 print dist
7
b9b2be216c05 Uploaded
alenail
parents: 6
diff changeset
68 print "that was dist"
0
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
69
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
70 if chrom not in peaks:
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
71 #new chrom
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
72 peaks[chrom] = {start:[dist,maptype]}
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
73 else:
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
74 if start in peaks[chrom].keys():
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
75 #account for duplicate entries - choose closest gene and store type
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
76 if dist < peaks[chrom][start][0]:
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
77 #closer gene
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
78 peaks[chrom][start] = [dist, maptype]
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
79 else: peaks[chrom][start] = [dist, maptype]
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
80
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
81 #count types - 1 per peak in peak file
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
82 types = {'promoter':0, 'after':0, 'intron':0, 'exon': 0, 'inter': 0}
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
83 totalpks = 0
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
84 #Read peak file in bed format
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
85 for line in peakfile:
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
86 words = line.strip().split('\t')
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
87 chrom = words[0]
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
88 start = int(words[1])
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
89 if chrom in peaks:
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
90 if start in peaks[chrom]:
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
91 types[peaks[chrom][start][1]] += 1
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
92 else:
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
93 types['inter'] += 1
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
94 else:
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
95 types['inter'] += 1
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
96 totalpks += 1
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
97
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
98
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
99 #--------------------------------------------
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
100 # make a square figure and axes
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
101 #--------------------------------------------
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
102
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
103 fig = plt.figure(figsize=(6,6))
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
104 pie_ax = fig.add_axes((0.3,0.2,0.4,0.4))
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
105
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
106 # The slices will be ordered and plotted counter-clockwise.
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
107 labels = ['exon: %i'%types['exon'],'intron: %i'%types['intron'],'promoter: %i'%types['promoter'],'intergenic: %i'%types['inter'], 'after: %i'%types['after']]
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
108 fracs = [types['exon'], types['intron'], types['promoter'], types['inter'], types['after']]
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
109
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
110 plt.pie(fracs, labels=labels) #, autopct='%1.1f%%')
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
111
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
112 #Export data frame with all the counts
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
113 indexDataFrame = ['exon','intron','promoter','intergenic','after']
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
114 df = pd.DataFrame(data=fracs, index=indexDataFrame)
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
115 dfFileName = outfile.replace("pdf","csv")
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
116 df.to_csv(dfFileName, sep=',')
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
117 #plt.title('MACS peaks in %s'%(name))
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
118 plt.figtext(.5, .1, 'Total: %i'%totalpks, ha='center')
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
119 fig.savefig(outfile)
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
120
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
121 def main():
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
122 usage = "usage: %prog --genefile MACSoutfile_genes.txt --peakfile MACSoutfile_peaks.bed --outfile MACSdirectory/piechart.pdf"
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
123 parser = ArgumentParser(usage)
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
124 parser.add_argument("--genefile", dest="genefile", help="Path to file MACS_mfold10,30_pval1e-5_genes.txt")
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
125 parser.add_argument("--peakfile", dest="peakfile", help="Path to file MACS_mfold10,30_pval1e-5_peaks.bed")
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
126 parser.add_argument("--outfile", dest="outfile", default="MACS_piechart.pdf", help="Path to pdf file where you want to store the piechart")
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
127 parser.add_argument('--MACS',action='store_true',default=False,help='Set this value if you have MACS peaks')
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
128
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
129 args=parser.parse_args()
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
130
6
d7baa700a930 Uploaded
alenail
parents: 5
diff changeset
131 print args.MACS
d7baa700a930 Uploaded
alenail
parents: 5
diff changeset
132
0
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
133 map_peaks(args.genefile, args.peakfile, args.outfile, args.MACS)
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
134
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
135
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
136 if __name__=='__main__':
84b62ddfec66 Uploaded
alenail
parents:
diff changeset
137 main()