0
|
1 '''
|
|
2 NAME
|
|
3
|
|
4 pieplots_macs.py
|
|
5
|
|
6 SYNOPSIS
|
|
7
|
|
8 python pieplots_macs.py --genefile MACSoutfile_genes.txt --peakfile MACSoutfile_peaks.bed --outfile MACSdirectory/piechart.pdf
|
|
9
|
|
10
|
|
11 DESCRIPTION
|
|
12
|
|
13 Peaks are assigned to the closest gene and then categorized according to their location at different genomic regions (promoter, intron, exon, or after the gene). Sites >10kb away from any gene are considered intergenic. (from Pamela)
|
|
14
|
|
15 '''
|
|
16
|
|
17 __author__='Renan Escalante'
|
|
18 __email__='renanec@mit.edu'
|
|
19
|
2
|
20 import sys
|
0
|
21 import matplotlib
|
|
22 matplotlib.use('pdf')
|
|
23 from matplotlib import pyplot as plt
|
|
24 matplotlib.rcParams['pdf.fonttype']=42
|
|
25 matplotlib.rcParams['font.size']=14
|
5
|
26 # import pandas as pd
|
0
|
27 from argparse import ArgumentParser
|
|
28
|
|
29 def map_peaks(gene,peak,outfile,macsFlag):
|
|
30 genefile = open(gene, 'r')
|
|
31 peakfile = open(peak, 'r')
|
|
32
|
|
33 types = {'promoter':0, 'after':0, 'intron':0, 'exon': 0}
|
|
34
|
|
35 #read mapped gene file, store closest map for each peak
|
|
36 peaks={} #{chrom:{peakStart:[dist, type]}}
|
|
37 for line in genefile:
|
|
38 words = line.strip().split('\t')
|
|
39 #ignore first line
|
|
40 if words[0] == 'knownGeneID': continue
|
|
41 chrom = words[2]
|
|
42
|
6
|
43 if macsFlag:
|
7
|
44 print "macsflag on"
|
6
|
45 start = int(words[3])-1
|
|
46 dist = abs(int(words[12]))
|
7
|
47 print dist
|
6
|
48 maptype = words[14]
|
|
49 if maptype == 'gene':
|
|
50 maptype = words[15]
|
|
51
|
|
52 else:
|
7
|
53 print "macsflag off"
|
0
|
54 try:
|
|
55 start = int(words[3])
|
|
56 dist = abs(int(words[15]))
|
7
|
57 print dist
|
0
|
58 maptype = words[16]
|
|
59 if maptype == 'gene':
|
|
60 maptype = words[17]
|
|
61 except:
|
7
|
62 print 'trigger except'
|
0
|
63 pass
|
|
64
|
6
|
65 print "dist:"
|
|
66 print dist
|
7
|
67 print "that was dist"
|
0
|
68
|
|
69 if chrom not in peaks:
|
|
70 #new chrom
|
|
71 peaks[chrom] = {start:[dist,maptype]}
|
|
72 else:
|
|
73 if start in peaks[chrom].keys():
|
|
74 #account for duplicate entries - choose closest gene and store type
|
|
75 if dist < peaks[chrom][start][0]:
|
|
76 #closer gene
|
|
77 peaks[chrom][start] = [dist, maptype]
|
|
78 else: peaks[chrom][start] = [dist, maptype]
|
|
79
|
|
80 #count types - 1 per peak in peak file
|
|
81 types = {'promoter':0, 'after':0, 'intron':0, 'exon': 0, 'inter': 0}
|
|
82 totalpks = 0
|
|
83 #Read peak file in bed format
|
|
84 for line in peakfile:
|
|
85 words = line.strip().split('\t')
|
|
86 chrom = words[0]
|
|
87 start = int(words[1])
|
|
88 if chrom in peaks:
|
|
89 if start in peaks[chrom]:
|
|
90 types[peaks[chrom][start][1]] += 1
|
|
91 else:
|
|
92 types['inter'] += 1
|
|
93 else:
|
|
94 types['inter'] += 1
|
|
95 totalpks += 1
|
|
96
|
|
97
|
|
98 #--------------------------------------------
|
|
99 # make a square figure and axes
|
|
100 #--------------------------------------------
|
|
101
|
|
102 fig = plt.figure(figsize=(6,6))
|
|
103 pie_ax = fig.add_axes((0.3,0.2,0.4,0.4))
|
|
104
|
|
105 # The slices will be ordered and plotted counter-clockwise.
|
|
106 labels = ['exon: %i'%types['exon'],'intron: %i'%types['intron'],'promoter: %i'%types['promoter'],'intergenic: %i'%types['inter'], 'after: %i'%types['after']]
|
|
107 fracs = [types['exon'], types['intron'], types['promoter'], types['inter'], types['after']]
|
|
108
|
|
109 plt.pie(fracs, labels=labels) #, autopct='%1.1f%%')
|
|
110
|
|
111 #Export data frame with all the counts
|
|
112 indexDataFrame = ['exon','intron','promoter','intergenic','after']
|
|
113 df = pd.DataFrame(data=fracs, index=indexDataFrame)
|
|
114 dfFileName = outfile.replace("pdf","csv")
|
|
115 df.to_csv(dfFileName, sep=',')
|
|
116 #plt.title('MACS peaks in %s'%(name))
|
|
117 plt.figtext(.5, .1, 'Total: %i'%totalpks, ha='center')
|
|
118 fig.savefig(outfile)
|
|
119
|
|
120 def main():
|
|
121 usage = "usage: %prog --genefile MACSoutfile_genes.txt --peakfile MACSoutfile_peaks.bed --outfile MACSdirectory/piechart.pdf"
|
|
122 parser = ArgumentParser(usage)
|
|
123 parser.add_argument("--genefile", dest="genefile", help="Path to file MACS_mfold10,30_pval1e-5_genes.txt")
|
|
124 parser.add_argument("--peakfile", dest="peakfile", help="Path to file MACS_mfold10,30_pval1e-5_peaks.bed")
|
|
125 parser.add_argument("--outfile", dest="outfile", default="MACS_piechart.pdf", help="Path to pdf file where you want to store the piechart")
|
|
126 parser.add_argument('--MACS',action='store_true',default=False,help='Set this value if you have MACS peaks')
|
|
127
|
|
128 args=parser.parse_args()
|
|
129
|
6
|
130 print args.MACS
|
|
131
|
0
|
132 map_peaks(args.genefile, args.peakfile, args.outfile, args.MACS)
|
|
133
|
|
134
|
|
135 if __name__=='__main__':
|
|
136 main()
|