annotate mytools/alignr.py @ 9:87eb5c5ddfe9

Uploaded
author xuebing
date Fri, 09 Mar 2012 20:01:43 -0500
parents f0dc65e7f6c0
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
7
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
1 '''
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
2 the script takes two files as input and computes the coverage of
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
3 features in input 1 across features in input 2. Features in input 2 are
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
4 divided into bins and coverage is computed for each bin.
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
5
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
6 please check the help information by typing:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
7
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
8 python alignr.py -h
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
9
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
10
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
11 requirement:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
12 please install the following tools first:
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
13 bedtools: for read/region overlapping, http://code.google.com/p/bedtools/
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
14
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
15 '''
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
16
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
17 import os,sys,os.path
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
18 from optparse import OptionParser
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
19
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
def lineCount(filename):
    '''
    count the number of lines in a text file

    filename: path of the file to read
    returns: number of lines as an int (0 for an empty file)
    '''
    count = 0
    with open(filename) as f:
        # iterate instead of reading everything at once; enumerate starts at 1
        # so that the last index seen is already the line count
        for count, _ in enumerate(f, 1):
            pass
    return count
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
25
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
def combineFilename(f1,f2):
    '''
    fuse two file names into one: everything up to and including the
    last '/' is removed from each path, and the two remainders are
    joined with '-'
    '''
    base1 = f1.rpartition('/')[2]
    base2 = f2.rpartition('/')[2]
    return '-'.join((base1, base2))
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
31
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
def _columnCountOrExit(filename):
    '''
    return the number of TAB-separated fields on the first line of
    filename; print an error and exit with status 1 if there are fewer
    than 3
    '''
    with open(filename) as f:
        ncol = len(f.readline().strip().split('\t'))
    if ncol < 3:
        print("ERROR: "+filename+" has only "+str(ncol)+" columns (>=3 required). Make sure it has NO header line and is TAB-delimited.")
        sys.exit(1)
    return ncol


def checkFormat(filename1,filename2,input1format):
    '''
    check the format of input files

    filename1: path of input 1; only inspected when input1format is "BED"
    filename2: path of input 2; always inspected
    input1format: format name of input 1
    returns: (ncol1, ncol2), the number of fields on the first line of
             each file
    exits with status 1 when an inspected file has fewer than 3 fields
    '''
    # file1
    # read the first line, see how many fields
    # any format other than BED is not inspected and is reported as 6 columns
    ncol1 = 6
    if input1format == "BED":
        ncol1 = _columnCountOrExit(filename1)

    # file2
    ncol2 = _columnCountOrExit(filename2)

    return ncol1,ncol2
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
58
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
59
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
def makeBed(filename,ncol):
    '''
    add up to 6 column

    Every line of filename is stripped and padded with placeholder
    columns so that a 3-, 4- or 5-column file ends up with 6 columns.

    filename: path of the input interval file
    ncol: number of columns the input file has
    returns: path of the padded copy (filename + '.tmp.bed'); for any
             other ncol the copy is created but left empty
    '''
    # text to append per line for each supported input width
    padding = {
        3: '\t.\t0\t+\n',
        4: '\t0\t+\n',
        5: '\t+\n',
    }
    outfile = filename+'.tmp.bed'
    suffix = padding.get(ncol)
    # context managers close both files even if a read or write fails
    with open(filename) as f:
        with open(outfile,'w') as outf:
            if suffix is not None:
                for line in f:
                    outf.write(line.strip()+suffix)
    return outfile
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
79
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
def makeWindow(filename,window):
    '''
    replace every feature by a window of 2 x window bp around its center

    filename: path of a TAB-delimited interval file (columns 2 and 3 are
              start and end); a first line containing 'track' is skipped
    window: half-width of the new window
    returns: path of the window file
             (filename + '-window=' + str(window) + '.tmp.bed')

    An existing file with that name is reused without being rewritten.
    Features whose window would start before position 0 are left out.
    '''
    outfile = filename+'-window='+str(window)+'.tmp.bed'
    if os.path.exists(outfile):
        return outfile

    with open(filename) as f:
        lines = f.readlines()
    # an empty input has no first line to test for a track header
    if lines and 'track' in lines[0]:
        del lines[0]

    with open(outfile,'w') as out:
        for line in lines:
            flds = line.strip().split('\t')

            #new position
            # floor division keeps the coordinates integral on Python 2 and 3
            center = (int(flds[1]) + int(flds[2]))//2
            start = center - window
            end = center + window
            if start >= 0:
                flds[1] = str(start)
                flds[2] = str(end)
                out.write('\t'.join(flds)+'\n')
    return outfile
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
103
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
def groupReadsMapped2aRegion(filename,ncol):
    '''
    read output from intersectBED
    find all reads mapped to each region

    filename: path of the TAB-delimited overlap file; columns 2 and 3 of
              each line are the read start and end, and the region
              columns begin at column offset ncol
    ncol: number of columns that belong to the read
    returns: (allReadsStart, allReadsEnd, regionStrand, regionStart,
             regionEnd), five dicts keyed by the first four region
             columns joined with '_'
    exits with status 1 when filename cannot be opened
    '''
    try:
        f = open(filename)
    #If filename cannot be opened, print an error message and exit
    except IOError:
        print("could not open "+filename+" Are you sure this file exists?")
        sys.exit(1)

    allReadsStart = {}
    allReadsEnd = {}
    regionStrand = {}
    regionStart = {}
    regionEnd = {}

    # stream the file line by line and make sure it is closed afterwards
    with f:
        for line in f:
            if not line.strip():
                # a blank line carries no record
                continue
            flds = line.strip().split('\t')
            key = '_'.join(flds[ncol:(ncol+4)])
            if key not in allReadsStart:
                allReadsStart[key] = list()
                allReadsEnd[key] = list()
            allReadsStart[key].append(int(flds[1]))
            allReadsEnd[key].append(int(flds[2]))
            regionStrand[key] = flds[ncol+5]
            regionStart[key] = int(flds[ncol+1])
            regionEnd[key] = int(flds[ncol+2])
    return (allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd)
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
136
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
137
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
def createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,nbins):
    '''
    build a per-region coverage profile

    Every region gets a list of nbins counters, and each read of the
    region is added to it through updateRegionCount.

    returns: (RegionProfile, nRead), two dicts keyed by region:
             the bin counters and the number of reads in the region
    '''
    RegionProfile = {}
    nRead = {} # num of all reads in the region
    for region, starts in allReadsStart.items():
        profile = [0]*nbins
        RegionProfile[region] = profile
        nRead[region] = len(starts)
        for idx, readStart in enumerate(starts):
            profile = updateRegionCount(
                profile,
                readStart,
                allReadsEnd[region][idx],
                regionStart[region],
                regionEnd[region],
                regionStrand[region],
                nbins)
            RegionProfile[region] = profile
    return RegionProfile,nRead
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
152
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
def updateRegionCount(RegionCount,readStart,readEnd,regionStart,regionEnd,strand,nbins):
    '''
    each region is divided into nbins,
    add 1 to each bin covered by the read

    RegionCount: list of nbins counters, updated in place
    readStart, readEnd: coordinates of the read
    regionStart, regionEnd: coordinates of the region
    strand: '-' mirrors the bin order; any other value is treated as '+'
    nbins: number of bins the region is divided into
    returns: RegionCount (the same list object)
    '''
    L = regionEnd-regionStart
    if L <= 0:
        # a region without positive length cannot be binned; leave the
        # counters untouched instead of dividing by zero
        return RegionCount
    # floor division gives integral bin indices on Python 2 and 3 alike;
    # the indices are clamped to the bins that exist
    start = max(0, nbins*(readStart-regionStart)//L)
    end = min(nbins, nbins*(readEnd-regionStart)//L)
    for i in range(start,end):
        # if the 6th column of the input is not strand, will treat as + strand by default
        idx = nbins-1-i if strand == '-' else i
        RegionCount[idx] = RegionCount[idx] + 1
    return RegionCount
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
172
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
def saveProfile(filename,Profile,nRegion):
    '''
    write one TAB-delimited line per region to filename:
    region name, nRegion[region], then every value of Profile[region]

    filename: path of the output file (overwritten)
    Profile: dict mapping region name to a list of values
    nRegion: dict mapping region name to the value of the second column
    '''
    # the context manager flushes and closes the file before returning
    with open(filename,'w') as out:
        for regionType in Profile:
            out.write(regionType+'\t'+str(nRegion[regionType])+'\t'+'\t'.join(map(str,Profile[regionType]))+'\n')
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
178
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
def saveSummary(filename,Profile,nbin):
    '''
    write two one-line summary files next to filename

    filename + '.summary': filename, number of features, then the sum
    over all features of each of the first nbin bins

    filename + '.standarderror': filename, number of features, then for
    each bin the square root of the summed squared deviations from the
    bin mean, divided by the number of features
    '''
    nfeat = len(Profile)
    prefix = filename+'\t'+str(nfeat)+'\t'

    totals = [0]*nbin
    with open(filename+'.summary','w') as out:
        for counts in Profile.values():
            for b in range(nbin):
                totals[b] += counts[b]
        out.write(prefix+'\t'.join(map(str,totals))+'\n')

    # calculate standard error
    with open(filename+'.standarderror','w') as out:
        errs = []
        for b in range(nbin):
            mean = float(totals[b])/nfeat
            sumsq = 0.0
            for counts in Profile.values():
                sumsq = sumsq + (counts[b] - mean)**2
            errs.append(sumsq**0.5 / nfeat)
        out.write(prefix+'\t'.join(map(str,errs))+'\n')
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
200
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
def main():
    '''
    command-line entry point

    Steps: parse the options, check both inputs, pad input B to 6
    columns and optionally turn it into fixed windows, run intersectBed
    (unless -p is given), bin the overlaps per feature of B, save the
    profile or its summary, and optionally plot the average profile
    through an R script. Files matching *tmp.* in the current directory
    are removed at the end.
    '''
    usage = "usage: %prog [options] -a inputA -b inputB"
    parser = OptionParser(usage)
    parser.add_option("-a", dest="inputA",
                      help="(required) input file A, interval (first 3 columns are chrN, start and end) or BAM format. The script computes the depth of coverage of features in file A across the features in file B" )
    parser.add_option("-b",dest="inputB",
                      help="(required) input file B, interval file" )
    parser.add_option("-f",dest="aformat",default="BED",
                      help="Format of input file A. Can be BED (default) or BAM")
    # default=0 (no window): without it the option is None, and the plotting
    # code below fails on -options.window
    parser.add_option("-w",type='int',dest="window",default=0,
                      help="Generate new inputB by making a window of 2 x WINDOW bp (in total) flanking the center of each input feature" )
    parser.add_option("-n", type="int", dest="nbins",default=100,
                      help="number of bins. Features in B are binned, and the coverage is computed for each bin. Default is 100")
    parser.add_option("-s",action="store_true", dest="strandness",
                      help="enforce strandness: require overlapping on the same strand. Default is off")
    parser.add_option("-p",action="store_true", dest="plot",default=False,
                      help="load existed intersectBed outputfile")
    parser.add_option("-q", action="store_true", dest="quiet",default=False,
                      help="suppress output on screen")
    parser.add_option("-o", dest="output_data",
                      help="(optional) output coverage file (txt) name." )
    parser.add_option("-v", dest="output_plot",
                      help="(optional) output plot (pdf) file name." )
    parser.add_option("-l", dest="plot_title", default="",
                      help="(optional) output title of the plot." )
    parser.add_option("--ylim", dest="ylim", default="min,max",
                      help="(optional) ylim of the plot" )
    parser.add_option("--summary-only", action="store_true", dest="summary_only",default=False,
                      help="save profile summary only (no data for individual features)")
    parser.add_option("--compute-se", action="store_true", dest="compute_se",default=False,
                      help="compute and plot standard deviation for each bin. used when --summary-only is on")
    parser.add_option("--profile-only", action="store_true", dest="profile_only",default=False,
                      help="save profile only (no plot)")
    parser.add_option("--span", type="float", dest="span",default=0.1,
                      help="loess span smooth parameter, 0.1 ~ 1")

    (options, args) = parser.parse_args()

    if options.inputA is None or options.inputB is None:
        parser.error("Please specify two input files!!")

    if not options.quiet:
        print("checking input file format...")

    ncol,ncol2 = checkFormat(options.inputA ,options.inputB,options.aformat)

    if ncol2 < 6:
        options.inputB = makeBed(options.inputB,ncol2)
        if not options.quiet:
            print("fill up 6 columns")

    # one flag decides everywhere whether features were replaced by windows
    use_window = options.window > 0

    if use_window:
        if not options.quiet:
            print("making windows from "+options.inputB+"...")
        options.inputB = makeWindow(options.inputB,options.window)

    output = combineFilename(str(options.inputA),str(options.inputB))

    if not options.plot:
        # NOTE(review): the command is assembled as a shell string, so file
        # names containing spaces or shell metacharacters will break it
        if options.aformat == "BAM":
            cmd = "intersectBed -abam "+str(options.inputA)+" -b "+str(options.inputB) + ' -bed -split '
        else:
            cmd = "intersectBed -a "+str(options.inputA)+" -b "+str(options.inputB)
        if options.strandness:
            cmd = cmd + ' -s'
        cmd = cmd +" -wo > "+ output+'-intersect.tmp.bed'
        if not options.quiet:
            print("search for overlappings: "+cmd)
        status = os.system(cmd)
        if status != 0:
            sys.exit(1)

    if not options.quiet:
        print('group reads mapped to the same region...')

    allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd = groupReadsMapped2aRegion(output+'-intersect.tmp.bed',ncol)

    if len(allReadsStart) == 0:
        if not options.quiet:
            print('no overlap found!!')
        os.system('rm *tmp.*')
        sys.exit(1)

    if not options.quiet:
        print('count number of reads mapped to each bin...')

    RegionProfile,nRead = createRegionProfile(allReadsStart,allReadsEnd,regionStrand,regionStart,regionEnd,options.nbins)

    if options.output_data is None:
        options.output_data = output+'.txt'

    if options.summary_only:
        saveSummary(options.output_data,RegionProfile,options.nbins)
    else:
        saveProfile(options.output_data,RegionProfile,nRead)

    if not options.quiet:
        print('results saved to: '+ options.output_data)

    if not (options.summary_only or options.profile_only ):
        # visualize

        if use_window:
            xlab = 'relative position (bp)'
        else:
            xlab = 'relative position (bins)'

        if options.output_plot is None:
            options.output_plot = output+'.pdf'

        title = options.plot_title+'\n n = '+str(len(RegionProfile))

        with open("tmp.r","w") as rscript:
            rscript.write("x <- read.table('"+options.output_data+"')\n")
            rscript.write("pdf('"+options.output_plot+"')\n")
            rscript.write("avg <- colSums(x[,3:ncol(x)])/nrow(x)\n")
            # per-column sd via apply(): sd() on a data frame is defunct in R >= 3.0.0
            rscript.write("err <- apply(x[,3:ncol(x)],2,sd)/sqrt(nrow(x))\n")

            if use_window:
                rscript.write("xticks <- seq("+str(-options.window)+","+str(options.window)+",length.out="+str(options.nbins)+")\n")
            else:
                rscript.write("xticks <- seq("+str(options.nbins)+")\n")

            if options.ylim != 'min,max':
                rscript.write("ylim=c("+options.ylim+")\n")
            else:
                rscript.write("ylim=c(min(avg-err),max(avg+err))\n")
            rscript.write("par(cex=1.5)\n")
            #smooth
            if options.span >= 0.1:
                rscript.write("avg = loess(avg~xticks,span="+str(options.span)+")$fitted\n")
                rscript.write("err = loess(err~xticks,span="+str(options.span)+")$fitted\n")
            rscript.write("plot(xticks,avg,ylab='average coverage',main='"+title+"',xlab='"+xlab+"',type='l',lwd=0,ylim=ylim)\n")
            rscript.write("polygon(c(xticks,rev(xticks)),c(avg+err,rev(avg-err)),col='slateblue1',border=NA)\n")
            rscript.write("lines(xticks,avg,type='l',lwd=1)\n")
            rscript.write("dev.off()\n")

        os.system("R --vanilla < tmp.r")

    # remove intermediate output
    # NOTE(review): the glob removes every file matching *tmp.* in the current
    # directory, not only the files created by this run
    os.system('rm *tmp.*')
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
350
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
351
f0dc65e7f6c0 Uploaded
xuebing
parents:
diff changeset
# run the command-line entry point only when the file is executed as a script
if __name__ == "__main__":
    main()