Mercurial > repos > xuef > vdm_plot
comparison my_VDM_tool.r @ 0:67769f496e2c draft
Uploaded
author | xuef |
---|---|
date | Thu, 05 Nov 2020 13:48:08 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:67769f496e2c |
---|---|
1 #!/usr/bin/env Rscript | |
2 | |
3 # rm(list=ls()) | |
4 | |
5 library("getopt") | |
6 | |
7 #input from trailing line arguments | |
8 args <- commandArgs(trailingOnly = TRUE) | |
9 #read the options from input commandArgs | |
10 option_specification = matrix(c( | |
11 'inf','i01',1,'character', | |
12 'itype','i02',2,'character', | |
13 'thrup','i03',2,'double', | |
14 'thrlow','i04',2,'double', | |
15 'allr','i05',2,'character', | |
16 'complex','i06',2,'logical', | |
17 'lsp','i07',2,'double', | |
18 'pcol','i08',2,'character', | |
19 'lcol','i09',2,'character', | |
20 | |
21 'xstand','i10',2,'logical', | |
22 'bsize','i11',2,'integer', | |
23 'bnorm','i12',2,'logical', | |
24 'exclthr','i13',2,'double', | |
25 'exclcol','i14',2,'character', | |
26 | |
27 'parn','o1',2,'character', | |
28 'outn','o2',2,'character', | |
29 'pdfn','o3',2,'character' | |
30 ), byrow=TRUE, ncol=4) | |
31 | |
32 #parse options | |
33 options = getopt(option_specification) | |
34 | |
35 # #FOR DEBUGGING | |
36 # options<-NULL | |
37 # ###INPUT FILE | |
38 # setwd("D:/Dropbox/_galaxy/") | |
39 # options$inf<-"nor22.vcf" | |
40 # ###BASE OPTIONS | |
41 # #interval type | |
42 # options$itype<-"C.elegans" | |
43 # #user interval type | |
44 # options$userif<-NULL | |
45 # #quality filter | |
46 # options$qual<-200 | |
47 # #for scaling 0-1 = upper threshold for what is considered homozygous | |
48 # options$thrup<-1 | |
49 # #for scaling 0-1 = lower threshold for what is considered homozygous | |
50 # options$thrlow<-0 | |
51 # #type of allelic ratio (AB/ratio)+--------------------------------------------------------- | |
52 # options$allr<-"AB" | |
53 # #include complex variants | |
54 # options$complex<-FALSE | |
55 # ###ADDITIONAL VARIANT EXCLUSION OPTIONS | |
56 # #files with variants to exclude | |
57 # options$exclf<-NULL | |
58 # options$exclf<-c("nor22chunk1.txt","nor22chunk5.txt") | |
59 # #for variants to exclude, bottom threshold for which to be used, i.e. 0=ALL, 1=HOM only, 0.8=near HOM) | |
60 # options$exclthr<-0 | |
61 # #additional colour option for pre-subtraction line | |
62 # options$exclcol<-"green" | |
63 # ###PLOT OPTIONS | |
64 # #loess span | |
65 # options$lsp<-0.4 | |
66 # #point colour | |
67 # options$pcol<-"black" | |
68 # #loess plot colour | |
69 # options$lcol<-"red" | |
70 # #standardize x-axis interval (e.g. 1Mb interval) | |
71 # options$xstand<-TRUE | |
72 # #bin size for barplot | |
73 # options$bsize<-1000000 | |
74 # #normalization for barplot | |
75 # options$bnorm<-FALSE | |
76 # ###OUTPUT OPTIONS | |
77 # #custom files names (may not work for Galaxy) | |
78 # options$outn<-paste(gsub("vcf","",options$inf),"_output_q",options$qual,"-",paste(options$exclf,sep="",collapse=""),".txt",sep="") | |
79 # options$parn<-paste(gsub("vcf","",options$inf),"_parsed_q",options$qual,"-",paste(options$exclf,sep="",collapse=""),".txt",sep="") | |
80 # options$pdfn<-paste(gsub("vcf","",options$inf),"_plot_q",options$qual,"-",paste(options$exclf,sep="",collapse=""),".pdf",sep="") | |
81 # #fixed file names (will work in Galaxy) | |
82 # # options$outn<-"vcf_output.txt" | |
83 # # options$parn<-"vcf_parsedinput.txt" | |
84 # # options$pdfn<-"vdm_mapping_plot.pdf" | |
85 | |
86 | |
87 'inf','i01',1,'character', | |
88 'itype','i02',2,'character', | |
89 'qual','i03',2,'double', | |
90 'thrup','i04',2,'double', | |
91 'thrlow','i05',2,'double', | |
92 'allr','i06',2,'character', | |
93 'complex','i07',2,'logical', | |
94 'lsp','i08',2,'double', | |
95 'pcol','i09',2,'character', | |
96 'lcol','i10',2,'character', | |
97 | |
98 'xstand','i11',2,'logical', | |
99 'bsize','i12',2,'integer', | |
100 'bnorm','i13',2,'logical', | |
101 'exclthr','i14',2,'double', | |
102 'exclcol','i15',2,'character', | |
103 | |
104 'parn','o1',2,'character', | |
105 'outn','o2',2,'character', | |
106 'pdfn','o3',2,'character' | |
107 | |
108 | |
109 myfunction<-function(inf,itype,thrup,thrlow,allr,complex,lsp,pcol,lcol,xstand,bsize,bnorm,exclthr,exclcol,parn,outn,pdfn){ | |
110 | |
111 #PARAMETERS | |
112 # filename<-options$inf | |
113 # interval_type<-options$itype | |
114 # user_interval_file<-options$userif | |
115 # read_qual<-options$qual | |
116 # threshold_upper<-options$thrup | |
117 # threshold_lower<-options$thrlow | |
118 # allele_ratio<-options$allr | |
119 # incl_complex<-options$complex | |
120 # loess_span<-options$lsp | |
121 # plot_color<-options$pcol | |
122 # loess_color<-options$lcol | |
123 # #transparency for selected colour (to see plot points underneath) | |
124 # xaxis_standard<-options$xstand | |
125 # bin_size<-options$bsize | |
126 # bfreq_norm<-options$bnorm | |
127 # exclusion_list<-options$exclf | |
128 # excl_threshold<-options$exclthr | |
129 # excl_loess_color<-options$exclcol | |
130 # vcfoutput_filename<-options$outn | |
131 # vcfparsed_filename<-options$parn | |
132 # pdf_filename<-options$pdfn | |
133 | |
134 # plot_color<-rgb(c(col2rgb(plot_color)[1]),c(col2rgb(plot_color)[2]),c(col2rgb(plot)[3]),max=255,alpha=150) | |
135 # loess_color<-rgb(c(col2rgb(loess_color)[1]),c(col2rgb(loess_color)[2]),c(col2rgb(loess_color)[3]),max=255,alpha=150) | |
136 # excl_loess_color<-rgb(c(col2rgb(excl_loess_color)[1]),c(col2rgb(excl_loess_color)[2]),c(col2rgb(excl_loess_color)[3]),max=255,alpha=150) | |
137 | |
138 inf,itype,thrup,thrlow,allr,complex,lsp,pcol,lcol,xstand,bsize,bnorm,exclthr,exclcol,parn,outn,pdfn | |
139 | |
140 filename<-inf | |
141 interval_type<-itype | |
142 # user_interval_file<-userif | |
143 read_qual<-as.numeric(qual) | |
144 threshold_upper<-as.numeric(thrup) | |
145 threshold_lower<-as.numeric(thrlow) | |
146 allele_ratio<-allr | |
147 incl_complex<-complex | |
148 loess_span<-as.numeric(lsp) | |
149 plot_color<-pcol | |
150 loess_color<-lcol | |
151 xaxis_standard<-xstand | |
152 bin_size<-as.numeric(bsize) | |
153 bfreq_norm<-bnorm | |
154 # exclusion_list<-exclf | |
155 excl_threshold<-as.numeric(exclthr) | |
156 excl_loess_color<-exclcol | |
157 vcfoutput_filename<-outn | |
158 vcfparsed_filename<-parn | |
159 pdf_filename<-pdfn | |
160 | |
161 #transparency for selected colour (to see plot points underneath) | |
162 plot_color<-rgb(c(col2rgb(plot_color)[1]),c(col2rgb(plot_color)[2]),c(col2rgb(plot)[3]),max=255,alpha=150) | |
163 loess_color<-rgb(c(col2rgb(loess_color)[1]),c(col2rgb(loess_color)[2]),c(col2rgb(loess_color)[3]),max=255,alpha=150) | |
164 excl_loess_color<-rgb(c(col2rgb(excl_loess_color)[1]),c(col2rgb(excl_loess_color)[2]),c(col2rgb(excl_loess_color)[3]),max=255,alpha=150) | |
165 | |
166 ###FIXED PARAMETERS | |
167 #linkage scatter plot yaxis max value=1 | |
168 sp_yaxis<-1 | |
169 #chromosome intervals in Mb rather than custom | |
170 interval_unit<-1000000 | |
171 | |
172 | |
173 ###################### | |
174 ###READ IN VCF FILE | |
175 #extract column names | |
176 vcf_readin<-readLines(filename) | |
177 #find header line, i.e. last line to begin with # | |
178 for(l in 1:length(vcf_readin)){ | |
179 vcf_readinl<-vcf_readin[l] | |
180 if(substr(vcf_readinl,1,1)=="#"){next} | |
181 else if(substr(vcf_readinl,1,1)!="#"){rowline<-l-1;break} | |
182 } | |
183 vcf_header<-vcf_readin[rowline] | |
184 #e.g. CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\trgSM" | |
185 vcf_header<-gsub("#","",vcf_header) | |
186 vcf_colnames<-unlist(strsplit(vcf_header,"\t")) | |
187 | |
188 #extract data (hashed vcf header skipped with read.table) | |
189 vcf_rtable<-read.table(filename,sep="\t",stringsAsFactors=FALSE) | |
190 names(vcf_rtable)<-vcf_colnames | |
191 | |
192 ###################### | |
193 ###PREPARE DATA | |
194 | |
195 vcfinfo_dat<-NULL | |
196 vcfinfo_pdat<-NULL | |
197 multiallele_counter<-0 | |
198 diviserror_counter<-0 | |
199 for(i in c(1:nrow(vcf_rtable))){ | |
200 vcf_line<-vcf_rtable[i,] | |
201 #to speed up runtime- quality filter here | |
202 if(vcf_line$QUAL>=read_qual){ | |
203 #remove chrom or chr prefix from chromosome value | |
204 if(grepl("chrom",vcf_line$CHROM,ignore.case=TRUE)==TRUE){ | |
205 vcf_line$CHROM<-gsub("chrom","",vcf_line$CHROM,ignore.case=TRUE) | |
206 }else if(grepl("chr",vcf_line$CHROM,ignore.case=TRUE)==TRUE){ | |
207 vcf_line$CHROM<-gsub("chr","",vcf_line$CHROM,ignore.case=TRUE) | |
208 } | |
209 #PARSE INFO | |
210 vcfinfo_split<-strsplit(vcf_line$INFO,split=";") | |
211 vcfinfo_coln<-gsub("=.*","",unlist(vcfinfo_split)) | |
212 vcfinfo_cold<-gsub(".*=","",unlist(vcfinfo_split)) | |
213 vcfinfo_ldat<-data.frame(t(vcfinfo_cold),stringsAsFactors=FALSE) | |
214 names(vcfinfo_ldat)<-vcfinfo_coln | |
215 | |
216 #skip if commas in values to avoid returning errors | |
217 if(grepl(",",vcfinfo_ldat$AO)==TRUE){ | |
218 multiallele_counter<-multiallele_counter+1 | |
219 next | |
220 } | |
221 #skip divide by zero errors (under "ratio" setting for ratio calculation) | |
222 if(as.numeric(vcfinfo_ldat$AO)+as.numeric(vcfinfo_ldat$RO)=="0"){ | |
223 diviserror_counter<-diviserror_counter+1 | |
224 next | |
225 } | |
226 | |
227 #specific accounting for nonstandard categories | |
228 #LOF columns only present for loss-of-function variants + assign NA values to all other variants | |
229 if(("LOF" %in% names(vcfinfo_ldat))==TRUE){ | |
230 LOF<-vcfinfo_ldat$LOF | |
231 vcfinfo_ldat<-vcfinfo_ldat[,!names(vcfinfo_ldat) %in% "LOF"] | |
232 vcfinfo_ldat<-cbind(vcfinfo_ldat,LOF) | |
233 }else{ | |
234 LOF<-"NA" | |
235 vcfinfo_ldat<-cbind(vcfinfo_ldat,LOF) | |
236 } | |
237 #NMD columns only present for nonsense-mediated-decay variants + assign NA values to all other variants | |
238 if(("NMD" %in% names(vcfinfo_ldat))==TRUE){ | |
239 NMD<-vcfinfo_ldat$NMD | |
240 vcfinfo_ldat<-vcfinfo_ldat[,!names(vcfinfo_ldat) %in% "NMD"] | |
241 vcfinfo_ldat<-cbind(vcfinfo_ldat,NMD) | |
242 }else{ | |
243 NMD<-"NA" | |
244 vcfinfo_ldat<-cbind(vcfinfo_ldat,NMD) | |
245 } | |
246 | |
247 #general accounting for nonstandard categories | |
248 | |
249 | |
250 #PARSE ANNOTATION | |
251 ann_rparsed<-unlist(strsplit(vcfinfo_ldat$ANN[1],split="\\|"))[1:20] | |
252 ann_rparsed[ann_rparsed==""]<-"novalue" | |
253 ann_parsed<-data.frame(t(ann_rparsed),stringsAsFactors=FALSE) | |
254 names(ann_parsed)<-paste("ANN",c(1:dim(ann_parsed)[2]),sep="") | |
255 #remove duplicate redundant INFO column (fully parsed) | |
256 vcf_line<-vcf_line[,names(vcf_line)!="INFO"] | |
257 | |
258 #dataset keeping unparsed annotation (full) | |
259 vcfinfo_pldat<-cbind(vcf_line,vcfinfo_ldat) | |
260 vcfinfo_pdat<-rbind(vcfinfo_pdat,vcfinfo_pldat) | |
261 | |
262 #dataset keeping parsed annotation (partial parsed-for relevant) | |
263 vcfinfo_ldat<-vcfinfo_ldat[,names(vcfinfo_ldat)!="ANN"] | |
264 #append copy of original data to parsed data | |
265 vcfinfo_lldat<-cbind(vcf_line,vcfinfo_ldat,ann_parsed) | |
266 vcfinfo_dat<-rbind(vcfinfo_dat,vcfinfo_lldat) | |
267 } | |
268 } | |
269 print(paste("rows with multiple alleles skipped: ",multiallele_counter,sep="")) | |
270 # print(paste("rows with AO+RO=0 (not multiple alleles) skipped: ",diviserror_counter,sep="")) | |
271 | |
272 #ENSURE CORRECT DATATYPES | |
273 #convert dataframe columns of factor type to character type | |
274 vcfinfo_dat<-data.frame(lapply(vcfinfo_dat,as.character),stringsAsFactors=FALSE) | |
275 #convert to numeric if column is numeric and not string | |
276 for(n in c(1:dim(vcfinfo_dat)[2])){ | |
277 #suppress warnings when columns with strings encountered (not converted) | |
278 suppressWarnings( | |
279 colnum_index<-!is.na(as.numeric(vcfinfo_dat[,n])) | |
280 ) | |
281 if(all(colnum_index)==TRUE){ | |
282 vcfinfo_dat[,n]<-as.numeric(vcfinfo_dat[,n]) | |
283 } | |
284 } | |
285 | |
286 ###################### | |
287 #RATIO CALCULATION | |
288 #ratio calculation from AO and RO | |
289 RATIO<-c(vcfinfo_dat$AO/(vcfinfo_dat$AO+vcfinfo_dat$RO)) | |
290 #add adj_AB for AB=0->AO=1 conversion | |
291 adj_AB<-replace(vcfinfo_dat$AB,vcfinfo_dat$AB==0,1) | |
292 vcfinfo_dat<-cbind(vcfinfo_dat,RATIO,adj_AB) | |
293 vcfinfo_dat<-vcfinfo_dat[with(vcfinfo_dat,order(CHROM,POS)),] | |
294 | |
295 #EXCLUDE COMPLEX VARIANTS | |
296 if(incl_complex==FALSE){ | |
297 vcfinfo_dat<-subset(vcfinfo_dat,CIGAR=="1X") | |
298 } | |
299 | |
300 ###################### | |
301 #SUBTRACT VARIANTS FROM EXCLUSION LIST | |
302 if(length(exclusion_list)>0){ | |
303 #keep copy of pre-subtraction data for later plotting | |
304 vcfinfo_origdat<-vcfinfo_dat | |
305 | |
306 #identifiers for exclusion list based on CHROM/POS/REF/ALT | |
307 index1<-paste(vcfinfo_dat$CHROM,vcfinfo_dat$POS,sep="_") | |
308 index2<-paste(vcfinfo_dat$CHROM,vcfinfo_dat$POS,vcfinfo_dat$REF,sep="_") | |
309 index3<-paste(vcfinfo_dat$CHROM,vcfinfo_dat$POS,vcfinfo_dat$REF,vcfinfo_dat$ALT,sep="_") | |
310 vcfinfo_dat<-cbind(vcfinfo_dat,index1,index2,index3) | |
311 | |
312 print(paste("before: ",nrow(vcfinfo_dat),sep="")) | |
313 #loop and subtract through exclusion lists (if multiple files) | |
314 for(exclusion_ind in exclusion_list){ | |
315 exclin<-read.table(exclusion_ind,header=TRUE) | |
316 | |
317 #THRESHOLD FILTER ON EXCLUSION LIST VARIANTS | |
318 if(allele_ratio=="AB"){ | |
319 exclin<-subset(exclin,adj_AB>=excl_threshold) | |
320 } | |
321 if(allele_ratio=="ratio"){ | |
322 exclin<-subset(exclin,ratio>=excl_threshold) | |
323 } | |
324 | |
325 #identifiers for vcf data based on CHROM/POS/REF/ALT | |
326 index1<-paste(exclin$CHROM,exclin$POS,sep="_") | |
327 index2<-paste(exclin$CHROM,exclin$POS,exclin$REF,sep="_") | |
328 index3<-paste(exclin$CHROM,exclin$POS,exclin$REF,exclin$ALT,sep="_") | |
329 exclin<-cbind(exclin,index1,index2,index3) | |
330 #exclude based on CHROM+POS+REF+ALT | |
331 vcfinfo_dat<-subset(vcfinfo_dat,!(index3 %in% exclin$index3)) | |
332 } | |
333 print(paste("after: ",nrow(vcfinfo_dat),sep="")) | |
334 } | |
335 | |
336 ###################### | |
337 #WRITE TO OUTPUT | |
338 #select relevant columns 2 variant type; 4 gene; !5 wormbase ID; 8 type change; 10 nucleotide change; 11 amino acid change; 16 warning message | |
339 vcfinfo_simp<-subset(vcfinfo_dat,select=c("CHROM","POS","QUAL","DP","REF","ALT","AB","AO","RO","RATIO","adj_AB","ANN2","ANN4","ANN8","ANN10","ANN11","ANN16")) | |
340 names(vcfinfo_simp)<-c("CHROM","POS","QUAL","DP","REF","ALT","AB","AO","RO","RATIO","adj_AB","VARTYPE","GENE","TYPE","NTCHANGE","PRCHANGE","WARNINGS") | |
341 vcfsimp_dat<-vcfinfo_simp[with(vcfinfo_simp,order(CHROM,POS)),] | |
342 | |
343 #write table with quality filtered variants for VDM plotting and relevant columns | |
344 try( | |
345 write.table(vcfsimp_dat,vcfoutput_filename,sep="\t",quote=FALSE,row.names=FALSE) | |
346 ,silent=TRUE) | |
347 #write table with all unfiltered variants and all columns including parsed INFO | |
348 try( | |
349 write.table(vcfinfo_pdat,vcfparsed_filename,sep="\t",quote=FALSE,row.names=FALSE) | |
350 ,silent=TRUE) | |
351 | |
352 | |
353 ###################### | |
354 ###CHROMOSOME (INTERVAL) ARRANGEMENT | |
355 #define chromosome and chromosome size in Mb | |
356 if(interval_type == 'C.elegans'){ | |
357 chrom_n<-c('I','II','III','IV','V','X') | |
358 chrom_mb<-c(16,16,14,18,21,18) | |
359 interval_frame<-data.frame(chrom_n,chrom_mb) | |
360 } else if(interval_type == 'Zebrafish'){ | |
361 chrom_n<-c('1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25') | |
362 chrom_mb<-c(61,61,64,63,76,60,78,57,59,47,47,51,55,54,48,59,54,50,51,56,45,43,47,44,39) | |
363 interval_frame<-data.frame(chrom_n,chrom_mb) | |
364 } else if(interval_type == 'Brachypodium'){ | |
365 chrom_n<-c('1','2','3','4','5') | |
366 chrom_mb<-c(75,60,60,50,30) | |
367 interval_frame<-data.frame(chrom_n,chrom_mb) | |
368 } else if(interval_type == 'Arabidopsis'){ | |
369 chrom_n<-c('1','2','3','4','5') | |
370 chrom_mb<-c(31, 20,24,19,27 ) | |
371 interval_frame<-data.frame(chrom_n,chrom_mb) | |
372 } else{ | |
373 #user interval file- no headers, with chromosome in column 1 (format CHR# or CHROM#) and size in Mb (rounded up) in column 2 | |
374 user_interval_type<-read.table(user_interval_file) | |
375 if(grepl("chrom",user_interval_type[1,1],ignore.case=TRUE)==TRUE){ | |
376 user_interval_type[,1]<-gsub("chrom","",user_interval_type[,1],ignore.case=TRUE) | |
377 }else if(grep("chr",user_interval_type[1,1],ignore.case=TRUE)==TRUE){ | |
378 user_interval_type[,1]<-gsub("chr","",user_interval_type[,1],ignore.case=TRUE) | |
379 } | |
380 chrom_n<-user_interval_type[,1] | |
381 chrom_mb<-user_interval_type[,2] | |
382 interval_frame<-data.frame(chrom_n,chrom_mb) | |
383 } | |
384 names(interval_frame)<-c("CHROM","INTERVAL") | |
385 | |
386 | |
387 ###################### | |
388 ###PLOTTING | |
389 #VDM SCATTER PLOT | |
390 #save to pdf | |
391 pdf(file=pdf_filename,width=9,height=8) | |
392 #par(mfrow=c(2,3)) | |
393 for(chromind in interval_frame$CHROM){ | |
394 #subset by data by chromosome for plotting | |
395 intervalind<-interval_frame$INTERVAL[interval_frame$CHROM==chromind] | |
396 chr_dat<-subset(vcfsimp_dat,CHROM==chromind,silent=TRUE) | |
397 | |
398 #same subsetting by chromosome for pre-subtraction data | |
399 if(length(exclusion_list)>0){ | |
400 chr_origdat<-subset(vcfinfo_origdat,CHROM==chromind,silent=TRUE) | |
401 } | |
402 | |
403 #define x-axis upper limit | |
404 if(xaxis_standard==TRUE){ | |
405 #for standardized x-axis (max x-axis chromosome length) | |
406 scupper_xaxis<-max(interval_frame$INTERVAL) | |
407 scupper_xval<-scupper_xaxis*interval_unit | |
408 } else if(xaxis_standard==FALSE){ | |
409 scupper_xaxis<-intervalind | |
410 scupper_xval<-intervalind*interval_unit | |
411 } | |
412 | |
413 if(allele_ratio=="AB"){ | |
414 plot(chr_dat$POS,chr_dat$adj_AB,cex=0.60,xlim=c(0,scupper_xval),ylim=c(0,sp_yaxis),main=paste("Chr",chromind," Variant Discovery Mapping",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Ratio of Variant Reads/Total Reads [AB]',pch=10, col=plot_color,xaxt='n') | |
415 try(lines(loess.smooth(chr_dat$POS,chr_dat$adj_AB,span=loess_span),lwd=5,col=loess_color)) | |
416 #plot loess curve for data without subtraction of exclusion variants | |
417 if(length(exclusion_list)>0){ | |
418 try(lines(loess.smooth(chr_origdat$POS,chr_origdat$adj_AB,span=loess_span),lwd=5,col=excl_loess_color)) | |
419 } | |
420 | |
421 axis(1,at=seq(0,scupper_xval,by=interval_unit),labels=c(0:scupper_xaxis)) | |
422 abline(h=seq(0,sp_yaxis,by=0.1),v=c(1:scupper_xaxis)*interval_unit,col="gray") | |
423 } else if(allele_ratio=="ratio"){ | |
424 plot(chr_dat$POS,chr_dat$RATIO,cex=0.60,xlim=c(0,scupper_xval),ylim=c(0,sp_yaxis),main=paste("Chr",chromind," Variant Discovery Mapping",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Ratio of Variant Reads/Total Reads [ratio]',pch=10, col=plot,xaxt='n') | |
425 try(lines(loess.smooth(chr_dat$POS,chr_dat$RATIO,span=loess_span),lwd=5,col=loess_color)) | |
426 #plot loess curve for data without subtraction of exclusion variants | |
427 if(length(exclusion_list)>0){ | |
428 try(lines(loess.smooth(chr_origdat$POS,chr_origdat$adj_AB,span=loess_span),lwd=5,col=excl_loess_color)) | |
429 } | |
430 axis(1,at=seq(0,scupper_xval,by=interval_unit),labels=c(0:scupper_xaxis)) | |
431 abline(h=seq(0,sp_yaxis,by=0.1),v=c(1:scupper_xaxis)*interval_unit,col="gray") | |
432 } | |
433 } | |
434 | |
435 ###################### | |
436 #graph barplots | |
437 location_index<-NULL | |
438 meanSNP_dat<-NULL | |
439 # prepare table of counts and calculations | |
440 for(chromind in interval_frame$CHROM){ | |
441 #for standardized x-axis | |
442 if(xaxis_standard==TRUE){ | |
443 intervalind<-max(interval_frame$INTERVAL)*1000000/bin_size | |
444 } else if(xaxis_standard==FALSE){ | |
445 intervalind<-interval_frame$INTERVAL[interval_frame$CHROM==chromind]*1000000/bin_size | |
446 } | |
447 #start intervals with **1 and end with **0 | |
448 interval_begin<-c(((0:(intervalind-1))*bin_size)+1) | |
449 interval_end<-c((1:intervalind)*bin_size) | |
450 | |
451 #define x-axis upper limit | |
452 if(xaxis_standard==TRUE){ | |
453 upper_xaxis<-max(interval_frame$INTERVAL) | |
454 } else if(xaxis_standard==FALSE){ | |
455 upper_xaxis<-interval_frame$INTERVAL[interval_frame$CHROM==chromind] | |
456 } | |
457 #prepare columns | |
458 snp_counter<-0 | |
459 purealt_counter<-0 | |
460 pureref_counter<-0 | |
461 het_counter<-0 | |
462 chr_mean<-0 | |
463 normed_freq<-0 | |
464 ratio<-0 | |
465 | |
466 interval_index<-data.frame(chromind,interval_begin,interval_end,snp_counter,purealt_counter,pureref_counter,het_counter,chr_mean,normed_freq) | |
467 chr_dat<-subset(vcfinfo_dat,CHROM==chromind) | |
468 #ratio calculation | |
469 ratio<-chr_dat$AO/(chr_dat$AO+chr_dat$RO) | |
470 | |
471 #if counter based on adj_AB or ratio | |
472 if(allele_ratio=="ratio"){ | |
473 chr_purealtdat<-subset(chr_dat,ratio>=threshold_upper)#;chr_purealtdat | |
474 chr_purerefdat<-subset(chr_dat,ratio<=threshold_lower)#;chr_purerefdat | |
475 chr_hetdat<-subset(chr_dat,ratio>threshold_lower & ratio<threshold_upper)#;chr_hetdat | |
476 } else if(allele_ratio=="AB"){ | |
477 chr_purealtdat<-subset(chr_dat,adj_AB>=threshold_upper)#;chr_purealtdat | |
478 chr_purerefdat<-subset(chr_dat,adj_AB<=threshold_lower)#;chr_purerefdat | |
479 chr_hetdat<-subset(chr_dat,adj_AB>threshold_lower & adj_AB<threshold_upper)#;chr_hetdat | |
480 } | |
481 #if chromosome with data, count number of snps within each bin (positions rounded up to nearest bin), else skip to next chromosome | |
482 if(dim(chr_dat)[1]>0){ | |
483 for(i in 1:dim(chr_dat)[1]){ | |
484 chr_datind<-chr_dat[i,] | |
485 #round up to nearest bin-size interval | |
486 chr_datind_upper<-ceiling(chr_datind$POS/bin_size)*bin_size | |
487 interval_coln<-NULL;interval_rown<-NULL | |
488 #identify row and and counter column to increment | |
489 interval_coln<-which(names(interval_index)=="snp_counter") | |
490 interval_rown<-match(chr_datind_upper,interval_index$interval_end) | |
491 interval_index[interval_rown,interval_coln]<-c(interval_index$snp_counter[interval_rown]+1) | |
492 } | |
493 }else{ | |
494 next | |
495 } | |
496 ##if chromosome with pure AO, count number of snps with each bin (positions rounded up to nearest bin) | |
497 if(dim(chr_purealtdat)[1]>0){ | |
498 for(i in 1:dim(chr_purealtdat)[1]){ | |
499 chr_purealtind<-chr_purealtdat[i,] | |
500 chr_purealtind_upper<-ceiling(chr_purealtind$POS/bin_size)*bin_size | |
501 interval_coln<-NULL;interval_rown<-NULL | |
502 interval_coln<-which(names(interval_index)=="purealt_counter") | |
503 interval_rown<-match(chr_purealtind_upper,interval_index$interval_end) | |
504 interval_index[interval_rown,interval_coln]<-c(interval_index$purealt_counter[interval_rown]+1) | |
505 } | |
506 } | |
507 #if chromosome with pure RO, count number of snps with each bin (positions rounded up to nearest bin) | |
508 if(dim(chr_purerefdat)[1]>0){ | |
509 for(i in 1:dim(chr_purerefdat)[1]){ | |
510 chr_purerefind<-chr_purerefdat[i,] | |
511 chr_purerefind_upper<-ceiling(chr_purerefind$POS/bin_size)*bin_size | |
512 interval_coln<-NULL;interval_rown<-NULL | |
513 interval_coln<-which(names(interval_index)=="pureref_counter") | |
514 interval_rown<-match(chr_purerefind_upper,interval_index$interval_end) | |
515 interval_index[interval_rown,interval_coln]<-c(interval_index$pureref_counter[interval_rown]+1) | |
516 } | |
517 } | |
518 #if chromosome with hets, count number of snps with each bin (positions rounded up to nearest bin) | |
519 if(dim(chr_hetdat)[1]>0){ | |
520 for(i in 1:dim(chr_hetdat)[1]){ | |
521 chr_hetind<-chr_hetdat[i,] | |
522 chr_hetind_upper<-ceiling(chr_hetind$POS/bin_size)*bin_size | |
523 interval_coln<-NULL;interval_rown<-NULL | |
524 interval_coln<-which(names(interval_index)=="het_counter") | |
525 interval_rown<-match(chr_hetind_upper,interval_index$interval_end) | |
526 interval_index[interval_rown,interval_coln]<-c(interval_index$het_counter[interval_rown]+1) | |
527 } | |
528 } | |
529 #irrespective of standardized x-axis, mean should be calculated from actual interval range of chromosome | |
530 chr_mean<-sum(interval_index$purealt_counter)/(interval_frame$INTERVAL[interval_frame$CHROM==chromind]*1000000/bin_size) | |
531 interval_index$chr_mean<-chr_mean | |
532 meanSNP_dat<-rbind(meanSNP_dat,data.frame(chromind,chr_mean)) | |
533 #normalization treatment for if SNPs are AO=0, AO=total SNPs, or AO and RO in bin | |
534 for(i in 1:dim(interval_index)[1]){ | |
535 chr_intind<-interval_index[i,] | |
536 #hom definition based on specified upper and lower thresholds | |
537 if(chr_intind$purealt_counter<=threshold_lower){ | |
538 interval_coln<-NULL | |
539 interval_coln<-which(names(interval_index)=="normed_freq") | |
540 interval_index[i,interval_coln]=0 | |
541 } else if (chr_intind$purealt_counter==chr_intind$snp_counter){ | |
542 interval_coln<-NULL | |
543 interval_coln<-which(names(interval_index)=="normed_freq") | |
544 interval_index[i,interval_coln]=(chr_intind$purealt_counter)^2/chr_mean | |
545 } else { | |
546 interval_coln<-NULL | |
547 interval_coln<-which(names(interval_index)=="normed_freq") | |
548 interval_index[i,interval_coln]=chr_mean*(chr_intind$purealt_counter)^2/(chr_intind$snp_counter-chr_intind$purealt_counter) | |
549 } | |
550 } | |
551 location_index<-rbind(location_index,interval_index) | |
552 } | |
553 | |
554 for(chromind in interval_frame$CHROM){ | |
555 interval_index<-location_index[location_index$chromind==chromind,] | |
556 #assign 0 values to avoid empty datatable error | |
557 if(dim(interval_index)[1]==0){ | |
558 interval_index[1,]<-rep(0,dim(interval_index)[2]) | |
559 } | |
560 } | |
561 #set up x_axis | |
562 if(xaxis_standard==TRUE){ | |
563 #for standardized x-axis (max x-axis chromosome length) | |
564 bpupper_xaxis<-max(interval_frame$INTERVAL) | |
565 bpupper_xval<-bpupper_xaxis*interval_unit | |
566 } else if(xaxis_standard==FALSE){ | |
567 bpupper_xaxis<-intervalind | |
568 bpupper_xval<-intervalind*interval_unit | |
569 } | |
570 #set up y_axis range for barplots | |
571 if(bfreq_norm==TRUE){ | |
572 bp_yaxis<-5*ceiling(max(location_index$normed_freq)/5) | |
573 #assign non-0 value to yaxis to avoid error | |
574 if(bp_yaxis==0){ | |
575 bp_yaxis<-10 | |
576 } | |
577 # } | |
578 if(xaxis_standard==TRUE){ | |
579 bplot<-barplot(interval_index$normed_freq,space=0,ylim=c(0,bp_yaxis),main=paste("Chr",chromind," Variant Only",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Normalised Frequency') | |
580 }else if(xaxis_standard==FALSE){ | |
581 bplot<-barplot(interval_index$normed_freq,space=0,xlim=c(0,bpupper_xaxis),ylim=c(0,bp_yaxis),main=paste("Chr",chromind," Variant Only",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Normalised Frequency') | |
582 } | |
583 }else if(bfreq_norm==FALSE){ | |
584 bp_yaxis<-5*ceiling(max(location_index$purealt_counter)/5) | |
585 #assign non-0 value to yaxis to avoid error | |
586 if(bp_yaxis==0){ | |
587 bp_yaxis<-10 | |
588 } | |
589 # } | |
590 if(xaxis_standard==TRUE){ | |
591 bplot<-barplot(interval_index$purealt_counter,space=0,ylim=c(0,bp_yaxis),main=paste("Chr",chromind," Variant Only",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Frequency') | |
592 }else if(xaxis_standard==FALSE){ | |
593 bplot<-barplot(interval_index$purealt_counter,space=0,xlim=c(0,bpupper_xaxis),ylim=c(0,bp_yaxis),main=paste("Chr",chromind," Variant Only",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Frequency') | |
594 } | |
595 | |
596 bp_xaxis1<-as.numeric(bplot) | |
597 bp_xaxis2<-c(bp_xaxis1,tail(bp_xaxis1,1)+bp_xaxis1[2]-bp_xaxis1[1]) | |
598 bp_xaxis<-bp_xaxis2-bp_xaxis1[1] | |
599 | |
600 axis(1,at=bp_xaxis,labels=seq(0,bpupper_xaxis,by=c(bin_size/1000000))) | |
601 } | |
602 dev.off() | |
603 | |
604 |