Mercurial > repos > xuef > vdm_plot
comparison my_VDM_tool.R @ 3:403a83d4b888 draft
Uploaded
| author | xuef |
|---|---|
| date | Fri, 06 Nov 2020 16:39:21 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 2:dd74836c77ad | 3:403a83d4b888 |
|---|---|
| 1 #!/usr/bin/env Rscript | |
| 2 # rm(list=ls()) | |
| 3 # myfunction(inf="nor22.vcf",itype="C.elegans",qual=400,thrup=1.0,thrlow=0.0,allr="AB",snp=TRUE,lsp=0.4,pcol="black",lcol="red",xstand=TRUE,bsize=1000000,bnorm=FALSE,exclf=NULL,exclthr=0.0,exclcol="green",parn="parsed.txt",outn="output.txt",pdfn="plot.pdf") | |
| 4 | |
| 5 #Rscript my_VDM_tool.R --inf "nor22.vcf" --itype "C.elegans" --qual 400 --thrup 1.0 --thrlow 0.0 --allr "AB" --snp TRUE --lsp 0.4 --pcol "black" --lcol "red" --xstand TRUE --bsize 1000000 --bnorm FALSE --exclf NULL --exclthr 0.0 --exclcol "green" --parn "parsed.txt" --outn "output.txt" --pdfn "plot.pdf" | |
| 6 | |
| 7 library("getopt") | |
| 8 #input from trailing line arguments | |
| 9 args <- commandArgs(trailingOnly = TRUE) | |
| 10 #read the options from input commandArgs | |
| 11 option_specification = matrix(c( | |
| 12 'inf','i01',1,'character', | |
| 13 'itype','i02',2,'character', | |
| 14 'qual','i03',2,'double', | |
| 15 'thrup','i04',2,'double', | |
| 16 'thrlow','i05',2,'double', | |
| 17 'allr','i06',2,'character', | |
| 18 'snp','i07',2,'logical', | |
| 19 'lsp','i08',2,'double', | |
| 20 'pcol','i09',2,'character', | |
| 21 'lcol','i10',2,'character', | |
| 22 'xstand','i11',2,'logical', | |
| 23 'bsize','i12',2,'integer', | |
| 24 'bnorm','i13',2,'logical', | |
| 25 'exclf','i14',2,'character', | |
| 26 'exclthr','i15',2,'double', | |
| 27 'exclcol','i16',2,'character', | |
| 28 'parn','o1',2,'character', | |
| 29 'outn','o2',2,'character', | |
| 30 'pdfn','o3',2,'character' | |
| 31 ), byrow=TRUE, ncol=4) | |
| 32 | |
| 33 # #parse options # bnorm=FALSE,exclf=NULL,exclthr=0.0,exclcol="green",parn="parsed.txt",outn="output.txt",pdfn="plot.pdf") | |
| 34 options = getopt(option_specification) | |
| 35 # | |
| 36 # #FOR DEBUGGING | |
| 37 # options<-NULL | |
| 38 # ###INPUT FILE | |
| 39 # setwd("D:/Dropbox/_galaxy/") | |
| 40 # options$inf<-"nor22.vcf" | |
| 41 # ###BASE OPTIONS | |
| 42 # #interval type | |
| 43 # options$itype<-"C.elegans" | |
| 44 # #quality filter | |
| 45 # options$qual<-200 | |
| 46 # #for scaling 0-1 = upper threshold for what is considered homozygous | |
| 47 # options$thrup<-1 | |
| 48 # #for scaling 0-1 = lower threshold for what is considered homozygous | |
| 49 # options$thrlow<-0 | |
| 50 # #type of allelic ratio (AB/ratio) | |
| 51 # options$allr<-"AB" | |
| 52 # #include complex variants | |
| 53 # options$snp<-FALSE | |
| 54 # ###ADDITIONAL VARIANT EXCLUSION OPTIONS | |
| 55 # #files with variants to exclude | |
| 56 # options$exclf<-NULL | |
| 57 # options$exclf<-c("nor22chunk1.txt","nor22chunk5.txt") | |
| 58 # #for variants to exclude, bottom threshold for which to be used, i.e. 0=ALL, 1=HOM only, 0.8=near HOM) | |
| 59 # options$exclthr<-0 | |
| 60 # #additional colour option for pre-subtraction line | |
| 61 # options$exclcol<-"green" | |
| 62 # ###PLOT OPTIONS | |
| 63 # #loess span | |
| 64 # options$lsp<-0.4 | |
| 65 # #point colour | |
| 66 # options$pcol<-"black" | |
| 67 # #loess plot colour | |
| 68 # options$lcol<-"red" | |
| 69 # #standardize x-axis interval (e.g. 1Mb interval) | |
| 70 # options$xstand<-TRUE | |
| 71 # #bin size for barplot | |
| 72 # options$bsize<-1000000 | |
| 73 # #normalization for barplot | |
| 74 # options$bnorm<-FALSE | |
| 75 # ###OUTPUT OPTIONS | |
| 76 # #custom files names (may not work for Galaxy) | |
| 77 # options$outn<-paste(gsub("vcf","",options$inf),"_output_q",options$qual,"-",paste(options$exclf,sep="",collapse=""),".txt",sep="") | |
| 78 # options$parn<-paste(gsub("vcf","",options$inf),"_parsed_q",options$qual,"-",paste(options$exclf,sep="",collapse=""),".txt",sep="") | |
| 79 # options$pdfn<-paste(gsub("vcf","",options$inf),"_plot_q",options$qual,"-",paste(options$exclf,sep="",collapse=""),".pdf",sep="") | |
| 80 # #fixed file names (will work in Galaxy) | |
| 81 # options$outn<-"vcf_output.txt" | |
| 82 # options$parn<-"vcf_parsedinput.txt" | |
| 83 # options$pdfn<-"vdm_mapping_plot.pdf" | |
| 84 | |
| 85 | |
| 86 myfunction<-function(inf,itype,qual,thrup,thrlow,allr,snp,lsp,pcol,lcol,xstand,bsize,bnorm,exclf,exclthr,exclcol,parn,outn,pdfn){ | |
| 87 | |
| 88 #PARAMETERS | |
| 89 # filename<-options$inf | |
| 90 # interval_type<-options$itype | |
| 91 # read_qual<-options$qual | |
| 92 # threshold_upper<-options$thrup | |
| 93 # threshold_lower<-options$thrlow | |
| 94 # allele_ratio<-options$allr | |
| 95 # snp_only<-options$snp | |
| 96 # loess_span<-options$lsp | |
| 97 # plot_color<-options$pcol | |
| 98 # loess_color<-options$lcol | |
| 99 # #transparency for selected colour (to see plot points underneath) | |
| 100 # xaxis_standard<-options$xstand | |
| 101 # bin_size<-options$bsize | |
| 102 # bfreq_norm<-options$bnorm | |
| 103 # exclusion_list<-options$exclf | |
| 104 # excl_threshold<-options$exclthr | |
| 105 # excl_loess_color<-options$exclcol | |
| 106 # vcfoutput_filename<-options$outn | |
| 107 # vcfparsed_filename<-options$parn | |
| 108 # pdf_filename<-options$pdfn | |
| 109 | |
| 110 # plot_color<-rgb(c(col2rgb(plot_color)[1]),c(col2rgb(plot_color)[2]),c(col2rgb(plot_color)[3]),max=255,alpha=150) | |
| 111 # loess_color<-rgb(c(col2rgb(loess_color)[1]),c(col2rgb(loess_color)[2]),c(col2rgb(loess_color)[3]),max=255,alpha=150) | |
| 112 # excl_loess_color<-rgb(c(col2rgb(excl_loess_color)[1]),c(col2rgb(excl_loess_color)[2]),c(col2rgb(excl_loess_color)[3]),max=255,alpha=150) | |
| 113 | |
| 114 filename<-inf | |
| 115 interval_type<-itype | |
| 116 read_qual<-as.numeric(qual) | |
| 117 threshold_upper<-as.numeric(thrup) | |
| 118 threshold_lower<-as.numeric(thrlow) | |
| 119 allele_ratio<-allr | |
| 120 snp_only<-snp | |
| 121 loess_span<-as.numeric(lsp) | |
| 122 plot_color<-pcol | |
| 123 loess_color<-lcol | |
| 124 xaxis_standard<-xstand | |
| 125 bin_size<-as.numeric(bsize) | |
| 126 bfreq_norm<-bnorm | |
| 127 exclusion_list<-exclf | |
| 128 excl_threshold<-as.numeric(exclthr) | |
| 129 excl_loess_color<-exclcol | |
| 130 vcfoutput_filename<-outn | |
| 131 vcfparsed_filename<-parn | |
| 132 pdf_filename<-pdfn | |
| 133 | |
| 134 #transparency for selected colour (to see plot points underneath) | |
| 135 plot_color<-rgb(c(col2rgb(plot_color)[1]),c(col2rgb(plot_color)[2]),c(col2rgb(plot_color)[3]),max=255,alpha=150) | |
| 136 loess_color<-rgb(c(col2rgb(loess_color)[1]),c(col2rgb(loess_color)[2]),c(col2rgb(loess_color)[3]),max=255,alpha=150) | |
| 137 excl_loess_color<-rgb(c(col2rgb(excl_loess_color)[1]),c(col2rgb(excl_loess_color)[2]),c(col2rgb(excl_loess_color)[3]),max=255,alpha=150) | |
| 138 | |
| 139 ###FIXED PARAMETERS | |
| 140 #linkage scatter plot yaxis max value=1 | |
| 141 sp_yaxis<-1 | |
| 142 #chromosome intervals in Mb rather than custom | |
| 143 interval_unit<-1000000 | |
| 144 | |
| 145 | |
| 146 ###################### | |
| 147 ###READ IN VCF FILE | |
| 148 #extract column names | |
| 149 vcf_readin<-readLines(filename) | |
| 150 #find header line, i.e. last line to begin with # | |
| 151 for(l in 1:length(vcf_readin)){ | |
| 152 vcf_readinl<-vcf_readin[l] | |
| 153 if(substr(vcf_readinl,1,1)=="#"){next} | |
| 154 else if(substr(vcf_readinl,1,1)!="#"){rowline<-l-1;break} | |
| 155 } | |
| 156 vcf_header<-vcf_readin[rowline] | |
| 157 #e.g. CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\trgSM" | |
| 158 vcf_header<-gsub("#","",vcf_header) | |
| 159 vcf_colnames<-unlist(strsplit(vcf_header,"\t")) | |
| 160 | |
| 161 #extract data (hashed vcf header skipped with read.table) | |
| 162 vcf_rtable<-read.table(filename,sep="\t",stringsAsFactors=FALSE) | |
| 163 names(vcf_rtable)<-vcf_colnames | |
| 164 | |
| 165 ###################### | |
| 166 ###PREPARE DATA | |
| 167 | |
| 168 vcfinfo_dat<-NULL | |
| 169 vcfinfo_pdat<-NULL | |
| 170 multiallele_counter<-0 | |
| 171 diviserror_counter<-0 | |
| 172 for(i in c(1:nrow(vcf_rtable))){ | |
| 173 vcf_line<-vcf_rtable[i,] | |
| 174 #to speed up runtime- quality filter here | |
| 175 if(vcf_line$QUAL>=read_qual){ | |
| 176 #remove chrom or chr prefix from chromosome value | |
| 177 if(grepl("chrom",vcf_line$CHROM,ignore.case=TRUE)==TRUE){ | |
| 178 vcf_line$CHROM<-gsub("chrom","",vcf_line$CHROM,ignore.case=TRUE) | |
| 179 }else if(grepl("chr",vcf_line$CHROM,ignore.case=TRUE)==TRUE){ | |
| 180 vcf_line$CHROM<-gsub("chr","",vcf_line$CHROM,ignore.case=TRUE) | |
| 181 } | |
| 182 #PARSE INFO | |
| 183 vcfinfo_split<-strsplit(vcf_line$INFO,split=";") | |
| 184 vcfinfo_coln<-gsub("=.*","",unlist(vcfinfo_split)) | |
| 185 vcfinfo_cold<-gsub(".*=","",unlist(vcfinfo_split)) | |
| 186 vcfinfo_ldat<-data.frame(t(vcfinfo_cold),stringsAsFactors=FALSE) | |
| 187 names(vcfinfo_ldat)<-vcfinfo_coln | |
| 188 | |
| 189 #skip if commas in values to avoid returning errors | |
| 190 if(grepl(",",vcfinfo_ldat$AO)==TRUE){ | |
| 191 multiallele_counter<-multiallele_counter+1 | |
| 192 next | |
| 193 } | |
| 194 #skip divide by zero errors (under "ratio" setting for ratio calculation) | |
| 195 if(as.numeric(vcfinfo_ldat$AO)+as.numeric(vcfinfo_ldat$RO)=="0"){ | |
| 196 diviserror_counter<-diviserror_counter+1 | |
| 197 next | |
| 198 } | |
| 199 | |
| 200 #specific accounting for nonstandard categories | |
| 201 #LOF columns only present for loss-of-function variants + assign NA values to all other variants | |
| 202 if(("LOF" %in% names(vcfinfo_ldat))==TRUE){ | |
| 203 LOF<-vcfinfo_ldat$LOF | |
| 204 vcfinfo_ldat<-vcfinfo_ldat[,!names(vcfinfo_ldat) %in% "LOF"] | |
| 205 vcfinfo_ldat<-cbind(vcfinfo_ldat,LOF) | |
| 206 }else{ | |
| 207 LOF<-"NA" | |
| 208 vcfinfo_ldat<-cbind(vcfinfo_ldat,LOF) | |
| 209 } | |
| 210 #NMD columns only present for nonsense-mediated-decay variants + assign NA values to all other variants | |
| 211 if(("NMD" %in% names(vcfinfo_ldat))==TRUE){ | |
| 212 NMD<-vcfinfo_ldat$NMD | |
| 213 vcfinfo_ldat<-vcfinfo_ldat[,!names(vcfinfo_ldat) %in% "NMD"] | |
| 214 vcfinfo_ldat<-cbind(vcfinfo_ldat,NMD) | |
| 215 }else{ | |
| 216 NMD<-"NA" | |
| 217 vcfinfo_ldat<-cbind(vcfinfo_ldat,NMD) | |
| 218 } | |
| 219 | |
| 220 #general accounting for nonstandard categories | |
| 221 | |
| 222 | |
| 223 #PARSE ANNOTATION | |
| 224 ann_rparsed<-unlist(strsplit(vcfinfo_ldat$ANN[1],split="\\|"))[1:20] | |
| 225 ann_rparsed[ann_rparsed==""]<-"novalue" | |
| 226 ann_parsed<-data.frame(t(ann_rparsed),stringsAsFactors=FALSE) | |
| 227 names(ann_parsed)<-paste("ANN",c(1:dim(ann_parsed)[2]),sep="") | |
| 228 #remove duplicate redundant INFO column (fully parsed) | |
| 229 vcf_line<-vcf_line[,names(vcf_line)!="INFO"] | |
| 230 | |
| 231 #dataset keeping unparsed annotation (full) | |
| 232 vcfinfo_pldat<-cbind(vcf_line,vcfinfo_ldat) | |
| 233 vcfinfo_pdat<-rbind(vcfinfo_pdat,vcfinfo_pldat) | |
| 234 | |
| 235 #dataset keeping parsed annotation (partial parsed-for relevant) | |
| 236 vcfinfo_ldat<-vcfinfo_ldat[,names(vcfinfo_ldat)!="ANN"] | |
| 237 #append copy of original data to parsed data | |
| 238 vcfinfo_lldat<-cbind(vcf_line,vcfinfo_ldat,ann_parsed) | |
| 239 vcfinfo_dat<-rbind(vcfinfo_dat,vcfinfo_lldat) | |
| 240 } | |
| 241 } | |
| 242 print(paste("rows with multiple alleles skipped: ",multiallele_counter,sep="")) | |
| 243 # print(paste("rows with AO+RO=0 (not multiple alleles) skipped: ",diviserror_counter,sep="")) | |
| 244 | |
| 245 #ENSURE CORRECT DATATYPES | |
| 246 #convert dataframe columns of factor type to character type | |
| 247 vcfinfo_dat<-data.frame(lapply(vcfinfo_dat,as.character),stringsAsFactors=FALSE) | |
| 248 #convert to numeric if column is numeric and not string | |
| 249 for(n in c(1:dim(vcfinfo_dat)[2])){ | |
| 250 #suppress warnings when columns with strings encountered (not converted) | |
| 251 suppressWarnings( | |
| 252 colnum_index<-!is.na(as.numeric(vcfinfo_dat[,n])) | |
| 253 ) | |
| 254 if(all(colnum_index)==TRUE){ | |
| 255 vcfinfo_dat[,n]<-as.numeric(vcfinfo_dat[,n]) | |
| 256 } | |
| 257 } | |
| 258 | |
| 259 ###################### | |
| 260 #RATIO CALCULATION | |
| 261 #ratio calculation from AO and RO | |
| 262 RATIO<-c(vcfinfo_dat$AO/(vcfinfo_dat$AO+vcfinfo_dat$RO)) | |
| 263 #add adj_AB for AB=0->AO=1 conversion | |
| 264 adj_AB<-replace(vcfinfo_dat$AB,vcfinfo_dat$AB==0,1) | |
| 265 vcfinfo_dat<-cbind(vcfinfo_dat,RATIO,adj_AB) | |
| 266 vcfinfo_dat<-vcfinfo_dat[with(vcfinfo_dat,order(CHROM,POS)),] | |
| 267 | |
| 268 #CONSIDER ONLY SNP VARIANTS | |
| 269 if(snp_only==TRUE){ | |
| 270 vcfinfo_dat<-subset(vcfinfo_dat,CIGAR=="1X") | |
| 271 } | |
| 272 | |
| 273 ###################### | |
| 274 #SUBTRACT VARIANTS FROM EXCLUSION LIST | |
| 275 # if(length(exclusion_list)>0){ | |
| 276 # if(exclusion_list!="FALSE"){ | |
| 277 if(exclusion_list!="FALSE"){ | |
| 278 #keep copy of pre-subtraction data for later plotting | |
| 279 vcfinfo_origdat<-vcfinfo_dat | |
| 280 | |
| 281 #identifiers for exclusion list based on CHROM/POS/REF/ALT | |
| 282 index1<-paste(vcfinfo_dat$CHROM,vcfinfo_dat$POS,sep="_") | |
| 283 index2<-paste(vcfinfo_dat$CHROM,vcfinfo_dat$POS,vcfinfo_dat$REF,sep="_") | |
| 284 index3<-paste(vcfinfo_dat$CHROM,vcfinfo_dat$POS,vcfinfo_dat$REF,vcfinfo_dat$ALT,sep="_") | |
| 285 vcfinfo_dat<-cbind(vcfinfo_dat,index1,index2,index3) | |
| 286 | |
| 287 print(paste("before subtraction: ",nrow(vcfinfo_dat),sep="")) | |
| 288 #loop and subtract through exclusion lists (if multiple files) | |
| 289 for(exclusion_ind in exclusion_list){ | |
| 290 exclin<-read.table(exclusion_ind,header=TRUE) | |
| 291 | |
| 292 #THRESHOLD FILTER ON EXCLUSION LIST VARIANTS | |
| 293 if(allele_ratio=="AB"){ | |
| 294 exclin<-subset(exclin,adj_AB>=excl_threshold) | |
| 295 } | |
| 296 if(allele_ratio=="ratio"){ | |
| 297 exclin<-subset(exclin,ratio>=excl_threshold) | |
| 298 } | |
| 299 | |
| 300 #identifiers for vcf data based on CHROM/POS/REF/ALT | |
| 301 index1<-paste(exclin$CHROM,exclin$POS,sep="_") | |
| 302 index2<-paste(exclin$CHROM,exclin$POS,exclin$REF,sep="_") | |
| 303 index3<-paste(exclin$CHROM,exclin$POS,exclin$REF,exclin$ALT,sep="_") | |
| 304 exclin<-cbind(exclin,index1,index2,index3) | |
| 305 #exclude based on CHROM+POS+REF+ALT | |
| 306 vcfinfo_dat<-subset(vcfinfo_dat,!(index3 %in% exclin$index3)) | |
| 307 } | |
| 308 print(paste("after subtraction: ",nrow(vcfinfo_dat),sep="")) | |
| 309 } | |
| 310 | |
| 311 ###################### | |
| 312 #WRITE TO OUTPUT | |
| 313 #select relevant columns 2 variant type; 4 gene; !5 wormbase ID; 8 type change; 10 nucleotide change; 11 amino acid change; 16 warning message | |
| 314 vcfinfo_simp<-subset(vcfinfo_dat,select=c("CHROM","POS","QUAL","DP","REF","ALT","AB","AO","RO","RATIO","adj_AB","ANN2","ANN4","ANN8","ANN10","ANN11","ANN16")) | |
| 315 names(vcfinfo_simp)<-c("CHROM","POS","QUAL","DP","REF","ALT","AB","AO","RO","RATIO","adj_AB","VARTYPE","GENE","TYPE","NTCHANGE","PRCHANGE","WARNINGS") | |
| 316 vcfsimp_dat<-vcfinfo_simp[with(vcfinfo_simp,order(CHROM,POS)),] | |
| 317 | |
| 318 #write table with quality filtered variants for VDM plotting and relevant columns | |
| 319 try( | |
| 320 write.table(vcfsimp_dat,vcfoutput_filename,sep="\t",quote=FALSE,row.names=FALSE) | |
| 321 ,silent=TRUE) | |
| 322 #write table with all unfiltered variants and all columns including parsed INFO | |
| 323 try( | |
| 324 write.table(vcfinfo_pdat,vcfparsed_filename,sep="\t",quote=FALSE,row.names=FALSE) | |
| 325 ,silent=TRUE) | |
| 326 | |
| 327 | |
| 328 ###################### | |
| 329 ###CHROMOSOME (INTERVAL) ARRANGEMENT | |
| 330 #define chromosome and chromosome size in Mb | |
| 331 if(interval_type == 'C.elegans'){ | |
| 332 chrom_n<-c('I','II','III','IV','V','X') | |
| 333 chrom_mb<-c(16,16,14,18,21,18) | |
| 334 interval_frame<-data.frame(chrom_n,chrom_mb) | |
| 335 } else if(interval_type == 'Zebrafish'){ | |
| 336 chrom_n<-c('1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25') | |
| 337 chrom_mb<-c(61,61,64,63,76,60,78,57,59,47,47,51,55,54,48,59,54,50,51,56,45,43,47,44,39) | |
| 338 interval_frame<-data.frame(chrom_n,chrom_mb) | |
| 339 } else if(interval_type == 'Brachypodium'){ | |
| 340 chrom_n<-c('1','2','3','4','5') | |
| 341 chrom_mb<-c(75,60,60,50,30) | |
| 342 interval_frame<-data.frame(chrom_n,chrom_mb) | |
| 343 } else if(interval_type == 'Arabidopsis'){ | |
| 344 chrom_n<-c('1','2','3','4','5') | |
| 345 chrom_mb<-c(31, 20,24,19,27 ) | |
| 346 interval_frame<-data.frame(chrom_n,chrom_mb) | |
| 347 } else{ | |
| 348 #user interval file- no headers, with chromosome in column 1 (format CHR# or CHROM#) and size in Mb (rounded up) in column 2 | |
| 349 # user_interval_type<-read.table(user_interval_file) | |
| 350 user_interval_type<-read.table(interval_type) | |
| 351 if(grepl("chrom",user_interval_type[1,1],ignore.case=TRUE)==TRUE){ | |
| 352 user_interval_type[,1]<-gsub("chrom","",user_interval_type[,1],ignore.case=TRUE) | |
| 353 }else if(grep("chr",user_interval_type[1,1],ignore.case=TRUE)==TRUE){ | |
| 354 user_interval_type[,1]<-gsub("chr","",user_interval_type[,1],ignore.case=TRUE) | |
| 355 } | |
| 356 chrom_n<-user_interval_type[,1] | |
| 357 chrom_mb<-user_interval_type[,2] | |
| 358 interval_frame<-data.frame(chrom_n,chrom_mb) | |
| 359 } | |
| 360 names(interval_frame)<-c("CHROM","INTERVAL") | |
| 361 | |
| 362 | |
| 363 ###################### | |
| 364 ###PLOTTING | |
| 365 #VDM SCATTER PLOT | |
| 366 #save to pdf | |
| 367 pdf(file=pdf_filename,width=9,height=8) | |
| 368 #par(mfrow=c(2,3)) | |
| 369 for(chromind in interval_frame$CHROM){ | |
| 370 #subset by data by chromosome for plotting | |
| 371 intervalind<-interval_frame$INTERVAL[interval_frame$CHROM==chromind] | |
| 372 chr_dat<-subset(vcfsimp_dat,CHROM==chromind,silent=TRUE) | |
| 373 | |
| 374 #same subsetting by chromosome for pre-subtraction data | |
| 375 # if(length(exclusion_list)>0){ | |
| 376 if(exclusion_list!="FALSE"){ | |
| 377 chr_origdat<-subset(vcfinfo_origdat,CHROM==chromind,silent=TRUE) | |
| 378 } | |
| 379 | |
| 380 #define x-axis upper limit | |
| 381 if(xaxis_standard==TRUE){ | |
| 382 #for standardized x-axis (max x-axis chromosome length) | |
| 383 scupper_xaxis<-max(interval_frame$INTERVAL) | |
| 384 scupper_xval<-scupper_xaxis*interval_unit | |
| 385 } else if(xaxis_standard==FALSE){ | |
| 386 scupper_xaxis<-intervalind | |
| 387 scupper_xval<-intervalind*interval_unit | |
| 388 } | |
| 389 | |
| 390 if(allele_ratio=="AB"){ | |
| 391 plot(chr_dat$POS,chr_dat$adj_AB,cex=0.60,xlim=c(0,scupper_xval),ylim=c(0,sp_yaxis),main=paste("Chr",chromind," Variant Discovery Mapping",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Ratio of Variant Reads/Total Reads [AB]',pch=10, col=plot_color,xaxt='n') | |
| 392 try(lines(loess.smooth(chr_dat$POS,chr_dat$adj_AB,span=loess_span),lwd=5,col=loess_color)) | |
| 393 #plot loess curve for data without subtraction of exclusion variants | |
| 394 # if(length(exclusion_list)>0){ | |
| 395 if(exclusion_list!="FALSE"){ | |
| 396 try(lines(loess.smooth(chr_origdat$POS,chr_origdat$adj_AB,span=loess_span),lty="longdash",lwd=4,col=excl_loess_color)) | |
| 397 } | |
| 398 | |
| 399 axis(1,at=seq(0,scupper_xval,by=interval_unit),labels=c(0:scupper_xaxis)) | |
| 400 abline(h=seq(0,sp_yaxis,by=0.1),v=c(1:scupper_xaxis)*interval_unit,col="gray") | |
| 401 } else if(allele_ratio=="ratio"){ | |
| 402 plot(chr_dat$POS,chr_dat$RATIO,cex=0.60,xlim=c(0,scupper_xval),ylim=c(0,sp_yaxis),main=paste("Chr",chromind," Variant Discovery Mapping",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Ratio of Variant Reads/Total Reads [ratio]',pch=10, col=plot_color,xaxt='n') | |
| 403 try(lines(loess.smooth(chr_dat$POS,chr_dat$RATIO,span=loess_span),lwd=5,col=loess_color)) | |
| 404 #plot loess curve for data without subtraction of exclusion variants | |
| 405 # if(length(exclusion_list)>0){ | |
| 406 if(exclusion_list!="FALSE"){ | |
| 407 try(lines(loess.smooth(chr_origdat$POS,chr_origdat$adj_AB,span=loess_span),lty="longdash",lwd=4,col=excl_loess_color)) | |
| 408 } | |
| 409 axis(1,at=seq(0,scupper_xval,by=interval_unit),labels=c(0:scupper_xaxis)) | |
| 410 abline(h=seq(0,sp_yaxis,by=0.1),v=c(1:scupper_xaxis)*interval_unit,col="gray") | |
| 411 } | |
| 412 } | |
| 413 | |
| 414 ###################### | |
| 415 #graph barplots | |
| 416 location_index<-NULL | |
| 417 meanSNP_dat<-NULL | |
| 418 # prepare table of counts and calculations | |
| 419 for(chromind in interval_frame$CHROM){ | |
| 420 #for standardized x-axis | |
| 421 if(xaxis_standard==TRUE){ | |
| 422 intervalind<-max(interval_frame$INTERVAL)*1000000/bin_size | |
| 423 } else if(xaxis_standard==FALSE){ | |
| 424 intervalind<-interval_frame$INTERVAL[interval_frame$CHROM==chromind]*1000000/bin_size | |
| 425 } | |
| 426 #start intervals with **1 and end with **0 | |
| 427 interval_begin<-c(((0:(intervalind-1))*bin_size)+1) | |
| 428 interval_end<-c((1:intervalind)*bin_size) | |
| 429 | |
| 430 #define x-axis upper limit | |
| 431 if(xaxis_standard==TRUE){ | |
| 432 upper_xaxis<-max(interval_frame$INTERVAL) | |
| 433 } else if(xaxis_standard==FALSE){ | |
| 434 upper_xaxis<-interval_frame$INTERVAL[interval_frame$CHROM==chromind] | |
| 435 } | |
| 436 #prepare columns | |
| 437 snp_counter<-0 | |
| 438 purealt_counter<-0 | |
| 439 pureref_counter<-0 | |
| 440 het_counter<-0 | |
| 441 chr_mean<-0 | |
| 442 normed_freq<-0 | |
| 443 ratio<-0 | |
| 444 | |
| 445 interval_index<-data.frame(chromind,interval_begin,interval_end,snp_counter,purealt_counter,pureref_counter,het_counter,chr_mean,normed_freq) | |
| 446 chr_dat<-subset(vcfinfo_dat,CHROM==chromind) | |
| 447 #ratio calculation | |
| 448 ratio<-chr_dat$AO/(chr_dat$AO+chr_dat$RO) | |
| 449 | |
| 450 #if counter based on adj_AB or ratio | |
| 451 if(allele_ratio=="ratio"){ | |
| 452 chr_purealtdat<-subset(chr_dat,ratio>=threshold_upper)#;chr_purealtdat | |
| 453 chr_purerefdat<-subset(chr_dat,ratio<=threshold_lower)#;chr_purerefdat | |
| 454 chr_hetdat<-subset(chr_dat,ratio>threshold_lower & ratio<threshold_upper)#;chr_hetdat | |
| 455 } else if(allele_ratio=="AB"){ | |
| 456 chr_purealtdat<-subset(chr_dat,adj_AB>=threshold_upper)#;chr_purealtdat | |
| 457 chr_purerefdat<-subset(chr_dat,adj_AB<=threshold_lower)#;chr_purerefdat | |
| 458 chr_hetdat<-subset(chr_dat,adj_AB>threshold_lower & adj_AB<threshold_upper)#;chr_hetdat | |
| 459 } | |
| 460 #if chromosome with data, count number of snps within each bin (positions rounded up to nearest bin), else skip to next chromosome | |
| 461 if(dim(chr_dat)[1]>0){ | |
| 462 for(i in 1:dim(chr_dat)[1]){ | |
| 463 chr_datind<-chr_dat[i,] | |
| 464 #round up to nearest bin-size interval | |
| 465 chr_datind_upper<-ceiling(chr_datind$POS/bin_size)*bin_size | |
| 466 interval_coln<-NULL;interval_rown<-NULL | |
| 467 #identify row and and counter column to increment | |
| 468 interval_coln<-which(names(interval_index)=="snp_counter") | |
| 469 interval_rown<-match(chr_datind_upper,interval_index$interval_end) | |
| 470 interval_index[interval_rown,interval_coln]<-c(interval_index$snp_counter[interval_rown]+1) | |
| 471 } | |
| 472 }else{ | |
| 473 next | |
| 474 } | |
| 475 ##if chromosome with pure AO, count number of snps with each bin (positions rounded up to nearest bin) | |
| 476 if(dim(chr_purealtdat)[1]>0){ | |
| 477 for(i in 1:dim(chr_purealtdat)[1]){ | |
| 478 chr_purealtind<-chr_purealtdat[i,] | |
| 479 chr_purealtind_upper<-ceiling(chr_purealtind$POS/bin_size)*bin_size | |
| 480 interval_coln<-NULL;interval_rown<-NULL | |
| 481 interval_coln<-which(names(interval_index)=="purealt_counter") | |
| 482 interval_rown<-match(chr_purealtind_upper,interval_index$interval_end) | |
| 483 interval_index[interval_rown,interval_coln]<-c(interval_index$purealt_counter[interval_rown]+1) | |
| 484 } | |
| 485 } | |
| 486 #if chromosome with pure RO, count number of snps with each bin (positions rounded up to nearest bin) | |
| 487 if(dim(chr_purerefdat)[1]>0){ | |
| 488 for(i in 1:dim(chr_purerefdat)[1]){ | |
| 489 chr_purerefind<-chr_purerefdat[i,] | |
| 490 chr_purerefind_upper<-ceiling(chr_purerefind$POS/bin_size)*bin_size | |
| 491 interval_coln<-NULL;interval_rown<-NULL | |
| 492 interval_coln<-which(names(interval_index)=="pureref_counter") | |
| 493 interval_rown<-match(chr_purerefind_upper,interval_index$interval_end) | |
| 494 interval_index[interval_rown,interval_coln]<-c(interval_index$pureref_counter[interval_rown]+1) | |
| 495 } | |
| 496 } | |
| 497 #if chromosome with hets, count number of snps with each bin (positions rounded up to nearest bin) | |
| 498 if(dim(chr_hetdat)[1]>0){ | |
| 499 for(i in 1:dim(chr_hetdat)[1]){ | |
| 500 chr_hetind<-chr_hetdat[i,] | |
| 501 chr_hetind_upper<-ceiling(chr_hetind$POS/bin_size)*bin_size | |
| 502 interval_coln<-NULL;interval_rown<-NULL | |
| 503 interval_coln<-which(names(interval_index)=="het_counter") | |
| 504 interval_rown<-match(chr_hetind_upper,interval_index$interval_end) | |
| 505 interval_index[interval_rown,interval_coln]<-c(interval_index$het_counter[interval_rown]+1) | |
| 506 } | |
| 507 } | |
| 508 #irrespective of standardized x-axis, mean should be calculated from actual interval range of chromosome | |
| 509 chr_mean<-sum(interval_index$purealt_counter)/(interval_frame$INTERVAL[interval_frame$CHROM==chromind]*1000000/bin_size) | |
| 510 interval_index$chr_mean<-chr_mean | |
| 511 meanSNP_dat<-rbind(meanSNP_dat,data.frame(chromind,chr_mean)) | |
| 512 #normalization treatment for if SNPs are AO=0, AO=total SNPs, or AO and RO in bin | |
| 513 for(i in 1:dim(interval_index)[1]){ | |
| 514 chr_intind<-interval_index[i,] | |
| 515 #hom definition based on specified upper and lower thresholds | |
| 516 if(chr_intind$purealt_counter<=threshold_lower){ | |
| 517 interval_coln<-NULL | |
| 518 interval_coln<-which(names(interval_index)=="normed_freq") | |
| 519 interval_index[i,interval_coln]=0 | |
| 520 } else if (chr_intind$purealt_counter==chr_intind$snp_counter){ | |
| 521 interval_coln<-NULL | |
| 522 interval_coln<-which(names(interval_index)=="normed_freq") | |
| 523 interval_index[i,interval_coln]=(chr_intind$purealt_counter)^2/chr_mean | |
| 524 } else { | |
| 525 interval_coln<-NULL | |
| 526 interval_coln<-which(names(interval_index)=="normed_freq") | |
| 527 interval_index[i,interval_coln]=chr_mean*(chr_intind$purealt_counter)^2/(chr_intind$snp_counter-chr_intind$purealt_counter) | |
| 528 } | |
| 529 } | |
| 530 location_index<-rbind(location_index,interval_index) | |
| 531 } | |
| 532 | |
| 533 for(chromind in interval_frame$CHROM){ | |
| 534 interval_index<-location_index[location_index$chromind==chromind,] | |
| 535 #assign 0 values to avoid empty datatable error | |
| 536 if(dim(interval_index)[1]==0){ | |
| 537 interval_index[1,]<-rep(0,dim(interval_index)[2]) | |
| 538 } | |
| 539 # } | |
| 540 #set up x_axis | |
| 541 if(xaxis_standard==TRUE){ | |
| 542 #for standardized x-axis (max x-axis chromosome length) | |
| 543 bpupper_xaxis<-max(interval_frame$INTERVAL) | |
| 544 bpupper_xval<-bpupper_xaxis*interval_unit | |
| 545 }else if(xaxis_standard==FALSE){ | |
| 546 bpupper_xaxis<-intervalind | |
| 547 bpupper_xval<-intervalind*interval_unit | |
| 548 } | |
| 549 #set up y_axis range for barplots | |
| 550 if(bfreq_norm==TRUE){ | |
| 551 bp_yaxis<-5*ceiling(max(location_index$normed_freq)/5) | |
| 552 #assign non-0 value to yaxis to avoid error | |
| 553 if(bp_yaxis==0){ | |
| 554 bp_yaxis<-10 | |
| 555 } | |
| 556 if(xaxis_standard==TRUE){ | |
| 557 bplot<-barplot(interval_index$normed_freq,space=0,ylim=c(0,bp_yaxis),main=paste("Chr",chromind," Variant Only",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Normalised Frequency') | |
| 558 }else if(xaxis_standard==FALSE){ | |
| 559 bplot<-barplot(interval_index$normed_freq,space=0,xlim=c(0,bpupper_xaxis),ylim=c(0,bp_yaxis),main=paste("Chr",chromind," Variant Only",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Normalised Frequency') | |
| 560 } | |
| 561 | |
| 562 }else if(bfreq_norm==FALSE){ | |
| 563 bp_yaxis<-5*ceiling(max(location_index$purealt_counter)/5) | |
| 564 #assign non-0 value to yaxis to avoid error | |
| 565 if(bp_yaxis==0){ | |
| 566 bp_yaxis<-10 | |
| 567 } | |
| 568 if(xaxis_standard==TRUE){ | |
| 569 bplot<-barplot(interval_index$purealt_counter,space=0,ylim=c(0,bp_yaxis),main=paste("Chr",chromind," Variant Only",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Frequency') | |
| 570 }else if(xaxis_standard==FALSE){ | |
| 571 bplot<-barplot(interval_index$purealt_counter,space=0,xlim=c(0,bpupper_xaxis),ylim=c(0,bp_yaxis),main=paste("Chr",chromind," Variant Only",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Frequency') | |
| 572 } | |
| 573 } | |
| 574 bp_xaxis1<-as.numeric(bplot) | |
| 575 bp_xaxis2<-c(bp_xaxis1,tail(bp_xaxis1,1)+bp_xaxis1[2]-bp_xaxis1[1]) | |
| 576 bp_xaxis<-bp_xaxis2-bp_xaxis1[1] | |
| 577 | |
| 578 axis(1,at=bp_xaxis,labels=seq(0,bpupper_xaxis,by=c(bin_size/1000000))) | |
| 579 | |
| 580 } | |
| 581 dev.off() | |
| 582 } | |
| 583 | |
| 584 | |
| 585 # myfunction(options$inf,options$itype,options$qual,options$thrup,options$thrlow,options$allr,options$snp,options$lsp,options$pcol,options$lcol,options$xstand, | |
| 586 # options$bsize,options$bnorm,options$exclf,options$exclthr,options$exclcol,options$parn,options$outn,options$pdfn) | |
| 587 | |
| 588 | |
| 589 myfunction(inf=options$inf,itype=options$itype,qual=options$qual,thrup=options$thrup,thrlow=options$thrlow, | |
| 590 allr=options$allr,snp=options$snp,lsp=options$lsp,pcol=options$pcol,lcol=options$lcol,xstand=options$xstand,bsize=options$bsize, | |
| 591 bnorm=options$bnorm,exclf=options$exclf,exclthr=options$exclthr,exclcol=options$exclcol,parn=options$parn,outn=options$outn,pdfn=options$pdfn) | |
| 592 |
