| 2 | 1 #!/usr/bin/env Rscript | 
|  | 2 # rm(list=ls()) | 
|  | 3 | 
|  | 4 #Rscript my_VDM_tool.R --inf "nor22.vcf" --itype "C.elegans" --qual 200 --allr "AB" --snp TRUE --lsp 0.4 --pcol "black" --lcol "red" --xstand TRUE --bsize 1000000 --freqthr 0.0-1.0 --bnorm FALSE --exclf "FALSE" --exclcol "green" --outn "test-output.txt" --pdfn "test-plot.pdf" | 
|  | 5 | 
|  | 6 library("getopt") | 
|  | 7 #input from trailing line arguments | 
|  | 8 args <- commandArgs(trailingOnly = TRUE) | 
|  | 9 #read the options from input commandArgs | 
|  | 10 option_specification = matrix(c( | 
|  | 11   'inf','in',1,'character', | 
|  | 12   'itype','i',2,'character', | 
|  | 13   'qual','q',2,'double', | 
|  | 14   'allr','a',2,'character', | 
|  | 15   'snp','s',2,'logical', | 
|  | 16   'lsp','l',2,'double', | 
|  | 17   'pcol','pc',2,'character', | 
|  | 18   'lcol','lc',2,'character', | 
|  | 19   'xstand','x',2,'logical', | 
|  | 20   'bsize','b',2,'integer', | 
|  | 21   'freqthr','ft',2,'character', | 
|  | 22   'bnorm','n',2,'logical', | 
|  | 23   'exclf','ef',2,'character', | 
|  | 24   'exclcol','ec',2,'character', | 
|  | 25   'outn','o',2,'character', | 
|  | 26   'pdfn','p',2,'character' | 
|  | 27 ), byrow=TRUE, ncol=4) | 
|  | 28 options = getopt(option_specification) | 
|  | 29 | 
|  | 30 myfunction<-function(inf,itype,qual,allr,snp,lsp,pcol,lcol,xstand,bsize,freqthr,bnorm,exclf,exclcol,outn,pdfn){ | 
|  | 31   #input file | 
|  | 32   filename<-inf | 
|  | 33   #choose species for chrom config | 
|  | 34   interval_type<-itype | 
|  | 35   #quality filter | 
|  | 36   read_qual<-as.numeric(qual) | 
|  | 37   #allele ratio from AB or AO/(AO+RO) | 
|  | 38   allele_ratio<-allr | 
|  | 39   #snp only or include all variant types | 
|  | 40   snp_only<-snp | 
|  | 41   #loess span | 
|  | 42   loess_span<-as.numeric(lsp) | 
|  | 43   #scatterplot point colour | 
|  | 44   plot_color<-pcol | 
|  | 45   #loess curve colour | 
|  | 46   loess_color<-lcol | 
|  | 47   #uniform y-axis scaling | 
|  | 48   xaxis_standard<-xstand | 
|  | 49   #binsize | 
|  | 50   bin_size<-as.numeric(bsize) | 
|  | 51 | 
|  | 52   #allele ratio greater or equal to this value considered hom ALT | 
|  | 53   threshold_upper<-as.numeric(gsub(".*-","",freqthr)) | 
|  | 54   #allele ratio less than or equal to this value considered hom REF | 
|  | 55   threshold_lower<-as.numeric(gsub("-.*","",freqthr)) | 
|  | 56 | 
|  | 57   #normalize frequency barplot | 
|  | 58   bfreq_norm<-bnorm | 
|  | 59   #filenames for variant subtraction | 
|  | 60   exclusion_list<-exclf | 
|  | 61   #pre-subtraction loess curve colour | 
|  | 62   excl_loess_color<-exclcol | 
|  | 63   #filename for ouput table | 
|  | 64   vcfoutput_filename<-outn | 
|  | 65   #filename for pdf | 
|  | 66   pdf_filename<-pdfn | 
|  | 67 | 
|  | 68   #transparency for selected colour (to see plot points underneath) | 
|  | 69   plot_color<-rgb(c(col2rgb(plot_color)[1]),c(col2rgb(plot_color)[2]),c(col2rgb(plot_color)[3]),max=255,alpha=150) | 
|  | 70   loess_color<-rgb(c(col2rgb(loess_color)[1]),c(col2rgb(loess_color)[2]),c(col2rgb(loess_color)[3]),max=255,alpha=150) | 
|  | 71   excl_loess_color<-rgb(c(col2rgb(excl_loess_color)[1]),c(col2rgb(excl_loess_color)[2]),c(col2rgb(excl_loess_color)[3]),max=255,alpha=150) | 
|  | 72 | 
|  | 73   ###FIXED PARAMETERS | 
|  | 74   #chromosome intervals in Mb rather than custom | 
|  | 75   interval_unit<-1000000 | 
|  | 76   #linkage scatter plot yaxis max value=1 | 
|  | 77   sp_yaxis<-1 | 
|  | 78 | 
|  | 79   #only use variants with allele ratio in this range for subtraction (dependent on AB/ratio option) | 
|  | 80   excl_rthreshold_upper<-1 | 
|  | 81   excl_rthreshold_lower<-0 | 
|  | 82   #filename for parsed original vcf | 
|  | 83   vcfparsed_filename<-"parsed.txt" | 
|  | 84 | 
|  | 85   fastmode<-"TRUE" | 
|  | 86 | 
|  | 87 ###################### | 
|  | 88 ###READ IN VCF FILE | 
|  | 89 #extract column names | 
|  | 90 vcf_readin<-readLines(filename) | 
|  | 91 #find header line, i.e. last line to begin with # | 
|  | 92 for(l in 1:length(vcf_readin)){ | 
|  | 93   vcf_readinl<-vcf_readin[l] | 
|  | 94   if(substr(vcf_readinl,1,1)=="#"){next} | 
|  | 95   else if(substr(vcf_readinl,1,1)!="#"){rowline<-l-1;break} | 
|  | 96 } | 
|  | 97 vcf_header<-vcf_readin[rowline] | 
|  | 98 #e.g. CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\trgSM" | 
|  | 99 vcf_header<-gsub("#","",vcf_header) | 
|  | 100 vcf_colnames<-unlist(strsplit(vcf_header,"\t")) | 
|  | 101 | 
|  | 102 #extract data (hashed vcf header skipped with read.table) | 
|  | 103 vcf_rtable<-read.table(filename,sep="\t",stringsAsFactors=FALSE) | 
|  | 104 names(vcf_rtable)<-vcf_colnames | 
|  | 105 | 
|  | 106 if(fastmode=="TRUE"){ | 
|  | 107   #to speed up runtime- quality filter here ny skipping parsing of full vcf (parsed output file incomplete) | 
|  | 108   vcf_rtable<-subset(vcf_rtable,QUAL>=read_qual) | 
|  | 109 } | 
|  | 110 | 
|  | 111 | 
|  | 112 ###################### | 
|  | 113 ###PREPARE DATA | 
|  | 114 vcfinfo_dat<-NULL | 
|  | 115 multiallele_counter<-0 | 
|  | 116 diviserror_counter<-0 | 
|  | 117 for(i in c(1:nrow(vcf_rtable))){ | 
|  | 118   vcf_line<-vcf_rtable[i,] | 
|  | 119   #remove chrom or chr prefix from chromosome value | 
|  | 120   if(grepl("chrom",vcf_line$CHROM,ignore.case=TRUE)==TRUE){ | 
|  | 121     vcf_line$CHROM<-gsub("chrom","",vcf_line$CHROM,ignore.case=TRUE) | 
|  | 122   }else if(grepl("chr",vcf_line$CHROM,ignore.case=TRUE)==TRUE){ | 
|  | 123     vcf_line$CHROM<-gsub("chr","",vcf_line$CHROM,ignore.case=TRUE) | 
|  | 124   } | 
|  | 125 #PARSE INFO | 
|  | 126   vcfinfo_split<-strsplit(vcf_line$INFO,split=";") | 
|  | 127   vcfinfo_coln<-gsub("=.*","",unlist(vcfinfo_split)) | 
|  | 128   vcfinfo_cold<-gsub(".*=","",unlist(vcfinfo_split)) | 
|  | 129   vcfinfo_ldat<-data.frame(t(vcfinfo_cold),stringsAsFactors=FALSE) | 
|  | 130   names(vcfinfo_ldat)<-vcfinfo_coln | 
|  | 131 | 
|  | 132   #skip if commas in values to avoid returning errors | 
|  | 133   if(grepl(",",vcfinfo_ldat$AO)==TRUE){ | 
|  | 134     multiallele_counter<-multiallele_counter+1 | 
|  | 135     next | 
|  | 136   } | 
|  | 137   #skip divide by zero errors (under "ratio" setting for ratio calculation) | 
|  | 138   if(as.numeric(vcfinfo_ldat$AO)+as.numeric(vcfinfo_ldat$RO)=="0"){ | 
|  | 139     diviserror_counter<-diviserror_counter+1 | 
|  | 140     next | 
|  | 141   } | 
|  | 142 | 
|  | 143   #specific accounting for nonstandard categories | 
|  | 144   #LOF columns only present for loss-of-function variants + assign NA values to all other variants | 
|  | 145   if(("LOF" %in% names(vcfinfo_ldat))==TRUE){ | 
|  | 146     LOF<-vcfinfo_ldat$LOF | 
|  | 147     vcfinfo_ldat<-vcfinfo_ldat[,!names(vcfinfo_ldat) %in% "LOF"] | 
|  | 148     vcfinfo_ldat<-cbind(vcfinfo_ldat,LOF) | 
|  | 149   }else{ | 
|  | 150     LOF<-"NA" | 
|  | 151     vcfinfo_ldat<-cbind(vcfinfo_ldat,LOF) | 
|  | 152   } | 
|  | 153   #NMD columns only present for nonsense-mediated-decay variants + assign NA values to all other variants | 
|  | 154   if(("NMD" %in% names(vcfinfo_ldat))==TRUE){ | 
|  | 155     NMD<-vcfinfo_ldat$NMD | 
|  | 156     vcfinfo_ldat<-vcfinfo_ldat[,!names(vcfinfo_ldat) %in% "NMD"] | 
|  | 157     vcfinfo_ldat<-cbind(vcfinfo_ldat,NMD) | 
|  | 158   }else{ | 
|  | 159     NMD<-"NA" | 
|  | 160     vcfinfo_ldat<-cbind(vcfinfo_ldat,NMD) | 
|  | 161   } | 
|  | 162 #PARSE ANNOTATION | 
|  | 163   ann_rparsed<-unlist(strsplit(vcfinfo_ldat$ANN[1],split="\\|"))[1:20] | 
|  | 164   ann_rparsed[ann_rparsed==""]<-"novalue" | 
|  | 165   ann_parsed<-data.frame(t(ann_rparsed),stringsAsFactors=FALSE) | 
|  | 166   names(ann_parsed)<-paste("ANN",c(1:dim(ann_parsed)[2]),sep="") | 
|  | 167   #remove duplicate redundant INFO column (fully parsed) | 
|  | 168   vcf_line<-vcf_line[,names(vcf_line)!="INFO"] | 
|  | 169 | 
|  | 170   #dataset keeping parsed annotation (partial parsed-for relevant) | 
|  | 171   vcfinfo_ldat<-vcfinfo_ldat[,names(vcfinfo_ldat)!="ANN"] | 
|  | 172   #append copy of original data to parsed data | 
|  | 173   vcfinfo_lldat<-cbind(vcf_line,vcfinfo_ldat,ann_parsed) | 
|  | 174   vcfinfo_dat<-rbind(vcfinfo_dat,vcfinfo_lldat) | 
|  | 175 | 
|  | 176 } | 
|  | 177 | 
|  | 178 print(paste("rows with multiple alleles skipped: ",multiallele_counter,sep="")) | 
|  | 179 # print(paste("rows with AO+RO=0 (not multiple alleles) skipped: ",diviserror_counter,sep="")) | 
|  | 180 | 
|  | 181 #ENSURE CORRECT DATATYPES | 
|  | 182 #convert dataframe columns of factor type to character type | 
|  | 183 vcfinfo_dat<-data.frame(lapply(vcfinfo_dat,as.character),stringsAsFactors=FALSE) | 
|  | 184 #convert to numeric if column is numeric and not string | 
|  | 185 for(n in c(1:dim(vcfinfo_dat)[2])){ | 
|  | 186   #suppress warnings when columns with strings encountered (not converted) | 
|  | 187   suppressWarnings( | 
|  | 188     colnum_index<-!is.na(as.numeric(vcfinfo_dat[,n])) | 
|  | 189   ) | 
|  | 190   if(all(colnum_index)==TRUE){ | 
|  | 191     vcfinfo_dat[,n]<-as.numeric(vcfinfo_dat[,n]) | 
|  | 192   } | 
|  | 193 } | 
|  | 194 | 
|  | 195 | 
|  | 196 ###################### | 
|  | 197 #RATIO CALCULATION | 
|  | 198 #ratio calculation from AO and RO | 
|  | 199 RATIO<-c(vcfinfo_dat$AO/(vcfinfo_dat$AO+vcfinfo_dat$RO)) | 
|  | 200 #add adj_AB for AB=0->AO=1 conversion | 
|  | 201 adj_AB<-replace(vcfinfo_dat$AB,vcfinfo_dat$AB==0,1) | 
|  | 202 vcfinfo_dat<-cbind(vcfinfo_dat,RATIO,adj_AB) | 
|  | 203 vcfinfo_dat<-vcfinfo_dat[with(vcfinfo_dat,order(CHROM,POS)),] | 
|  | 204 | 
|  | 205 vcfinfo_pdat<-vcfinfo_dat | 
|  | 206 vcfinfo_dat<-subset(vcfinfo_dat,QUAL>=read_qual) | 
|  | 207 | 
|  | 208 #CONSIDER ONLY SNP VARIANTS | 
|  | 209 if(snp_only==TRUE){ | 
|  | 210   vcfinfo_dat<-subset(vcfinfo_dat,CIGAR=="1X") | 
|  | 211 } | 
|  | 212 | 
|  | 213 ###################### | 
|  | 214 #SUBTRACT VARIANTS FROM EXCLUSION LIST | 
|  | 215 if(exclusion_list!="FALSE"){ | 
|  | 216   #keep copy of pre-subtraction data for later plotting | 
|  | 217   vcfinfo_origdat<-vcfinfo_dat | 
|  | 218 | 
|  | 219   #identifiers for exclusion list based on CHROM/POS/REF/ALT | 
|  | 220   index1<-paste(vcfinfo_dat$CHROM,vcfinfo_dat$POS,sep="_") | 
|  | 221   index2<-paste(vcfinfo_dat$CHROM,vcfinfo_dat$POS,vcfinfo_dat$REF,sep="_") | 
|  | 222   index3<-paste(vcfinfo_dat$CHROM,vcfinfo_dat$POS,vcfinfo_dat$REF,vcfinfo_dat$ALT,sep="_") | 
|  | 223   vcfinfo_dat<-cbind(vcfinfo_dat,index1,index2,index3) | 
|  | 224 | 
|  | 225   print(paste("before subtraction: ",nrow(vcfinfo_dat),sep="")) | 
|  | 226   #loop and subtract through exclusion lists (if multiple files) | 
|  | 227   for(exclusion_ind in exclusion_list){ | 
|  | 228     exclin<-read.table(exclusion_ind,header=TRUE) | 
|  | 229 | 
|  | 230   #THRESHOLD FILTER ON EXCLUSION LIST VARIANTS | 
|  | 231     if(allele_ratio=="AB"){ | 
|  | 232       exclin<-subset(exclin,adj_AB>=excl_rthreshold_lower) | 
|  | 233       exclin<-subset(exclin,adj_AB<=excl_rthreshold_upper) | 
|  | 234     } | 
|  | 235     if(allele_ratio=="ratio"){ | 
|  | 236       exclin<-subset(exclin,ratio>=excl_rthreshold_lower) | 
|  | 237       exclin<-subset(exclin,ratio>=excl_rthreshold_upper) | 
|  | 238     } | 
|  | 239 | 
|  | 240     #identifiers for vcf data based on CHROM/POS/REF/ALT | 
|  | 241     index1<-paste(exclin$CHROM,exclin$POS,sep="_") | 
|  | 242     index2<-paste(exclin$CHROM,exclin$POS,exclin$REF,sep="_") | 
|  | 243     index3<-paste(exclin$CHROM,exclin$POS,exclin$REF,exclin$ALT,sep="_") | 
|  | 244     exclin<-cbind(exclin,index1,index2,index3) | 
|  | 245     #exclude based on CHROM+POS+REF+ALT | 
|  | 246     vcfinfo_dat<-subset(vcfinfo_dat,!(index3 %in% exclin$index3)) | 
|  | 247   } | 
|  | 248   print(paste("after subtraction: ",nrow(vcfinfo_dat),sep="")) | 
|  | 249 } | 
|  | 250 | 
|  | 251 | 
|  | 252 ###################### | 
|  | 253 #WRITE TO OUTPUT | 
|  | 254 #select relevant columns 2 variant type; 4 gene; !5 wormbase ID; 8 type change; 10 nucleotide change; 11 amino acid change; 16 warning message | 
|  | 255 vcfinfo_simp<-subset(vcfinfo_dat,select=c("CHROM","POS","QUAL","DP","REF","ALT","AB","AO","RO","RATIO","adj_AB","ANN2","ANN4","ANN8","ANN10","ANN11","ANN16")) | 
|  | 256 names(vcfinfo_simp)<-c("CHROM","POS","QUAL","DP","REF","ALT","AB","AO","RO","RATIO","adj_AB","VARTYPE","GENE","TYPE","NTCHANGE","PRCHANGE","WARNINGS") | 
|  | 257 vcfsimp_dat<-vcfinfo_simp[with(vcfinfo_simp,order(CHROM,POS)),] | 
|  | 258 | 
|  | 259 #write table with quality filtered variants for VDM plotting and relevant columns | 
|  | 260 try( | 
|  | 261   write.table(vcfsimp_dat,vcfoutput_filename,sep="\t",quote=FALSE,row.names=FALSE) | 
|  | 262   ,silent=TRUE) | 
|  | 263 # #write table with all unfiltered variants and all columns including parsed INFO | 
|  | 264 # if(fastmode!="TRUE"){} | 
|  | 265 # try( | 
|  | 266 #   write.table(vcfinfo_pdat,vcfparsed_filename,sep="\t",quote=FALSE,row.names=FALSE) | 
|  | 267 #   ,silent=TRUE) | 
|  | 268 # } | 
|  | 269 | 
|  | 270 ###################### | 
|  | 271 ###CHROMOSOME (INTERVAL) ARRANGEMENT | 
|  | 272 #define chromosome and chromosome size in Mb | 
|  | 273 if(interval_type == 'C.elegans'){ | 
|  | 274   chrom_n<-c('I','II','III','IV','V','X') | 
|  | 275   chrom_mb<-c(16,16,14,18,21,18) | 
|  | 276   interval_frame<-data.frame(chrom_n,chrom_mb) | 
|  | 277 } else if(interval_type == 'Zebrafish'){ | 
|  | 278   chrom_n<-c('1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25') | 
|  | 279   chrom_mb<-c(61,61,64,63,76,60,78,57,59,47,47,51,55,54,48,59,54,50,51,56,45,43,47,44,39) | 
|  | 280   interval_frame<-data.frame(chrom_n,chrom_mb) | 
|  | 281 } else if(interval_type == 'Brachypodium'){ | 
|  | 282   chrom_n<-c('1','2','3','4','5') | 
|  | 283   chrom_mb<-c(75,60,60,50,30) | 
|  | 284   interval_frame<-data.frame(chrom_n,chrom_mb) | 
|  | 285 } else if(interval_type == 'Arabidopsis'){ | 
|  | 286   chrom_n<-c('1','2','3','4','5') | 
|  | 287   chrom_mb<-c(31, 20,24,19,27 ) | 
|  | 288   interval_frame<-data.frame(chrom_n,chrom_mb) | 
|  | 289 } else{ | 
|  | 290   #user interval file- no headers, with chromosome in column 1 (format CHR# or CHROM#) and size in Mb (rounded up) in column 2 | 
|  | 291   user_interval_type<-read.table(interval_type) | 
|  | 292   if(grepl("chrom",user_interval_type[1,1],ignore.case=TRUE)==TRUE){ | 
|  | 293     user_interval_type[,1]<-gsub("chrom","",user_interval_type[,1],ignore.case=TRUE) | 
|  | 294   }else if(grep("chr",user_interval_type[1,1],ignore.case=TRUE)==TRUE){ | 
|  | 295     user_interval_type[,1]<-gsub("chr","",user_interval_type[,1],ignore.case=TRUE) | 
|  | 296   } | 
|  | 297   chrom_n<-user_interval_type[,1] | 
|  | 298   chrom_mb<-user_interval_type[,2] | 
|  | 299   interval_frame<-data.frame(chrom_n,chrom_mb) | 
|  | 300 } | 
|  | 301 names(interval_frame)<-c("CHROM","INTERVAL") | 
|  | 302 | 
|  | 303 | 
|  | 304 ###################### | 
|  | 305 ###PLOTTING | 
|  | 306 #VDM SCATTER PLOT | 
|  | 307 #save to pdf | 
|  | 308 pdf(file=pdf_filename,width=9,height=8) | 
|  | 309 #par(mfrow=c(2,3)) | 
|  | 310 for(chromind in interval_frame$CHROM){ | 
|  | 311   #subset by data by chromosome for plotting | 
|  | 312   intervalind<-interval_frame$INTERVAL[interval_frame$CHROM==chromind] | 
|  | 313   chr_dat<-subset(vcfsimp_dat,CHROM==chromind,silent=TRUE) | 
|  | 314 | 
|  | 315   #same subsetting by chromosome for pre-subtraction data | 
|  | 316   if(exclusion_list!="FALSE"){ | 
|  | 317     chr_origdat<-subset(vcfinfo_origdat,CHROM==chromind,silent=TRUE) | 
|  | 318   } | 
|  | 319 | 
|  | 320   #define x-axis upper limit | 
|  | 321   if(xaxis_standard==TRUE){ | 
|  | 322     #for standardized x-axis (max x-axis chromosome length) | 
|  | 323     scupper_xaxis<-max(interval_frame$INTERVAL) | 
|  | 324     scupper_xval<-scupper_xaxis*interval_unit | 
|  | 325   } else if(xaxis_standard==FALSE){ | 
|  | 326     scupper_xaxis<-intervalind | 
|  | 327     scupper_xval<-intervalind*interval_unit | 
|  | 328   } | 
|  | 329 | 
|  | 330   if(allele_ratio=="AB"){ | 
|  | 331     plot(chr_dat$POS,chr_dat$adj_AB,cex=0.60,xlim=c(0,scupper_xval),ylim=c(0,sp_yaxis),main=paste("Chr",chromind," Variant Discovery Mapping",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Ratio of Variant Reads/Total Reads [AB]',pch=10, col=plot_color,xaxt='n') | 
|  | 332     try(lines(loess.smooth(chr_dat$POS,chr_dat$adj_AB,span=loess_span),lwd=5,col=loess_color)) | 
|  | 333     #plot loess curve for data without subtraction of exclusion variants | 
|  | 334     if(exclusion_list!="FALSE"){ | 
|  | 335       try(lines(loess.smooth(chr_origdat$POS,chr_origdat$adj_AB,span=loess_span),lty="longdash",lwd=4,col=excl_loess_color)) | 
|  | 336     } | 
|  | 337 | 
|  | 338     axis(1,at=seq(0,scupper_xval,by=interval_unit),labels=c(0:scupper_xaxis)) | 
|  | 339     abline(h=seq(0,sp_yaxis,by=0.1),v=c(1:scupper_xaxis)*interval_unit,col="gray") | 
|  | 340   } else if(allele_ratio=="ratio"){ | 
|  | 341     plot(chr_dat$POS,chr_dat$RATIO,cex=0.60,xlim=c(0,scupper_xval),ylim=c(0,sp_yaxis),main=paste("Chr",chromind," Variant Discovery Mapping",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Ratio of Variant Reads/Total Reads [ratio]',pch=10, col=plot_color,xaxt='n') | 
|  | 342     try(lines(loess.smooth(chr_dat$POS,chr_dat$RATIO,span=loess_span),lwd=5,col=loess_color)) | 
|  | 343     #plot loess curve for data without subtraction of exclusion variants | 
|  | 344     if(exclusion_list!="FALSE"){ | 
|  | 345       try(lines(loess.smooth(chr_origdat$POS,chr_origdat$adj_AB,span=loess_span),lty="longdash",lwd=4,col=excl_loess_color)) | 
|  | 346     } | 
|  | 347     axis(1,at=seq(0,scupper_xval,by=interval_unit),labels=c(0:scupper_xaxis)) | 
|  | 348     abline(h=seq(0,sp_yaxis,by=0.1),v=c(1:scupper_xaxis)*interval_unit,col="gray") | 
|  | 349   } | 
|  | 350 } | 
|  | 351 | 
|  | 352 | 
|  | 353 ###################### | 
|  | 354 #graph barplots | 
|  | 355 location_index<-NULL | 
|  | 356 meanSNP_dat<-NULL | 
|  | 357 # prepare table of counts and calculations | 
|  | 358 for(chromind in interval_frame$CHROM){ | 
|  | 359   #for standardized x-axis | 
|  | 360   if(xaxis_standard==TRUE){ | 
|  | 361     intervalind<-max(interval_frame$INTERVAL)*1000000/bin_size | 
|  | 362   } else if(xaxis_standard==FALSE){ | 
|  | 363     intervalind<-interval_frame$INTERVAL[interval_frame$CHROM==chromind]*1000000/bin_size | 
|  | 364   } | 
|  | 365   #start intervals with **1 and end with **0 | 
|  | 366   interval_begin<-c(((0:(intervalind-1))*bin_size)+1) | 
|  | 367   interval_end<-c((1:intervalind)*bin_size) | 
|  | 368 | 
|  | 369   #define x-axis upper limit | 
|  | 370   if(xaxis_standard==TRUE){ | 
|  | 371     upper_xaxis<-max(interval_frame$INTERVAL) | 
|  | 372   } else if(xaxis_standard==FALSE){ | 
|  | 373     upper_xaxis<-interval_frame$INTERVAL[interval_frame$CHROM==chromind] | 
|  | 374   } | 
|  | 375   #prepare columns | 
|  | 376   snp_counter<-0 | 
|  | 377   purealt_counter<-0 | 
|  | 378   pureref_counter<-0 | 
|  | 379   het_counter<-0 | 
|  | 380   chr_mean<-0 | 
|  | 381   normed_freq<-0 | 
|  | 382   ratio<-0 | 
|  | 383 | 
|  | 384   interval_index<-data.frame(chromind,interval_begin,interval_end,snp_counter,purealt_counter,pureref_counter,het_counter,chr_mean,normed_freq) | 
|  | 385   chr_dat<-subset(vcfinfo_dat,CHROM==chromind) | 
|  | 386   #ratio calculation | 
|  | 387   ratio<-chr_dat$AO/(chr_dat$AO+chr_dat$RO) | 
|  | 388 | 
|  | 389   #if counter based on adj_AB or ratio | 
|  | 390   if(allele_ratio=="ratio"){ | 
|  | 391     chr_purealtdat<-subset(chr_dat,ratio>=threshold_upper) | 
|  | 392     chr_purerefdat<-subset(chr_dat,ratio<=threshold_lower) | 
|  | 393     chr_hetdat<-subset(chr_dat,ratio>threshold_lower & ratio<threshold_upper) | 
|  | 394   } else if(allele_ratio=="AB"){ | 
|  | 395     chr_purealtdat<-subset(chr_dat,adj_AB>=threshold_upper) | 
|  | 396     chr_purerefdat<-subset(chr_dat,adj_AB<=threshold_lower) | 
|  | 397     chr_hetdat<-subset(chr_dat,adj_AB>threshold_lower & adj_AB<threshold_upper) | 
|  | 398   } | 
|  | 399   #if chromosome with data, count number of snps within each bin (positions rounded up to nearest bin), else skip to next chromosome | 
|  | 400   if(dim(chr_dat)[1]>0){ | 
|  | 401     for(i in 1:dim(chr_dat)[1]){ | 
|  | 402       chr_datind<-chr_dat[i,] | 
|  | 403       #round up to nearest bin-size interval | 
|  | 404       chr_datind_upper<-ceiling(chr_datind$POS/bin_size)*bin_size | 
|  | 405       interval_coln<-NULL;interval_rown<-NULL | 
|  | 406       #identify row and and counter column to increment | 
|  | 407       interval_coln<-which(names(interval_index)=="snp_counter") | 
|  | 408       interval_rown<-match(chr_datind_upper,interval_index$interval_end) | 
|  | 409       interval_index[interval_rown,interval_coln]<-c(interval_index$snp_counter[interval_rown]+1) | 
|  | 410     } | 
|  | 411   }else{ | 
|  | 412     next | 
|  | 413   } | 
|  | 414   #if chromosome with pure AO, count number of snps with each bin (positions rounded up to nearest bin) | 
|  | 415   if(dim(chr_purealtdat)[1]>0){ | 
|  | 416     for(i in 1:dim(chr_purealtdat)[1]){ | 
|  | 417       chr_purealtind<-chr_purealtdat[i,] | 
|  | 418       chr_purealtind_upper<-ceiling(chr_purealtind$POS/bin_size)*bin_size | 
|  | 419       interval_coln<-NULL;interval_rown<-NULL | 
|  | 420       interval_coln<-which(names(interval_index)=="purealt_counter") | 
|  | 421       interval_rown<-match(chr_purealtind_upper,interval_index$interval_end) | 
|  | 422       interval_index[interval_rown,interval_coln]<-c(interval_index$purealt_counter[interval_rown]+1) | 
|  | 423     } | 
|  | 424   } | 
|  | 425   #if chromosome with pure RO, count number of snps with each bin (positions rounded up to nearest bin) | 
|  | 426   if(dim(chr_purerefdat)[1]>0){ | 
|  | 427     for(i in 1:dim(chr_purerefdat)[1]){ | 
|  | 428       chr_purerefind<-chr_purerefdat[i,] | 
|  | 429       chr_purerefind_upper<-ceiling(chr_purerefind$POS/bin_size)*bin_size | 
|  | 430       interval_coln<-NULL;interval_rown<-NULL | 
|  | 431       interval_coln<-which(names(interval_index)=="pureref_counter") | 
|  | 432       interval_rown<-match(chr_purerefind_upper,interval_index$interval_end) | 
|  | 433       interval_index[interval_rown,interval_coln]<-c(interval_index$pureref_counter[interval_rown]+1) | 
|  | 434     } | 
|  | 435   } | 
|  | 436   #if chromosome with hets, count number of snps with each bin (positions rounded up to nearest bin) | 
|  | 437   if(dim(chr_hetdat)[1]>0){ | 
|  | 438     for(i in 1:dim(chr_hetdat)[1]){ | 
|  | 439       chr_hetind<-chr_hetdat[i,] | 
|  | 440       chr_hetind_upper<-ceiling(chr_hetind$POS/bin_size)*bin_size | 
|  | 441       interval_coln<-NULL;interval_rown<-NULL | 
|  | 442       interval_coln<-which(names(interval_index)=="het_counter") | 
|  | 443       interval_rown<-match(chr_hetind_upper,interval_index$interval_end) | 
|  | 444       interval_index[interval_rown,interval_coln]<-c(interval_index$het_counter[interval_rown]+1) | 
|  | 445     } | 
|  | 446   } | 
|  | 447   #irrespective of standardized x-axis, mean should be calculated from actual interval range of chromosome | 
|  | 448   chr_mean<-sum(interval_index$purealt_counter)/(interval_frame$INTERVAL[interval_frame$CHROM==chromind]*1000000/bin_size) | 
|  | 449   interval_index$chr_mean<-chr_mean | 
|  | 450   meanSNP_dat<-rbind(meanSNP_dat,data.frame(chromind,chr_mean)) | 
|  | 451   #normalization treatment for if SNPs are AO=0, AO=total SNPs, or AO and RO in bin | 
|  | 452   for(i in 1:dim(interval_index)[1]){ | 
|  | 453     chr_intind<-interval_index[i,] | 
|  | 454     #hom definition based on specified upper and lower thresholds | 
|  | 455     if(chr_intind$purealt_counter<=threshold_lower){ | 
|  | 456       interval_coln<-NULL | 
|  | 457       interval_coln<-which(names(interval_index)=="normed_freq") | 
|  | 458       interval_index[i,interval_coln]=0 | 
|  | 459     } else if (chr_intind$purealt_counter==chr_intind$snp_counter){ | 
|  | 460       interval_coln<-NULL | 
|  | 461       interval_coln<-which(names(interval_index)=="normed_freq") | 
|  | 462       interval_index[i,interval_coln]=(chr_intind$purealt_counter)^2/chr_mean | 
|  | 463     } else { | 
|  | 464       interval_coln<-NULL | 
|  | 465       interval_coln<-which(names(interval_index)=="normed_freq") | 
|  | 466       interval_index[i,interval_coln]=chr_mean*(chr_intind$purealt_counter)^2/(chr_intind$snp_counter-chr_intind$purealt_counter) | 
|  | 467     } | 
|  | 468   } | 
|  | 469   location_index<-rbind(location_index,interval_index) | 
|  | 470 } | 
|  | 471 | 
|  | 472 for(chromind in interval_frame$CHROM){ | 
|  | 473 	interval_index<-location_index[location_index$chromind==chromind,] | 
|  | 474 	#assign 0 values to avoid empty datatable error | 
|  | 475 	if(dim(interval_index)[1]==0){ | 
|  | 476 		interval_index[1,]<-rep(0,dim(interval_index)[2]) | 
|  | 477 	} | 
|  | 478 	#set up x_axis | 
|  | 479 	if(xaxis_standard==TRUE){ | 
|  | 480 		#for standardized x-axis (max x-axis chromosome length) | 
|  | 481 		bpupper_xaxis<-max(interval_frame$INTERVAL) | 
|  | 482 		bpupper_xval<-bpupper_xaxis*interval_unit | 
|  | 483 	}else if(xaxis_standard==FALSE){ | 
|  | 484 		bpupper_xaxis<-intervalind | 
|  | 485 		bpupper_xval<-intervalind*interval_unit | 
|  | 486 	} | 
|  | 487 	#set up y_axis range for barplots | 
|  | 488 	if(bfreq_norm==TRUE){ | 
|  | 489 		bp_yaxis<-5*ceiling(max(location_index$normed_freq)/5) | 
|  | 490 	#assign non-0 value to yaxis to avoid error | 
|  | 491 		if(bp_yaxis==0){ | 
|  | 492 			bp_yaxis<-10 | 
|  | 493 		} | 
|  | 494 		if(xaxis_standard==TRUE){ | 
|  | 495 			bplot<-barplot(interval_index$normed_freq,space=0,ylim=c(0,bp_yaxis),main=paste("Chr",chromind," Variant Only",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Normalised Frequency') | 
|  | 496 		}else if(xaxis_standard==FALSE){ | 
|  | 497 			bplot<-barplot(interval_index$normed_freq,space=0,xlim=c(0,bpupper_xaxis),ylim=c(0,bp_yaxis),main=paste("Chr",chromind," Variant Only",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Normalised Frequency') | 
|  | 498 		} | 
|  | 499 | 
|  | 500 	}else if(bfreq_norm==FALSE){ | 
|  | 501 		bp_yaxis<-5*ceiling(max(location_index$purealt_counter)/5) | 
|  | 502 		#assign non-0 value to yaxis to avoid error | 
|  | 503 		if(bp_yaxis==0){ | 
|  | 504 		bp_yaxis<-10 | 
|  | 505 		} | 
|  | 506 		if(xaxis_standard==TRUE){ | 
|  | 507 			bplot<-barplot(interval_index$purealt_counter,space=0,ylim=c(0,bp_yaxis),main=paste("Chr",chromind," Variant Only",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Frequency') | 
|  | 508 		}else if(xaxis_standard==FALSE){ | 
|  | 509 			bplot<-barplot(interval_index$purealt_counter,space=0,xlim=c(0,bpupper_xaxis),ylim=c(0,bp_yaxis),main=paste("Chr",chromind," Variant Only",sep=""),xlab="Position along Chromosome (in Mb)",ylab='Frequency') | 
|  | 510 		} | 
|  | 511 	} | 
|  | 512 	bp_xaxis1<-as.numeric(bplot) | 
|  | 513 	bp_xaxis2<-c(bp_xaxis1,tail(bp_xaxis1,1)+bp_xaxis1[2]-bp_xaxis1[1]) | 
|  | 514 	bp_xaxis<-bp_xaxis2-bp_xaxis1[1] | 
|  | 515 | 
|  | 516 	axis(1,at=bp_xaxis,labels=seq(0,bpupper_xaxis,by=c(bin_size/1000000))) | 
|  | 517 	} | 
|  | 518 	dev.off() | 
|  | 519 } | 
|  | 520 | 
|  | 521 myfunction(inf=options$inf,itype=options$itype,qual=options$qual,allr=options$allr,snp=options$snp,lsp=options$lsp,pcol=options$pcol,lcol=options$lcol, | 
|  | 522            xstand=options$xstand,bsize=options$bsize,bnorm=options$bnorm,freqthr=options$freqthr,exclf=options$exclf,exclcol=options$exclcol,outn=options$outn,pdfn=options$pdfn) | 
|  | 523 |