# HG changeset patch # User fcaramia # Date 1354579176 18000 # Node ID 4c5c3994bfcbffc56d3c398636aee3a7c14f2fd2 # Parent c4416c3af2178ca2f67dfe9a8d63ad91dc7f793d Deleted selected files diff -r c4416c3af217 -r 4c5c3994bfcb bismark_bed_files.loc.sample --- a/bismark_bed_files.loc.sample Mon Dec 03 18:27:31 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ -# Bed file made permanent for experimental reasons, user can also input file in galaxy - -#bismark.bed.agilent.human Agilent Human Methyl-Seq /galaxy/bed_files/Agilent_Human_Methyl-Seq/S03770311_Covered.bed - diff -r c4416c3af217 -r 4c5c3994bfcb bismark_indices.loc.sample --- a/bismark_indices.loc.sample Mon Dec 03 18:27:31 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ - -# Genomes have been coverted using bismark_genome_preparation script. Index points to the directory containing the genome and the methylated genome folder - -#bismark.hg19.GRCh37.62 hg19 hg19 ensembl GRCh37.62 /galaxy/indexes/human/ensembl_GRCh37.62/bowtie2/illumina/ -#bismark.hg19.b37 b37 hg19 1000 genomes b37 /galaxy/indexes/human/g1k_v37/bowtie2/illumina/ -#bismark.mm9.NCBIM37.62 mm9 mm9 ensembl NCBIM37.62 /galaxy/indexes/mouse/ensembl_NCBIM37.62/bowtie2/illumina/ -#bismark.dm3.BDGP5.25.62 dm3 dm3 ensembl BDGP5.25.62 /galaxy/indexes/fruitfly/ensembl_BDGP5.25.62/bowtie2/illumina/ diff -r c4416c3af217 -r 4c5c3994bfcb methylation_analysis/bismark.xml --- a/methylation_analysis/bismark.xml Mon Dec 03 18:27:31 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,182 +0,0 @@ - - : A bisulfite read mapper and methylation caller - - samtools - bowtie2 - bismark - - - - bismark_wrapper.pl - - - "GENOME::${genome.fields.path}" - - - - #if str($option_input.input_option) == "mates": - "MATES::$option_input.file_mate1::$option_input.file_mate2" - #else - "SINGLES::$option_input.file_single" - #end if - - #if str($format_option) == "fasta": - "FORMAT::--fasta" - #else - "FORMAT::--fastq" - #end if - - #if str($non_directional) == "ON": - "DIRECTIONAL::--non_directional" - #end if - - "OUTPUT::$output" - "SUMMARY::$summary" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -| - - -**Reference** - - http://www.bioinformatics.babraham.ac.uk/projects/bismark/ - ------ - -**What it does** - -Bismark takes in FastA or FastQ files and aligns the reads to a specified bisulfite genome. -Sequence reads are transformed into a bisulfite converted forward strand version (C->T conversion) -or into a bisulfite treated reverse strand (G->A conversion of the forward strand). -Each of these reads are then aligned to bisulfite treated forward strand index of a reference genome -(C->T converted) and a bisulfite treated reverse strand index of the genome (G->A conversion of the -forward strand, by doing this alignments will produce the same positions). These 4 instances of -Bowtie (1 or 2) are run in parallel. The sequence file(s) are then read in again sequence by sequence -to pull out the original sequence from the genome and determine if there were any protected C's present or not. - -As of version 0.7.0 Bismark will only run 2 alignment threads for OT and OB in parallel, the 4 strand mode can be -re-enabled by using --non_directional. - -The final output of Bismark is in SAM format by default. But for storage restrictions the output is compressed (BAM). - - ------ - -**Required Parameters** - -:: - - -q/--fastq The query input files (specified as mate1,mate2 or singles are FASTQ - files (usually having extension .fg or .fastq). This is the default. See also - --solexa-quals. - - -f/--fasta The query input files (specified as mate1,mate2 or singles are FASTA - files (usually havin extension .fa, .mfa, .fna or similar). All quality values - are assumed to be 40 on the Phred scale. - - -1 mates1 List of files containing the #1 mates (filename usually includes - "_1"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must - correspond file-for-file and read-for-read with those specified in mates2. - Reads may be a mix of different lengths. Bismark will produce one mapping result - and one report file per paired-end input file pair. - - -2 mates2 List of files containing the #2 mates (filename usually includes - "_2"), e.g. flyA_1.fq,flyB_1.fq). Sequences specified with this option must - correspond file-for-file and read-for-read with those specified in mates1. - Reads may be a mix of different lengths. - - singles List of files containing the reads to be aligned (e.g. - lane1.fq,lane2.fq lane3.fq). Reads may be a mix of different lengths. Bismark will - produce one mapping result and one report file per input file. - - --non_directional The sequencing library was constructed in a non strand-specific manner, alignments to all four - bisulfite strands will be reported. Default: ON. - - (The current Illumina protocol for BS-Seq is directional, in which case the strands complementary - to the original strands are merely theoretical and should not exist in reality. Specifying directional - alignments (which is the default) will only run 2 alignment threads to the original top (OT) - or bottom (OB) strands in parallel and report these alignments. This is the recommended option - for sprand-specific libraries). - ------ - -**Default Parameters** - -:: - - --bowtie2 Uses Bowtie 2 instead of Bowtie 1. Bismark limits Bowtie 2 to only perform end-to-end - alignments, i.e. searches for alignments involving all read characters (also called - untrimmed or unclipped alignments). Bismark assumes that raw sequence data is adapter - and/or quality trimmed where appropriate. Default: on. - - - - - -p NTHREADS Launch NTHREADS parallel search threads (default: 4). Threads will run on separate processors/cores - and synchronize when parsing reads and outputting alignments. Searching for alignments is highly - parallel, and speedup is close to linear. Increasing -p increases Bowtie 2's memory footprint. - E.g. when aligning to a human genome index, increasing -p from 1 to 8 increases the memory footprint - by a few hundred megabytes. This option is only available if bowtie is linked with the pthreads - library (i.e. if BOWTIE_PTHREADS=0 is not specified at build time). In addition, this option will - automatically use the option '--reorder', which guarantees that output SAM records are printed in - an order corresponding to the order of the reads in the original input file, even when -p is set - greater than 1 (Bismark requires the Bowtie 2 output to be this way). Specifying --reorder and - setting -p greater than 1 causes Bowtie 2 to run somewhat slower and use somewhat more memory then - if --reorder were not specified. Has no effect if -p is set to 1, since output order will naturally - correspond to input order in that case. - - - - - - - diff -r c4416c3af217 -r 4c5c3994bfcb methylation_analysis/bismark_wrapper.pl --- a/methylation_analysis/bismark_wrapper.pl Mon Dec 03 18:27:31 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,133 +0,0 @@ -use strict; -use warnings; -use File::Basename; -use Cwd; -use File::Path qw(make_path remove_tree); -die qq( -Bad numbr of inputs - -) if(!@ARGV); - -my @bam_list_entries; - -my $player_options = "--bowtie2 -p 4 "; -my $bam_output; -my $summary; -my $genome; -my $singles = ""; -my $mates1 = ""; -my $mates2 = ""; -my $directional=""; -my $format=""; -my $log_file=""; - - -foreach my $input (@ARGV) -{ - my @tmp = split "::", $input; - if($tmp[0] eq "GENOME") - { - $genome=$tmp[1]; - } - elsif($tmp[0] eq "MATES") - { - $mates1 = $tmp[1]; - $mates2 = $tmp[2]; - } - elsif($tmp[0] eq "SINGLES") - { - $singles = $tmp[1]; - } - elsif($tmp[0] eq "FORMAT") - { - $format = $tmp[1]; - } - elsif($tmp[0] eq "DIRECTIONAL") - { - $directional = $tmp[1]; - } - elsif($tmp[0] eq "SUMMARY") - { - $summary = $tmp[1]; - } - elsif($tmp[0] eq "OUTPUT") - { - $bam_output = $tmp[1]; - } - - else - { - die("Unknown Input: $input\n"); - } -} - - -my $working_dir = cwd()."/BISMARK_OUTPUT"; -make_path($working_dir); - - -#run bismark - -if ($singles eq "") -{ - system ("bismark $player_options $directional $format $genome -o $working_dir -1 $mates1 -2 $mates2 > $summary 2>&1 "); - -} -else -{ - system ("bismark $player_options $directional $format $genome -o $working_dir $singles > $summary 2>&1"); - -} - - -move_files($working_dir); - - -sub move_files -{ - my $name; - my $suffix; - my $path; - my $local_dir = $_[0]; - opendir(DIR, $local_dir); - #print ("Openning: $local_dir\n"); - my @FILES= readdir(DIR); - closedir(DIR); - foreach my $file (@FILES) - { - if ($file eq "." || $file eq "..") - { - #print ("./ or ../ skipped\n"); - } - elsif (-d "$local_dir/$file") - { - #print ("moving to: $local_dir/$file\n"); - move_files("$local_dir/$file"); - } - elsif (-f "$local_dir/$file") - { - ($name,$path,$suffix) = fileparse($file,qr"\.[^.]*$"); - if ($suffix eq ".sam") - { - #converto to BAM and Delete SAM - system ("MergeSamFiles.sh MAX_RECORDS_IN_RAM=2000000 I=$local_dir/$file O=$bam_output VALIDATION_STRINGENCY=LENIENT SO=coordinate USE_THREADING=true CREATE_INDEX=true > /dev/null 2>&1"); - system ("rm -rf $local_dir/$file"); - - } - elsif($suffix eq ".txt") - { - system ("mv $local_dir/$file $summary"); - } - } - else - { - die("Unrecognized file generated: $file\n"); - } - - } -} - - - - - diff -r c4416c3af217 -r 4c5c3994bfcb methylation_analysis/differential_methylation.R --- a/methylation_analysis/differential_methylation.R Mon Dec 03 18:27:31 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,281 +0,0 @@ -#!/usr/bin/Rscript --vanilla -library(getopt) -spec <- matrix(c("help", "h", 0, "logical", "view this help", "segfile1", "s", 1, "character", "seg file 1", "segfile2", "t", 1, "character", "seg file 2", "output", "o", 1, "character", "output file", "fdr", "f", 2, "character", paste("fdr method [", paste(p.adjust.methods, collapse="|"), "]", sep=""), - "reference", "r", 2, "character", "reference to use [b37|hg19|GRCh37|mm9|NCBIM37|mm10|GRCm38|dm3|BDGP5]", "annot", "a", 2, "character", "annotation to add [both|gene|cpg]", "processes", "p", 2, "integer", "number of cluster instances to open [1]"), ncol=5, byrow=T) -opt <- getopt(spec) - -# set default options -if(is.null(opt$fdr)) opt$fdr <- "BH" -if(is.null(opt$reference)) opt$reference <- "hg19" -if(is.null(opt$annot)) opt$annot <- "both" -if(is.null(opt$processes)) opt$processes <- 1 - -# check if any invalid options -if(! opt$annot %in% c("both", "gene", "cpg") || !opt$fdr %in% p.adjust.methods || !opt$reference %in% c("b37", "hg19", "GRCh37", "mm9", "NCBIM37", "mm10", "GRCm38", "dm3", "BDGP5")) { - opt$help <- 1 -} - -# print help file if any incorrect -if(!is.null(opt$help) || is.null(opt$segfile1) || is.null(opt$output) || is.null(opt$output) || is.na(opt$processes)) { - self = commandArgs()[1]; - cat(getopt(spec, usage=T, command="differential_methylation.R")) - q(status=1) -} - -library(snow) -cl <- makeCluster(opt$p, type = "MPI") - -segfile1 <- read.table(opt$segfile1, sep="\t", as.is=T, head=T, quote = "") -segfile2 <- read.table(opt$segfile2, sep="\t", as.is=T, head=T, quote = "") - -rownames(segfile1) <- paste(segfile1[,2], ":", segfile1[,3], "-", segfile1[,4], sep="") -rownames(segfile2) <- paste(segfile2[,2], ":", segfile2[,3], "-", segfile2[,4], sep="") - -common_reg <- intersect(rownames(segfile1), rownames(segfile2)) -prop_test <- function(x, n, fdr) { - library(abind) - ESTIMATE <- x/n - DELTA <- ESTIMATE[,1L] - ESTIMATE[,2L] - YATES <- pmin(abs(DELTA)/rowSums(1/n), 0.5) - p <- rowSums(x)/rowSums(n) - df <- 1 - x <- abind(x, n - x, along=3) - E <- abind(n*p, n*(1-p), along=3) - STATISTIC <- rowSums((abs(x - E) - YATES)^2/E) - STATISTIC[is.na(STATISTIC)] <- 0 - PVAL <- pchisq(STATISTIC, df, lower.tail = FALSE) - return(data.frame(X_Squared = round(STATISTIC, 3), P.value= round(PVAL, 6), P.adjusted = round(p.adjust(PVAL, method=fdr), 6))) -} -results <- prop_test(cbind(segfile1[common_reg, "Methylated"], segfile2[common_reg, "Methylated"]), cbind(segfile1[common_reg, "Total"], segfile2[common_reg, "Total"]), opt$fdr) - -sample1 <- segfile1[1, 1] -sample2 <- segfile2[1, 1] -tab_out <- data.frame(ID = paste(sample1, "vs", sample2, sep="."), segfile1[common_reg, c(2:4)], segfile1[common_reg, c("Methylated", "Total", "FractionMethylated")], - segfile2[common_reg, c("Methylated", "Total", "FractionMethylated")], results, DiffProp = round(segfile1[common_reg, "FractionMethylated"] - segfile2[common_reg, "FractionMethylated"], 6), stringsAsFactors=F) -colnames(tab_out)[c(5:10)] <- c(paste(sample1, "Methylated", sep="."), paste(sample1, "Total", sep="."), paste(sample1, "Proportion", sep="."), paste(sample2, "Methylated", sep="."), - paste(sample2, "Total", sep="."), paste(sample2, "Proportion", sep=".")) - -rm(segfile1) -rm(segfile2) -gc() - -add_annot <- function(tab, annotation, genome) { - # find closest feature - if(genome == "hg19" || genome == "GRCh37" || genome == "b37") { - genome <- "hg19" - dataset <- "hsapiens_gene_ensembl" - biomart <- "ensembl" - host <- "www.biomart.org" - attributes <- c("ensembl_gene_id", "hgnc_symbol", "refseq_mrna", "start_position", "end_position") - } else if(genome == "mm9" || genome == "NCBIM37") { - genome <- "mm9" - dataset <- "mmusculus_gene_ensembl" - biomart <- "ENSEMBL_MART_ENSEMBL" - host <- "may2012.archive.ensembl.org" - attributes <- c("ensembl_gene_id", "mgi_symbol", "refseq_mrna", "start_position", "end_position") - } else if(genome == "mm10" || genome == "GRCm38") { - genome <- "mm10" - dataset <- "mmusculus_gene_ensembl" - biomart <- "ensembl" - host <- "www.biomart.org" - attributes <- c("ensembl_gene_id", "mgi_symbol", "refseq_mrna", "start_position", "end_position") - } else if(genome == "dm3" || genome == "BDGP5") { - genome <- "dm3" - dataset <- "dmelanogaster_gene_ensembl" - biomart <- "ensembl" - host <- "www.biomart.org" - attributes <- c("ensembl_gene_id", "flybasecgid_gene", "refseq_mrna", "start_position", "end_position") - } - - e_to_U = c("GL000191.1" = "1_gl000191_random", "GL000192.1" = "1_gl000192_random", "GL000193.1" = "4_gl000193_random", "GL000194.1" = "4_gl000194_random", - "GL000195.1" = "7_gl000195_random", "GL000196.1" = "8_gl000196_random", "GL000197.1" = "8_gl000197_random", "GL000198.1" = "9_gl000198_random", - "GL000199.1" = "9_gl000199_random", "GL000200.1" = "9_gl000200_random", "GL000201.1" = "9_gl000201_random", "GL000202.1" = "11_gl000202_random", - "GL000203.1" = "17_gl000203_random", "GL000204.1" = "17_gl000204_random", "GL000205.1" = "17_gl000205_random", "GL000206.1" = "17_gl000206_random", - "GL000207.1" = "18_gl000207_random", "GL000208.1" = "19_gl000208_random", "GL000209.1" = "19_gl000209_random", "GL000210.1" = "21_gl000210_random", - "GL000211.1" = "Un_gl000211", "GL000212.1" = "Un_gl000212", "GL000213.1" = "Un_gl000213", "GL000214.1" = "Un_gl000214", - "GL000215.1" = "Un_gl000215", "GL000216.1" = "Un_gl000216", "GL000217.1" = "Un_gl000217", "GL000218.1" = "Un_gl000218", - "GL000219.1" = "Un_gl000219", "GL000220.1" = "Un_gl000220", "GL000221.1" = "Un_gl000221", "GL000222.1" = "Un_gl000222", - "GL000223.1" = "Un_gl000223", "GL000224.1" = "Un_gl000224", "GL000225.1" = "Un_gl000225", "GL000226.1" = "Un_gl000226", - "GL000227.1" = "Un_gl000227", "GL000228.1" = "Un_gl000228", "GL000229.1" = "Un_gl000229", "GL000230.1" = "Un_gl000230", - "GL000231.1" = "Un_gl000231", "GL000232.1" = "Un_gl000232", "GL000233.1" = "Un_gl000233", "GL000234.1" = "Un_gl000234", - "GL000235.1" = "Un_gl000235", "GL000236.1" = "Un_gl000236", "GL000237.1" = "Un_gl000237", "GL000238.1" = "Un_gl000238", - "GL000239.1" = "Un_gl000239", "GL000240.1" = "Un_gl000240", "GL000241.1" = "Un_gl000241", "GL000242.1" = "Un_gl000242", - "GL000243.1" = "Un_gl000243", "GL000244.1" = "Un_gl000244", "GL000245.1" = "Un_gl000245", "GL000246.1" = "Un_gl000246", - "GL000247.1" = "Un_gl000247", "GL000248.1" = "Un_gl000248", "GL000249.1" = "Un_gl000249") - - # function to conver ucsc chroms to ensembl - ensembl <- function(chr, genome) { - # ensembl does not us chr and M is MT - chr <- sub("^chr", "", chr) - if(genome == "dm3" || genome == "BDGP5") { - chr <- sub("^M$", "dmel_mitochondrion_genome", chr) - } else { - chr <- sub("^M$", "MT", chr) - } - chr[which(chr %in% e_to_U)] <- names(e_to_U)[match(chr[which(chr %in% e_to_U)], e_to_U)] - return(chr) - } - - # function to conver ensembl chroms to ucsc - ucsc <- function(chr) { - chr <- sub("^chr", "", chr) - tmp <- which(chr %in% names(c(e_to_U, dmel_mitochondrion_genome = "M", MT = "M"))) - chr[tmp] <- c(e_to_U, dmel_mitochondrion_genome = "M", MT = "M")[chr[tmp]] - paste("chr", chr, sep="") - } - - # function to get the gene annotation by chromosome - get_genes <- function(chr) { - library(biomaRt) - mart <- useMart(biomart=biomart, dataset=dataset, host=host) - tab <- getBM(attributes = attributes, filters = "chromosome_name", values = chr, mart = mart) - if(any(is.na(tab))) for(i in 1:ncol(tab)) tab[is.na(tab[,i]), i] <- "" - mult_ids <- names(which(table(tab$ensembl_gene_id) > 1)) - rem <- NULL - for(i in mult_ids) { - index <- which(tab$ensembl_gene_id == i) - refseq <- which(tab$ensembl_gene_id == i & tab$refseq_mrna != "") - if(length(refseq) > 0) { - rem <- c(rem, setdiff(index, refseq[1])) - } else { - rem <- c(rem, index[-1]) - } - } - if(length(rem) > 0) tab <- tab[-rem,] - colnames(tab)[4:5] <- c("feature_start", "feature_end") - return(tab) - } - - get_cpg <- function(chr) { - library(rtracklayer) - options(stringsAsFactors=F) - session <- browserSession() - genome(session) <- genome - query <- ucscTableQuery(session, "CpG Islands", GRangesForUCSCGenome(genome, chr)) - cpg <- getTable(query) - cpg <- cpg[, c("name", "perCpg", "perGc", "chromStart", "chromEnd")] - colnames(cpg) <- c("cpg", "cpg_perCpg", "cpg_perGc", "cpg_start", "cpg_end") - return(cpg) - } - chr <- tab[1, 2] - - # get the gene info for this chrom - if(annotation == "gene") { - tab <- cbind(tab, ensembl_gene_id = "", id = "", refseq_mrna = "", feature_start = "", feature_end = "", Distance_To_Feature = "", stringsAsFactors=F) - annotData <- get_genes(ensembl(chr, genome)) - colnames(tab)[16] <- attributes[2] - } else { - tab <- cbind(tab, cpg = "", cpg_perCpg = "", cpg_perGc = "", cpg_start = "", cpg_end = "", Distance_To_cpg = "", stringsAsFactors=F) - annotData <- get_cpg(ucsc(chr)) - } - if(nrow(annotData) == 0) return(tab) - - starts <- t(matrix(cbind(-1, as.numeric(annotData[, 4])), ncol=2, byrow=F)) - ends <- t(matrix(cbind(-1, as.numeric(annotData[, 5])), ncol=2, byrow=F)) - - # calculate the distances from the features to the regions - dist_start_start <- matrix(cbind(tab$loc.start, 1), ncol=2, byrow=F) %*% starts - dist_start_end <- matrix(cbind(tab$loc.start, 1), ncol=2, byrow=F) %*% ends - dist_end_start <- matrix(cbind(tab$loc.end, 1), ncol=2, byrow=F) %*% starts - dist_end_end <- matrix(cbind(tab$loc.end, 1), ncol=2, byrow=F) %*% ends - - # determine which regions overlap at least 1 feature - sum_signs <- abs(sign(dist_start_start) + sign(dist_start_end) + sign(dist_end_start) + sign(dist_end_end)) - regions <- which(sum_signs != 4, arr.ind=TRUE) - if(length(regions) > 0) { - overlap <- sort(unique(regions[,1])) - non_overlap <- c(1:nrow(tab))[-overlap] - } else { - overlap <- NULL - non_overlap <- c(1:nrow(tab)) - } - - # reduce to regions with no overlaping feqature - if(length(overlap) > 0) { - dist_start_start <- matrix(dist_start_start[non_overlap,], ncol=ncol(dist_start_start)) - dist_start_end <- matrix(dist_start_end[non_overlap,], ncol=ncol(dist_start_end)) - dist_end_start <- matrix(dist_end_start[non_overlap,], ncol=ncol(dist_end_start)) - dist_end_end <- matrix(dist_end_end[non_overlap,], ncol=ncol(dist_end_end)) - } - - rm(sum_signs) - gc() - - # extract the annot for the regions with overlaping features - if(length(overlap) > 0) { - annot <- sapply(overlap, function(x, y) { - x <- regions[which(regions[,1] == x), 2] - sub("^//$", "", c(paste(annotData[x,1], collapse="//"), paste(annotData[x,2], collapse="//"), paste(annotData[x,3], collapse="//"), paste(annotData[x,4], collapse="//"), paste(annotData[x,5], collapse="//"))) - }, regions) - annot <- as.data.frame(t(annot), stringsAsFactors=F) - annot <- cbind(annot, "Overlap", stringsAsFactors=F) - colnames(annot) <- c(colnames(annotData), tail(colnames(tab),1)) - } - rm(regions) - gc() - - # for non the non-overlaps the distance of the closest features to each region - if(length(non_overlap) > 0) { - clst_pts <- matrix(0, ncol=4, nrow=length(non_overlap)) - clst_pts[,1] <- max.col(-abs(dist_start_start), "last") - clst_pts[,2] <- max.col(-abs(dist_start_end), "last") - clst_pts[,3] <- max.col(-abs(dist_end_start), "last") - clst_pts[,4] <- max.col(-abs(dist_end_end), "last") - dist <- matrix(0, ncol=4, nrow=length(non_overlap)) - if(length(clst_pts[,1]) == 1) { - dist[,1] <- dist_start_start[1, clst_pts[,1]] - dist[,2] <- dist_start_end[1, clst_pts[,2]] - dist[,3] <- dist_end_start[1, clst_pts[,3]] - dist[,4] <- dist_end_end[1, clst_pts[,4]] - - # extract the annot for the regions with non-overlaping features - clst_all <- max.col(-abs(dist)) - dist_to_feat <- dist[1, clst_all] - clst <- clst_pts[1, clst_all] - } else { - dist[,1] <- dist_start_start[cbind(seq(clst_pts[,1]), clst_pts[,1])] - dist[,2] <- dist_start_end[cbind(seq(clst_pts[,2]), clst_pts[,2])] - dist[,3] <- dist_end_start[cbind(seq(clst_pts[,3]), clst_pts[,3])] - dist[,4] <- dist_end_end[cbind(seq(clst_pts[,4]), clst_pts[,4])] - - # extract the annot for the regions with non-overlaping features - clst_all <- max.col(-abs(dist)) - dist_to_feat <- dist[cbind(seq(clst_all), clst_all)] - clst <- clst_pts[cbind(seq(clst_all), clst_all)] - } - annot_non_overlap <- cbind(annotData[clst, ], dist_to_feat) - colnames(annot_non_overlap) <- c(colnames(annotData), tail(colnames(tab),1)) - } - - rm(dist_start_start) - rm(dist_start_end) - rm(dist_end_start) - rm(dist_end_end) - gc() - - if(length(overlap) > 0) tab[overlap, colnames(annot)] <- annot - if(length(non_overlap) > 0) tab[non_overlap, colnames(annot_non_overlap)] <- annot_non_overlap - return(tab) -} - -tab_list <- list() -for(i in unique(tab_out[,2])) tab_list[[i]] <- tab_out[which(tab_out$chrom == i),] -rm(tab_out) -gc() -if(opt$annot == "both" || opt$annot == "gene") { - annotation <- "gene" - tab_list <- clusterApplyLB(cl, tab_list, add_annot, annotation, opt$reference) -} - -if(opt$annot == "both" || opt$annot == "cpg") { - annotation <- "cpg" - tab_list <- clusterApplyLB(cl, tab_list, add_annot, annotation, opt$reference) -} - -tab_out <- NULL -for(i in 1:length(tab_list)) tab_out <- rbind(tab_out, tab_list[[i]]) - -cat("'", file=opt$output) -write.table(tab_out[,c(1:13, 15:26, 14)], opt$output, row.names=F, col.names=T, quote=F, sep="\t", append=T) -stopCluster(cl) -q(status=0) diff -r c4416c3af217 -r 4c5c3994bfcb methylation_analysis/differential_methylation.xml --- a/methylation_analysis/differential_methylation.xml Mon Dec 03 18:27:31 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,113 +0,0 @@ - - methylation status of two samples by region - - samtools - bowtie2 - getopt - snow - abind - rtracklayer - biomaRt - - - - - differential_methylation.R -s $seg1 -t $seg2 -r $genome.fields.dbkey -p 1 -f $fdr -a $annot -o $output 2>1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -| - -**What it does** - -Compares the methylation status of two samples by region. It calculates a p.value for differential methylation using a test of proportions, that is, it tests the null hypothesis that the 2 methylation proportions are equal. The script will also calculate the difference in methylation and annotate the regions with the closest ensembl gene and cpg island. This script takes as input two seg files created by the tool "methylation_by_region_converter" and returns a new seg file containing: - -:: - - Comparison the name of the comparison - Chrom the chromosome of the region - loc.start the start of the region - loc.end the end of the region - Sample1.Methylated the number of methylated reads in the region for sample 1 - Sample1.Total the total number of reads in the region for sample 1 - Sample1.Proportion the proportion of methylated reads for sample 1 - Sample2.Methylated the number of methylated reads in the region for sample 2 - Sample2.Total the total number of reads in the region for sample 2 - Sample2.Proportion the proportion of methylated reads for sample 2 - X_Squared the test statistic for the test of proportions - P.value the raw p-value - P.adjusted the fdr adjusted p-value - ensembl_gene_id the closest or overlapping Ensembl gene id - hgnc_symbol the HUGO Gene Nomenclature Committee id - refseq_mrna the refseq id - feature_start the start of the gene - feature_end the end of the gene - Distance_To_Feature the distance to the feature - Cpg the closest or overlapping CpG island - cpg_perCpg the number of CpGs in island - cpg_perGc the percentage of island that is CpG - cpg_start the start location of the CpG island - cpg_end the end location of the CpG island - Distance_To_cpg the distance to the CpG island - DiffProp the difference in the methylation proportions - - Usage: differential_methylation.R [[option][value]] - Options: - -h|--help view this help - -s|--segfile1 seg file 1 - -t|--segfile2 seg file 2 - -o|--output output file - -f|--fdr fdr method [holm|hochberg|hommel|bonferroni|BH|BY|fdr|none] - -r|--reference reference to use [hg19|GRCh37|mm9|NCBIM37|mm10|GRCm38|dm3|BDGP5] - -a|--annot annotation to add [both|gene|cpg] - -p|--processes number of cluster instances to open [1] - - - - - - - diff -r c4416c3af217 -r 4c5c3994bfcb methylation_analysis/methylation_by_region_converter.pl --- a/methylation_analysis/methylation_by_region_converter.pl Mon Dec 03 18:27:31 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,161 +0,0 @@ -#!/usr/bin/perl - -# script to take a bed file of target regions and a series of bedgraphs from bismark and create a bedgrapgh with methylation percentages aggregated by region - -# Created by Jason Ellul, Oct 2012 - - -use strict; -use warnings; -use Getopt::Std; -use File::Basename; -use Data::Dumper; -$| = 1; - -# Grab and set all options -my %OPTIONS = (s => "MethylationData"); -getopts('l:L:o:s:', \%OPTIONS); - -die qq( -Usage: methylation_by_region_converter.pl [OPTIONS] [ ...] - -OPTIONS: -o STR the name of the output file - -l STR filename of the log file [null] - -L STR append to an existing log file [null] - -s STR Sample Name [$OPTIONS{s}] -) if(@ARGV < 2); - -my $version = 0.1; -my $bed = shift @ARGV; -my @graphs = @ARGV; -my $Script = "methylation_by_region_converter.pl"; -my $now; - -# if log file specified -if(defined $OPTIONS{l}) { - open (FH, ">$OPTIONS{l}") or die "Couldn't create log file $OPTIONS{l}!\n"; - close (FH); - # Open the log file and redirect output to it - open (STDERR, ">>$OPTIONS{l}") or die "Couldn't write to log file $OPTIONS{l}!\n"; - my $now = localtime time; - print "Log File Created $now\n"; -} elsif(defined $OPTIONS{L}) { - #append to a log file - # Open the log file and redirect output to it - open (STDERR, ">>$OPTIONS{L}") or die "Couldn't append to log file $OPTIONS{L}!\n"; - my $now = localtime time; - print "Appending To Log File $now\n\n"; -} - -# print version of this script. -print STDERR "Using $Script version $version\n\n"; -print STDERR "Using region file: $bed\n"; -print STDERR "And bedgraphs: @graphs\n\n"; -my %Regions; -my @chr_order; - -# read in regions file -print STDERR "Reading Regions Bed File $bed ... "; -open(BED, "$bed") || die "$bed: $!"; -while(my $line = ) { - chomp $line; - my @line_sp = split("\t", $line); - $line_sp[2]--; - - push @chr_order, $line_sp[0] unless defined $Regions{$line_sp[0]}; - push @{ $Regions{$line_sp[0]}{Start} }, $line_sp[1]; - push @{ $Regions{$line_sp[0]}{End} }, $line_sp[2]; - push @{ $Regions{$line_sp[0]}{Methylated} }, 0; - push @{ $Regions{$line_sp[0]}{Unmethylated} }, 0; -} -close(BED); -print STDERR "Done.\n"; - -# read in bedgraphs -foreach my $bedGraph (@graphs) { - print STDERR "Loading bedGraph File $bedGraph ... "; - open(GRAPH, $bedGraph) || die "$bedGraph: $!"; - my @lines = ; - close(GRAPH); - print STDERR "Done.\n\n"; - - print STDERR "Processing bedGraph File ... "; - foreach my $line (@lines) { - chomp $line; - my @line_sp = split("\t", $line); - $line_sp[1]++; - - # if the chromosome is in the regions file - if(defined $Regions{$line_sp[0]}) { - my $pos = binsearchregion($line_sp[1], \&cmpFunc, \@{ $Regions{$line_sp[0]}{Start} }, \@{ $Regions{$line_sp[0]}{End} }); - - # if the position is within a region - if($pos >= 0) { - ${ $Regions{$line_sp[0]}{Methylated} }[$pos] += $line_sp[4]; - ${ $Regions{$line_sp[0]}{Unmethylated} }[$pos] += $line_sp[5]; - } - } - } - print STDERR "Done.\n\n"; -} - -if(defined $OPTIONS{o}) { - open (STDOUT, ">$OPTIONS{o}") || die "$OPTIONS{o}: $!"; -} - -# calculate percent methylated and print -print STDERR "Printing output ... "; -print "#type=DNA_METHYLATION\n"; -print "'ID\tchrom\tloc.start\tloc.end\tMethylated\tUnmethylated\tTotal\tFractionMethylated\n"; -foreach my $chr (@chr_order) { - for(my $region = 0; $region < @{ $Regions{$chr}{Start} }; $region++) { - my $total = ${ $Regions{$chr}{Methylated} }[$region] + ${ $Regions{$chr}{Unmethylated} }[$region]; - my $fract = sprintf("%.3f", ${ $Regions{$chr}{Methylated} }[$region] / $total) if $total; - print "$OPTIONS{s}\t$chr\t${ $Regions{$chr}{Start} }[$region]\t${ $Regions{$chr}{End} }[$region]\t" if $total; - print "${ $Regions{$chr}{Methylated} }[$region]\t${ $Regions{$chr}{Unmethylated} }[$region]\t$total\t$fract\n" if $total; - } -} -close(STDERR) if defined $OPTIONS{o}; -print STDERR "Done.\n"; - -sub binsearchregion { - my ($target, $cmp, $start, $end) = @_; - - my $posmin = 0; - my $posmax = $#{ $start }; - - return -1 if &$cmp (0, $start, $target) > 0; - return -1 if &$cmp ($#{ $end }, $end, $target) < 0; - - while (1) { - my $mid = int (($posmin + $posmax) / 2); - my $result = &$cmp ($mid, $start, $target); - - if ($result < 0) { - $posmin = $posmax, next if $mid == $posmin && $posmax != $posmin; - if($mid == $posmin) { - return -1 if &$cmp ($mid, $end, $target) < 0; - return $mid; - } - $posmin = $mid; - } elsif ($result > 0) { - $posmax = $posmin, next if $mid == $posmax && $posmax != $posmin; - if($mid == $posmax) { - $mid--; - return -1 if &$cmp ($mid, $end, $target) < 0; - return $mid; - } - $posmax = $mid; - } else { - return $mid; - } - } -} - -sub cmpFunc { - my ($index, $arrayRef, $target) = @_; - my $item = $$arrayRef[$index]; - - return $item <=> $target; -} - diff -r c4416c3af217 -r 4c5c3994bfcb methylation_analysis/methylation_by_region_converter.xml --- a/methylation_analysis/methylation_by_region_converter.xml Mon Dec 03 18:27:31 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,69 +0,0 @@ - - : create a bedgrapgh with methylation percentages aggregated by region - - samtools - bowtie2 - - - - methylation_by_region_converter.pl - - -o $output - -l $summary - ## Handle reference file. - #if $refBedSource.bedSource == "history": - $refBedSource.ownFile - #else: - "${refBedSource.index.fields.path}" - #end if - $bedgraph - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -| - - -**What it does** - - -Script to take a bed file of target regions and a series of bedgraphs from bismark and create a bedgrapgh with methylation percentages aggregated by region - - ------ - - -:: - - Usage: methylation_by_region_converter.pl [OPTIONS] bed_file bedGraph - - - - - - diff -r c4416c3af217 -r 4c5c3994bfcb methylation_analysis/methylation_extractor.xml --- a/methylation_analysis/methylation_extractor.xml Mon Dec 03 18:27:31 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,165 +0,0 @@ - - : extracts the methylation information for individual cytosine - - samtools - bowtie2 - bismark - - - - methylation_extractor_wrapper.pl - - - "GENOME::${genome.fields.path}" - - - #if str($no_overlap) == "ON": - "OPTION::--no_overlap" - #end if - - #if str($ending) == "single": - "ENDING::-s" - #else - "ENDING::-p" - #end if - - #if str($report) == "ON": - "OPTION::--report" - #end if - - "OPTION::--bedGraph" - - "OPTION::--counts" - - - "OUTPUT::$output" - "SUMMARY::$summary" - - "BAMFILE::$bamfile" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -| - -**Reference** - - http://www.bioinformatics.babraham.ac.uk/projects/bismark/ - ------ - -**What it does** - - - -The script reads in a bisulfite read alignment results file -produced by the Bismark bisulfite mapper (BAM file) and extracts the methylation -informationfor individual cytosines. This information is found in the methylation -call field which can contain the following characters: - -:: - - ~~~ X for methylated C in CHG context (was protected) ~~~ - - ~~~ x for not methylated C CHG (was converted) ~~~ - - ~~~ H for methylated C in CHH context (was protected) ~~~ - - ~~~ h for not methylated C in CHH context (was converted) ~~~ - - ~~~ Z for methylated C in CpG context (was protected) ~~~ - - ~~~ z for not methylated C in CpG context (was converted) ~~~ - - ~~~ . for any bases not involving cytosines ~~~ - - - ------ - -**Required Parameters** - -:: - - -s/--single-end Input file(s) are Bismark result file(s) generated from single-end - read data. Specifying either --single-end or --paired-end is - mandatory. - - -p/--paired-end Input file(s) are Bismark result file(s) generated from paired-end - read data. Specifying either --paired-end or --single-end is - mandatory. - - --no_overlap For paired-end reads it is theoretically possible that read_1 and - read_2 overlap. This option avoids scoring overlapping methylation - calls twice (only methylation calls of read 1 are used for in the process - since read 1 has historically higher quality basecalls than read 2). - Whilst this option removes a bias towards more methylation calls - in the center of sequenced fragments it may de facto remove a sizable - proportion of the data. This option is highly recommended for paired-end - data. - - --report Prints out a short methylation summary as well as the paramaters used to run - this script. - - ------ - -**Default Parameters** - -:: - - --bedGraph After finishing the methylation extraction, the methylation output is written into a - sorted bedGraph file that reports the position of a given cytosine and its methylation - state (in %, seem details below). The methylation extractor output is temporarily split up into - temporary files, one per chromosome (written into the current directory or folder - specified with -o/--output); these temp files are then used for sorting and deleted - afterwards. By default, only cytosines in CpG context will be sorted. The option - '--CX_context' may be used to report all cyosines irrespective of sequence context - (this will take MUCH longer!). - - - - - - - diff -r c4416c3af217 -r 4c5c3994bfcb methylation_analysis/methylation_extractor_wrapper.pl --- a/methylation_analysis/methylation_extractor_wrapper.pl Mon Dec 03 18:27:31 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,106 +0,0 @@ -use strict; -use warnings; -use File::Basename; -use Cwd; -use File::Path qw(make_path remove_tree); -die qq( -Bad numbr of inputs - -) if(!@ARGV); - -my @bam_list_entries; - -my $player_options = ""; -my $bam_file; -my $ending; -my $genome; -my $summary ; -my $output; - -foreach my $input (@ARGV) -{ - my @tmp = split "::", $input; - if($tmp[0] eq "GENOME") - { - $genome=$tmp[1]; - } - elsif($tmp[0] eq "OPTION") - { - $player_options = "$player_options ${tmp[1]}"; - } - elsif($tmp[0] eq "ENDING") - { - $ending = $tmp[1]; - } - elsif($tmp[0] eq "BAMFILE") - { - $bam_file = $tmp[1]; - } - elsif($tmp[0] eq "SUMMARY") - { - $summary = $tmp[1]; - } - elsif($tmp[0] eq "OUTPUT") - { - $output = $tmp[1]; - } - else - { - die("Unknown Input: $input\n"); - } -} - - -my $working_dir = cwd(); - -#convert bam to sam -my $sam_file = "$working_dir/coverted.sam"; -system("samtools view -h $bam_file > $sam_file"); - -#run bismark - -system ("bismark_methylation_extractor $ending $player_options --genome_folder $genome $sam_file > $summary 2>&1"); - -move_files($working_dir); - -sub move_files -{ - my $name; - my $suffix; - my $path; - my $local_dir = $_[0]; - opendir(DIR, $local_dir); - #print ("Openning: $local_dir\n"); - my @FILES= readdir(DIR); - closedir(DIR); - foreach my $file (@FILES) - { - if ($file eq "." || $file eq "..") - { - #print ("./ or ../ skipped\n"); - } - elsif (-d "$local_dir/$file") - { - #print ("moving to: $local_dir/$file\n"); - move_files("$local_dir/$file"); - } - elsif (-f "$local_dir/$file") - { - ($name,$path,$suffix) = fileparse($file,qr"\.[^.]*$"); - if ($suffix eq ".bedGraph") - { - system ("mv $local_dir/$file $output"); - } - } - else - { - die("Unrecognized file generated: $file\n"); - } - - } -} - - - - - diff -r c4416c3af217 -r 4c5c3994bfcb tool_data_table_conf.xml.sample --- a/tool_data_table_conf.xml.sample Mon Dec 03 18:27:31 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ - - - value, dbkey, name, path - -

- - - value, name, path - -