# HG changeset patch # User rouan # Date 1388140126 18000 # Node ID 4ca4e9200808d754e51865e0338e186fa1e12351 # Parent a1f08471480046b6fd4cb4472ae9152d5176f00b Deleted selected files diff -r a1f084714800 -r 4ca4e9200808 edgeR.pl --- a/edgeR.pl Thu Dec 26 05:36:07 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,391 +0,0 @@ -#/bin/perl - -use strict; -use warnings; -use Getopt::Std; -use File::Basename; -use File::Path qw(make_path remove_tree); -$| = 1; - -# Grab and set all options -my %OPTIONS = (a => "glm", d => "tag", f => "BH", r => 5, u => "movingave"); - -getopts('a:d:e:f:h:lmn:o:r:tu:', \%OPTIONS); - -die qq( -Usage: edgeR.pl [OPTIONS] factor::factor1::levels [factor::factor2::levels ...] cp::cont_pred1::values [cp::cont_pred2::values ...] cnt::contrast1 [cnt::contrast2] matrix - -OPTIONS: -a STR Type Of Analysis [glm, pw, limma] (default: $OPTIONS{a}) - -d STR The dispersion estimate to use for GLM analysis [tag, trend, common] (default: $OPTIONS{d}) - -e STR Path to place additional output files - -f STR False discovery rate adjustment method [BH, holm, hochberg, hommel, BY, none] (default: $OPTIONS{f}) - -h STR Name of html file for additional files - -l Output the normalised digital gene expression matrix in log2 format (only applicable when using limma and -n is also specified) - -m Perform all pairwise comparisons - -n STR File name to output the normalised digital gene expression matrix (only applicable when usinf glm or limma model) - -o STR File name to output csv file with results - -r INT Common Dispersion Rowsum Filter, ony applicable when 1 factor analysis selected (default: $OPTIONS{r}) - -t Estimate Tagwise Disp when performing 1 factor analysis - -u STR Method for allowing the prior distribution for the dispersion to be abundance- dependent ["movingave", "tricube", "none"] (default: $OPTIONS{u}) - -) if(!@ARGV); - -my $matrix = pop @ARGV; - -make_path($OPTIONS{e}); -open(Rcmd,">$OPTIONS{e}/r_script.R") or die "Cannot open $OPTIONS{e}/r_script.R\n\n"; -print Rcmd " -zz <- file(\"$OPTIONS{e}/r_script.err\", open=\"wt\") -sink(zz) -sink(zz, type=\"message\") - -library(edgeR) -library(limma) - -# read in matrix and groups -toc <- read.table(\"$matrix\", sep=\"\\t\", comment=\"\", as.is=T) -groups <- sapply(toc[1, -1], strsplit, \":\") -for(i in 1:length(groups)) { g <- make.names(groups[[i]][2]); names(groups)[i] <- g; groups[[i]] <- groups[[i]][-2] } -colnames(toc) <- make.names(toc[2,]) -toc[,1] <- gsub(\",\", \".\", toc[,1]) -tagnames <- toc[-(1:2), 1] -rownames(toc) <- toc[,1] -toc <- toc[-(1:2), -1] -for(i in colnames(toc)) toc[, i] <- as.numeric(toc[,i]) -norm_factors <- calcNormFactors(as.matrix(toc)) - -pw_tests <- list() -uniq_groups <- unique(names(groups)) -for(i in 1:(length(uniq_groups)-1)) for(j in (i+1):length(uniq_groups)) pw_tests[[length(pw_tests)+1]] <- c(uniq_groups[i], uniq_groups[j]) -DGE <- DGEList(toc, lib.size=norm_factors*colSums(toc), group=names(groups)) -pdf(\"$OPTIONS{e}/MA_plots_normalisation.pdf\", width=14) -for(i in 1:length(pw_tests)) { - j <- c(which(names(groups) == pw_tests[[i]][1])[1], which(names(groups) == pw_tests[[i]][2])[1]) - par(mfrow = c(1, 2)) - maPlot(toc[, j[1]], toc[, j[2]], normalize = TRUE, pch = 19, cex = 0.2, ylim = c(-10, 10), main=paste(\"MA Plot\", colnames(toc)[j[1]], \"vs\", colnames(toc)[j[2]])) - grid(col = \"blue\") - abline(h = log2(norm_factors[j[2]]), col = \"red\", lwd = 4) - maPlot(DGE\$counts[, j[1]]/DGE\$samples\$lib.size[j[1]], DGE\$counts[, j[2]]/DGE\$samples\$lib.size[j[2]], normalize = FALSE, pch = 19, cex = 0.2, ylim = c(-8, 8), main=paste(\"MA Plot\", colnames(toc)[j[1]], \"vs\", colnames(toc)[j[2]], \"Normalised\")) - grid(col = \"blue\") -} -dev.off() -pdf(file=\"$OPTIONS{e}/MDSplot.pdf\") -plotMDS(DGE, main=\"MDS Plot\", col=as.numeric(factor(names(groups)))+1, xlim=c(-3,3)) -dev.off() -tested <- list() -"; - -my $all_cont; -my @add_cont; -my @fact; -my @fact_names; -my @cp; -my @cp_names; -if(@ARGV) { - foreach my $input (@ARGV) { - my @tmp = split "::", $input; - if($tmp[0] eq "factor") { - $tmp[1] =~ s/[ \?\(\)\[\]\/\\=+<>:;\"\',\*\^\|\&-]/./g; - push @fact_names, $tmp[1]; - $tmp[2] =~ s/:/\", \"/g; - $tmp[2] = "\"".$tmp[2]."\""; - push @fact, $tmp[2]; - } elsif($tmp[0] eq "cp") { - $tmp[1] =~ s/[ \?\(\)\[\]\/\\=+<>:;\"\',\*\^\|\&-]/./g; - push @cp_names, $tmp[1]; - $tmp[2] =~ s/:/, /g; - push @cp, $tmp[2]; - } elsif($tmp[0] eq "cnt") { - push @add_cont, $tmp[1]; - } else { - die("Unknown Input: $input\n"); - } - } -} - -if($OPTIONS{a} eq "pw") { - print Rcmd " -disp <- estimateCommonDisp(DGE, rowsum.filter=$OPTIONS{r}) -"; - if(defined $OPTIONS{t}) { - print Rcmd " -disp <- estimateTagwiseDisp(disp, trend=\"$OPTIONS{u}\") -pdf(file=\"$OPTIONS{e}/Tagwise_Dispersion_vs_Abundance.pdf\") -plotBCV(disp, cex=0.4) -abline(h=disp\$common.dispersion, col=\"firebrick\", lwd=3) -dev.off() -"; - } - print Rcmd " -for(i in 1:length(pw_tests)) { - tested[[i]] <- exactTest(disp, pair=pw_tests[[i]]) - names(tested)[i] <- paste(pw_tests[[i]][2], \"-\", pw_tests[[i]][1], sep=\"\") -} -pdf(file=\"$OPTIONS{e}/Smear_Plots.pdf\") -for(i in 1:length(pw_tests)) { - dt <- decideTestsDGE(tested[[i]], p.value=0.05, adjust.method=\"$OPTIONS{f}\") - if(sum(dt) > 0) { - de_tags <- rownames(disp)[which(dt != 0)] - ttl <- \"Diff. Exp. Genes With adj. Pvalue < 0.05\" - } else { - de_tags <- rownames(topTags(tested[[i]], n=100)\$table) - ttl <- \"Top 100 tags\" - } - - if(length(dt) < 5000) { - pointcex = 0.5 - } else { - pointcex = 0.2 - } - plotSmear(disp, pair=pw_tests[[i]], de.tags = de_tags, main = paste(\"Smear Plot\", names(tested)[i]), cex=0.5) - abline(h = c(-1, 1), col = \"blue\") - legend(\"topright\", c(\"2 Fold Change\", ttl) , lty=c(1, NA), pch=c(NA, 19), pt.cex=0.5, col=c(\"blue\", \"red\"), bty=\"n\") -} -dev.off() -"; -} elsif($OPTIONS{a} eq "glm") { - for(my $fct = 0; $fct <= $#fact_names; $fct++) { - print Rcmd " -$fact_names[$fct] <- c($fact[$fct]) -"; - } - for(my $fct = 0; $fct <= $#cp_names; $fct++) { - print Rcmd " -$cp_names[$fct] <- c($cp[$fct]) -"; - } - my $all_fact = ""; - if(@fact_names) { - foreach (@fact_names) { - $all_fact .= " + factor($_)"; - } - } - my $all_cp = ""; - if(@cp_names) { - $all_cp = " + ".join(" + ", @cp_names); - } - print Rcmd " -group_fact <- factor(names(groups)) -design <- model.matrix(~ -1 + group_fact${all_fact}${all_cp}) -colnames(design) <- sub(\"group_fact\", \"\", colnames(design)) -"; - foreach my $fct (@fact_names) { - print Rcmd " -colnames(design) <- make.names(sub(\"factor.$fct.\", \"\", colnames(design))) -"; - } - print Rcmd " -disp <- estimateGLMCommonDisp(DGE, design) -"; - if($OPTIONS{d} eq "tag" || $OPTIONS{d} eq "trend") { - print Rcmd " -disp <- estimateGLMTrendedDisp(disp, design) -"; - } - if($OPTIONS{d} eq "tag") { - print Rcmd " -disp <- estimateGLMTagwiseDisp(disp, design) -fit <- glmFit(disp, design) -pdf(file=\"$OPTIONS{e}/Tagwise_Dispersion_vs_Abundance.pdf\") -plotBCV(disp, cex=0.4) -dev.off() -"; - } - if(@add_cont) { - $all_cont = "\"".join("\", \"", @add_cont)."\""; - print Rcmd " -cont <- c(${all_cont}) -for(i in uniq_groups) cont <- gsub(paste(groups[[i]], \"([^0-9])\", sep=\"\"), paste(i, \"\\\\1\", sep=\"\"), cont) -for(i in uniq_groups) cont <- gsub(paste(groups[[i]], \"\$\", sep=\"\"), i, cont) -"; - } else { - print Rcmd " -cont <- NULL -"; - } - if(defined $OPTIONS{m}) { - print Rcmd " -for(i in 1:length(pw_tests)) cont <- c(cont, paste(pw_tests[[i]][2], \"-\", pw_tests[[i]][1], sep=\"\")) -"; - } - if(!defined $OPTIONS{m} && !@add_cont){ - die("No Contrasts have been specified, you must at least either select multiple pairwise comparisons or specify a custom contrast\n"); - } - print Rcmd " -fit <- glmFit(disp, design) -cont <- makeContrasts(contrasts=cont, levels=design) -for(i in colnames(cont)) tested[[i]] <- glmLRT(fit, contrast=cont[,i]) -pdf(file=\"$OPTIONS{e}/Smear_Plots.pdf\") -for(i in colnames(cont)) { - dt <- decideTestsDGE(tested[[i]], p.value=0.05, adjust.method=\"$OPTIONS{f}\") - if(sum(dt) > 0) { - de_tags <- rownames(disp)[which(dt != 0)] - ttl <- \"Diff. Exp. Genes With adj. Pvalue < 0.05\" - } else { - de_tags <- rownames(topTags(tested[[i]], n=100)\$table) - ttl <- \"Top 100 tags\" - } - - if(length(dt) < 5000) { - pointcex = 0.5 - } else { - pointcex = 0.2 - } - plotSmear(disp, de.tags = de_tags, main = paste(\"Smear Plot\", i), cex=pointcex) - abline(h = c(-1, 1), col = \"blue\") - legend(\"topright\", c(\"2 Fold Change\", ttl) , lty=c(1, NA), pch=c(NA, 19), pt.cex=0.5, col=c(\"blue\", \"red\"), bty=\"n\") -} -dev.off() -"; - if(defined $OPTIONS{n}) { - print Rcmd " -tab <- data.frame(ID=rownames(fit\$fitted.values), fit\$fitted.values, stringsAsFactors=F) -write.table(tab, \"$OPTIONS{n}\", quote=F, sep=\"\\t\", row.names=F) -"; - } -} elsif($OPTIONS{a} eq "limma") { - for(my $fct = 0; $fct <= $#fact_names; $fct++) { - print Rcmd " -$fact_names[$fct] <- c($fact[$fct]) -"; - } - for(my $fct = 0; $fct <= $#cp_names; $fct++) { - print Rcmd " -$cp_names[$fct] <- c($cp[$fct]) -"; - } - my $all_fact = ""; - if(@fact_names) { - foreach (@fact_names) { - $all_fact .= " + factor($_)"; - } - } - my $all_cp = ""; - if(@cp_names) { - $all_cp = " + ".join(" + ", @cp_names); - } - print Rcmd " -group_fact <- factor(names(groups)) -design <- model.matrix(~ -1 + group_fact${all_fact}${all_cp}) -colnames(design) <- sub(\"group_fact\", \"\", colnames(design)) -"; - foreach my $fct (@fact_names) { - print Rcmd " -colnames(design) <- make.names(sub(\"factor.$fct.\", \"\", colnames(design))) -"; - } - print Rcmd " -isexpr <- rowSums(cpm(toc)>1) >= 2 -toc <- toc[isexpr, ] -pdf(file=\"$OPTIONS{e}/LIMMA_voom.pdf\") -y <- voom(toc, design, plot=TRUE, lib.size=colSums(toc)*norm_factors) -dev.off() - -pdf(file=\"$OPTIONS{e}/LIMMA_MDS_plot.pdf\") -plotMDS(y, labels=colnames(toc), col=as.numeric(factor(names(groups)))+1, gene.selection=\"common\") -dev.off() -fit <- lmFit(y, design) -"; - if(defined $OPTIONS{n}) { - if(defined $OPTIONS{l}) { - print Rcmd " -tab <- data.frame(ID=rownames(y\$E), y\$E, stringsAsFactors=F) -"; - } else { - print Rcmd " -tab <- data.frame(ID=rownames(y\$E), 2^y\$E, stringsAsFactors=F) -"; - } - print Rcmd " -write.table(tab, \"$OPTIONS{n}\", quote=F, sep=\"\\t\", row.names=F) -"; - } - if(@add_cont) { - $all_cont = "\"".join("\", \"", @add_cont)."\""; - print Rcmd " -cont <- c(${all_cont}) -for(i in uniq_groups) cont <- gsub(paste(groups[[i]], \"([^0-9])\", sep=\"\"), paste(i, \"\\\\1\", sep=\"\"), cont) -for(i in uniq_groups) cont <- gsub(paste(groups[[i]], \"\$\", sep=\"\"), i, cont) -"; - } else { - print Rcmd " -cont <- NULL -"; - } - if(defined $OPTIONS{m}) { - print Rcmd " -for(i in 1:length(pw_tests)) cont <- c(cont, paste(pw_tests[[i]][2], \"-\", pw_tests[[i]][1], sep=\"\")) -"; - } - if(!defined $OPTIONS{m} && !@add_cont){ - die("No Contrasts have been specified, you must at least either select multiple pairwise comparisons or specify a custom contrast\n"); - } - print Rcmd " -cont <- makeContrasts(contrasts=cont, levels=design) -fit2 <- contrasts.fit(fit, cont) -fit2 <- eBayes(fit2) -"; -} else { - die("Anaysis type $OPTIONS{a} not found\n"); - -} - -if($OPTIONS{a} ne "limma") { - print Rcmd " -options(digits = 6) -tab <- NULL -for(i in names(tested)) { - tab_tmp <- topTags(tested[[i]], n=Inf, adjust.method=\"$OPTIONS{f}\")[[1]] - colnames(tab_tmp) <- paste(i, colnames(tab_tmp), sep=\":\") - tab_tmp <- tab_tmp[tagnames,] - if(is.null(tab)) { - tab <- tab_tmp - } else tab <- cbind(tab, tab_tmp) -} -tab <- cbind(Feature=rownames(tab), tab) -"; -} else { - print Rcmd " -tab <- NULL -options(digits = 6) -for(i in colnames(fit2)) { - tab_tmp <- topTable(fit2, coef=i, n=Inf, sort.by=\"none\", adjust.method=\"$OPTIONS{f}\") - colnames(tab_tmp)[-1] <- paste(i, colnames(tab_tmp)[-1], sep=\":\") - if(is.null(tab)) { - tab <- tab_tmp - } else tab <- cbind(tab, tab_tmp[,-1]) -} -"; -} -print Rcmd " -write.table(tab, \"$OPTIONS{o}\", quote=F, sep=\"\\t\", row.names=F) -sink(type=\"message\") -sink() -"; -close(Rcmd); -system("R --no-restore --no-save --no-readline < $OPTIONS{e}/r_script.R > $OPTIONS{e}/r_script.out"); - -open(HTML, ">$OPTIONS{h}"); -print HTML "EdgeR: Empirical analysis of digital gene expression data

EdgeR Additional Files:

\n"; -close(HTML); - diff -r a1f084714800 -r 4ca4e9200808 edgeR.xml --- a/edgeR.xml Thu Dec 26 05:36:07 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,214 +0,0 @@ - - - Estimates differential gene expression for short read sequence count using methods appropriate for count data - - edgeR - limma - - - edgeR.pl -a $analysis_type.analysis -e $html_file.files_path -f $fdr -h $html_file -o $output - ## Pairwise comparisons - #if $analysis_type.analysis == "pw": - -r $analysis_type.rowsumfilter - #if $analysis_type.tagwise_disp.twd == "TRUE": - -u $analysis_type.tagwise_disp.twd_trend - -t - #end if - ## GLM - #else if $analysis_type.analysis == "glm": - #if $analysis_type.exp.export_norm == "true": - -n $norm_exp - #end if - -d $analysis_type.disp - $analysis_type.cont_pw - #for $fct in $analysis_type.factors: - factor::${$fct.fact_name}::${$fct.fact} - #end for - #for $c in $analysis_type.cont_pred: - cp::${c.cp_name}::${c.cp} - #end for - #for $cnt in $analysis_type.contrasts: - "cnt::${cnt.add_cont}" - #end for - ## LIMMA - #else - #if $analysis_type.exp.export_norm == "true": - -n $norm_exp $analysis_type.exp.log - #end if - $analysis_type.cont_pw - #for $fct in $analysis_type.factors: - factor::${$fct.fact_name}::${$fct.fact} - #end for - #for $c in $analysis_type.cont_pred: - cp::${c.cp_name}::${c.cp} - #end for - #for $cnt in $analysis_type.contrasts: - "cnt::${cnt.add_cont}" - #end for - #end if - $matrix - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - analysis_type[ "analysis" ] != "pw" and analysis_type[ "exp" ][ "export_norm" ] == "true" - - - - - -.. class:: infomark - -**What it does** - -Estimates differential gene expression for short read sequence count using methods appropriate for count data. -If you have paired data you may also want to consider Tophat/Cufflinks. -Input must be raw count data for each sequence arranged in a rectangular matrix as a tabular file. -Note - no scaling - please make sure you have untransformed raw counts of reads for each sequence. - -Performs digital differential gene expression analysis between groups (eg a treatment and control). -Biological replicates provide information about experimental variability required for reliable inference. - -**What it does not do** -edgeR_ requires biological replicates. -Without replicates you can't account for known important experimental sources of variability that the approach implemented here requires. - - -**Input** -A count matrix containing sequence names as rows and sample specific counts of reads from this sequence as columns. -The matrix must have 2 header rows, the first indicating the group assignment and the second uniquely identifiying the samples. It must also contain a unique set of (eg Feature) names in the first column. - -Example:: - - # G1:Mut G1:Mut G1:Mut G2:WT G2:WT G2:WT - #Feature Spl1 Spl2 Spl3 Spl4 Spl5 Spl6 - NM_001001130 97 43 61 34 73 26 - NM_001001144 25 8 9 3 5 5 - NM_001001152 72 45 29 20 31 13 - NM_001001160 0 1 1 1 0 0 - NM_001001177 0 1 0 4 3 3 - NM_001001178 0 2 1 0 4 0 - NM_001001179 0 0 0 0 0 2 - NM_001001180 0 0 0 0 0 2 - NM_001001181 415 319 462 185 391 155 - NM_001001182 1293 945 987 297 938 496 - NM_001001183 5 4 11 7 11 2 - NM_001001184 135 198 178 110 205 64 - NM_001001185 186 1 0 1 1 0 - NM_001001186 75 90 91 34 63 54 - NM_001001187 267 236 170 165 202 51 - NM_001001295 5 2 6 1 7 0 - NM_001001309 1 0 0 1 2 1 - ... - - -Please use the "Count reads in features with htseq-count" tool to generate the count matrix. - -**Output** - -A tabular file containing relative expression levels, statistical estimates of differential expression probability, R scripts, log, and some helpful diagnostic plots. - -.. class:: infomark - -**Attribution** -This tool wraps the edgeR_ Bioconductor package so all calculations and plots are controlled by that code. See edgeR_ for all documentation and appropriate attribution. -Recommended reference is Mark D. Robinson, Davis J. McCarthy, Gordon K. Smyth, PMCID: PMC2796818 - -.. class:: infomark - -**Attribution** -When applying the LIMMA (Linear models for RNA-Seq) anlysis the tool also makes use of the limma_ Bioconductor package. -Recommended reference is Smyth, G. K. (2005). Limma: linear models for microarray data. In: 'Bioinformatics and Computational Biology Solutions using R and Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry, W. Huber (eds), Springer, New York, pages 397--420. - - .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html - .. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html - - - - - diff -r a1f084714800 -r 4ca4e9200808 htseq.pl --- a/htseq.pl Thu Dec 26 05:36:07 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,107 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; -use Getopt::Std; -use File::Basename; -$| = 1; - -# Grab and set all options -my %OPTIONS = (a => 0, i => "gene_id", m => "intersection-nonempty", s => "no", t => "exon"); -getopts('a:cg:i:m:o:r:s:t:', \%OPTIONS); - -die qq( -Usage: HTSeq.pl [OPTIONS] Group1=sample1= [Group1=sample2= ... Group2=sampleN= ...] - -OPTIONS: -a STR skip all reads with alignment quality lower than the given minimum value (default: $OPTIONS{a}) - -c reduce the matrix by removing any feature with no counts - -g STR the features file in the GFF/GTF format - -i STR GFF attribute to be used as feature ID (default: $OPTIONS{i}) - -m STR mode to handle reads overlapping more than one feature. Possible values for are union, intersection-strict and intersection-nonempty (default: $OPTIONS{m}) - -o STR output file name for expression matrix - -r STR the name of the output report - -s STR whether the data is from a strand-specific assay (default: $OPTIONS{s}) - -t STR feature type (3rd column in GFF file) to be used, all features of other type are ignored (default, suitable for RNA-Seq and Ensembl GTF files: $OPTIONS{t}) - -) if(@ARGV == 0); - -my $sam_out; -my @counts; -my @features; -my %report; -my @samplenames; -my $current_group; -my @groups; -my @files; -my $groupcount = 0; -my %grouphash; - -foreach my $input (@ARGV) { - my ($group, $sample, $input) = split "::", $input; - if(! defined $grouphash{$group}) { - $groupcount++; - $grouphash{$group} = "G${groupcount}:$group"; - } - push @groups, $grouphash{$group}; - push @samplenames, $sample; - push @files, $input; -} - -for(my $index = 0; $index <= $#files; $index++) { - my $input_file = $files[$index]; - my $sample = $samplenames[$index]; - - # run htseq - my @htseq; - my $COMM; - my $file_type = `file $input_file`; - if(grep /text$/, $file_type ) { - $COMM = "htseq-count -q -m $OPTIONS{m} -s $OPTIONS{s} -a $OPTIONS{a} -t $OPTIONS{t} -i $OPTIONS{i} $input_file $OPTIONS{g}"; - @htseq = `$COMM`; - } else { - $COMM = "samtools view $input_file | htseq-count -q -m $OPTIONS{m} -s $OPTIONS{s} -a $OPTIONS{a} -t $OPTIONS{t} -i $OPTIONS{i} - $OPTIONS{g}"; - @htseq = `$COMM`; - } - - my $row = 0; - $report{$sample} = "Command Used: $COMM\n"; - - for(my $row = 0; $row <= $#htseq; $row++) { - # store the report is an hash - if(grep /^no_feature|^ambiguous|^too_low_aQual|^not_aligned|^alignment_not_unique/, $htseq[$row]) { - $report{$sample} .= $htseq[$row]; - } else { - # store the counts in a matrix - chomp $htseq[$row]; - my ($feature, $value) = split "\t", $htseq[$row]; - $counts[$row][$index] = $value; - if($input_file eq $files[0]) { - push @features, $feature; - } - } - } -} - -# print the matrix -open(MATRIX, ">$OPTIONS{o}") || die "Could Not Create Output File $OPTIONS{o}!\n"; -print MATRIX "#\t".join("\t", @groups)."\n"; -print MATRIX "#Feature\t".join("\t", @samplenames)."\n"; -for(my $row = 0; $row <= $#features; $row++) { - if(defined $OPTIONS{c}) { - my $rowsum = 0; - $rowsum += $_ foreach @{ $counts[$row] }; - if(!$rowsum) { - next; - } - } - print MATRIX "$features[$row]\t".join("\t", @{ $counts[$row] })."\n"; -} -close(MATRIX); - -# print the report -open(REPORT, ">$OPTIONS{r}") || die "Could Not Create Output File $OPTIONS{r}!\n"; -print REPORT "$groups[$_]:$samplenames[$_]\n$report{$samplenames[$_]}\n" foreach (0..$#samplenames); -close(REPORT); - - - diff -r a1f084714800 -r 4ca4e9200808 htseq.xml --- a/htseq.xml Thu Dec 26 05:36:07 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,105 +0,0 @@ - - - samtools - htseq - - - Create a digital expression matrix by counting reads in features with htseq-count - - htseq.pl -m $MODE -s $STRANDED -a $MINAQUAL -t $FEATURETYPE -i $IDATTR -g $gff_file -o $output -r $report $REDUCE - ## Inputs. - #for $group in $group_analysis - ${group.group}::${group.sample_init}::${group.file_init} - #for $input_files in $group.input_files: - ${group.group}::${input_files.sample}::${input_files.file} - #end for - #end for - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -**What it does** - -Create a digital expression matrix by counting reads in features with htseq-count - -For each given file with aligned sequencing reads this tool counts how many reads map to each feature. It then constructs a matrix where the rows represent the features and the columns represent the files. - -A feature is here an interval (i.e., a range of positions) on a chromosome or a union of such intervals. - -In the case of RNA-Seq, the features are typically genes, where each gene is considered here as the union of all its exons. One may also consider each exon as a feature, e.g., in order to check for alternative splicing. For comparative ChIP-Seq, the features might be binding region from a pre-determined list. - -Special care must be taken to decide how to deal with reads that overlap more than one feature. The htseq-count script allows to choose between three modes. - -The three overlap resolution modes of htseq-count work as follows. For each position i in the read, a set S(i) is defined as the set of all features overlapping position i. Then, consider the set S, which is (with i running through all position within the read) - - * the union of all the sets S(i) for mode union. - * the intersection of all the sets S(i) for mode intersection-strict. - * the intersection of all non-empty sets S(i) for mode intersection-nonempty. - -If S contains precisely one feature, the read is counted for this feature. If it contains more than one feature, the read is counted as ambiguous (and not counted for any features), and if S is empty, the read is counted as no_feature. - -The following figure illustrates the effect of these three modes: - -.. image:: http://www-huber.embl.de/users/anders/HTSeq/doc/_images/count_modes.png - -The strandedness of the assay may also be set. For stranded=no, a read is considered overlapping with a feature regardless of whether it is mapped to the same or the opposite strand as the feature. For stranded=yes and single-end reads, the read has to be mapped to the same strand as the feature. For paired-end reads, the first read has to be on the same strand and the second read on the opposite strand. For stranded=reverse, these rules are reversed. - -.. class:: warningmark - -**Important:** The default for strandedness is no. If yes or reverse is selected and your RNA-Seq data has not been made with a strand-specific protocol, this will cause half of the reads to be lost. Hence, make sure to set the option --stranded=no unless you have strand-specific data! - ------- - -**Output** - -The script outputs a digital expression matrix containing the counts for each feature by each input Sam/Bam file. It will also generate a report containing special counters, which count reads that were not counted for any feature for various reasons, namely: - - * no_feature: reads which could not be assigned to any feature (set S as described above was empty). - * ambiguous: reads which could have been assigned to more than one feature and hence were not counted for any of these (set S had more than one element). - * too_low_aQual: reads which were not counted due to the -a option, see below - * not_aligned: reads in the Sam/Bam file without alignment - * alignment_not_unique: reads with more than one reported alignment. These reads are recognized from the NH optional SAM field tag. (If the aligner does not set this field, multiply aligned reads will be counted multiple times.) - -**Reference** - -http://www-huber.embl.de/users/anders/HTSeq/doc/overview.html - - - - - -