# HG changeset patch # User jjohnson # Date 1360162618 18000 # Node ID c6eade913da324a19792323ce3c42bb9a24d38bb Uploaded diff -r 000000000000 -r c6eade913da3 tophat_stats_pe.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tophat_stats_pe.xml Wed Feb 06 09:56:58 2013 -0500 @@ -0,0 +1,38 @@ + + Calculate mapping statistics from Tophat BAM files. + tophatstatsPE.pl $hits $fastq > $output + + + + + + + + + + +**Tophat Stats PE** + +This tool generates mapping statics from BAM files produced by +Tophat. For paired-end experiments only one of the read files (R1 +or R2) needs to be provided. + +**Example output:** + +:: + + Input files: /galaxy/database/files/001/dataset_1682.dat /galaxy/database/files/001/dataset_1680.dat + 73538 total read pairs in fastq file + 14429 (19.62%) read pairs with only one read in the pair mapped (14262 with unique alignments) + 36461 (49.58%) read pairs mapped with correct orientation and insert size (36366 with unique alignments) + 12169 (16.55%) read pairs mapped with correct orientation but wrong insert size (12063 with unique alignments) + 0.458333333333333 (0.00%) read pairs mapped with wrong orientation but correct insert size (0 with unique alignments) + 10478.5416666667 (14.25%) read pairs with no mapping + + +**Credit** + +John Garbe, University of Minnesota + + + diff -r 000000000000 -r c6eade913da3 tophatstatsPE.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tophatstatsPE.pl Wed Feb 06 09:56:58 2013 -0500 @@ -0,0 +1,125 @@ +#!/usr/bin/perl -w + +############################################ +# tophatstatsPE +# John Garbe +# May 29, 2012 +# +# Calculate statistics about the data going into and out of tophat: +# -Count the number of raw reads, the number of aligned reads, the number +# of properly paired reads, etc +# +########################################### + +die "USAGE: tophatstats.pl accepted_hits.bam fastqfile.fastq\n" unless ($#ARGV == 1); + +$bamfile = $ARGV[0]; +$fastqfile = $ARGV[1]; +die "cannot open $fastqfile" unless (-e $fastqfile); +die "cannot open $bamfile" unless (-e $bamfile); + +my $junk; +$divide = 4; # four lines per read in a fastq file +$insertsum = 0; +$insertcount = 0; +# count total number of raw reads +if (1) { + print "Input files: $bamfile\t $fastqfile\n"; + $results = `wc -l $fastqfile`; + ($results, $junk) = split ' ', $results; + $total = $results / $divide; + print "$total total read pairs in fastq file\n"; +} + +# read in bam file as a pipe from samtools +open IFILE, "samtools view $bamfile |" or die "cannot open $bamfile\n"; + +# initialize array counting multiple alignments +for $i (0..200) { + $count[$i] = 0; + for $j (0..40) { + $unique[$i][$j] = 0; + } +} + +$insertsum = 0; +$insertcount = 0; +# read through bam file, counting alignments +while ($line = ) { + @line = split /\t/, $line, 3; + $count[$line[1]]++; + + # pull out number of alignments for this read + @line = split /\t/, $line, 12; + ($junk, $int) = split /NH:i:/, $line[11]; + ($int, $junk) = split /\W/, $int; + if ($int == 1) { # count unique alignments + $unique[$line[1]][1]++; + } elsif ($int > 1) { # count non-unique alignments + $unique[$line[1]][2]+= 1 / $int; + } + if ($line[1] == 99 || $line[1] == 83) { # if proper insert size use for insert size average + @line = split /\t/, $line, 10; + $insertsum += $line[8]; + $insertcount++; + } +} + +# singleton +$singleton_unique = 0; +$singleton_multiple = 0; +for $i (73, 133, 89, 121, 165, 181, 101, 117, 153, 185, 69, 137) { + $singleton_unique += $unique[$i][1]; + $singleton_multiple += int($unique[$i][2]); +} +$singleton = $singleton_unique + $singleton_multiple; +$singleton_percent = $singleton / $total * 100; +printf "$singleton (%.2f%%) read pairs with only one read in the pair mapped ($singleton_unique with unique alignments)\n", $singleton_percent; + +# correct correct +$correctcorrect_unique = 0; +$correctcorrect_multiple = 0; +for $i (99, 83) { + $correctcorrect_unique += $unique[$i][1]; + $correctcorrect_multiple += int($unique[$i][2]); +} +$correctcorrect = $correctcorrect_unique + $correctcorrect_multiple; +$correctcorrect_percent = $correctcorrect / $total * 100; +printf "$correctcorrect (%.2f%%) read pairs mapped with correct orientation and insert size ($correctcorrect_unique with unique alignments)\n", $correctcorrect_percent; + +# correct wrong insert size +$correctwrong_unique = 0; +$correctwrong_multiple = 0; +for $i (81, 97, 65, 113) { + $correctwrong_unique += $unique[$i][1]; + $correctwrong_multiple += int($unique[$i][2]); +} +$correctwrong = $correctwrong_unique + $correctwrong_multiple; +$correctwrong_percent = $correctwrong / $total * 100; +printf "$correctwrong (%.2f%%) read pairs mapped with correct orientation but wrong insert size ($correctwrong_unique with unique alignments)\n", $correctwrong_percent; + +# correct wrong orientation +$correctswitched_unique = 0; +$correctswitched_multiple = 0; +for $i (67, 115) { + $correctswitched_unique += $unique[$i][1]; + $correctswitched_multiple += $unique[$i][2]; +} +$correctswitched = $correctswitched_unique + $correctswitched_multiple; +$correctswitched_percent = $correctswitched / $total * 100; +printf "$correctswitched (%.2f%%) read pairs mapped with wrong orientation but correct insert size ($correctswitched_unique with unique alignments)\n", $correctswitched_percent; + +# no mapping +$nomapping = $total - ($correctcorrect + $correctswitched + $correctwrong + $singleton); +$nomapping_percent = $nomapping / $total * 100; +printf "$nomapping (%.2f%%) read pairs with no mapping\n", $nomapping_percent; + +# insert size +$insertavg = 0; +$insertavg = $insertsum / $insertcount unless ($insertcount == 0); +#printf "%.2fbp average inner distance between read pairs (of read pairs with correct insert size)\n", $insertavg; # Not ready for primetime yet - needs verification + +exit; + +############################################################ +