Mercurial > repos > jjohnson > tophat_stats_pe
view tophatstatsPE.pl @ 0:c6eade913da3 draft default tip
Uploaded
author | jjohnson |
---|---|
date | Wed, 06 Feb 2013 09:56:58 -0500 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl -w ############################################ # tophatstatsPE # John Garbe # May 29, 2012 # # Calculate statistics about the data going into and out of tophat: # -Count the number of raw reads, the number of aligned reads, the number # of properly paired reads, etc # ########################################### die "USAGE: tophatstats.pl accepted_hits.bam fastqfile.fastq\n" unless ($#ARGV == 1); $bamfile = $ARGV[0]; $fastqfile = $ARGV[1]; die "cannot open $fastqfile" unless (-e $fastqfile); die "cannot open $bamfile" unless (-e $bamfile); my $junk; $divide = 4; # four lines per read in a fastq file $insertsum = 0; $insertcount = 0; # count total number of raw reads if (1) { print "Input files: $bamfile\t $fastqfile\n"; $results = `wc -l $fastqfile`; ($results, $junk) = split ' ', $results; $total = $results / $divide; print "$total total read pairs in fastq file\n"; } # read in bam file as a pipe from samtools open IFILE, "samtools view $bamfile |" or die "cannot open $bamfile\n"; # initialize array counting multiple alignments for $i (0..200) { $count[$i] = 0; for $j (0..40) { $unique[$i][$j] = 0; } } $insertsum = 0; $insertcount = 0; # read through bam file, counting alignments while ($line = <IFILE>) { @line = split /\t/, $line, 3; $count[$line[1]]++; # pull out number of alignments for this read @line = split /\t/, $line, 12; ($junk, $int) = split /NH:i:/, $line[11]; ($int, $junk) = split /\W/, $int; if ($int == 1) { # count unique alignments $unique[$line[1]][1]++; } elsif ($int > 1) { # count non-unique alignments $unique[$line[1]][2]+= 1 / $int; } if ($line[1] == 99 || $line[1] == 83) { # if proper insert size use for insert size average @line = split /\t/, $line, 10; $insertsum += $line[8]; $insertcount++; } } # singleton $singleton_unique = 0; $singleton_multiple = 0; for $i (73, 133, 89, 121, 165, 181, 101, 117, 153, 185, 69, 137) { $singleton_unique += $unique[$i][1]; $singleton_multiple += int($unique[$i][2]); } $singleton = $singleton_unique + $singleton_multiple; $singleton_percent = $singleton / $total * 100; printf "$singleton (%.2f%%) read pairs with only one read in the pair mapped ($singleton_unique with unique alignments)\n", $singleton_percent; # correct correct $correctcorrect_unique = 0; $correctcorrect_multiple = 0; for $i (99, 83) { $correctcorrect_unique += $unique[$i][1]; $correctcorrect_multiple += int($unique[$i][2]); } $correctcorrect = $correctcorrect_unique + $correctcorrect_multiple; $correctcorrect_percent = $correctcorrect / $total * 100; printf "$correctcorrect (%.2f%%) read pairs mapped with correct orientation and insert size ($correctcorrect_unique with unique alignments)\n", $correctcorrect_percent; # correct wrong insert size $correctwrong_unique = 0; $correctwrong_multiple = 0; for $i (81, 97, 65, 113) { $correctwrong_unique += $unique[$i][1]; $correctwrong_multiple += int($unique[$i][2]); } $correctwrong = $correctwrong_unique + $correctwrong_multiple; $correctwrong_percent = $correctwrong / $total * 100; printf "$correctwrong (%.2f%%) read pairs mapped with correct orientation but wrong insert size ($correctwrong_unique with unique alignments)\n", $correctwrong_percent; # correct wrong orientation $correctswitched_unique = 0; $correctswitched_multiple = 0; for $i (67, 115) { $correctswitched_unique += $unique[$i][1]; $correctswitched_multiple += $unique[$i][2]; } $correctswitched = $correctswitched_unique + $correctswitched_multiple; $correctswitched_percent = $correctswitched / $total * 100; printf "$correctswitched (%.2f%%) read pairs mapped with wrong orientation but correct insert size ($correctswitched_unique with unique alignments)\n", $correctswitched_percent; # no mapping $nomapping = $total - ($correctcorrect + $correctswitched + $correctwrong + $singleton); $nomapping_percent = $nomapping / $total * 100; printf "$nomapping (%.2f%%) read pairs with no mapping\n", $nomapping_percent; # insert size $insertavg = 0; $insertavg = $insertsum / $insertcount unless ($insertcount == 0); #printf "%.2fbp average inner distance between read pairs (of read pairs with correct insert size)\n", $insertavg; # Not ready for primetime yet - needs verification exit; ############################################################