annotate tophatstatsPE.pl @ 0:c6eade913da3 draft default tip

Uploaded
author jjohnson
date Wed, 06 Feb 2013 09:56:58 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
1 #!/usr/bin/perl -w
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
2
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
3 ############################################
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
4 # tophatstatsPE
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
5 # John Garbe
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
6 # May 29, 2012
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
7 #
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
8 # Calculate statistics about the data going into and out of tophat:
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
9 # -Count the number of raw reads, the number of aligned reads, the number
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
10 # of properly paired reads, etc
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
11 #
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
12 ###########################################
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
13
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
14 die "USAGE: tophatstats.pl accepted_hits.bam fastqfile.fastq\n" unless ($#ARGV == 1);
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
15
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
16 $bamfile = $ARGV[0];
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
17 $fastqfile = $ARGV[1];
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
18 die "cannot open $fastqfile" unless (-e $fastqfile);
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
19 die "cannot open $bamfile" unless (-e $bamfile);
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
20
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
21 my $junk;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
22 $divide = 4; # four lines per read in a fastq file
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
23 $insertsum = 0;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
24 $insertcount = 0;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
25 # count total number of raw reads
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
26 if (1) {
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
27 print "Input files: $bamfile\t $fastqfile\n";
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
28 $results = `wc -l $fastqfile`;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
29 ($results, $junk) = split ' ', $results;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
30 $total = $results / $divide;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
31 print "$total total read pairs in fastq file\n";
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
32 }
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
33
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
34 # read in bam file as a pipe from samtools
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
35 open IFILE, "samtools view $bamfile |" or die "cannot open $bamfile\n";
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
36
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
37 # initialize array counting multiple alignments
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
38 for $i (0..200) {
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
39 $count[$i] = 0;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
40 for $j (0..40) {
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
41 $unique[$i][$j] = 0;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
42 }
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
43 }
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
44
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
45 $insertsum = 0;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
46 $insertcount = 0;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
47 # read through bam file, counting alignments
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
48 while ($line = <IFILE>) {
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
49 @line = split /\t/, $line, 3;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
50 $count[$line[1]]++;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
51
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
52 # pull out number of alignments for this read
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
53 @line = split /\t/, $line, 12;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
54 ($junk, $int) = split /NH:i:/, $line[11];
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
55 ($int, $junk) = split /\W/, $int;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
56 if ($int == 1) { # count unique alignments
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
57 $unique[$line[1]][1]++;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
58 } elsif ($int > 1) { # count non-unique alignments
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
59 $unique[$line[1]][2]+= 1 / $int;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
60 }
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
61 if ($line[1] == 99 || $line[1] == 83) { # if proper insert size use for insert size average
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
62 @line = split /\t/, $line, 10;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
63 $insertsum += $line[8];
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
64 $insertcount++;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
65 }
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
66 }
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
67
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
68 # singleton
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
69 $singleton_unique = 0;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
70 $singleton_multiple = 0;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
71 for $i (73, 133, 89, 121, 165, 181, 101, 117, 153, 185, 69, 137) {
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
72 $singleton_unique += $unique[$i][1];
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
73 $singleton_multiple += int($unique[$i][2]);
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
74 }
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
75 $singleton = $singleton_unique + $singleton_multiple;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
76 $singleton_percent = $singleton / $total * 100;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
77 printf "$singleton (%.2f%%) read pairs with only one read in the pair mapped ($singleton_unique with unique alignments)\n", $singleton_percent;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
78
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
79 # correct correct
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
80 $correctcorrect_unique = 0;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
81 $correctcorrect_multiple = 0;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
82 for $i (99, 83) {
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
83 $correctcorrect_unique += $unique[$i][1];
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
84 $correctcorrect_multiple += int($unique[$i][2]);
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
85 }
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
86 $correctcorrect = $correctcorrect_unique + $correctcorrect_multiple;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
87 $correctcorrect_percent = $correctcorrect / $total * 100;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
88 printf "$correctcorrect (%.2f%%) read pairs mapped with correct orientation and insert size ($correctcorrect_unique with unique alignments)\n", $correctcorrect_percent;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
89
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
90 # correct wrong insert size
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
91 $correctwrong_unique = 0;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
92 $correctwrong_multiple = 0;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
93 for $i (81, 97, 65, 113) {
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
94 $correctwrong_unique += $unique[$i][1];
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
95 $correctwrong_multiple += int($unique[$i][2]);
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
96 }
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
97 $correctwrong = $correctwrong_unique + $correctwrong_multiple;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
98 $correctwrong_percent = $correctwrong / $total * 100;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
99 printf "$correctwrong (%.2f%%) read pairs mapped with correct orientation but wrong insert size ($correctwrong_unique with unique alignments)\n", $correctwrong_percent;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
100
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
101 # correct wrong orientation
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
102 $correctswitched_unique = 0;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
103 $correctswitched_multiple = 0;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
104 for $i (67, 115) {
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
105 $correctswitched_unique += $unique[$i][1];
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
106 $correctswitched_multiple += $unique[$i][2];
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
107 }
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
108 $correctswitched = $correctswitched_unique + $correctswitched_multiple;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
109 $correctswitched_percent = $correctswitched / $total * 100;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
110 printf "$correctswitched (%.2f%%) read pairs mapped with wrong orientation but correct insert size ($correctswitched_unique with unique alignments)\n", $correctswitched_percent;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
111
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
112 # no mapping
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
113 $nomapping = $total - ($correctcorrect + $correctswitched + $correctwrong + $singleton);
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
114 $nomapping_percent = $nomapping / $total * 100;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
115 printf "$nomapping (%.2f%%) read pairs with no mapping\n", $nomapping_percent;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
116
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
117 # insert size
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
118 $insertavg = 0;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
119 $insertavg = $insertsum / $insertcount unless ($insertcount == 0);
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
120 #printf "%.2fbp average inner distance between read pairs (of read pairs with correct insert size)\n", $insertavg; # Not ready for primetime yet - needs verification
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
121
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
122 exit;
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
123
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
124 ############################################################
c6eade913da3 Uploaded
jjohnson
parents:
diff changeset
125