Mercurial > repos > jjohnson > tophat_stats_pe
comparison tophatstatsPE.pl @ 0:c6eade913da3 draft default tip
Uploaded
author | jjohnson |
---|---|
date | Wed, 06 Feb 2013 09:56:58 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c6eade913da3 |
---|---|
1 #!/usr/bin/perl -w | |
2 | |
3 ############################################ | |
4 # tophatstatsPE | |
5 # John Garbe | |
6 # May 29, 2012 | |
7 # | |
8 # Calculate statistics about the data going into and out of tophat: | |
9 # -Count the number of raw reads, the number of aligned reads, the number | |
10 # of properly paired reads, etc | |
11 # | |
12 ########################################### | |
13 | |
14 die "USAGE: tophatstats.pl accepted_hits.bam fastqfile.fastq\n" unless ($#ARGV == 1); | |
15 | |
16 $bamfile = $ARGV[0]; | |
17 $fastqfile = $ARGV[1]; | |
18 die "cannot open $fastqfile" unless (-e $fastqfile); | |
19 die "cannot open $bamfile" unless (-e $bamfile); | |
20 | |
21 my $junk; | |
22 $divide = 4; # four lines per read in a fastq file | |
23 $insertsum = 0; | |
24 $insertcount = 0; | |
25 # count total number of raw reads | |
26 if (1) { | |
27 print "Input files: $bamfile\t $fastqfile\n"; | |
28 $results = `wc -l $fastqfile`; | |
29 ($results, $junk) = split ' ', $results; | |
30 $total = $results / $divide; | |
31 print "$total total read pairs in fastq file\n"; | |
32 } | |
33 | |
34 # read in bam file as a pipe from samtools | |
35 open IFILE, "samtools view $bamfile |" or die "cannot open $bamfile\n"; | |
36 | |
37 # initialize array counting multiple alignments | |
38 for $i (0..200) { | |
39 $count[$i] = 0; | |
40 for $j (0..40) { | |
41 $unique[$i][$j] = 0; | |
42 } | |
43 } | |
44 | |
45 $insertsum = 0; | |
46 $insertcount = 0; | |
47 # read through bam file, counting alignments | |
48 while ($line = <IFILE>) { | |
49 @line = split /\t/, $line, 3; | |
50 $count[$line[1]]++; | |
51 | |
52 # pull out number of alignments for this read | |
53 @line = split /\t/, $line, 12; | |
54 ($junk, $int) = split /NH:i:/, $line[11]; | |
55 ($int, $junk) = split /\W/, $int; | |
56 if ($int == 1) { # count unique alignments | |
57 $unique[$line[1]][1]++; | |
58 } elsif ($int > 1) { # count non-unique alignments | |
59 $unique[$line[1]][2]+= 1 / $int; | |
60 } | |
61 if ($line[1] == 99 || $line[1] == 83) { # if proper insert size use for insert size average | |
62 @line = split /\t/, $line, 10; | |
63 $insertsum += $line[8]; | |
64 $insertcount++; | |
65 } | |
66 } | |
67 | |
68 # singleton | |
69 $singleton_unique = 0; | |
70 $singleton_multiple = 0; | |
71 for $i (73, 133, 89, 121, 165, 181, 101, 117, 153, 185, 69, 137) { | |
72 $singleton_unique += $unique[$i][1]; | |
73 $singleton_multiple += int($unique[$i][2]); | |
74 } | |
75 $singleton = $singleton_unique + $singleton_multiple; | |
76 $singleton_percent = $singleton / $total * 100; | |
77 printf "$singleton (%.2f%%) read pairs with only one read in the pair mapped ($singleton_unique with unique alignments)\n", $singleton_percent; | |
78 | |
79 # correct correct | |
80 $correctcorrect_unique = 0; | |
81 $correctcorrect_multiple = 0; | |
82 for $i (99, 83) { | |
83 $correctcorrect_unique += $unique[$i][1]; | |
84 $correctcorrect_multiple += int($unique[$i][2]); | |
85 } | |
86 $correctcorrect = $correctcorrect_unique + $correctcorrect_multiple; | |
87 $correctcorrect_percent = $correctcorrect / $total * 100; | |
88 printf "$correctcorrect (%.2f%%) read pairs mapped with correct orientation and insert size ($correctcorrect_unique with unique alignments)\n", $correctcorrect_percent; | |
89 | |
90 # correct wrong insert size | |
91 $correctwrong_unique = 0; | |
92 $correctwrong_multiple = 0; | |
93 for $i (81, 97, 65, 113) { | |
94 $correctwrong_unique += $unique[$i][1]; | |
95 $correctwrong_multiple += int($unique[$i][2]); | |
96 } | |
97 $correctwrong = $correctwrong_unique + $correctwrong_multiple; | |
98 $correctwrong_percent = $correctwrong / $total * 100; | |
99 printf "$correctwrong (%.2f%%) read pairs mapped with correct orientation but wrong insert size ($correctwrong_unique with unique alignments)\n", $correctwrong_percent; | |
100 | |
101 # correct wrong orientation | |
102 $correctswitched_unique = 0; | |
103 $correctswitched_multiple = 0; | |
104 for $i (67, 115) { | |
105 $correctswitched_unique += $unique[$i][1]; | |
106 $correctswitched_multiple += $unique[$i][2]; | |
107 } | |
108 $correctswitched = $correctswitched_unique + $correctswitched_multiple; | |
109 $correctswitched_percent = $correctswitched / $total * 100; | |
110 printf "$correctswitched (%.2f%%) read pairs mapped with wrong orientation but correct insert size ($correctswitched_unique with unique alignments)\n", $correctswitched_percent; | |
111 | |
112 # no mapping | |
113 $nomapping = $total - ($correctcorrect + $correctswitched + $correctwrong + $singleton); | |
114 $nomapping_percent = $nomapping / $total * 100; | |
115 printf "$nomapping (%.2f%%) read pairs with no mapping\n", $nomapping_percent; | |
116 | |
117 # insert size | |
118 $insertavg = 0; | |
119 $insertavg = $insertsum / $insertcount unless ($insertcount == 0); | |
120 #printf "%.2fbp average inner distance between read pairs (of read pairs with correct insert size)\n", $insertavg; # Not ready for primetime yet - needs verification | |
121 | |
122 exit; | |
123 | |
124 ############################################################ | |
125 |