comparison NEUMA-1.2.1/bowtieout2mappingstat.2.pl @ 0:c44c43d185ef draft default tip

NEUMA-1.2.1 Uploaded
author chawhwa
date Thu, 08 Aug 2013 00:46:13 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c44c43d185ef
1 #!/usr/bin/perl
2 use Getopt::Long;
3 # This version includes max_insert_length filtered mapping stat, optionally. (version 2)
4
5
6 my $max_insert_length;
7 my $readlength;
8 my $datatype = 'E'; # E or R.
9 &GetOptions ( 'm|max_insert_length=i' => \$max_insert_length,
10 'l|readlength=i' => \$readlength,
11 'd|datatype=s' => \$datatype
12 );
13
14 if(defined($max_insert_length) && !defined($readlength)) { die "readlength (-l|--readlength) must be defined if -m (--max_insert_length) is defined.\n"; }
15 if(@ARGV<3){ &help; exit; }
16
17
18 my $bowtie = shift @ARGV;
19 my $sample_name = shift @ARGV;
20 my $PE = shift @ARGV;
21
22 my $prev_head='';
23 my $NM=0; my $NR=0; my $NM_lf = 0; my $NR_lf = 0; # lf : insert-length-filtered version
24 my $countNM=0; my $countNM_lf =0;
25 my $countNR=0; my $countNR_lf = 0;
26 my $countAll=0; my $countAll_lf=0;
27
28
29 open IN, $bowtie or die "Can't open bowtiefile $bowtie\n";
30 while(<IN>){
31 chomp;
32 my ($head,$tr,$pos1)= (split/\t/)[0,2,3];
33 if($datatype eq 'R') { ($tr) = (split/\|/,$tr)[3]; $tr=~s/\.\d+$//; }
34
35 if($PE==1){
36 chomp(my $secondline = <IN>);
37 my ($head2,$pos2) = (split/\t/,$secondline)[0,3];
38 $head = &min($head,$head2);
39
40 }
41
42 if($prev_head eq $head) {
43
44 if(defined($max_insert_length) && $pos2+$readlength-$pos1 <= $max_insert_length) {
45 if($tr=~/^NM_/) { $NM_lf = 1; }
46 elsif($tr=~/^NR_/) { $NR_lf = 1; }
47 }
48
49 if($tr=~/^NM_/) { $NM=1; }
50 elsif($tr=~/^NR_/) { $NR=1; }
51 }
52 else {
53
54 if(defined($max_insert_length) && $pos2+$readlength-$pos1 <= $max_insert_length) {
55 if($NM_lf==1) { $countNM_lf++; $NM_lf=0; }
56 if($NR_lf==1) { $countNR_lf++; $NR_lf=0; }
57 $countAll_lf++;
58 }
59 if($NM==1) { $countNM++; $NM=0; } ## if a single read is mapped to both NR and NM, it is counted once for each. Therefore total count may not be the same as the sum of NR and NM counts.
60 if($NR==1) { $countNR++; $NR=0; }
61 $countAll++;
62 }
63 $prev_head=$head;
64 }
65 # no need to do additional last line
66 close IN;
67
68
69 if($datatype eq 'E') { ($countNM,$countNR,$countNM_lf,$countNM_lf) = ('NA','NA','NA','NA'); }
70
71 $"="\t";
72 if(defined($max_insert_length)){
73 print "sample_name\tmRNA_mapped\tncRNA_mapped\ttotalRNA_mapped\tmRNA_mapped(lenfiltered)\tncRNA_mapped(lenfiltered)\ttotalRNA_mapped(lenfiltered)\n";
74 print "$sample_name\t$countNM\t$countNR\t$countAll\t$countNM_lf\t$countNR_lf\t$countAll_lf\n";
75 }
76 else {
77 print "sample_name\tmRNA_mapped\tncRNA_mapped\ttotalRNA_mapped\n";
78 print "$sample_name\t$countNM\t$countNR\t$countAll\n";
79 }
80
81
82
83 sub min {
84 if($_[0] lt $_[1]) { $_[0] } else { $_[1] };
85 }
86
87
88 sub help {
89 print STDERR<<EOF;
90
91 usage: $0 <options> bowtieoutfile samplename PE(1/0) (> mappingstat)
92
93 Options:
94 -m|--max_insert_length <max_insert_length> : the output includes insert-length-filtered mapping stats as well. -l(--readlength) option must be accompanied. This option is valid only for paired-end data. (For single-end, it is ignored)
95 -l|--readlength <readlength> : read length. This is required only when -m(--max_insert_length) option is used.
96 -d|--datatype <datatype> : Either E (Ensembl) or R (Refseq). Use R if the data is Refseq and the transcript names are in format 'gi|xxx|xxx|name'. (Default E)
97
98 EOF
99 }
100
101
102
103