Mercurial > repos > chawhwa > neuma
diff NEUMA-1.2.1/bowtieout2mappingstat.2.pl @ 0:c44c43d185ef draft default tip
NEUMA-1.2.1 Uploaded
author | chawhwa |
---|---|
date | Thu, 08 Aug 2013 00:46:13 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/NEUMA-1.2.1/bowtieout2mappingstat.2.pl Thu Aug 08 00:46:13 2013 -0400 @@ -0,0 +1,103 @@ +#!/usr/bin/perl +use Getopt::Long; +# This version includes max_insert_length filtered mapping stat, optionally. (version 2) + + +my $max_insert_length; +my $readlength; +my $datatype = 'E'; # E or R. +&GetOptions ( 'm|max_insert_length=i' => \$max_insert_length, + 'l|readlength=i' => \$readlength, + 'd|datatype=s' => \$datatype +); + +if(defined($max_insert_length) && !defined($readlength)) { die "readlength (-l|--readlength) must be defined if -m (--max_insert_length) is defined.\n"; } +if(@ARGV<3){ &help; exit; } + + +my $bowtie = shift @ARGV; +my $sample_name = shift @ARGV; +my $PE = shift @ARGV; + +my $prev_head=''; +my $NM=0; my $NR=0; my $NM_lf = 0; my $NR_lf = 0; # lf : insert-length-filtered version +my $countNM=0; my $countNM_lf =0; +my $countNR=0; my $countNR_lf = 0; +my $countAll=0; my $countAll_lf=0; + + +open IN, $bowtie or die "Can't open bowtiefile $bowtie\n"; +while(<IN>){ + chomp; + my ($head,$tr,$pos1)= (split/\t/)[0,2,3]; + if($datatype eq 'R') { ($tr) = (split/\|/,$tr)[3]; $tr=~s/\.\d+$//; } + + if($PE==1){ + chomp(my $secondline = <IN>); + my ($head2,$pos2) = (split/\t/,$secondline)[0,3]; + $head = &min($head,$head2); + + } + + if($prev_head eq $head) { + + if(defined($max_insert_length) && $pos2+$readlength-$pos1 <= $max_insert_length) { + if($tr=~/^NM_/) { $NM_lf = 1; } + elsif($tr=~/^NR_/) { $NR_lf = 1; } + } + + if($tr=~/^NM_/) { $NM=1; } + elsif($tr=~/^NR_/) { $NR=1; } + } + else { + + if(defined($max_insert_length) && $pos2+$readlength-$pos1 <= $max_insert_length) { + if($NM_lf==1) { $countNM_lf++; $NM_lf=0; } + if($NR_lf==1) { $countNR_lf++; $NR_lf=0; } + $countAll_lf++; + } + if($NM==1) { $countNM++; $NM=0; } ## if a single read is mapped to both NR and NM, it is counted once for each. Therefore total count may not be the same as the sum of NR and NM counts. + if($NR==1) { $countNR++; $NR=0; } + $countAll++; + } + $prev_head=$head; +} +# no need to do additional last line +close IN; + + +if($datatype eq 'E') { ($countNM,$countNR,$countNM_lf,$countNM_lf) = ('NA','NA','NA','NA'); } + +$"="\t"; +if(defined($max_insert_length)){ + print "sample_name\tmRNA_mapped\tncRNA_mapped\ttotalRNA_mapped\tmRNA_mapped(lenfiltered)\tncRNA_mapped(lenfiltered)\ttotalRNA_mapped(lenfiltered)\n"; + print "$sample_name\t$countNM\t$countNR\t$countAll\t$countNM_lf\t$countNR_lf\t$countAll_lf\n"; +} +else { + print "sample_name\tmRNA_mapped\tncRNA_mapped\ttotalRNA_mapped\n"; + print "$sample_name\t$countNM\t$countNR\t$countAll\n"; +} + + + +sub min { + if($_[0] lt $_[1]) { $_[0] } else { $_[1] }; +} + + +sub help { + print STDERR<<EOF; + +usage: $0 <options> bowtieoutfile samplename PE(1/0) (> mappingstat) + +Options: + -m|--max_insert_length <max_insert_length> : the output includes insert-length-filtered mapping stats as well. -l(--readlength) option must be accompanied. This option is valid only for paired-end data. (For single-end, it is ignored) + -l|--readlength <readlength> : read length. This is required only when -m(--max_insert_length) option is used. + -d|--datatype <datatype> : Either E (Ensembl) or R (Refseq). Use R if the data is Refseq and the transcript names are in format 'gi|xxx|xxx|name'. (Default E) + +EOF +} + + + +