Mercurial > repos > chawhwa > neuma
view NEUMA-1.2.1/filter.best.from.bowtieout.3.pl @ 0:c44c43d185ef draft default tip
NEUMA-1.2.1 Uploaded
author | chawhwa |
---|---|
date | Thu, 08 Aug 2013 00:46:13 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl use Getopt::Long; # This version has an option of filtering only exclusively-CDS-mapped reads (useful for ribosome profiling analysis). The option name for delete_original is not cahnged. (version 3) # This version fixed the problem of not being able to merge when the order of the two mates were flipped. (version 2) my $delete_original; my $newbowtiefile; ## output bowtiefile my $mmcol=4; my $datatype='E'; &GetOptions ( 'rm|delete_original' => \$delete_original, 'o=s' => \$newbowtiefile, 'mmcol=i' => \$mmcol, ## column for mismatch info (0-based) 'd|datatype=s' => \$datatype # refseq or ensembl ('refseq' if names must be parsed) ); if(@ARGV<2) { &help; exit; } my $bowtiefile = shift @ARGV; my $PE = shift @ARGV; # 1 for paired-end, 0 for single-end if($datatype !~/^[RE]$/ ) { die "ERROR: -d (--datatype) must be specified to be either R (Refseq) or E (Ensembl) (default)\n"; } if(!defined($newbowtiefile)){ ($newbowtiefile) = $bowtiefile=~/(.+)\.bowtieout/; $newbowtiefile .= ".best.bowtieout"; } my $prev_head1=''; my $prev_head2=''; my $totmm=100000; # just some huge number my @mapping=(); open $IN, $bowtiefile or die "Can't open bowtiefile $bowtiefile\n"; open $OUT, ">$newbowtiefile" or die "Can't write to output bowtiefile $newbowtiefile\n"; while(<$IN>){ chomp; my ($head1,$strand1,$tr,$pos1,$mmstr1) = (split/\t/)[0,1,2,3,$mmcol]; my $mm1 = &parse_mmstr($mmstr1); if($PE==1) { chomp(my $secondline = <$IN>); ($head2,$strand2,$pos2,$mmstr2) = (split/\t/,$secondline)[0,1,3,$mmcol]; my $mm2 = &parse_mmstr($mmstr2); } if($PE==1){ if($prev_head1 eq $head1 || $prev_head2 eq $head1) { if($totmm>$mm1+$mm2) { @mapping=(); push @mapping,[$tr,$strand1,$pos1,$mmstr1,$strand2,$pos2,$mmstr2]; $totmm=$mm1+$mm2; } elsif($totma==$mm1+$mm2) { push @mapping,[$tr,$strand1,$pos1,$mmstr1,$strand2,$pos2,$mmstr2]; } } else { &print_mapping_PE(\@mapping,$OUT, $prev_head1,$prev_head2); @mapping=(); push @mapping,[$tr,$strand1,$pos1,$mmstr1,$strand2,$pos2,$mmstr2]; $totmm=$mm1+$mm2; } } else { if($prev_head1 eq $head1) { if($totmm>$mm1) { @mapping=(); push @mapping,[$tr,$strand1,$pos1,$mmstr1]; $totmm=$mm1; } elsif($totmm==$mm1) { push @mapping,[$tr,$strand1,$pos1,$mmstr1]; } } else { &print_mapping_SE(\@mapping,$OUT, $prev_head1); @mapping=(); push @mapping,[$tr,$strand1,$pos1,$mmstr1]; $totmm=$mm1; } } $prev_head1= $head1; if($PE==1) { $prev_head2 = $head2; } } ## last line if($PE==1){ &print_mapping_PE(\@mapping,$OUT, $prev_head1, $prev_head2); } else { &print_mapping_SE(\@mapping,$OUT, $prev_head1); } close $OUT; close $IN; ## delete original bowtieout file if(defined $delete_original) { `rm -f $bowtiefile`; } ## subroutines @@ sub parse_mmstr { if(!defined $_[0]){ return 0; } else { return(scalar(split/,/,$_[0])); } } sub print_mapping_PE { my $pMapping = shift @_; my $OUT = shift @_; my $prev_head1 = shift @_; my $prev_head2 = shift @_; if($prev_head1 ne ''){ for my $i (0..$#$pMapping){ my ($tr,$strand1,$pos1,$mm1,$strand2,$pos2,$mm2) = @{$pMapping->[$i]}; print $OUT "$prev_head1\t$strand1\t$tr\t$pos1\t$mm1\n"; print $OUT "$prev_head2\t$strand2\t$tr\t$pos2\t$mm2\n"; } } } sub print_mapping_SE { my $pMapping = shift @_; my $OUT = shift @_; my $prev_head1 = shift @_; if($prev_head1 ne ''){ for my $i (0..$#$pMapping){ my ($tr,$strand1,$pos1,$mm1) = @{$pMapping->[$i]}; print $OUT "$prev_head1\t$strand1\t$tr\t$pos1\t$mm1\n"; } } } sub help { print STDERR<<EOF usage: $0 <options> bowtieout_file PE(1/0) Options --rm|--delete_original : delete the original bowtieout file -o <outfilename> : output bowtieout file name (default : originalfiletitle.best.bowtieout / originalfiletitle.best.cdsonly.bowtieout (if --cdsonly is used) ) --mmcol : the column number (0-based) for mismatch information. (default : 4) -d | datatype <datatype> : Either 'R' (Refseq) or 'E' (Ensembl). Use R if the transcript names are in Refseq format (gi|xxx|xxx|name). This option affects only when --cdsonly is used. (default : E) EOF }