Mercurial > repos > chawhwa > neuma
diff NEUMA-1.2.1/bowtie2genecount.11.pl @ 0:c44c43d185ef draft default tip
NEUMA-1.2.1 Uploaded
author | chawhwa |
---|---|
date | Thu, 08 Aug 2013 00:46:13 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/NEUMA-1.2.1/bowtie2genecount.11.pl Thu Aug 08 00:46:13 2013 -0400 @@ -0,0 +1,283 @@ +#!/usr/bin/perl + +# This version has an option of counting only the pairs with a single mate distance. (version 11). +# This version has fixed the problem in filtering max_insert_length. (version 11). +# This version does not print out the additional tab in the iNIR file. (version 11). +# This version has data type option for parsing refseq headers. (version 10) +# This version fixed the max_insert_lendis option name from max_insert_length. (version 10) +# This version can take multiple bowtieout files separated by comma. (version 10) +# This version can take option of max insert lendis (version 9) +# This version can take sample name (column title) as option, if -f is selected. (version 8) +# This version fixed the problem of double-counting cases that are strand-flipped. (version 7) +# This version has an option of counting all reads, gNIR or iNIR. (version 7) +# This version has two levels of verbosity and uses whole file name if run with a single bowtie file. (version 6) +# This version has an option of using a single bowtie file name instead of bowtie directory. (version 6) +# This version fixed the problem of not being able to find common suffix and assumes .bowtieout suffix for bowtieout files, to prevent reading other files in the directory. (version 5) +# This version fixed the problem of going into infinite loop when run with one sample. (Version 3) +# This version is verbose. (version 2) + +use Getopt::Long; +my $verbose=0; +my $LINEBIN=1000000; + +my $bowtie = 'd'; # default is directory. +my $count_opt = 'gReadcount'; +my $samplename; +my $max_insert_length; +my $readlength; +my $datatype='E'; +my $uniq_d; # reads with single mate distance only (applicable only for PE) +&GetOptions ( 'v' => sub { $verbose=1 }, + 'vv' => sub { $verbose=2 }, + 'f' => sub { $bowtie='f'}, # read bowtie file instead of bowtiu directory + 'gNIR' => sub { $count_opt = 'gNIR' }, + 'iNIR' => sub { $count_opt = 'iNIR' }, + 's|sample=s' => \$samplename, + 'm|max_insert_length=i' => \$max_insert_length, + 'l|readlength=i' => \$readlength, + 'd|datatype=s' => \$datatype, + 'u|uniq_d' => \$uniq_d +); + +if($bowtie eq 'd' && defined($samplename)) { print STDERR "Warning: -s|--sample option is ignored, when -f is not selected.\n"; } +if(defined($max_insert_length) && !defined($readlength)) { die "readlength must be defined if max_insert_length is defined.\n"; } + + +if(@ARGV<3) { &help; exit; } + + +my $gene2NM = shift @ARGV; +my $bowtieoutdir = shift @ARGV; +my $PE = shift @ARGV; + +if($PE==0 && defined($max_insert_length)) { print STDERR "Warning: -m|--max_insert_length option is ignored for single end data.\n"; } + + +if($bowtie eq 'd'){ + @bowtiefiles = split/\n/,`ls -1 $bowtieoutdir/*`; + if($verbose) { printf STDERR "Finished reading bowtieout dir $bowtieoutdir. Total %d files\n", scalar(@bowtiefiles); } +} +else { + push @bowtiefiles, (split/,/,$bowtieoutdir); +} + + +open GENE2NM, $gene2NM or die "Can't open gene2NM file $gene2NM\n"; +while(<GENE2NM>){ + chomp; + my ($g,$t) = split/\t/; + $t2g{$t}=$g; + push @{$g2t{$g}}, $t; +} +close GENE2NM; +if($verbose) { printf STDERR "Finished reading gene2NM file $gene2NM. Total %d transcripts and %d genes\n", scalar(keys(%t2g)), scalar(keys(%g2t)); } + + + + + +for my $i (0..$#bowtiefiles) { + + my $bowtieout = $bowtiefiles[$i]; + my %mapped_genes =(); + my %mapped_tr =(); + my %mapped_insert_lengths =(); + + if($verbose) { print STDERR "Processing bowtieout $bowtieout (file $i)\n"; } + + + my $prev_head=''; my $prev_tr=''; + open $BOWTIE, $bowtieout or die "Can't open bowtiefile $bowtieout\n"; + while(<$BOWTIE>){ + chomp; + my ($head,$tr,$pos1) = (split/\t/)[0,2,3]; + if($datatype eq 'R') { my ($tmp_tr) = (split/\|/,$tr)[3]; if(defined($tmp_tr)) {$tr = $tmp_tr; $tr=~s/\.\d+$//; }} + + + my $insert_length=0; ## default, for SE. + if($PE==1){ + chomp(my $secondline = <$BOWTIE>); + my ($head2,$pos2) = (split/\t/,$secondline)[0,3]; + $head = &min($head,$head2); + $insert_length = $pos2 - $pos1 + $readlength; + if(defined $max_insert_length) { if($insert_length > $max_insert_length) { next; } } + } + + + if($verbose==2) { if($prev_head eq '') { print STDERR "head= $head, tr= $tr, PE = $PE\n"; } } # print only at first line + + + if($count_opt eq 'gNIR'){ + if($head eq $prev_head) { + $mapped_tr { $tr } = 1; + $mapped_tr { $prev_tr } = 1; + $mapped_genes { $t2g{$tr} } = 1; + $mapped_genes { $t2g{$prev_tr} } = 1; + } + else { + if(scalar(keys(%mapped_genes)) == 1) { my ($gene) = keys %mapped_genes; if(scalar(keys(%mapped_tr)) == scalar(@{$g2t{$gene}})) { $count{$gene}[$i]++; }} + ## count reads that are unique to the gene and common to all of its isoforms + %mapped_genes =(); + %mapped_tr = (); + $mapped_tr { $tr } = 1; + $mapped_genes { $t2g{$tr} } = 1; + } + + } + elsif($count_opt eq 'iNIR'){ + if($head eq $prev_head) { + $mapped_tr { $tr } = 1; + $mapped_tr { $prev_tr } = 1; + } + else { + if(scalar(keys(%mapped_tr)) == 1) { my ($transcript) = keys %mapped_tr; $count{$transcript}[$i]++; } ## count reads that are unique to the transcript + %mapped_tr = (); + $mapped_tr { $tr } = 1; + } + } + else { + if($head eq $prev_head) { + $mapped_genes { $t2g{$tr} } = 1; + $mapped_genes { $t2g{$prev_tr} } = 1; + $mapped_insert_lengths { "$insert_length" } =1; + $mapped_insert_lengths { "$prev_insert_length" } =1; + } + else { + if(scalar(keys(%mapped_genes)) == 1) { + my ($gene) = keys %mapped_genes; + if(defined($uniq_d)) { if(scalar(keys(%mapped_insert_lengths))==1) { $count{$gene}[$i]++; } } + else { $count{$gene}[$i]++; } + } ## count reads that are unique to the gene + %mapped_genes =(); + %mapped_insert_lengths =(); + $mapped_genes { $t2g{$tr} } = 1; + $mapped_insert_lengths { "$insert_length" } =1; + } + } + + + $prev_head = $head; + $prev_tr = $tr; + $prev_insert_length = $insert_length; + + if($verbose==2) { if($. % $LINEBIN==0) { print STDERR ":line $.\n"; } } + + } + close $BOWTIE; + + + # last line + if($count_opt eq 'gNIR'){ + if(scalar(keys(%mapped_genes)) == 1) { my ($gene) = keys %mapped_genes; if(scalar(keys(%mapped_tr)) == scalar(@{$g2t{$gene}})) { $count{$gene}[$i]++; }} + ## count reads that are unique to the gene and common to all of its isoforms + } + elsif($count_opt eq 'iNIR'){ + if(scalar(keys(%mapped_tr)) == 1) { my ($transcript) = keys %mapped_tr; $count{$transcript}[$i]++; } ## count reads that are unique to the transcript + } + else{ + if(scalar(keys(%mapped_genes)) == 1) { + my ($gene) = keys %mapped_genes; + if(defined($uniq_d)) { if(scalar(keys(%mapped_insert_lengths))==1) { $count{$gene}[$i]++; } } + else { $count{$gene}[$i]++; } + } ## count reads that are unique to the gene + } + +} + + +$"="\t"; + + +if($count_opt eq 'gNIR' || $count_opt eq 'gReadcount') { $count_target='genes'; } else { $count_target = 'transcripts'; } +if($verbose) { printf STDERR "Finished reading bowtieout files. Total %d $count_target counted\n",scalar(keys(%count)); } + + +$minlen=1000000; +for my $file (@bowtiefiles){ + $minlen = length($file) if length($file)<$minlen; +} + + +# find longest common suffix for bowtiefiles +my %suffs; +my $k=0; +do { + %suffs=(); + $k++; + for my $file (@bowtiefiles){ + $suff = substr($file,length($file)-$k); + $suffs{$suff}=1; + } +} while ( scalar(keys(%suffs))==1 && $k<$minlen); + +my ($commonsuffix) = substr($bowtiefiles[0],length($file)-($k-1)); +if(scalar(@bowtiefiles) == 1) { $commonsuffix = ''; } + +# title is between "/" and common suffix. +my @title=(); +for my $file (@bowtiefiles){ + if($file=~ /([^\/]+)$commonsuffix$/) { push @title, $1; } + else { push @title, $file; } +} +if(defined($samplename)) { @title=(); push @title, $samplename; } + + +if($verbose) { print STDERR "Printing to file...\n"; } + +if($count_opt eq 'gNIR' || $count_opt eq 'gReadcount'){ + print "gene\t@title\n"; + for my $gene (sort keys %count){ + print "$gene"; + for my $i (0..$#bowtiefiles){ + if(defined $count{$gene}[$i]) { print "\t$count{$gene}[$i]"; } + else { print "\t0"; } + } + print "\n"; + } +} +else { ## iNIR + print "gene\ttranscript\t@title\n"; + for my $tr (sort keys %count){ + print "$t2g{$tr}\t$tr"; + for my $i (0..$#bowtiefiles){ + if(defined $count{$tr}[$i]) { print "\t$count{$tr}[$i]"; } + else { print "\t0"; } + } + print "\n"; + } +} + +if($verbose) { print STDERR "done.\n"; } + + + +## subroutines ## + + + +sub help { + print STDERR<<EOF + +usage: $0 <options> gene2NM bowtieoutdir/file PE[1/0] > output.readcount + +Options: + -v : verbose + -vv : more verbose + -f : use bowtie file name instead of bowtie directory name. + ex) $0 -f gene2NM bowtieoutfile PE > output.readcount + ex2) $0 gene2NM bowtieoutdir PE > output.readcount + --gNIR : count gNIR instead of gReadcount (recommended outfile suffix is .gNIR) + --iNIR : count iNIR instead of gReadcount (recommended outfile suffix is .iNIR) + -s|--sample <samplename> : sample name to be used as a column title. This options is available only when -f option is used. + -m|--max_insert_length <max_insert_length> : maximum insert length. If this option is used, length filtering is done. This option must accompany option -l (--readlength). This option doesn't do anything for single-end data (ignored) + -l|--readlength <readlength> : read length. This is required only when -m (--max_insert_length) option is used. + -d|--datatype <datatype> : Either 'R' (Refseq) or 'E' (Ensembl). Use R if transcript names are in Refseq format (gi|xxx|xxx|name). (default E). + -u|--uniq_d : filter reads with a single mate distance only. (applicable for only PE gReadcount) +EOF +} + +sub min { + if($_[0] lt $_[1]) { $_[0] } else { $_[1] }; +} + +