Mercurial > repos > chawhwa > neuma
view NEUMA-1.2.1/bowtie2genecount.11.pl @ 0:c44c43d185ef draft default tip
NEUMA-1.2.1 Uploaded
author | chawhwa |
---|---|
date | Thu, 08 Aug 2013 00:46:13 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl # This version has an option of counting only the pairs with a single mate distance. (version 11). # This version has fixed the problem in filtering max_insert_length. (version 11). # This version does not print out the additional tab in the iNIR file. (version 11). # This version has data type option for parsing refseq headers. (version 10) # This version fixed the max_insert_lendis option name from max_insert_length. (version 10) # This version can take multiple bowtieout files separated by comma. (version 10) # This version can take option of max insert lendis (version 9) # This version can take sample name (column title) as option, if -f is selected. (version 8) # This version fixed the problem of double-counting cases that are strand-flipped. (version 7) # This version has an option of counting all reads, gNIR or iNIR. (version 7) # This version has two levels of verbosity and uses whole file name if run with a single bowtie file. (version 6) # This version has an option of using a single bowtie file name instead of bowtie directory. (version 6) # This version fixed the problem of not being able to find common suffix and assumes .bowtieout suffix for bowtieout files, to prevent reading other files in the directory. (version 5) # This version fixed the problem of going into infinite loop when run with one sample. (Version 3) # This version is verbose. (version 2) use Getopt::Long; my $verbose=0; my $LINEBIN=1000000; my $bowtie = 'd'; # default is directory. my $count_opt = 'gReadcount'; my $samplename; my $max_insert_length; my $readlength; my $datatype='E'; my $uniq_d; # reads with single mate distance only (applicable only for PE) &GetOptions ( 'v' => sub { $verbose=1 }, 'vv' => sub { $verbose=2 }, 'f' => sub { $bowtie='f'}, # read bowtie file instead of bowtiu directory 'gNIR' => sub { $count_opt = 'gNIR' }, 'iNIR' => sub { $count_opt = 'iNIR' }, 's|sample=s' => \$samplename, 'm|max_insert_length=i' => \$max_insert_length, 'l|readlength=i' => \$readlength, 'd|datatype=s' => \$datatype, 'u|uniq_d' => \$uniq_d ); if($bowtie eq 'd' && defined($samplename)) { print STDERR "Warning: -s|--sample option is ignored, when -f is not selected.\n"; } if(defined($max_insert_length) && !defined($readlength)) { die "readlength must be defined if max_insert_length is defined.\n"; } if(@ARGV<3) { &help; exit; } my $gene2NM = shift @ARGV; my $bowtieoutdir = shift @ARGV; my $PE = shift @ARGV; if($PE==0 && defined($max_insert_length)) { print STDERR "Warning: -m|--max_insert_length option is ignored for single end data.\n"; } if($bowtie eq 'd'){ @bowtiefiles = split/\n/,`ls -1 $bowtieoutdir/*`; if($verbose) { printf STDERR "Finished reading bowtieout dir $bowtieoutdir. Total %d files\n", scalar(@bowtiefiles); } } else { push @bowtiefiles, (split/,/,$bowtieoutdir); } open GENE2NM, $gene2NM or die "Can't open gene2NM file $gene2NM\n"; while(<GENE2NM>){ chomp; my ($g,$t) = split/\t/; $t2g{$t}=$g; push @{$g2t{$g}}, $t; } close GENE2NM; if($verbose) { printf STDERR "Finished reading gene2NM file $gene2NM. Total %d transcripts and %d genes\n", scalar(keys(%t2g)), scalar(keys(%g2t)); } for my $i (0..$#bowtiefiles) { my $bowtieout = $bowtiefiles[$i]; my %mapped_genes =(); my %mapped_tr =(); my %mapped_insert_lengths =(); if($verbose) { print STDERR "Processing bowtieout $bowtieout (file $i)\n"; } my $prev_head=''; my $prev_tr=''; open $BOWTIE, $bowtieout or die "Can't open bowtiefile $bowtieout\n"; while(<$BOWTIE>){ chomp; my ($head,$tr,$pos1) = (split/\t/)[0,2,3]; if($datatype eq 'R') { my ($tmp_tr) = (split/\|/,$tr)[3]; if(defined($tmp_tr)) {$tr = $tmp_tr; $tr=~s/\.\d+$//; }} my $insert_length=0; ## default, for SE. if($PE==1){ chomp(my $secondline = <$BOWTIE>); my ($head2,$pos2) = (split/\t/,$secondline)[0,3]; $head = &min($head,$head2); $insert_length = $pos2 - $pos1 + $readlength; if(defined $max_insert_length) { if($insert_length > $max_insert_length) { next; } } } if($verbose==2) { if($prev_head eq '') { print STDERR "head= $head, tr= $tr, PE = $PE\n"; } } # print only at first line if($count_opt eq 'gNIR'){ if($head eq $prev_head) { $mapped_tr { $tr } = 1; $mapped_tr { $prev_tr } = 1; $mapped_genes { $t2g{$tr} } = 1; $mapped_genes { $t2g{$prev_tr} } = 1; } else { if(scalar(keys(%mapped_genes)) == 1) { my ($gene) = keys %mapped_genes; if(scalar(keys(%mapped_tr)) == scalar(@{$g2t{$gene}})) { $count{$gene}[$i]++; }} ## count reads that are unique to the gene and common to all of its isoforms %mapped_genes =(); %mapped_tr = (); $mapped_tr { $tr } = 1; $mapped_genes { $t2g{$tr} } = 1; } } elsif($count_opt eq 'iNIR'){ if($head eq $prev_head) { $mapped_tr { $tr } = 1; $mapped_tr { $prev_tr } = 1; } else { if(scalar(keys(%mapped_tr)) == 1) { my ($transcript) = keys %mapped_tr; $count{$transcript}[$i]++; } ## count reads that are unique to the transcript %mapped_tr = (); $mapped_tr { $tr } = 1; } } else { if($head eq $prev_head) { $mapped_genes { $t2g{$tr} } = 1; $mapped_genes { $t2g{$prev_tr} } = 1; $mapped_insert_lengths { "$insert_length" } =1; $mapped_insert_lengths { "$prev_insert_length" } =1; } else { if(scalar(keys(%mapped_genes)) == 1) { my ($gene) = keys %mapped_genes; if(defined($uniq_d)) { if(scalar(keys(%mapped_insert_lengths))==1) { $count{$gene}[$i]++; } } else { $count{$gene}[$i]++; } } ## count reads that are unique to the gene %mapped_genes =(); %mapped_insert_lengths =(); $mapped_genes { $t2g{$tr} } = 1; $mapped_insert_lengths { "$insert_length" } =1; } } $prev_head = $head; $prev_tr = $tr; $prev_insert_length = $insert_length; if($verbose==2) { if($. % $LINEBIN==0) { print STDERR ":line $.\n"; } } } close $BOWTIE; # last line if($count_opt eq 'gNIR'){ if(scalar(keys(%mapped_genes)) == 1) { my ($gene) = keys %mapped_genes; if(scalar(keys(%mapped_tr)) == scalar(@{$g2t{$gene}})) { $count{$gene}[$i]++; }} ## count reads that are unique to the gene and common to all of its isoforms } elsif($count_opt eq 'iNIR'){ if(scalar(keys(%mapped_tr)) == 1) { my ($transcript) = keys %mapped_tr; $count{$transcript}[$i]++; } ## count reads that are unique to the transcript } else{ if(scalar(keys(%mapped_genes)) == 1) { my ($gene) = keys %mapped_genes; if(defined($uniq_d)) { if(scalar(keys(%mapped_insert_lengths))==1) { $count{$gene}[$i]++; } } else { $count{$gene}[$i]++; } } ## count reads that are unique to the gene } } $"="\t"; if($count_opt eq 'gNIR' || $count_opt eq 'gReadcount') { $count_target='genes'; } else { $count_target = 'transcripts'; } if($verbose) { printf STDERR "Finished reading bowtieout files. Total %d $count_target counted\n",scalar(keys(%count)); } $minlen=1000000; for my $file (@bowtiefiles){ $minlen = length($file) if length($file)<$minlen; } # find longest common suffix for bowtiefiles my %suffs; my $k=0; do { %suffs=(); $k++; for my $file (@bowtiefiles){ $suff = substr($file,length($file)-$k); $suffs{$suff}=1; } } while ( scalar(keys(%suffs))==1 && $k<$minlen); my ($commonsuffix) = substr($bowtiefiles[0],length($file)-($k-1)); if(scalar(@bowtiefiles) == 1) { $commonsuffix = ''; } # title is between "/" and common suffix. my @title=(); for my $file (@bowtiefiles){ if($file=~ /([^\/]+)$commonsuffix$/) { push @title, $1; } else { push @title, $file; } } if(defined($samplename)) { @title=(); push @title, $samplename; } if($verbose) { print STDERR "Printing to file...\n"; } if($count_opt eq 'gNIR' || $count_opt eq 'gReadcount'){ print "gene\t@title\n"; for my $gene (sort keys %count){ print "$gene"; for my $i (0..$#bowtiefiles){ if(defined $count{$gene}[$i]) { print "\t$count{$gene}[$i]"; } else { print "\t0"; } } print "\n"; } } else { ## iNIR print "gene\ttranscript\t@title\n"; for my $tr (sort keys %count){ print "$t2g{$tr}\t$tr"; for my $i (0..$#bowtiefiles){ if(defined $count{$tr}[$i]) { print "\t$count{$tr}[$i]"; } else { print "\t0"; } } print "\n"; } } if($verbose) { print STDERR "done.\n"; } ## subroutines ## sub help { print STDERR<<EOF usage: $0 <options> gene2NM bowtieoutdir/file PE[1/0] > output.readcount Options: -v : verbose -vv : more verbose -f : use bowtie file name instead of bowtie directory name. ex) $0 -f gene2NM bowtieoutfile PE > output.readcount ex2) $0 gene2NM bowtieoutdir PE > output.readcount --gNIR : count gNIR instead of gReadcount (recommended outfile suffix is .gNIR) --iNIR : count iNIR instead of gReadcount (recommended outfile suffix is .iNIR) -s|--sample <samplename> : sample name to be used as a column title. This options is available only when -f option is used. -m|--max_insert_length <max_insert_length> : maximum insert length. If this option is used, length filtering is done. This option must accompany option -l (--readlength). This option doesn't do anything for single-end data (ignored) -l|--readlength <readlength> : read length. This is required only when -m (--max_insert_length) option is used. -d|--datatype <datatype> : Either 'R' (Refseq) or 'E' (Ensembl). Use R if transcript names are in Refseq format (gi|xxx|xxx|name). (default E). -u|--uniq_d : filter reads with a single mate distance only. (applicable for only PE gReadcount) EOF } sub min { if($_[0] lt $_[1]) { $_[0] } else { $_[1] }; }