neuma: NEUMA-1.2.1/bowtie2genecount.11.pl comparison

comparison NEUMA-1.2.1/bowtie2genecount.11.pl @ 0:c44c43d185ef draft default tip

NEUMA-1.2.1 Uploaded

author	chawhwa
date	Thu, 08 Aug 2013 00:46:13 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:c44c43d185ef
+#!/usr/bin/perl
+# This version has an option of counting only the pairs with a single mate distance. (version 11).
+# This version has fixed the problem in filtering max_insert_length. (version 11).
+# This version does not print out the additional tab in the iNIR file. (version 11).
+# This version has data type option for parsing refseq headers. (version 10)
+# This version fixed the max_insert_lendis option name from max_insert_length. (version 10)
+# This version can take multiple bowtieout files separated by comma. (version 10)
+# This version can take option of max insert lendis (version 9)
+# This version can take sample name (column title) as option, if -f is selected. (version 8)
+# This version fixed the problem of double-counting cases that are strand-flipped. (version 7)
+# This version has an option of counting all reads, gNIR or iNIR. (version 7)
+# This version has two levels of verbosity and uses whole file name if run with a single bowtie file. (version 6)
+# This version has an option of using a single bowtie file name instead of bowtie directory. (version 6)
+# This version fixed the problem of not being able to find common suffix and assumes .bowtieout suffix for bowtieout files, to prevent reading other files in the directory. (version 5)
+# This version fixed the problem of going into infinite loop when run with one sample. (Version 3)
+# This version is verbose. (version 2)
+use Getopt::Long;
+my $verbose=0;
+my $LINEBIN=1000000;
+my $bowtie = 'd';  # default is directory.
+my $count_opt = 'gReadcount';
+my $samplename;
+my $max_insert_length;
+my $readlength;
+my $datatype='E';
+my $uniq_d;  # reads with single mate distance only (applicable only for PE)
+&GetOptions ( 'v' => sub { $verbose=1 },
+'vv' => sub { $verbose=2 },
+'f' => sub { $bowtie='f'},  # read bowtie file instead of bowtiu directory
+'gNIR' => sub { $count_opt = 'gNIR' },
+'iNIR' => sub { $count_opt = 'iNIR' },
+'s|sample=s' => \$samplename,
+'m|max_insert_length=i' => \$max_insert_length,
+'l|readlength=i' => \$readlength,
+'d|datatype=s' => \$datatype,
+'u|uniq_d' => \$uniq_d
+);
+if($bowtie eq 'd' && defined($samplename)) { print STDERR "Warning:  -s|--sample option is ignored, when -f is not selected.\n"; }
+if(defined($max_insert_length) && !defined($readlength)) { die "readlength must be defined if max_insert_length is defined.\n"; }
+if(@ARGV<3) { &help;  exit; }
+my $gene2NM = shift @ARGV;
+my $bowtieoutdir = shift @ARGV;
+my $PE = shift @ARGV;
+if($PE==0 && defined($max_insert_length)) { print STDERR "Warning: -m|--max_insert_length option is ignored for single end data.\n"; }
+if($bowtie eq 'd'){
+@bowtiefiles = split/\n/,`ls -1 $bowtieoutdir/*`;
+if($verbose) { printf STDERR "Finished reading bowtieout dir $bowtieoutdir. Total %d files\n", scalar(@bowtiefiles); }
+}
+else {
+push @bowtiefiles, (split/,/,$bowtieoutdir);
+}
+open GENE2NM, $gene2NM or die "Can't open gene2NM file $gene2NM\n";
+while(<GENE2NM>){
+chomp;
+my ($g,$t) = split/\t/;
+$t2g{$t}=$g;
+push @{$g2t{$g}}, $t;
+}
+close GENE2NM;
+if($verbose) { printf STDERR "Finished reading gene2NM file $gene2NM. Total %d transcripts and %d genes\n", scalar(keys(%t2g)), scalar(keys(%g2t)); }
+for my $i (0..$#bowtiefiles) {
+my $bowtieout = $bowtiefiles[$i];
+my %mapped_genes =();
+my %mapped_tr =();
+my %mapped_insert_lengths =();
+if($verbose) { print STDERR "Processing bowtieout $bowtieout (file $i)\n"; }
+my $prev_head=''; my $prev_tr='';
+open $BOWTIE, $bowtieout or die "Can't open bowtiefile $bowtieout\n";
+while(<$BOWTIE>){
+chomp;
+my ($head,$tr,$pos1) = (split/\t/)[0,2,3];
+if($datatype eq 'R') { my ($tmp_tr) = (split/\|/,$tr)[3]; if(defined($tmp_tr)) {$tr = $tmp_tr; $tr=~s/\.\d+$//; }}
+my $insert_length=0;   ## default, for SE.
+if($PE==1){
+chomp(my $secondline = <$BOWTIE>);
+my ($head2,$pos2) = (split/\t/,$secondline)[0,3];
+$head = &min($head,$head2);
+$insert_length = $pos2 - $pos1 + $readlength;
+if(defined $max_insert_length) { if($insert_length > $max_insert_length) { next; } }
+}
+if($verbose==2) { if($prev_head eq '') { print STDERR "head= $head, tr= $tr, PE = $PE\n"; }  }   # print only at first line
+if($count_opt eq 'gNIR'){
+if($head eq $prev_head) {
+$mapped_tr { $tr } = 1;
+$mapped_tr { $prev_tr } = 1;
+$mapped_genes { $t2g{$tr} } = 1;
+$mapped_genes { $t2g{$prev_tr} } = 1;
+}
+else {
+if(scalar(keys(%mapped_genes)) == 1) { my ($gene) = keys %mapped_genes; if(scalar(keys(%mapped_tr)) == scalar(@{$g2t{$gene}})) { $count{$gene}[$i]++; }}
+## count reads that are unique to the gene and common to all of its isoforms
+%mapped_genes =();
+%mapped_tr = ();
+$mapped_tr { $tr } = 1;
+$mapped_genes { $t2g{$tr} } = 1;
+}
+}
+elsif($count_opt eq 'iNIR'){
+if($head eq $prev_head) {
+$mapped_tr { $tr } = 1;
+$mapped_tr { $prev_tr } = 1;
+}
+else {
+if(scalar(keys(%mapped_tr)) == 1) { my ($transcript) = keys %mapped_tr; $count{$transcript}[$i]++; }  ## count reads that are unique to the transcript
+%mapped_tr = ();
+$mapped_tr { $tr } = 1;
+}
+}
+else {
+if($head eq $prev_head) {
+$mapped_genes { $t2g{$tr} } = 1;
+$mapped_genes { $t2g{$prev_tr} } = 1;
+$mapped_insert_lengths { "$insert_length" } =1;
+$mapped_insert_lengths { "$prev_insert_length" } =1;
+}
+else {
+if(scalar(keys(%mapped_genes)) == 1) {
+my ($gene) = keys %mapped_genes;
+if(defined($uniq_d)) { if(scalar(keys(%mapped_insert_lengths))==1) { $count{$gene}[$i]++; } }
+else { $count{$gene}[$i]++;  }
+}  ## count reads that are unique to the gene
+%mapped_genes =();
+%mapped_insert_lengths =();
+$mapped_genes { $t2g{$tr} } = 1;
+$mapped_insert_lengths { "$insert_length" } =1;
+}
+}
+$prev_head = $head;
+$prev_tr = $tr;
+$prev_insert_length = $insert_length;
+if($verbose==2) { if($. % $LINEBIN==0) { print STDERR ":line $.\n"; } }
+}
+close $BOWTIE;
+# last line
+if($count_opt eq 'gNIR'){
+if(scalar(keys(%mapped_genes)) == 1) { my ($gene) = keys %mapped_genes; if(scalar(keys(%mapped_tr)) == scalar(@{$g2t{$gene}})) { $count{$gene}[$i]++; }}
+## count reads that are unique to the gene and common to all of its isoforms
+}
+elsif($count_opt eq 'iNIR'){
+if(scalar(keys(%mapped_tr)) == 1) { my ($transcript) = keys %mapped_tr; $count{$transcript}[$i]++; }  ## count reads that are unique to the transcript
+}
+else{
+if(scalar(keys(%mapped_genes)) == 1) {
+my ($gene) = keys %mapped_genes;
+if(defined($uniq_d)) { if(scalar(keys(%mapped_insert_lengths))==1) { $count{$gene}[$i]++; } }
+else { $count{$gene}[$i]++;  }
+}  ## count reads that are unique to the gene
+}
+}
+$"="\t";
+if($count_opt eq 'gNIR' || $count_opt eq 'gReadcount') { $count_target='genes'; } else { $count_target = 'transcripts'; }
+if($verbose) { printf STDERR "Finished reading bowtieout files. Total %d $count_target counted\n",scalar(keys(%count)); }
+$minlen=1000000;
+for my $file (@bowtiefiles){
+$minlen = length($file) if length($file)<$minlen;
+}
+# find longest common suffix for bowtiefiles
+my %suffs;
+my $k=0;
+do {
+%suffs=();
+$k++;
+for my $file (@bowtiefiles){
+$suff = substr($file,length($file)-$k);
+$suffs{$suff}=1;
+}
+} while ( scalar(keys(%suffs))==1 &&  $k<$minlen);
+my ($commonsuffix) = substr($bowtiefiles[0],length($file)-($k-1));
+if(scalar(@bowtiefiles) == 1) { $commonsuffix = ''; }
+# title is between "/" and common suffix.
+my @title=();
+for my $file (@bowtiefiles){
+if($file=~ /([^\/]+)$commonsuffix$/) { push @title, $1; }
+else { push @title, $file; }
+}
+if(defined($samplename)) { @title=(); push @title, $samplename; }
+if($verbose) { print STDERR "Printing to file...\n"; }
+if($count_opt eq 'gNIR' || $count_opt eq 'gReadcount'){
+print "gene\t@title\n";
+for my $gene (sort keys %count){
+print "$gene";
+for my $i (0..$#bowtiefiles){
+if(defined $count{$gene}[$i]) { print "\t$count{$gene}[$i]"; }
+else { print "\t0"; }
+}
+print "\n";
+}
+}
+else {  ## iNIR
+print "gene\ttranscript\t@title\n";
+for my $tr (sort keys %count){
+print "$t2g{$tr}\t$tr";
+for my $i (0..$#bowtiefiles){
+if(defined $count{$tr}[$i]) { print "\t$count{$tr}[$i]"; }
+else { print "\t0"; }
+}
+print "\n";
+}
+}
+if($verbose) { print STDERR "done.\n"; }
+## subroutines ##
+sub help {
+print STDERR<<EOF
+usage: $0 <options> gene2NM bowtieoutdir/file PE[1/0] > output.readcount
+Options:
+-v : verbose
+-vv : more verbose
+-f : use bowtie file name instead of bowtie directory name.
+ex) $0 -f gene2NM bowtieoutfile PE > output.readcount
+ex2) $0 gene2NM bowtieoutdir PE > output.readcount
+--gNIR : count gNIR instead of gReadcount (recommended outfile suffix is .gNIR)
+--iNIR : count iNIR instead of gReadcount (recommended outfile suffix is .iNIR)
+-s|--sample <samplename> : sample name to be used as a column title. This options is available only when -f option is used.
+-m|--max_insert_length <max_insert_length> : maximum insert length. If this option is used, length filtering is done. This option must accompany option -l (--readlength). This option doesn't do anything for single-end data (ignored)
+-l|--readlength <readlength> : read length. This is required only when -m (--max_insert_length) option is used.
+-d|--datatype <datatype> : Either 'R' (Refseq) or 'E' (Ensembl). Use R if transcript names are in Refseq format (gi|xxx|xxx|name). (default E).
+-u|--uniq_d : filter reads with a single mate distance only. (applicable for only PE gReadcount)
+EOF
+}
+sub min {
+if($_[0] lt $_[1]) { $_[0] } else { $_[1] };
+}

Mercurial > repos > chawhwa > neuma

comparison NEUMA-1.2.1/bowtie2genecount.11.pl @ 0:c44c43d185ef draft default tip