Mercurial > repos > big-tiandm > mirplant2
view quantify.pl @ 43:4c0b1a94b882 draft
Uploaded
author | big-tiandm |
---|---|
date | Tue, 28 Oct 2014 01:35:32 -0400 |
parents | a79212816cbc |
children | 0c4e11018934 |
line wrap: on
line source
#!/usr/bin/perl -w #Filename: #Author: Tian Dongmei #Email: tiandm@big.ac.cn #Date: 2013/7/19 #Modified: #Description: my $version=1.00; use File::Path; use strict; use File::Basename; #use Getopt::Std; use Getopt::Long; use RNA; my %opts; GetOptions(\%opts,"r=s","p=s","m=s","mis:i","t:i","e:i","f:i","tag:s","o=s","time:s","h"); if (!(defined $opts{r} and defined $opts{p} and defined $opts{m} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments &usage; } my $read=$opts{'r'}; my $pre=$opts{'p'}; my $mature=$opts{'m'}; my $dir=$opts{'o'}; unless ($dir=~/\/$/) {$dir .="/";} if (not -d $dir) { mkdir $dir; } my $threads=defined $opts{'t'} ? $opts{'t'} : 1; my $mismatch=defined $opts{'mis'} ? $opts{'mis'} : 0; my $upstream = 2; my $downstream = 5; $upstream = $opts{'e'} if(defined $opts{'e'}); $downstream = $opts{'f'} if(defined $opts{'f'}); my $marks=defined $opts{'tag'} ? $opts{'tag'} : ""; my $time=Time(); if (defined $opts{'time'}) { $time=$opts{'time'};} my $tmpdir="${dir}/miRNA_Express_${time}"; if(not -d $tmpdir){ mkdir($tmpdir); } chdir $tmpdir; `cp $pre ./`; my $pre_file_name=basename($pre); &mapping(); # matures align to precursors && reads align to precursors; my %pre_mature; # $pre_mature{pre_id}{matre_ID}{"mature"}[0]->start; $pre_mature{pre_id}{matre_ID}{"mature"}[1]->end; &maturePosOnPre(); # acknowledge mature positions on precursor my %pre_read; &readPosOnPre(); # acknowledge reads positions on precursors if(!(defined $opts{'tag'})){ foreach my $key (keys %pre_read) { $pre_read{$key}[0][0]=~/:([\d|_]+)_x(\d+)$/; my @ss=split/_/,$1; for (my $i=1;$i<=@ss;$i++) { $marks .="Smp$i;"; } last; } } my %pre;## read in precursor sequences #$pre{pre_id}="CGTA...." &attachPre(); my $preno=scalar (keys %pre); print "Total Precursor Number is $preno !!!!\n"; my %struc; #mature star loop; $struc{$key}{"struc"}=$str; $struc{$key}{"mfe"}=$mfe; &structure(); ##### analysis and print out && moRs my $aln=$dir."known_microRNA_express.aln"; my $list=$dir."known_microRNA_express.txt"; my $moRs=$dir."known_microRNA_express.moRs"; system("ln $mature $dir/known_microRNA_mature.fa "); system("ln $pre $dir/known_microRNA_precursor.fa "); open ALN,">$aln"; open LIST,">$list"; open MORS,">$moRs"; $"="\t"; ##### @array print in \t my @marks=split/\;/,$marks; #print LIST "#matueID\tpreID\tpos1\tpos2\tmatureExp\tstarExp\ttotalExp\n"; print LIST "#matueID\tpreID\tpos1\tpos2"; for (my $i=0;$i<@marks;$i++) { print LIST "\t",$marks[$i],"_matureExp"; } for (my $i=0;$i<@marks;$i++) { print LIST "\t",$marks[$i],"_starExp"; } for (my $i=0;$i<@marks;$i++) { print LIST "\t",$marks[$i],"_totalExp"; } print LIST "\n"; print ALN "#>precursor ID \n#precursor sequence\n#precursor structure (mfe)\n#RNA_seq\t@marks\ttotal\n"; print MORS "#>precursor ID\tstrand\texpress_reads\texpress_reads\/total_reads\tblock_number\tprecursor_sequence\n#\tblock_start\tblock_end\t@marks\ttotal\ttag_number\tsequence\n"; my %moRs; foreach my $key (keys %pre) { print ALN ">$key\n$pre{$key}\n$struc{$key}{struc} ($struc{$key}{mfe})\n"; next if(! (exists $pre_read{$key})); my @array=@{$pre_read{$key}}; @array=sort{$a->[3]<=> $b->[3]} @array; my $length=length($pre{$key}); my $maxline=-1;my $max=0; ### storage the maxinum express read line my $totalReadsNo=0; my @not_over=(); ### new read format better for moRs analysis ####print out Aln file start for (my $i=0;$i<@array;$i++) { my $maps=$array[$i][3]+1; my $mape=$array[$i][3]+length($array[$i][4]); my $str=""; $str .= "." x ($maps-1); $str .=$array[$i][4]; $str .="." x ($length-$mape); $str .=" "; $array[$i][0]=~/:([\d|_]+)_x(\d+)$/; my @sample=split /\_/,$1; my $total=$2; print ALN $str,"@sample","\t",$total,"\n"; if($total>$max){$max=$total; $maxline=$i;} $totalReadsNo+=$total; push @not_over,[$key,$maps,$mape,$array[$i][0],$total,"+"]; } ####print out Aln file end #### express list start my ($ms,$me,$ss,$se); if (!(exists($pre_mature{$key}))) { $ms=$array[$maxline][3]+1; $me=$array[$maxline][3]+length($array[$maxline][4]); ($ss,$se)=&other_pair($ms,$me,$struc{$key}{'struc'}); my ($mexp,$sexp,$texp)=&express($ms-$upstream,$me+$downstream,$ss-$upstream,$se+$downstream,\@array); print LIST "$key\t$key\tmature:$ms..$me\tstar:$ss..$se\t@$mexp\t@$sexp\t@$texp\n"; } else{ foreach my $maID (keys %{$pre_mature{$key}}) { $ms=$pre_mature{$key}{$maID}{"mature"}[0]; $me=$pre_mature{$key}{$maID}{"mature"}[1]; $ss=$pre_mature{$key}{$maID}{"star"}[0]; $se=$pre_mature{$key}{$maID}{"star"}[1]; my ($mexp,$sexp,$texp)=&express($ms-$upstream,$me+$downstream,$ss-$upstream,$se+$downstream,\@array); print LIST "$maID\t$key\tmature:$ms..$me\tstar:$ss..$se\t@$mexp\t@$sexp\t@$texp\n"; } } #### express list end #### analysis moRs start my @result; my @m_texp;my $m_texp=0; ### moRs informations while (@not_over>0) { my @over=@not_over; @not_over=(); #丰度最高tag my $m_max=0;my $m_maxline=-1;my $m_start=0;my $m_end=0;my $m_exp=0;my @m_exp;my $m_no=1; for (my $i=0;$i<@over;$i++) { my @m_array=@{$over[$i]}; if ($m_max<$m_array[4]) { $m_max=$m_array[4]; $m_maxline=$i; } } $m_start=$over[$m_maxline][1]; $m_end=$over[$m_maxline][2]; $m_exp=$m_max; $over[$m_maxline][3]=~/:([\d|_]+)_x(\d+)$/; my @m_nums=split/_/,$1; for (my $j=0;$j<@m_nums;$j++) { $m_exp[$j]=$m_nums[$j]; } #统计以丰度最高tag为坐标的reads, 两端位置差异不超过3nt for (my $i=0;$i<@over;$i++) { next if($i==$m_maxline); my @m_array=@{$over[$i]}; if (abs($m_array[1]-$m_start)<=3 && abs($m_array[2]-$m_end)<=3) { $m_exp+=$m_array[4]; $m_no++; $m_array[3]=~/:([\d|_]+)_x(\d+)$/; my @m_nums=split/_/,$1; for (my $j=0;$j<@m_nums;$j++) { $m_exp[$j] +=$m_nums[$j]; } } elsif($m_array[1]>=$m_end || $m_array[2]<=$m_start){push @not_over,[@{$over[$i]}];} #去除跨越block的reads } if($m_exp>5){### 5个reads $m_texp+=$m_exp; for (my $j=0;$j<@m_exp;$j++) { $m_texp[$j]+=$m_exp[$j]; } my $string=&subseq($pre{$key},$m_start,$m_end,"+"); push @result,"\t$m_start\t$m_end\t@m_exp\t$m_exp\t$m_no\t$string" ; } } my $str=scalar @result; my $percent=sprintf("%.2f",$m_texp/$totalReadsNo); $str=">$key\t+\t$m_texp\t$percent\t".$str."\t$pre{$key}"; @{$moRs{$str}}=@result; #### analysis moRs end } ##### moRs print out start foreach my $key (keys %moRs) { my @tmp=split/\t/,$key; next if ($tmp[4]<=2); next if($tmp[3]<0.95); my @over; for (my $i=0;$i<@{$moRs{$key}};$i++) { my @arrayi=split/\t/,$moRs{$key}[$i]; for (my $j=0;$j<@{$moRs{$key}};$j++) { next if($i==$j); my @arrayj=split/\t/,$moRs{$key}[$j]; if ((($arrayj[1]-$arrayi[2]>=0 && $arrayj[1]-$arrayi[2] <=3) || ($arrayj[1]-$arrayi[2]>=18 && $arrayj[1]-$arrayi[2] <=25) )||(($arrayi[1]-$arrayj[2]>=0 && $arrayi[1]-$arrayj[2] <=3)||($arrayi[1]-$arrayj[2]>=18 && $arrayi[1]-$arrayj[2] <=25))) { push @over,$moRs{$key}[$i]; } } } if (@over>0) { print MORS "$key\n"; foreach (@{$moRs{$key}}) { print MORS "$_\n"; } } } ###### moRs print out end close ALN; close LIST; close MORS; $"=" ";##### reset ################### Sub programs ################# sub express{ my ($ms,$me,$ss,$se,$read)=@_; my (@mexp,@sexp,@texp); $$read[0][0]=~/:([_|\d]+)_x(\d+)$/; my @numsample=split/_/,$1; for (my $i=0;$i<@numsample;$i++) { $mexp[$i]=0; $sexp[$i]=0; $texp[$i]=0; } for (my $i=0;$i<@{$read};$i++) { my $start=$$read[$i][3]+1; my $end=$$read[$i][3]+length($$read[$i][4]); $$read[$i][0]=~/:([_|\d]+)_x(\d+)$/; my $expresses=$1; my @nums=split/_/,$expresses; for (my $j=0;$j<@nums;$j++) { $texp[$j]+=$nums[$j]; } if ($start>=$ms && $end<=$me) { for (my $j=0;$j<@nums;$j++) { $mexp[$j]+=$nums[$j]; } } if ($start>=$ss && $end<=$se) { for (my $j=0;$j<@nums;$j++) { $sexp[$j]+=$nums[$j]; } } } return(\@mexp,\@sexp,\@texp); } sub structure{ foreach my $key (keys %pre_mature) { if (!(defined $pre{$key})){die "!!!!! No precursor sequence $key, please check it!\n";} my ($str,$mfe)=RNA::fold($pre{$key}); $struc{$key}{"struc"}=$str; $struc{$key}{"mfe"}=sprintf ("%.2f",$mfe); foreach my $id (keys %{$pre_mature{$key}}) { ($pre_mature{$key}{$id}{"star"}[0],$pre_mature{$key}{$id}{"star"}[1])=&other_pair($pre_mature{$key}{$id}{"mature"}[0],$pre_mature{$key}{$id}{"mature"}[1],$str); } =cut ##### Nucleotide complementary my @tmp=split//,$str; my %a2b; my @bps; for (my $i=0;$i<@tmp;$i++) { if ($tmp[$i] eq "("){push @bps,$i+1 ; next;} if ($tmp[$i] eq ")") { my $up=pop @bps; $a2b{$i+1}=$up; $a2b{$up}=$i+1; } } ##### search star position foreach my $id (keys %{$pre_mature{$key}}) { my $n=0; for (my $i=$pre_mature{$key}{$id}{"mature"}[0];$i<=$pre_mature{$key}{$id}{"mature"}[1] ; $i++) { if (defined $a2b{$i}) { my $a=$i; my $b=$a2b{$i}; if($a>$b){ $pre_mature{$key}{$id}{"star"}[0]=$b-$n+2; $pre_mature{$key}{$id}{"star"}[1]=$b-$n+2+($pre_mature{$key}{$id}{"mature"}[1]-$pre_mature{$key}{$id}{"mature"}[0]); } if($a<$b{ $pre_mature{$key}{$id}{"star"}[1]=$b+$n+2; $pre_mature{$key}{$id}{"star"}[0]=$b+$n+2-($pre_mature{$key}{$id}{"mature"}[1]-$pre_mature{$key}{$id}{"mature"}[0]); } last; } $n++; } } =cut } } sub other_pair{ my ($start,$end,$structure)=@_; ##### Nucleotide complementary my @tmp=split//,$structure; my %a2b; my @bps; for (my $i=0;$i<@tmp;$i++) { if ($tmp[$i] eq "("){push @bps,$i+1 ; next;} if ($tmp[$i] eq ")") { my $up=pop @bps; $a2b{$i+1}=$up; $a2b{$up}=$i+1; } } ##### search star position my $n=0;my $startpos; my $endpos; for (my $i=$start;$i<=$end ; $i++) { if (defined $a2b{$i}) { my $a=$i; my $b=$a2b{$i}; # if($a>$b){ # $startpos=$b-$n+2; # $endpos=$b-$n+2+($end-$start); # } # if($a<$b){ $endpos=$b+$n+2; if($endpos>length($structure)){$endpos=length($structure);} $startpos=$b+$n+2-($end-$start); if($startpos<1){$startpos=1;} # } last; } $n++; } return ($startpos,$endpos); } sub attachPre{ open IN, "<$pre_file_name"; my $name; while (my $aline=<IN>) { chomp $aline; if ($aline=~/^>(\S+)/) { $name=$1; next; } $pre{$name} .=$aline; } close IN; } sub readPosOnPre{ open IN,"<read_mapped.bwt"; while (my $aline=<IN>) { chomp $aline; my @tmp=split/\t/,$aline; my $id=lc($tmp[2]); push @{$pre_read{$tmp[2]}},[@tmp]; } close IN; } sub maturePosOnPre{ open IN,"<mature_mapped.bwt"; while (my $aline=<IN>) { chomp $aline; my @tmp=split/\t/,$aline; my $mm=$tmp[0]; # $mm=~s/\-3P|\-5P//i; $mm=lc($mm); my $pm=$tmp[2]; $pm=lc($pm); # next if ($mm ne $pm);### stringent mapping let7a only allowed to map pre-let7a next if($mm!~/$pm/); # print "$tmp[2]\t$tmp[0]\n"; # $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]-$upstream; # $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=0 if($pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]<0); # $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4])-1+$downstream; $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]+1; $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4]); } close IN; } sub mapping{ my $err; ## build bowtie index print STDERR "building bowtie index\n"; $err = `bowtie-build $pre_file_name miRNA_precursor`; ## map mature sequences against precursors print STDERR "mapping mature sequences against index\n"; $err = `bowtie -p $threads -f -v 0 -a --best --strata --norc miRNA_precursor $mature mature_mapped.bwt`; ## map reads against precursors print STDERR "mapping read sequences against index\n"; $err=`bowtie -p $threads -f -v $mismatch -a --best --strata --norc miRNA_precursor $read --al mirbase_mapped.fa --un mirbase_not_mapped.fa read_mapped.bwt `; } sub subseq{ my $seq=shift; my $beg=shift; my $end=shift; my $strand=shift; my $subseq=substr($seq,$beg-1,$end-$beg+1); if ($strand eq "-") { $subseq=revcom($subseq); } return uc $subseq; } sub revcom{ my $seq=shift; $seq=~tr/ATCGatcg/TAGCtagc/; $seq=reverse $seq; return uc $seq; } sub Time{ my $time=time(); my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; $month++; $year+=1900; if (length($sec) == 1) {$sec = "0"."$sec";} if (length($min) == 1) {$min = "0"."$min";} if (length($hour) == 1) {$hour = "0"."$hour";} if (length($day) == 1) {$day = "0"."$day";} if (length($month) == 1) {$month = "0"."$month";} #print "$year-$month-$day $hour:$min:$sec\n"; return("$year-$month-$day-$hour-$min-$sec"); } sub usage{ print <<"USAGE"; Version $version Usage: $0 -r -p -m -mis -t -e -f -tag -o -time mandatory parameters: -p precursor.fa miRNA precursor sequences from miRBase # must be absolute path -m mature.fa miRNA sequences from miRBase # must be absolute path -r reads.fa your read sequences #must be absolute path -o output directory options: -mis [int] number of allowed mismatches when mapping reads to precursors, default 0 -t [int] threads number,default 1 -e [int] number of nucleotides upstream of the mature sequence to consider, default 2 -f [int] number of nucleotides downstream of the mature sequence to consider, default 5 -tag [string] sample marks# eg. sampleA;sampleB;sampleC -time sting #make directory time,default is the local time -h help USAGE exit(1); }