Mercurial > repos > big-tiandm > mirplant2
comparison quantify.pl @ 50:7b5a48b972e9 draft
Uploaded
author | big-tiandm |
---|---|
date | Fri, 05 Dec 2014 00:11:02 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
49:f008ab2cadc6 | 50:7b5a48b972e9 |
---|---|
1 #!/usr/bin/perl -w | |
2 #Filename: | |
3 #Author: Tian Dongmei | |
4 #Email: tiandm@big.ac.cn | |
5 #Date: 2013/7/19 | |
6 #Modified: | |
7 #Description: | |
8 my $version=1.00; | |
9 | |
10 use File::Path; | |
11 use strict; | |
12 use File::Basename; | |
13 #use Getopt::Std; | |
14 use Getopt::Long; | |
15 #use RNA; | |
16 | |
17 my %opts; | |
18 GetOptions(\%opts,"r=s","p=s","m=s","mis:i","t:i","e:i","f:i","tag:s","o=s","h"); | |
19 if (!(defined $opts{r} and defined $opts{p} and defined $opts{m} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments | |
20 &usage; | |
21 } | |
22 | |
23 my $read=$opts{'r'}; | |
24 my $pre=$opts{'p'}; | |
25 my $mature=$opts{'m'}; | |
26 | |
27 my $dir=$opts{'o'}; | |
28 unless ($dir=~/\/$/) {$dir .="/";} | |
29 if (not -d $dir) { | |
30 mkdir $dir; | |
31 } | |
32 | |
33 my $threads=defined $opts{'t'} ? $opts{'t'} : 1; | |
34 my $mismatch=defined $opts{'mis'} ? $opts{'mis'} : 0; | |
35 | |
36 my $upstream = 2; | |
37 my $downstream = 5; | |
38 | |
39 $upstream = $opts{'e'} if(defined $opts{'e'}); | |
40 $downstream = $opts{'f'} if(defined $opts{'f'}); | |
41 | |
42 my $marks=defined $opts{'tag'} ? $opts{'tag'} : ""; | |
43 | |
44 my $time=Time(); | |
45 | |
46 my $tmpdir="${dir}/known_miRNA_Express"; | |
47 if(not -d $tmpdir){ | |
48 mkdir($tmpdir); | |
49 } | |
50 chdir $tmpdir; | |
51 | |
52 `cp $pre ./`; | |
53 my $pre_file_name=basename($pre); | |
54 | |
55 &mapping(); # matures align to precursors && reads align to precursors; | |
56 | |
57 my %pre_mature; # $pre_mature{pre_id}{matre_ID}{"mature"}[0]->start; $pre_mature{pre_id}{matre_ID}{"mature"}[1]->end; | |
58 &maturePosOnPre(); # acknowledge mature positions on precursor | |
59 | |
60 my %pre_read; | |
61 &readPosOnPre(); # acknowledge reads positions on precursors | |
62 | |
63 if(!(defined $opts{'tag'})){ | |
64 foreach my $key (keys %pre_read) { | |
65 $pre_read{$key}[0][0]=~/:([\d|_]+)_x(\d+)$/; | |
66 my @ss=split/_/,$1; | |
67 for (my $i=1;$i<=@ss;$i++) { | |
68 $marks .="Smp$i;"; | |
69 } | |
70 last; | |
71 } | |
72 } | |
73 | |
74 my %pre;## read in precursor sequences #$pre{pre_id}="CGTA...." | |
75 &attachPre(); | |
76 | |
77 my $preno=scalar (keys %pre); | |
78 print "Total Precursor Number is $preno !!!!\n"; | |
79 | |
80 my %struc; #mature star loop; $struc{$key}{"struc"}=$str; $struc{$key}{"mfe"}=$mfe; | |
81 &structure(); | |
82 | |
83 | |
84 ##### analysis and print out && moRs | |
85 my $aln=$dir."known_microRNA_express.aln"; | |
86 my $list=$dir."known_microRNA_express.txt"; | |
87 my $moRs=$dir."known_microRNA_express.moRs"; | |
88 | |
89 system("ln -s $mature $dir/known_microRNA_mature.fa "); | |
90 system("ln -s $pre $dir/known_microRNA_precursor.fa "); | |
91 | |
92 open ALN,">$aln"; | |
93 open LIST,">$list"; | |
94 open MORS,">$moRs"; | |
95 | |
96 $"="\t"; ##### @array print in \t | |
97 | |
98 my @marks=split/\;/,$marks; | |
99 #print LIST "#matueID\tpreID\tpos1\tpos2\tmatureExp\tstarExp\ttotalExp\n"; | |
100 print LIST "#matueID\tpreID\tpos1\tpos2"; | |
101 for (my $i=0;$i<@marks;$i++) { | |
102 print LIST "\t",$marks[$i],"_matureExp"; | |
103 } | |
104 for (my $i=0;$i<@marks;$i++) { | |
105 print LIST "\t",$marks[$i],"_starExp"; | |
106 } | |
107 for (my $i=0;$i<@marks;$i++) { | |
108 print LIST "\t",$marks[$i],"_totalExp"; | |
109 } | |
110 print LIST "\n"; | |
111 print ALN "#>precursor ID \n#precursor sequence\n#precursor structure (mfe)\n#RNA_seq\t@marks\ttotal\n"; | |
112 print MORS "#>precursor ID\tstrand\texpress_reads\texpress_reads\/total_reads\tblock_number\tprecursor_sequence\n#\tblock_start\tblock_end\t@marks\ttotal\ttag_number\tsequence\n"; | |
113 my %moRs; | |
114 | |
115 foreach my $key (keys %pre) { | |
116 print ALN ">$key\n$pre{$key}\n$struc{$key}{struc} ($struc{$key}{mfe})\n"; | |
117 next if(! (exists $pre_read{$key})); | |
118 my @array=@{$pre_read{$key}}; | |
119 @array=sort{$a->[3]<=> $b->[3]} @array; | |
120 | |
121 my $length=length($pre{$key}); | |
122 | |
123 my $maxline=-1;my $max=0; ### storage the maxinum express read line | |
124 my $totalReadsNo=0; | |
125 my @not_over=(); ### new read format better for moRs analysis | |
126 | |
127 ####print out Aln file start | |
128 for (my $i=0;$i<@array;$i++) { | |
129 my $maps=$array[$i][3]+1; | |
130 my $mape=$array[$i][3]+length($array[$i][4]); | |
131 my $str=""; | |
132 $str .= "." x ($maps-1); | |
133 $str .=$array[$i][4]; | |
134 $str .="." x ($length-$mape); | |
135 $str .=" "; | |
136 | |
137 $array[$i][0]=~/:([\d|_]+)_x(\d+)$/; | |
138 my @sample=split /\_/,$1; | |
139 my $total=$2; | |
140 print ALN $str,"@sample","\t",$total,"\n"; | |
141 | |
142 if($total>$max){$max=$total; $maxline=$i;} | |
143 $totalReadsNo+=$total; | |
144 | |
145 push @not_over,[$key,$maps,$mape,$array[$i][0],$total,"+"]; | |
146 } | |
147 ####print out Aln file end | |
148 | |
149 #### express list start | |
150 my ($ms,$me,$ss,$se); | |
151 if (!(exists($pre_mature{$key}))) { | |
152 $ms=$array[$maxline][3]+1; | |
153 $me=$array[$maxline][3]+length($array[$maxline][4]); | |
154 ($ss,$se)=&other_pair($ms,$me,$struc{$key}{'struc'}); | |
155 | |
156 my ($mexp,$sexp,$texp)=&express($ms-$upstream,$me+$downstream,$ss-$upstream,$se+$downstream,\@array); | |
157 print LIST "$key\t$key\tmature:$ms..$me\tstar:$ss..$se\t@$mexp\t@$sexp\t@$texp\n"; | |
158 } | |
159 else{ | |
160 foreach my $maID (keys %{$pre_mature{$key}}) { | |
161 $ms=$pre_mature{$key}{$maID}{"mature"}[0]; | |
162 $me=$pre_mature{$key}{$maID}{"mature"}[1]; | |
163 $ss=$pre_mature{$key}{$maID}{"star"}[0]; | |
164 $se=$pre_mature{$key}{$maID}{"star"}[1]; | |
165 my ($mexp,$sexp,$texp)=&express($ms-$upstream,$me+$downstream,$ss-$upstream,$se+$downstream,\@array); | |
166 print LIST "$maID\t$key\tmature:$ms..$me\tstar:$ss..$se\t@$mexp\t@$sexp\t@$texp\n"; | |
167 } | |
168 } | |
169 #### express list end | |
170 | |
171 #### analysis moRs start | |
172 my @result; my @m_texp;my $m_texp=0; ### moRs informations | |
173 | |
174 while (@not_over>0) { | |
175 my @over=@not_over; | |
176 @not_over=(); | |
177 | |
178 #丰度最高tag | |
179 my $m_max=0;my $m_maxline=-1;my $m_start=0;my $m_end=0;my $m_exp=0;my @m_exp;my $m_no=1; | |
180 for (my $i=0;$i<@over;$i++) { | |
181 my @m_array=@{$over[$i]}; | |
182 if ($m_max<$m_array[4]) { | |
183 $m_max=$m_array[4]; | |
184 $m_maxline=$i; | |
185 } | |
186 } | |
187 $m_start=$over[$m_maxline][1]; | |
188 $m_end=$over[$m_maxline][2]; | |
189 $m_exp=$m_max; | |
190 $over[$m_maxline][3]=~/:([\d|_]+)_x(\d+)$/; | |
191 my @m_nums=split/_/,$1; | |
192 for (my $j=0;$j<@m_nums;$j++) { | |
193 $m_exp[$j]=$m_nums[$j]; | |
194 } | |
195 | |
196 #统计以丰度最高tag为坐标的reads, 两端位置差异不超过3nt | |
197 for (my $i=0;$i<@over;$i++) { | |
198 next if($i==$m_maxline); | |
199 my @m_array=@{$over[$i]}; | |
200 if (abs($m_array[1]-$m_start)<=3 && abs($m_array[2]-$m_end)<=3) { | |
201 $m_exp+=$m_array[4]; | |
202 $m_no++; | |
203 $m_array[3]=~/:([\d|_]+)_x(\d+)$/; | |
204 my @m_nums=split/_/,$1; | |
205 for (my $j=0;$j<@m_nums;$j++) { | |
206 $m_exp[$j] +=$m_nums[$j]; | |
207 } | |
208 } | |
209 elsif($m_array[1]>=$m_end || $m_array[2]<=$m_start){push @not_over,[@{$over[$i]}];} #去除跨越block的reads | |
210 } | |
211 if($m_exp>5){### 5个reads | |
212 $m_texp+=$m_exp; | |
213 for (my $j=0;$j<@m_exp;$j++) { | |
214 $m_texp[$j]+=$m_exp[$j]; | |
215 } | |
216 my $string=&subseq($pre{$key},$m_start,$m_end,"+"); | |
217 push @result,"\t$m_start\t$m_end\t@m_exp\t$m_exp\t$m_no\t$string" ; | |
218 } | |
219 } | |
220 | |
221 my $str=scalar @result; | |
222 my $percent=sprintf("%.2f",$m_texp/$totalReadsNo); | |
223 $str=">$key\t+\t$m_texp\t$percent\t".$str."\t$pre{$key}"; | |
224 @{$moRs{$str}}=@result; | |
225 | |
226 #### analysis moRs end | |
227 } | |
228 | |
229 ##### moRs print out start | |
230 foreach my $key (keys %moRs) { | |
231 my @tmp=split/\t/,$key; | |
232 next if ($tmp[4]<=2); | |
233 next if($tmp[3]<0.95); | |
234 my @over; | |
235 for (my $i=0;$i<@{$moRs{$key}};$i++) { | |
236 my @arrayi=split/\t/,$moRs{$key}[$i]; | |
237 for (my $j=0;$j<@{$moRs{$key}};$j++) { | |
238 next if($i==$j); | |
239 my @arrayj=split/\t/,$moRs{$key}[$j]; | |
240 if ((($arrayj[1]-$arrayi[2]>=0 && $arrayj[1]-$arrayi[2] <=3) || ($arrayj[1]-$arrayi[2]>=18 && $arrayj[1]-$arrayi[2] <=25) )||(($arrayi[1]-$arrayj[2]>=0 && $arrayi[1]-$arrayj[2] <=3)||($arrayi[1]-$arrayj[2]>=18 && $arrayi[1]-$arrayj[2] <=25))) { | |
241 push @over,$moRs{$key}[$i]; | |
242 } | |
243 } | |
244 } | |
245 if (@over>0) { | |
246 print MORS "$key\n"; | |
247 foreach (@{$moRs{$key}}) { | |
248 print MORS "$_\n"; | |
249 } | |
250 } | |
251 } | |
252 ###### moRs print out end | |
253 close ALN; | |
254 close LIST; | |
255 close MORS; | |
256 | |
257 $"=" ";##### reset | |
258 | |
259 | |
260 ################### Sub programs ################# | |
261 sub express{ | |
262 my ($ms,$me,$ss,$se,$read)=@_; | |
263 my (@mexp,@sexp,@texp); | |
264 $$read[0][0]=~/:([_|\d]+)_x(\d+)$/; | |
265 my @numsample=split/_/,$1; | |
266 for (my $i=0;$i<@numsample;$i++) { | |
267 $mexp[$i]=0; | |
268 $sexp[$i]=0; | |
269 $texp[$i]=0; | |
270 } | |
271 | |
272 for (my $i=0;$i<@{$read};$i++) { | |
273 my $start=$$read[$i][3]+1; | |
274 my $end=$$read[$i][3]+length($$read[$i][4]); | |
275 $$read[$i][0]=~/:([_|\d]+)_x(\d+)$/; | |
276 my $expresses=$1; | |
277 my @nums=split/_/,$expresses; | |
278 | |
279 for (my $j=0;$j<@nums;$j++) { | |
280 $texp[$j]+=$nums[$j]; | |
281 } | |
282 if ($start>=$ms && $end<=$me) { | |
283 for (my $j=0;$j<@nums;$j++) { | |
284 $mexp[$j]+=$nums[$j]; | |
285 } | |
286 } | |
287 if ($start>=$ss && $end<=$se) { | |
288 for (my $j=0;$j<@nums;$j++) { | |
289 $sexp[$j]+=$nums[$j]; | |
290 } | |
291 } | |
292 } | |
293 return(\@mexp,\@sexp,\@texp); | |
294 } | |
295 | |
296 sub structure{ | |
297 foreach my $key (keys %pre_mature) { | |
298 if (!(defined $pre{$key})){die "!!!!! No precursor sequence $key, please check it!\n";} | |
299 #my ($str,$mfe)=RNA::fold($pre{$key}); | |
300 my $rnafold=`perl -e 'print "$pre{$key}"' | RNAfold --noPS`; | |
301 my @rnafolds=split/\s+/,$rnafold; | |
302 my $str=$rnafolds[1]; | |
303 my $mfe=$rnafolds[-1]; | |
304 $mfe=~s/\(//; | |
305 $mfe=~s/\)//; | |
306 | |
307 $struc{$key}{"struc"}=$str; | |
308 #$struc{$key}{"mfe"}=sprintf ("%.2f",$mfe); | |
309 $struc{$key}{"mfe"}=$mfe; | |
310 | |
311 foreach my $id (keys %{$pre_mature{$key}}) { | |
312 ($pre_mature{$key}{$id}{"star"}[0],$pre_mature{$key}{$id}{"star"}[1])=&other_pair($pre_mature{$key}{$id}{"mature"}[0],$pre_mature{$key}{$id}{"mature"}[1],$str); | |
313 } | |
314 =cut | |
315 ##### Nucleotide complementary | |
316 my @tmp=split//,$str; | |
317 my %a2b; | |
318 my @bps; | |
319 for (my $i=0;$i<@tmp;$i++) { | |
320 if ($tmp[$i] eq "("){push @bps,$i+1 ; next;} | |
321 if ($tmp[$i] eq ")") { | |
322 my $up=pop @bps; | |
323 $a2b{$i+1}=$up; | |
324 $a2b{$up}=$i+1; | |
325 } | |
326 } | |
327 | |
328 ##### search star position | |
329 foreach my $id (keys %{$pre_mature{$key}}) { | |
330 my $n=0; | |
331 for (my $i=$pre_mature{$key}{$id}{"mature"}[0];$i<=$pre_mature{$key}{$id}{"mature"}[1] ; $i++) { | |
332 if (defined $a2b{$i}) { | |
333 my $a=$i; my $b=$a2b{$i}; | |
334 if($a>$b){ | |
335 $pre_mature{$key}{$id}{"star"}[0]=$b-$n+2; | |
336 $pre_mature{$key}{$id}{"star"}[1]=$b-$n+2+($pre_mature{$key}{$id}{"mature"}[1]-$pre_mature{$key}{$id}{"mature"}[0]); | |
337 } | |
338 if($a<$b{ | |
339 $pre_mature{$key}{$id}{"star"}[1]=$b+$n+2; | |
340 $pre_mature{$key}{$id}{"star"}[0]=$b+$n+2-($pre_mature{$key}{$id}{"mature"}[1]-$pre_mature{$key}{$id}{"mature"}[0]); | |
341 } | |
342 last; | |
343 } | |
344 $n++; | |
345 } | |
346 } | |
347 =cut | |
348 } | |
349 } | |
350 sub other_pair{ | |
351 my ($start,$end,$structure)=@_; | |
352 ##### Nucleotide complementary | |
353 my @tmp=split//,$structure; | |
354 my %a2b; my @bps; | |
355 for (my $i=0;$i<@tmp;$i++) { | |
356 if ($tmp[$i] eq "("){push @bps,$i+1 ; next;} | |
357 if ($tmp[$i] eq ")") { | |
358 my $up=pop @bps; | |
359 $a2b{$i+1}=$up; | |
360 $a2b{$up}=$i+1; | |
361 } | |
362 } | |
363 ##### search star position | |
364 my $n=0;my $startpos; my $endpos; | |
365 for (my $i=$start;$i<=$end ; $i++) { | |
366 if (defined $a2b{$i}) { | |
367 my $a=$i; my $b=$a2b{$i}; | |
368 # if($a>$b){ | |
369 # $startpos=$b-$n+2; | |
370 # $endpos=$b-$n+2+($end-$start); | |
371 # } | |
372 # if($a<$b){ | |
373 $endpos=$b+$n+2; | |
374 if($endpos>length($structure)){$endpos=length($structure);} | |
375 $startpos=$b+$n+2-($end-$start); | |
376 if($startpos<1){$startpos=1;} | |
377 # } | |
378 last; | |
379 } | |
380 $n++; | |
381 } | |
382 return ($startpos,$endpos); | |
383 } | |
384 sub attachPre{ | |
385 open IN, "<$pre_file_name"; | |
386 my $name; | |
387 while (my $aline=<IN>) { | |
388 chomp $aline; | |
389 if ($aline=~/^>(\S+)/) { | |
390 $name=$1; | |
391 next; | |
392 } | |
393 $pre{$name} .=$aline; | |
394 } | |
395 close IN; | |
396 } | |
397 sub readPosOnPre{ | |
398 open IN,"<read_mapped.bwt"; | |
399 while (my $aline=<IN>) { | |
400 chomp $aline; | |
401 my @tmp=split/\t/,$aline; | |
402 my $id=lc($tmp[2]); | |
403 push @{$pre_read{$tmp[2]}},[@tmp]; | |
404 } | |
405 close IN; | |
406 } | |
407 sub maturePosOnPre{ | |
408 open IN,"<mature_mapped.bwt"; | |
409 while (my $aline=<IN>) { | |
410 chomp $aline; | |
411 my @tmp=split/\t/,$aline; | |
412 my $mm=$tmp[0]; | |
413 # $mm=~s/\-3P|\-5P//i; | |
414 $mm=lc($mm); | |
415 my $pm=$tmp[2]; | |
416 $pm=lc($pm); | |
417 | |
418 # next if ($mm ne $pm);### stringent mapping let7a only allowed to map pre-let7a | |
419 next if($mm!~/$pm/); | |
420 # print "$tmp[2]\t$tmp[0]\n"; | |
421 # $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]-$upstream; | |
422 # $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=0 if($pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]<0); | |
423 # $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4])-1+$downstream; | |
424 $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0]=$tmp[3]+1; | |
425 $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1]=$tmp[3]+length($tmp[4]); | |
426 } | |
427 close IN; | |
428 } | |
429 sub mapping{ | |
430 my $err; | |
431 ## build bowtie index | |
432 #print STDERR "building bowtie index\n"; | |
433 $err = `bowtie-build $pre_file_name miRNA_precursor`; | |
434 | |
435 ## map mature sequences against precursors | |
436 #print STDERR "mapping mature sequences against index\n"; | |
437 $err = `bowtie -p $threads -f -v 0 -a --best --strata --norc miRNA_precursor $mature > mature_mapped.bwt 2> run.log`; | |
438 | |
439 ## map reads against precursors | |
440 #print STDERR "mapping read sequences against index\n"; | |
441 $err=`bowtie -p $threads -f -v $mismatch -a --best --strata --norc miRNA_precursor $read --al mirbase_mapped.fa --un mirbase_not_mapped.fa > read_mapped.bwt 2> run.log`; | |
442 | |
443 } | |
444 | |
445 sub subseq{ | |
446 my $seq=shift; | |
447 my $beg=shift; | |
448 my $end=shift; | |
449 my $strand=shift; | |
450 | |
451 my $subseq=substr($seq,$beg-1,$end-$beg+1); | |
452 if ($strand eq "-") { | |
453 $subseq=revcom($subseq); | |
454 } | |
455 return uc $subseq; | |
456 } | |
457 | |
458 sub revcom{ | |
459 my $seq=shift; | |
460 $seq=~tr/ATCGatcg/TAGCtagc/; | |
461 $seq=reverse $seq; | |
462 return uc $seq; | |
463 } | |
464 | |
465 sub Time{ | |
466 my $time=time(); | |
467 my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; | |
468 $month++; | |
469 $year+=1900; | |
470 if (length($sec) == 1) {$sec = "0"."$sec";} | |
471 if (length($min) == 1) {$min = "0"."$min";} | |
472 if (length($hour) == 1) {$hour = "0"."$hour";} | |
473 if (length($day) == 1) {$day = "0"."$day";} | |
474 if (length($month) == 1) {$month = "0"."$month";} | |
475 #print "$year-$month-$day $hour:$min:$sec\n"; | |
476 return("$year-$month-$day-$hour-$min-$sec"); | |
477 } | |
478 | |
479 sub usage{ | |
480 print <<"USAGE"; | |
481 Version $version | |
482 Usage: | |
483 $0 -r -p -m -mis -t -e -f -tag -o -time | |
484 mandatory parameters: | |
485 -p precursor.fa miRNA precursor sequences from miRBase # must be absolute path | |
486 -m mature.fa miRNA sequences from miRBase # must be absolute path | |
487 -r reads.fa your read sequences #must be absolute path | |
488 | |
489 -o output directory | |
490 | |
491 options: | |
492 -mis [int] number of allowed mismatches when mapping reads to precursors, default 0 | |
493 -t [int] threads number,default 1 | |
494 -e [int] number of nucleotides upstream of the mature sequence to consider, default 2 | |
495 -f [int] number of nucleotides downstream of the mature sequence to consider, default 5 | |
496 -tag [string] sample marks# eg. sampleA;sampleB;sampleC | |
497 -time sting #make directory time,default is the local time | |
498 -h help | |
499 USAGE | |
500 exit(1); | |
501 } | |
502 |