comparison quantify.pl @ 18:a79212816cbc draft

Uploaded
author big-tiandm
date Fri, 25 Jul 2014 05:22:21 -0400
parents
children 0c4e11018934
comparison
equal deleted inserted replaced
17:1131b4008650 18:a79212816cbc
1 #!/usr/bin/perl -w
2 #Filename:
3 #Author: Tian Dongmei
4 #Email: tiandm@big.ac.cn
5 #Date: 2013/7/19
6 #Modified:
7 #Description:
# Script version shown in the usage banner. Kept as a string: the numeric
# literal 1.00 would stringify as "1" and the banner would read "Version 1".
my $version = '1.00';
9
10 use File::Path;
11 use strict;
12 use File::Basename;
13 #use Getopt::Std;
14 use Getopt::Long;
15 use RNA;
16
# ---------------------------------------------------------------------------
# Command line handling, working-directory setup and data loading.
#
# Order matters here: the bowtie mapping must run before the *.bwt files are
# parsed, and the precursor sequences must be loaded before folding.
# ---------------------------------------------------------------------------
use File::Spec;    # rel2abs(): resolve user paths before we chdir away

my %opts;
GetOptions(\%opts,"r=s","p=s","m=s","mis:i","t:i","e:i","f:i","tag:s","o=s","time:s","h");
if (!(defined $opts{r} and defined $opts{p} and defined $opts{m} and defined $opts{o}) || defined $opts{h}) { #necessary arguments
    usage();
}

# All user-supplied paths are made absolute *now*, because the script later
# changes into its working directory and relative paths would then point to
# the wrong place.
my $read   = File::Spec->rel2abs($opts{'r'});
my $pre    = File::Spec->rel2abs($opts{'p'});
my $mature = File::Spec->rel2abs($opts{'m'});

# Output directory; always stored with a trailing "/" because later code
# builds file names by plain concatenation.
my $dir = File::Spec->rel2abs($opts{'o'});
unless ($dir =~ /\/$/) { $dir .= "/"; }
if (not -d $dir) {
    mkpath($dir);    # creates missing parents too; croaks on failure
}

my $threads  = defined $opts{'t'}   ? $opts{'t'}   : 1;
my $mismatch = defined $opts{'mis'} ? $opts{'mis'} : 0;

# Flanking nucleotides tolerated around the mature/star windows.
my $upstream   = 2;
my $downstream = 5;

$upstream   = $opts{'e'} if (defined $opts{'e'});
$downstream = $opts{'f'} if (defined $opts{'f'});

# Sample labels as a ";"-separated string; derived from the reads further
# down when -tag was not given.
my $marks = defined $opts{'tag'} ? $opts{'tag'} : "";

my $time = defined $opts{'time'} ? $opts{'time'} : Time();

# Per-run working directory that receives the bowtie index and alignments.
my $tmpdir = "${dir}miRNA_Express_${time}";
if (not -d $tmpdir) {
    mkdir($tmpdir) or die "Cannot create working directory '$tmpdir': $!\n";
}
chdir($tmpdir) or die "Cannot change into working directory '$tmpdir': $!\n";

# Local copy of the precursor FASTA; list-form system() avoids the shell, so
# paths containing spaces or metacharacters are safe.
system('cp', $pre, './') == 0
    or die "Cannot copy precursor file '$pre' into '$tmpdir'\n";
my $pre_file_name = basename($pre);

mapping(); # matures align to precursors && reads align to precursors;

my %pre_mature; # $pre_mature{pre_id}{mature_ID}{"mature"}[0]->start; $pre_mature{pre_id}{mature_ID}{"mature"}[1]->end;
maturePosOnPre(); # record mature positions on precursors

my %pre_read;
readPosOnPre(); # record read positions on precursors

# Without -tag, invent labels Smp1;Smp2;... from the number of per-sample
# counts found in a read ID of the form "...:<c1>_<c2>_..._x<total>".
# The first precursor whose first read ID parses decides the sample count.
if (!(defined $opts{'tag'})) {
    foreach my $key (keys %pre_read) {
        if (defined $pre_read{$key}[0][0]
            && $pre_read{$key}[0][0] =~ /:([\d|_]+)_x(\d+)$/) {
            my @ss = split /_/, $1;
            for (my $i = 1; $i <= @ss; $i++) {
                $marks .= "Smp$i;";
            }
            last;
        }
    }
}

my %pre;## read in precursor sequences #$pre{pre_id}="CGTA...."
attachPre();

my $preno = scalar(keys %pre);
print "Total Precursor Number is $preno !!!!\n";

my %struc; #mature star loop; $struc{$key}{"struc"}=$str; $struc{$key}{"mfe"}=$mfe;
structure();
83
84
##### analysis and print out && moRs
# Result files, written directly into the output directory ($dir already
# ends with "/").
my $aln  = $dir."known_microRNA_express.aln";
my $list = $dir."known_microRNA_express.txt";
my $moRs = $dir."known_microRNA_express.moRs";

# Expose the input FASTA files next to the results as hard links (what the
# former `ln` shell call did). The link() builtin avoids the shell, so paths
# with spaces or metacharacters are handled; failure is reported, not fatal.
link($mature, "${dir}known_microRNA_mature.fa")
    or warn "Cannot link '$mature' to '${dir}known_microRNA_mature.fa': $!\n";
link($pre, "${dir}known_microRNA_precursor.fa")
    or warn "Cannot link '$pre' to '${dir}known_microRNA_precursor.fa': $!\n";

# Bareword handles are kept because the code further down prints to
# ALN/LIST/MORS by name; the opens are now checked.
open(ALN,  '>', $aln)  or die "Cannot write '$aln': $!\n";
open(LIST, '>', $list) or die "Cannot write '$list': $!\n";
open(MORS, '>', $moRs) or die "Cannot write '$moRs': $!\n";

$"="\t"; ##### arrays interpolated in strings are joined with tabs from here on

my @marks = split /\;/, $marks;

# Header of the expression table: one mature/star/total column per sample.
print LIST "#matueID\tpreID\tpos1\tpos2";
foreach my $suffix ("_matureExp", "_starExp", "_totalExp") {
    foreach my $mark (@marks) {
        print LIST "\t", $mark, $suffix;
    }
}
print LIST "\n";
print ALN "#>precursor ID \n#precursor sequence\n#precursor structure (mfe)\n#RNA_seq\t@marks\ttotal\n";
print MORS "#>precursor ID\tstrand\texpress_reads\texpress_reads\/total_reads\tblock_number\tprecursor_sequence\n#\tblock_start\tblock_end\t@marks\ttotal\ttag_number\tsequence\n";

# Collected moRs candidates: summary line => list of block lines.
my %moRs;
115
# Per-precursor report. For every precursor this
#   1. writes its sequence, structure and aligned reads to ALN,
#   2. writes mature/star/total expression per sample to LIST,
#   3. groups reads into blocks and stores moRs candidates in %moRs.
# Keys are sorted so that the output order is reproducible between runs.
foreach my $key (sort keys %pre) {
    print ALN ">$key\n$pre{$key}\n$struc{$key}{struc} ($struc{$key}{mfe})\n";
    next if (!(exists $pre_read{$key}));

    # Read IDs are expected to end in ":<c1>_<c2>_..._x<total>". Reads whose
    # ID does not parse are dropped up front; otherwise stale regex captures
    # from a previous read would silently be counted again.
    my $read_id_re = qr/:([\d|_]+)_x(\d+)$/;
    my @array = grep { defined $_->[0] && $_->[0] =~ $read_id_re } @{$pre_read{$key}};
    if (@array < @{$pre_read{$key}}) {
        warn "Skipping ", scalar(@{$pre_read{$key}}) - scalar(@array),
             " read(s) with unparsable ID on precursor $key\n";
    }
    next unless @array;

    # Field 3 is the 0-based alignment offset, field 4 the read sequence.
    @array = sort { $a->[3] <=> $b->[3] } @array;

    my $length = length($pre{$key});

    my $maxline = -1; my $max = 0; ### line of the most abundant read
    my $totalReadsNo = 0;
    my @not_over = (); ### [pre, start, end, read ID, total, strand] records for the moRs analysis

    #### print out Aln file start
    for (my $i = 0; $i < @array; $i++) {
        my $maps = $array[$i][3] + 1;
        my $mape = $array[$i][3] + length($array[$i][4]);

        # Pad the read with dots so it lines up under the precursor sequence.
        my $row = ("." x ($maps - 1)) . $array[$i][4] . ("." x ($length - $mape)) . " ";

        my ($counts, $total) = $array[$i][0] =~ $read_id_re;
        my @sample = split /_/, $counts;
        print ALN $row, join("\t", @sample), "\t", $total, "\n";

        if ($total > $max) { $max = $total; $maxline = $i; }
        $totalReadsNo += $total;

        push @not_over, [$key, $maps, $mape, $array[$i][0], $total, "+"];
    }
    #### print out Aln file end

    #### express list start
    my ($ms, $me, $ss, $se);
    if (!(exists($pre_mature{$key}))) {
        # No annotated mature: treat the most abundant read as the mature
        # and derive the star from the predicted structure.
        $ms = $array[$maxline][3] + 1;
        $me = $array[$maxline][3] + length($array[$maxline][4]);
        ($ss, $se) = other_pair($ms, $me, $struc{$key}{'struc'});

        my ($mexp, $sexp, $texp) = express($ms-$upstream, $me+$downstream, $ss-$upstream, $se+$downstream, \@array);
        print LIST join("\t", $key, $key, "mature:$ms..$me", "star:$ss..$se",
                        join("\t", @$mexp), join("\t", @$sexp), join("\t", @$texp)), "\n";
    }
    else {
        foreach my $maID (sort keys %{$pre_mature{$key}}) {
            $ms = $pre_mature{$key}{$maID}{"mature"}[0];
            $me = $pre_mature{$key}{$maID}{"mature"}[1];
            $ss = $pre_mature{$key}{$maID}{"star"}[0];
            $se = $pre_mature{$key}{$maID}{"star"}[1];
            my ($mexp, $sexp, $texp) = express($ms-$upstream, $me+$downstream, $ss-$upstream, $se+$downstream, \@array);
            print LIST join("\t", $maID, $key, "mature:$ms..$me", "star:$ss..$se",
                            join("\t", @$mexp), join("\t", @$sexp), join("\t", @$texp)), "\n";
        }
    }
    #### express list end

    #### analysis moRs start
    my @result;      ### one formatted line per accepted block
    my $m_texp = 0;  ### reads covered by all accepted blocks

    # Each round takes the most abundant remaining read as the seed of a
    # block, absorbs similar reads, and defers non-overlapping reads to the
    # next round. The seed is always consumed, so the loop terminates.
    while (@not_over > 0) {
        my @over = @not_over;
        @not_over = ();

        # most abundant tag
        my $m_max = 0;
        my $m_maxline = -1;
        for (my $i = 0; $i < @over; $i++) {
            if ($m_max < $over[$i][4]) {
                $m_max = $over[$i][4];
                $m_maxline = $i;
            }
        }
        my $m_start = $over[$m_maxline][1];
        my $m_end   = $over[$m_maxline][2];
        my $m_exp   = $m_max;  # total reads in the block
        my $m_no    = 1;       # number of distinct tags in the block
        my ($seed_counts) = $over[$m_maxline][3] =~ $read_id_re;
        my @m_exp = split /_/, $seed_counts;  # per-sample reads in the block

        # count reads anchored on the most abundant tag: both ends may
        # differ from it by at most 3 nt
        for (my $i = 0; $i < @over; $i++) {
            next if ($i == $m_maxline);
            my ($r_start, $r_end, $r_id, $r_total) = @{$over[$i]}[1 .. 4];
            if (abs($r_start - $m_start) <= 3 && abs($r_end - $m_end) <= 3) {
                $m_exp += $r_total;
                $m_no++;
                my ($r_counts) = $r_id =~ $read_id_re;
                my @m_nums = split /_/, $r_counts;
                for (my $j = 0; $j < @m_nums; $j++) {
                    $m_exp[$j] += $m_nums[$j];
                }
            }
            elsif ($r_start >= $m_end || $r_end <= $m_start) {
                # keep for a later round; reads that span the block
                # boundary are discarded
                push @not_over, [@{$over[$i]}];
            }
        }

        if ($m_exp > 5) { ### a block needs more than 5 reads
            $m_texp += $m_exp;
            my $string = subseq($pre{$key}, $m_start, $m_end, "+");
            push @result, join("\t", "", $m_start, $m_end, join("\t", @m_exp), $m_exp, $m_no, $string);
        }
    }

    # Fraction of this precursor's reads explained by the blocks; guarded
    # because a precursor whose reads all have a zero total would otherwise
    # divide by zero.
    my $percent = sprintf("%.2f", $totalReadsNo ? $m_texp / $totalReadsNo : 0);
    my $summary = ">$key\t+\t$m_texp\t$percent\t" . scalar(@result) . "\t$pre{$key}";
    @{$moRs{$summary}} = @result;
    #### analysis moRs end
}
229
##### moRs print out start
# A precursor is reported when it has more than two blocks, the blocks cover
# at least 95% of its reads, and at least one pair of blocks lies 0..3 nt or
# 18..25 nt apart (start of one block minus end of the other).
# Summary-line fields: [3] = covered fraction, [4] = number of blocks.
# Block-line fields (after the leading tab): [1] = start, [2] = end.
foreach my $key (sort keys %moRs) {
    my @tmp = split /\t/, $key;
    next if ($tmp[4] <= 2);
    next if ($tmp[3] < 0.95);

    my @blocks = map { [ split /\t/, $_ ] } @{$moRs{$key}};

    # The spacing test is symmetric in the two blocks, so each unordered
    # pair is checked once and the search stops at the first hit.
    my $has_spaced_pair = 0;
    PAIR:
    for (my $i = 0; $i < @blocks; $i++) {
        for (my $j = $i + 1; $j < @blocks; $j++) {
            foreach my $gap ($blocks[$j][1] - $blocks[$i][2],
                             $blocks[$i][1] - $blocks[$j][2]) {
                if (($gap >= 0 && $gap <= 3) || ($gap >= 18 && $gap <= 25)) {
                    $has_spaced_pair = 1;
                    last PAIR;
                }
            }
        }
    }

    if ($has_spaced_pair) {
        print MORS "$key\n";
        foreach my $block_line (@{$moRs{$key}}) {
            print MORS "$block_line\n";
        }
    }
}
###### moRs print out end

# Buffered write errors (e.g. disk full) only surface at close time, so the
# result of close is checked for every output file.
close(ALN)  or die "Error closing alignment output: $!\n";
close(LIST) or die "Error closing expression list output: $!\n";
close(MORS) or die "Error closing moRs output: $!\n";

$"=" ";##### reset
259
260
261 ################### Sub programs #################
# express($ms, $me, $ss, $se, $read)
#
# Sums per-sample read counts for one precursor.
#
#   $ms,$me  1-based inclusive window of the mature sequence
#   $ss,$se  1-based inclusive window of the star sequence
#   $read    array ref of alignment records; field 0 is the read ID ending
#            in ":<c1>_<c2>_..._x<total>", field 3 the 0-based offset on
#            the precursor and field 4 the read sequence
#
# Returns three array refs (mature, star, total), one count per sample.
# A read counts towards a window only when it lies completely inside it;
# every parsable read counts towards the total.
#
# Reads whose ID does not match the expected pattern are skipped: the old
# code kept using the captures of the previous read for them, and it also
# autovivified a bogus record when called with an empty list.
sub express {
    my ($ms, $me, $ss, $se, $read) = @_;
    my (@mexp, @sexp, @texp);

    foreach my $hit (@{$read}) {
        next unless defined $hit->[0] && $hit->[0] =~ /:([_|\d]+)_x(\d+)$/;
        my @nums = split /_/, $1;

        my $start = $hit->[3] + 1;
        my $end   = $hit->[3] + length($hit->[4]);
        my $in_mature = ($start >= $ms && $end <= $me);
        my $in_star   = ($start >= $ss && $end <= $se);

        for my $j (0 .. $#nums) {
            # Keep the three result arrays the same length, zero-filled.
            foreach my $sums (\@mexp, \@sexp, \@texp) {
                $sums->[$j] = 0 unless defined $sums->[$j];
            }
            $texp[$j] += $nums[$j];
            $mexp[$j] += $nums[$j] if $in_mature;
            $sexp[$j] += $nums[$j] if $in_star;
        }
    }
    return (\@mexp, \@sexp, \@texp);
}
296
# structure()
#
# Folds every precursor in %pre with RNA::fold and stores the result in
# %struc ($struc{id}{struc} = structure string, $struc{id}{mfe} = energy
# with two decimals). For precursors with annotated matures it also derives
# the star positions via other_pair() and stores them in
# $pre_mature{pre}{mature}{"star"}[0,1].
#
# Previously only precursors present in %pre_mature were folded, although
# the main loop prints the structure of every precursor and needs it to
# locate the star of precursors that have no annotated mature.
#
# Dies when a precursor referenced by a mature alignment has no sequence.
sub structure {
    foreach my $key (keys %pre_mature) {
        if (!(defined $pre{$key})) { die "!!!!! No precursor sequence $key, please check it!\n"; }
    }

    foreach my $key (keys %pre) {
        my ($str, $mfe) = RNA::fold($pre{$key});
        $struc{$key}{"struc"} = $str;
        $struc{$key}{"mfe"}   = sprintf("%.2f", $mfe);

        # Must not autovivify $pre_mature{$key}: the main loop uses its
        # existence to tell annotated precursors from unannotated ones.
        next unless exists $pre_mature{$key};

        foreach my $id (keys %{$pre_mature{$key}}) {
            my $mature = $pre_mature{$key}{$id}{"mature"};
            ($pre_mature{$key}{$id}{"star"}[0], $pre_mature{$key}{$id}{"star"}[1])
                = other_pair($mature->[0], $mature->[1], $str);
        }
    }
}
# other_pair($start, $end, $structure)
#
# Given a 1-based inclusive region and a dot-bracket structure string,
# returns the (start, end) of the region on the opposite strand of the
# hairpin. The first paired base inside the region is the anchor: the
# opposite region ends 2 nt past the anchor's partner (shifted by the number
# of unpaired bases skipped before the anchor) and has the same length as
# the input region. Results are clamped to 1 .. length of the structure.
# Returns (undef, undef) when no base in the region is paired.
sub other_pair {
    my ($start, $end, $structure) = @_;

    # Two-way lookup of base-pair partners, positions counted from 1.
    my %partner_of;
    my @open_stack;
    my $position = 0;
    foreach my $symbol (split //, $structure) {
        $position++;
        if ($symbol eq "(") {
            push @open_stack, $position;
        }
        elsif ($symbol eq ")") {
            my $opening = pop @open_stack;
            $partner_of{$position} = $opening;
            $partner_of{$opening}  = $position;
        }
    }

    # Walk the region until the first paired base is found.
    my ($startpos, $endpos);
    my $unpaired = 0;
    my $i = $start;
    while ($i <= $end) {
        if (defined $partner_of{$i}) {
            my $anchor = $partner_of{$i} + $unpaired + 2;
            my $limit  = length($structure);
            $endpos    = $anchor > $limit ? $limit : $anchor;
            my $first  = $anchor - ($end - $start);
            $startpos  = $first < 1 ? 1 : $first;
            last;
        }
        $unpaired++;
        $i++;
    }
    return ($startpos, $endpos);
}
# attachPre()
#
# Loads the precursor FASTA file ($pre_file_name, in the current directory)
# into %pre: the key is the first whitespace-delimited word of the header
# line, the value is the sequence with all its lines concatenated.
# Dies when the file cannot be opened; text before the first header line is
# ignored instead of being stored under an undefined key.
sub attachPre {
    open(my $in, '<', $pre_file_name)
        or die "Cannot open precursor file '$pre_file_name': $!\n";
    my $name;
    while (my $aline = <$in>) {
        chomp $aline;
        if ($aline =~ /^>(\S+)/) {
            $name = $1;
            next;
        }
        next unless defined $name;
        $pre{$name} .= $aline;
    }
    close $in;
}
# readPosOnPre()
#
# Parses read_mapped.bwt (tab-separated read alignments written by
# mapping()) and appends every record, as an array ref of its fields, to
# @{$pre_read{precursor}}, where the precursor ID is the third field.
# Dies when the file cannot be opened, because an absent file means the
# mapping step failed.
sub readPosOnPre {
    my $file = "read_mapped.bwt";
    open(my $in, '<', $file) or die "Cannot open read alignments '$file': $!\n";
    while (my $aline = <$in>) {
        chomp $aline;
        my @tmp = split /\t/, $aline;
        push @{$pre_read{$tmp[2]}}, [@tmp];
    }
    close $in;
}
# maturePosOnPre()
#
# Parses mature_mapped.bwt (tab-separated alignments of mature sequences to
# precursors written by mapping()) and records, for each accepted
# mature/precursor pair, the 1-based inclusive position of the mature on
# the precursor:
#   $pre_mature{pre}{mature}{"mature"}[0] = start
#   $pre_mature{pre}{mature}{"mature"}[1] = end
# Field 0 is the mature ID, field 2 the precursor ID, field 3 the 0-based
# offset and field 4 the aligned sequence.
# Dies when the file cannot be opened.
sub maturePosOnPre {
    my $file = "mature_mapped.bwt";
    open(my $in, '<', $file) or die "Cannot open mature alignments '$file': $!\n";
    while (my $aline = <$in>) {
        chomp $aline;
        my @tmp = split /\t/, $aline;
        my $mm = lc($tmp[0]);
        my $pm = lc($tmp[2]);

        # Accept a pair only when the mature name contains the precursor
        # name (case-insensitive). This is a literal substring test: the
        # precursor ID used to be interpolated into a regex unescaped, so
        # characters such as "." or "+" in an ID changed the match.
        next if (index($mm, $pm) < 0);

        $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[0] = $tmp[3] + 1;
        $pre_mature{$tmp[2]}{$tmp[0]}{"mature"}[1] = $tmp[3] + length($tmp[4]);
    }
    close $in;
}
# mapping()
#
# Builds a bowtie index from the local precursor copy and aligns
#   - the mature sequences (no mismatches)    -> mature_mapped.bwt
#   - the reads ($mismatch mismatches at most) -> read_mapped.bwt,
#     plus mirbase_mapped.fa / mirbase_not_mapped.fa
# in the current directory. Dies when a command cannot be started or exits
# with a non-zero status; previously failures were ignored and only showed
# up later as missing or empty alignment files.
sub mapping {
    ## build bowtie index
    print STDERR "building bowtie index\n";
    _run_quiet('bowtie-build', $pre_file_name, 'miRNA_precursor');

    ## map mature sequences against precursors
    print STDERR "mapping mature sequences against index\n";
    _run_quiet('bowtie', '-p', $threads, '-f', '-v', '0', '-a', '--best', '--strata', '--norc',
               'miRNA_precursor', $mature, 'mature_mapped.bwt');

    ## map reads against precursors
    print STDERR "mapping read sequences against index\n";
    _run_quiet('bowtie', '-p', $threads, '-f', '-v', $mismatch, '-a', '--best', '--strata', '--norc',
               'miRNA_precursor', $read,
               '--al', 'mirbase_mapped.fa', '--un', 'mirbase_not_mapped.fa', 'read_mapped.bwt');
}

# Runs a command without a shell (arguments are passed verbatim, so paths
# with spaces or metacharacters are safe), swallows its standard output as
# the former backticks did, and dies on failure. Returns the captured output.
sub _run_quiet {
    my @command = @_;
    open(my $pipe, '-|', @command)
        or die "Cannot run '$command[0]': $!\n";
    my $output = do { local $/; <$pipe> };
    # close() on a pipe waits for the child and is false on non-zero exit.
    unless (close $pipe) {
        die "Command '" . join(' ', @command) . "' failed"
            . ($! ? ": $!" : " with exit status " . ($? >> 8)) . "\n";
    }
    return $output;
}
437
# subseq($seq, $beg, $end, $strand)
#
# Returns the upper-cased subsequence of $seq from $beg to $end (1-based,
# inclusive). For strand "-" the reverse complement of that stretch is
# returned instead.
sub subseq {
    my ($seq, $beg, $end, $strand) = @_;

    my $span   = $end - $beg + 1;
    my $subseq = substr($seq, $beg - 1, $span);
    $subseq = revcom($subseq) if $strand eq "-";

    return uc $subseq;
}
450
# revcom($seq)
#
# Returns the upper-cased reverse complement of a DNA sequence. Only
# A, T, C and G (either case) are complemented; any other character is
# kept as it is.
sub revcom {
    my ($seq) = @_;
    my $flipped = scalar reverse $seq;
    $flipped =~ tr/ATCGatcg/TAGCtagc/;
    return uc $flipped;
}
457
# Time()
#
# Returns the current local time as "YYYY-MM-DD-hh-mm-ss"; month, day,
# hour, minute and second are zero-padded to two digits.
sub Time {
    my @now = localtime(time());
    #print "$year-$month-$day $hour:$min:$sec\n";
    return sprintf("%d-%02d-%02d-%02d-%02d-%02d",
                   $now[5] + 1900, $now[4] + 1, $now[3], $now[2], $now[1], $now[0]);
}
471
# usage()
#
# Prints the version and the command line help to standard output, then
# terminates the script with exit status 1.
sub usage {
    my $help_text = <<"USAGE";
Version $version
Usage:
$0 -r -p -m -mis -t -e -f -tag -o -time
mandatory parameters:
-p precursor.fa miRNA precursor sequences from miRBase # must be absolute path
-m mature.fa miRNA sequences from miRBase # must be absolute path
-r reads.fa your read sequences #must be absolute path

-o output directory

options:
-mis [int] number of allowed mismatches when mapping reads to precursors, default 0
-t [int] threads number,default 1
-e [int] number of nucleotides upstream of the mature sequence to consider, default 2
-f [int] number of nucleotides downstream of the mature sequence to consider, default 5
-tag [string] sample marks# eg. sampleA;sampleB;sampleC
-time sting #make directory time,default is the local time
-h help
USAGE
    print $help_text;
    exit(1);
}
495