Mercurial > repos > big-tiandm > mirplant2
comparison miRDeep_plant.pl @ 44:0c4e11018934 draft
Uploaded
author | big-tiandm |
---|---|
date | Thu, 30 Oct 2014 21:29:19 -0400 |
parents | dc5a29826c7d |
children | ca05d68aca13 |
comparison
equal
deleted
inserted
replaced
43:4c0b1a94b882 | 44:0c4e11018934 |
---|---|
1 #!/usr/bin/perl | 1 #!/usr/bin/perl |
2 | 2 |
3 use warnings; | 3 use warnings; |
4 use strict; | 4 use strict; |
5 use Getopt::Std; | 5 use Getopt::Std; |
6 | 6 use RNA; |
7 | 7 |
8 | 8 |
9 ################################# MIRDEEP ################################################# | 9 ################################# MIRDEEP ################################################# |
10 | 10 |
11 ################################## USAGE ################################################## | 11 ################################## USAGE ################################################## |
123 #if conservation is scored, the fasta file of known miRNA sequences is parsed | 123 #if conservation is scored, the fasta file of known miRNA sequences is parsed |
124 if($options{s}){create_hash_nuclei($options{s})}; | 124 if($options{s}){create_hash_nuclei($options{s})}; |
125 | 125 |
126 #parse signature file in blast_parsed format and resolve each potential precursor | 126 #parse signature file in blast_parsed format and resolve each potential precursor |
127 parse_file_blast_parsed($file_blast_parsed); | 127 parse_file_blast_parsed($file_blast_parsed); |
128 `rm -rf $tmpdir`; | 128 #`rm -rf $tmpdir`; |
129 exit; | 129 exit; |
130 | 130 |
131 | 131 |
132 | 132 |
133 | 133 |
358 sub test_randfold{ | 358 sub test_randfold{ |
359 | 359 |
360 #print sequence to temporary file, test randfold value, return 1 or 0 | 360 #print sequence to temporary file, test randfold value, return 1 or 0 |
361 | 361 |
362 # print_file("pri_seq.fa",">pri_seq\n".$hash_comp{"pri_seq"}); | 362 # print_file("pri_seq.fa",">pri_seq\n".$hash_comp{"pri_seq"}); |
363 my $tmpfile=$tmpdir.$hash_comp{"pri_id"}; | 363 #my $tmpfile=$tmpdir.$hash_comp{"pri_id"}; |
364 open(FILE, ">$tmpfile"); | 364 #open(FILE, ">$tmpfile"); |
365 print FILE ">pri_seq\n",$hash_comp{"pri_seq"}; | 365 #print FILE ">pri_seq\n",$hash_comp{"pri_seq"}; |
366 close FILE; | 366 #close FILE; |
367 | 367 |
368 # my $p_value=`randfold -s $tmpfile 999 | cut -f 3`; | 368 # my $p_value=`randfold -s $tmpfile 999 | cut -f 3`; |
369 my $p1=`randfold -s $tmpfile 999 | cut -f 3`; | 369 #my $p1=`randfold -s $tmpfile 999 | cut -f 3`; |
370 my $p2=`randfold -s $tmpfile 999 | cut -f 3`; | 370 #my $p2=`randfold -s $tmpfile 999 | cut -f 3`; |
371 my $p1=&randfold_pvalue($hash_comp{"pri_seq"},999); | |
372 my $p2=&randfold_pvalue($hash_comp{"pri_seq"},999); | |
371 my $p_value=($p1+$p2)/2; | 373 my $p_value=($p1+$p2)/2; |
372 wait; | 374 wait; |
373 # system "rm $tmpfile"; | 375 # system "rm $tmpfile"; |
374 | 376 |
375 if($p_value<=0.05){return 1;} | 377 if($p_value<=0.05){return 1;} |
376 | 378 |
377 return 0; | 379 return 0; |
378 } | 380 } |
379 | 381 |
380 | 382 sub randfold_pvalue{ |
381 #sub print_file{ | 383 my $cpt_sup = 0; |
382 | 384 my $cpt_inf = 0; |
383 #print string to file | 385 my $cpt_ega = 1; |
384 | 386 |
385 # my($file,$string)=@_; | 387 my ($seq,$number_of_randomizations)=@_; |
386 | 388 my $str =$seq; |
387 # open(FILE, ">$file"); | 389 my $mfe = RNA::fold($seq,$str); |
388 # print FILE "$string"; | 390 |
389 # close FILE; | 391 for (my $i=0;$i<$number_of_randomizations;$i++) { |
390 #} | 392 $seq = shuffle_sequence_dinucleotide($seq); |
391 | 393 $str = $seq; |
394 | |
395 my $rand_mfe = RNA::fold($str,$str); | |
396 | |
397 if ($rand_mfe < $mfe) { | |
398 $cpt_inf++; | |
399 } | |
400 if ($rand_mfe == $mfe) { | |
401 $cpt_ega++; | |
402 } | |
403 if ($rand_mfe > $mfe) { | |
404 $cpt_sup++; | |
405 } | |
406 } | |
407 | |
408 my $proba = ($cpt_ega + $cpt_inf) / ($number_of_randomizations + 1); | |
409 | |
410 #print "$name\t$mfe\t$proba\n"; | |
411 return $proba; | |
412 } | |
413 | |
414 sub shuffle_sequence_dinucleotide { | |
415 | |
416 my ($str) = @_; | |
417 | |
418 # upper case and convert to ATGC | |
419 $str = uc($str); | |
420 $str =~ s/U/T/g; | |
421 | |
422 my @nuc = ('A','T','G','C'); | |
423 my $count_swap = 0; | |
424 # set maximum number of permutations | |
425 my $stop = length($str) * 10; | |
426 | |
427 while($count_swap < $stop) { | |
428 | |
429 my @pos; | |
430 | |
431 # look start and end letters | |
432 my $firstnuc = $nuc[int(rand 4)]; | |
433 my $thirdnuc = $nuc[int(rand 4)]; | |
434 | |
435 # get positions for matching nucleotides | |
436 for (my $i=0;$i<(length($str)-2);$i++) { | |
437 if ((substr($str,$i,1) eq $firstnuc) && (substr($str,$i+2,1) eq $thirdnuc)) { | |
438 push (@pos,($i+1)); | |
439 $i++; | |
440 } | |
441 } | |
442 | |
443 # swap at random trinucleotides | |
444 my $max = scalar(@pos); | |
445 for (my $i=0;$i<$max;$i++) { | |
446 my $swap = int(rand($max)); | |
447 if ((abs($pos[$swap] - $pos[$i]) >= 3) && (substr($str,$pos[$i],1) ne substr($str,$pos[$swap],1))) { | |
448 $count_swap++; | |
449 my $w1 = substr($str,$pos[$i],1); | |
450 my $w2 = substr($str,$pos[$swap],1); | |
451 substr($str,$pos[$i],1,$w2); | |
452 substr($str,$pos[$swap],1,$w1); | |
453 } | |
454 } | |
455 } | |
456 return($str); | |
457 } | |
392 | 458 |
393 sub test_nucleus_conservation{ | 459 sub test_nucleus_conservation{ |
394 | 460 |
395 #test if nucleus is identical to nucleus from known miRNA, return 1 or 0 | 461 #test if nucleus is identical to nucleus from known miRNA, return 1 or 0 |
396 | 462 |
1277 | 1343 |
1278 if($query=~/x(\d+)/i){ | 1344 if($query=~/x(\d+)/i){ |
1279 my $freq=$1; | 1345 my $freq=$1; |
1280 return $freq; | 1346 return $freq; |
1281 }else{ | 1347 }else{ |
1282 print STDERR "Problem with read format\n"; | 1348 #print STDERR "Problem with read format\n"; |
1283 return 0; | 1349 return 0; |
1284 } | 1350 } |
1285 } | 1351 } |
1286 | 1352 |
1287 | 1353 |
1495 my $mfe_adj=max2(1,-$mfe); | 1561 my $mfe_adj=max2(1,-$mfe); |
1496 my $mfe_adj1=$mfe/$mlng; | 1562 my $mfe_adj1=$mfe/$mlng; |
1497 #parameters of known precursors and background hairpins, scale and location | 1563 #parameters of known precursors and background hairpins, scale and location |
1498 my $a=1.339e-12;my $b=2.778e-13;my $c=45.834; | 1564 my $a=1.339e-12;my $b=2.778e-13;my $c=45.834; |
1499 my $ev=$e**($mfe_adj1*$c); | 1565 my $ev=$e**($mfe_adj1*$c); |
1500 print STDERR "\n***",$ev,"**\t",$ev+$b,"\t"; | 1566 #print STDERR "\n***",$ev,"**\t",$ev+$b,"\t"; |
1501 my $log_odds=($a/($b+$ev)); | 1567 my $log_odds=($a/($b+$ev)); |
1502 | 1568 |
1503 | 1569 |
1504 my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32); | 1570 my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32); |
1505 my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23); | 1571 my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23); |
1506 | 1572 |
1507 my $odds=$prob_test/$prob_background; | 1573 my $odds=$prob_test/$prob_background; |
1508 my $log_odds_2=log($odds); | 1574 my $log_odds_2=log($odds); |
1509 print STDERR "log_odds :",$log_odds,"\t",$log_odds_2,"\n"; | 1575 #print STDERR "log_odds :",$log_odds,"\t",$log_odds_2,"\n"; |
1510 return $log_odds; | 1576 return $log_odds; |
1511 } | 1577 } |
1512 | 1578 |
1513 | 1579 |
1514 | 1580 |