comparison miRDeep_plant.pl @ 44:0c4e11018934 draft

Uploaded
author big-tiandm
date Thu, 30 Oct 2014 21:29:19 -0400
parents dc5a29826c7d
children ca05d68aca13
comparison
equal deleted inserted replaced
43:4c0b1a94b882 44:0c4e11018934
1 #!/usr/bin/perl 1 #!/usr/bin/perl
2 2
3 use warnings; 3 use warnings;
4 use strict; 4 use strict;
5 use Getopt::Std; 5 use Getopt::Std;
6 6 use RNA;
7 7
8 8
9 ################################# MIRDEEP ################################################# 9 ################################# MIRDEEP #################################################
10 10
11 ################################## USAGE ################################################## 11 ################################## USAGE ##################################################
123 #if conservation is scored, the fasta file of known miRNA sequences is parsed 123 #if conservation is scored, the fasta file of known miRNA sequences is parsed
124 if($options{s}){create_hash_nuclei($options{s})}; 124 if($options{s}){create_hash_nuclei($options{s})};
125 125
126 #parse signature file in blast_parsed format and resolve each potential precursor 126 #parse signature file in blast_parsed format and resolve each potential precursor
127 parse_file_blast_parsed($file_blast_parsed); 127 parse_file_blast_parsed($file_blast_parsed);
128 `rm -rf $tmpdir`; 128 #`rm -rf $tmpdir`;
129 exit; 129 exit;
130 130
131 131
132 132
133 133
358 sub test_randfold{ 358 sub test_randfold{
359 359
360 #print sequence to temporary file, test randfold value, return 1 or 0 360 #print sequence to temporary file, test randfold value, return 1 or 0
361 361
362 # print_file("pri_seq.fa",">pri_seq\n".$hash_comp{"pri_seq"}); 362 # print_file("pri_seq.fa",">pri_seq\n".$hash_comp{"pri_seq"});
363 my $tmpfile=$tmpdir.$hash_comp{"pri_id"}; 363 #my $tmpfile=$tmpdir.$hash_comp{"pri_id"};
364 open(FILE, ">$tmpfile"); 364 #open(FILE, ">$tmpfile");
365 print FILE ">pri_seq\n",$hash_comp{"pri_seq"}; 365 #print FILE ">pri_seq\n",$hash_comp{"pri_seq"};
366 close FILE; 366 #close FILE;
367 367
368 # my $p_value=`randfold -s $tmpfile 999 | cut -f 3`; 368 # my $p_value=`randfold -s $tmpfile 999 | cut -f 3`;
369 my $p1=`randfold -s $tmpfile 999 | cut -f 3`; 369 #my $p1=`randfold -s $tmpfile 999 | cut -f 3`;
370 my $p2=`randfold -s $tmpfile 999 | cut -f 3`; 370 #my $p2=`randfold -s $tmpfile 999 | cut -f 3`;
371 my $p1=&randfold_pvalue($hash_comp{"pri_seq"},999);
372 my $p2=&randfold_pvalue($hash_comp{"pri_seq"},999);
371 my $p_value=($p1+$p2)/2; 373 my $p_value=($p1+$p2)/2;
372 wait; 374 wait;
373 # system "rm $tmpfile"; 375 # system "rm $tmpfile";
374 376
375 if($p_value<=0.05){return 1;} 377 if($p_value<=0.05){return 1;}
376 378
377 return 0; 379 return 0;
378 } 380 }
379 381
380 382 sub randfold_pvalue{
381 #sub print_file{ 383 my $cpt_sup = 0;
382 384 my $cpt_inf = 0;
383 #print string to file 385 my $cpt_ega = 1;
384 386
385 # my($file,$string)=@_; 387 my ($seq,$number_of_randomizations)=@_;
386 388 my $str =$seq;
387 # open(FILE, ">$file"); 389 my $mfe = RNA::fold($seq,$str);
388 # print FILE "$string"; 390
389 # close FILE; 391 for (my $i=0;$i<$number_of_randomizations;$i++) {
390 #} 392 $seq = shuffle_sequence_dinucleotide($seq);
391 393 $str = $seq;
394
395 my $rand_mfe = RNA::fold($str,$str);
396
397 if ($rand_mfe < $mfe) {
398 $cpt_inf++;
399 }
400 if ($rand_mfe == $mfe) {
401 $cpt_ega++;
402 }
403 if ($rand_mfe > $mfe) {
404 $cpt_sup++;
405 }
406 }
407
408 my $proba = ($cpt_ega + $cpt_inf) / ($number_of_randomizations + 1);
409
410 #print "$name\t$mfe\t$proba\n";
411 return $proba;
412 }
413
414 sub shuffle_sequence_dinucleotide {
415
416 my ($str) = @_;
417
418 # upper case and convert to ATGC
419 $str = uc($str);
420 $str =~ s/U/T/g;
421
422 my @nuc = ('A','T','G','C');
423 my $count_swap = 0;
424 # set maximum number of permutations
425 my $stop = length($str) * 10;
426
427 while($count_swap < $stop) {
428
429 my @pos;
430
431 # look start and end letters
432 my $firstnuc = $nuc[int(rand 4)];
433 my $thirdnuc = $nuc[int(rand 4)];
434
435 # get positions for matching nucleotides
436 for (my $i=0;$i<(length($str)-2);$i++) {
437 if ((substr($str,$i,1) eq $firstnuc) && (substr($str,$i+2,1) eq $thirdnuc)) {
438 push (@pos,($i+1));
439 $i++;
440 }
441 }
442
443 # swap at random trinucleotides
444 my $max = scalar(@pos);
445 for (my $i=0;$i<$max;$i++) {
446 my $swap = int(rand($max));
447 if ((abs($pos[$swap] - $pos[$i]) >= 3) && (substr($str,$pos[$i],1) ne substr($str,$pos[$swap],1))) {
448 $count_swap++;
449 my $w1 = substr($str,$pos[$i],1);
450 my $w2 = substr($str,$pos[$swap],1);
451 substr($str,$pos[$i],1,$w2);
452 substr($str,$pos[$swap],1,$w1);
453 }
454 }
455 }
456 return($str);
457 }
392 458
393 sub test_nucleus_conservation{ 459 sub test_nucleus_conservation{
394 460
395 #test if nucleus is identical to nucleus from known miRNA, return 1 or 0 461 #test if nucleus is identical to nucleus from known miRNA, return 1 or 0
396 462
1277 1343
1278 if($query=~/x(\d+)/i){ 1344 if($query=~/x(\d+)/i){
1279 my $freq=$1; 1345 my $freq=$1;
1280 return $freq; 1346 return $freq;
1281 }else{ 1347 }else{
1282 print STDERR "Problem with read format\n"; 1348 #print STDERR "Problem with read format\n";
1283 return 0; 1349 return 0;
1284 } 1350 }
1285 } 1351 }
1286 1352
1287 1353
1495 my $mfe_adj=max2(1,-$mfe); 1561 my $mfe_adj=max2(1,-$mfe);
1496 my $mfe_adj1=$mfe/$mlng; 1562 my $mfe_adj1=$mfe/$mlng;
1497 #parameters of known precursors and background hairpins, scale and location 1563 #parameters of known precursors and background hairpins, scale and location
1498 my $a=1.339e-12;my $b=2.778e-13;my $c=45.834; 1564 my $a=1.339e-12;my $b=2.778e-13;my $c=45.834;
1499 my $ev=$e**($mfe_adj1*$c); 1565 my $ev=$e**($mfe_adj1*$c);
1500 print STDERR "\n***",$ev,"**\t",$ev+$b,"\t"; 1566 #print STDERR "\n***",$ev,"**\t",$ev+$b,"\t";
1501 my $log_odds=($a/($b+$ev)); 1567 my $log_odds=($a/($b+$ev));
1502 1568
1503 1569
1504 my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32); 1570 my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32);
1505 my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23); 1571 my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23);
1506 1572
1507 my $odds=$prob_test/$prob_background; 1573 my $odds=$prob_test/$prob_background;
1508 my $log_odds_2=log($odds); 1574 my $log_odds_2=log($odds);
1509 print STDERR "log_odds :",$log_odds,"\t",$log_odds_2,"\n"; 1575 #print STDERR "log_odds :",$log_odds,"\t",$log_odds_2,"\n";
1510 return $log_odds; 1576 return $log_odds;
1511 } 1577 }
1512 1578
1513 1579
1514 1580