Mercurial > repos > big-tiandm > mirplant2

#!/usr/bin/perl

use warnings;
use strict;
use Getopt::Std;
#use RNA;


################################# MIRDEEP #################################################

################################## USAGE ##################################################


my $usage=
"$0 file_signature file_structure temp_out_directory

This is the core algorithm of miRDeep. It takes as input a file in blastparsed format with
information on the positions of reads aligned to potential precursor sequences (signature).
It also takes as input an RNAfold output file, giving information on the sequence, structure
and mimimum free energy of the potential precursor sequences.

Extra arguments can be given. -s specifies a fastafile containing the known mature miRNA
sequences that should be considered for conservation purposes. -t prints out the potential
precursor sequences that do _not_ exceed the cut-off (default prints out the sequences that
exceeds the cut-off). -u gives limited output, that is only the ids of the potential precursors
that exceed the cut-off. -v varies the cut-off. -x is a sensitive option for Sanger sequences
obtained through conventional cloning. -z consider the number of base pairings in the lower
stems (this option is not well tested).

-h print this usage
-s fasta file with known miRNAs
#-o temp directory ,maked befor running the program.
-t print filtered
-u limited output (only ids)
-v cut-off (default 1)
-x sensitive option for Sanger sequences
-y use Randfold
-z consider Drosha processing
";


############################################################################################

################################### INPUT ##################################################


#signature file in blast_parsed format
my $file_blast_parsed=shift or die $usage;

#structure file outputted from RNAfold
my $file_struct=shift or die $usage;

my $tmpdir=shift or die $usage;
#options
my %options=();
getopts("hs:tuv:xyz",\%options);


#############################################################################################

############################# GLOBAL VARIABLES ##############################################


#parameters
my $nucleus_lng=11;

my $score_star=3.9;
my $score_star_not=-1.3;
my $score_nucleus=7.63;
my $score_nucleus_not=-1.17;
my $score_randfold=1.37;
my $score_randfold_not=-3.624;
my $score_intercept=0.3;
my @scores_stem=(-3.1,-2.3,-2.2,-1.6,-1.5,0.1,0.6,0.8,0.9,0.9,0);
my $score_min=1;
if($options{v}){$score_min=$options{v};}
if($options{x}){$score_min=-5;}

my $e=2.718281828;

#hashes
my %hash_desc;
my %hash_seq;
my %hash_struct;
my %hash_mfe;
my %hash_nuclei;
my %hash_mirs;
my %hash_query;
my %hash_comp;
my %hash_bp;

#other variables
my $subject_old;
my $message_filter;
my $message_score;
my $lines;
my $out_of_bound;


##############################################################################################

################################  MAIN  ######################################################


#print help if that option is used
if($options{h}){die $usage;}
unless ($tmpdir=~/\/$/) {$tmpdir .="/";}
if(!(-s $tmpdir)){mkdir $tmpdir;}
$tmpdir .="TMP_DIR/";
mkdir $tmpdir;

#parse structure file outputted from RNAfold
parse_file_struct($file_struct);

#if conservation is scored, the fasta file of known miRNA sequences is parsed
if($options{s}){create_hash_nuclei($options{s})};

#parse signature file in blast_parsed format and resolve each potential precursor
parse_file_blast_parsed($file_blast_parsed);
#`rm -rf $tmpdir`;
exit;


##############################################################################################

############################## SUBROUTINES ###################################################


sub parse_file_blast_parsed{

#    read through the signature blastparsed file, fills up a hash with information on queries
#    (deep sequences) mapping to the current subject (potential precursor) and resolve each
#    potential precursor in turn

    my $file_blast_parsed=shift;

    open (FILE_BLAST_PARSED, "<$file_blast_parsed") or die "can not open $file_blast_parsed\n";
    while (my $line=<FILE_BLAST_PARSED>){
		if($line=~/^(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)$/){
			my $query=$1;
			my $query_lng=$2;
			my $query_beg=$3;
			my $query_end=$4;
			my $subject=$5;
			my $subject_lng=$6;
			my $subject_beg=$7;
			my $subject_end=$8;
			my $e_value=$9;
			my $pid=$10;
			my $bitscore=$11;
			my $other=$12;

			#if the new line concerns a new subject (potential precursor) then the old subject must be resolved
			if($subject_old and $subject_old ne $subject){
				resolve_potential_precursor();
			}

			#resolve the strand
			my $strand=find_strand($other);

			#resolve the number of reads that the deep sequence represents
			my $freq=find_freq($query);

			#read information of the query (deep sequence) into hash
			$hash_query{$query}{"subject_beg"}=$subject_beg;
			$hash_query{$query}{"subject_end"}=$subject_end;
			$hash_query{$query}{"strand"}=$strand;
			$hash_query{$query}{"freq"}=$freq;

			#save the signature information
			$lines.=$line;

			$subject_old=$subject;
		}
    }
    resolve_potential_precursor();
}

sub resolve_potential_precursor{

#    dissects the potential precursor in parts by filling hashes, and tests if it passes the
#    initial filter and the scoring filter

#    binary variable whether the potential precursor is still viable
    my $ret=1;
#print STDERR ">$subject_old\n";

    fill_structure();
#print STDERR "\%hash_bp",scalar keys %hash_bp,"\n";
    fill_pri();
#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";

    fill_mature();
#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";

    fill_star();
#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";

    fill_loop();
#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";

    fill_lower_flanks();
#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";

#    do_test_assemble();

#    this is the actual classification
    unless(pass_filtering_initial() and pass_threshold_score()){$ret=0;}

    print_results($ret);

    reset_variables();

    return;

}


sub print_results{

    my $ret=shift;

#    print out if the precursor is accepted and accepted precursors should be printed out
#    or if the potential precursor is discarded and discarded potential precursors should
#    be printed out

    if((!$options{t} and $ret) or ($options{t} and !$ret)){
	#full output
		unless($options{u}){
			if($message_filter){print $message_filter;}
			if($message_score){print $message_score;}
			print_hash_comp();
			print $lines,"\n\n";
			return;
		}
		#limited output (only ids)
		my $id=$hash_comp{"pri_id"};
		print "$id\n";
    }
}


sub pass_threshold_score{

#    this is the scoring

    #minimum free energy of the potential precursor
#    my $score_mfe=score_mfe($hash_comp{"pri_mfe"});
    my $score_mfe=score_mfe($hash_comp{"pri_mfe"},$hash_comp{"pri_end"});

    #count of reads that map in accordance with Dicer processing
    my $score_freq=score_freq($hash_comp{"freq"});
#print STDERR "score_mfe: $score_mfe\nscore_freq: $score_freq\n";

    #basic score
    my $score=$score_mfe+$score_freq;

    #scoring of conserved nucleus/seed (optional)
    if($options{s}){

	#if the nucleus is conserved
	if(test_nucleus_conservation()){

	    #nucleus from position 2-8
	    my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng);

	    #resolve DNA/RNA ambiguities
	    $nucleus=~tr/[T]/[U]/;

	    #print score contribution
	    score_s("score_nucleus\t$score_nucleus");

	    #print the ids of known miRNAs with same nucleus
	    score_s("$hash_mirs{$nucleus}");
#print STDERR "score_nucleus\t$score_nucleus\n";

	    #add to score
	    $score+=$score_nucleus;

	#if the nucleus is not conserved
	}else{
	    #print (negative) score contribution
	    score_s("score_nucleus\t$score_nucleus_not");

	    #add (negative) score contribution
	    $score+=$score_nucleus_not;
	}
    }

    #if the majority of potential star reads fall as expected from Dicer processing
    if($hash_comp{"star_read"}){
	score_s("score_star\t$score_star");
#print STDERR "score_star\t$score_star\n";
	$score+=$score_star;
    }else{
	score_s("score_star\t$score_star_not");
#print STDERR "score_star_not\t$score_star_not\n";
	$score+=$score_star_not;
    }

    #score lower stems for potential for Drosha recognition (highly optional)
    if($options{z}){
	my $stem_bp=$hash_comp{"stem_bp"};
	my $score_stem=$scores_stem[$stem_bp];
	$score+=$score_stem;
	score_s("score_stem\t$score_stem");
    }

#print STDERR "score_intercept\t$score_intercept\n";

    $score+=$score_intercept;

    #score for randfold (optional)
    if($options{y}){

#	only calculate randfold value if it can make the difference between the potential precursor
#	being accepted or discarded
	if($score+$score_randfold>=$score_min and $score+$score_randfold_not<=$score_min){

	    #randfold value<0.05
	    if(test_randfold()){$score+=$score_randfold;score_s("score_randfold\t$score_randfold");}

	    #randfold value>0.05
	    else{$score+=$score_randfold_not;score_s("score_randfold\t$score_randfold_not");}
	}
    }

    #round off values to one decimal
    my $round_mfe=round($score_mfe*10)/10;
    my $round_freq=round($score_freq*10)/10;
    my $round=round($score*10)/10;

    #print scores
    score_s("score_mfe\t$round_mfe\nscore_freq\t$round_freq\nscore\t$round");

    #return 1 if the potential precursor is accepted, return 0 if discarded
    unless($score>=$score_min){return 0;}
    return 1;
}

sub test_randfold{

    #print sequence to temporary file, test randfold value, return 1 or 0

#    print_file("pri_seq.fa",">pri_seq\n".$hash_comp{"pri_seq"});
	#my $tmpfile=$tmpdir.$hash_comp{"pri_id"};
	#open(FILE, ">$tmpfile");
	#print FILE ">pri_seq\n",$hash_comp{"pri_seq"};
	#close FILE;

#	my $p_value=`randfold -s $tmpfile 999 | cut -f 3`;
	#my $p1=`randfold -s $tmpfile 999 | cut -f 3`;
	#my $p2=`randfold -s $tmpfile 999 | cut -f 3`;
	my $p1=&randfold_pvalue($hash_comp{"pri_seq"},999);
	my $p2=&randfold_pvalue($hash_comp{"pri_seq"},999);
	my $p_value=($p1+$p2)/2;
	wait;
#    system "rm $tmpfile";

    if($p_value<=0.05){return 1;}

    return 0;
}

sub randfold_pvalue{
	my $cpt_sup = 0;
	my $cpt_inf = 0;
	my $cpt_ega = 1;

	my ($seq,$number_of_randomizations)=@_;
	#my $str =$seq;
	#my $mfe = RNA::fold($seq,$str);
		my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`;
		my @rawfolds=split/\s+/,$rnafold;
		my $str=$rawfolds[1];
		my $mfe=$rawfolds[-1];
		$mfe=~s/\(//;
		$mfe=~s/\)//;

	for (my $i=0;$i<$number_of_randomizations;$i++) {
		$seq = shuffle_sequence_dinucleotide($seq);
		#$str = $seq;

		#my $rand_mfe = RNA::fold($str,$str);
		$rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`;
		my @rawfolds=split/\s+/,$rnafold;
		my $str=$rawfolds[1];
		my $rand_mfe=$rawfolds[-1];
		$rand_mfe=~s/\(//;
		$rand_mfe=~s/\)//;

		if ($rand_mfe < $mfe) {
			$cpt_inf++;
		}
		if ($rand_mfe == $mfe) {
			$cpt_ega++;
		}
		if ($rand_mfe > $mfe) {
			$cpt_sup++;
		}
	}

	my $proba = ($cpt_ega + $cpt_inf) / ($number_of_randomizations + 1);

	#print "$name\t$mfe\t$proba\n";
	return $proba;
}

sub shuffle_sequence_dinucleotide {

	my ($str) = @_;

	# upper case and convert to ATGC
	$str = uc($str);
	$str =~ s/U/T/g;

	my @nuc = ('A','T','G','C');
	my $count_swap = 0;
	# set maximum number of permutations
	my $stop = length($str) * 10;

	while($count_swap < $stop) {

		my @pos;

		# look start and end letters
		my $firstnuc = $nuc[int(rand 4)];
		my $thirdnuc = $nuc[int(rand 4)];

		# get positions for matching nucleotides
		for (my $i=0;$i<(length($str)-2);$i++) {
			if ((substr($str,$i,1) eq $firstnuc) && (substr($str,$i+2,1) eq $thirdnuc)) {
				push (@pos,($i+1));
				$i++;
			}
		}

		# swap at random trinucleotides
		my $max = scalar(@pos);
		for (my $i=0;$i<$max;$i++) {
			my $swap = int(rand($max));
			if ((abs($pos[$swap] - $pos[$i]) >= 3) && (substr($str,$pos[$i],1) ne substr($str,$pos[$swap],1))) {
				$count_swap++;
				my $w1 = substr($str,$pos[$i],1);
				my $w2 = substr($str,$pos[$swap],1);
				substr($str,$pos[$i],1,$w2);
				substr($str,$pos[$swap],1,$w1);
			}
		}
	}
	return($str);
}

sub test_nucleus_conservation{

    #test if nucleus is identical to nucleus from known miRNA, return 1 or 0

    my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng);
    $nucleus=~tr/[T]/[U]/;
    if($hash_nuclei{$nucleus}){return 1;}

    return 0;
}


sub pass_filtering_initial{

    #test if the structure forms a plausible hairpin
    unless(pass_filtering_structure()){filter_p("structure problem"); return 0;}

    #test if >90% of reads map to the hairpin in consistence with Dicer processing
    unless(pass_filtering_signature()){filter_p("signature problem");return 0;}

    return 1;

}


sub pass_filtering_signature{

    #number of reads that map in consistence with Dicer processing
    my $consistent=0;

    #number of reads that map inconsistent with Dicer processing
    my $inconsistent=0;

#   number of potential star reads map in good consistence with Drosha/Dicer processing
#   (3' overhangs relative to mature product)
    my $star_perfect=0;

#   number of potential star reads that do not map in good consistence with 3' overhang
    my $star_fuzzy=0;


    #sort queries (deep sequences) by their position on the hairpin
    my @queries=sort {$hash_query{$a}{"subject_beg"} <=> $hash_query{$b}{"subject_beg"}} keys %hash_query;

    foreach my $query(@queries){

		#number of reads that the deep sequence represents
		unless(defined($hash_query{$query}{"freq"})){next;}
		my $query_freq=$hash_query{$query}{"freq"};

		#test which Dicer product (if any) the deep sequence corresponds to
		my $product=test_query($query);

		#if the deep sequence corresponds to a Dicer product, add to the 'consistent' variable
		if($product){$consistent+=$query_freq;}

		#if the deep sequence do not correspond to a Dicer product, add to the 'inconsistent' variable
		else{$inconsistent+=$query_freq;}

		#test a potential star sequence has good 3' overhang
		if($product eq "star"){
			if(test_star($query)){$star_perfect+=$query_freq;}
			else{$star_fuzzy+=$query_freq;}
		}
    }

#   if the majority of potential star sequences map in good accordance with 3' overhang
#    score for the presence of star evidence
    if($star_perfect>$star_fuzzy){$hash_comp{"star_read"}=1;}

    #total number of reads mapping to the hairpin
    my $freq=$consistent+$inconsistent;
    $hash_comp{"freq"}=$freq;
    unless($freq>0){filter_s("read frequency too low"); return 0;}

    #unless >90% of the reads map in consistence with Dicer processing, the hairpin is discarded
    my $inconsistent_fraction=$inconsistent/($inconsistent+$consistent);
    unless($inconsistent_fraction<=0.1){filter_p("inconsistent\t$inconsistent\nconsistent\t$consistent"); return 0;}

    #the hairpin is retained
    return 1;
}

sub test_star{

    #test if a deep sequence maps in good consistence with 3' overhang

    my $query=shift;

    #5' begin and 3' end positions
    my $beg=$hash_query{$query}{"subject_beg"};
    my $end=$hash_query{$query}{"subject_end"};

    #the difference between observed and expected begin positions must be 0 or 1
    my $offset=$beg-$hash_comp{"star_beg"};
    if($offset==0 or $offset==1 or $offset==-1){return 1;}

    return 0;
}


sub test_query{

    #test if deep sequence maps in consistence with Dicer processing

    my $query=shift;

    #begin, end, strand and read count
    my $beg=$hash_query{$query}{"subject_beg"};
    my $end=$hash_query{$query}{"subject_end"};
    my $strand=$hash_query{$query}{"strand"};
    my $freq=$hash_query{$query}{"freq"};

    #should not be on the minus strand (although this has in fact anecdotally been observed for known miRNAs)
    if($strand eq '-'){return 0;}

    #the deep sequence is allowed to stretch 2 nt beyond the expected 5' end
    my $fuzz_beg=2;
    #the deep sequence is allowed to stretch 5 nt beyond the expected 3' end
    my $fuzz_end=2;

    #if in accordance with Dicer processing, return the type of Dicer product
    if(contained($beg,$end,$hash_comp{"mature_beg"}-$fuzz_beg,$hash_comp{"mature_end"}+$fuzz_end)){return "mature";}
    if(contained($beg,$end,$hash_comp{"star_beg"}-$fuzz_beg,$hash_comp{"star_end"}+$fuzz_end)){return "star";}
    if(contained($beg,$end,$hash_comp{"loop_beg"}-$fuzz_beg,$hash_comp{"loop_end"}+$fuzz_end)){return "loop";}

    #if not in accordance, return 0
    return 0;
}


sub pass_filtering_structure{

    #The potential precursor must form a hairpin with miRNA precursor-like characteristics

    #return value
    my $ret=1;

    #potential mature, star, loop and lower flank parts must be identifiable
    unless(test_components()){return 0;}

    #no bifurcations
    unless(no_bifurcations_precursor()){$ret=0;}

    #minimum 14 base pairings in duplex
    unless(bp_duplex()>=15){$ret=0;filter_s("too few pairings in duplex");}

    #not more than 6 nt difference between mature and star length
    unless(-6<diff_lng() and diff_lng()<6){$ret=0; filter_s("too big difference between mature and star length") }

    return $ret;
}


sub test_components{

    #tests whether potential mature, star, loop and lower flank parts are identifiable

    unless($hash_comp{"mature_struct"}){
	filter_s("no mature");
#	print STDERR "no mature\n";
	return 0;
    }

    unless($hash_comp{"star_struct"}){
	filter_s("no star");
#	print STDERR "no star\n";
	return 0;
    }

    unless($hash_comp{"loop_struct"}){
	filter_s("no loop");
#	print STDERR "no loop\n";
   	return 0;
    }

    unless($hash_comp{"flank_first_struct"}){
	filter_s("no flanks");
#print STDERR "no flanks_first_struct\n";
   	return 0;
    }

    unless($hash_comp{"flank_second_struct"}){
	filter_s("no flanks");
#	print STDERR "no flanks_second_struct\n";
    	return 0;
    }
    return 1;
}


sub no_bifurcations_precursor{

    #tests whether there are bifurcations in the hairpin

    #assembles the potential precursor sequence and structure from the expected Dicer products
    #this is the expected biological precursor, in contrast with 'pri_seq' that includes
    #some genomic flanks on both sides

    my $pre_struct;
    my $pre_seq;
    if($hash_comp{"mature_arm"} eq "first"){
	$pre_struct.=$hash_comp{"mature_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"star_struct"};
	$pre_seq.=$hash_comp{"mature_seq"}.$hash_comp{"loop_seq"}.$hash_comp{"star_seq"};
    }else{
	$pre_struct.=$hash_comp{"star_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"mature_struct"};
	$pre_seq.=$hash_comp{"star_seq"}.$hash_comp{"loop_seq"}.$hash_comp{"mature_seq"};
    }

    #read into hash
    $hash_comp{"pre_struct"}=$pre_struct;
    $hash_comp{"pre_seq"}=$pre_seq;

    #simple pattern matching checks for bifurcations
    unless($pre_struct=~/^((\.|\()+..(\.|\))+)$/){
	filter_s("bifurcation in precursor");
#	print STDERR "bifurcation in precursor\n";
	return 0;
    }

    return 1;
}

sub bp_precursor{

    #total number of bps in the precursor

    my $pre_struct=$hash_comp{"pre_struct"};

    #simple pattern matching
    my $pre_bps=0;
    while($pre_struct=~/\(/g){
	$pre_bps++;
    }
    return $pre_bps;
}


sub bp_duplex{

    #total number of bps in the duplex

    my $duplex_bps=0;
    my $mature_struct=$hash_comp{"mature_struct"};

    #simple pattern matching
    while($mature_struct=~/(\(|\))/g){
	$duplex_bps++;
    }
    return $duplex_bps;
}

sub diff_lng{

    #find difference between mature and star lengths

    my $mature_lng=length $hash_comp{"mature_struct"};
    my $star_lng=length $hash_comp{"star_struct"};
    my $diff_lng=$mature_lng-$star_lng;
    return $diff_lng;
}


sub do_test_assemble{

#    not currently used, tests if the 'pri_struct' as assembled from the parts (Dicer products, lower flanks)
#    is identical to 'pri_struct' before disassembly into parts

    my $assemble_struct;

    if($hash_comp{"flank_first_struct"} and $hash_comp{"mature_struct"} and $hash_comp{"loop_struct"} and $hash_comp{"star_struct"} and $hash_comp{"flank_second_struct"}){
	if($hash_comp{"mature_arm"} eq "first"){
	    $assemble_struct.=$hash_comp{"flank_first_struct"}.$hash_comp{"mature_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"star_struct"}.$hash_comp{"flank_second_struct"};
	}else{
	    $assemble_struct.=$hash_comp{"flank_first_struct"}.$hash_comp{"star_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"mature_struct"}.$hash_comp{"flank_second_struct"};
	}
	unless($assemble_struct eq $hash_comp{"pri_struct"}){
	    $hash_comp{"test_assemble"}=$assemble_struct;
	    print_hash_comp();
	}
    }
    return;
 }


sub fill_structure{

    #reads the dot bracket structure into the 'bp' hash where each key and value are basepaired

    my $struct=$hash_struct{$subject_old};
    my $lng=length $struct;

    #local stack for keeping track of basepairings
    my @bps;

    for(my $pos=1;$pos<=$lng;$pos++){
		my $struct_pos=excise_struct($struct,$pos,$pos,"+");

		if($struct_pos eq "("){
			push(@bps,$pos);
		}

		if($struct_pos eq ")"){
			my $pos_prev=pop(@bps);
			$hash_bp{$pos_prev}=$pos;
			$hash_bp{$pos}=$pos_prev;
		}
    }
    return;
}


sub fill_star{

    #fills specifics on the expected star strand into 'comp' hash ('component' hash)

    #if the mature sequence is not plausible, don't look for the star arm
    my $mature_arm=$hash_comp{"mature_arm"};
    unless($mature_arm){$hash_comp{"star_arm"}=0; return;}

    #if the star sequence is not plausible, don't fill into the hash
    my($star_beg,$star_end)=find_star();
    my $star_arm=arm_star($star_beg,$star_end);
    unless($star_arm){return;}

    #excise expected star sequence and structure
    my $star_seq=excise_seq($hash_comp{"pri_seq"},$star_beg,$star_end,"+");
    my $star_struct=excise_seq($hash_comp{"pri_struct"},$star_beg,$star_end,"+");

    #fill into hash
    $hash_comp{"star_beg"}=$star_beg;
    $hash_comp{"star_end"}=$star_end;
    $hash_comp{"star_seq"}=$star_seq;
    $hash_comp{"star_struct"}=$star_struct;
    $hash_comp{"star_arm"}=$star_arm;

    return;
}


sub find_star{

    #uses the 'bp' hash to find the expected star begin and end positions from the mature positions

    #the -2 is for the overhang
    my $mature_beg=$hash_comp{"mature_beg"};
    my $mature_end=$hash_comp{"mature_end"}-2;
    my $mature_lng=$mature_end-$mature_beg+1;

    #in some cases, the last nucleotide of the mature sequence does not form a base pair,
    #and therefore does not basepair with the first nucleotide of the star sequence.
    #In this case, the algorithm searches for the last nucleotide of the mature sequence
    #to form a base pair. The offset is the number of nucleotides searched through.
    my $offset_star_beg=0;
    my $offset_beg=0;

    #the offset should not be longer than the length of the mature sequence, then it
    #means that the mature sequence does not form any base pairs
    while(!$offset_star_beg and $offset_beg<$mature_lng){
		if($hash_bp{$mature_end-$offset_beg}){
			$offset_star_beg=$hash_bp{$mature_end-$offset_beg};
		}else{
			$offset_beg++;
		}
    }
    #when defining the beginning of the star sequence, compensate for the offset
    my $star_beg=$offset_star_beg-$offset_beg;

    #same as above
    my $offset_star_end=0;
    my $offset_end=0;
    while(!$offset_star_end and $offset_end<$mature_lng){
		if($hash_bp{$mature_beg+$offset_end}){
			$offset_star_end=$hash_bp{$mature_beg+$offset_end};
		}else{
			$offset_end++;
		}
    }
    #the +2 is for the overhang
    my $star_end=$offset_star_end+$offset_end+2;

    return($star_beg,$star_end);
}


sub fill_pri{

    #fills basic specifics on the precursor into the 'comp' hash

    my $seq=$hash_seq{$subject_old};
    my $struct=$hash_struct{$subject_old};
    my $mfe=$hash_mfe{$subject_old};
    my $length=length $seq;

    $hash_comp{"pri_id"}=$subject_old;
    $hash_comp{"pri_seq"}=$seq;
    $hash_comp{"pri_struct"}=$struct;
    $hash_comp{"pri_mfe"}=$mfe;
    $hash_comp{"pri_beg"}=1;
    $hash_comp{"pri_end"}=$length;

    return;
}


sub fill_mature{

    #fills specifics on the mature sequence into the 'comp' hash

    my $mature_query=find_mature_query();
    my($mature_beg,$mature_end)=find_positions_query($mature_query);
    my $mature_strand=find_strand_query($mature_query);
    my $mature_seq=excise_seq($hash_comp{"pri_seq"},$mature_beg,$mature_end,$mature_strand);
    my $mature_struct=excise_struct($hash_comp{"pri_struct"},$mature_beg,$mature_end,$mature_strand);
    my $mature_arm=arm_mature($mature_beg,$mature_end,$mature_strand);

    $hash_comp{"mature_query"}=$mature_query;
    $hash_comp{"mature_beg"}=$mature_beg;
    $hash_comp{"mature_end"}=$mature_end;
    $hash_comp{"mature_strand"}=$mature_strand;
    $hash_comp{"mature_struct"}=$mature_struct;
    $hash_comp{"mature_seq"}=$mature_seq;
    $hash_comp{"mature_arm"}=$mature_arm;

    return;
}


sub fill_loop{

    #fills specifics on the loop sequence into the 'comp' hash

    #unless both mature and star sequences are plausible, do not look for the loop
    unless($hash_comp{"mature_arm"} and $hash_comp{"star_arm"}){return;}

    my $loop_beg;
    my $loop_end;

    #defining the begin and end positions of the loop from the mature and star positions
    #excision depends on whether the mature or star sequence is 5' of the loop ('first')
    if($hash_comp{"mature_arm"} eq "first"){
	$loop_beg=$hash_comp{"mature_end"}+1;
    }else{
	$loop_end=$hash_comp{"mature_beg"}-1;
    }

    if($hash_comp{"star_arm"} eq "first"){
	$loop_beg=$hash_comp{"star_end"}+1;
    }else{
	$loop_end=$hash_comp{"star_beg"}-1;
    }

    #unless the positions are plausible, do not fill into hash
    unless(test_loop($loop_beg,$loop_end)){return;}

    my $loop_seq=excise_seq($hash_comp{"pri_seq"},$loop_beg,$loop_end,"+");
    my $loop_struct=excise_struct($hash_comp{"pri_struct"},$loop_beg,$loop_end,"+");

    $hash_comp{"loop_beg"}=$loop_beg;
    $hash_comp{"loop_end"}=$loop_end;
    $hash_comp{"loop_seq"}=$loop_seq;
    $hash_comp{"loop_struct"}=$loop_struct;

    return;
}


sub fill_lower_flanks{

    #fills specifics on the lower flanks and unpaired strands into the 'comp' hash

    #unless both mature and star sequences are plausible, do not look for the flanks
    unless($hash_comp{"mature_arm"} and $hash_comp{"star_arm"}){return;}

    my $flank_first_end;
    my $flank_second_beg;

    #defining the begin and end positions of the flanks from the mature and star positions
    #excision depends on whether the mature or star sequence is 5' in the potenitial precursor ('first')
    if($hash_comp{"mature_arm"} eq "first"){
	$flank_first_end=$hash_comp{"mature_beg"}-1;
    }else{
	$flank_second_beg=$hash_comp{"mature_end"}+1;
    }

    if($hash_comp{"star_arm"} eq "first"){
	$flank_first_end=$hash_comp{"star_beg"}-1;
    }else{
	$flank_second_beg=$hash_comp{"star_end"}+1;
    }

    #unless the positions are plausible, do not fill into hash
    unless(test_flanks($flank_first_end,$flank_second_beg)){return;}

    $hash_comp{"flank_first_end"}=$flank_first_end;
    $hash_comp{"flank_second_beg"}=$flank_second_beg;
    $hash_comp{"flank_first_seq"}=excise_seq($hash_comp{"pri_seq"},$hash_comp{"pri_beg"},$hash_comp{"flank_first_end"},"+");
    $hash_comp{"flank_second_seq"}=excise_seq($hash_comp{"pri_seq"},$hash_comp{"flank_second_beg"},$hash_comp{"pri_end"},"+");
    $hash_comp{"flank_first_struct"}=excise_struct($hash_comp{"pri_struct"},$hash_comp{"pri_beg"},$hash_comp{"flank_first_end"},"+");
    $hash_comp{"flank_second_struct"}=excise_struct($hash_comp{"pri_struct"},$hash_comp{"flank_second_beg"},$hash_comp{"pri_end"},"+");

    if($options{z}){
	fill_stems_drosha();
    }

    return;
}


sub fill_stems_drosha{

    #scores the number of base pairings formed by the first ten nt of the lower stems
    #in general, the more stems, the higher the score contribution
    #warning: this options has not been thoroughly tested

    my $flank_first_struct=$hash_comp{"flank_first_struct"};
    my $flank_second_struct=$hash_comp{"flank_second_struct"};

    my $stem_first=substr($flank_first_struct,-10);
    my $stem_second=substr($flank_second_struct,0,10);

    my $stem_bp_first=0;
    my $stem_bp_second=0;

    #find base pairings by simple pattern matching
    while($stem_first=~/\(/g){
	$stem_bp_first++;
    }

    while($stem_second=~/\)/g){
	$stem_bp_second++;
    }

    my $stem_bp=min2($stem_bp_first,$stem_bp_second);

    $hash_comp{"stem_first"}=$stem_first;
    $hash_comp{"stem_second"}=$stem_second;
    $hash_comp{"stem_bp_first"}=$stem_bp_first;
    $hash_comp{"stem_bp_second"}=$stem_bp_second;
    $hash_comp{"stem_bp"}=$stem_bp;

    return;
}


sub arm_mature{

    #tests whether the mature sequence is in the 5' ('first') or 3' ('second') arm of the potential precursor

    my ($beg,$end,$strand)=@_;

    #mature and star sequences should alway be on plus strand
    if($strand eq "-"){return 0;}

    #there should be no bifurcations and minimum one base pairing
    my $struct=excise_seq($hash_comp{"pri_struct"},$beg,$end,$strand);
    if(defined($struct) and $struct=~/^(\(|\.)+$/ and $struct=~/\(/){
	return "first";
    }elsif(defined($struct) and $struct=~/^(\)|\.)+$/ and $struct=~/\)/){
	return "second";
    }
    return 0;
}


sub arm_star{

    #tests whether the star sequence is in the 5' ('first') or 3' ('second') arm of the potential precursor

    my ($beg,$end)=@_;

    #unless the begin and end positions are plausible, test negative
    unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;}

    #no overlap between the mature and the star sequence
    if($hash_comp{"mature_arm"} eq "first"){
	($hash_comp{"mature_end"}<$beg) or return 0;
    }elsif($hash_comp{"mature_arm"} eq "second"){
	($end<$hash_comp{"mature_beg"}) or return 0;
    }

    #there should be no bifurcations and minimum one base pairing
    my $struct=excise_seq($hash_comp{"pri_struct"},$beg,$end,"+");
    if($struct=~/^(\(|\.)+$/ and $struct=~/\(/){
	return "first";
    }elsif($struct=~/^(\)|\.)+$/ and $struct=~/\)/){
	return "second";
    }
    return 0;
}


sub test_loop{

    #tests the loop positions

    my ($beg,$end)=@_;

    #unless the begin and end positions are plausible, test negative
    unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;}

    return 1;
}


sub test_flanks{

    #tests the positions of the lower flanks

    my ($beg,$end)=@_;

    #unless the begin and end positions are plausible, test negative
    unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;}

    return 1;
}


sub comp{

    #subroutine to retrive from the 'comp' hash

    my $type=shift;
    my $component=$hash_comp{$type};
    return $component;
}


sub find_strand_query{

    #subroutine to find the strand for a given query

    my $query=shift;
    my $strand=$hash_query{$query}{"strand"};
    return $strand;
}


sub find_positions_query{

    #subroutine to find the begin and end positions for a given query

    my $query=shift;
    my $beg=$hash_query{$query}{"subject_beg"};
    my $end=$hash_query{$query}{"subject_end"};
    return ($beg,$end);
}


sub find_mature_query{

    #finds the query with the highest frequency of reads and returns it
    #is used to determine the positions of the potential mature sequence

    my @queries=sort {$hash_query{$b}{"freq"} <=> $hash_query{$a}{"freq"}} keys %hash_query;
    my $mature_query=$queries[0];
    return $mature_query;
}


sub reset_variables{

    #resets the hashes for the next potential precursor

#    %hash_query=();
#    %hash_comp=();
#    %hash_bp=();
	foreach my $key (keys %hash_query) {delete($hash_query{$key});}
	foreach my $key (keys %hash_comp) {delete($hash_comp{$key});}
	foreach my $key (keys %hash_bp) {delete($hash_bp{$key});}

#    $message_filter=();
#    $message_score=();
#    $lines=();
	undef($message_filter);
	undef($message_score);
	undef($lines);
    return;
}


sub excise_seq{

    #excise sub sequence from the potential precursor

    my($seq,$beg,$end,$strand)=@_;

    #begin can be equal to end if only one nucleotide is excised
    unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;}

    #rarely, permuted combinations of signature and structure cause out of bound excision errors.
    #this happens once appr. every two thousand combinations
    unless($beg<=length($seq)){$out_of_bound++;return 0;}

    #if on the minus strand, the reverse complement should be excised
    if($strand eq "-"){$seq=revcom($seq);}

    #the blast parsed format is 1-indexed, substr is 0-indexed
    my $sub_seq=substr($seq,$beg-1,$end-$beg+1);

    return $sub_seq;

}

sub excise_struct{

    #excise sub structure

    my($struct,$beg,$end,$strand)=@_;
    my $lng=length $struct;

    #begin can be equal to end if only one nucleotide is excised
    unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;}

    #rarely, permuted combinations of signature and structure cause out of bound excision errors.
    #this happens once appr. every two thousand combinations
    unless($beg<=length($struct)){return 0;}

    #if excising relative to minus strand, positions are reversed
    if($strand eq "-"){($beg,$end)=rev_pos($beg,$end,$lng);}

    #the blast parsed format is 1-indexed, substr is 0-indexed
    my $sub_struct=substr($struct,$beg-1,$end-$beg+1);

    return $sub_struct;
}


sub create_hash_nuclei{
    #parses a fasta file with sequences of known miRNAs considered for conservation purposes
    #reads the nuclei into a hash

    my ($file) = @_;
    my ($id, $desc, $sequence, $nucleus) = ();

    open (FASTA, "<$file") or die "can not open $file\n";
    while (<FASTA>)
    {
        chomp;
        if (/^>(\S+)(.*)/)
	{
	    $id       = $1;
	    $desc     = $2;
	    $sequence = "";
	    $nucleus  = "";
	    while (<FASTA>){
                chomp;
                if (/^>(\S+)(.*)/){
		    $nucleus                = substr($sequence,1,$nucleus_lng);
		    $nucleus                =~ tr/[T]/[U]/;
		    $hash_mirs{$nucleus}   .="$id\t";
		    $hash_nuclei{$nucleus} += 1;

		    $id               = $1;
		    $desc             = $2;
		    $sequence         = "";
		    $nucleus          = "";
		    next;
                }
		$sequence .= $_;
            }
        }
    }
    $nucleus                = substr($sequence,1,$nucleus_lng);
    $nucleus                =~ tr/[T]/[U]/;
    $hash_mirs{$nucleus}   .="$id\t";
    $hash_nuclei{$nucleus} += 1;
    close FASTA;
}


sub parse_file_struct{
	#parses the output from RNAfoldand reads it into hashes
	my($file) = @_;
	my($id,$desc,$seq,$struct,$mfe) = ();
	open (FILE_STRUCT, "<$file") or die "can not open $file\n";
	while (<FILE_STRUCT>){
		chomp;
		if (/^>(\S+)\s*(.*)/){
			$id= $1;
			$desc= $2;
			$seq= "";
			$struct= "";
			$mfe= "";
			while (<FILE_STRUCT>){
				chomp;
				if (/^>(\S+)\s*(.*)/){
					$hash_desc{$id}   = $desc;
					$hash_seq{$id}    = $seq;
					$hash_struct{$id} = $struct;
					$hash_mfe{$id}    = $mfe;
					$id          = $1;
					$desc        = $2;
					$seq         = "";
					$struct      = "";
					$mfe         = "";
					next;
				}
				if(/^\w/){
					tr/uU/tT/;
					$seq .= $_;
					next;
				}
				if(/((\.|\(|\))+)/){$struct .=$1;}
				if(/\((\s*-\d+\.\d+)\)/){$mfe = $1;}
			}
        }
    }
    $hash_desc{$id}        = $desc;
    $hash_seq{$id}         = $seq;
    $hash_struct{$id}      = $struct;
    $hash_mfe{$id}         = $mfe;
    close FILE_STRUCT;
    return;
}


sub score_s{

    #this score message is appended to the end of the string of score messages outputted for the potential precursor

    my $message=shift;
    $message_score.=$message."\n";;
    return;
}


sub score_p{

   #this score message is appended to the beginning of the string of score messages outputted for the potential precursor

    my $message=shift;
    $message_score=$message."\n".$message_score;
    return;
}


sub filter_s{

    #this filtering message is appended to the end of the string of filtering messages outputted for the potential precursor

    my $message=shift;
    $message_filter.=$message."\n";
    return;
}


sub filter_p{

    #this filtering message is appended to the beginning of the string of filtering messages outputted for the potential precursor

    my $message=shift;
    if(defined $message_filter){$message_filter=$message."\n".$message_filter;}
    else{$message_filter=$message."\n";}
    return;
}


sub find_freq{

    #finds the frequency of a given read query from its id.

    my($query)=@_;

    if($query=~/x(\d+)/i){
	my $freq=$1;
	return $freq;
    }else{
	#print STDERR "Problem with read format\n";
	return 0;
    }
}


sub print_hash_comp{

    #prints the 'comp' hash

    my @keys=sort keys %hash_comp;
    foreach my $key(@keys){
	my $value=$hash_comp{$key};
	print "$key  \t$value\n";
    }
}


sub print_hash_bp{

    #prints the 'bp' hash

    my @keys=sort {$a<=>$b} keys %hash_bp;
    foreach my $key(@keys){
	my $value=$hash_bp{$key};
	print "$key\t$value\n";
    }
    print "\n";
}


sub find_strand{

    #A subroutine to find the strand, parsing different blast formats

    my($other)=@_;

    my $strand="+";

    if($other=~/-/){
	$strand="-";
    }

    if($other=~/minus/i){
	$strand="-";
    }
    return($strand);
}


sub contained{

    #Is the stretch defined by the first positions contained in the stretch defined by the second?

    my($beg1,$end1,$beg2,$end2)=@_;

    testbeginend($beg1,$end1,$beg2,$end2);

    if($beg2<=$beg1 and $end1<=$end2){
	return 1;
    }else{
	return 0;
    }
}


sub testbeginend{

    #Are the beginposition numerically smaller than the endposition for each pair?

    my($begin1,$end1,$begin2,$end2)=@_;

    unless($begin1<=$end1 and $begin2<=$end2){
	print STDERR "beg can not be larger than end for $subject_old\n";
	exit;
    }
}


sub rev_pos{

#   The blast_parsed format always uses positions that are relative to the 5' of the given strand
#   This means that for a sequence of length n, the first nucleotide on the minus strand base pairs with
#   the n't nucleotide on the plus strand

#   This subroutine reverses the begin and end positions of positions of the minus strand so that they
#   are relative to the 5' end of the plus strand

    my($beg,$end,$lng)=@_;

    my $new_end=$lng-$beg+1;
    my $new_beg=$lng-$end+1;

    return($new_beg,$new_end);
}

sub round {

    #rounds to nearest integer

    my($number) = shift;
    return int($number + .5);

}


sub rev{

    #reverses the order of nucleotides in a sequence

    my($sequence)=@_;

    my $rev=reverse $sequence;

    return $rev;
}

sub com{

    #the complementary of a sequence

    my($sequence)=@_;

    $sequence=~tr/acgtuACGTU/TGCAATGCAA/;

    return $sequence;
}

sub revcom{

    #reverse complement

    my($sequence)=@_;

    my $revcom=rev(com($sequence));

    return $revcom;
}


sub max2 {

    #max of two numbers

    my($a, $b) = @_;
    return ($a>$b ? $a : $b);
}

sub min2  {

    #min of two numbers

    my($a, $b) = @_;
    return ($a<$b ? $a : $b);
}


sub score_freq{

#   scores the count of reads that map to the potential precursor
#   Assumes geometric distribution as described in methods section of manuscript

    my $freq=shift;

    #parameters of known precursors and background hairpins
    my $parameter_test=0.999;
    my $parameter_control=0.6;

    #log_odds calculated directly to avoid underflow
    my $intercept=log((1-$parameter_test)/(1-$parameter_control));
    my $slope=log($parameter_test/$parameter_control);
    my $log_odds=$slope*$freq+$intercept;

    #if no strong evidence for 3' overhangs, limit the score contribution to 0
    unless($options{x} or $hash_comp{"star_read"}){$log_odds=min2($log_odds,0);}

    return $log_odds;
}


##sub score_mfe{

#   scores the minimum free energy in kCal/mol of the potential precursor
#   Assumes Gumbel distribution as described in methods section of manuscript

##    my $mfe=shift;

    #numerical value, minimum 1
##    my $mfe_adj=max2(1,-$mfe);

    #parameters of known precursors and background hairpins, scale and location
##    my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32);
##    my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23);

##    my $odds=$prob_test/$prob_background;
##    my $log_odds=log($odds);

##    return $log_odds;
##}

sub score_mfe{
#	use bignum;

#   scores the minimum free energy in kCal/mol of the potential precursor
#   Assumes Gumbel distribution as described in methods section of manuscript

    my ($mfe,$mlng)=@_;

    #numerical value, minimum 1
    my $mfe_adj=max2(1,-$mfe);
my $mfe_adj1=$mfe/$mlng;
    #parameters of known precursors and background hairpins, scale and location
	my $a=1.339e-12;my $b=2.778e-13;my $c=45.834;
	my $ev=$e**($mfe_adj1*$c);
	#print STDERR "\n***",$ev,"**\t",$ev+$b,"\t";
	my $log_odds=($a/($b+$ev));


	my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32);
    my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23);

    my $odds=$prob_test/$prob_background;
    my $log_odds_2=log($odds);
	#print STDERR "log_odds :",$log_odds,"\t",$log_odds_2,"\n";
   return $log_odds;
}


sub prob_gumbel_discretized{

#   discretized Gumbel distribution, probabilities within windows of 1 kCal/mol
#   uses the subroutine that calculates the cdf to find the probabilities

    my ($var,$scale,$location)=@_;

    my $bound_lower=$var-0.5;
    my $bound_upper=$var+0.5;

    my $cdf_lower=cdf_gumbel($bound_lower,$scale,$location);
    my $cdf_upper=cdf_gumbel($bound_upper,$scale,$location);

    my $prob=$cdf_upper-$cdf_lower;

    return $prob;
}


sub cdf_gumbel{

#   calculates the cumulative distribution function of the Gumbel distribution

    my ($var,$scale,$location)=@_;

    my $cdf=$e**(-($e**(-($var-$location)/$scale)));

    return $cdf;
}
author	big-tiandm
date	Mon, 08 Dec 2014 01:51:16 -0500
parents	7b5a48b972e9
children