view randfold @ 44:0c4e11018934 draft

Uploaded
author big-tiandm
date Thu, 30 Oct 2014 21:29:19 -0400
parents
children
line wrap: on
line source

# 	randfold (c) Eric Bonnet & Jan Wuyts 2003
# 	randomization test for rna secondary structure

#	This program is free software; you can redistribute it and/or
#	modify it under the terms of the GNU General Public License
#	as published by the Free Software Foundation; either version 2
#	of the License, or (at your option) any later version.

#	This program is distributed in the hope that it will be useful,
#	but WITHOUT ANY WARRANTY; without even the implied warranty of
#	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#	GNU General Public License for more details.

#	You should have received a copy of the GNU General Public License
#	along with this program; if not, write to the Free Software
#	Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

# 	This soft needs BioPerl and ViennaRNA packages
#	See http://www.bioperl.org and http://www.tbi.univie.ac.at/~ivo/RNA/
#	See also README

use strict;
use Bio::SeqIO;
use RNA;

if (scalar(@ARGV) != 3) {
	die "Usage $0 fasta_file number_of_randomizations type[m|d]\n";
}

my $in = Bio::SeqIO->new(-file => "$ARGV[0]", -format => "fasta");
my $number_of_randomizations = $ARGV[1];
my $type = $ARGV[2];
if ($type !~ /[m|d]/) {
	die "error: wrong type of randomization.";
}

while (my $o = $in->next_seq) {
	my $seq = uc($o->seq());
	my $name = $o->display_id();
	
	my $cpt_sup = 0;
	my $cpt_inf = 0;
	my $cpt_ega = 1;
	
	my $str = $seq;
	my $mfe = RNA::fold($seq,$str);

	for (my $i=0;$i<$number_of_randomizations;$i++) {
		if ($type eq "d") {
			$seq = shuffle_sequence_dinucleotide($seq);
			$str = $seq;
		}
		elsif ($type eq "m") {
			my @tmp = split //,$seq;
			fisher_yates_shuffle(\@tmp);
			$seq = join '',@tmp;
			$str = $seq;
		}
	
		my $rand_mfe = RNA::fold($str,$str);
	
		if ($rand_mfe < $mfe) {
			$cpt_inf++;
		}
		if ($rand_mfe == $mfe) {
			$cpt_ega++;
		}
		if ($rand_mfe > $mfe) {		
			$cpt_sup++;
		}
	}
	
	my $proba = ($cpt_ega + $cpt_inf) / ($number_of_randomizations + 1);

	print "$name\t$mfe\t$proba\n";
}

# fisher_yates_shuffle( \@array ) : 
# generate a random permutation of @array in place
# input	array ref
# return	nothing
sub fisher_yates_shuffle {
	my $array = shift;
	my $i;
	for ($i = @$array; --$i; ) {
		my $j = int rand ($i+1);
		next if $i == $j;
		@$array[$i,$j] = @$array[$j,$i];
	}
}

# shuffle a sequence while preserving dinucleotide distribution
# input 	sequence string
# return 	sequence string shuffled
sub shuffle_sequence_dinucleotide {

	my ($str) = @_;

	# upper case and convert to ATGC
	$str = uc($str);
	$str =~ s/U/T/g;
	
	my @nuc = ('A','T','G','C');
	my $count_swap = 0;
	# set maximum number of permutations
	my $stop = length($str) * 10;
	
	while($count_swap < $stop) {

		my @pos;

		# look start and end letters
		my $firstnuc = $nuc[int(rand 4)];
		my $thirdnuc = $nuc[int(rand 4)];	

		# get positions for matching nucleotides
		for (my $i=0;$i<(length($str)-2);$i++) {
			if ((substr($str,$i,1) eq $firstnuc) && (substr($str,$i+2,1) eq $thirdnuc)) {
				push (@pos,($i+1));
				$i++;
			}
		}
	
		# swap at random trinucleotides
		my $max = scalar(@pos);
		for (my $i=0;$i<$max;$i++) {
			my $swap = int(rand($max));
			if ((abs($pos[$swap] - $pos[$i]) >= 3) && (substr($str,$pos[$i],1) ne substr($str,$pos[$swap],1))) {			
				$count_swap++;
				my $w1 = substr($str,$pos[$i],1);
				my $w2 = substr($str,$pos[$swap],1);			
				substr($str,$pos[$i],1,$w2);
				substr($str,$pos[$swap],1,$w1);		
			}
		}
	}
	return($str);	
}