diff randfold @ 44:0c4e11018934 draft

Uploaded
author big-tiandm
date Thu, 30 Oct 2014 21:29:19 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/randfold	Thu Oct 30 21:29:19 2014 -0400
@@ -0,0 +1,139 @@
+# 	randfold (c) Eric Bonnet & Jan Wuyts 2003
+# 	randomization test for rna secondary structure
+
+#	This program is free software; you can redistribute it and/or
+#	modify it under the terms of the GNU General Public License
+#	as published by the Free Software Foundation; either version 2
+#	of the License, or (at your option) any later version.
+
+#	This program is distributed in the hope that it will be useful,
+#	but WITHOUT ANY WARRANTY; without even the implied warranty of
+#	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#	GNU General Public License for more details.
+
+#	You should have received a copy of the GNU General Public License
+#	along with this program; if not, write to the Free Software
+#	Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+
+# 	This soft needs BioPerl and ViennaRNA packages
+#	See http://www.bioperl.org and http://www.tbi.univie.ac.at/~ivo/RNA/
+#	See also README
+
+use strict;
+use Bio::SeqIO;
+use RNA;
+
+if (scalar(@ARGV) != 3) {
+	die "Usage $0 fasta_file number_of_randomizations type[m|d]\n";
+}
+
+my $in = Bio::SeqIO->new(-file => "$ARGV[0]", -format => "fasta");
+my $number_of_randomizations = $ARGV[1];
+my $type = $ARGV[2];
+if ($type !~ /[m|d]/) {
+	die "error: wrong type of randomization.";
+}
+
+while (my $o = $in->next_seq) {
+	my $seq = uc($o->seq());
+	my $name = $o->display_id();
+	
+	my $cpt_sup = 0;
+	my $cpt_inf = 0;
+	my $cpt_ega = 1;
+	
+	my $str = $seq;
+	my $mfe = RNA::fold($seq,$str);
+
+	for (my $i=0;$i<$number_of_randomizations;$i++) {
+		if ($type eq "d") {
+			$seq = shuffle_sequence_dinucleotide($seq);
+			$str = $seq;
+		}
+		elsif ($type eq "m") {
+			my @tmp = split //,$seq;
+			fisher_yates_shuffle(\@tmp);
+			$seq = join '',@tmp;
+			$str = $seq;
+		}
+	
+		my $rand_mfe = RNA::fold($str,$str);
+	
+		if ($rand_mfe < $mfe) {
+			$cpt_inf++;
+		}
+		if ($rand_mfe == $mfe) {
+			$cpt_ega++;
+		}
+		if ($rand_mfe > $mfe) {		
+			$cpt_sup++;
+		}
+	}
+	
+	my $proba = ($cpt_ega + $cpt_inf) / ($number_of_randomizations + 1);
+
+	print "$name\t$mfe\t$proba\n";
+}
+
+# fisher_yates_shuffle( \@array ) : 
+# generate a random permutation of @array in place
+# input	array ref
+# return	nothing
+sub fisher_yates_shuffle {
+	my $array = shift;
+	my $i;
+	for ($i = @$array; --$i; ) {
+		my $j = int rand ($i+1);
+		next if $i == $j;
+		@$array[$i,$j] = @$array[$j,$i];
+	}
+}
+
+# shuffle a sequence while preserving dinucleotide distribution
+# input 	sequence string
+# return 	sequence string shuffled
+sub shuffle_sequence_dinucleotide {
+
+	my ($str) = @_;
+
+	# upper case and convert to ATGC
+	$str = uc($str);
+	$str =~ s/U/T/g;
+	
+	my @nuc = ('A','T','G','C');
+	my $count_swap = 0;
+	# set maximum number of permutations
+	my $stop = length($str) * 10;
+	
+	while($count_swap < $stop) {
+
+		my @pos;
+
+		# look start and end letters
+		my $firstnuc = $nuc[int(rand 4)];
+		my $thirdnuc = $nuc[int(rand 4)];	
+
+		# get positions for matching nucleotides
+		for (my $i=0;$i<(length($str)-2);$i++) {
+			if ((substr($str,$i,1) eq $firstnuc) && (substr($str,$i+2,1) eq $thirdnuc)) {
+				push (@pos,($i+1));
+				$i++;
+			}
+		}
+	
+		# swap at random trinucleotides
+		my $max = scalar(@pos);
+		for (my $i=0;$i<$max;$i++) {
+			my $swap = int(rand($max));
+			if ((abs($pos[$swap] - $pos[$i]) >= 3) && (substr($str,$pos[$i],1) ne substr($str,$pos[$swap],1))) {			
+				$count_swap++;
+				my $w1 = substr($str,$pos[$i],1);
+				my $w2 = substr($str,$pos[$swap],1);			
+				substr($str,$pos[$i],1,$w2);
+				substr($str,$pos[$swap],1,$w1);		
+			}
+		}
+	}
+	return($str);	
+}
+