diff microRNA.pl @ 50:7b5a48b972e9 draft

Uploaded
author big-tiandm
date Fri, 05 Dec 2014 00:11:02 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/microRNA.pl	Fri Dec 05 00:11:02 2014 -0500
@@ -0,0 +1,253 @@
+#!/usr/bin/perl -w
+#Filename:
+#Author: Tian Dongmei
+#Email: tiandm@big.ac.cn
+#Date: 2014-4-22
+#Modified:
+#Description: plant microRNA prediction
+my $version=1.00;
+
+use strict;
+use Getopt::Long;
+use threads;
+#use threads::shared;
+use File::Path;
+use File::Basename;
+#use RNA;
+#use Term::ANSIColor;
+
+my %opts;
+GetOptions(\%opts,"i=s","fa=s","gfa=s","pre:s","mat:s","dis:i","flank:i","mfe:f","idx:s","mis:i","r:i","e:i","f:i","t:i","o:s","path:s","D","h");
+if (!(defined $opts{i} and defined $opts{gfa}) || defined $opts{h}) { #necessary arguments
+&usage;
+}
+
+my $time=&Time();
+print "miPlant program start:\n   The time is $time!\n";
+print "Command line:\n   $0 @ARGV\n";
+
+my  $mypath=`pwd`;
+chomp $mypath;
+
+my $dir=defined $opts{'o'} ? $opts{'o'} : "$mypath/miRNA_out/";
+
+
+unless ($dir=~/\/$/) {$dir.="/";}
+if (not -d $dir) {
+	mkdir $dir;
+}
+my $config=$opts{'i'};
+my $data=$opts{'fa'};
+
+my $scipt_path=defined $opts{'path'} ? $opts{'path'} : "/Users/big/galaxy-dist/tools/myTools/";
+
+my $t=1; #threads number
+if (defined $opts{'t'}) {$t=$opts{'t'};}
+
+my $mis=0; #mismatch number for microRNA
+if (defined $opts{'mis'}) {$mis=$opts{'mis'};}
+
+my $hit=25; # maximum reads mapping hits in genome
+if (defined $opts{'r'}) {$hit=$opts{'r'};}
+
+my $upstream = 2; # microRNA 5' extension
+$upstream = $opts{'e'} if(defined $opts{'e'});
+
+my $downstream = 5;# microRNA 3' extension
+$downstream = $opts{'f'} if(defined $opts{'f'});
+
+my $maxd=defined $opts{'dis'} ? $opts{'dis'} : 200;
+my $flank=defined $opts{'flank'} ? $opts{'flank'} :10;
+my $mfe=defined $opts{'mfe'} ? $opts{'mfe'} : -20;
+
+$time=&Time();
+print "$time, Checking input file!\n";
+
+my (@filein,@mark);
+&read_config();
+
+&checkfa($opts{pre}) if(defined $opts{pre});
+&checkfa($opts{mat}) if(defined $opts{mat});
+&checkfa($opts{gfa});
+
+chdir $dir;
+my $data2=$data;
+my $known_result=$dir."known_miRNA_Express";
+if(defined $opts{pre} and defined $opts{mat}){
+	&quantify(); ### known microRAN quantify
+	$data2=$known_result."/mirbase_not_mapped.fa";
+}
+
+my $genome_map=$dir."genome_match";
+&genome($data2);
+
+#my $genome_map=&search($dir,"genome_match_");
+my $mapfile=$genome_map."/genome_mapped.bwt";
+my $mapfa=$genome_map."/genome_mapped.fa";
+my $unmap=$genome_map."/genome_not_mapped.fa";
+
+#$time=Time();
+#print "$time: Novel microRNA prediction!\n\n";
+
+&predict($mapfa);
+
+$time=Time();
+print "$time: Program end!!\n";
+
+############################## sub programs ###################################
+sub predict{
+	my ($file)=@_;
+	$time=&Time();
+	print "$time: Novel microRNA prediction!\n\n";
+	my $predict=$dir."Novel_miRNA_predict";
+	mkdir $predict;
+	chdir $predict;
+	system("perl $scipt_path/precursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe");
+#	print "\nprecursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe\n";
+	
+	system("bowtie-build -f excised_precursor.fa excised_precursor");
+#	print "\nbowtie-build -f excised_precursor.fa excised_precursor\n";
+	
+	system("bowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt 2> run.log");
+#	print "\nbowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt\n";
+	
+	system("perl $scipt_path/convert_bowtie_to_blast.pl  precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst");
+#	print "\nconvert_bowtie_to_blast.pl  precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst\n";
+
+	system("sort -k 4  precursor_mapped.bst  > signatures.bst");
+#	print "\nsort +3 -25  precursor_mapped.bst  > ../signatures.bst\n";
+
+	chdir $dir;
+	system("perl $scipt_path/miRDeep_plant.pl $predict/signatures.bst $predict/excised_precursor_struc.txt novel_tmp_dir -y > microRNA_prediction.mrd");
+#	print "\nmiRDeep_plant.pl $dir/signatures.bst $predict/excised_precursor_struc.txt tmp_dir -y > microRNA_prediction.txt\n";
+	#system("rm novel_tmp_dir -rf");
+	my $tag=join "," ,@mark;
+	system("perl $scipt_path/miRNA_Express_and_sequence.pl -i microRNA_prediction.mrd -list novel_microRNA_express.txt -fa novel_microRNA_mature.fa -pre novel_microRNA_precursor.fa -tag $tag");
+
+	system("perl $scipt_path/non_miRNA_reads.pl -i microRNA_prediction.mrd -fa $file -o non_microRNA_sequence.fa");
+
+}
+
+sub genome{
+	my ($file)=@_;
+	if(defined $opts{'idx'}){
+		system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} ") ;
+#		print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} -time $time\n";
+	}else{
+		system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir ") ;
+#		print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -time $time\n";
+	}
+}
+
+sub quantify{
+	my $tag=join "\\;" ,@mark;
+	system("perl $scipt_path/quantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -mis $mis -t $t -e $upstream -f $downstream -tag $tag");
+	print "\nquantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -mis $mis -t $t -e $upstream -f $downstream -tag $tag\n";
+}
+
+sub read_config{
+	open CON,"<$config";
+	while (my $aline=<CON>) {
+		chomp $aline;
+		my @tmp=split/\t/,$aline;
+		push @filein,$tmp[0];
+		push @mark,$tmp[1];
+		#&check_rawdata($tmp[0]);
+	}
+	close CON;
+	if (@filein != @mark) {
+		#&printErr();
+		die "Maybe config file have some wrong!!!\n";
+	}
+}
+sub checkfa{
+	my ($file_reads)=@_;
+	open N,"<$file_reads";
+	my $line=<N>;
+	chomp $line;
+    if($line !~ /^>\S+/){
+        #printErr();
+        die "The first line of file $file_reads does not start with '>identifier'
+Reads file $file_reads is not a valid fasta file\n\n";
+    }
+    if(<N> !~ /^[ACGTNacgtn]*$/){
+        #printErr();
+        die "File $file_reads contains not allowed characters in sequences
+Allowed characters are ACGTN
+Reads file $file_reads is not a fasta file\n\n";
+    }
+	close N;
+}
+sub search{
+	my ($dir,$str)=@_;
+	opendir I,$dir;
+	my @ret;
+	while (my $file=readdir I) {
+		if ($file=~/$str/) {
+			push @ret, $file;
+		}
+	}
+	closedir I;
+	if (@ret != 1) {
+		#&printErr();
+
+		die "Can not find directory or file which name has string: $str !!!\n";
+	}
+	return $ret[0];
+}
+
+
+sub Time{
+        my $time=time();
+        my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];
+        $month++;
+        $year+=1900;
+        if (length($sec) == 1) {$sec = "0"."$sec";}
+        if (length($min) == 1) {$min = "0"."$min";}
+        if (length($hour) == 1) {$hour = "0"."$hour";}
+        if (length($day) == 1) {$day = "0"."$day";}
+        if (length($month) == 1) {$month = "0"."$month";}
+        #print "$year-$month-$day $hour:$min:$sec\n";
+        return("$year-$month-$day $hour:$min:$sec");
+}
+
+
+sub usage{
+print <<"USAGE";
+Version $version
+Usage:
+
+$0 -i -fa -gfa -idx -pre -mat -mis -e -f  -t  -o  -path
+options:
+-i input files, # config
+
+-fa ,#fasta sequence file
+
+-path scirpt path
+
+-gfa string,  input file # genome fasta. sequence file
+-idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter
+                string must be the prefix of the bowtie index. For instance, if
+                the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
+                the prefix is 'h_sapiens_37_asm'.##can be null
+
+-pre string,  input file #species specific microRNA precursor sequences
+-mat string,  input file #species specific microRNA mature sequences
+
+-mis [int]     number of allowed mismatches when mapping reads to precursors, default 0
+-e [int]     number of nucleotides upstream of the mature sequence to consider, default 2
+-f [int]     number of nucleotides downstream of the mature sequence to consider, default 5
+-r int       a read is allowed to map up to this number of positions in the genome,default is 25 
+
+-dis <int>   Maximal space between miRNA and miRNA* (200)
+-flank <int>   Flank sequence length of miRNA precursor (10)
+-mfe <folat> Maximal free energy allowed for a miRNA precursor (-20)
+
+-t int,    number of threads [1]
+
+-o output directory# absolute path
+-h help
+USAGE
+exit(1);
+}
+