view microRNA.pl @ 53:f5a2e8308836 draft default tip

Uploaded
author big-tiandm
date Mon, 08 Dec 2014 01:51:16 -0500
parents 7b5a48b972e9
children
line wrap: on
line source

#!/usr/bin/perl -w
#Filename:
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2014-4-22
#Modified:
#Description: plant microRNA prediction
my $version=1.00;

use strict;
use Getopt::Long;
use threads;
#use threads::shared;
use File::Path;
use File::Basename;
#use RNA;
#use Term::ANSIColor;

my %opts;
GetOptions(\%opts,"i=s","fa=s","gfa=s","pre:s","mat:s","dis:i","flank:i","mfe:f","idx:s","mis:i","r:i","e:i","f:i","t:i","o:s","path:s","D","h");
if (!(defined $opts{i} and defined $opts{gfa}) || defined $opts{h}) { #necessary arguments
&usage;
}

my $time=&Time();
print "miPlant program start:\n   The time is $time!\n";
print "Command line:\n   $0 @ARGV\n";

my  $mypath=`pwd`;
chomp $mypath;

my $dir=defined $opts{'o'} ? $opts{'o'} : "$mypath/miRNA_out/";


unless ($dir=~/\/$/) {$dir.="/";}
if (not -d $dir) {
	mkdir $dir;
}
my $config=$opts{'i'};
my $data=$opts{'fa'};

my $scipt_path=defined $opts{'path'} ? $opts{'path'} : "/Users/big/galaxy-dist/tools/myTools/";

my $t=1; #threads number
if (defined $opts{'t'}) {$t=$opts{'t'};}

my $mis=0; #mismatch number for microRNA
if (defined $opts{'mis'}) {$mis=$opts{'mis'};}

my $hit=25; # maximum reads mapping hits in genome
if (defined $opts{'r'}) {$hit=$opts{'r'};}

my $upstream = 2; # microRNA 5' extension
$upstream = $opts{'e'} if(defined $opts{'e'});

my $downstream = 5;# microRNA 3' extension
$downstream = $opts{'f'} if(defined $opts{'f'});

my $maxd=defined $opts{'dis'} ? $opts{'dis'} : 200;
my $flank=defined $opts{'flank'} ? $opts{'flank'} :10;
my $mfe=defined $opts{'mfe'} ? $opts{'mfe'} : -20;

$time=&Time();
print "$time, Checking input file!\n";

my (@filein,@mark);
&read_config();

&checkfa($opts{pre}) if(defined $opts{pre});
&checkfa($opts{mat}) if(defined $opts{mat});
&checkfa($opts{gfa});

chdir $dir;
my $data2=$data;
my $known_result=$dir."known_miRNA_Express";
if(defined $opts{pre} and defined $opts{mat}){
	&quantify(); ### known microRAN quantify
	$data2=$known_result."/mirbase_not_mapped.fa";
}

my $genome_map=$dir."genome_match";
&genome($data2);

#my $genome_map=&search($dir,"genome_match_");
my $mapfile=$genome_map."/genome_mapped.bwt";
my $mapfa=$genome_map."/genome_mapped.fa";
my $unmap=$genome_map."/genome_not_mapped.fa";

#$time=Time();
#print "$time: Novel microRNA prediction!\n\n";

&predict($mapfa);

$time=Time();
print "$time: Program end!!\n";

############################## sub programs ###################################
sub predict{
	my ($file)=@_;
	$time=&Time();
	print "$time: Novel microRNA prediction!\n\n";
	my $predict=$dir."Novel_miRNA_predict";
	mkdir $predict;
	chdir $predict;
	system("perl $scipt_path/precursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe");
#	print "\nprecursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe\n";
	
	system("bowtie-build -f excised_precursor.fa excised_precursor");
#	print "\nbowtie-build -f excised_precursor.fa excised_precursor\n";
	
	system("bowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt 2> run.log");
#	print "\nbowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt\n";
	
	system("perl $scipt_path/convert_bowtie_to_blast.pl  precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst");
#	print "\nconvert_bowtie_to_blast.pl  precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst\n";

	system("sort -k 4  precursor_mapped.bst  > signatures.bst");
#	print "\nsort +3 -25  precursor_mapped.bst  > ../signatures.bst\n";

	chdir $dir;
	system("perl $scipt_path/miRDeep_plant.pl $predict/signatures.bst $predict/excised_precursor_struc.txt novel_tmp_dir -y > microRNA_prediction.mrd");
#	print "\nmiRDeep_plant.pl $dir/signatures.bst $predict/excised_precursor_struc.txt tmp_dir -y > microRNA_prediction.txt\n";
	#system("rm novel_tmp_dir -rf");
	my $tag=join "," ,@mark;
	system("perl $scipt_path/miRNA_Express_and_sequence.pl -i microRNA_prediction.mrd -list novel_microRNA_express.txt -fa novel_microRNA_mature.fa -pre novel_microRNA_precursor.fa -tag $tag");

	system("perl $scipt_path/non_miRNA_reads.pl -i microRNA_prediction.mrd -fa $file -o non_microRNA_sequence.fa");

}

sub genome{
	my ($file)=@_;
	if(defined $opts{'idx'}){
		system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} ") ;
#		print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} -time $time\n";
	}else{
		system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir ") ;
#		print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -time $time\n";
	}
}

sub quantify{
	my $tag=join "\\;" ,@mark;
	system("perl $scipt_path/quantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -mis $mis -t $t -e $upstream -f $downstream -tag $tag");
	print "\nquantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -mis $mis -t $t -e $upstream -f $downstream -tag $tag\n";
}

sub read_config{
	open CON,"<$config";
	while (my $aline=<CON>) {
		chomp $aline;
		my @tmp=split/\t/,$aline;
		push @filein,$tmp[0];
		push @mark,$tmp[1];
		#&check_rawdata($tmp[0]);
	}
	close CON;
	if (@filein != @mark) {
		#&printErr();
		die "Maybe config file have some wrong!!!\n";
	}
}
sub checkfa{
	my ($file_reads)=@_;
	open N,"<$file_reads";
	my $line=<N>;
	chomp $line;
    if($line !~ /^>\S+/){
        #printErr();
        die "The first line of file $file_reads does not start with '>identifier'
Reads file $file_reads is not a valid fasta file\n\n";
    }
    if(<N> !~ /^[ACGTNacgtn]*$/){
        #printErr();
        die "File $file_reads contains not allowed characters in sequences
Allowed characters are ACGTN
Reads file $file_reads is not a fasta file\n\n";
    }
	close N;
}
sub search{
	my ($dir,$str)=@_;
	opendir I,$dir;
	my @ret;
	while (my $file=readdir I) {
		if ($file=~/$str/) {
			push @ret, $file;
		}
	}
	closedir I;
	if (@ret != 1) {
		#&printErr();

		die "Can not find directory or file which name has string: $str !!!\n";
	}
	return $ret[0];
}


sub Time{
        my $time=time();
        my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];
        $month++;
        $year+=1900;
        if (length($sec) == 1) {$sec = "0"."$sec";}
        if (length($min) == 1) {$min = "0"."$min";}
        if (length($hour) == 1) {$hour = "0"."$hour";}
        if (length($day) == 1) {$day = "0"."$day";}
        if (length($month) == 1) {$month = "0"."$month";}
        #print "$year-$month-$day $hour:$min:$sec\n";
        return("$year-$month-$day $hour:$min:$sec");
}


sub usage{
print <<"USAGE";
Version $version
Usage:

$0 -i -fa -gfa -idx -pre -mat -mis -e -f  -t  -o  -path
options:
-i input files, # config

-fa ,#fasta sequence file

-path scirpt path

-gfa string,  input file # genome fasta. sequence file
-idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter
                string must be the prefix of the bowtie index. For instance, if
                the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
                the prefix is 'h_sapiens_37_asm'.##can be null

-pre string,  input file #species specific microRNA precursor sequences
-mat string,  input file #species specific microRNA mature sequences

-mis [int]     number of allowed mismatches when mapping reads to precursors, default 0
-e [int]     number of nucleotides upstream of the mature sequence to consider, default 2
-f [int]     number of nucleotides downstream of the mature sequence to consider, default 5
-r int       a read is allowed to map up to this number of positions in the genome,default is 25 

-dis <int>   Maximal space between miRNA and miRNA* (200)
-flank <int>   Flank sequence length of miRNA precursor (10)
-mfe <folat> Maximal free energy allowed for a miRNA precursor (-20)

-t int,    number of threads [1]

-o output directory# absolute path
-h help
USAGE
exit(1);
}