Mercurial > repos > big-tiandm > mirplant2
view microRNA.pl @ 50:7b5a48b972e9 draft
Uploaded
author | big-tiandm |
---|---|
date | Fri, 05 Dec 2014 00:11:02 -0500 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl -w #Filename: #Author: Tian Dongmei #Email: tiandm@big.ac.cn #Date: 2014-4-22 #Modified: #Description: plant microRNA prediction my $version=1.00; use strict; use Getopt::Long; use threads; #use threads::shared; use File::Path; use File::Basename; #use RNA; #use Term::ANSIColor; my %opts; GetOptions(\%opts,"i=s","fa=s","gfa=s","pre:s","mat:s","dis:i","flank:i","mfe:f","idx:s","mis:i","r:i","e:i","f:i","t:i","o:s","path:s","D","h"); if (!(defined $opts{i} and defined $opts{gfa}) || defined $opts{h}) { #necessary arguments &usage; } my $time=&Time(); print "miPlant program start:\n The time is $time!\n"; print "Command line:\n $0 @ARGV\n"; my $mypath=`pwd`; chomp $mypath; my $dir=defined $opts{'o'} ? $opts{'o'} : "$mypath/miRNA_out/"; unless ($dir=~/\/$/) {$dir.="/";} if (not -d $dir) { mkdir $dir; } my $config=$opts{'i'}; my $data=$opts{'fa'}; my $scipt_path=defined $opts{'path'} ? $opts{'path'} : "/Users/big/galaxy-dist/tools/myTools/"; my $t=1; #threads number if (defined $opts{'t'}) {$t=$opts{'t'};} my $mis=0; #mismatch number for microRNA if (defined $opts{'mis'}) {$mis=$opts{'mis'};} my $hit=25; # maximum reads mapping hits in genome if (defined $opts{'r'}) {$hit=$opts{'r'};} my $upstream = 2; # microRNA 5' extension $upstream = $opts{'e'} if(defined $opts{'e'}); my $downstream = 5;# microRNA 3' extension $downstream = $opts{'f'} if(defined $opts{'f'}); my $maxd=defined $opts{'dis'} ? $opts{'dis'} : 200; my $flank=defined $opts{'flank'} ? $opts{'flank'} :10; my $mfe=defined $opts{'mfe'} ? $opts{'mfe'} : -20; $time=&Time(); print "$time, Checking input file!\n"; my (@filein,@mark); &read_config(); &checkfa($opts{pre}) if(defined $opts{pre}); &checkfa($opts{mat}) if(defined $opts{mat}); &checkfa($opts{gfa}); chdir $dir; my $data2=$data; my $known_result=$dir."known_miRNA_Express"; if(defined $opts{pre} and defined $opts{mat}){ &quantify(); ### known microRAN quantify $data2=$known_result."/mirbase_not_mapped.fa"; } my $genome_map=$dir."genome_match"; &genome($data2); #my $genome_map=&search($dir,"genome_match_"); my $mapfile=$genome_map."/genome_mapped.bwt"; my $mapfa=$genome_map."/genome_mapped.fa"; my $unmap=$genome_map."/genome_not_mapped.fa"; #$time=Time(); #print "$time: Novel microRNA prediction!\n\n"; &predict($mapfa); $time=Time(); print "$time: Program end!!\n"; ############################## sub programs ################################### sub predict{ my ($file)=@_; $time=&Time(); print "$time: Novel microRNA prediction!\n\n"; my $predict=$dir."Novel_miRNA_predict"; mkdir $predict; chdir $predict; system("perl $scipt_path/precursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe"); # print "\nprecursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe\n"; system("bowtie-build -f excised_precursor.fa excised_precursor"); # print "\nbowtie-build -f excised_precursor.fa excised_precursor\n"; system("bowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt 2> run.log"); # print "\nbowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt\n"; system("perl $scipt_path/convert_bowtie_to_blast.pl precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst"); # print "\nconvert_bowtie_to_blast.pl precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst\n"; system("sort -k 4 precursor_mapped.bst > signatures.bst"); # print "\nsort +3 -25 precursor_mapped.bst > ../signatures.bst\n"; chdir $dir; system("perl $scipt_path/miRDeep_plant.pl $predict/signatures.bst $predict/excised_precursor_struc.txt novel_tmp_dir -y > microRNA_prediction.mrd"); # print "\nmiRDeep_plant.pl $dir/signatures.bst $predict/excised_precursor_struc.txt tmp_dir -y > microRNA_prediction.txt\n"; #system("rm novel_tmp_dir -rf"); my $tag=join "," ,@mark; system("perl $scipt_path/miRNA_Express_and_sequence.pl -i microRNA_prediction.mrd -list novel_microRNA_express.txt -fa novel_microRNA_mature.fa -pre novel_microRNA_precursor.fa -tag $tag"); system("perl $scipt_path/non_miRNA_reads.pl -i microRNA_prediction.mrd -fa $file -o non_microRNA_sequence.fa"); } sub genome{ my ($file)=@_; if(defined $opts{'idx'}){ system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} ") ; # print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} -time $time\n"; }else{ system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir ") ; # print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -time $time\n"; } } sub quantify{ my $tag=join "\\;" ,@mark; system("perl $scipt_path/quantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -mis $mis -t $t -e $upstream -f $downstream -tag $tag"); print "\nquantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -mis $mis -t $t -e $upstream -f $downstream -tag $tag\n"; } sub read_config{ open CON,"<$config"; while (my $aline=<CON>) { chomp $aline; my @tmp=split/\t/,$aline; push @filein,$tmp[0]; push @mark,$tmp[1]; #&check_rawdata($tmp[0]); } close CON; if (@filein != @mark) { #&printErr(); die "Maybe config file have some wrong!!!\n"; } } sub checkfa{ my ($file_reads)=@_; open N,"<$file_reads"; my $line=<N>; chomp $line; if($line !~ /^>\S+/){ #printErr(); die "The first line of file $file_reads does not start with '>identifier' Reads file $file_reads is not a valid fasta file\n\n"; } if(<N> !~ /^[ACGTNacgtn]*$/){ #printErr(); die "File $file_reads contains not allowed characters in sequences Allowed characters are ACGTN Reads file $file_reads is not a fasta file\n\n"; } close N; } sub search{ my ($dir,$str)=@_; opendir I,$dir; my @ret; while (my $file=readdir I) { if ($file=~/$str/) { push @ret, $file; } } closedir I; if (@ret != 1) { #&printErr(); die "Can not find directory or file which name has string: $str !!!\n"; } return $ret[0]; } sub Time{ my $time=time(); my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; $month++; $year+=1900; if (length($sec) == 1) {$sec = "0"."$sec";} if (length($min) == 1) {$min = "0"."$min";} if (length($hour) == 1) {$hour = "0"."$hour";} if (length($day) == 1) {$day = "0"."$day";} if (length($month) == 1) {$month = "0"."$month";} #print "$year-$month-$day $hour:$min:$sec\n"; return("$year-$month-$day $hour:$min:$sec"); } sub usage{ print <<"USAGE"; Version $version Usage: $0 -i -fa -gfa -idx -pre -mat -mis -e -f -t -o -path options: -i input files, # config -fa ,#fasta sequence file -path scirpt path -gfa string, input file # genome fasta. sequence file -idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter string must be the prefix of the bowtie index. For instance, if the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then the prefix is 'h_sapiens_37_asm'.##can be null -pre string, input file #species specific microRNA precursor sequences -mat string, input file #species specific microRNA mature sequences -mis [int] number of allowed mismatches when mapping reads to precursors, default 0 -e [int] number of nucleotides upstream of the mature sequence to consider, default 2 -f [int] number of nucleotides downstream of the mature sequence to consider, default 5 -r int a read is allowed to map up to this number of positions in the genome,default is 25 -dis <int> Maximal space between miRNA and miRNA* (200) -flank <int> Flank sequence length of miRNA precursor (10) -mfe <folat> Maximal free energy allowed for a miRNA precursor (-20) -t int, number of threads [1] -o output directory# absolute path -h help USAGE exit(1); }