Mercurial > repos > big-tiandm > mirplant2
diff miRPlant.pl @ 34:915b62d31f9d draft
Uploaded
author | big-tiandm |
---|---|
date | Thu, 31 Jul 2014 03:08:00 -0400 |
parents | 554fbaf5f451 |
children | 0c4e11018934 |
line wrap: on
line diff
#!/usr/bin/perl -w
#Filename: miRPlant.pl
#Author: Tian Dongmei
#Email: tiandm@big.ac.cn
#Date: 2014-4-22
#Modified:
#Description: plant microRNA prediction
#
# Pipeline driver. Steps, in the order they run:
#   1. clip adapters from every input file (fastx_clipper), one thread per file
#   2. collapse reads into tags and filter the tags by length
#   3. quantify known microRNAs (quantify.pl)
#   4. optionally match the remaining reads against Rfam (rfam.pl)
#   5. align reads to the genome (matching.pl)
#   6. predict novel microRNAs (precursors.pl, bowtie, miRDeep_plant.pl)
#   7. build the HTML report (html.pl)
# Helper scripts are looked up in the directory given by -path.
my $version = '1.00';    # kept as a string so the usage text prints "1.00", not "1"

use strict;
use warnings;
use Getopt::Long;
use threads;
use threads::shared;
use File::Path;
use File::Basename;
use File::Spec;
#use RNA;
use Term::ANSIColor;

# GetOptions removes everything it parses from @ARGV, so keep a copy for the
# "Command line" log line below.
my @command_line = @ARGV;

my %opts;
GetOptions(\%opts,"i:s@","tag:s@","format=s","gfa=s","pre=s","mat=s","rfam:s","dis:i","flank:i","mfe:f","idx:s","idx2:s","mis:i","r:i","v:i","e:i","f:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","D","h");
if (!(defined $opts{i} and defined $opts{format} and defined $opts{gfa} and defined $opts{pre} and defined $opts{mat}) || defined $opts{h}) { #necessary arguments
    usage();
}

my $time = Time();
print "miPlant program start:\n The time is $time!\n";
print "Command line:\n $0 @command_line\n";

my $format = $opts{'format'};
if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") {
    printErr();
    die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n";
}

# Quality offset handed to fastx_clipper for FASTQ input.
# NOTE(review): only checkfq() ever changes this value, and checkfq() is
# reachable only through the disabled read_config(); as written every FASTQ
# input is clipped with -Q 64. Confirm whether input checking should be
# re-enabled.
my $phred_qv = 64;

# Every input file needs its own sample name: the names are used to build the
# per-sample output file names and are passed on to the helper scripts.
my @inputfiles = @{ $opts{'i'} };
my @inputtags  = defined $opts{'tag'} ? @{ $opts{'tag'} } : ();
if (@inputfiles != @inputtags) {
    printErr();
    die "Each input file (-i) needs exactly one sample name (-tag): got "
        . scalar(@inputfiles) . " file(s) and " . scalar(@inputtags) . " tag(s)\n";
}

my $mypath = `pwd`;
chomp $mypath;

# The script chdir()s into the output directory later on, so the directory is
# made absolute here; every path derived from it stays valid afterwards.
my $dir = defined $opts{'o'} ? $opts{'o'} : "$mypath/miRPlant_out/";
$dir = File::Spec->rel2abs($dir);
unless ($dir =~ /\/$/) { $dir .= "/"; }
if (not -d $dir) {
    mkpath($dir);    # creates missing parent directories; croaks on failure
}

# Record the input file / sample name pairs.
my $config = $dir . "/input_config";
open my $config_fh, '>', $config or die "Can not write $config: $!\n";
for my $i (0 .. $#inputfiles) {
    print {$config_fh} $inputfiles[$i], "\t", $inputtags[$i], "\n";
}
close $config_fh or die "Can not close $config: $!\n";

# Directory holding the helper scripts. If -path is not given and the
# built-in default does not exist on this machine, fall back to the directory
# this script lives in.
my $script_path = defined $opts{'path'} ? $opts{'path'} : "/Users/big/galaxy-dist/tools/myTools/";
if (!defined $opts{'path'} && !-d $script_path) {
    $script_path = File::Spec->rel2abs(dirname($0));
}

my $adapter_seq = "ATCTCGTATG";    #adapter
if (defined $opts{'a'}) { $adapter_seq = $opts{'a'}; }

my $min_adapter_match = 6;         #adapter minimum mapped nt
if (defined $opts{'M'}) { $min_adapter_match = $opts{'M'}; }

my $threads_n = 1;                 #threads number
if (defined $opts{'t'}) { $threads_n = $opts{'t'}; }
# A value below 1 would start no worker per batch and make the clipping loop
# below spin forever.
if ($threads_n < 1) { $threads_n = 1; }

my $min_nt = 19;                   # minimum reads length
if (defined $opts{'min'}) { $min_nt = $opts{'min'}; }

my $max_nt = 28;                   #maximum reads length
if (defined $opts{'max'}) { $max_nt = $opts{'max'}; }

my $mis = 0;                       #mismatch number for microRNA
if (defined $opts{'mis'}) { $mis = $opts{'mis'}; }

my $mis_rfam = 0;                  # mismatch number for rfam
if (defined $opts{'v'}) { $mis_rfam = $opts{'v'}; }

my $hit = 25;                      # maximum reads mapping hits in genome
if (defined $opts{'r'}) { $hit = $opts{'r'}; }

my $upstream = 2;                  # microRNA 5' extension
$upstream = $opts{'e'} if (defined $opts{'e'});

my $downstream = 5;                # microRNA 3' extension
$downstream = $opts{'f'} if (defined $opts{'f'});

my $maxd  = defined $opts{'dis'}   ? $opts{'dis'}   : 200;
my $flank = defined $opts{'flank'} ? $opts{'flank'} : 10;
my $mfe   = defined $opts{'mfe'}   ? $opts{'mfe'}   : -20;

$time = Time();
print "$time, Checking input file!\n";

my (@filein, @mark, @clean);
#&read_config();
@filein = @inputfiles;
@mark   = @inputtags;

# Only the reference files are checked; see the note on $phred_qv above.
checkfa($opts{pre});
checkfa($opts{mat});
checkfa($opts{gfa});


##### clip adpter --> clean data start
$time = Time();
print "$time, Preprocess:\n trim adapter, reads collapse and filter reads by length.\n";

$time = dash_time($time);
my $preprocess = $dir . "preProcess_${time}/";
mkpath($preprocess) unless -d $preprocess;

my $can_use_threads = eval 'use threads; 1';
if ($can_use_threads) {
    # Do processing using threads: at most $threads_n files are clipped at a
    # time, and each batch is joined before the next one starts.
    print "Do processing using threads\n";
    my @pending_files = @filein;
    my @pending_tags  = @mark;
    while (@pending_files > 0) {
        my @workers;
        for (my $i = 0; $i < $threads_n; $i++) {
            last if (@pending_files == 0);
            my $in  = shift @pending_files;
            my $out = shift @pending_tags;
            push @clean, $preprocess . $out . "_clips_adapter.fq";
            my $worker = threads->create(\&clips, $in, $out);
            push @workers, $worker;
        }
        for my $worker (@workers) {
            $worker->join();
        }
    }
}
else {
    # Do not processing using threads
    print "Do not processing using threads\n";
    for my $i (0 .. $#filein) {
        my $in  = $filein[$i];
        my $out = $mark[$i];
        push @clean, $preprocess . $out . "_clips_adapter.fq";
        clips($in, $out);
    }
}

##### clip adpter --> clean data end

my $collapsed = $preprocess . "collapse_reads.fa";
my $data      = $preprocess . "collapse_reads_${min_nt}_${max_nt}.fa";    ## raw clean data
my $data2;    ### mirbase not mapped reads
my $data3;    ### rfam not mapped reads
collapse(\@clean, $collapsed);    #collapse reads to tags

filterbylength();                 # filter <$min_nt && >$max_nt

print "The final clean data file is $data, only contains reads which length is among ${min_nt}~${max_nt}\n\n";

$time = Time();
print "$time: known microRNA quantify!\n\n";

chdir $dir or die "Can not change into $dir: $!\n";

$time = dash_time($time);
my $known_result = $dir . "miRNA_Express_${time}/";
quantify();    ### known microRAN quantify


#my $miR_exp_dir=&search($known_result,"miRNA_Express_");
$data2 = $known_result . "/mirbase_not_mapped.fa";

# path.txt lists, one per line, the config file and every result directory;
# it is the input of html.pl.
my $pathfile = "$dir/path.txt";
open my $path_fh, '>', $pathfile or die "Can not write $pathfile: $!\n";
print {$path_fh} "$config\n";
print {$path_fh} "$preprocess\n";
print {$path_fh} "$known_result\n";

if (defined $opts{'rfam'}) {    #rfam mapping and analysis
    $time = Time();
    print "$time: RNA annotate!\n\n";
    $time = dash_time($time);
    my $rfam_exp_dir = $dir . "rfam_match_${time}";
    rfam();
    #my $rfam_exp_dir=&search($dir,"rfam_match_");
    $data3 = $rfam_exp_dir . "/rfam_not_mapped.fa";
    print {$path_fh} "$rfam_exp_dir\n";

    my $tag = join ";", @mark;
    run_cmd("perl", "$script_path/count_rfam_express.pl",
        "-i", "$rfam_exp_dir/rfam_mapped.bwt",
        "-tag", $tag,
        "-o", "rfam_non-miRNA_annotation.txt");
}

# Reads handed to genome mapping: with -D the Rfam-unmapped reads, otherwise
# the miRBase-unmapped reads. -D only makes sense together with -rfam; without
# it there is no Rfam result to use.
my $data4 = $data2;
if (defined $opts{'D'}) {    #genome mapping
    if (defined $data3) {
        $data4 = $data3;
    }
    else {
        print STDERR "Warning: -D needs -rfam; -D is ignored and the miRBase-unmapped reads are used\n";
    }
}

$time = Time();
print "$time: Genome alignment!\n\n";
$time = dash_time($time);
my $genome_map = $dir . "genome_match_${time}";
genome($data4);
print {$path_fh} "$genome_map\n";
#my $genome_map=&search($dir,"genome_match_");
my $mapfile = $genome_map . "/genome_mapped.bwt";
my $mapfa   = $genome_map . "/genome_mapped.fa";
my $unmap   = $genome_map . "/genome_not_mapped.fa";    # not used by this script

#$time=Time();
#print "$time: Novel microRNA prediction!\n\n";

predict($mapfa);

close $path_fh or die "Can not close $pathfile: $!\n";
run_cmd("perl", "$script_path/html.pl", "-i", $pathfile, "-format", $format, "-o", "$dir/result.html");

$time = Time();
print "$time: Program end!!\n";

############################## sub programs ###################################

# Run an external command and report a failure on STDERR without aborting the
# pipeline (the pipeline has always continued after a failed step).
# One argument : the string is handed to the shell (needed for ">" redirection).
# Several args : the program is executed directly, so file names containing
#                spaces or shell metacharacters are passed through unchanged.
# Returns the raw status value of system().
sub run_cmd {
    my @cmd = @_;
    my $status = system(@cmd);
    if ($status != 0) {
        print STDERR "Warning: command failed (status $status): @cmd\n";
    }
    return $status;
}

# Predict novel microRNAs from the genome-mapped reads in $file.
# Works inside a fresh miRNA_predict_<time> directory, whose path is appended
# to path.txt, and writes the final tables into the output directory.
sub predict {
    my ($file) = @_;
    $time = Time();
    print "$time: Novel microRNA prediction!\n\n";
    $time = dash_time($time);
    my $predict = $dir . "miRNA_predict_${time}";
    print {$path_fh} "$predict\n";
    mkpath($predict) unless -d $predict;
    chdir $predict or die "Can not change into $predict: $!\n";

    run_cmd("perl", "$script_path/precursors.pl",
        "-map", $mapfile,
        "-g", $opts{gfa},
        "-d", $maxd,
        "-f", $flank,
        "-o", "$predict/excised_precursor.fa",
        "-s", "$predict/excised_precursor_struc.txt",
        "-e", $mfe);

    run_cmd("bowtie-build", "-f", "excised_precursor.fa", "excised_precursor");

    # The next commands redirect their output, so they go through the shell.
    run_cmd("bowtie -v $mis -f -p $threads_n -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt");

    run_cmd("perl $script_path/convert_bowtie_to_blast.pl precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst");

    # "-k 4,25" is the current spelling of the obsolete zero-based key
    # "+3 -25", which present-day sort implementations reject.
    run_cmd("sort -k 4,25 precursor_mapped.bst > signatures.bst");

    chdir $dir or die "Can not change into $dir: $!\n";
    run_cmd("perl $script_path/miRDeep_plant.pl $predict/signatures.bst $predict/excised_precursor_struc.txt novel_tmp_dir -y > microRNA_prediction.mrd");

    # Remove the scratch directory without depending on the option order
    # accepted by the local rm.
    rmtree("novel_tmp_dir") if -e "novel_tmp_dir";

    my $tag = join ",", @mark;
    run_cmd("perl", "$script_path/miRNA_Express_and_sequence.pl",
        "-i", "microRNA_prediction.mrd",
        "-list", "novel_microRNA_express.txt",
        "-fa", "novel_microRNA_mature.fa",
        "-pre", "novel_microRNA_precursor.fa",
        "-tag", $tag);
    return;
}

# Align the reads in $file to the genome with matching.pl; a prebuilt index
# prefix (-idx) is passed on when given.
sub genome {
    my ($file) = @_;
    my @cmd = ("perl", "$script_path/matching.pl",
        "-i", $file,
        "-g", $opts{gfa},
        "-v", $mis,
        "-p", $threads_n,
        "-r", $hit,
        "-o", $dir);
    push @cmd, "-index", $opts{idx} if (defined $opts{'idx'});
    push @cmd, "-time", $time;
    run_cmd(@cmd);
    return;
}

# Match the miRBase-unmapped reads ($data2) against the Rfam file with
# rfam.pl; a prebuilt index prefix (-idx2) is passed on when given.
sub rfam {
    my @cmd = ("perl", "$script_path/rfam.pl",
        "-i", $data2,
        "-ref", $opts{rfam},
        "-v", $mis_rfam,
        "-p", $threads_n,
        "-o", $dir);
    push @cmd, "-index", $opts{idx2} if (defined $opts{'idx2'});
    push @cmd, "-time", $time;
    run_cmd(@cmd);
    return;
}

# Quantify known microRNAs in the length-filtered reads with quantify.pl.
sub quantify {
    my $tag = join ";", @mark;
    run_cmd("perl", "$script_path/quantify.pl",
        "-p", $opts{pre},
        "-m", $opts{mat},
        "-r", $data,
        "-o", $dir,
        "-time", $time,
        "-mis", $mis,
        "-t", $threads_n,
        "-e", $upstream,
        "-f", $downstream,
        "-tag", $tag);
    return;
}

# Keep only collapsed tags whose length lies between $min_nt and $max_nt.
sub filterbylength {
    my $tmpmark = join ",", @mark;
    run_cmd("perl", "$script_path/filterReadsByLength.pl",
        "-i", $collapsed,
        "-o", $data,
        "-min", $min_nt,
        "-max", $max_nt,
        "-mark", $tmpmark);
    return;
}

# Collapse the clipped read files into one tag file.
#   $ins  - reference to the list of clipped read files
#   $data - output file
sub collapse {
    my ($ins, $data) = @_;
    my @input_args = map { ("-i", $_) } @{$ins};
    run_cmd("perl", "$script_path/collapseReads2Tags.pl", @input_args,
        "-mark", "seq",
        "-o", $data,
        "-format", $format);
    return;
}

# Clip the adapter from one input file with fastx_clipper.
#   $in  - input reads file
#   $out - sample name; the result is <preprocess dir>/<sample>_clips_adapter.fq
# The quality offset (-Q) is only passed for FASTQ input.
sub clips {
    my ($in, $out) = @_;
    my $adapter = $preprocess . $out . "_clips_adapter.fq";
    if ($format eq "fq" || $format eq "fastq") {
        run_cmd("fastx_clipper", "-a", $adapter_seq, "-M", $min_adapter_match, "-Q", $phred_qv, "-i", $in, "-o", $adapter);
        print "\nfastx_clipper -a $adapter_seq -M $min_adapter_match -Q $phred_qv -i $in -o $adapter\n";
    }
    if ($format eq "fa" || $format eq "fasta") {
        run_cmd("fastx_clipper", "-a", $adapter_seq, "-M", $min_adapter_match, "-i", $in, "-o", $adapter);
    }
    #my $clean=$preprocess.$out."_clean.fq";
    #system("filterReadsByLength.pl -i $adapter -o $clean -min $min_nt -max $max_nt ");

    return;
}

# Read "file<TAB>sample name" pairs from the config file into @filein/@mark
# and validate every listed file. Currently not called (inputs come from
# -i/-tag).
sub read_config {
    open my $con_fh, '<', $config or die "Can not read $config: $!\n";
    while (my $aline = <$con_fh>) {
        chomp $aline;
        my @tmp = split /\t/, $aline;
        push @filein, $tmp[0];
        push @mark,   $tmp[1];
        check_rawdata($tmp[0]);
    }
    close $con_fh;
    if (@filein != @mark) {
        printErr();
        die "Maybe config file have some wrong!!!\n";
    }
    return;
}

# Die unless $fileforcheck exists, is non-empty and looks like a file of the
# format selected with -format.
sub check_rawdata {
    my ($fileforcheck) = @_;
    if (!(-s $fileforcheck)) {
        printErr();
        die "Can not find $fileforcheck, or file is empty!!!\n";
    }
    if ($format eq "fasta" || $format eq "fa") {
        checkfa($fileforcheck);
    }
    if ($format eq "fastq" || $format eq "fq") {
        checkfq($fileforcheck);
    }
    return;
}

# Light FASTA sanity check: the first line must be a ">identifier" header and
# the second line may only contain A, C, G, T or N (either case). Dies with
# an explanatory message otherwise. Only the first two lines are inspected.
sub checkfa {
    my ($file_reads) = @_;
    open my $fa_fh, '<', $file_reads or do {
        printErr();
        die "Can not open $file_reads: $!\n";
    };
    my $header   = <$fa_fh>;
    my $sequence = <$fa_fh>;
    close $fa_fh;

    # An empty file fails the header test; a header-only file passes the
    # sequence test, as an empty sequence line would.
    $header   = '' unless defined $header;
    $sequence = '' unless defined $sequence;
    chomp $header;

    if ($header !~ /^>\S+/) {
        printErr();
        die "The first line of file $file_reads does not start with '>identifier'\nReads file $file_reads is not a valid fasta file\n\n";
    }
    if ($sequence !~ /^[ACGTNacgtn]*$/) {
        printErr();
        die "File $file_reads contains not allowed characters in sequences\nAllowed characters are ACGTN\nReads file $file_reads is not a fasta file\n\n";
    }
    return;
}

# FASTQ sanity check on up to the first 10 records; dies on the first
# malformed record. Side effect: switches $phred_qv to 33 as soon as a
# quality character with a code below 64 is seen.
sub checkfq {
    my ($file_reads) = @_;

    open my $fq_fh, '<', $file_reads or do {
        printErr();
        die "Can not open $file_reads: $!\n";
    };
    for (my $i = 0; $i < 10; $i++) {
        my $header = <$fq_fh>;
        # Files with fewer than 10 records are fine; only a file without any
        # record falls through to the header test below.
        last if (!defined $header && $i > 0);
        my $seq  = <$fq_fh>;
        my $plus = <$fq_fh>;
        my $qual = <$fq_fh>;
        if (!defined $header || !defined $seq || !defined $plus || !defined $qual) {
            printErr();
            die "$file_reads is not a fastq file\n\n";
        }
        chomp $header;
        chomp $seq;
        chomp $plus;
        chomp $qual;
        if ($header !~ /^\@/) {
            printErr();
            die "$file_reads is not a fastq file\n\n";
        }
        if ($seq !~ /^[ACGTNacgtn]*$/) {
            printErr();
            die "File $file_reads contains not allowed characters in sequences\nAllowed characters are ACGTN\nReads file $file_reads is not a fasta file\n\n";
        }
        if ($plus !~ /^\@/ && $plus !~ /^\+/) {
            printErr();
            die "$file_reads is not a fastq file\n\n";
        }
        if ((length $seq) != (length $qual)) {
            printErr();
            die "$file_reads is not a fastq file\n\n";
        }
        for my $quality_char (split //, $qual) {
            if (ord($quality_char) - 64 < 0) { $phred_qv = 33; }
        }
    }
    close $fq_fh;
    return;
}

# Return the single entry of directory $dir whose name matches $str (used as
# a regular expression); die when there is not exactly one match.
# Currently not called.
sub search {
    my ($dir, $str) = @_;
    opendir my $dir_handle, $dir or do {
        printErr();
        die "Can not open directory $dir: $!\n";
    };
    my @ret;
    while (my $file = readdir $dir_handle) {
        if ($file =~ /$str/) {
            push @ret, $file;
        }
    }
    closedir $dir_handle;
    if (@ret != 1) {
        printErr();

        die "Can not find directory or file which name has string: $str !!!\n";
    }
    return $ret[0];
}

# Print a bold red "Error: " prefix on STDERR and reset the colour.
sub printErr {
    print STDERR color 'bold red';
    print STDERR "Error: ";
    print STDERR color 'reset';
    return;
}

# Current local time as "YYYY-MM-DD hh:mm:ss".
sub Time {
    my ($sec, $min, $hour, $day, $month, $year) = (localtime(time()))[0 .. 5];
    return sprintf("%d-%02d-%02d %02d:%02d:%02d", $year + 1900, $month + 1, $day, $hour, $min, $sec);
}

# Turn a Time() stamp into a form usable in file names: ':' and ' ' become
# '-', giving "YYYY-MM-DD-hh-mm-ss".
sub dash_time {
    my ($stamp) = @_;
    $stamp =~ tr/: /--/;
    return $stamp;
}


# Print the help text and exit with status 1.
sub usage {
print <<"USAGE";
Version $version
Usage:
$0 -i -tag -format -gfa -idx -pre -mat -rfam -idx2 -D -a -M -min -max -mis -e -f -v -r -dis -flank -mfe -t -o -path
options:
-i      string, input reads file; repeat -i for every sample
-tag    string, sample name (mark) of an input file; repeat -tag once per -i, in the same order

-format string,#specific input rawdata file format : fastq|fq|fasta|fa

-gfa    string, input file # genome fasta. sequence file
-idx    string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter
        string must be the prefix of the bowtie index. For instance, if
        the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
        the prefix is 'h_sapiens_37_asm'.##can be null

-pre    string, input file #species specific microRNA precursor sequences
-mat    string, input file #species specific microRNA mature sequences

-rfam   string, input file# rfam database file, microRNAs must not be contained in this file## if not define, rfam small RNA will not be count.
-idx2   string, rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter
        string must be the prefix of the bowtie index. For instance, if
        the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
        the prefix is 'h_sapiens_37_asm'.##can be null

-D      If [-D] is specified,will discard rfam mapped reads(need -rfam).

-a      string, ADAPTER string. default is ATCTCGTATG.
-M      int, require minimum adapter alignment length of N. If less than N nucleotides aligned with the adapter - don't clip it.
-min    int, reads min length,default is 19.
-max    int, reads max length,default is 28.

-mis    [int] number of allowed mismatches when mapping reads to precursors, default 0
-e      [int] number of nucleotides upstream of the mature sequence to consider, default 2
-f      [int] number of nucleotides downstream of the mature sequence to consider, default 5
-v      <int> report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment
-r      int a read is allowed to map up to this number of positions in the genome,default is 25

-dis    <int> Maximal space between miRNA and miRNA* (200)
-flank  <int> Flank sequence length of miRNA precursor (10)
-mfe    <float> Maximal free energy allowed for a miRNA precursor (-20)

-t      int, number of threads [1]

-o      output directory# absolute path, default is ./miRPlant_out/
-path   string, directory containing the helper scripts
-h      help
USAGE
exit(1);
}