# HG changeset patch # User big-tiandm # Date 1406282240 14400 # Node ID c0aecae5eb03359b9178c6dbd556f2bed09c24ff # Parent b963c4a161d25ef438ccc97bd3f6739dcd8b7a5f Deleted selected files diff -r b963c4a161d2 -r c0aecae5eb03 miRPlant.pl --- a/miRPlant.pl Fri Jul 25 05:57:15 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,503 +0,0 @@ -#!/usr/bin/perl -w -#Filename: -#Author: Tian Dongmei -#Email: tiandm@big.ac.cn -#Date: 2014-4-22 -#Modified: -#Description: plant microRNA prediction -my $version=1.00; - -use strict; -use Getopt::Long; -use threads; -use threads::shared; -use File::Path; -use File::Basename; -#use RNA; -use Term::ANSIColor; - -my %opts; -GetOptions(\%opts,"i:s@","tag:s@","format=s","gfa=s","pre=s","mat=s","rfam:s","dis:i","flank:i","mfe:f","idx:s","idx2:s","mis:i","r:i","v:i","e:i","f:i","a:s","M:i","t:i","min:i","max:i","o:s","path:s","D","h"); -if (!(defined $opts{i} and defined $opts{format} and defined $opts{gfa} and defined $opts{pre} and defined $opts{mat}) || defined $opts{h}) { #necessary arguments -&usage; -} - -my $time=&Time(); -print "miPlant program start:\n The time is $time!\n"; -print "Command line:\n $0 @ARGV\n"; - -my $format=$opts{'format'}; -if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { - &printErr(); - die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n"; -} - -my $phred_qv=64; - - -my @inputfiles=@{$opts{'i'}}; -my @inputtags=@{$opts{'tag'}}; - -my $mypath=`pwd`; -chomp $mypath; - -my $dir=defined $opts{'o'} ? $opts{'o'} : "$mypath/miRPlant_out/"; - - -unless ($dir=~/\/$/) {$dir.="/";} -if (not -d $dir) { - mkdir $dir; -} -my $config=$dir."/input_config"; -open CONFIG,">$config"; - for (my $i=0;$i<@inputfiles;$i++) { - print CONFIG $inputfiles[$i],"\t",$inputtags[$i],"\n"; - } -close CONFIG; - -my $scipt_path=defined $opts{'path'} ? $opts{'path'} : "/Users/big/galaxy-dist/tools/myTools/"; - -my $a="ATCTCGTATG"; #adapter -if (defined $opts{'a'}) {$a=$opts{'a'};} - -my $m=6; #adapter minimum mapped nt -if (defined $opts{'M'}) {$m=$opts{'M'};} - -my $t=1; #threads number -if (defined $opts{'t'}) {$t=$opts{'t'};} - -my $min_nt=19; # minimum reads length -if (defined $opts{'min'}) {$min_nt=$opts{'min'};} - -my $max_nt=28; #maximum reads length -if (defined $opts{'max'}) {$max_nt=$opts{'max'};} - -my $mis=0; #mismatch number for microRNA -if (defined $opts{'mis'}) {$mis=$opts{'mis'};} - -my $mis_rfam=0;# mismatch number for rfam -if (defined $opts{'v'}) {$mis_rfam=$opts{'v'};} - -my $hit=25; # maximum reads mapping hits in genome -if (defined $opts{'r'}) {$hit=$opts{'r'};} - -my $upstream = 2; # microRNA 5' extension -$upstream = $opts{'e'} if(defined $opts{'e'}); - -my $downstream = 5;# microRNA 3' extension -$downstream = $opts{'f'} if(defined $opts{'f'}); - -my $maxd=defined $opts{'dis'} ? $opts{'dis'} : 200; -my $flank=defined $opts{'flank'} ? $opts{'flank'} :10; -my $mfe=defined $opts{'mfe'} ? $opts{'mfe'} : -20; - -$time=&Time(); -print "$time, Checking input file!\n"; - -my (@filein,@mark,@clean); -#&read_config(); -@filein=@inputfiles; -@mark=@inputtags; - -&checkfa($opts{pre}); -&checkfa($opts{mat}); -&checkfa($opts{gfa}); - - -##### clip adpter --> clean data start -$time=&Time(); -print "$time, Preprocess:\n trim adapter, reads collapse and filter reads by length.\n"; - -$time=~s/:/-/g; -$time=~s/ /-/g; -my $preprocess=$dir."preProcess_${time}/"; -mkdir $preprocess; -my $can_use_threads = eval 'use threads; 1'; -if ($can_use_threads) { -# Do processing using threads - print "Do processing using threads\n"; - my @filein1=@filein; my @mark1=@mark; - while (@filein1>0) { - my @thrs; my @res; - for (my $i=0;$i<$t ;$i++) { - last if(@filein1==0); - my $in=shift @filein1; - my $out=shift @mark1; - push @clean,$preprocess.$out."_clips_adapter.fq"; - $thrs[$i]=threads->create(\&clips,$in,$out); - } - for (my $i=0;$i<@thrs;$i++) { - $res[$i]=$thrs[$i]->join(); - } - } -} else { -# Do not processing using threads - print "Do not processing using threads\n"; - for (my $i=0;$i<@filein ;$i++) { - my $in=$filein[$i]; - my $out=$mark[$i]; - push @clean,$preprocess.$out."_clips_adapter.fq"; - &clips($in,$out); - } -} - -##### clip adpter --> clean data end - -my $collapsed=$preprocess."collapse_reads.fa"; -my $data=$preprocess."collapse_reads_${min_nt}_${max_nt}.fa"; ## raw clean data -my $data2; ### mirbase not mapped reads -my $data3; ### rfam not mapped reads -&collapse(\@clean,$collapsed); #collapse reads to tags - -&filterbylength(); # filter <$min_nt && >$max_nt - -print "The final clean data file is $data, only contains reads which length is among $min_nt\~$max_nt\n\n"; - -$time=Time(); -print "$time: known microRNA quantify!\n\n"; - -chdir $dir; - -$time=~s/:/-/g; -$time=~s/ /-/g; -my $known_result=$dir."miRNA_Express_${time}/"; -&quantify(); ### known microRAN quantify - - -#my $miR_exp_dir=&search($known_result,"miRNA_Express_"); -$data2=$known_result."/mirbase_not_mapped.fa"; - -my $pathfile="$dir/path.txt"; -open PA,">$pathfile"; -print PA "$config\n"; -print PA "$preprocess\n"; -print PA "$known_result\n"; - -if (defined $opts{'rfam'}) { #rfam mapping and analysis - $time=Time(); - print "$time: RNA annotate!\n\n"; - $time=~s/:/-/g; - $time=~s/ /-/g; - my $rfam_exp_dir=$dir."rfam_match_${time}"; - &rfam(); - #my $rfam_exp_dir=&search($dir,"rfam_match_"); - $data3=$rfam_exp_dir."/rfam_not_mapped.fa"; -print PA "$rfam_exp_dir\n"; - - my $tag=join "\\;" ,@mark; - system("perl $scipt_path/count_rfam_express.pl -i $rfam_exp_dir/rfam_mapped.bwt -tag $tag -o rfam_non-miRNA_annotation.txt"); -} - -my $data4=$data; -if (defined $opts{'D'}) { #genome mapping - $data4=$data3; -}else{ - $data4=$data2; -} - -$time=Time(); -print "$time: Genome alignment!\n\n"; -$time=~s/:/-/g; -$time=~s/ /-/g; -my $genome_map=$dir."genome_match_${time}"; -&genome($data4); -print PA "$genome_map\n"; -#my $genome_map=&search($dir,"genome_match_"); -my $mapfile=$genome_map."/genome_mapped.bwt"; -my $mapfa=$genome_map."/genome_mapped.fa"; -my $unmap=$genome_map."/genome_not_mapped.fa"; - -#$time=Time(); -#print "$time: Novel microRNA prediction!\n\n"; - -&predict($mapfa); - -close PA; -system("perl $scipt_path/html.pl -i $pathfile -format $format -o $dir/result.html"); - -$time=Time(); -print "$time: Program end!!\n"; - -############################## sub programs ################################### -sub predict{ - my ($file)=@_; - $time=&Time(); - print "$time: Novel microRNA prediction!\n\n"; - $time=~s/:/-/g; - $time=~s/ /-/g; - my $predict=$dir."miRNA_predict_${time}"; -print PA "$predict\n"; - mkdir $predict; - chdir $predict; - system("perl $scipt_path/precursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe"); -# print "\nprecursors.pl -map $mapfile -g $opts{gfa} -d $maxd -f $flank -o $predict/excised_precursor.fa -s $predict/excised_precursor_struc.txt -e $mfe\n"; - - system("bowtie-build -f excised_precursor.fa excised_precursor"); -# print "\nbowtie-build -f excised_precursor.fa excised_precursor\n"; - - system("bowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt"); -# print "\nbowtie -v $mis -f -p $t -m $hit -a --best --strata excised_precursor $file > precursor_mapped.bwt\n"; - - system("perl $scipt_path/convert_bowtie_to_blast.pl precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst"); -# print "\nconvert_bowtie_to_blast.pl precursor_mapped.bwt $file excised_precursor.fa > precursor_mapped.bst\n"; - - system("sort +3 -25 precursor_mapped.bst > signatures.bst"); -# print "\nsort +3 -25 precursor_mapped.bst > ../signatures.bst\n"; - - chdir $dir; - system("perl $scipt_path/miRDeep_plant.pl $predict/signatures.bst $predict/excised_precursor_struc.txt novel_tmp_dir -y > microRNA_prediction.mrd"); -# print "\nmiRDeep_plant.pl $dir/signatures.bst $predict/excised_precursor_struc.txt tmp_dir -y > microRNA_prediction.txt\n"; - system("rm novel_tmp_dir -rf"); - my $tag=join "," ,@mark; - system("perl $scipt_path/miRNA_Express_and_sequence.pl -i microRNA_prediction.mrd -list novel_microRNA_express.txt -fa novel_microRNA_mature.fa -pre novel_microRNA_precursor.fa -tag $tag"); -} - -sub genome{ - my ($file)=@_; - if(defined $opts{'idx'}){ - system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} -time $time") ; -# print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -index $opts{idx} -time $time\n"; - }else{ - system("perl $scipt_path/matching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -time $time") ; -# print "\nmatching.pl -i $file -g $opts{gfa} -v $mis -p $t -r $hit -o $dir -time $time\n"; - } -} -sub rfam{ - if (defined $opts{'idx2'}) { - system("perl $scipt_path/rfam.pl -i $data2 -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -index $opts{idx2} -time $time"); -# print "\nrfam.pl -i $data2 -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -index $opts{idx2} -time $time\n"; - }else{ - system("perl $scipt_path/rfam.pl -i $data2 -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -time $time"); -# print "\nrfam.pl -i $data2 -ref $opts{rfam} -v $mis_rfam -p $t -o $dir -time $time\n"; - } -} -sub quantify{ - my $tag=join "\\;" ,@mark; - system("perl $scipt_path/quantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -time $time -mis $mis -t $t -e $upstream -f $downstream -tag $tag"); -# print "\nquantify.pl -p $opts{pre} -m $opts{mat} -r $data -o $dir -time $time -mis $mis -t $t -e $upstream -f $downstream -tag $tag\n"; -} -sub filterbylength{ - my $tmpmark=join ",", @mark; - system("perl $scipt_path/filterReadsByLength.pl -i $collapsed -o $data -min $min_nt -max $max_nt -mark $tmpmark"); -# print "\nfilterReadsByLength.pl -i $collapsed -o $data -min $min_nt -max $max_nt -mark $tmpmark\n"; - -} -sub collapse{ - my ($ins,$data)=@_; - my $str=""; - for (my $i=0;$i<@{$ins};$i++) { - $str .="-i $$ins[$i] "; - } - system ("perl $scipt_path/collapseReads2Tags.pl $str -mark seq -o $data -format $format"); -# print "\ncollapseReads2Tags.pl $str -mark seq -o $data -format $format\n"; -} - -sub clips{ - my ($in,$out)=@_; - my $adapter=$preprocess.$out."_clips_adapter.fq"; - if($format eq "fq" || $format eq "fastq"){ - system("fastx_clipper -a $a -M $m -Q $phred_qv -i $in -o $adapter") ; - print "\nfastx_clipper -a $a -M $m -Q $phred_qv -i $in -o $adapter\n"; - } - if($format eq "fa" || $format eq "fasta"){ - system("fastx_clipper -a $a -M $m -i $in -o $adapter") ; - # print "\nfastx_clipper -a $a -M $m -i $in -o $adapter\n"; - } - #my $clean=$preprocess.$out."_clean.fq"; - #system("filterReadsByLength.pl -i $adapter -o $clean -min $min_nt -max $max_nt "); - - return; -} - -sub read_config{ - open CON,"<$config"; - while (my $aline=) { - chomp $aline; - my @tmp=split/\t/,$aline; - push @filein,$tmp[0]; - push @mark,$tmp[1]; - &check_rawdata($tmp[0]); - } - close CON; - if (@filein != @mark) { - &printErr(); - die "Maybe config file have some wrong!!!\n"; - } -} -sub check_rawdata{ - my ($fileforcheck)=@_; - if (!(-s $fileforcheck)) { - &printErr(); - die "Can not find $fileforcheck, or file is empty!!!\n"; - } - if ($format eq "fasta" || $format eq "fa") { - &checkfa($fileforcheck); - } - if ($format eq "fastq" || $format eq "fq") { - &checkfq($fileforcheck); - } -} -sub checkfa{ - my ($file_reads)=@_; - open N,"<$file_reads"; - my $line=; - chomp $line; - if($line !~ /^>\S+/){ - printErr(); - die "The first line of file $file_reads does not start with '>identifier' -Reads file $file_reads is not a valid fasta file\n\n"; - } - if( !~ /^[ACGTNacgtn]*$/){ - printErr(); - die "File $file_reads contains not allowed characters in sequences -Allowed characters are ACGTN -Reads file $file_reads is not a fasta file\n\n"; - } - close N; -} -sub checkfq{ - my ($file_reads)=@_; - - open N,"<$file_reads"; - for (my $i=0;$i<10;$i++) { - my $a=; - my $b=; - my $c=; - my $d=; - chomp $a; - chomp $b; - chomp $c; - chomp $d; - if($a!~/^\@/){ - &printErr(); - die "$file_reads is not a fastq file\n\n"; - } - if($b!~ /^[ACGTNacgtn]*$/){ - &printErr(); - die "File $file_reads contains not allowed characters in sequences -Allowed characters are ACGTN -Reads file $file_reads is not a fasta file\n\n"; - } - if ($c!~/^\@/ && $c!~/^\+/) { - &printErr(); - die "$file_reads is not a fastq file\n\n"; - } - if ((length $b) != (length $d)) { - &printErr(); - die "$file_reads is not a fastq file\n\n"; - } - my @qv=split //,$d; - for (my $j=0;$j<@qv ;$j++) { - my $q=ord($qv[$j])-64; - if($q<0){$phred_qv=33;} - } - } - close N; -} - -sub search{ - my ($dir,$str)=@_; - opendir I,$dir; - my @ret; - while (my $file=readdir I) { - if ($file=~/$str/) { - push @ret, $file; - } - } - closedir I; - if (@ret != 1) { - &printErr(); - - die "Can not find directory or file which name has string: $str !!!\n"; - } - return $ret[0]; -} - -sub printErr{ - print STDERR color 'bold red'; - print STDERR "Error: "; - print STDERR color 'reset'; -} -=cut -sub Time{ - my $time=time(); - my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; - $month++; - $year+=1900; - if (length($sec) == 1) {$sec = "0"."$sec";} - if (length($min) == 1) {$min = "0"."$min";} - if (length($hour) == 1) {$hour = "0"."$hour";} - if (length($day) == 1) {$day = "0"."$day";} - if (length($month) == 1) {$month = "0"."$month";} - #print "$year-$month-$day $hour:$min:$sec\n"; - return("$year-$month-$day-$hour-$min-$sec"); -} -=cut -sub Time{ - my $time=time(); - my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; - $month++; - $year+=1900; - if (length($sec) == 1) {$sec = "0"."$sec";} - if (length($min) == 1) {$min = "0"."$min";} - if (length($hour) == 1) {$hour = "0"."$hour";} - if (length($day) == 1) {$day = "0"."$day";} - if (length($month) == 1) {$month = "0"."$month";} - #print "$year-$month-$day $hour:$min:$sec\n"; - return("$year-$month-$day $hour:$min:$sec"); -} - - -sub usage{ -print <<"USAGE"; -Version $version -Usage: -$0 -i -format -gfa -index -pre -mat -rfam -D -a -M -min -max -mis -e -f -v -t -o -path -options: --i string, input file#input files information file - /path/filename mark - /path/filename mark - ... - --format string,#specific input rawdata file format : fastq|fq|fasta|fa - --gfa string, input file # genome fasta. sequence file --idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter - string must be the prefix of the bowtie index. For instance, if - the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then - the prefix is 'h_sapiens_37_asm'.##can be null - --pre string, input file #species specific microRNA precursor sequences --mat string, input file #species specific microRNA mature sequences - --rfam string, input file# rfam database file, microRNAs must not be contained in this file## if not define, rfam small RNA will not be count. --idx2 string, rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter - string must be the prefix of the bowtie index. For instance, if - the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then - the prefix is 'h_sapiens_37_asm'.##can be null - --D If [-D] is specified,will discard rfam mapped reads(nead -rfam). - --a string, ADAPTER string. default is ATCTCGTATG. --M int, require minimum adapter alignment length of N. If less than N nucleotides aligned with the adapter - don't clip it. --min int, reads min length,default is 19. --max int, reads max length,default is 28. - --mis [int] number of allowed mismatches when mapping reads to precursors, default 0 --e [int] number of nucleotides upstream of the mature sequence to consider, default 2 --f [int] number of nucleotides downstream of the mature sequence to consider, default 5 --v report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment --r int a read is allowed to map up to this number of positions in the genome,default is 25 - --dis Maximal space between miRNA and miRNA* (200) --flank Flank sequence length of miRNA precursor (10) --mfe Maximal free energy allowed for a miRNA precursor (-20) - --t int, number of threads [1] - --o output directory# absolute path --h help -USAGE -exit(1); -} -