Mercurial > repos > big-tiandm > mirplant2
view siRNA.pl @ 53:f5a2e8308836 draft default tip
Uploaded
author | big-tiandm |
---|---|
date | Mon, 08 Dec 2014 01:51:16 -0500 |
parents | 7b5a48b972e9 |
children |
line wrap: on
line source
#!/usr/bin/perl -w my $version=1.00; use strict; use warnings; use Getopt::Long; use Getopt::Std; use threads; use threads::shared; use Parallel::ForkManager; use lib '/leofs/biotrans/chentt/perl_module/'; #perl ../siRNA.pl -i config -g /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome.fa -f /share_bio/hs4/disk3-4/Reference/Plants/Rice_TIGR/Reference/TIGR/version_6.1/all.dir/all.gff3 -path /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/ -o /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test -t 3 -rfam /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/Rfam.fasta -idx /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome -idx2 /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/rfam -deg deg -n 25 -nat class/nat_1 -repeat class/repeat_1 -cen centromere_TIGR.txt -format fastq print " ##################################### # # # sRNA cluster # # # ##################################### "; ########################################################################################### my $usage="$0 Options: -i input file# fasta -config input file -g genome file -f gff file -o workdir file -path script path -t int, number of threads [1] -format fastq, fq, fasta or fa -idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter string must be the prefix of the bowtie index. For instance, if the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then the prefix is 'h_sapiens_37_asm'.##can be null -mis int number of allowed mismatches when mapping reads to genome, default 0 -n int max hits number,default 25; used in genome alignment -d int distance of tag to merged a cluster; default 100 -p cluster method F :conventional default is F T :NIBLES -l int the length of the upstream and downstream,default 1000;used in position annotate -nat natural antisense transcripts file -repeat repeat information file out of Repeatmasker -deg file config of de sample -cen centromere file input -span plot span, default 50000 "; my %options; GetOptions(\%options,"i:s","config=s","g=s","f=s","o=s","path:s","p=s","format=s","nat:s","repeat:s","deg:s","n:i","mis:i","t:i","d:i","l:i","idx:s","cen:s","span:s","h"); #print help if that option is used if($options{h}){die $usage;} my $filein=$options{'i'}; #my $config=$options{'i'}; my $genome_fa=$options{'g'}; my $gff=$options{'f'}; ########################################################################################## my $predir=`pwd`; chomp $predir; my $workdir=defined($options{'o'}) ? $options{'o'}:$predir; my $path=$options{'path'}; my $t=defined($options{'t'})? $options{'t'}:1; #threads number my $mis=defined $options{'mis'} ? $options{'mis'}:0; my $hit=defined $options{'n'}?$options{'n'}:25; my $distance_of_merged_tag=defined $options{'d'} ? $options{'d'}:100; my $up_down_dis=defined $options{'l'} ?$options{'l'}:1000; my $cluster_mothod=defined $options{'p'}?$options{'p'}:"F"; my $format=$options{'format'}; #if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { # die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n"; #} my $sample_number; my ($dir,$dir_tmp); ################################ MAIN ################################################## print "\ncluster program start:"; my $time=Time(); make_dir_tmp(); my $mark; my $sample_mark; my $config=$opts{'config'}; my (@filein,@mark); &read_config(); $sample_number=@mark; $mark=join "\t",@mark; $sample_mark=join "\#",@mark; my $data3=$filein; ### rfam not mapped reads genome(); my $bed=$dir."cluster\/"."sample\.bed"; my $read=$dir."cluster\/"."sample_reads\.cluster"; my $read_txt=$dir."cluster\/"."cluster\.txt"; my $rpkm=$dir."cluster\/"."sample_rpkm\.cluster"; my $preprocess; my $cluster_file; my $annotate_dir; my $deg_dir; my $plot_dir; my %id; for (my $i=0;$i<@mark ;$i++) { $id{$mark[$i]}=$i+4; } my @map_read; my $map_tag=0; bwt2bed(); cluster(); quantify(); phase(); if (defined($options{'nat'})&&defined($options{'repeat'})) { class(); } else{ get_genelist(); } annotate(); genome_length(); plot(); my @pairdir; if (defined($options{'deg'})) { dec(); infor_merge(); } else{infor_merge_no_dec()} html(); print "\ncluster program end:"; Time(); ############################sub program################################################### sub make_dir_tmp{ #make temporary directory if(not -d "$workdir\/cluster_runs"){ mkdir("$workdir\/cluster_runs"); mkdir("$workdir\/cluster_runs\/ref\/"); } $dir="$workdir\/cluster_runs\/"; #print STDERR "mkdir $dir\n\n"; return; } sub genome{ if(defined $options{'idx'}){ system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir -index $options{idx}") ; }else{ system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir ") ; } #=================== mapping sta =================================================== my $map_file=$dir."genome_match\/genome_mapped\.fa"; open (MAP,"<$map_file")||die"$!"; print "\n#each sample mapping reads sta:\n\n"; print "#$mark\ttotal\n"; while (my $ID=<MAP>) { chomp $ID; my @tmp=split/\:/,$ID; my @exp=split/\_/,$tmp[1]; $exp[-1] =~ s/^x//; for (my $i=0;$i<@exp ;$i++) { $map_read[$i]+=$exp[$i]; } $map_tag++; my $seq=<MAP>; } my $map_read=join"\t",@map_read; print "$map_read\n\n"; print "#total mapped tags:$map_read\n\n"; close MAP; return 0; } sub bwt2bed{ $cluster_file=$dir."cluster\/"; mkdir ("$cluster_file"); print "sam file changed to bed file\n"; my ($file) = $dir."genome_match\/genome_mapped\.bwt"; my $sam2bed=`perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed `; print "perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed\n\n"; return 0; } sub cluster{ print "tags is ready to merged clusters\n\n"; my ($file) =$bed; if ($cluster_mothod eq "F") { my $cluster=`perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt`; print "Using converntional method\n perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt\n\n"; } elsif($cluster_mothod eq "T"){ my $cluster=`perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $read_txt -k $sample_mark`; print "Using nibls method\n perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $dir\/cluster.txt -k $sample_mark\n\n"; } else{print "\-p is wrong!\n\n";} return 0; } sub quantify{ print "clusters is ready to quantified\n\n"; my @depth=@map_read; pop @depth; my $depth=join ",",@depth; my $quantify=`perl $path\/quantify.pl -i $read -d $depth -o $rpkm`; print "perl $path\/quantify_siRNA.pl -i $read -d $depth -o $rpkm\n\n\n"; return 0; } sub phase{ $annotate_dir=$dir."annotate\/"; mkdir ("$annotate_dir"); print "clusters is to predict phase siRNA\n"; my $phase=`perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out`; print "perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out\n\n\n"; return 0; } sub class{ print "clusters is ready to annotate by sources\n\n"; my $nat=$options{'nat'}; my $repeat=$options{'repeat'}; my $class=`perl $path\/ClassAnnotate.pl -i $rpkm -g $gff -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt`; print "perl $path\/ClassAnnotate.pl -i $rpkm -g $gff -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt\n\n"; } sub annotate{ print "clusters is ready to annotate by gff file\n\n"; my $file; if (defined($options{'nat'})&&defined($options{'repeat'})) { $file="$annotate_dir\/sample_class.anno"; } else{ $file=$rpkm; } my $annotate=`perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno`; print "perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno\n\n"; return 0; } sub get_genelist{ my $get_genelist=`perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt`; print "perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt"; } sub dec{ print "deg reading\n\n"; my $deg_file=$options{'deg'}; open IN,"<$deg_file"; my @deg; my $s=0; while (my $aline=<IN>) { chomp $aline; next if($aline=~/^\#/); $deg[$s]=$aline; my @ea=split/\s+/,$aline; push @pairdir,"$ea[0]_VS_$ea[1]\/"; #print "$deg[$s]\n"; $s++; } close IN; $deg_dir=$dir."deg\/"; mkdir ("$deg_dir"); my $max_process = 10; my $pm = new Parallel::ForkManager( $max_process ); my $number=@deg-1; foreach(0..$number){ $pm->start and next; &dec_pel($deg[$_]); $pm->finish; } $pm->wait_all_children; } sub dec_pel{ print "\n******************\nstart:\n"; Time(); my $sample=shift(@_); my @each=split/\s+/,$sample; print "$each[0]\t$each[1]\n"; my $deg_sample_dir=$deg_dir."$each[0]_VS_$each[1]\/"; mkdir ("$deg_sample_dir"); print "read: $read\n"; print "deg_sample_dir: $deg_sample_dir\n"; print "$id{$each[0]}\t$each[0]\n"; print "$id{$each[1]}\t$each[1]\n"; my $deg=`perl $path\/DEGseq_2.pl -i $read -outdir $deg_sample_dir -column1 $id{$each[0]} -mark1 $each[0] -column2 $id{$each[1]} -mark2 $each[1]`; #-depth1 -depth2 my $time2=time(); print "end:\n*************************\n"; Time(); sleep 1; } sub infor_merge{ my ($input,$mark); foreach (@pairdir) { print "@pairdir\n"; $mark.=" -mark $_ "; $input.=" -i $dir/deg\/$_\/output_score\.txt "; print "$input\n$mark\n"; } my $infor_merge=`perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result `; print "perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result\n\n"; } sub infor_merge_no_dec{ my $infor_merge_no_dec=`cp $annotate_dir\/sample_c_p.anno $dir\/total.result`; } sub genome_length{ my $length=`perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length`; print "perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length\n\n" } sub plot{ $plot_dir="$dir\/plot\/"; mkdir ("$plot_dir"); my $span=defined($options{span})?$options{span}:50000; my $cen=""; if (defined $options{cen}) { $cen="-cen $options{cen}"; } my $plot=`perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome\.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt `; "print perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt \n"; } sub html{ my $pathfile="$dir/path.txt"; open PA,">$pathfile"; print PA "$config\n"; print PA "$preprocess\n"; print PA "$dir"."rfam_match\n"; print PA "$dir"."genome_match\n"; print PA "$cluster_file\n"; print PA "$annotate_dir\n"; print PA "$plot_dir\n"; if (defined($deg_dir)) { print PA "$deg_dir\n"; } close PA; my $html=`perl $path\/html.pl -i $pathfile -format $format -o $dir/result.html`; } sub Time{ my $time=time(); my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; $month++; $year+=1900; if (length($sec) == 1) {$sec = "0"."$sec";} if (length($min) == 1) {$min = "0"."$min";} if (length($hour) == 1) {$hour = "0"."$hour";} if (length($day) == 1) {$day = "0"."$day";} if (length($month) == 1) {$month = "0"."$month";} print "$year-$month-$day $hour:$min:$sec\n"; return("$year-$month-$day-$hour-$min-$sec"); } ################################################################################# sub read_config{ open CON,"<$config"; while (my $aline=<CON>) { chomp $aline; my @tmp=split/\t/,$aline; push @filein,$tmp[0]; push @mark,$tmp[1]; #&check_rawdata($tmp[0]); } close CON; if (@filein != @mark) { #&printErr(); die "Maybe config file have some wrong!!!\n"; } }