Mercurial > repos > big-tiandm > sirna_plant
view siRNA.pl @ 13:a8c011dc575b draft
Uploaded
author | big-tiandm |
---|---|
date | Wed, 29 Oct 2014 04:19:42 -0400 |
parents | a0222bdfe2ac |
children | 22d79320085c |
line wrap: on
line source
#!/usr/bin/perl -w my $version=1.00; use strict; use warnings; use Getopt::Long; use Getopt::Std; use threads; use threads::shared; use Parallel::ForkManager; use lib '/leofs/biotrans/chentt/perl_module/'; #perl ../siRNA.pl -i config -g /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome.fa -f /share_bio/hs4/disk3-4/Reference/Plants/Rice_TIGR/Reference/TIGR/version_6.1/all.dir/all.gff3 -path /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/ -o /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test -t 3 -rfam /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/Rfam.fasta -idx /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome -idx2 /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/rfam -deg deg -n 25 -nat class/nat_1 -repeat class/repeat_1 -cen centromere_TIGR.txt -format fastq print " ##################################### # # # sRNA cluster # # # ##################################### "; ########################################################################################### my $usage="$0 Options: -i input file# raw data file -tag string #raw data sample name -g genome file -f gff file -o workdir file -path script path -t int, number of threads [1] -format fastq, fq, fasta or fa -idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter string must be the prefix of the bowtie index. For instance, if the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then the prefix is 'h_sapiens_37_asm'.##can be null -mis int number of allowed mismatches when mapping reads to genome, default 0 -rfam string, input file# rfam database file. -idx2 string, rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter string must be the prefix of the bowtie index. For instance, if the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then the prefix is 'h_sapiens_37_asm'.##can be null -v int report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment -a string, ADAPTER string. default is ATCTCGTATG. -n int max hits number,default 25; used in genome alignment -d int distance of tag to merged a cluster; default 100 -p cluster method F :conventional default is F T :NIBLES -l int the length of the upstream and downstream,default 1000;used in position annotate -nat natural antisense transcripts file -repeat repeat information file out of Repeatmasker -deg file config of de sample -cen centromere file input -span plot span, default 50000 "; my %options; GetOptions(\%options,"i:s@","tag:s@","g=s","f=s","o=s","a:s","path:s","p=s","format=s","nat:s","repeat:s","deg:s","n:i","mis:i","rfam:s","t:i","v:i","d:i","l:i","idx:s","idx2:s","cen:s","span:s","h"); #print help if that option is used if($options{h}){die $usage;} my @filein=@{$options{'i'}}; my @mark=@{$options{'tag'}}; #my $config=$options{'i'}; my $genome_fa=$options{'g'}; my $gff=$options{'f'}; ########################################################################################## my $predir=`pwd`; chomp $predir; my $workdir=defined($options{'o'}) ? $options{'o'}:$predir; my $path=$options{'path'}; my $t=defined($options{'t'})? $options{'t'}:1; #threads number my $mis=defined $options{'mis'} ? $options{'mis'}:0; my $mis_rfam=defined $options{'v'} ? $options{'v'}:0; my $hit=defined $options{'n'}?$options{'n'}:25; my $distance_of_merged_tag=defined $options{'d'} ? $options{'d'}:100; my $up_down_dis=defined $options{'l'} ?$options{'l'}:1000; my $cluster_mothod=defined $options{'p'}?$options{'p'}:"F"; my $format=$options{'format'}; #if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { # die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n"; #} my $adpter="ATCTCGTATG"; #adapter if (defined $options{'a'}) {$adpter=$options{'a'};} my $phred_qv=64; my $sample_number; my ($dir,$dir_tmp); ################################ MAIN ################################################## print "\ncluster program start:"; my $time=Time(); make_dir_tmp(); my @clip; my $mark; my $sample_mark; my $config=$dir."/input_config"; open CONFIG,">$config"; for (my $i=0;$i<@filein;$i++) { print CONFIG $filein[$i],"\t",$mark[$i],"\n"; } close CONFIG; if (@filein != @mark) { die "Maybe config file have some wrong!!!\n"; } $sample_number=@mark; $mark=join "\t",@mark; $sample_mark=join "\#",@mark; #read_config(); trim_adapter_and_filter(); my $filter_out=$dir."preProcess\/"."collapse_reads_out.fa";## raw clean data my $data2=$filter_out; ### mirbase not mapped reads my $data3=$dir."\/rfam_match\/rfam_not_mapped\.fa"; ### rfam not mapped reads my $bed=$dir."cluster\/"."sample\.bed"; my $read=$dir."cluster\/"."sample_reads\.cluster"; my $read_txt=$dir."cluster\/"."cluster\.txt"; my $rpkm=$dir."cluster\/"."sample_rpkm\.cluster"; my $preprocess; my $cluster_file; my $annotate_dir; my $deg_dir; my %id; for (my $i=0;$i<@mark ;$i++) { $id{$mark[$i]}=$i+4; } group_and_filter(); #collapse reads to tags rfam(); my @map_read; my $map_tag=0; genome(); bwt2bed(); cluster(); quantify(); phase(); if (defined($options{'nat'})&&defined($options{'repeat'})) { class(); } else{ get_genelist(); } annotate(); genome_length(); plot(); my @pairdir; if (defined($options{'deg'})) { dec(); infor_merge(); } else{infor_merge_no_dec()} html(); print "\ncluster program end:"; Time(); ############################sub program################################################### sub make_dir_tmp{ #make temporary directory if(not -d "$workdir\/cluster_runs"){ mkdir("$workdir\/cluster_runs"); mkdir("$workdir\/cluster_runs\/ref\/"); } $dir="$workdir\/cluster_runs\/"; #print STDERR "mkdir $dir\n\n"; return; } #sub read_config{ # open IN,"<$config"; # while (my $aline=<IN>) { # chomp $aline; # my @tmp=split/\t/,$aline; # push @filein,$tmp[0]; # push @mark,$tmp[1]; # } # close IN; # if (@filein != @mark) { # die "Maybe config file have some wrong!!!\n"; # } # $sample_number=@mark; # $mark=join "\t",@mark; # $sample_mark=join "\#",@mark; #} sub trim_adapter_and_filter{ my $time=time(); $preprocess=$dir."preProcess/"; mkdir $preprocess; my $can_use_threads = eval 'use threads; 1'; if ($can_use_threads) { # Do processing using threads my @filein1=@filein; my @mark1=@mark; while (@filein1>0) { my @thrs; my @res; for (my $i=0;$i<$t ;$i++) { last if(@filein1==0); my $in=shift @filein1; my $out=shift @mark1; push @clip,$dir."preProcess\/$out\_clip\.fq"; $thrs[$i]=threads->create(\&clips,$in,$out); } for (my $i=0;$i<@thrs;$i++) { $res[$i]=$thrs[$i]->join(); } } } else { # Do not processing using threads for (my $i=0;$i<@filein ;$i++) { my $in=$filein[$i]; my $out=$mark[$i]; push @clip,$dir."preProcess\/$out\_clip\.fq"; &clips($in,$out); } } } sub clips{ my ($filein,$fileout)=@_; my $adapter=$dir."preProcess\/$fileout\_clip\.fq"; if($format eq "fq" || $format eq "fastq"){ my $clip=`fastx_clipper -a $adpter -M 6 -Q $phred_qv -i $filein -o $adapter`; } if($format eq "fa" || $format eq "fasta"){ my $clip=`fastx_clipper -a $adpter -M 6 -i $filein -o $adapter`; } #my $clean=$dir."preProcess\/$fileout\_clean.fq"; #my $filter=`filterReadsByLength.pl -i $adapter -o $clean -min 18 -max 40 `; return $fileout; } sub group_and_filter{ #my ($ins,$data)=@_; my @ins=@clip; my $str=""; my $group_out_file=$dir."preProcess\/"."collapse_reads.fa"; #print "$$ins[0]\t$$ins[0]\n"; for (my $i=0;$i<@clip;$i++) { $str .="-i $clip[$i] "; #print "$$ins[$i]\n"; } my $group=`perl $path\/collapseReads2Tags.pl $str -mark seq -o $group_out_file -format $format`; print "perl $path\/collapseReads2Tags.pl $str -mark seq -o $group_out_file -format $format\n\n"; my $l_out=$dir."preProcess\/"."collapse_reads_18-40.fa"; my $length_f=`perl $path\/filterReadsByLength_1.pl -i $group_out_file -o $l_out -min 18 -max 40 -mark $sample_mark`; print "perl $path\/filterReadsByLength_1.pl -i $group_out_file -o $l_out -min 18 -max 40 -mark $sample_mark\n\n"; my $cout_f=`perl $path\/filterReadsByCount.pl -i $l_out -o $filter_out -mark $sample_mark`; print "perl $path\/filterReadsByCount.pl -i $l_out -o $filter_out -mark $sample_mark\n\n"; return 0; } sub rfam{ if (defined $options{'idx2'}) { system("perl $path\/rfam.pl -i $data2 -ref $options{rfam} -v $mis_rfam -p $t -o $dir -index $options{idx2}"); }else{ system("perl $path\/rfam.pl -i $data2 -ref $options{rfam} -v $mis_rfam -p $t -o $dir"); } my $tag=join "\\;" ,@mark; my $rfam_count=`perl $path\/count_rfam_express.pl -i $dir\/rfam_match\/rfam_mapped.bwt -tag $tag -o $dir\/rfam_match\/rfam_non-miRNA_annotation.txt`; return 0; } sub genome{ if(defined $options{'idx'}){ system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir -index $options{idx}") ; }else{ system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir ") ; } #=================== mapping sta =================================================== my $map_file=$dir."genome_match\/genome_mapped\.fa"; open (MAP,"<$map_file")||die"$!"; print "\n#each sample mapping reads sta:\n\n"; print "#$mark\ttotal\n"; while (my $ID=<MAP>) { chomp $ID; my @tmp=split/\:/,$ID; my @exp=split/\_/,$tmp[1]; $exp[-1] =~ s/^x//; for (my $i=0;$i<@exp ;$i++) { $map_read[$i]+=$exp[$i]; } $map_tag++; my $seq=<MAP>; } my $map_read=join"\t",@map_read; print "$map_read\n\n"; print "#total mapped tags:$map_read\n\n"; close MAP; return 0; } sub bwt2bed{ $cluster_file=$dir."cluster\/"; mkdir ("$cluster_file"); print "sam file changed to bed file\n"; my ($file) = $dir."genome_match\/genome_mapped\.bwt"; my $sam2bed=`perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed `; print "perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed\n\n"; return 0; } sub cluster{ print "tags is ready to merged clusters\n\n"; my ($file) =$bed; if ($cluster_mothod eq "F") { my $cluster=`perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt`; print "Using converntional method\n perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt\n\n"; } elsif($cluster_mothod eq "T"){ my $cluster=`perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $read_txt -mark $sample_mark`; print "Using nibls method\n perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $dir\/cluster.txt -mark $sample_mark\n\n"; } else{print "\-p is wrong!\n\n";} return 0; } sub quantify{ print "clusters is ready to quantified\n\n"; my @depth=@map_read; pop @depth; my $depth=join ",",@depth; my $quantify=`perl $path\/quantify.pl -i $read -d $depth -o $rpkm`; print "perl $path\/quantify.pl -i $read -d $depth -o $rpkm\n\n\n"; return 0; } sub phase{ $annotate_dir=$dir."annotate\/"; mkdir ("$annotate_dir"); print "clusters is to predict phase siRNA\n"; my $phase=`perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out`; print "perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out\n\n\n"; return 0; } sub class{ print "clusters is ready to annotate by sources\n\n"; my $nat=$options{'nat'}; my $repeat=$options{'repeat'}; my $class=`perl $path\/ClassAnnotate.pl -i $rpkm -g $gff -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt`; print "perl $path\/ClassAnnotate.pl -i $rpkm -g $gff -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt\n\n"; } sub annotate{ print "clusters is ready to annotate by gff file\n\n"; my $file; if (defined($options{'nat'})&&defined($options{'repeat'})) { $file="$annotate_dir\/sample_class.anno"; } else{ $file=$rpkm; } my $annotate=`perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno`; print "perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno\n\n"; return 0; } sub get_genelist{ my $get_genelist=`perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt`; print "perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt"; } sub dec{ print "deg reading\n\n"; my $deg_file=$options{'deg'}; open IN,"<$deg_file"; my @deg; my $s=0; while (my $aline=<IN>) { chomp $aline; next if($aline=~/^\#/); $deg[$s]=$aline; my @ea=split/\s+/,$aline; push @pairdir,"$ea[0]_VS_$ea[1]\/"; #print "$deg[$s]\n"; $s++; } close IN; $deg_dir=$dir."deg\/"; mkdir ("$deg_dir"); my $max_process = 10; my $pm = new Parallel::ForkManager( $max_process ); my $number=@deg-1; foreach(0..$number){ $pm->start and next; &dec_pel($deg[$_]); $pm->finish; } $pm->wait_all_children; } sub dec_pel{ print "start:\n"; Time(); my $sample=shift(@_); my @each=split/\s+/,$sample; print "$each[0]\t$each[1]\n"; my $deg_sample_dir=$deg_dir."$each[0]_VS_$each[1]\/"; mkdir ("$deg_sample_dir"); my $deg=`perl $path\/DEGseq_2.pl -i $read -outdir $deg_sample_dir -column1 $id{$each[0]} -mark1 $each[0] -column2 $id{$each[1]} -mark2 $each[1]`; #-depth1 -depth2 my $time2=time(); print "end:\n"; Time(); sleep 1; } sub infor_merge{ my ($input,$mark); foreach (@pairdir) { print "@pairdir\n"; $mark.=" -mark $_ "; $input.=" -i $dir/deg\/$_\/output_score\.txt "; print "$input\n$mark\n"; } my $infor_merge=`perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result `; print "perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result\n\n"; } sub infor_merge_no_dec{ my $infor_merge_no_dec=`cp $annotate_dir\/sample_c_p.anno $dir\/total.result`; } sub genome_length{ my $length=`perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length`; print "perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length\n\n" } sub plot{ my $plot_file="$dir\/plot\/"; mkdir ("$plot_file"); my $genome_plot="$dir\/plot\/genome\/"; mkdir ("$genome_plot"); #genome cluster my $span=defined($options{span})?$options{span}:50000; foreach (1..$sample_number) { my $mark=$mark[$_-1]; my $cen=""; if (defined $options{cen}) { $cen="-cen $options{cen}"; } my $plot=`perl $path\/sRNA_rpkm_distribution_along_genome.pl -c $rpkm -n $_ -mark $mark -span $span -l $dir\/ref\/genome\.length $cen -o $genome_plot\/$mark\.html -out $genome_plot\/$mark\.txt`; print "perl $path\/sRNA_rpkm_distribution_along_genome.pl -c $rpkm -n $_ -mark $mark -span $span -l $dir\/ref\/genome\.length $cen -o $genome_plot\/$mark\.html -out $genome_plot\/$mark\.txt\n\n"; } my $chr_plot_dir="$dir\/plot\/chr\/"; mkdir("$chr_plot_dir"); my %chr; open LEN,"<$dir\/ref\/genome\.length"; while (my $aline=<LEN>) { next if($aline=~/^\#/); chomp $aline; my @temp=split/\t/,$aline; $chr{$temp[0]}=$temp[1]; } close LEN; foreach my $chr (sort keys %chr) { my $cen=""; if (defined $options{cen}) { $cen="-cen $options{cen}"; } my $chr_plot=`perl $path\/chr_plot.pl -l $chr{$chr} -chro $chr -g $dir\/ref\/genelist.txt -span $span -c $rpkm -mark $sample_mark -o $chr_plot_dir\/$chr\.html`; print "perl $path\/chr_plot.pl -l $chr{$chr} -chro $chr -g $dir\/ref\/genelist.txt -span $span -c $rpkm -mark $sample_mark -o $chr_plot_dir\/$chr\.html\n"; } } sub html{ my $pathfile="$dir/path.txt"; open PA,">$pathfile"; print PA "$config\n"; print PA "$preprocess\n"; print PA "$dir"."rfam_match\n"; print PA "$dir"."genome_match\n"; print PA "$cluster_file\n"; print PA "$annotate_dir\n"; if (defined($deg_dir)) { print PA "$deg_dir\n"; } close PA; my $html=`perl $path\/html.pl -i $pathfile -format $format -o $dir/result.html`; } sub Time{ my $time=time(); my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; $month++; $year+=1900; if (length($sec) == 1) {$sec = "0"."$sec";} if (length($min) == 1) {$min = "0"."$min";} if (length($hour) == 1) {$hour = "0"."$hour";} if (length($day) == 1) {$day = "0"."$day";} if (length($month) == 1) {$month = "0"."$month";} print "$year-$month-$day $hour:$min:$sec\n"; return("$year-$month-$day-$hour-$min-$sec"); } #################################################################################