# HG changeset patch # User big-tiandm # Date 1414570724 14400 # Node ID a0222bdfe2ac8a7473d23f36e2a105496476766f # Parent b20345ef399547968ec8bd3c517e0b44f3ae6a42 Uploaded diff -r b20345ef3995 -r a0222bdfe2ac siRNA.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/siRNA.pl Wed Oct 29 04:18:44 2014 -0400 @@ -0,0 +1,529 @@ +#!/usr/bin/perl -w +my $version=1.00; +use strict; +use warnings; +use Getopt::Long; +use Getopt::Std; +use threads; +use threads::shared; +use Parallel::ForkManager; +use lib '/leofs/biotrans/chentt/perl_module/'; +#perl ../siRNA.pl -i config -g /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome.fa -f /share_bio/hs4/disk3-4/Reference/Plants/Rice_TIGR/Reference/TIGR/version_6.1/all.dir/all.gff3 -path /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/ -o /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test -t 3 -rfam /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/Rfam.fasta -idx /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome -idx2 /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/rfam -deg deg -n 25 -nat class/nat_1 -repeat class/repeat_1 -cen centromere_TIGR.txt -format fastq +print " +##################################### +# # +# sRNA cluster # +# # +##################################### +"; +########################################################################################### +my $usage="$0 +Options: +-i input file# raw data file +-tag string #raw data sample name +-g genome file +-f gff file + +-o workdir file +-path script path +-t int, number of threads [1] +-format fastq, fq, fasta or fa +-idx string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter + string must be the prefix of the bowtie index. For instance, if + the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then + the prefix is 'h_sapiens_37_asm'.##can be null +-mis int number of allowed mismatches when mapping reads to genome, default 0 +-rfam string, input file# rfam database file. +-idx2 string, rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter + string must be the prefix of the bowtie index. For instance, if + the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then + the prefix is 'h_sapiens_37_asm'.##can be null + +-v int report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment + +-a string, ADAPTER string. default is ATCTCGTATG. +-n int max hits number,default 25; used in genome alignment +-d int distance of tag to merged a cluster; default 100 +-p cluster method F :conventional default is F + T :NIBLES +-l int the length of the upstream and downstream,default 1000;used in position annotate + +-nat natural antisense transcripts file +-repeat repeat information file out of Repeatmasker +-deg file config of de sample +-cen centromere file input +-span plot span, default 50000 +"; + +my %options; +GetOptions(\%options,"i:s@","tag:s@","g=s","f=s","o=s","a:s","path:s","p=s","format=s","nat:s","repeat:s","deg:s","n:i","mis:i","rfam:s","t:i","v:i","d:i","l:i","idx:s","idx2:s","cen:s","span:s","h"); +#print help if that option is used +if($options{h}){die $usage;} + +my @filein=@{$options{'i'}}; +my @mark=@{$options{'tag'}}; + +#my $config=$options{'i'}; +my $genome_fa=$options{'g'}; +my $gff=$options{'f'}; + + +########################################################################################## +my $predir=`pwd`; +chomp $predir; +my $workdir=defined($options{'o'}) ? $options{'o'}:$predir; + +my $path=$options{'path'}; + +my $t=defined($options{'t'})? $options{'t'}:1; #threads number + +my $mis=defined $options{'mis'} ? $options{'mis'}:0; + +my $mis_rfam=defined $options{'v'} ? $options{'v'}:0; + +my $hit=defined $options{'n'}?$options{'n'}:25; + +my $distance_of_merged_tag=defined $options{'d'} ? $options{'d'}:100; + +my $up_down_dis=defined $options{'l'} ?$options{'l'}:1000; + +my $cluster_mothod=defined $options{'p'}?$options{'p'}:"F"; + +my $format=$options{'format'}; +#if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { +# die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n"; +#} + +my $adpter="ATCTCGTATG"; #adapter +if (defined $options{'a'}) {$adpter=$options{'a'};} + + +my $phred_qv=64; +my $sample_number; +my ($dir,$dir_tmp); +################################ MAIN ################################################## +print "\ncluster program start:"; +my $time=Time(); +make_dir_tmp(); + +my @clip; +my $mark; +my $sample_mark; + +my $config=$dir."/input_config"; +open CONFIG,">$config"; + for (my $i=0;$i<@filein;$i++) { + print CONFIG $filein[$i],"\t",$mark[$i],"\n"; + } +close CONFIG; +if (@filein != @mark) { + die "Maybe config file have some wrong!!!\n"; +} +$sample_number=@mark; +$mark=join "\t",@mark; +$sample_mark=join "\#",@mark; + + +#read_config(); + +trim_adapter_and_filter(); + +my $filter_out=$dir."preProcess\/"."collapse_reads_out.fa";## raw clean data +my $data2=$filter_out; ### mirbase not mapped reads +my $data3=$dir."\/rfam_match\/rfam_not_mapped\.fa"; ### rfam not mapped reads +my $bed=$dir."cluster\/"."sample\.bed"; +my $read=$dir."cluster\/"."sample_reads\.cluster"; +my $read_txt=$dir."cluster\/"."cluster\.txt"; +my $rpkm=$dir."cluster\/"."sample_rpkm\.cluster"; +my $preprocess; +my $cluster_file; +my $annotate_dir; +my $deg_dir; +my %id; +for (my $i=0;$i<@mark ;$i++) { + $id{$mark[$i]}=$i+4; +} +group_and_filter(); #collapse reads to tags + +rfam(); + +my @map_read; +my $map_tag=0; +genome(); + +bwt2bed(); + +cluster(); + +quantify(); + +phase(); + +if (defined($options{'nat'})&&defined($options{'repeat'})) { + class(); +} +else{ + get_genelist(); +} + +annotate(); + +genome_length(); + +plot(); + +my @pairdir; +if (defined($options{'deg'})) { + dec(); + infor_merge(); +} +else{infor_merge_no_dec()} +html(); +print "\ncluster program end:"; +Time(); +############################sub program################################################### +sub make_dir_tmp{ + + #make temporary directory + if(not -d "$workdir\/cluster_runs"){ + mkdir("$workdir\/cluster_runs"); + mkdir("$workdir\/cluster_runs\/ref\/"); + } + + $dir="$workdir\/cluster_runs\/"; + #print STDERR "mkdir $dir\n\n"; + return; +} + +#sub read_config{ +# open IN,"<$config"; +# while (my $aline=) { +# chomp $aline; +# my @tmp=split/\t/,$aline; +# push @filein,$tmp[0]; +# push @mark,$tmp[1]; +# } +# close IN; +# if (@filein != @mark) { +# die "Maybe config file have some wrong!!!\n"; +# } +# $sample_number=@mark; +# $mark=join "\t",@mark; +# $sample_mark=join "\#",@mark; +#} + + +sub trim_adapter_and_filter{ + my $time=time(); + $preprocess=$dir."preProcess/"; + mkdir $preprocess; + my $can_use_threads = eval 'use threads; 1'; + if ($can_use_threads) { + # Do processing using threads + my @filein1=@filein; my @mark1=@mark; + while (@filein1>0) { + my @thrs; my @res; + for (my $i=0;$i<$t ;$i++) { + last if(@filein1==0); + my $in=shift @filein1; + my $out=shift @mark1; + push @clip,$dir."preProcess\/$out\_clip\.fq"; + $thrs[$i]=threads->create(\&clips,$in,$out); + } + for (my $i=0;$i<@thrs;$i++) { + $res[$i]=$thrs[$i]->join(); + } + } + } + else { +# Do not processing using threads + for (my $i=0;$i<@filein ;$i++) { + my $in=$filein[$i]; + my $out=$mark[$i]; + push @clip,$dir."preProcess\/$out\_clip\.fq"; + &clips($in,$out); + } + } +} + +sub clips{ + my ($filein,$fileout)=@_; + my $adapter=$dir."preProcess\/$fileout\_clip\.fq"; + if($format eq "fq" || $format eq "fastq"){ + my $clip=`fastx_clipper -a $adpter -M 6 -Q $phred_qv -i $filein -o $adapter`; + } + if($format eq "fa" || $format eq "fasta"){ + my $clip=`fastx_clipper -a $adpter -M 6 -i $filein -o $adapter`; + } + #my $clean=$dir."preProcess\/$fileout\_clean.fq"; + #my $filter=`filterReadsByLength.pl -i $adapter -o $clean -min 18 -max 40 `; + return $fileout; +} + +sub group_and_filter{ + #my ($ins,$data)=@_; + my @ins=@clip; + my $str=""; + my $group_out_file=$dir."preProcess\/"."collapse_reads.fa"; + #print "$$ins[0]\t$$ins[0]\n"; + for (my $i=0;$i<@clip;$i++) { + $str .="-i $clip[$i] "; + #print "$$ins[$i]\n"; + } + my $group=`perl $path\/collapseReads2Tags.pl $str -mark seq -o $group_out_file -format $format`; + print "perl $path\/collapseReads2Tags.pl $str -mark seq -o $group_out_file -format $format\n\n"; + + my $l_out=$dir."preProcess\/"."collapse_reads_18-40.fa"; + my $length_f=`perl $path\/filterReadsByLength_1.pl -i $group_out_file -o $l_out -min 18 -max 40 -mark $sample_mark`; + print "perl $path\/filterReadsByLength_1.pl -i $group_out_file -o $l_out -min 18 -max 40 -mark $sample_mark\n\n"; + my $cout_f=`perl $path\/filterReadsByCount.pl -i $l_out -o $filter_out -mark $sample_mark`; + print "perl $path\/filterReadsByCount.pl -i $l_out -o $filter_out -mark $sample_mark\n\n"; + return 0; +} + +sub rfam{ + if (defined $options{'idx2'}) { + system("perl $path\/rfam.pl -i $data2 -ref $options{rfam} -v $mis_rfam -p $t -o $dir -index $options{idx2}"); + }else{ + system("perl $path\/rfam.pl -i $data2 -ref $options{rfam} -v $mis_rfam -p $t -o $dir"); + } + my $tag=join "\\;" ,@mark; + my $rfam_count=`perl $path\/count_rfam_express.pl -i $dir\/rfam_match\/rfam_mapped.bwt -tag $tag -o $dir\/rfam_match\/rfam_non-miRNA_annotation.txt`; + return 0; +} +sub genome{ + if(defined $options{'idx'}){ + system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir -index $options{idx}") ; + }else{ + system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir ") ; + } + #=================== mapping sta =================================================== + my $map_file=$dir."genome_match\/genome_mapped\.fa"; + open (MAP,"<$map_file")||die"$!"; + print "\n#each sample mapping reads sta:\n\n"; + print "#$mark\ttotal\n"; + while (my $ID=) { + chomp $ID; + my @tmp=split/\:/,$ID; + my @exp=split/\_/,$tmp[1]; + $exp[-1] =~ s/^x//; + for (my $i=0;$i<@exp ;$i++) { + $map_read[$i]+=$exp[$i]; + } + $map_tag++; + my $seq=; + } + my $map_read=join"\t",@map_read; + print "$map_read\n\n"; + print "#total mapped tags:$map_read\n\n"; + close MAP; + return 0; +} + +sub bwt2bed{ + $cluster_file=$dir."cluster\/"; + mkdir ("$cluster_file"); + print "sam file changed to bed file\n"; + my ($file) = $dir."genome_match\/genome_mapped\.bwt"; + + my $sam2bed=`perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed `; + print "perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed\n\n"; + return 0; +} + +sub cluster{ + print "tags is ready to merged clusters\n\n"; + my ($file) =$bed; + if ($cluster_mothod eq "F") { + my $cluster=`perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt`; + print "Using converntional method\n perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt\n\n"; + } + elsif($cluster_mothod eq "T"){ + my $cluster=`perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $read_txt -mark $sample_mark`; + print "Using nibls method\n perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $dir\/cluster.txt -mark $sample_mark\n\n"; + } + else{print "\-p is wrong!\n\n";} + return 0; +} + + +sub quantify{ + print "clusters is ready to quantified\n\n"; + my @depth=@map_read; + pop @depth; + my $depth=join ",",@depth; + my $quantify=`perl $path\/quantify.pl -i $read -d $depth -o $rpkm`; + print "perl $path\/quantify.pl -i $read -d $depth -o $rpkm\n\n\n"; + return 0; +} + +sub phase{ + $annotate_dir=$dir."annotate\/"; + mkdir ("$annotate_dir"); + print "clusters is to predict phase siRNA\n"; + my $phase=`perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out`; + print "perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out\n\n\n"; + return 0; +} + +sub class{ + print "clusters is ready to annotate by sources\n\n"; + my $nat=$options{'nat'}; + my $repeat=$options{'repeat'}; + my $class=`perl $path\/ClassAnnotate.pl -i $rpkm -g $gff -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt`; + print "perl $path\/ClassAnnotate.pl -i $rpkm -g $gff -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt\n\n"; +} + +sub annotate{ + print "clusters is ready to annotate by gff file\n\n"; + my $file; + if (defined($options{'nat'})&&defined($options{'repeat'})) { + $file="$annotate_dir\/sample_class.anno"; + } + else{ + $file=$rpkm; + } + my $annotate=`perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno`; + print "perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno\n\n"; + return 0; +} +sub get_genelist{ + + my $get_genelist=`perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt`; + print "perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt"; +} + +sub dec{ + print "deg reading\n\n"; + my $deg_file=$options{'deg'}; + open IN,"<$deg_file"; + my @deg; + my $s=0; + while (my $aline=) { + chomp $aline; + next if($aline=~/^\#/); + $deg[$s]=$aline; + my @ea=split/\s+/,$aline; + push @pairdir,"$ea[0]_VS_$ea[1]\/"; + #print "$deg[$s]\n"; + $s++; + } + close IN; + $deg_dir=$dir."deg\/"; + mkdir ("$deg_dir"); + my $max_process = 10; + my $pm = new Parallel::ForkManager( $max_process ); + my $number=@deg-1; + foreach(0..$number){ + $pm->start and next; + &dec_pel($deg[$_]); + $pm->finish; + } + $pm->wait_all_children; +} + +sub dec_pel{ + print "start:\n"; + Time(); + my $sample=shift(@_); + my @each=split/\s+/,$sample; + print "$each[0]\t$each[1]\n"; + my $deg_sample_dir=$deg_dir."$each[0]_VS_$each[1]\/"; + mkdir ("$deg_sample_dir"); + my $deg=`perl $path\/DEGseq_2.pl -i $read -outdir $deg_sample_dir -column1 $id{$each[0]} -mark1 $each[0] -column2 $id{$each[1]} -mark2 $each[1]`; #-depth1 -depth2 + my $time2=time(); + print "end:\n"; + Time(); + sleep 1; +} + +sub infor_merge{ + my ($input,$mark); + foreach (@pairdir) { + print "@pairdir\n"; + $mark.=" -mark $_ "; + $input.=" -i $dir/deg\/$_\/output_score\.txt "; + print "$input\n$mark\n"; + } + my $infor_merge=`perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result `; + print "perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result\n\n"; +} + +sub infor_merge_no_dec{ + my $infor_merge_no_dec=`cp $annotate_dir\/sample_c_p.anno $dir\/total.result`; +} + +sub genome_length{ + my $length=`perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length`; + print "perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length\n\n" + +} + +sub plot{ + my $plot_file="$dir\/plot\/"; + mkdir ("$plot_file"); + my $genome_plot="$dir\/plot\/genome\/"; + mkdir ("$genome_plot"); + #genome cluster + my $span=defined($options{span})?$options{span}:50000; + foreach (1..$sample_number) { + my $mark=$mark[$_-1]; + my $cen=""; + if (defined $options{cen}) { + $cen="-cen $options{cen}"; + } + my $plot=`perl $path\/sRNA_rpkm_distribution_along_genome.pl -c $rpkm -n $_ -mark $mark -span $span -l $dir\/ref\/genome\.length $cen -o $genome_plot\/$mark\.html -out $genome_plot\/$mark\.txt`; + print "perl $path\/sRNA_rpkm_distribution_along_genome.pl -c $rpkm -n $_ -mark $mark -span $span -l $dir\/ref\/genome\.length $cen -o $genome_plot\/$mark\.html -out $genome_plot\/$mark\.txt\n\n"; + } + + my $chr_plot_dir="$dir\/plot\/chr\/"; + mkdir("$chr_plot_dir"); + my %chr; + open LEN,"<$dir\/ref\/genome\.length"; + while (my $aline=) { + next if($aline=~/^\#/); + chomp $aline; + my @temp=split/\t/,$aline; + $chr{$temp[0]}=$temp[1]; + } + close LEN; + foreach my $chr (sort keys %chr) { + my $cen=""; + if (defined $options{cen}) { + $cen="-cen $options{cen}"; + } + my $chr_plot=`perl $path\/chr_plot.pl -l $chr{$chr} -chro $chr -g $dir\/ref\/genelist.txt -span $span -c $rpkm -mark $sample_mark -o $chr_plot_dir\/$chr\.html`; + print "perl $path\/chr_plot.pl -l $chr{$chr} -chro $chr -g $dir\/ref\/genelist.txt -span $span -c $rpkm -mark $sample_mark -o $chr_plot_dir\/$chr\.html\n"; + } +} + +sub html{ + my $pathfile="$dir/path.txt"; + open PA,">$pathfile"; + print PA "$config\n"; + print PA "$preprocess\n"; + print PA "$dir"."rfam_match\n"; + print PA "$dir"."genome_match\n"; + print PA "$cluster_file\n"; + print PA "$annotate_dir\n"; + if (defined($deg_dir)) { + print PA "$deg_dir\n"; + } + close PA; + my $html=`perl $path\/html.pl -i $pathfile -format $format -o $dir/result.html`; +} + +sub Time{ + my $time=time(); + my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6]; + $month++; + $year+=1900; + if (length($sec) == 1) {$sec = "0"."$sec";} + if (length($min) == 1) {$min = "0"."$min";} + if (length($hour) == 1) {$hour = "0"."$hour";} + if (length($day) == 1) {$day = "0"."$day";} + if (length($month) == 1) {$month = "0"."$month";} + print "$year-$month-$day $hour:$min:$sec\n"; + return("$year-$month-$day-$hour-$min-$sec"); +} +#################################################################################