diff siRNA.pl @ 21:9dcffd531c76 draft

Uploaded
author big-tiandm
date Wed, 05 Nov 2014 21:09:35 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/siRNA.pl	Wed Nov 05 21:09:35 2014 -0500
@@ -0,0 +1,521 @@
+#!/usr/bin/perl -w
+my $version=1.00;
+use strict;
+use warnings;
+use Getopt::Long;
+use Getopt::Std;
+use threads;
+use threads::shared;
+use Parallel::ForkManager;
+use lib '/leofs/biotrans/chentt/perl_module/';
+#perl ../siRNA.pl -i config -g /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome.fa -f /share_bio/hs4/disk3-4/Reference/Plants/Rice_TIGR/Reference/TIGR/version_6.1/all.dir/all.gff3 -path /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/ -o /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test -t 3 -rfam /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/Rfam.fasta -idx /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome -idx2 /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/rfam -deg deg -n 25 -nat class/nat_1 -repeat class/repeat_1 -cen centromere_TIGR.txt -format fastq 
+print "
+#####################################
+#                                   # 
+#          sRNA cluster             # 
+#                                   #
+#####################################
+";
+###########################################################################################
+my $usage="$0
+Options:
+-i input file# raw data file
+-tag string #raw data sample name
+-g  genome file
+-f  gff file
+
+-o  workdir file
+-path  script path
+-t  int,    number of threads [1]
+-format  fastq, fq, fasta or fa
+-idx  string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter
+		string must be the prefix of the bowtie index. For instance, if
+		the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
+		the prefix is 'h_sapiens_37_asm'.##can be null
+-mis  int     number of allowed mismatches when mapping reads to genome, default 0
+-rfam  string,  input file# rfam database file.
+-idx2  string,  rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter
+		string must be the prefix of the bowtie index. For instance, if
+		the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
+		the prefix is 'h_sapiens_37_asm'.##can be null
+
+-v  int report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment
+
+-a string,  ADAPTER string. default is ATCTCGTATG.
+-n  int max hits number,default 25; used in genome alignment
+-d  int distance of tag to merged a cluster; default 100
+-p  cluster method F :conventional default is F
+				   T :NIBLES 
+-l  int the length of the upstream and downstream,default 1000;used in position annotate
+
+-nat  natural antisense transcripts file
+-repeat  repeat information file out of Repeatmasker
+-deg  file config of de sample
+-cen  centromere file input
+-span  plot span, default 50000
+";
+
+my %options;
+GetOptions(\%options,"i:s@","tag:s@","g=s","f=s","o=s","a:s","path:s","p=s","format=s","nat:s","repeat:s","deg:s","n:i","mis:i","rfam:s","t:i","v:i","d:i","l:i","idx:s","idx2:s","cen:s","span:s","h");
+#print help if that option is used
+if($options{h}){die $usage;}
+
+my @filein=@{$options{'i'}};
+my @mark=@{$options{'tag'}};
+
+#my $config=$options{'i'};
+my $genome_fa=$options{'g'};
+my $gff=$options{'f'};
+
+
+##########################################################################################
+my $predir=`pwd`;
+chomp $predir;
+my $workdir=defined($options{'o'}) ? $options{'o'}:$predir;
+
+my $path=$options{'path'};
+
+my $t=defined($options{'t'})? $options{'t'}:1; #threads number
+
+my $mis=defined $options{'mis'} ? $options{'mis'}:0;
+
+my $mis_rfam=defined $options{'v'} ? $options{'v'}:0;
+
+my $hit=defined $options{'n'}?$options{'n'}:25;
+
+my $distance_of_merged_tag=defined $options{'d'} ? $options{'d'}:100;
+
+my $up_down_dis=defined $options{'l'} ?$options{'l'}:1000;
+
+my $cluster_mothod=defined $options{'p'}?$options{'p'}:"F";
+
+my $format=$options{'format'};
+#if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") { 
+#	die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n";
+#}
+
+my $adpter="ATCTCGTATG";  #adapter
+if (defined $options{'a'}) {$adpter=$options{'a'};}
+
+
+my $phred_qv=64;
+my $sample_number;
+my ($dir,$dir_tmp);
+################################  MAIN  ##################################################
+print "\ncluster program start:";
+my $time=Time();
+make_dir_tmp();
+
+my @clip;
+my $mark;
+my $sample_mark;
+
+my $config=$dir."/input_config";
+open CONFIG,">$config";
+	for (my $i=0;$i<@filein;$i++) {
+		print CONFIG $filein[$i],"\t",$mark[$i],"\n";
+	}
+close CONFIG;
+if (@filein != @mark) {
+	die "Maybe config file have some wrong!!!\n";
+}
+$sample_number=@mark;
+$mark=join "\t",@mark;
+$sample_mark=join "\#",@mark;
+
+
+#read_config();
+
+trim_adapter_and_filter();
+
+my $filter_out=$dir."preProcess\/"."collapse_reads_out.fa";## raw clean data
+my $data2=$filter_out;   ### mirbase not mapped reads
+my $data3=$dir."\/rfam_match\/rfam_not_mapped\.fa";   ### rfam not mapped reads
+my $bed=$dir."cluster\/"."sample\.bed";
+my $read=$dir."cluster\/"."sample_reads\.cluster";
+my $read_txt=$dir."cluster\/"."cluster\.txt";
+my $rpkm=$dir."cluster\/"."sample_rpkm\.cluster";
+my $preprocess;
+my $cluster_file;
+my $annotate_dir;
+my $deg_dir;
+my $plot_dir;
+my %id;
+for (my $i=0;$i<@mark ;$i++) {
+	$id{$mark[$i]}=$i+4;
+}
+
+print "\n######## tiandm test start ###########\n";
+print "\@mark: @mark\n\%id keys number:";
+print scalar keys %id;
+print "\n";
+foreach my $kyess (keys %id){
+	print $kyess," --> $id{$kyess}\n";	
+}
+print "\n######## tiandm test end ############\n";
+group_and_filter();   #collapse reads to tags
+
+rfam();
+
+my @map_read;
+my $map_tag=0;
+genome();
+
+bwt2bed();
+
+cluster();
+
+quantify();
+
+phase();
+
+if (defined($options{'nat'})&&defined($options{'repeat'})) {
+	class();
+}
+else{
+	get_genelist();
+}
+
+annotate();
+
+genome_length();
+
+plot();
+
+my @pairdir;
+if (defined($options{'deg'})) {
+	dec();
+	infor_merge();
+}
+else{infor_merge_no_dec()}
+html();
+print "\ncluster program end:";
+Time();
+############################sub program###################################################
+sub make_dir_tmp{
+
+	#make temporary directory
+	if(not -d "$workdir\/cluster_runs"){
+		mkdir("$workdir\/cluster_runs");
+		mkdir("$workdir\/cluster_runs\/ref\/");
+	}
+
+	$dir="$workdir\/cluster_runs\/";
+	#print STDERR "mkdir $dir\n\n";
+	return;
+}
+
+#sub read_config{
+#	open IN,"<$config";
+#	while (my $aline=<IN>) {
+#		chomp $aline;
+#		my @tmp=split/\t/,$aline;
+#		push @filein,$tmp[0];
+#		push @mark,$tmp[1];
+#	}
+#	close IN;
+#	if (@filein != @mark) {
+#		die "Maybe config file have some wrong!!!\n";
+#	}
+#	$sample_number=@mark;
+#	$mark=join "\t",@mark;
+#	$sample_mark=join "\#",@mark;
+#}
+
+
+sub trim_adapter_and_filter{
+	my $time=time();
+	$preprocess=$dir."preProcess/";
+	mkdir $preprocess;
+	my $can_use_threads = eval 'use threads; 1';
+	if ($can_use_threads) {
+	# Do processing using threads
+		my @filein1=@filein; my @mark1=@mark;
+		while (@filein1>0) {
+			my @thrs; my @res;
+			for (my $i=0;$i<$t ;$i++) {
+				last if(@filein1==0);
+				my $in=shift @filein1;
+				my $out=shift @mark1;
+				push @clip,$dir."preProcess\/$out\_clip\.fq";
+				$thrs[$i]=threads->create(\&clips,$in,$out);
+			}
+			for (my $i=0;$i<@thrs;$i++) {
+				$res[$i]=$thrs[$i]->join();
+			}
+		}
+	} 
+	else {
+# Do not processing using threads
+		for (my $i=0;$i<@filein ;$i++) {
+			my $in=$filein[$i];
+			my $out=$mark[$i];
+			push @clip,$dir."preProcess\/$out\_clip\.fq";
+			&clips($in,$out);
+		}
+	}
+}
+
+sub clips{
+	my ($filein,$fileout)=@_;
+	my $adapter=$dir."preProcess\/$fileout\_clip\.fq";
+	if($format eq "fq" || $format eq "fastq"){
+		my $clip=`fastx_clipper -a $adpter -M 6  -Q $phred_qv -i $filein -o $adapter`;
+	}
+	if($format eq "fa" || $format eq "fasta"){
+		my $clip=`fastx_clipper -a $adpter -M 6 -i $filein -o $adapter`;
+	}
+	#my $clean=$dir."preProcess\/$fileout\_clean.fq";
+	#my $filter=`filterReadsByLength.pl -i $adapter -o $clean -min 18 -max 40 `;
+	return $fileout;
+}	
+
+sub group_and_filter{
+	#my ($ins,$data)=@_;
+	my @ins=@clip;
+	my $str="";
+	my $group_out_file=$dir."preProcess\/"."collapse_reads.fa";
+	#print "$$ins[0]\t$$ins[0]\n";
+	for (my $i=0;$i<@clip;$i++) {
+		$str .="-i $clip[$i] ";
+		#print "$$ins[$i]\n";
+	}
+	my $group=`perl $path\/collapseReads2Tags.pl $str -mark seq -o $group_out_file -format $format`;
+	print "perl $path\/collapseReads2Tags.pl $str -mark seq -o $group_out_file -format $format\n\n";
+
+	my $l_out=$dir."preProcess\/"."collapse_reads_18-40.fa";
+	my $length_f=`perl $path\/filterReadsByLength_1.pl -i $group_out_file -o $l_out -min 18 -max 40 -mark $sample_mark`;
+	print "perl $path\/filterReadsByLength_1.pl -i $group_out_file -o $l_out -min 18 -max 40 -mark $sample_mark\n\n";
+	my $cout_f=`perl $path\/filterReadsByCount.pl -i $l_out -o $filter_out -mark $sample_mark`;
+	print "perl $path\/filterReadsByCount.pl -i $l_out -o $filter_out -mark $sample_mark\n\n";
+	my $plot_l_D=`perl $path/Length_Distibution.pl -i $dir/preProcess/reads_length_distribution_after_count_filter.txt -o $dir/preProcess/length.html `;
+	print "perl $path\/Length_Distibution.pl -i $dir\/preProcess\/reads_length_distribution_after_count_filter.txt -o $dir\/preProcess\/length\.html\n\n";
+	return 0;
+}
+
+sub rfam{
+	if (defined $options{'idx2'}) {
+		system("perl $path\/rfam.pl -i $data2 -ref $options{rfam} -v $mis_rfam -p $t -o $dir -index $options{idx2}");
+	}else{
+		system("perl $path\/rfam.pl -i $data2 -ref $options{rfam} -v $mis_rfam -p $t -o $dir");
+	}
+	my $tag=join "\\;" ,@mark;
+	my $rfam_count=`perl $path\/count_rfam_express.pl -i $dir\/rfam_match\/rfam_mapped.bwt -tag $tag -o $dir\/rfam_match\/rfam_non-miRNA_annotation.txt`;
+	return 0;
+}
+sub genome{
+	if(defined $options{'idx'}){
+		system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir -index $options{idx}") ;
+	}else{
+		system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir ") ;
+	}
+	#=================== mapping sta ===================================================
+	my $map_file=$dir."genome_match\/genome_mapped\.fa";
+	open (MAP,"<$map_file")||die"$!";
+	print "\n#each sample mapping reads sta:\n\n";
+	print "#$mark\ttotal\n";
+	while (my $ID=<MAP>) {
+		chomp $ID;
+		my @tmp=split/\:/,$ID;
+		my @exp=split/\_/,$tmp[1];
+		$exp[-1] =~ s/^x//;
+		for (my $i=0;$i<@exp ;$i++) {
+			$map_read[$i]+=$exp[$i];
+		}
+		$map_tag++;
+		my $seq=<MAP>;
+	}
+	my $map_read=join"\t",@map_read;
+	print "$map_read\n\n";
+	print "#total mapped tags:$map_read\n\n";
+	close MAP;
+	return 0;
+}
+
+sub bwt2bed{
+	$cluster_file=$dir."cluster\/";
+	mkdir ("$cluster_file");
+	print "sam file changed to bed file\n";
+	my ($file) = $dir."genome_match\/genome_mapped\.bwt";
+
+	my $sam2bed=`perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed `;
+	print "perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed\n\n";
+	return 0;
+}
+
+sub cluster{
+	print "tags is ready to merged clusters\n\n";
+	my ($file) =$bed;
+	if ($cluster_mothod eq "F") {
+		my $cluster=`perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt`;
+		print "Using converntional method\n perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt\n\n";
+	}
+	elsif($cluster_mothod eq "T"){
+		my $cluster=`perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $read_txt -k $sample_mark`;
+		print "Using nibls method\n perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $dir\/cluster.txt -k $sample_mark\n\n";
+	}
+	else{print "\-p is wrong!\n\n";}
+	return 0;
+}
+
+
+sub quantify{
+	print "clusters is ready to quantified\n\n";
+	my @depth=@map_read;
+	pop @depth;
+	my $depth=join ",",@depth;
+	my $quantify=`perl $path\/quantify.pl -i $read -d $depth -o $rpkm`;
+	print "perl $path\/quantify.pl -i $read -d $depth -o $rpkm\n\n\n";
+	return 0;
+}
+
+sub phase{
+	$annotate_dir=$dir."annotate\/";
+	mkdir ("$annotate_dir");
+	print "clusters is to predict phase siRNA\n";
+	my $phase=`perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out`;
+	print "perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out\n\n\n";
+	return 0;
+}
+
+sub class{
+	print "clusters is ready to annotate by sources\n\n";
+	my $nat=$options{'nat'};
+	my $repeat=$options{'repeat'};
+	my $class=`perl $path\/ClassAnnotate.pl -i $rpkm -g $gff  -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt`;
+	print "perl $path\/ClassAnnotate.pl -i $rpkm -g $gff  -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt\n\n";
+}
+
+sub annotate{
+	print "clusters is ready to annotate by gff file\n\n";
+	my $file;
+	if (defined($options{'nat'})&&defined($options{'repeat'})) {
+		$file="$annotate_dir\/sample_class.anno";
+	}
+	else{
+		$file=$rpkm;
+	}
+	my $annotate=`perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno`;
+	print "perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno\n\n";
+	return 0;
+}
+sub get_genelist{
+
+	my $get_genelist=`perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt`;
+	print "perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt";
+}
+
+sub dec{
+	print "deg reading\n\n";
+	my $deg_file=$options{'deg'};
+	open IN,"<$deg_file";
+	my @deg;
+	my $s=0;
+		while (my $aline=<IN>) {
+		chomp $aline;
+		next if($aline=~/^\#/);
+		$deg[$s]=$aline;
+		my @ea=split/\s+/,$aline;
+		push @pairdir,"$ea[0]_VS_$ea[1]\/";
+		#print "$deg[$s]\n";
+		$s++;
+	}
+	close IN;
+	$deg_dir=$dir."deg\/";
+	mkdir ("$deg_dir");
+	my $max_process = 10;
+	my $pm = new Parallel::ForkManager( $max_process );
+	my $number=@deg-1;
+	foreach(0..$number){
+		$pm->start and next;
+		&dec_pel($deg[$_]);
+		$pm->finish;
+	}
+	$pm->wait_all_children;
+}
+
+sub dec_pel{
+	print "\n******************\nstart:\n";
+	Time();
+	my $sample=shift(@_);
+	my @each=split/\s+/,$sample;
+	print "$each[0]\t$each[1]\n";
+	my $deg_sample_dir=$deg_dir."$each[0]_VS_$each[1]\/";
+	mkdir ("$deg_sample_dir");
+	print "read: $read\n";
+	print "deg_sample_dir: $deg_sample_dir\n";
+	print "$id{$each[0]}\t$each[0]\n";
+	print "$id{$each[1]}\t$each[1]\n";
+	my $deg=`perl $path\/DEGseq_2.pl -i $read -outdir $deg_sample_dir -column1 $id{$each[0]} -mark1 $each[0] -column2 $id{$each[1]} -mark2 $each[1]`; #-depth1 -depth2
+	my $time2=time();
+	print "end:\n*************************\n";
+	Time();
+	sleep 1;
+}
+
+sub infor_merge{
+	my ($input,$mark);
+	foreach (@pairdir) {
+		print "@pairdir\n";
+		$mark.=" -mark $_ ";
+		$input.=" -i $dir/deg\/$_\/output_score\.txt ";
+		print "$input\n$mark\n";
+	}
+	my $infor_merge=`perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result `;
+	print "perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result\n\n";
+}
+
+sub infor_merge_no_dec{
+	my $infor_merge_no_dec=`cp $annotate_dir\/sample_c_p.anno $dir\/total.result`;
+}
+
+sub genome_length{
+	my $length=`perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length`;
+	print "perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length\n\n"
+
+}
+
+sub plot{
+	$plot_dir="$dir\/plot\/";
+	mkdir ("$plot_dir");
+	my $span=defined($options{span})?$options{span}:50000;
+	my $cen="";
+	if (defined $options{cen}) {
+		$cen="-cen $options{cen}";
+	}
+	my $plot=`perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome\.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt `;
+	"print perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt \n";
+
+}
+
+sub html{
+	my $pathfile="$dir/path.txt";
+	open PA,">$pathfile";
+	print PA "$config\n";
+	print PA "$preprocess\n";
+	print PA "$dir"."rfam_match\n";
+	print PA "$dir"."genome_match\n";
+	print PA "$cluster_file\n";
+	print PA "$annotate_dir\n";
+	print PA "$plot_dir\n";
+	if (defined($deg_dir)) {
+			print PA "$deg_dir\n";
+	}
+	close PA;
+	my $html=`perl $path\/html.pl -i $pathfile -format $format -o $dir/result.html`;
+}
+
+sub Time{
+	my $time=time();
+	my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];
+	$month++;
+	$year+=1900;
+	if (length($sec) == 1) {$sec = "0"."$sec";}
+	if (length($min) == 1) {$min = "0"."$min";}
+	if (length($hour) == 1) {$hour = "0"."$hour";}
+	if (length($day) == 1) {$day = "0"."$day";}
+	if (length($month) == 1) {$month = "0"."$month";}
+	print "$year-$month-$day $hour:$min:$sec\n";
+	return("$year-$month-$day-$hour-$min-$sec");
+}
+#################################################################################