Mercurial > repos > big-tiandm > sirna_plant

#!/usr/bin/perl -w
my $version=1.00;
use strict;
use warnings;
use Getopt::Long;
use Getopt::Std;
use threads;
use threads::shared;
use Parallel::ForkManager;
use lib '/leofs/biotrans/chentt/perl_module/';
#perl ../siRNA.pl -i config -g /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome.fa -f /share_bio/hs4/disk3-4/Reference/Plants/Rice_TIGR/Reference/TIGR/version_6.1/all.dir/all.gff3 -path /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/ -o /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test -t 3 -rfam /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/Rfam.fasta -idx /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome -idx2 /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/rfam -deg deg -n 25 -nat class/nat_1 -repeat class/repeat_1 -cen centromere_TIGR.txt -format fastq
# Program banner; printed to STDOUT unconditionally, before any option is parsed.
print "
#####################################
#                                   #
#    sRNA cluster                   #
#                                   #
#####################################
";
###########################################################################################
# Help text. "$0" interpolates to the name the script was invoked with.
# The option names listed here mirror the specification passed to GetOptions() below;
# the script dies with this text when -h is given.
my $usage="$0
Options:
-i input file# raw data file
-tag string #raw data sample name
-g  genome file
-f  gff file

-o  workdir file
-path  script path
-t  int,    number of threads [1]
-format  fastq, fq, fasta or fa
-idx  string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter
		string must be the prefix of the bowtie index. For instance, if
		the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
		the prefix is 'h_sapiens_37_asm'.##can be null
-mis  int     number of allowed mismatches when mapping reads to genome, default 0
-rfam  string,  input file# rfam database file.
-idx2  string,  rfam file index, file-prefix #(must be indexed by bowtie-build) The parameter
		string must be the prefix of the bowtie index. For instance, if
		the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
		the prefix is 'h_sapiens_37_asm'.##can be null

-v  int report end-to-end hits w/ <=v mismatches; ignore qualities,default 0; used in rfam alignment

-a string,  ADAPTER string. default is ATCTCGTATG.
-n  int max hits number,default 25; used in genome alignment
-d  int distance of tag to merged a cluster; default 100
-p  cluster method F :conventional default is F
				   T :NIBLES
-l  int the length of the upstream and downstream,default 1000;used in position annotate

-nat  natural antisense transcripts file
-repeat  repeat information file out of Repeatmasker
-deg  file config of de sample
-cen  centromere file input
-span  plot span, default 50000
";

# ---- Command-line parsing ---------------------------------------------------------------
# Every option is stored in %options, keyed by option name. -i and -tag are list
# options ("@") and may be repeated, one -tag per -i, in the same order.
my %options;
GetOptions(\%options,"i:s@","tag:s@","g=s","f=s","o=s","a:s","path:s","p=s","format=s","nat:s","repeat:s","deg:s","n:i","mis:i","rfam:s","t:i","v:i","d:i","l:i","idx:s","idx2:s","cen:s","span:s","h");

#print help if that option is used
# (checked first so that -h works even when no other option was supplied)
if($options{h}){die $usage;}

# Default to empty lists: dereferencing an undefined value as an array is fatal
# under "use strict", which would hide the real problem (a missing -i / -tag).
my @inputfiles=@{ $options{'i'} || [] };
my @inputtags=@{ $options{'tag'} || [] };
if (@inputfiles == 0 || @inputfiles != @inputtags) {
	die "Each input file (-i) needs exactly one sample name (-tag).\n\n$usage";
}

#my $config=$options{'i'};
my $genome_fa=$options{'g'};
my $gff=$options{'f'};
##########################################################################################
# Results are written below -o, or below the current directory when -o is absent.
my $predir=`pwd`;
chomp $predir;
my $workdir=defined($options{'o'}) ? $options{'o'}:$predir;

my $path=$options{'path'};   # directory holding the helper scripts and fastx_clipper

my $t=defined($options{'t'})? $options{'t'}:1; #threads number

my $mis=defined $options{'mis'} ? $options{'mis'}:0;   # -v for the genome alignment

my $mis_rfam=defined $options{'v'} ? $options{'v'}:0;   # -v for the rfam alignment

my $hit=defined $options{'n'}?$options{'n'}:25;   # max hits in the genome alignment

my $distance_of_merged_tag=defined $options{'d'} ? $options{'d'}:100;

my $up_down_dis=defined $options{'l'} ?$options{'l'}:1000;

my $cluster_mothod=defined $options{'p'}?$options{'p'}:"F";

my $format=$options{'format'};
#if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") {
#	die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n";
#}

# Adapter handed to fastx_clipper; -a overrides the built-in default.
my $adpter=defined $options{'a'} ? $options{'a'} : "ATCTCGTATG";  #adapter

my $phred_qv=64;   # value passed to fastx_clipper -Q for fastq input
my $sample_number;
my ($dir,$dir_tmp);
################################  MAIN  ##################################################
# Pipeline driver: runs every stage once, in order. The stage subs communicate through
# the file-level lexicals declared in this section, so the order of the statements
# below is significant.
print "\ncluster program start:";
my $time=Time();   # run timestamp; make_dir_tmp() uses it to name the run directory
make_dir_tmp();

my (@filein,@mark,@clip);
my $mark;
my $sample_mark;

# Directories filled in by the stage subs. They are declared here, before the first
# stage runs, so that no sub assigns to a lexical whose "my" has not executed yet.
my $preprocess;
my $cluster_file;
my $annotate_dir;
my $deg_dir;

# Write the -i / -tag pairs as a two-column, tab-separated config file; read_config()
# reads it back into @filein / @mark.
my $config=$workdir."/input_config";
open(my $config_fh,'>',$config) or die "Cannot write config file $config: $!\n";
for my $i (0..$#inputfiles) {
	print {$config_fh} $inputfiles[$i],"\t",$inputtags[$i],"\n";
}
close($config_fh) or die "Cannot close config file $config: $!\n";

read_config();

trim_adapter_and_filter();

my $filter_out=$dir."preProcess\/"."collapse_reads_out.fa";## raw clean data
my $data2=$filter_out;   ### mirbase not mapped reads
my $data3=$dir."\/rfam_match\/rfam_not_mapped\.fa";   ### rfam not mapped reads
my $bed=$dir."cluster\/"."sample\.bed";
my $read=$dir."cluster\/"."sample_reads\.cluster";
my $read_txt=$dir."cluster\/"."cluster\.txt";
my $rpkm=$dir."cluster\/"."sample_rpkm\.cluster";
# Sample name -> column number passed to DEGseq_2.pl (-column1 / -column2) in dec_pel();
# the first sample gets column 4.
my %id;
for my $i (0..$#mark) {
	$id{$mark[$i]}=$i+4;
}
group_and_filter();   #collapse reads to tags

rfam();

my @map_read;
my $map_tag=0;
genome();

bwt2bed();

cluster();

quantify();

phase();

class();

annotate();

genome_length();

plot();

# The differential-expression stages only run when a -deg config file was given.
my @pairdir;
if (defined($options{'deg'})) {
	dec();
	infor_merge();
}
html();
print "\ncluster program end:";
Time();
############################sub program###################################################
sub make_dir_tmp{
	# Create the per-run output directory <workdir>/cluster_runs_<time> together with
	# its ref/ subdirectory, and store the run directory (with a trailing slash) in
	# the file-level $dir. Takes no arguments and returns nothing.
	# Dies when a directory cannot be created, because every later stage writes
	# below $dir.
	my $run_dir="$workdir\/cluster_runs_$time";

	# Each directory is checked on its own, so ref/ is still created when the run
	# directory itself already exists.
	foreach my $wanted ($run_dir,"$run_dir\/ref\/") {
		next if -d $wanted;
		mkdir($wanted) or die "Cannot create directory $wanted: $!\n";
	}

	$dir="$run_dir\/";
	print STDERR "mkdir $dir\n\n";
	return;
}

sub read_config{
	# Read the tab-separated config file ($config): column 1 is an input file,
	# column 2 its sample name. Fills the file-level @filein and @mark, then derives
	# $sample_number, $mark (tab-joined names) and $sample_mark ('#'-joined names).
	# Dies when the file cannot be opened or a line lacks one of the two columns.
	open(my $config_in,'<',$config) or die "Cannot open config file $config: $!\n";
	while (my $aline=<$config_in>) {
		chomp $aline;
		next if $aline !~ /\S/;   # ignore blank lines
		my ($file,$tag)=split /\t/,$aline;
		# Validate per line: comparing the two array lengths afterwards cannot
		# detect a missing column, because both arrays always grow together.
		if (!defined $file || $file eq "" || !defined $tag || $tag eq "") {
			die "Maybe config file have some wrong!!!\n(line $. of $config needs two tab-separated columns)\n";
		}
		push @filein,$file;
		push @mark,$tag;
	}
	close $config_in;
	$sample_number=@mark;
	$mark=join "\t",@mark;
	$sample_mark=join "\#",@mark;
}


sub trim_adapter_and_filter{
	# Run clips() on every input file, recording each expected output path
	# (<dir>preProcess/<sample>_clip.fq) in @clip. With threads available the files
	# are processed in batches of at most $t concurrent threads; otherwise one by one.
	$preprocess=$dir."preProcess/";
	mkdir $preprocess;
	my $threads_ok = eval 'use threads; 1';
	if ($threads_ok) {
		# Work on copies so that @filein / @mark stay intact for later stages.
		my @pending_files=@filein;
		my @pending_tags=@mark;
		while (@pending_files>0) {
			# Start up to $t workers ...
			my @workers;
			while (@workers<$t && @pending_files>0) {
				my $in=shift @pending_files;
				my $out=shift @pending_tags;
				# @clip is filled here, in the parent, not inside the worker.
				push @clip,$dir."preProcess/${out}_clip.fq";
				my $worker=threads->create(\&clips,$in,$out);
				push @workers,$worker;
			}
			# ... and wait for the whole batch before starting the next one.
			foreach my $worker (@workers) {
				my $finished=$worker->join();
			}
		}
	}
	else {
		# No thread support: clip the files sequentially.
		foreach my $idx (0..$#filein) {
			my ($in,$out)=($filein[$idx],$mark[$idx]);
			push @clip,$dir."preProcess/${out}_clip.fq";
			&clips($in,$out);
		}
	}
}

sub clips{
	# Clip the adapter ($adpter) from one input file with fastx_clipper.
	#   $filein  - input file
	#   $fileout - sample name; the result goes to <dir>preProcess/<sample>_clip.fq
	# Returns the sample name. Nothing is run when $format is none of
	# fq/fastq/fa/fasta.
	my ($filein,$fileout)=@_;
	my $clipped=$dir."preProcess/${fileout}_clip.fq";

	# Only fastq input gets the -Q option.
	my $quality_opt;
	if ($format eq "fq" || $format eq "fastq") {
		$quality_opt="  -Q $phred_qv";
	}
	if ($format eq "fa" || $format eq "fasta") {
		$quality_opt="";
	}
	if (defined $quality_opt) {
		my $clip=`$path/fastx_clipper -a $adpter -M 6$quality_opt -i $filein -o $clipped`;
	}
	#my $clean=$dir."preProcess\/$fileout\_clean.fq";
	#my $filter=`filterReadsByLength.pl -i $adapter -o $clean -min 18 -max 40 `;
	return $fileout;
}

sub group_and_filter{
	# Three helper scripts in sequence, each command echoed to STDOUT after it ran:
	#   1. collapseReads2Tags.pl  - all clipped files (@clip) -> collapse_reads.fa
	#   2. filterReadsByLength_1.pl - keep lengths 18..40 -> collapse_reads_18-40.fa
	#   3. filterReadsByCount.pl  - -> $filter_out
	# Returns 0.
	my $collapsed=$dir."preProcess/collapse_reads.fa";
	my $length_filtered=$dir."preProcess/collapse_reads_18-40.fa";

	# One "-i <file> " per clipped file (the trailing space is part of each item).
	my $input_args=join "",map { "-i $_ " } @clip;

	my $collapse_cmd="perl $path/collapseReads2Tags.pl $input_args -mark seq -o $collapsed -format $format";
	my $group=`$collapse_cmd`;
	print "$collapse_cmd\n\n";

	my $length_cmd="perl $path/filterReadsByLength_1.pl -i $collapsed -o $length_filtered -min 18 -max 40 -mark $sample_mark";
	my $length_f=`$length_cmd`;
	print "$length_cmd\n\n";

	my $count_cmd="perl $path/filterReadsByCount.pl -i $length_filtered -o $filter_out -mark $sample_mark";
	my $cout_f=`$count_cmd`;
	print "$count_cmd\n\n";
	return 0;
}

sub rfam{
	# Run rfam.pl on $data2 against the -rfam reference (adding -index when -idx2 was
	# given), then run count_rfam_express.pl on rfam_match/rfam_mapped.bwt.
	# Returns 0.
	my $rfam_cmd="perl $path/rfam.pl -i $data2 -ref $options{rfam} -v $mis_rfam -p $t -o $dir";
	if (defined $options{'idx2'}) {
		$rfam_cmd.=" -index $options{idx2}";
	}
	system($rfam_cmd);

	# Sample names joined by a backslash-escaped ';' so the shell passes them on as
	# one argument.
	my $sample_tags=join "\\;",@mark;
	my $rfam_count=`perl $path/count_rfam_express.pl -i $dir/rfam_match/rfam_mapped.bwt -tag $sample_tags -o $dir/rfam_match/rfam_non-miRNA_annotation.txt`;
	return 0;
}
sub genome{
	# Run matching.pl on $data3 against the genome (adding -index when -idx was
	# given), then summarise genome_match/genome_mapped.fa:
	#   @map_read - column-wise sums of the counts found in the ID lines
	#   $map_tag  - number of records (tags) read
	# Both are file-level variables; the sums and the tag count are printed.
	# Returns 0; dies when the mapped-reads file cannot be opened.
	my $match_cmd="perl $path/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir";
	if (defined $options{'idx'}) {
		$match_cmd.=" -index $options{idx}";
	}
	system($match_cmd);
	#=================== mapping sta ===================================================
	my $map_file=$dir."genome_match/genome_mapped.fa";
	open(my $map_fh,'<',$map_file) or die "Cannot open $map_file: $!\n";
	print "\n#each sample mapping reads sta:\n\n";
	print "#$mark\ttotal\n";
	# Records are read as pairs of lines: an ID line followed by a sequence line.
	while (my $id_line=<$map_fh>) {
		my $seq=<$map_fh>;   # consume the sequence line of this record
		chomp $id_line;
		# The counts are the text after the first ':' - '_'-separated numbers, the
		# last one carrying an 'x' prefix (presumably the total; the header printed
		# above labels the last column "total").
		my (undef,$counts)=split /:/,$id_line;
		next unless defined $counts;   # ID line without a count field
		my @exp=split /_/,$counts;
		next unless @exp;
		$exp[-1] =~ s/^x//;
		for my $i (0..$#exp) {
			$map_read[$i]+=$exp[$i];
		}
		$map_tag++;
	}
	close $map_fh;
	my $map_read=join "\t",@map_read;
	print "$map_read\n\n";
	# Report the tag counter here; the per-sample read sums were printed just above.
	print "#total mapped tags:$map_tag\n\n";
	return 0;
}

sub bwt2bed{
	# Create <dir>cluster/ (stored in $cluster_file) and convert
	# genome_match/genome_mapped.bwt into the BED file $bed with sam2Bed_bowtie.pl.
	# The command is echoed to STDOUT after it ran. Returns 0.
	$cluster_file=$dir."cluster/";
	mkdir($cluster_file);
	print "sam file changed to bed file\n";

	my $bwt_file=$dir."genome_match/genome_mapped.bwt";
	my $convert_cmd="perl $path/sam2Bed_bowtie.pl -i $bwt_file -mark $sample_mark -o $bed";
	my $sam2bed=`$convert_cmd `;
	print "$convert_cmd\n\n";
	return 0;
}

sub cluster{
	# Merge the tags of the BED file ($bed) into clusters, using the method selected
	# by -p ($cluster_mothod): "F" runs conventional.pl, "T" runs nibls.pl.
	# Any other value only prints a message; no clustering is done.
	# The executed command is echoed to STDOUT. Returns 0.
	print "tags is ready to merged clusters\n\n";
	my $file=$bed;
	if ($cluster_mothod eq "F") {
		my $cluster_cmd="perl $path/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt";
		my $cluster=`$cluster_cmd`;
		print "Using converntional method\n $cluster_cmd\n\n";
	}
	elsif($cluster_mothod eq "T"){
		# Build the command once so that the echoed text is exactly what was run
		# (including the real -t target, $read_txt).
		my $cluster_cmd="perl $path/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $read_txt -mark $sample_mark";
		my $cluster=`$cluster_cmd`;
		print "Using nibls method\n $cluster_cmd\n\n";
	}
	else{print "\-p is wrong!\n\n";}
	return 0;
}


sub quantify{
	# Run quantify.pl on the cluster read counts ($read), writing $rpkm. The -d
	# argument is the comma-joined @map_read without its last element (presumably
	# the "total" column - see the header printed by genome()). Returns 0.
	print "clusters is ready to quantified\n\n";
	my @per_sample=@map_read[0..$#map_read-1];
	my $depth_list=join ",",@per_sample;
	my $quantify_cmd="perl $path/quantify.pl -i $read -d $depth_list -o $rpkm";
	my $quantify=`$quantify_cmd`;
	print "$quantify_cmd\n\n\n";
	return 0;
}

sub phase{
	# Create <dir>annotate/ (stored in $annotate_dir) and run phased_siRNA.pl on
	# $read_txt, writing phase.out into that directory. The command is echoed to
	# STDOUT after it ran. Returns 0.
	$annotate_dir=$dir."annotate/";
	mkdir($annotate_dir);
	print "clusters is to predict phase siRNA\n";
	my $phase_cmd="perl $path/phased_siRNA.pl -i $read_txt -o $annotate_dir/phase.out";
	my $phase=`$phase_cmd`;
	print "$phase_cmd\n\n\n";
	return 0;
}

sub class{
	# Run ClassAnnotate.pl on $rpkm with the gff file, the -nat and -repeat files and
	# phase.out; it is asked to write sample_class.anno and nat.out into
	# $annotate_dir and genelist.txt into <dir>/ref/. The command is echoed to STDOUT
	# after it ran.
	print "clusters is ready to annotate by source\n\n";
	my ($nat,$repeat)=@options{'nat','repeat'};
	my $class_cmd="perl $path/ClassAnnotate.pl -i $rpkm -g $gff  -n $nat -r $repeat -p $annotate_dir/phase.out -o $annotate_dir/sample_class.anno -t $annotate_dir/nat.out -l $dir/ref/genelist.txt";
	my $class=`$class_cmd`;
	print "$class_cmd\n\n";
}

sub annotate{
	# Run Annotate.pl on sample_class.anno with <dir>/ref/genelist.txt and the
	# up/downstream distance $up_down_dis, writing sample_c_p.anno into
	# $annotate_dir. The command is echoed to STDOUT after it ran. Returns 0.
	print "clusters is ready to annotate by gff file\n\n";
	my $class_anno="$annotate_dir/sample_class.anno";
	my $annotate_cmd="perl $path/Annotate.pl -i $class_anno -g $dir/ref/genelist.txt -d $up_down_dis -o $annotate_dir/sample_c_p.anno";
	my $annotate=`$annotate_cmd`;
	print "$annotate_cmd\n\n";
	return 0;
}

sub dec{
	# Read the -deg config file: one sample pair per line (two whitespace-separated
	# sample names); lines starting with '#' and blank lines are ignored.
	# For every pair, "<a>_VS_<b>/" is appended to the file-level @pairdir and
	# dec_pel() is run in a forked child, at most $max_process children at a time.
	# Dies when the file cannot be opened or a line holds fewer than two names.
	print "deg reading\n\n";
	my $deg_file=$options{'deg'};
	open(my $deg_fh,'<',$deg_file) or die "Cannot open deg config file $deg_file: $!\n";
	my @deg;
	while (my $aline=<$deg_fh>) {
		chomp $aline;
		next if($aline=~/^\#/);
		next if($aline!~/\S/);   # blank line
		# Drop leading whitespace so that splitting on \s+ (here and in dec_pel)
		# does not yield an empty first field.
		$aline=~s/^\s+//;
		my @ea=split /\s+/,$aline;
		if (@ea<2) {
			die "Line $. of $deg_file does not name two samples: $aline\n";
		}
		push @deg,$aline;
		push @pairdir,"$ea[0]_VS_$ea[1]\/";
	}
	close $deg_fh;
	$deg_dir=$dir."deg\/";
	mkdir ("$deg_dir");
	my $max_process = 10;
	my $pm = Parallel::ForkManager->new( $max_process );
	foreach my $pair (@deg) {
		# start() returns the child's pid in the parent (true, so the parent moves
		# on to the next pair) and 0 in the child, which then does the work.
		$pm->start and next;
		&dec_pel($pair);
		$pm->finish;
	}
	$pm->wait_all_children;
}

sub dec_pel{
	# Run DEGseq_2.pl for one sample pair.
	#   argument: one line of the deg config - two whitespace-separated sample names
	# Creates <deg_dir><a>_VS_<b>/ as the output directory; the columns for the two
	# samples are looked up in %id. Timestamps are printed before and after the run.
	print "start:\n";
	Time();
	my ($pair_line)=@_;
	my ($first,$second)=split /\s+/,$pair_line;
	print "$first\t$second\n";
	my $pair_dir=$deg_dir."${first}_VS_${second}/";
	mkdir($pair_dir);
	my $deg=`perl $path/DEGseq_2.pl -i $read -outdir $pair_dir -column1 $id{$first} -mark1 $first -column2 $id{$second} -mark2 $second`; #-depth1 -depth2
	print "end:\n";
	Time();
	sleep 1;
}

sub infor_merge{
	# Run SampleDEGseqMerge.pl with one "-i <dir>/deg/<pair>/output_score.txt" and
	# one "-mark <pair>" per entry of @pairdir, together with sample_c_p.anno and
	# the sample count; the output file is <dir>/total.result.
	# The growing argument strings are printed on every iteration, and the final
	# command is echoed to STDOUT after it ran.
	my ($input_args,$mark_args);
	for my $pair (@pairdir) {
		print "@pairdir\n";
		$mark_args.=" -mark $pair ";
		$input_args.=" -i $dir/deg/$pair/output_score.txt ";
		print "$input_args\n$mark_args\n";
	}
	my $merge_cmd="perl $path/SampleDEGseqMerge.pl $input_args $mark_args -f $annotate_dir/sample_c_p.anno -n $sample_number -o $dir/total.result";
	my $infor_merge=`$merge_cmd `;
	print "$merge_cmd\n\n";
}

sub genome_length{
	# Run count_ref_length.pl on the genome fasta, writing <dir>/ref/genome.length
	# (read back by plot()). The command is echoed to STDOUT after it ran.
	my $length_cmd="perl $path/count_ref_length.pl -i $genome_fa -o $dir/ref/genome.length";
	my $length=`$length_cmd`;
	print "$length_cmd\n\n"

}

sub plot{
	# Produce the plots below <dir>/plot/:
	#   genome/<sample>.html and .txt - one run of
	#       sRNA_rpkm_distribution_along_genome.pl per sample
	#   chr/<name>.html               - one run of chr_plot.pl per line of
	#       <dir>/ref/genome.length
	# Every command is echoed to STDOUT after it ran. When genome.length cannot be
	# read, a warning is issued and the per-chromosome plots are skipped.
	my $plot_file="$dir\/plot\/";
	mkdir ("$plot_file");
	my $genome_plot="$dir\/plot\/genome\/";
	mkdir ("$genome_plot");
	my $length_file="$dir\/ref\/genome\.length";
	#genome cluster
	my $span=defined($options{span})?$options{span}:50000;
	# The optional centromere argument is the same for every sample, so build it once.
	my $cen=defined $options{cen} ? "-cen $options{cen}" : "";
	foreach my $n (1..$sample_number) {
		my $sample=$mark[$n-1];   # -n is 1-based, @mark is 0-based
		my $plot_cmd="perl $path\/sRNA_rpkm_distribution_along_genome.pl -c $rpkm -n $n -mark $sample -span $span -l $length_file $cen -o $genome_plot\/$sample\.html -out $genome_plot\/$sample\.txt";
		my $plot=`$plot_cmd`;
		print "$plot_cmd\n\n";
	}

	my $chr_plot_dir="$dir\/plot\/chr\/";
	mkdir("$chr_plot_dir");
	# genome.length: tab-separated, column 1 = sequence name, column 2 = the value
	# handed to chr_plot.pl as -l; lines starting with '#' are skipped.
	my %chr;
	my $len_fh;
	unless (open($len_fh,'<',$length_file)) {
		warn "Cannot open $length_file: $! - chromosome plots skipped\n";
		return;
	}
	while (my $aline=<$len_fh>) {
		next if($aline=~/^\#/);
		chomp $aline;
		my @temp=split/\t/,$aline;
		$chr{$temp[0]}=$temp[1];
	}
	close $len_fh;
	foreach my $chr (sort keys %chr) {
		my $chr_cmd="perl $path\/chr_plot.pl -l $chr{$chr} -chro $chr -g $dir\/ref\/genelist.txt -span $span  -c $rpkm -mark $sample_mark -o $chr_plot_dir\/$chr\.html";
		my $chr_plot=`$chr_cmd`;
		print "$chr_cmd\n";
	}
}

sub html{
	# Write <dir>/path.txt - one path per line: the config file, then the
	# preProcess, rfam_match, genome_match, cluster, annotate and deg directories -
	# and run html.pl on it to produce <dir>/result.html.
	# Dies when path.txt cannot be written.
	my $pathfile="$dir/path.txt";
	open(my $path_fh,'>',$pathfile) or die "Cannot write $pathfile: $!\n";
	print {$path_fh} "$config\n";
	print {$path_fh} "$preprocess\n";
	print {$path_fh} "$dir"."rfam_match\n";
	print {$path_fh} "$dir"."genome_match\n";
	print {$path_fh} "$cluster_file\n";
	print {$path_fh} "$annotate_dir\n";
	# $deg_dir is only set when -deg was given; an empty line is written otherwise
	# so that the number and order of lines in path.txt stay the same.
	my $deg_line=defined $deg_dir ? $deg_dir : "";
	print {$path_fh} "$deg_line\n";
	close($path_fh) or die "Cannot close $pathfile: $!\n";
	my $html=`perl $path\/html.pl -i $pathfile -format $format -o $dir/result.html`;
}

sub Time{
	# Print the current local time as "YYYY-MM-DD hh:mm:ss" (followed by a newline)
	# to STDOUT and return it as "YYYY-MM-DD-hh-mm-ss". All fields except the year
	# are zero-padded to two digits.
	my @now=localtime(time());
	my ($sec,$min,$hour,$day)=map { sprintf("%02d",$_) } @now[0..3];
	my $month=sprintf("%02d",$now[4]+1);   # localtime months are 0-based
	my $year=$now[5]+1900;                 # localtime years count from 1900
	print "$year-$month-$day $hour:$min:$sec\n";
	return join("-",$year,$month,$day,$hour,$min,$sec);
}
#################################################################################
author	big-tiandm
date	Thu, 18 Sep 2014 21:40:25 -0400
parents
children	f466394ee1fd