Mercurial > repos > big-tiandm > mirplant2

#!/usr/bin/perl -w
my $version=1.00;
use strict;
use warnings;
use Getopt::Long;
use Getopt::Std;
use threads;
use threads::shared;
use Parallel::ForkManager;
use lib '/leofs/biotrans/chentt/perl_module/';
#perl ../siRNA.pl -i config -g /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome.fa -f /share_bio/hs4/disk3-4/Reference/Plants/Rice_TIGR/Reference/TIGR/version_6.1/all.dir/all.gff3 -path /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/ -o /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test -t 3 -rfam /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/Rfam.fasta -idx /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/genome -idx2 /leofs/biotrans/projects/rice/smallRNA/sRNA_package/bin/test/ref/rfam -deg deg -n 25 -nat class/nat_1 -repeat class/repeat_1 -cen centromere_TIGR.txt -format fastq
print "
#####################################
#                                   #
#          sRNA cluster             #
#                                   #
#####################################
";
###########################################################################################
my $usage="$0
Options:
-i input file# fasta
-config input file
-g  genome file
-f  gff file

-o  workdir file
-path  script path
-t  int,    number of threads [1]
-format  fastq, fq, fasta or fa
-idx  string, genome file index, file-prefix #(must be indexed by bowtie-build) The parameter
		string must be the prefix of the bowtie index. For instance, if
		the first indexed file is called 'h_sapiens_37_asm.1.ebwt' then
		the prefix is 'h_sapiens_37_asm'.##can be null
-mis  int     number of allowed mismatches when mapping reads to genome, default 0

-n  int max hits number,default 25; used in genome alignment
-d  int distance of tag to merged a cluster; default 100
-p  cluster method F :conventional default is F
				   T :NIBLES
-l  int the length of the upstream and downstream,default 1000;used in position annotate

-nat  natural antisense transcripts file
-repeat  repeat information file out of Repeatmasker
-deg  file config of de sample
-cen  centromere file input
-span  plot span, default 50000
";

my %options;
GetOptions(\%options,"i:s","config=s","g=s","f=s","o=s","path:s","p=s","format=s","nat:s","repeat:s","deg:s","n:i","mis:i","t:i","d:i","l:i","idx:s","cen:s","span:s","h");
#print help if that option is used
if($options{h}){die $usage;}

my $filein=$options{'i'};

#my $config=$options{'i'};
my $genome_fa=$options{'g'};
my $gff=$options{'f'};


##########################################################################################
my $predir=`pwd`;
chomp $predir;
my $workdir=defined($options{'o'}) ? $options{'o'}:$predir;

my $path=$options{'path'};

my $t=defined($options{'t'})? $options{'t'}:1; #threads number

my $mis=defined $options{'mis'} ? $options{'mis'}:0;


my $hit=defined $options{'n'}?$options{'n'}:25;

my $distance_of_merged_tag=defined $options{'d'} ? $options{'d'}:100;

my $up_down_dis=defined $options{'l'} ?$options{'l'}:1000;

my $cluster_mothod=defined $options{'p'}?$options{'p'}:"F";

my $format=$options{'format'};
#if ($format ne "fastq" && $format ne "fq" && $format ne "fasta" && $format ne "fa") {
#	die "Parameter \"-format\" is error! Parameter is fastq, fq, fasta or fa\n";
#}


my $sample_number;
my ($dir,$dir_tmp);
################################  MAIN  ##################################################
print "\ncluster program start:";
my $time=Time();
make_dir_tmp();

my $mark;
my $sample_mark;

my $config=$opts{'config'};
my (@filein,@mark);
&read_config();
$sample_number=@mark;
$mark=join "\t",@mark;
$sample_mark=join "\#",@mark;


my $data3=$filein;   ### rfam not mapped reads
genome();

my $bed=$dir."cluster\/"."sample\.bed";
my $read=$dir."cluster\/"."sample_reads\.cluster";
my $read_txt=$dir."cluster\/"."cluster\.txt";
my $rpkm=$dir."cluster\/"."sample_rpkm\.cluster";
my $preprocess;
my $cluster_file;
my $annotate_dir;
my $deg_dir;
my $plot_dir;
my %id;
for (my $i=0;$i<@mark ;$i++) {
	$id{$mark[$i]}=$i+4;
}


my @map_read;
my $map_tag=0;

bwt2bed();

cluster();

quantify();

phase();

if (defined($options{'nat'})&&defined($options{'repeat'})) {
	class();
}
else{
	get_genelist();
}

annotate();

genome_length();

plot();

my @pairdir;
if (defined($options{'deg'})) {
	dec();
	infor_merge();
}
else{infor_merge_no_dec()}
html();
print "\ncluster program end:";
Time();
############################sub program###################################################
sub make_dir_tmp{

	#make temporary directory
	if(not -d "$workdir\/cluster_runs"){
		mkdir("$workdir\/cluster_runs");
		mkdir("$workdir\/cluster_runs\/ref\/");
	}

	$dir="$workdir\/cluster_runs\/";
	#print STDERR "mkdir $dir\n\n";
	return;
}

sub genome{
	if(defined $options{'idx'}){
		system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir -index $options{idx}") ;
	}else{
		system("perl $path\/matching.pl -i $data3 -g $genome_fa -v $mis -p $t -r $hit -o $dir ") ;
	}
	#=================== mapping sta ===================================================
	my $map_file=$dir."genome_match\/genome_mapped\.fa";
	open (MAP,"<$map_file")||die"$!";
	print "\n#each sample mapping reads sta:\n\n";
	print "#$mark\ttotal\n";
	while (my $ID=<MAP>) {
		chomp $ID;
		my @tmp=split/\:/,$ID;
		my @exp=split/\_/,$tmp[1];
		$exp[-1] =~ s/^x//;
		for (my $i=0;$i<@exp ;$i++) {
			$map_read[$i]+=$exp[$i];
		}
		$map_tag++;
		my $seq=<MAP>;
	}
	my $map_read=join"\t",@map_read;
	print "$map_read\n\n";
	print "#total mapped tags:$map_read\n\n";
	close MAP;
	return 0;
}

sub bwt2bed{
	$cluster_file=$dir."cluster\/";
	mkdir ("$cluster_file");
	print "sam file changed to bed file\n";
	my ($file) = $dir."genome_match\/genome_mapped\.bwt";

	my $sam2bed=`perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed `;
	print "perl $path\/sam2Bed_bowtie.pl -i $file -mark $sample_mark -o $bed\n\n";
	return 0;
}

sub cluster{
	print "tags is ready to merged clusters\n\n";
	my ($file) =$bed;
	if ($cluster_mothod eq "F") {
		my $cluster=`perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt`;
		print "Using converntional method\n perl $path\/conventional.pl -i $file -d $distance_of_merged_tag -n $sample_number -mark $sample_mark -o $read -t $read_txt\n\n";
	}
	elsif($cluster_mothod eq "T"){
		my $cluster=`perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $read_txt -k $sample_mark`;
		print "Using nibls method\n perl $path\/nibls.pl -f $file -m $distance_of_merged_tag -o $read -t $dir\/cluster.txt -k $sample_mark\n\n";
	}
	else{print "\-p is wrong!\n\n";}
	return 0;
}


sub quantify{
	print "clusters is ready to quantified\n\n";
	my @depth=@map_read;
	pop @depth;
	my $depth=join ",",@depth;
	my $quantify=`perl $path\/quantify.pl -i $read -d $depth -o $rpkm`;
	print "perl $path\/quantify_siRNA.pl -i $read -d $depth -o $rpkm\n\n\n";
	return 0;
}

sub phase{
	$annotate_dir=$dir."annotate\/";
	mkdir ("$annotate_dir");
	print "clusters is to predict phase siRNA\n";
	my $phase=`perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out`;
	print "perl $path\/phased_siRNA.pl -i $read_txt -o $annotate_dir\/phase.out\n\n\n";
	return 0;
}

sub class{
	print "clusters is ready to annotate by sources\n\n";
	my $nat=$options{'nat'};
	my $repeat=$options{'repeat'};
	my $class=`perl $path\/ClassAnnotate.pl -i $rpkm -g $gff  -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt`;
	print "perl $path\/ClassAnnotate.pl -i $rpkm -g $gff  -n $nat -r $repeat -p $annotate_dir\/phase.out -o $annotate_dir\/sample_class.anno -t $annotate_dir\/nat.out -l $dir\/ref\/genelist.txt\n\n";
}

sub annotate{
	print "clusters is ready to annotate by gff file\n\n";
	my $file;
	if (defined($options{'nat'})&&defined($options{'repeat'})) {
		$file="$annotate_dir\/sample_class.anno";
	}
	else{
		$file=$rpkm;
	}
	my $annotate=`perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno`;
	print "perl $path\/Annotate.pl -i $file -g $dir\/ref\/genelist.txt -d $up_down_dis -o $annotate_dir\/sample_c_p.anno\n\n";
	return 0;
}
sub get_genelist{

	my $get_genelist=`perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt`;
	print "perl $path\/get_genelist.pl -i $gff -o $dir\/ref\/genelist.txt";
}

sub dec{
	print "deg reading\n\n";
	my $deg_file=$options{'deg'};
	open IN,"<$deg_file";
	my @deg;
	my $s=0;
		while (my $aline=<IN>) {
		chomp $aline;
		next if($aline=~/^\#/);
		$deg[$s]=$aline;
		my @ea=split/\s+/,$aline;
		push @pairdir,"$ea[0]_VS_$ea[1]\/";
		#print "$deg[$s]\n";
		$s++;
	}
	close IN;
	$deg_dir=$dir."deg\/";
	mkdir ("$deg_dir");
	my $max_process = 10;
	my $pm = new Parallel::ForkManager( $max_process );
	my $number=@deg-1;
	foreach(0..$number){
		$pm->start and next;
		&dec_pel($deg[$_]);
		$pm->finish;
	}
	$pm->wait_all_children;
}

sub dec_pel{
	print "\n******************\nstart:\n";
	Time();
	my $sample=shift(@_);
	my @each=split/\s+/,$sample;
	print "$each[0]\t$each[1]\n";
	my $deg_sample_dir=$deg_dir."$each[0]_VS_$each[1]\/";
	mkdir ("$deg_sample_dir");
	print "read: $read\n";
	print "deg_sample_dir: $deg_sample_dir\n";
	print "$id{$each[0]}\t$each[0]\n";
	print "$id{$each[1]}\t$each[1]\n";
	my $deg=`perl $path\/DEGseq_2.pl -i $read -outdir $deg_sample_dir -column1 $id{$each[0]} -mark1 $each[0] -column2 $id{$each[1]} -mark2 $each[1]`; #-depth1 -depth2
	my $time2=time();
	print "end:\n*************************\n";
	Time();
	sleep 1;
}

sub infor_merge{
	my ($input,$mark);
	foreach (@pairdir) {
		print "@pairdir\n";
		$mark.=" -mark $_ ";
		$input.=" -i $dir/deg\/$_\/output_score\.txt ";
		print "$input\n$mark\n";
	}
	my $infor_merge=`perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result `;
	print "perl $path\/SampleDEGseqMerge.pl $input $mark -f $annotate_dir\/sample_c_p.anno -n $sample_number -o $dir\/total.result\n\n";
}

sub infor_merge_no_dec{
	my $infor_merge_no_dec=`cp $annotate_dir\/sample_c_p.anno $dir\/total.result`;
}

sub genome_length{
	my $length=`perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length`;
	print "perl $path\/count_ref_length.pl -i $genome_fa -o $dir\/ref\/genome\.length\n\n"

}

sub plot{
	$plot_dir="$dir\/plot\/";
	mkdir ("$plot_dir");
	my $span=defined($options{span})?$options{span}:50000;
	my $cen="";
	if (defined $options{cen}) {
		$cen="-cen $options{cen}";
	}
	my $plot=`perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome\.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt `;
	"print perl $path/sRNA_plot.pl -c $rpkm -g $dir/ref/genelist.txt -span 50000 -mark $sample_mark -l $dir/ref/genome.length $cen -o $plot_dir/cluster.html -out $plot_dir/cluster.txt \n";

}

sub html{
	my $pathfile="$dir/path.txt";
	open PA,">$pathfile";
	print PA "$config\n";
	print PA "$preprocess\n";
	print PA "$dir"."rfam_match\n";
	print PA "$dir"."genome_match\n";
	print PA "$cluster_file\n";
	print PA "$annotate_dir\n";
	print PA "$plot_dir\n";
	if (defined($deg_dir)) {
			print PA "$deg_dir\n";
	}
	close PA;
	my $html=`perl $path\/html.pl -i $pathfile -format $format -o $dir/result.html`;
}

sub Time{
	my $time=time();
	my ($sec,$min,$hour,$day,$month,$year) = (localtime($time))[0,1,2,3,4,5,6];
	$month++;
	$year+=1900;
	if (length($sec) == 1) {$sec = "0"."$sec";}
	if (length($min) == 1) {$min = "0"."$min";}
	if (length($hour) == 1) {$hour = "0"."$hour";}
	if (length($day) == 1) {$day = "0"."$day";}
	if (length($month) == 1) {$month = "0"."$month";}
	print "$year-$month-$day $hour:$min:$sec\n";
	return("$year-$month-$day-$hour-$min-$sec");
}
#################################################################################
sub read_config{
	open CON,"<$config";
	while (my $aline=<CON>) {
		chomp $aline;
		my @tmp=split/\t/,$aline;
		push @filein,$tmp[0];
		push @mark,$tmp[1];
		#&check_rawdata($tmp[0]);
	}
	close CON;
	if (@filein != @mark) {
		#&printErr();
		die "Maybe config file have some wrong!!!\n";
	}
}
author	big-tiandm
date	Mon, 08 Dec 2014 01:51:16 -0500
parents	7b5a48b972e9
children