view conventional.pl @ 23:cad6a1dfb174 draft

Uploaded
author big-tiandm
date Wed, 05 Nov 2014 21:11:49 -0500
parents 07745c0958dd
children
line wrap: on
line source

#!/usr/bin/perl -w
#Filename:
#Author: Chentt
#Email: chentt@big.ac.cn
#Date: 2014/04/09
#Modified:
#Description: islands merged of merged samples 
my $version=1.00;

use strict;
use Getopt::Long;

my %opts;
GetOptions(\%opts,"i=s","d=i","o=s","N=i","t=s","mark=s","h");
if (!(defined $opts{i} and defined $opts{d} and defined $opts{N} and defined $opts{mark} and defined $opts{t} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
&usage;
}

my $filein=$opts{'i'};
my $fileout=$opts{'o'};
my $distance=$opts{'d'};
my $tempout=$opts{'t'};
my $mark=$opts{'mark'};
my @sample=split/\#/,$mark;
$mark=join"\"\t\"",@sample;

open IN,"<$filein"; #input file  
open OUT,">$fileout"; #output file  
print OUT "\"Chr\"\t\"MajorLength\"\t\"Percent\"\t\"$mark\"\n";
open TMP,">$tempout";
print TMP "\#Chr\tMajorLength\tPercent\tTagsNumber\tTagsInfor\n";
my %hash;
while (my $aline=<IN>) {
	chomp $aline;
	if($aline=~/^\#/){
		#print OUT "$aline\n";
		next;
	}
	my @tmp=split/\t/,$aline;
	my $chr=shift @tmp;
	#shift @tmp;
	push @{$hash{$chr}},[@tmp];
}

close IN;

foreach my $key (keys %hash) {
	my @tag=sort{$a->[1] <=> $b->[1]} @{$hash{$key}};
	my @sample;
	my $start=$tag[0][1];
	my $end=$tag[0][2];
	push @sample,[@{$tag[0]}];
	for (my $i=1;$i<@tag-1;$i++) {
		if ($tag[$i][1]-$end<=$distance) {
			if ($tag[$i][2]>$end) {
				$end=$tag[$i][2];
			}
			push @sample,[@{$tag[$i]}];
		}
		else{
			my ($max_length,$max_p,$tag,@cluster_exp)=Max_length(\@sample);
			my $cluster_exp=join"\t",@cluster_exp;
			if ($max_length>30) {
				print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";
				$max_length="\>30";
			}
			else{print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";}
			print OUT "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$cluster_exp\n";			
			$start=$tag[$i][1];
			$end=$tag[$i][2];

			@sample=();
			push @sample,[@{$tag[$i]}];
		}
	}
	if ($tag[$#tag][1]-$end<=$distance) {
			if ($tag[$#tag][2]>$end) {
				$end=$tag[$#tag][2];
			}
			push @sample,[@{$tag[$#tag]}];
			my ($max_length,$max_p,$tag,@cluster_exp)=Max_length(\@sample);
			my $cluster_exp=join"\t",@cluster_exp;
			if ($max_length>30) {
				$max_length="\>30";
				print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";
			}
			else{print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";}
			print OUT "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$cluster_exp\n";
	}
	else{
		my ($max_length,$max_p,$tag,@cluster_exp)=Max_length(\@sample);
		my $cluster_exp=join"\t",@cluster_exp;
		if ($max_length>30) {
				$max_length="\>30";
				print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";
		}
		else{print TMP "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";}
		print OUT "$key\:$start\-$end\t$max_length"."nt\t$max_p\t$cluster_exp\n";

	}
}
close OUT;
close TMP;
sub Max_length{
	my @exp=@{$_[0]};
	my %sample_length;
	my $total_exp;
	my @each;
	my @tag;
	for (my $i=0;$i<=$#exp ;$i++) {
		my $length=$exp[$i][2]-$exp[$i][1]+1;
		#if ($length>30) {
		#	$length=40;
		#}
		my $exp=0;
		foreach  (1..$opts{'N'}) {
			$exp+=$exp[$i][$_+2];
			$each[$_-1]+=$exp[$i][$_+2];
		}	
		$sample_length{$length}+=$exp;
		$total_exp+=$exp;
		push @tag,($exp[$i][1].",".$exp[$i][2].",".$exp[$i][0].",".$exp);
	}
	my $max=0;
	my $max_key;
	foreach my $key (sort keys %sample_length) {
		my $p=$sample_length{$key}/$total_exp;
		if ($p>$max) {
			$max=$p;
			$max_key=$key;
		}
		$sample_length{$key}=sprintf("%.2f",$p);
	}
	my $tag_n=@tag;
	my $tag=join";",@tag;
	$tag=$tag_n."\t".$tag;
	return($max_key,$sample_length{$max_key},$tag,@each);
}

sub usage{
print <<"USAGE";
Version $version
Usage:
$0 -i -o -d -N -t -mark
options:
-i input file
-d distance of two islands
-mark sample name;
-o output file
-N sample number
-t temp output file
-h help
USAGE
exit(1);
}