view nibls.pl @ 18:22d79320085c draft

Uploaded
author big-tiandm
date Thu, 30 Oct 2014 21:31:55 -0400
parents 07745c0958dd
children e0884a4b996b
line wrap: on
line source

#!/usr/bin/perl
#####################################################################################################
#LocusPocus is a free script, it is provided with the hope that you will enjoy, you may freely redistribute it at will. We would be greatful if you would keep these acknowledgements with it. 
#
# Dan MacLean
# dan.maclean@sainsbury-laboratory.ac.uk
#
# This program is free academic software; academic and non-profit
# users may redistribute it freely.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
#
# This software is released under GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007
# see included file GPL3.txt
#
#


###Dont forget you will need ... 
#####################################################################################################
# Boost::Graph 
#Copyright 2005 by David Burdick
# Available from http://search.cpan.org/~dburdick/Boost-Graph-1.2/Graph.pm
#Boost::Graph is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
#####################################################################################################



use strict;
use warnings;
use Boost::Graph;
use Getopt::Long;


my $usage = "usage: $0 -f GFF_FILE [options]\n\n -m minimum inclusion distance (default 5)\n -c clustering coefficient (default 0.6) -b buffer between graphs (default 0) -k sample mark -o output file -t temp output file\n";

my $gff_file ;
my $min_inc = 5;
my $clus = 0.6;
my $buff = 0;
my $output_file;
my $temp;
my $mark;

GetOptions(

	'c=f'          => \$clus,
	'm=i' => \$min_inc,
	'f|file=s'     => \$gff_file,
	'b=i' => \$buff,
	'o=s' => \$output_file,
	't=s' => \$temp,
	'k=s' => \$mark
) ;


die $usage unless $gff_file;


my $starttime = time;
warn "started $starttime\n";

## load in data
my %molecules; # stores starts and ends of srnas
open GFF, "<$gff_file";

while (my $entry = <GFF>){

	chomp $entry;
	next if($entry=~/^\#/);
	my @data = split(/\t/,$entry);
	my $chr=shift @data;
	my $strand=shift @data;
	my $start=shift @data;
	my $end=shift @data;
#	my $length1=$end-$start+1;
#	if ($length1>30) {
#		$length1=40;
#	}
	my $total;
	foreach (0..$#data) {
		$total+=$_;
	}
	push @data,$total;
#	push @data,$length1;
#	if (defined $molecules{$chr}{$start}{$end}{$strand}) {
#		my @old_data=split(/;/,$molecules{$chr}{$start}{$end}{$strand});
#		for (my $i=0;$i<$#old_data ;$i++) {
#			$data[$i]+=$old_data[$i];
#		}
#	}
	my $data=join ";",@data;
	$molecules{$chr}{$start}{$end}{$strand} = $data;#chr#start#end#strand#add Tags information
	#print "$chr\t$start\t$end\n";
}

close GFF;

warn "Data loaded...\nBuilding graphs and finding loci\nPlease be patient, this can take a while...\n";

my @sample=split/\#/,$mark;
$mark=join"\"\t\"",@sample;
open OUT, ">$output_file";
print OUT "\"Chr\"\t\"MajorLength\"\t\"Percent\"\t\"$mark\"\n";
open CLUSTER,">$temp";
print CLUSTER "\#Chr\tMajorLength\tPercent\tTagsNumber\tTagsInfor\n";
foreach my $chromosome (keys %molecules){
	my $g = new Boost::Graph(directed=>0);
	my @starts = keys(%{$molecules{$chromosome}} );
	@starts = sort {$a <=> $b} @starts;  

	while (my $srna_start = shift @starts){   ## work from left most sRNA to right most, add to graph if they close enough


		foreach my $srna_end (keys %{$molecules{$chromosome}{$srna_start}}){


			###use new graph if the next srna is too far away from this one.. 
			if(defined $starts[0] and $srna_end + $min_inc + $buff < $starts[0]){


				##dump the info from the old graph
				if (scalar(@{$g->get_nodes()}) > 2){

					my $cluster_coeff = get_cc($g);
					if ($cluster_coeff >= $clus){
					 dump_locus($g, $cluster_coeff);
					}
				}


				$g = new Boost::Graph(directed=>0);

			}

			foreach my $e (keys %{$molecules{$chromosome}{$srna_start}}){   ### extra bit because all loci with same start and different end overlap by definition. but are not collected by main search below

				unless ($e eq $srna_end){
					my $sn = $chromosome. ':' . $srna_start . ':' . $srna_end;   ## turn coordinate of sRNA inro a node name
					my $en = $chromosome. ':' . $srna_start . ':' . $e;
					$g->add_edge(node1=>"$sn", node2=>"$en", weight=>'1');
				} 

			}

			foreach my $start (@starts){  ##build graph of overlaps
				my $new = 0;
				last if $start - $min_inc > $srna_end;
				if ($start - $min_inc <= $srna_end){

					my $start_node = $chromosome . ':' . $srna_start . ':' . $srna_end;
					foreach my $end (keys %{$molecules{$chromosome}{$start}}){

						my $end_node = $chromosome . ':' . $start . ':' . $end;
						$g->add_edge(node1=>"$start_node", node2=>"$end_node", weight=>'1');
					}

				}
			}
		}

		if (!(defined $starts[0])) {
			##dump the info from the last graph
			if (scalar(@{$g->get_nodes()}) > 2){

				my $cluster_coeff = get_cc($g);
				if ($cluster_coeff >= $clus){
				 dump_locus($g, $cluster_coeff);
				}
			}
		}
	}
}

warn "Loci printed\nFinished\n";

my $endtime = time;

my $elapsed = $endtime - $starttime;

warn "Time elapsed = $elapsed s\n";	
close OUT;
close CLUSTER;
#########################################################################################
sub get_cc{   ## do cluster coeff calculation. No useful method anyway so self implemented NB, this is an undirected graph so k is n(n-1)/2

	my $graph = shift;

	my @component = @{$graph->get_nodes()}; #number of nodes
	my @clustering_coefficients;

	foreach my $vertex (@component)
	{

		my @neighbours = @{$graph->neighbors($vertex)};

		my %edges_in_graph;

		my $n = @neighbours; #n = the number of neighbours
		my $k = ($n * ($n - 1))/2;	#k = total number of possible connections

		my $e= 0; #actual number of connections within sub-graph

		foreach my $neighbour (@neighbours)
		{
			foreach my $neighbour_2 (@neighbours)
			{
			my $edge1 = "$neighbour\t$neighbour_2";
			my $edge2 = "$neighbour_2\t$neighbour";
				unless (exists $edges_in_graph{$edge1} or exists $edges_in_graph{$edge2})
				{
					if ($graph->has_edge($neighbour, $neighbour_2) or $graph->has_edge($neighbour_2, $neighbour))
					{
						++$e;
						$edges_in_graph{$edge1}=1;
						$edges_in_graph{$edge2}=1;
					}
				}
			}
		}

		if ($k >= 1)
		{	
		my $c = $e / $k;
		push @clustering_coefficients, $c;
		}
		else {push @clustering_coefficients, '0';}
	}

	my $graph_n = scalar(@clustering_coefficients);
	my $graph_cc = 0;
	foreach my $cc (@clustering_coefficients){ 

		$graph_cc = $graph_cc + $cc;

	}
	$graph_cc = $graph_cc / $graph_n;

	return $graph_cc;
}

############################################################################################################

sub dump_locus{

	my $g = shift;
	my $cc = shift;
	my $chr;
	my $start = 1000000000000000000000000000000000000000000000; 
	my $end = -1;
	my @sample;
	my @tag;
	foreach my $node (@{$g->get_nodes()}){

		$node =~ m/^(\S+):(\d+):(\d+)$/;
		my $c=$1;
		my $s=$2;
		my $e=$3;
		my @data;
		foreach my $str (keys %{$molecules{$c}{$s}{$e}}) {
			push @tag,($s.",".$e.",".$str);
			@data=split(/;/,$molecules{$c}{$s}{$e}{$str});
#			for (my $i=0;$i<$#old_data ;$i++) {
#				$data[$i]+=$old_data[$i];
#			}
			my $length=$e-$s+1;
			if ($length>30) {
				$length=40;
			}
			push @data,$length;
			my $data=join ";",@data;#sample_exp/total_exp/length;
			push @sample,$data;
		}

		$chr = $c;
		$start = $s if $s < $start;
		$end = $e if $e > $end;
	}
	my $tag=join";",@tag;
	
	my ($max_length,$max_p,@cluster_exp)=Max_length(\@sample);
	if ($max_length==40) {
		$max_length="\>30";
	}
	my $cluster_exp=join"\t",@cluster_exp;
	my $gff = $chr."\:$start\-$end\t".$max_length."nt\t".$max_p."\t" . $cluster_exp;
	print CLUSTER "$chr\:$start\-$end\t$max_length"."nt\t$max_p\t$tag\n";
	print OUT $gff, "\n";  
}

sub Max_length{
	my @exp=@{$_[0]};
	my %sample_length;
	my $total_exp;
	my @each;
	for (my $i=0;$i<=$#exp ;$i++) {
		my @tag=split/;/,$exp[$i];
		my $length=pop(@tag);
		my $exp=pop(@tag);
		$sample_length{$length}+=$exp;
		$total_exp+=$exp;
		for (my $j=0;$j<=$#tag ;$j++) {
			$each[$j]+=$tag[$j];
		}
	}
	my $max=0;
	my $max_key;
	foreach my $key (sort keys %sample_length) {
		my $p=$sample_length{$key}/$total_exp;
		if ($p>$max) {
			$max=$p;
			$max_key=$key;
		}
		$sample_length{$key}=sprintf("%.2f",$p);
	}
	return($max_key,$sample_length{$max_key},@each);
}