Mercurial > repos > big-tiandm > sirna_plant
view nibls.pl @ 28:e93f33f03019 draft
Uploaded
author | big-tiandm |
---|---|
date | Thu, 13 Nov 2014 22:45:51 -0500 |
parents | 9dcffd531c76 |
children |
line wrap: on
line source
#!/usr/bin/perl ##################################################################################################### #LocusPocus is a free script, it is provided with the hope that you will enjoy, you may freely redistribute it at will. We would be greatful if you would keep these acknowledgements with it. # # Dan MacLean # dan.maclean@sainsbury-laboratory.ac.uk # # This program is free academic software; academic and non-profit # users may redistribute it freely. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # This software is released under GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 # see included file GPL3.txt # # ###Dont forget you will need ... ##################################################################################################### # Boost::Graph #Copyright 2005 by David Burdick # Available from http://search.cpan.org/~dburdick/Boost-Graph-1.2/Graph.pm #Boost::Graph is free software; you can redistribute it and/or modify it under the same terms as Perl itself. ##################################################################################################### use strict; use warnings; use Boost::Graph; use Getopt::Long; my $usage = "usage: $0 -f GFF_FILE [options]\n\n -m minimum inclusion distance (default 5)\n -c clustering coefficient (default 0.6) -b buffer between graphs (default 0) -k sample mark -o output file -t temp output file\n"; my $gff_file ; my $min_inc = 5; my $clus = 0.6; my $buff = 0; my $output_file; my $temp; my $mark; GetOptions( 'c=f' => \$clus, 'm=i' => \$min_inc, 'f|file=s' => \$gff_file, 'b=i' => \$buff, 'o=s' => \$output_file, 't=s' => \$temp, 'k=s' => \$mark ) ; die $usage unless $gff_file; my $starttime = time; warn "started $starttime\n"; ## load in data my %molecules; # stores starts and ends of srnas open GFF, "<$gff_file"; while (my $entry = <GFF>){ chomp $entry; next if($entry=~/^\#/); my @data = split(/\t/,$entry); my $chr=shift @data; my $strand=shift @data; my $start=shift @data; my $end=shift @data; # my $length1=$end-$start+1; # if ($length1>30) { # $length1=40; # } my $total; for (my $s=0;$s<@data ;$s++) { $total+=$data[$s]; } push @data,$total; # push @data,$length1; # if (defined $molecules{$chr}{$start}{$end}{$strand}) { # my @old_data=split(/;/,$molecules{$chr}{$start}{$end}{$strand}); # for (my $i=0;$i<$#old_data ;$i++) { # $data[$i]+=$old_data[$i]; # } # } my $data=join ";",@data; $molecules{$chr}{$start}{$end}{$strand} = $data;#chr#start#end#strand#add Tags information #print "$chr\t$start\t$end\n"; } close GFF; warn "Data loaded...\nBuilding graphs and finding loci\nPlease be patient, this can take a while...\n"; my @sample=split/\#/,$mark; $mark=join"\"\t\"",@sample; open OUT, ">$output_file"; print OUT "\"Chr\"\t\"MajorLength\"\t\"Percent\"\t\"$mark\"\n"; open CLUSTER,">$temp"; print CLUSTER "\#Chr\tMajorLength\tPercent\tTagsNumber\tTagsInfor\n"; foreach my $chromosome (keys %molecules){ my $g = new Boost::Graph(directed=>0); my @starts = keys(%{$molecules{$chromosome}} ); @starts = sort {$a <=> $b} @starts; while (my $srna_start = shift @starts){ ## work from left most sRNA to right most, add to graph if they close enough foreach my $srna_end (keys %{$molecules{$chromosome}{$srna_start}}){ ###use new graph if the next srna is too far away from this one.. if(defined $starts[0] and $srna_end + $min_inc + $buff < $starts[0]){ ##dump the info from the old graph if (scalar(@{$g->get_nodes()}) > 2){ my $cluster_coeff = get_cc($g); if ($cluster_coeff >= $clus){ dump_locus($g, $cluster_coeff); } } $g = new Boost::Graph(directed=>0); } foreach my $e (keys %{$molecules{$chromosome}{$srna_start}}){ ### extra bit because all loci with same start and different end overlap by definition. but are not collected by main search below unless ($e eq $srna_end){ my $sn = $chromosome. ':' . $srna_start . ':' . $srna_end; ## turn coordinate of sRNA inro a node name my $en = $chromosome. ':' . $srna_start . ':' . $e; $g->add_edge(node1=>"$sn", node2=>"$en", weight=>'1'); } } foreach my $start (@starts){ ##build graph of overlaps my $new = 0; last if $start - $min_inc > $srna_end; if ($start - $min_inc <= $srna_end){ my $start_node = $chromosome . ':' . $srna_start . ':' . $srna_end; foreach my $end (keys %{$molecules{$chromosome}{$start}}){ my $end_node = $chromosome . ':' . $start . ':' . $end; $g->add_edge(node1=>"$start_node", node2=>"$end_node", weight=>'1'); } } } } if (!(defined $starts[0])) { ##dump the info from the last graph if (scalar(@{$g->get_nodes()}) > 2){ my $cluster_coeff = get_cc($g); if ($cluster_coeff >= $clus){ dump_locus($g, $cluster_coeff); } } } } } warn "Loci printed\nFinished\n"; my $endtime = time; my $elapsed = $endtime - $starttime; warn "Time elapsed = $elapsed s\n"; close OUT; close CLUSTER; ######################################################################################### sub get_cc{ ## do cluster coeff calculation. No useful method anyway so self implemented NB, this is an undirected graph so k is n(n-1)/2 my $graph = shift; my @component = @{$graph->get_nodes()}; #number of nodes my @clustering_coefficients; foreach my $vertex (@component) { my @neighbours = @{$graph->neighbors($vertex)}; my %edges_in_graph; my $n = @neighbours; #n = the number of neighbours my $k = ($n * ($n - 1))/2; #k = total number of possible connections my $e= 0; #actual number of connections within sub-graph foreach my $neighbour (@neighbours) { foreach my $neighbour_2 (@neighbours) { my $edge1 = "$neighbour\t$neighbour_2"; my $edge2 = "$neighbour_2\t$neighbour"; unless (exists $edges_in_graph{$edge1} or exists $edges_in_graph{$edge2}) { if ($graph->has_edge($neighbour, $neighbour_2) or $graph->has_edge($neighbour_2, $neighbour)) { ++$e; $edges_in_graph{$edge1}=1; $edges_in_graph{$edge2}=1; } } } } if ($k >= 1) { my $c = $e / $k; push @clustering_coefficients, $c; } else {push @clustering_coefficients, '0';} } my $graph_n = scalar(@clustering_coefficients); my $graph_cc = 0; foreach my $cc (@clustering_coefficients){ $graph_cc = $graph_cc + $cc; } $graph_cc = $graph_cc / $graph_n; return $graph_cc; } ############################################################################################################ sub dump_locus{ my $g = shift; my $cc = shift; my $chr; my $start = 1000000000000000000000000000000000000000000000; my $end = -1; my @sample; my @tag; foreach my $node (@{$g->get_nodes()}){ $node =~ m/^(\S+):(\d+):(\d+)$/; my $c=$1; my $s=$2; my $e=$3; # my @data; foreach my $str (keys %{$molecules{$c}{$s}{$e}}) { my @data=split(/;/,$molecules{$c}{$s}{$e}{$str}); push @tag,($s.",".$e.",".$str.",".$data[-1]); # for (my $i=0;$i<$#old_data ;$i++) { # $data[$i]+=$old_data[$i]; # } my $length=$e-$s+1; if ($length>30) { $length=40; } push @data,$length; my $data=join ";",@data;#sample_exp/total_exp/length; push @sample,$data; } $chr = $c; $start = $s if $s < $start; $end = $e if $e > $end; } my $tag=join";",@tag; my $tag_number=@tag; my ($max_length,$max_p,@cluster_exp)=Max_length(\@sample); if ($max_length==40) { $max_length="\>30"; } my $cluster_exp=join"\t",@cluster_exp; my $gff = $chr."\:$start\-$end\t".$max_length."nt\t".$max_p."\t" . $cluster_exp; print CLUSTER "$chr\:$start\-$end\t$max_length"."nt\t$max_p\t$tag_number\t$tag\n"; print OUT $gff, "\n"; } sub Max_length{ my @exp=@{$_[0]}; my %sample_length; my $total_exp; my @each; for (my $i=0;$i<=$#exp ;$i++) { my @tag=split/;/,$exp[$i]; my $length=pop(@tag); my $exp=pop(@tag); $sample_length{$length}+=$exp; $total_exp+=$exp; for (my $j=0;$j<=$#tag ;$j++) { $each[$j]+=$tag[$j]; } } my $max=0; my $max_key; foreach my $key (sort keys %sample_length) { my $p=$sample_length{$key}/$total_exp; if ($p>$max) { $max=$p; $max_key=$key; } $sample_length{$key}=sprintf("%.2f",$p); } return($max_key,$sample_length{$max_key},@each); }