0
|
1 #!/usr/bin/perl
|
|
2 #####################################################################################################
|
|
3 #LocusPocus is a free script, it is provided with the hope that you will enjoy, you may freely redistribute it at will. We would be greatful if you would keep these acknowledgements with it.
|
|
4 #
|
|
5 # Dan MacLean
|
|
6 # dan.maclean@sainsbury-laboratory.ac.uk
|
|
7 #
|
|
8 # This program is free academic software; academic and non-profit
|
|
9 # users may redistribute it freely.
|
|
10 #
|
|
11 # This program is distributed in the hope that it will be useful,
|
|
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
14 #
|
|
15 # This software is released under GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007
|
|
16 # see included file GPL3.txt
|
|
17 #
|
|
18 #
|
|
19
|
|
20
|
|
21 ###Dont forget you will need ...
|
|
22 #####################################################################################################
|
|
23 # Boost::Graph
|
|
24 #Copyright 2005 by David Burdick
|
|
25 # Available from http://search.cpan.org/~dburdick/Boost-Graph-1.2/Graph.pm
|
|
26 #Boost::Graph is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
|
|
27 #####################################################################################################
|
|
28
|
|
29
|
|
30
|
|
31 use strict;
|
|
32 use warnings;
|
|
33 use Boost::Graph;
|
|
34 use Getopt::Long;
|
|
35
|
|
36
|
|
37 my $usage = "usage: $0 -f GFF_FILE [options]\n\n -m minimum inclusion distance (default 5)\n -c clustering coefficient (default 0.6) -b buffer between graphs (default 0) -k sample mark -o output file -t temp output file\n";
|
|
38
|
|
39 my $gff_file ;
|
|
40 my $min_inc = 5;
|
|
41 my $clus = 0.6;
|
|
42 my $buff = 0;
|
|
43 my $output_file;
|
|
44 my $temp;
|
|
45 my $mark;
|
|
46
|
|
47 GetOptions(
|
|
48
|
|
49 'c=f' => \$clus,
|
|
50 'm=i' => \$min_inc,
|
|
51 'f|file=s' => \$gff_file,
|
|
52 'b=i' => \$buff,
|
|
53 'o=s' => \$output_file,
|
|
54 't=s' => \$temp,
|
|
55 'k=s' => \$mark
|
|
56 ) ;
|
|
57
|
|
58
|
|
59 die $usage unless $gff_file;
|
|
60
|
|
61
|
|
62 my $starttime = time;
|
|
63 warn "started $starttime\n";
|
|
64
|
|
65 ## load in data
|
|
66 my %molecules; # stores starts and ends of srnas
|
|
67 open GFF, "<$gff_file";
|
|
68
|
|
69 while (my $entry = <GFF>){
|
|
70
|
|
71 chomp $entry;
|
|
72 next if($entry=~/^\#/);
|
|
73 my @data = split(/\t/,$entry);
|
|
74 my $chr=shift @data;
|
|
75 my $strand=shift @data;
|
|
76 my $start=shift @data;
|
|
77 my $end=shift @data;
|
|
78 # my $length1=$end-$start+1;
|
|
79 # if ($length1>30) {
|
|
80 # $length1=40;
|
|
81 # }
|
|
82 my $total;
|
19
|
83 for (my $s=0;$s<@data ;$s++) {
|
|
84 $total+=$data[$s];
|
0
|
85 }
|
|
86 push @data,$total;
|
|
87 # push @data,$length1;
|
|
88 # if (defined $molecules{$chr}{$start}{$end}{$strand}) {
|
|
89 # my @old_data=split(/;/,$molecules{$chr}{$start}{$end}{$strand});
|
|
90 # for (my $i=0;$i<$#old_data ;$i++) {
|
|
91 # $data[$i]+=$old_data[$i];
|
|
92 # }
|
|
93 # }
|
|
94 my $data=join ";",@data;
|
|
95 $molecules{$chr}{$start}{$end}{$strand} = $data;#chr#start#end#strand#add Tags information
|
|
96 #print "$chr\t$start\t$end\n";
|
|
97 }
|
|
98
|
|
99 close GFF;
|
|
100
|
|
101 warn "Data loaded...\nBuilding graphs and finding loci\nPlease be patient, this can take a while...\n";
|
|
102
|
|
103 my @sample=split/\#/,$mark;
|
|
104 $mark=join"\"\t\"",@sample;
|
|
105 open OUT, ">$output_file";
|
|
106 print OUT "\"Chr\"\t\"MajorLength\"\t\"Percent\"\t\"$mark\"\n";
|
|
107 open CLUSTER,">$temp";
|
|
108 print CLUSTER "\#Chr\tMajorLength\tPercent\tTagsNumber\tTagsInfor\n";
|
|
109 foreach my $chromosome (keys %molecules){
|
|
110 my $g = new Boost::Graph(directed=>0);
|
|
111 my @starts = keys(%{$molecules{$chromosome}} );
|
|
112 @starts = sort {$a <=> $b} @starts;
|
|
113
|
|
114 while (my $srna_start = shift @starts){ ## work from left most sRNA to right most, add to graph if they close enough
|
|
115
|
|
116
|
|
117 foreach my $srna_end (keys %{$molecules{$chromosome}{$srna_start}}){
|
|
118
|
|
119
|
|
120 ###use new graph if the next srna is too far away from this one..
|
|
121 if(defined $starts[0] and $srna_end + $min_inc + $buff < $starts[0]){
|
|
122
|
|
123
|
|
124 ##dump the info from the old graph
|
|
125 if (scalar(@{$g->get_nodes()}) > 2){
|
|
126
|
|
127 my $cluster_coeff = get_cc($g);
|
|
128 if ($cluster_coeff >= $clus){
|
|
129 dump_locus($g, $cluster_coeff);
|
|
130 }
|
|
131 }
|
|
132
|
|
133
|
|
134 $g = new Boost::Graph(directed=>0);
|
|
135
|
|
136 }
|
|
137
|
|
138 foreach my $e (keys %{$molecules{$chromosome}{$srna_start}}){ ### extra bit because all loci with same start and different end overlap by definition. but are not collected by main search below
|
|
139
|
|
140 unless ($e eq $srna_end){
|
|
141 my $sn = $chromosome. ':' . $srna_start . ':' . $srna_end; ## turn coordinate of sRNA inro a node name
|
|
142 my $en = $chromosome. ':' . $srna_start . ':' . $e;
|
|
143 $g->add_edge(node1=>"$sn", node2=>"$en", weight=>'1');
|
|
144 }
|
|
145
|
|
146 }
|
|
147
|
|
148 foreach my $start (@starts){ ##build graph of overlaps
|
|
149 my $new = 0;
|
|
150 last if $start - $min_inc > $srna_end;
|
|
151 if ($start - $min_inc <= $srna_end){
|
|
152
|
|
153 my $start_node = $chromosome . ':' . $srna_start . ':' . $srna_end;
|
|
154 foreach my $end (keys %{$molecules{$chromosome}{$start}}){
|
|
155
|
|
156 my $end_node = $chromosome . ':' . $start . ':' . $end;
|
|
157 $g->add_edge(node1=>"$start_node", node2=>"$end_node", weight=>'1');
|
|
158 }
|
|
159
|
|
160 }
|
|
161 }
|
|
162 }
|
|
163
|
|
164 if (!(defined $starts[0])) {
|
|
165 ##dump the info from the last graph
|
|
166 if (scalar(@{$g->get_nodes()}) > 2){
|
|
167
|
|
168 my $cluster_coeff = get_cc($g);
|
|
169 if ($cluster_coeff >= $clus){
|
|
170 dump_locus($g, $cluster_coeff);
|
|
171 }
|
|
172 }
|
|
173 }
|
|
174 }
|
|
175 }
|
|
176
|
|
177 warn "Loci printed\nFinished\n";
|
|
178
|
|
179 my $endtime = time;
|
|
180
|
|
181 my $elapsed = $endtime - $starttime;
|
|
182
|
|
183 warn "Time elapsed = $elapsed s\n";
|
|
184 close OUT;
|
|
185 close CLUSTER;
|
|
186 #########################################################################################
|
|
187 sub get_cc{ ## do cluster coeff calculation. No useful method anyway so self implemented NB, this is an undirected graph so k is n(n-1)/2
|
|
188
|
|
189 my $graph = shift;
|
|
190
|
|
191 my @component = @{$graph->get_nodes()}; #number of nodes
|
|
192 my @clustering_coefficients;
|
|
193
|
|
194 foreach my $vertex (@component)
|
|
195 {
|
|
196
|
|
197 my @neighbours = @{$graph->neighbors($vertex)};
|
|
198
|
|
199 my %edges_in_graph;
|
|
200
|
|
201 my $n = @neighbours; #n = the number of neighbours
|
|
202 my $k = ($n * ($n - 1))/2; #k = total number of possible connections
|
|
203
|
|
204 my $e= 0; #actual number of connections within sub-graph
|
|
205
|
|
206 foreach my $neighbour (@neighbours)
|
|
207 {
|
|
208 foreach my $neighbour_2 (@neighbours)
|
|
209 {
|
|
210 my $edge1 = "$neighbour\t$neighbour_2";
|
|
211 my $edge2 = "$neighbour_2\t$neighbour";
|
|
212 unless (exists $edges_in_graph{$edge1} or exists $edges_in_graph{$edge2})
|
|
213 {
|
|
214 if ($graph->has_edge($neighbour, $neighbour_2) or $graph->has_edge($neighbour_2, $neighbour))
|
|
215 {
|
|
216 ++$e;
|
|
217 $edges_in_graph{$edge1}=1;
|
|
218 $edges_in_graph{$edge2}=1;
|
|
219 }
|
|
220 }
|
|
221 }
|
|
222 }
|
|
223
|
|
224 if ($k >= 1)
|
|
225 {
|
|
226 my $c = $e / $k;
|
|
227 push @clustering_coefficients, $c;
|
|
228 }
|
|
229 else {push @clustering_coefficients, '0';}
|
|
230 }
|
|
231
|
|
232 my $graph_n = scalar(@clustering_coefficients);
|
|
233 my $graph_cc = 0;
|
|
234 foreach my $cc (@clustering_coefficients){
|
|
235
|
|
236 $graph_cc = $graph_cc + $cc;
|
|
237
|
|
238 }
|
|
239 $graph_cc = $graph_cc / $graph_n;
|
|
240
|
|
241 return $graph_cc;
|
|
242 }
|
|
243
|
|
244 ############################################################################################################
|
|
245
|
|
246 sub dump_locus{
|
|
247
|
|
248 my $g = shift;
|
|
249 my $cc = shift;
|
|
250 my $chr;
|
|
251 my $start = 1000000000000000000000000000000000000000000000;
|
|
252 my $end = -1;
|
|
253 my @sample;
|
|
254 my @tag;
|
|
255 foreach my $node (@{$g->get_nodes()}){
|
|
256
|
|
257 $node =~ m/^(\S+):(\d+):(\d+)$/;
|
|
258 my $c=$1;
|
|
259 my $s=$2;
|
|
260 my $e=$3;
|
19
|
261 # my @data;
|
0
|
262 foreach my $str (keys %{$molecules{$c}{$s}{$e}}) {
|
19
|
263 my @data=split(/;/,$molecules{$c}{$s}{$e}{$str});
|
|
264 push @tag,($s.",".$e.",".$str.",".$data[-1]);
|
0
|
265 # for (my $i=0;$i<$#old_data ;$i++) {
|
|
266 # $data[$i]+=$old_data[$i];
|
|
267 # }
|
|
268 my $length=$e-$s+1;
|
|
269 if ($length>30) {
|
|
270 $length=40;
|
|
271 }
|
|
272 push @data,$length;
|
|
273 my $data=join ";",@data;#sample_exp/total_exp/length;
|
|
274 push @sample,$data;
|
|
275 }
|
|
276
|
|
277 $chr = $c;
|
|
278 $start = $s if $s < $start;
|
|
279 $end = $e if $e > $end;
|
|
280 }
|
|
281 my $tag=join";",@tag;
|
19
|
282 my $tag_number=@tag;
|
0
|
283 my ($max_length,$max_p,@cluster_exp)=Max_length(\@sample);
|
|
284 if ($max_length==40) {
|
|
285 $max_length="\>30";
|
|
286 }
|
|
287 my $cluster_exp=join"\t",@cluster_exp;
|
|
288 my $gff = $chr."\:$start\-$end\t".$max_length."nt\t".$max_p."\t" . $cluster_exp;
|
19
|
289 print CLUSTER "$chr\:$start\-$end\t$max_length"."nt\t$max_p\t$tag_number\t$tag\n";
|
0
|
290 print OUT $gff, "\n";
|
|
291 }
|
|
292
|
|
293 sub Max_length{
|
|
294 my @exp=@{$_[0]};
|
|
295 my %sample_length;
|
|
296 my $total_exp;
|
|
297 my @each;
|
|
298 for (my $i=0;$i<=$#exp ;$i++) {
|
|
299 my @tag=split/;/,$exp[$i];
|
|
300 my $length=pop(@tag);
|
|
301 my $exp=pop(@tag);
|
|
302 $sample_length{$length}+=$exp;
|
|
303 $total_exp+=$exp;
|
|
304 for (my $j=0;$j<=$#tag ;$j++) {
|
|
305 $each[$j]+=$tag[$j];
|
|
306 }
|
|
307 }
|
|
308 my $max=0;
|
|
309 my $max_key;
|
|
310 foreach my $key (sort keys %sample_length) {
|
|
311 my $p=$sample_length{$key}/$total_exp;
|
|
312 if ($p>$max) {
|
|
313 $max=$p;
|
|
314 $max_key=$key;
|
|
315 }
|
|
316 $sample_length{$key}=sprintf("%.2f",$p);
|
|
317 }
|
|
318 return($max_key,$sample_length{$max_key},@each);
|
|
319 }
|