13
|
1 #!/usr/bin/perl
|
|
2
|
|
3 =head1 NAME
|
|
4 ReadCluster.pl
|
|
5
|
|
6 =head1 SYNOPSIS
|
|
7
|
|
8 USAGE: ReadCluster.pl --input input_sam_file --output output_prefix [--threshold 10000 --minClusterSize 4]
|
|
9
|
|
10 =head1 OPTIONS
|
|
11
|
|
12 B<--input,-i>
|
|
13 Input file
|
|
14
|
|
15 B<--output,-o>
|
|
16 output prefix
|
|
17
|
|
18 B<--window, -w>
|
|
19 Window size
|
|
20
|
|
21 B<--minClusterSize, -m>
|
|
22 Min size of cluster
|
|
23
|
|
24 B<--help,-h>
|
|
25 This help message
|
|
26
|
|
27 =head1 DESCRIPTION
|
|
28
|
|
29
|
|
30 =head1 INPUT
|
|
31
|
|
32
|
|
33 =head1 OUTPUT
|
|
34
|
|
35
|
|
36 =head1 CONTACT
|
|
37 Jaysheel D. Bhavsar @ bjaysheel[at]gmail[dot]com
|
|
38
|
|
39
|
|
40 ==head1 EXAMPLE
|
|
41 ReadCluster.pl --input=filename.sam --window=10000 --output=PREFIX
|
|
42
|
|
43 =cut
|
|
44
|
|
45 use strict;
|
|
46 use warnings;
|
|
47 use Data::Dumper;
|
|
48 use DBI;
|
|
49 use Pod::Usage;
|
|
50 use Scalar::Util qw(looks_like_number);
|
|
51 use Getopt::Long qw(:config no_ignore_case no_auto_abbrev pass_through);
|
|
52
|
|
53 my %options = ();
|
|
54 my $results = GetOptions (\%options,
|
|
55 'input|i=s',
|
|
56 'output|o=s',
|
|
57 'window|w=s',
|
|
58 'minClusterSize|m=s',
|
|
59 'help|h') || pod2usage();
|
|
60
|
|
61 ## display documentation
|
|
62 if( $options{'help'} ){
|
|
63 pod2usage( {-exitval => 0, -verbose => 2, -output => \*STDERR} );
|
|
64 }
|
|
65 #############################################################################
|
|
66 ## make sure everything passed was peachy
|
|
67 &check_parameters(\%options);
|
|
68
|
|
69 my $r1_start = 0;
|
|
70 my $r2_start = 0;
|
|
71 my $r1_end = $r1_start + $options{window};
|
|
72 my $r2_end = $r2_start + $options{window};
|
|
73 my $r1_chr = "";
|
|
74 my $r2_chr = "";
|
|
75
|
|
76 my @cluster = ();
|
|
77
|
|
78 open (FHD, "<", $options{input}) or die "Cound not open file $options{input}\n";
|
|
79 open (INTRA, ">", $options{output} . ".intra.sam") or die "Cound not open file $options{output}.intra.sam\n";
|
|
80 open (INTER, ">", $options{output} . ".inter.sam") or die "Cound not open file $options{output}.inter.sam\n";
|
|
81
|
|
82 while (<FHD>){
|
|
83 chomp $_;
|
|
84
|
|
85 #skip processing lines starting with @ just print to output file.
|
|
86 if ($_ =~ /^@/){
|
|
87 print INTRA $_."\n";
|
|
88 print INTER $_."\n";
|
|
89 next;
|
|
90 }
|
|
91 #print "$_\n";
|
|
92 check_sequence($_);
|
|
93 }
|
|
94
|
|
95 close(FHD);
|
|
96 close(INTRA);
|
|
97 close(INTER);
|
|
98
|
|
99 exit(0);
|
|
100
|
|
101 #############################################################################
|
|
102 sub check_parameters {
|
|
103 my $options = shift;
|
|
104
|
|
105 my @required = ("input", "output");
|
|
106
|
|
107 foreach my $key (@required) {
|
|
108 unless ($options{$key}) {
|
|
109 print STDERR "ARG: $key is required\n";
|
|
110 pod2usage({-exitval => 2, -message => "error message", -verbose => 1, -output => \*STDERR});
|
|
111 exit(-1);
|
|
112 }
|
|
113 }
|
|
114
|
|
115 unless($options{window}) { $options{window} = 10000; }
|
|
116 unless($options{minClusterSize}) { $options{minClusterSize} = 4; }
|
|
117 }
|
|
118
|
|
119 #############################################################################
|
|
120 sub check_sequence {
|
|
121 my $line = shift;
|
|
122
|
|
123 my @data = split(/\t/, $line);
|
|
124
|
|
125 ## check if mates are within the window.
|
|
126 if ((inWindow($data[3], 1)) && (inWindow($data[7], 2)) &&
|
|
127 ($r1_chr =~ /$data[2]/) && ($r2_chr =~ /$data[6]/)) {
|
|
128
|
|
129 ## if minClusterSize is reached output
|
|
130 if (scalar(@cluster) >= $options{minClusterSize}) {
|
|
131
|
|
132 ## if chr are the same then print intra-chr else inter-chr
|
|
133 if ($data[6] =~ /=/) {
|
|
134 print INTRA $line."\n";
|
|
135 } else {
|
|
136 print INTER $line."\n";
|
|
137 }
|
|
138 } else {
|
|
139 push @cluster, $line;
|
|
140 }
|
|
141 } else {
|
|
142
|
|
143 if (scalar(@cluster) >= $options{minClusterSize}) {
|
|
144 dumpCluster(@cluster);
|
|
145 }
|
|
146
|
|
147 @cluster = ();
|
|
148 $r1_start = $data[3];
|
|
149 $r2_start = $data[7];
|
|
150 $r1_end = $r1_start + $options{window};
|
|
151 $r2_end = $r2_start + $options{window};
|
|
152 $r1_chr = $data[2];
|
|
153 $r2_chr = $data[6];
|
|
154 }
|
|
155 }
|
|
156
|
|
157 #############################################################################
|
|
158 sub inWindow {
|
|
159 my $coord = shift;
|
|
160 my $read = shift;
|
|
161
|
|
162 my $start = 0;
|
|
163 my $end = 0;
|
|
164
|
|
165 if ($read == 1) {
|
|
166 $start = $r1_start;
|
|
167 $end = $r1_end;
|
|
168 } else {
|
|
169 $start = $r2_start;
|
|
170 $end = $r2_end;
|
|
171 }
|
|
172
|
|
173 if (($coord > $start) && ($coord < $end)){
|
|
174 return 1;
|
|
175 } else { return 0; }
|
|
176 }
|
|
177
|
|
178 #############################################################################
|
|
179 sub dumpCluster {
|
|
180 my @cluster = shift;
|
|
181
|
|
182 foreach (@cluster){
|
|
183 my @data = split(/\t/, $_);
|
|
184
|
|
185 if ($data[6] =~ /=/) {
|
|
186 print INTRA $_."\n";
|
|
187 } else {
|
|
188 print INTER $_."\n";
|
|
189 }
|
|
190 }
|
|
191 }
|