annotate 2.4/src/ReadCluster.pl @ 16:8eb7d93f7e58 draft

Uploaded
author plus91-technologies-pvt-ltd
date Sat, 31 May 2014 11:23:36 -0400
parents e3609c8714fb
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
13
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
1 #!/usr/bin/perl
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
2
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
3 =head1 NAME
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
4 ReadCluster.pl
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
5
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
6 =head1 SYNOPSIS
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
7
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
8 USAGE: ReadCluster.pl --input input_sam_file --output output_prefix [--threshold 10000 --minClusterSize 4]
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
9
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
10 =head1 OPTIONS
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
11
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
12 B<--input,-i>
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
13 Input file
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
14
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
15 B<--output,-o>
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
16 output prefix
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
17
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
18 B<--window, -w>
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
19 Window size
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
20
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
21 B<--minClusterSize, -m>
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
22 Min size of cluster
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
23
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
24 B<--help,-h>
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
25 This help message
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
26
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
27 =head1 DESCRIPTION
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
28
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
29
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
30 =head1 INPUT
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
31
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
32
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
33 =head1 OUTPUT
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
34
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
35
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
36 =head1 CONTACT
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
37 Jaysheel D. Bhavsar @ bjaysheel[at]gmail[dot]com
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
38
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
39
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
40 ==head1 EXAMPLE
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
41 ReadCluster.pl --input=filename.sam --window=10000 --output=PREFIX
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
42
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
43 =cut
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
44
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
45 use strict;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
46 use warnings;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
47 use Data::Dumper;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
48 use DBI;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
49 use Pod::Usage;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
50 use Scalar::Util qw(looks_like_number);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
51 use Getopt::Long qw(:config no_ignore_case no_auto_abbrev pass_through);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
52
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
53 my %options = ();
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
54 my $results = GetOptions (\%options,
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
55 'input|i=s',
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
56 'output|o=s',
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
57 'window|w=s',
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
58 'minClusterSize|m=s',
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
59 'help|h') || pod2usage();
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
60
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
61 ## display documentation
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
62 if( $options{'help'} ){
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
63 pod2usage( {-exitval => 0, -verbose => 2, -output => \*STDERR} );
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
64 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
65 #############################################################################
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
66 ## make sure everything passed was peachy
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
67 &check_parameters(\%options);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
68
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
69 my $r1_start = 0;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
70 my $r2_start = 0;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
71 my $r1_end = $r1_start + $options{window};
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
72 my $r2_end = $r2_start + $options{window};
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
73 my $r1_chr = "";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
74 my $r2_chr = "";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
75
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
76 my @cluster = ();
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
77
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
78 open (FHD, "<", $options{input}) or die "Cound not open file $options{input}\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
79 open (INTRA, ">", $options{output} . ".intra.sam") or die "Cound not open file $options{output}.intra.sam\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
80 open (INTER, ">", $options{output} . ".inter.sam") or die "Cound not open file $options{output}.inter.sam\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
81
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
82 while (<FHD>){
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
83 chomp $_;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
84
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
85 #skip processing lines starting with @ just print to output file.
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
86 if ($_ =~ /^@/){
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
87 print INTRA $_."\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
88 print INTER $_."\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
89 next;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
90 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
91 #print "$_\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
92 check_sequence($_);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
93 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
94
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
95 close(FHD);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
96 close(INTRA);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
97 close(INTER);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
98
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
99 exit(0);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
100
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
101 #############################################################################
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
102 sub check_parameters {
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
103 my $options = shift;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
104
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
105 my @required = ("input", "output");
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
106
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
107 foreach my $key (@required) {
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
108 unless ($options{$key}) {
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
109 print STDERR "ARG: $key is required\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
110 pod2usage({-exitval => 2, -message => "error message", -verbose => 1, -output => \*STDERR});
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
111 exit(-1);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
112 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
113 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
114
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
115 unless($options{window}) { $options{window} = 10000; }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
116 unless($options{minClusterSize}) { $options{minClusterSize} = 4; }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
117 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
118
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
119 #############################################################################
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
120 sub check_sequence {
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
121 my $line = shift;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
122
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
123 my @data = split(/\t/, $line);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
124
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
125 ## check if mates are within the window.
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
126 if ((inWindow($data[3], 1)) && (inWindow($data[7], 2)) &&
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
127 ($r1_chr =~ /$data[2]/) && ($r2_chr =~ /$data[6]/)) {
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
128
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
129 ## if minClusterSize is reached output
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
130 if (scalar(@cluster) >= $options{minClusterSize}) {
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
131
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
132 ## if chr are the same then print intra-chr else inter-chr
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
133 if ($data[6] =~ /=/) {
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
134 print INTRA $line."\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
135 } else {
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
136 print INTER $line."\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
137 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
138 } else {
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
139 push @cluster, $line;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
140 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
141 } else {
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
142
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
143 if (scalar(@cluster) >= $options{minClusterSize}) {
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
144 dumpCluster(@cluster);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
145 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
146
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
147 @cluster = ();
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
148 $r1_start = $data[3];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
149 $r2_start = $data[7];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
150 $r1_end = $r1_start + $options{window};
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
151 $r2_end = $r2_start + $options{window};
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
152 $r1_chr = $data[2];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
153 $r2_chr = $data[6];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
154 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
155 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
156
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
157 #############################################################################
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
158 sub inWindow {
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
159 my $coord = shift;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
160 my $read = shift;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
161
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
162 my $start = 0;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
163 my $end = 0;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
164
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
165 if ($read == 1) {
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
166 $start = $r1_start;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
167 $end = $r1_end;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
168 } else {
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
169 $start = $r2_start;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
170 $end = $r2_end;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
171 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
172
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
173 if (($coord > $start) && ($coord < $end)){
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
174 return 1;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
175 } else { return 0; }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
176 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
177
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
178 #############################################################################
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
179 sub dumpCluster {
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
180 my @cluster = shift;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
181
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
182 foreach (@cluster){
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
183 my @data = split(/\t/, $_);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
184
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
185 if ($data[6] =~ /=/) {
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
186 print INTRA $_."\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
187 } else {
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
188 print INTER $_."\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
189 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
190 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
191 }