annotate removeDupSV.pl @ 0:acc8d8bfeb9a

Uploaded
author jjohnson
date Wed, 08 Feb 2012 16:59:24 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
1 #!/nfs_exports/apps/64-bit/gnu-apps/perl5.8.9/bin/perl -w
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
2
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
3 use strict;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
4 use Carp;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
5 use Getopt::Long;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
6 use English;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
7 use Pod::Usage;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
8 use Data::Dumper;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
9 use DBI;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
10 use DBD::mysql;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
11
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
12 my ( $help, $man, $version, $usage );
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
13 my ($in, $out);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
14 my $min_inv = 10000;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
15 my $min_total_sclip = 6;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
16 my $add_inv_back;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
17 my $width = 20;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
18 my $optionOK = GetOptions(
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
19 'i|in|input=s' => \$in,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
20 'o|out|output=s' => \$out,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
21 'min_inv_size=i' => \$min_inv,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
22 'min_total_sclip=i' => \$min_total_sclip,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
23 'add_inv_back!' => \$add_inv_back,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
24 'width=i' => \$width,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
25 'h|help|?' => \$help,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
26 'man' => \$man,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
27 'usage' => \$usage,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
28 'v|version' => \$version,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
29 );
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
30
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
31 pod2usage(2) if($man);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
32 pod2usage( -verbose => 99, -sections => "USAGE|REQUIRED ARGUMENTS|OPTIONS" )
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
33 if($help or $usage);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
34 pod2usage( -verbose => 99, -sections => "VERSION") if($version);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
35
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
36 croak "Missing input file" if(!$in);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
37 open STDOUT, ">$out" if($out);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
38 open my $IN, "<$in" or croak "can't open $in: $OS_ERROR";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
39
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
40 my @SV;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
41 my @discard_SV;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
42 while(my $line = <$IN>) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
43 chomp $line;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
44 my ($chrA, $posA, $ortA, $countA, $chrB, $posB, $ortB, $countB, $type) = split /\t/, $line;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
45 next if( search_sv(\@SV, $chrA, $posA, $chrB, $posB, $type));
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
46 if(($type eq 'ITX' || $type eq 'INV') && (abs($posB - $posA) < $min_inv && ($countA + $countB) < $min_total_sclip)) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
47 print STDERR "$line is removed due to possible circulization\n";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
48 push @discard_SV, $line;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
49 next;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
50 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
51 push @SV, [$chrA, $posA, $chrB, $posB, $type, $line];
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
52 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
53
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
54 close $IN;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
55
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
56 foreach my $sv (@SV) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
57 print STDOUT $sv->[5];
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
58 # if($sv->[4] eq 'DEL') {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
59 # my $tmp = get_gene_info($sv->[0], $sv->[1], $sv->[3]);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
60 # print STDOUT "\t", $tmp if($tmp);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
61 # }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
62 # else {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
63 # print STDOUT "\t", get_gene_info($sv->[0], $sv->[1], $sv->[1]);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
64 # print STDOUT "\t", get_gene_info($sv->[2], $sv->[3], $sv->[3]);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
65 # }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
66 print STDOUT "\n";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
67 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
68
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
69 if($add_inv_back) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
70 foreach my $sv (@discard_SV) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
71 my ($chrA, $posA, $ortA, $countA, $chrB, $posB, $ortB, $countB, $type) = split /\t/, $sv;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
72 next if(search_nearby_sv(\@SV, $chrA, $posA, $posB));
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
73 print STDOUT $sv;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
74 print "\n";
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
75 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
76 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
77 close STDOUT if($out);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
78 exit(0);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
79
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
80 sub search_nearby_sv {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
81 my($r_SV, $chrA, $posA, $posB) = @_;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
82 foreach my $sv (@{$r_SV}) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
83 return 1 if($sv->[0] eq $chrA && abs($sv->[1] - $posA) < 10000);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
84 return 1 if($sv->[2] eq $chrA && abs($sv->[3] - $posA) < 10000);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
85 return 1 if($sv->[0] eq $chrA && abs($sv->[1] - $posB) < 10000);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
86 return 1 if($sv->[2] eq $chrA && abs($sv->[3] - $posB) < 10000);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
87 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
88 return;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
89 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
90
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
91 sub search_sv {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
92 my ($r_SV, $chrA, $posA, $chrB, $posB, $type) = @_;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
93
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
94 for(my $i = 0; $i < scalar @{$r_SV}; $i++) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
95 my $sv = $r_SV->[$i];
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
96 if($type eq "INV" && $r_SV->[$i][4] eq "ITX" ) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
97 if(( $sv->[0] eq $chrA && abs($sv->[1] - $posA) < 20 &&
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
98 $sv->[2] eq $chrB && abs($sv->[3] - $posB) < 20 )||
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
99 ( $sv->[0] eq $chrB && abs($sv->[1] - $posB) < 20 &&
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
100 $sv->[2] eq $chrA && abs($sv->[3] - $posA) < 20)) {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
101 delete $r_SV->[$i];
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
102 return;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
103 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
104 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
105 else {
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
106 return 1 if( $sv->[0] eq $chrA && abs($sv->[1] - $posA) < $width &&
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
107 $sv->[2] eq $chrB && abs($sv->[3] - $posB) < $width && $sv->[4] eq $type);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
108 return 1 if( $sv->[0] eq $chrB && abs($sv->[1] - $posB) < $width &&
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
109 $sv->[2] eq $chrA && abs($sv->[3] - $posA) < $width && $sv->[4] eq $type);
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
110 return 1 if( $sv->[0] eq $chrA && abs($sv->[1] - $posA) < $width &&
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
111 $sv->[2] eq $chrB && abs($sv->[3] - $posB) < $width && $sv->[4] eq $type && $type eq 'CTX');
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
112 return 1 if( $sv->[0] eq $chrB && abs($sv->[1]- $posB) < $width &&
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
113 $sv->[2] eq $chrA && abs($sv->[3] - $posA) < $width && $sv->[4] eq $type && $type eq 'CTX');
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
114 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
115 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
116 return;
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
117 }
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
118
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
119 =head1 NAME
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
120
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
121 removeDupSV.pl - a program to remove duplicated SVs for CREST.pl
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
122
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
123
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
124 =head1 VERSION
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
125
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
126 This documentation refers to removeDupSV.pl version 0.0.1.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
127
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
128
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
129 =head1 USAGE
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
130 The program need an input file generated by CREST.pl
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
131 removeDupSV.pl -i tumor.predSV.txt -o tumor.predSV.txt.nodup
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
132
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
133 =head1 REQUIRED ARGUMENTS
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
134
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
135 To run the program, several parameter must specified.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
136 -i The input file generated by CREST.pl
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
137 -o The output file
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
138
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
139 =head1 OPTIONS
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
140
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
141 The options that can be used for the program.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
142 --min_inv_size For ITX/INV, the minimum event size, default 10000
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
143 --min_total_sclip For ITX/INV with size smaller than min_inv_size, the min
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
144 number of soft-clipped reads needed to keep it, default 6
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
145 --(no)add_inv_back Add ITX/INV back if there is another event nearby,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
146 default OFF.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
147 --width The max breakpoint distance such that it's considered as
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
148 same breakpoint, default 20
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
149 -h, --help, -? Help information
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
150 --man Man page.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
151 --usage Usage information.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
152 --version Software version.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
153
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
154
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
155 =head1 DESCRIPTION
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
156
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
157 This is a simple program to remove duplicated SVs, also it remove small INV/ITX
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
158 events since most of those are due to library preparation.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
159
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
160
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
161 =head1 BUGS AND LIMITATIONS
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
162
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
163 There are no known bugs in this module, but the method is limitted to bam file
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
164 that has soft-clipping cigar string generated.Please report problems to
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
165 Jianmin Wang (Jianmin.Wang@stjude.org)
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
166 Patches are welcome.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
167
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
168 =head1 AUTHOR
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
169
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
170 Jianmin Wang (Jianmin.Wang@stjude.org)
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
171
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
172
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
173 =head1 LICENCE AND COPYRIGHT
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
174
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
175 Copyright (c) 2010 by St. Jude Children's Research Hospital.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
176
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
177 This program is free software: you can redistribute it and/or modify
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
178 it under the terms of the GNU General Public License as published by
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
179 the Free Software Foundation, either version 2 of the License, or
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
180 (at your option) any later version.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
181
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
182 This program is distributed in the hope that it will be useful,
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
183 but WITHOUT ANY WARRANTY; without even the implied warranty of
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
184 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
185 GNU General Public License for more details.
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
186
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
187 You should have received a copy of the GNU General Public License
acc8d8bfeb9a Uploaded
jjohnson
parents:
diff changeset
188 along with this program. If not, see <http://www.gnu.org/licenses/>.