Mercurial > repos > jjohnson > crest
comparison removeDupSV.pl @ 0:acc8d8bfeb9a
Uploaded
author | jjohnson |
---|---|
date | Wed, 08 Feb 2012 16:59:24 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc8d8bfeb9a |
---|---|
1 #!/nfs_exports/apps/64-bit/gnu-apps/perl5.8.9/bin/perl -w | |
2 | |
3 use strict; | |
4 use Carp; | |
5 use Getopt::Long; | |
6 use English; | |
7 use Pod::Usage; | |
8 use Data::Dumper; | |
9 use DBI; | |
10 use DBD::mysql; | |
11 | |
12 my ( $help, $man, $version, $usage ); | |
13 my ($in, $out); | |
14 my $min_inv = 10000; | |
15 my $min_total_sclip = 6; | |
16 my $add_inv_back; | |
17 my $width = 20; | |
18 my $optionOK = GetOptions( | |
19 'i|in|input=s' => \$in, | |
20 'o|out|output=s' => \$out, | |
21 'min_inv_size=i' => \$min_inv, | |
22 'min_total_sclip=i' => \$min_total_sclip, | |
23 'add_inv_back!' => \$add_inv_back, | |
24 'width=i' => \$width, | |
25 'h|help|?' => \$help, | |
26 'man' => \$man, | |
27 'usage' => \$usage, | |
28 'v|version' => \$version, | |
29 ); | |
30 | |
31 pod2usage(2) if($man); | |
32 pod2usage( -verbose => 99, -sections => "USAGE|REQUIRED ARGUMENTS|OPTIONS" ) | |
33 if($help or $usage); | |
34 pod2usage( -verbose => 99, -sections => "VERSION") if($version); | |
35 | |
36 croak "Missing input file" if(!$in); | |
37 open STDOUT, ">$out" if($out); | |
38 open my $IN, "<$in" or croak "can't open $in: $OS_ERROR"; | |
39 | |
40 my @SV; | |
41 my @discard_SV; | |
42 while(my $line = <$IN>) { | |
43 chomp $line; | |
44 my ($chrA, $posA, $ortA, $countA, $chrB, $posB, $ortB, $countB, $type) = split /\t/, $line; | |
45 next if( search_sv(\@SV, $chrA, $posA, $chrB, $posB, $type)); | |
46 if(($type eq 'ITX' || $type eq 'INV') && (abs($posB - $posA) < $min_inv && ($countA + $countB) < $min_total_sclip)) { | |
47 print STDERR "$line is removed due to possible circulization\n"; | |
48 push @discard_SV, $line; | |
49 next; | |
50 } | |
51 push @SV, [$chrA, $posA, $chrB, $posB, $type, $line]; | |
52 } | |
53 | |
54 close $IN; | |
55 | |
56 foreach my $sv (@SV) { | |
57 print STDOUT $sv->[5]; | |
58 # if($sv->[4] eq 'DEL') { | |
59 # my $tmp = get_gene_info($sv->[0], $sv->[1], $sv->[3]); | |
60 # print STDOUT "\t", $tmp if($tmp); | |
61 # } | |
62 # else { | |
63 # print STDOUT "\t", get_gene_info($sv->[0], $sv->[1], $sv->[1]); | |
64 # print STDOUT "\t", get_gene_info($sv->[2], $sv->[3], $sv->[3]); | |
65 # } | |
66 print STDOUT "\n"; | |
67 } | |
68 | |
69 if($add_inv_back) { | |
70 foreach my $sv (@discard_SV) { | |
71 my ($chrA, $posA, $ortA, $countA, $chrB, $posB, $ortB, $countB, $type) = split /\t/, $sv; | |
72 next if(search_nearby_sv(\@SV, $chrA, $posA, $posB)); | |
73 print STDOUT $sv; | |
74 print "\n"; | |
75 } | |
76 } | |
77 close STDOUT if($out); | |
78 exit(0); | |
79 | |
80 sub search_nearby_sv { | |
81 my($r_SV, $chrA, $posA, $posB) = @_; | |
82 foreach my $sv (@{$r_SV}) { | |
83 return 1 if($sv->[0] eq $chrA && abs($sv->[1] - $posA) < 10000); | |
84 return 1 if($sv->[2] eq $chrA && abs($sv->[3] - $posA) < 10000); | |
85 return 1 if($sv->[0] eq $chrA && abs($sv->[1] - $posB) < 10000); | |
86 return 1 if($sv->[2] eq $chrA && abs($sv->[3] - $posB) < 10000); | |
87 } | |
88 return; | |
89 } | |
90 | |
91 sub search_sv { | |
92 my ($r_SV, $chrA, $posA, $chrB, $posB, $type) = @_; | |
93 | |
94 for(my $i = 0; $i < scalar @{$r_SV}; $i++) { | |
95 my $sv = $r_SV->[$i]; | |
96 if($type eq "INV" && $r_SV->[$i][4] eq "ITX" ) { | |
97 if(( $sv->[0] eq $chrA && abs($sv->[1] - $posA) < 20 && | |
98 $sv->[2] eq $chrB && abs($sv->[3] - $posB) < 20 )|| | |
99 ( $sv->[0] eq $chrB && abs($sv->[1] - $posB) < 20 && | |
100 $sv->[2] eq $chrA && abs($sv->[3] - $posA) < 20)) { | |
101 delete $r_SV->[$i]; | |
102 return; | |
103 } | |
104 } | |
105 else { | |
106 return 1 if( $sv->[0] eq $chrA && abs($sv->[1] - $posA) < $width && | |
107 $sv->[2] eq $chrB && abs($sv->[3] - $posB) < $width && $sv->[4] eq $type); | |
108 return 1 if( $sv->[0] eq $chrB && abs($sv->[1] - $posB) < $width && | |
109 $sv->[2] eq $chrA && abs($sv->[3] - $posA) < $width && $sv->[4] eq $type); | |
110 return 1 if( $sv->[0] eq $chrA && abs($sv->[1] - $posA) < $width && | |
111 $sv->[2] eq $chrB && abs($sv->[3] - $posB) < $width && $sv->[4] eq $type && $type eq 'CTX'); | |
112 return 1 if( $sv->[0] eq $chrB && abs($sv->[1]- $posB) < $width && | |
113 $sv->[2] eq $chrA && abs($sv->[3] - $posA) < $width && $sv->[4] eq $type && $type eq 'CTX'); | |
114 } | |
115 } | |
116 return; | |
117 } | |
118 | |
119 =head1 NAME | |
120 | |
121 removeDupSV.pl - a program to remove duplicated SVs for CREST.pl | |
122 | |
123 | |
124 =head1 VERSION | |
125 | |
126 This documentation refers to removeDupSV.pl version 0.0.1. | |
127 | |
128 | |
129 =head1 USAGE | |
130 The program need an input file generated by CREST.pl | |
131 removeDupSV.pl -i tumor.predSV.txt -o tumor.predSV.txt.nodup | |
132 | |
133 =head1 REQUIRED ARGUMENTS | |
134 | |
135 To run the program, several parameter must specified. | |
136 -i The input file generated by CREST.pl | |
137 -o The output file | |
138 | |
139 =head1 OPTIONS | |
140 | |
141 The options that can be used for the program. | |
142 --min_inv_size For ITX/INV, the minimum event size, default 10000 | |
143 --min_total_sclip For ITX/INV with size smaller than min_inv_size, the min | |
144 number of soft-clipped reads needed to keep it, default 6 | |
145 --(no)add_inv_back Add ITX/INV back if there is another event nearby, | |
146 default OFF. | |
147 --width The max breakpoint distance such that it's considered as | |
148 same breakpoint, default 20 | |
149 -h, --help, -? Help information | |
150 --man Man page. | |
151 --usage Usage information. | |
152 --version Software version. | |
153 | |
154 | |
155 =head1 DESCRIPTION | |
156 | |
157 This is a simple program to remove duplicated SVs, also it remove small INV/ITX | |
158 events since most of those are due to library preparation. | |
159 | |
160 | |
161 =head1 BUGS AND LIMITATIONS | |
162 | |
163 There are no known bugs in this module, but the method is limitted to bam file | |
164 that has soft-clipping cigar string generated.Please report problems to | |
165 Jianmin Wang (Jianmin.Wang@stjude.org) | |
166 Patches are welcome. | |
167 | |
168 =head1 AUTHOR | |
169 | |
170 Jianmin Wang (Jianmin.Wang@stjude.org) | |
171 | |
172 | |
173 =head1 LICENCE AND COPYRIGHT | |
174 | |
175 Copyright (c) 2010 by St. Jude Children's Research Hospital. | |
176 | |
177 This program is free software: you can redistribute it and/or modify | |
178 it under the terms of the GNU General Public License as published by | |
179 the Free Software Foundation, either version 2 of the License, or | |
180 (at your option) any later version. | |
181 | |
182 This program is distributed in the hope that it will be useful, | |
183 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
184 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
185 GNU General Public License for more details. | |
186 | |
187 You should have received a copy of the GNU General Public License | |
188 along with this program. If not, see <http://www.gnu.org/licenses/>. |