Mercurial > repos > jjohnson > crest
diff removeDupSV.pl @ 0:acc8d8bfeb9a
Uploaded
author | jjohnson |
---|---|
date | Wed, 08 Feb 2012 16:59:24 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/removeDupSV.pl Wed Feb 08 16:59:24 2012 -0500 @@ -0,0 +1,188 @@ +#!/nfs_exports/apps/64-bit/gnu-apps/perl5.8.9/bin/perl -w + +use strict; +use Carp; +use Getopt::Long; +use English; +use Pod::Usage; +use Data::Dumper; +use DBI; +use DBD::mysql; + +my ( $help, $man, $version, $usage ); +my ($in, $out); +my $min_inv = 10000; +my $min_total_sclip = 6; +my $add_inv_back; +my $width = 20; +my $optionOK = GetOptions( + 'i|in|input=s' => \$in, + 'o|out|output=s' => \$out, + 'min_inv_size=i' => \$min_inv, + 'min_total_sclip=i' => \$min_total_sclip, + 'add_inv_back!' => \$add_inv_back, + 'width=i' => \$width, + 'h|help|?' => \$help, + 'man' => \$man, + 'usage' => \$usage, + 'v|version' => \$version, +); + +pod2usage(2) if($man); +pod2usage( -verbose => 99, -sections => "USAGE|REQUIRED ARGUMENTS|OPTIONS" ) + if($help or $usage); +pod2usage( -verbose => 99, -sections => "VERSION") if($version); + +croak "Missing input file" if(!$in); +open STDOUT, ">$out" if($out); +open my $IN, "<$in" or croak "can't open $in: $OS_ERROR"; + +my @SV; +my @discard_SV; +while(my $line = <$IN>) { + chomp $line; + my ($chrA, $posA, $ortA, $countA, $chrB, $posB, $ortB, $countB, $type) = split /\t/, $line; + next if( search_sv(\@SV, $chrA, $posA, $chrB, $posB, $type)); + if(($type eq 'ITX' || $type eq 'INV') && (abs($posB - $posA) < $min_inv && ($countA + $countB) < $min_total_sclip)) { + print STDERR "$line is removed due to possible circulization\n"; + push @discard_SV, $line; + next; + } + push @SV, [$chrA, $posA, $chrB, $posB, $type, $line]; +} + +close $IN; + +foreach my $sv (@SV) { + print STDOUT $sv->[5]; +# if($sv->[4] eq 'DEL') { +# my $tmp = get_gene_info($sv->[0], $sv->[1], $sv->[3]); +# print STDOUT "\t", $tmp if($tmp); +# } +# else { +# print STDOUT "\t", get_gene_info($sv->[0], $sv->[1], $sv->[1]); +# print STDOUT "\t", get_gene_info($sv->[2], $sv->[3], $sv->[3]); +# } + print STDOUT "\n"; +} + +if($add_inv_back) { + foreach my $sv (@discard_SV) { + my ($chrA, $posA, $ortA, $countA, $chrB, $posB, $ortB, $countB, $type) = split /\t/, $sv; + next if(search_nearby_sv(\@SV, $chrA, $posA, $posB)); + print STDOUT $sv; + print "\n"; + } +} +close STDOUT if($out); +exit(0); + +sub search_nearby_sv { + my($r_SV, $chrA, $posA, $posB) = @_; + foreach my $sv (@{$r_SV}) { + return 1 if($sv->[0] eq $chrA && abs($sv->[1] - $posA) < 10000); + return 1 if($sv->[2] eq $chrA && abs($sv->[3] - $posA) < 10000); + return 1 if($sv->[0] eq $chrA && abs($sv->[1] - $posB) < 10000); + return 1 if($sv->[2] eq $chrA && abs($sv->[3] - $posB) < 10000); + } + return; +} + +sub search_sv { + my ($r_SV, $chrA, $posA, $chrB, $posB, $type) = @_; + + for(my $i = 0; $i < scalar @{$r_SV}; $i++) { + my $sv = $r_SV->[$i]; + if($type eq "INV" && $r_SV->[$i][4] eq "ITX" ) { + if(( $sv->[0] eq $chrA && abs($sv->[1] - $posA) < 20 && + $sv->[2] eq $chrB && abs($sv->[3] - $posB) < 20 )|| + ( $sv->[0] eq $chrB && abs($sv->[1] - $posB) < 20 && + $sv->[2] eq $chrA && abs($sv->[3] - $posA) < 20)) { + delete $r_SV->[$i]; + return; + } + } + else { + return 1 if( $sv->[0] eq $chrA && abs($sv->[1] - $posA) < $width && + $sv->[2] eq $chrB && abs($sv->[3] - $posB) < $width && $sv->[4] eq $type); + return 1 if( $sv->[0] eq $chrB && abs($sv->[1] - $posB) < $width && + $sv->[2] eq $chrA && abs($sv->[3] - $posA) < $width && $sv->[4] eq $type); + return 1 if( $sv->[0] eq $chrA && abs($sv->[1] - $posA) < $width && + $sv->[2] eq $chrB && abs($sv->[3] - $posB) < $width && $sv->[4] eq $type && $type eq 'CTX'); + return 1 if( $sv->[0] eq $chrB && abs($sv->[1]- $posB) < $width && + $sv->[2] eq $chrA && abs($sv->[3] - $posA) < $width && $sv->[4] eq $type && $type eq 'CTX'); + } + } + return; +} + +=head1 NAME + +removeDupSV.pl - a program to remove duplicated SVs for CREST.pl + + +=head1 VERSION + +This documentation refers to removeDupSV.pl version 0.0.1. + + +=head1 USAGE + The program need an input file generated by CREST.pl + removeDupSV.pl -i tumor.predSV.txt -o tumor.predSV.txt.nodup + +=head1 REQUIRED ARGUMENTS + + To run the program, several parameter must specified. + -i The input file generated by CREST.pl + -o The output file + +=head1 OPTIONS + + The options that can be used for the program. + --min_inv_size For ITX/INV, the minimum event size, default 10000 + --min_total_sclip For ITX/INV with size smaller than min_inv_size, the min + number of soft-clipped reads needed to keep it, default 6 + --(no)add_inv_back Add ITX/INV back if there is another event nearby, + default OFF. + --width The max breakpoint distance such that it's considered as + same breakpoint, default 20 + -h, --help, -? Help information + --man Man page. + --usage Usage information. + --version Software version. + + +=head1 DESCRIPTION + +This is a simple program to remove duplicated SVs, also it remove small INV/ITX +events since most of those are due to library preparation. + + +=head1 BUGS AND LIMITATIONS + +There are no known bugs in this module, but the method is limitted to bam file +that has soft-clipping cigar string generated.Please report problems to +Jianmin Wang (Jianmin.Wang@stjude.org) +Patches are welcome. + +=head1 AUTHOR + +Jianmin Wang (Jianmin.Wang@stjude.org) + + +=head1 LICENCE AND COPYRIGHT + +Copyright (c) 2010 by St. Jude Children's Research Hospital. + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>.