Mercurial > repos > jjohnson > crest
view removeDupSV.pl @ 1:4f6952e0af48 default tip
CREST - add crest.loc.sample
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Wed, 08 Feb 2012 16:08:01 -0600 |
parents | acc8d8bfeb9a |
children |
line wrap: on
line source
#!/nfs_exports/apps/64-bit/gnu-apps/perl5.8.9/bin/perl -w use strict; use Carp; use Getopt::Long; use English; use Pod::Usage; use Data::Dumper; use DBI; use DBD::mysql; my ( $help, $man, $version, $usage ); my ($in, $out); my $min_inv = 10000; my $min_total_sclip = 6; my $add_inv_back; my $width = 20; my $optionOK = GetOptions( 'i|in|input=s' => \$in, 'o|out|output=s' => \$out, 'min_inv_size=i' => \$min_inv, 'min_total_sclip=i' => \$min_total_sclip, 'add_inv_back!' => \$add_inv_back, 'width=i' => \$width, 'h|help|?' => \$help, 'man' => \$man, 'usage' => \$usage, 'v|version' => \$version, ); pod2usage(2) if($man); pod2usage( -verbose => 99, -sections => "USAGE|REQUIRED ARGUMENTS|OPTIONS" ) if($help or $usage); pod2usage( -verbose => 99, -sections => "VERSION") if($version); croak "Missing input file" if(!$in); open STDOUT, ">$out" if($out); open my $IN, "<$in" or croak "can't open $in: $OS_ERROR"; my @SV; my @discard_SV; while(my $line = <$IN>) { chomp $line; my ($chrA, $posA, $ortA, $countA, $chrB, $posB, $ortB, $countB, $type) = split /\t/, $line; next if( search_sv(\@SV, $chrA, $posA, $chrB, $posB, $type)); if(($type eq 'ITX' || $type eq 'INV') && (abs($posB - $posA) < $min_inv && ($countA + $countB) < $min_total_sclip)) { print STDERR "$line is removed due to possible circulization\n"; push @discard_SV, $line; next; } push @SV, [$chrA, $posA, $chrB, $posB, $type, $line]; } close $IN; foreach my $sv (@SV) { print STDOUT $sv->[5]; # if($sv->[4] eq 'DEL') { # my $tmp = get_gene_info($sv->[0], $sv->[1], $sv->[3]); # print STDOUT "\t", $tmp if($tmp); # } # else { # print STDOUT "\t", get_gene_info($sv->[0], $sv->[1], $sv->[1]); # print STDOUT "\t", get_gene_info($sv->[2], $sv->[3], $sv->[3]); # } print STDOUT "\n"; } if($add_inv_back) { foreach my $sv (@discard_SV) { my ($chrA, $posA, $ortA, $countA, $chrB, $posB, $ortB, $countB, $type) = split /\t/, $sv; next if(search_nearby_sv(\@SV, $chrA, $posA, $posB)); print STDOUT $sv; print "\n"; } } close STDOUT if($out); exit(0); sub search_nearby_sv { my($r_SV, $chrA, $posA, $posB) = @_; foreach my $sv (@{$r_SV}) { return 1 if($sv->[0] eq $chrA && abs($sv->[1] - $posA) < 10000); return 1 if($sv->[2] eq $chrA && abs($sv->[3] - $posA) < 10000); return 1 if($sv->[0] eq $chrA && abs($sv->[1] - $posB) < 10000); return 1 if($sv->[2] eq $chrA && abs($sv->[3] - $posB) < 10000); } return; } sub search_sv { my ($r_SV, $chrA, $posA, $chrB, $posB, $type) = @_; for(my $i = 0; $i < scalar @{$r_SV}; $i++) { my $sv = $r_SV->[$i]; if($type eq "INV" && $r_SV->[$i][4] eq "ITX" ) { if(( $sv->[0] eq $chrA && abs($sv->[1] - $posA) < 20 && $sv->[2] eq $chrB && abs($sv->[3] - $posB) < 20 )|| ( $sv->[0] eq $chrB && abs($sv->[1] - $posB) < 20 && $sv->[2] eq $chrA && abs($sv->[3] - $posA) < 20)) { delete $r_SV->[$i]; return; } } else { return 1 if( $sv->[0] eq $chrA && abs($sv->[1] - $posA) < $width && $sv->[2] eq $chrB && abs($sv->[3] - $posB) < $width && $sv->[4] eq $type); return 1 if( $sv->[0] eq $chrB && abs($sv->[1] - $posB) < $width && $sv->[2] eq $chrA && abs($sv->[3] - $posA) < $width && $sv->[4] eq $type); return 1 if( $sv->[0] eq $chrA && abs($sv->[1] - $posA) < $width && $sv->[2] eq $chrB && abs($sv->[3] - $posB) < $width && $sv->[4] eq $type && $type eq 'CTX'); return 1 if( $sv->[0] eq $chrB && abs($sv->[1]- $posB) < $width && $sv->[2] eq $chrA && abs($sv->[3] - $posA) < $width && $sv->[4] eq $type && $type eq 'CTX'); } } return; } =head1 NAME removeDupSV.pl - a program to remove duplicated SVs for CREST.pl =head1 VERSION This documentation refers to removeDupSV.pl version 0.0.1. =head1 USAGE The program need an input file generated by CREST.pl removeDupSV.pl -i tumor.predSV.txt -o tumor.predSV.txt.nodup =head1 REQUIRED ARGUMENTS To run the program, several parameter must specified. -i The input file generated by CREST.pl -o The output file =head1 OPTIONS The options that can be used for the program. --min_inv_size For ITX/INV, the minimum event size, default 10000 --min_total_sclip For ITX/INV with size smaller than min_inv_size, the min number of soft-clipped reads needed to keep it, default 6 --(no)add_inv_back Add ITX/INV back if there is another event nearby, default OFF. --width The max breakpoint distance such that it's considered as same breakpoint, default 20 -h, --help, -? Help information --man Man page. --usage Usage information. --version Software version. =head1 DESCRIPTION This is a simple program to remove duplicated SVs, also it remove small INV/ITX events since most of those are due to library preparation. =head1 BUGS AND LIMITATIONS There are no known bugs in this module, but the method is limitted to bam file that has soft-clipping cigar string generated.Please report problems to Jianmin Wang (Jianmin.Wang@stjude.org) Patches are welcome. =head1 AUTHOR Jianmin Wang (Jianmin.Wang@stjude.org) =head1 LICENCE AND COPYRIGHT Copyright (c) 2010 by St. Jude Children's Research Hospital. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.