diff removeDupSV.pl @ 0:acc8d8bfeb9a

Uploaded
author jjohnson
date Wed, 08 Feb 2012 16:59:24 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/removeDupSV.pl	Wed Feb 08 16:59:24 2012 -0500
@@ -0,0 +1,188 @@
+#!/nfs_exports/apps/64-bit/gnu-apps/perl5.8.9/bin/perl -w
+
+use strict;
+use Carp;
+use Getopt::Long;
+use English;
+use Pod::Usage;
+use Data::Dumper;
+use DBI;
+use DBD::mysql;
+
+my ( $help, $man, $version, $usage );
+my ($in, $out);
+my $min_inv = 10000;
+my $min_total_sclip = 6;
+my $add_inv_back;
+my $width = 20;
+my $optionOK = GetOptions(
+	'i|in|input=s'	=> \$in,
+	'o|out|output=s'	=> \$out,
+	'min_inv_size=i'	=> \$min_inv,
+	'min_total_sclip=i'	=> \$min_total_sclip,
+	'add_inv_back!'	=> \$add_inv_back,
+	'width=i'		=> \$width,
+	'h|help|?'		=> \$help,
+	'man'			=> \$man,
+	'usage'			=> \$usage,
+	'v|version'		=> \$version,
+);
+
+pod2usage(2) if($man);
+pod2usage( -verbose => 99, -sections => "USAGE|REQUIRED ARGUMENTS|OPTIONS" ) 
+	if($help or $usage);
+pod2usage( -verbose => 99, -sections => "VERSION") if($version);
+
+croak "Missing input file" if(!$in);
+open STDOUT, ">$out" if($out);
+open my $IN, "<$in" or croak "can't open $in: $OS_ERROR";
+
+my @SV;
+my @discard_SV;
+while(my $line = <$IN>) {
+	chomp $line;
+	my ($chrA, $posA, $ortA, $countA, $chrB, $posB, $ortB, $countB, $type) = split /\t/, $line;
+	next if( search_sv(\@SV, $chrA, $posA, $chrB, $posB, $type));
+	if(($type eq 'ITX' || $type eq 'INV') && (abs($posB - $posA) < $min_inv && ($countA + $countB) < $min_total_sclip)) {
+		print STDERR "$line is removed due to possible circulization\n";
+		push @discard_SV, $line;
+		next;
+	}
+	push @SV, [$chrA, $posA, $chrB, $posB, $type, $line];
+}
+
+close $IN;
+
+foreach my $sv (@SV) {
+	print STDOUT $sv->[5];
+#	if($sv->[4] eq 'DEL') {
+#		my $tmp = get_gene_info($sv->[0], $sv->[1], $sv->[3]);
+#		print STDOUT "\t", $tmp if($tmp);
+#	}
+#	else {
+#		print STDOUT "\t", get_gene_info($sv->[0], $sv->[1], $sv->[1]);
+#		print STDOUT "\t", get_gene_info($sv->[2], $sv->[3], $sv->[3]);
+#	}
+	print STDOUT "\n";
+}
+
+if($add_inv_back) {
+	foreach my $sv (@discard_SV) {
+		my ($chrA, $posA, $ortA, $countA, $chrB, $posB, $ortB, $countB, $type) = split /\t/, $sv;
+		next if(search_nearby_sv(\@SV, $chrA, $posA, $posB));
+		print STDOUT $sv;
+		print "\n";
+	}
+}
+close STDOUT if($out);
+exit(0);
+
+sub search_nearby_sv {
+	my($r_SV, $chrA, $posA, $posB) = @_;
+	foreach my $sv (@{$r_SV}) {
+		return 1 if($sv->[0] eq $chrA && abs($sv->[1] - $posA) < 10000);
+		return 1 if($sv->[2] eq $chrA && abs($sv->[3] - $posA) < 10000);
+		return 1 if($sv->[0] eq $chrA && abs($sv->[1] - $posB) < 10000);
+		return 1 if($sv->[2] eq $chrA && abs($sv->[3] - $posB) < 10000);
+	}
+	return;
+}
+
+sub search_sv {
+	my ($r_SV, $chrA, $posA, $chrB, $posB, $type) = @_;
+	
+	for(my $i = 0; $i < scalar @{$r_SV}; $i++) {
+		my $sv = $r_SV->[$i];
+		if($type eq "INV" && $r_SV->[$i][4] eq "ITX" ) {
+			if(( $sv->[0] eq $chrA && abs($sv->[1] - $posA) < 20 &&
+	            $sv->[2] eq $chrB && abs($sv->[3] - $posB) < 20 )||
+				( $sv->[0] eq $chrB && abs($sv->[1] - $posB) < 20 &&
+	            $sv->[2] eq $chrA && abs($sv->[3] - $posA) < 20)) {
+				delete $r_SV->[$i];
+				return;
+			}
+		}
+		else {	
+			return 1 if( $sv->[0] eq $chrA && abs($sv->[1] - $posA) < $width &&
+				$sv->[2] eq $chrB && abs($sv->[3] - $posB) < $width && $sv->[4] eq $type);
+			return 1 if( $sv->[0] eq $chrB && abs($sv->[1] - $posB) < $width &&
+				$sv->[2] eq $chrA && abs($sv->[3] - $posA) < $width && $sv->[4] eq $type);
+			return 1 if( $sv->[0] eq $chrA && abs($sv->[1] - $posA) < $width &&
+				$sv->[2] eq $chrB && abs($sv->[3] - $posB) < $width && $sv->[4] eq $type && $type eq 'CTX');
+			return 1 if( $sv->[0] eq $chrB && abs($sv->[1]- $posB) < $width &&
+				$sv->[2] eq $chrA && abs($sv->[3] - $posA)  < $width  && $sv->[4] eq $type && $type eq 'CTX');
+		}
+	}
+	return;
+}
+
+=head1 NAME
+
+removeDupSV.pl - a program to remove duplicated SVs for CREST.pl
+
+
+=head1 VERSION
+
+This documentation refers to removeDupSV.pl version 0.0.1.
+
+
+=head1 USAGE
+	The program need an input file generated by CREST.pl	
+		removeDupSV.pl -i tumor.predSV.txt -o tumor.predSV.txt.nodup
+
+=head1 REQUIRED ARGUMENTS
+
+	To run the program, several parameter must specified.
+	-i	The input file generated by CREST.pl
+	-o	The output file
+
+=head1 OPTIONS
+
+	The options that can be used for the program.
+	--min_inv_size		For ITX/INV, the minimum event size, default 10000
+	--min_total_sclip	For ITX/INV with size smaller than min_inv_size, the min
+						number of soft-clipped reads needed to keep it, default 6
+	--(no)add_inv_back	Add ITX/INV back if there is another event nearby, 
+						default OFF.
+	--width				The max breakpoint distance such that it's considered as 
+						same breakpoint, default 20
+	-h, --help, -?		 Help information
+	--man				 Man page.
+	--usage				 Usage information.
+	--version			 Software version.
+
+
+=head1 DESCRIPTION
+
+This is a simple program to remove duplicated SVs, also it remove small INV/ITX 
+events since most of those are due to library preparation.
+
+
+=head1 BUGS AND LIMITATIONS
+
+There are no known bugs in this module, but the method is limitted to bam file 
+that has soft-clipping cigar string generated.Please report problems to 
+Jianmin Wang  (Jianmin.Wang@stjude.org)
+Patches are welcome.
+
+=head1 AUTHOR
+
+Jianmin Wang (Jianmin.Wang@stjude.org)
+
+
+=head1 LICENCE AND COPYRIGHT
+
+Copyright (c) 2010 by St. Jude Children's Research Hospital.
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 2 of the License, or
+(at your option) any later version. 
+
+This program is distributed in the hope that it will be useful, 
+but WITHOUT ANY WARRANTY; without even the implied warranty of 
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.