Mercurial > repos > jjohnson > crest

diff extractSClip.pl @ 0:acc8d8bfeb9a
Uploaded
author: jjohnson
date: Wed, 08 Feb 2012 16:59:24 -0500
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/extractSClip.pl	Wed Feb 08 16:59:24 2012 -0500
@@ -0,0 +1,390 @@
+#!/usr/bin/perl -w
+use strict;
+#use lib '/home/jwang2/AssembleTest/Detect/nonSJsrc/dev';
+use Carp;
+use Getopt::Long;
+use English;
+use Pod::Usage;
+use Data::Dumper;
+use Bio::DB::Sam;
+use File::Temp qw/ tempfile tempdir /;
+use File::Spec;
+use File::Basename;
+use Cwd;
+use List::MoreUtils qw/ uniq /;
+#custom packages
+use SCValidator qw($lowqual_cutoff $min_percent_id $min_percent_hq LEFT_CLIP RIGHT_CLIP);
+use SVUtil qw($use_scratch $work_dir get_input_bam get_work_dir parse_range is_PCR_dup);
+
+use constant FQ_BASE_NUMBER => 33;
+
+my $rmdup = 0;
+my $print_read = 1;
+$use_scratch = 0;
+$min_percent_id = 90;
+my ($work_dir, $out_dir);
+my $ref_genome;
+# input/output
+my ($out_prefix, $range, $input_bam );
+my $out_suffix = "sclip.txt";
+my $paired = 1;
+my ( $help, $man, $version, $usage );
+my $optionOK = GetOptions(
+	'i|in|input=s'	=> \$input_bam,	
+	'o|out_dir=s'	=> \$out_dir,
+	'ref_genome=s'  => \$ref_genome,
+	'p=s'			=> \$out_prefix,
+	'scratch!'		=> \$use_scratch,
+	'paired!'		=> \$paired,
+	'rmdup!'		=> \$rmdup,
+	'lq_cutoff=i'	=> \$lowqual_cutoff,
+	'min_pct_id=i'	=> \$min_percent_id,
+	'min_pct_hq=i'	=> \$min_percent_hq,
+	'print_read!'	=> \$print_read,
+	'r|range=s'		=> \$range,
+	'h|help|?'		=> \$help,
+	'man'			=> \$man,
+	'usage'			=> \$usage,
+	'v|version'		=> \$version,
+);
+
+pod2usage(-verbose=>2) if($man or $usage);
+pod2usage(1) if($help or $version );
+
+my $start_dir = getcwd;
+if($input_bam) {
+	croak "The bam file you specified does not exist!\n" unless(-e $input_bam);
+	$input_bam = File::Spec->rel2abs($input_bam);
+}
+else{
+	croak "You must specify the input bam file or sample name";
+}
+croak "You must provide the reference genome in fasta format!" if(!$ref_genome);
+croak "The reference genome file you speicified does not exist!\n"
+	unless(-e $ref_genome);
+
+my $input_base = fileparse($input_bam);
+
+#setup output dir and workind directory
+$out_dir = getcwd if(!$out_dir);
+mkdir $out_dir if(!-e $out_dir || ! -d $out_dir);
+$work_dir = get_work_dir() if($use_scratch);
+$work_dir = $out_dir if(!$work_dir);
+$use_scratch = undef if($work_dir eq $out_dir);
+chdir($work_dir); 
+
+# figure out output prefix
+$out_prefix = $input_base if(!$out_prefix);
+
+my $sam = Bio::DB::Sam->new( -bam => $input_bam, -fasta => $ref_genome);
+
+my $output_file;
+my $validator = SCValidator->new();
+if(!$paired) {
+	$validator->remove_validator("strand_validator");
+}
+if($range) {
+	my ($chr, $start, $end) = parse_range($range);
+	my $tmp = $chr;
+	$tmp = $tmp . ".$start" if($start);
+	$tmp = $tmp . ".$end" if($end);
+	$output_file = join('.', $out_prefix, $tmp, $out_suffix);
+	$output_file = File::Spec->catfile($out_dir, $output_file);
+
+	my($pcover, $ncover) = extract_range_sclip(
+		-SAM => $sam, 
+		-RANGE => $range, 
+		-WORK_DIR => $work_dir, 
+		-OUTPUT => $output_file,  
+		-VALIDATOR => $validator);
+	$output_file = join('.', $out_prefix, $tmp, "cover");
+	$output_file = File::Spec->catfile($out_dir, $output_file);
+	warn "$output_file file exists and it will be replaced!" if(-e $output_file);
+	$rmdup = 0;
+	#$start = 0 unless $start;
+    #my ($tid) = $sam->header->parse_region($chr);
+    #my $chr_len = $sam->header->target_len->[$tid];
+	#$end = $chr_len unless $end;
+	
+	#my ($coverage) = $sam->features(-type => 'coverage', -seq_id => $chr,
+	#	-start => $start, -end => $end);
+	#my @cc = $coverage->coverage;
+
+	$rmdup = 0;
+	my($fh, $fname) = tempfile( DIR => $work_dir);
+	foreach my $p (keys(%{$pcover})) {
+		my $c = ($rmdup ? scalar(@{$pcover->{$p}}) : $pcover->{$p});
+		print $fh join("\t", $chr, $p, "+", $c, count_coverage($sam, $chr, $p)), "\n";
+	}
+	foreach my $p (keys(%{$ncover})) {
+		my $c = ($rmdup ? scalar(@{$ncover->{$p}}) : $ncover->{$p});
+		print $fh join("\t", $chr, $p, "-", $c , count_coverage($sam, $chr, $p)), "\n";
+	}
+	system("sort -k 2 -n $fname -o $output_file");
+	system("rm $fname");
+}
+else{
+	$output_file = join('.', $out_prefix, $out_suffix);
+	$output_file = File::Spec->catfile($out_dir, $output_file);
+    my $header = $sam->header;
+    my $target_names = $header->target_name;
+    $output_file = join('.', $out_prefix, "cover");
+	my $read_file = join('.', $out_prefix, "sclip.txt");
+    $output_file = File::Spec->catfile($out_dir, $output_file);
+    $read_file = File::Spec->catfile($out_dir, $read_file);
+
+	if(-e $output_file) {
+		warn "$output_file file exists and it will be replaced!";
+		system("rm $output_file");
+	}
+	my @t_names = uniq @{$target_names};
+    foreach my $chr (@t_names){
+		my($fh, $fname) = tempfile( DIR => $work_dir);
+        my($pcover, $ncover) = extract_range_sclip(
+			-SAM => $sam,
+			-RANGE =>$chr, 
+			-WORK_DIR => $work_dir, 
+			-OUTPUT => $read_file, 
+			-VALIDATOR => $validator);
+		foreach my $p (keys(%{$pcover})) {
+			my $c = ($rmdup ? scalar(@{$pcover->{$p}}) : $pcover->{$p});
+			print $fh join("\t", $chr, $p, "+", $c, count_coverage($sam, $chr, $p) ), "\n";
+		}
+		foreach my $p (keys(%{$ncover})) {
+			my $c = ($rmdup ? scalar(@{$ncover->{$p}}) : $ncover->{$p});
+			print $fh join("\t", $chr, $p, "-", $c, count_coverage($sam, $chr, $p)), "\n";
+		}
+		system("sort -k 2 -n $fname -o $fname.sorted");
+		system("cat $fname.sorted >> $output_file");
+		system("rm $fname");
+		system("rm $fname.sorted");
+    }
+}
+chdir $start_dir;
+exit(0);
+
+sub count_coverage {
+	my ($sam, $chr, $pos, $clip) = @_;
+	if($rmdup) {
+		my @pairs;
+		my $seg = $sam->segment(-seq_id => $chr, -start => $pos, -end => $pos);
+		my $n = 0;
+		my $itr = $seg->features(-iterator => 1);
+		while( my $a = $itr->next_seq) {
+			my $sclip_len = 0;
+			if($clip) {
+				my @cigar_array = @{$a->cigar_array};
+				$sclip_len = $cigar_array[0]->[1] if($cigar_array[0]->[0] eq 'S' && $clip == RIGHT_CLIP);
+				$sclip_len = $cigar_array[$#cigar_array]->[1] if($cigar_array[$#cigar_array]->[0] eq 'S' && $clip == RIGHT_CLIP);
+			}
+			next if(@pairs > 0 && is_PCR_dup($a, \@pairs, $sclip_len));
+			$n++;
+			#return $n if( $n > $max_repetitive_cover);
+			if($a->mpos) {
+				push @pairs, [$a->start, $a->end, $a->mate_start, $a->mate_end, $sclip_len];
+			}
+			else {
+				push @pairs, [$a->start, $a->end, 0, 0, $sclip_len];
+			}
+
+		}
+		return $n;
+	}
+	else{
+		my ($c) = $sam->features(-type => 'coverage', -seq_id=> $chr, 
+			-start => $pos, -end => $pos);
+		my @c_d = $c->coverage;
+		return $c_d[0];
+	}
+}
+
+sub extract_range_sclip {
+	my %arg = @_;
+	my $sam = $arg{-SAM} || croak "missing -SAM";
+	my $range = $arg{-RANGE} || croak "missing -RANGE";
+	my $output_file = $arg{-OUTPUT} || croak "missing -OUTPUT";
+	my $validator = $arg{-VALIDATOR} || croak "missing -VALIDATOR";
+
+	my($fh, $fname) = tempfile( DIR => $work_dir);
+	my (%plus_cover, %neg_cover);
+	$sam->fetch($range, 
+		sub {
+			my $a = shift;
+			my $cigar_str = $a->cigar_str;
+#			print STDERR $a->qname, "\t", $cigar_str, "\n";
+			my @cigar_array = @{$a->cigar_array};
+			return if($a->cigar_str !~ m/S/);
+			if($paired && !$a->proper_pair) { #paired but mate is not mapped 
+				$validator->remove_validator("strand_validator");
+			}
+			#return if(!$a->proper_pair && $paired);
+			#return if($paired && !$a->mpos); 
+			my ($sclip_len, $ort, $pos, $seq, $qual_str, $qual);
+			$qual_str = join( "",  (map { chr $_ + FQ_BASE_NUMBER } $a->qscore));
+			if($cigar_array[0]->[0] eq 'S' && $validator->validate($a, LEFT_CLIP) ) {
+				$sclip_len = $cigar_array[0]->[1]; $ort = "-"; $pos = $a->start;
+				$seq = substr($a->query->dna, 0, $sclip_len );
+				$qual = substr($qual_str, 0, $sclip_len);
+				
+				my $print = 1;
+				if($rmdup) {
+					if(exists $neg_cover{$pos}) {
+						$print = 0 if(is_PCR_dup($a, $neg_cover{$pos}, $sclip_len));
+					}
+					else {
+						$neg_cover{$pos} = [];
+					}
+					if($print == 1) {
+						if($a->mpos) {
+							push @{$neg_cover{$pos}}, [$a->start, $a->end, $a->mate_start, $a->mate_end, $sclip_len];
+						}
+						else {
+							push @{$neg_cover{$pos}}, [$a->start, $a->end, 0, 0, $sclip_len];
+						}
+					}
+				}
+				else {
+					if(exists $neg_cover{$pos}) {
+						$neg_cover{$pos}++;
+					}
+					else{
+						$neg_cover{$pos} = 1;
+					}
+				}
+				print $fh join("\t", $a->seq_id, $pos, $ort, $a->qname, $seq, $qual), "\n"
+					if($print_read && $print == 1);
+			}
+
+			if($cigar_array[$#cigar_array]->[0] eq 'S' &&  $validator->validate($a, RIGHT_CLIP)) {
+				$sclip_len = $cigar_array[$#cigar_array]->[1]; $ort = '+'; $pos = $a->end;
+				my $l = length($a->query->dna);
+				$seq = substr($a->query->dna, $l - $sclip_len);
+				$qual = substr($qual_str, $l - $sclip_len);
+
+				my $print = 1;
+				if($rmdup) {
+					if(exists $plus_cover{$pos}) {
+						$print = 0 if(is_PCR_dup($a, $plus_cover{$pos}, $sclip_len));
+					}
+					else {
+						$plus_cover{$pos} = [];
+					}
+					if($print ==1 ) {
+						if($a->mpos) {
+							push @{$plus_cover{$pos}}, [$a->start, $a->end, $a->mate_start, $a->mate_end, $sclip_len];
+						}
+						else {
+							push @{$plus_cover{$pos}}, [$a->start, 0, 0, $a->mate_end, $sclip_len];
+						}
+					}
+				}
+				else {
+					if(exists $plus_cover{$pos}) {
+						$plus_cover{$pos}++;
+					}
+					else{
+						$plus_cover{$pos} = 1;
+					}
+				}
+				$print = is_PCR_dup($a, $plus_cover{$pos}, $sclip_len) if($rmdup);
+				print $fh join("\t", $a->seq_id, $pos, $ort, $a->qname, $seq, $qual), "\n"
+					if($print_read && $print);
+			}
+			if($paired && !$a->proper_pair) { #paired but mate is not mapped, add back
+				$validator->add_validator("strand_validator"); 
+			}
+		}
+	);
+	if($print_read) {
+		system("sort -k 2 -n $fname -o $fname.sorted");
+		system("cat $fname.sorted >> $output_file");
+		system("rm $fname");
+		system("rm $fname.sorted");
+	}
+	return(\%plus_cover, \%neg_cover);
+}
+
+
+=head1 NAME
+
+extractSClip.pl - extract positions with soft clipped read in bam file.
+
+
+=head1 VERSION
+
+This documentation refers to extractSClip.pl version 0.0.1.
+
+
+=head1 USAGE
+
+	# extract all positions with soft clipped reads in whole genome:
+	./extractSClip.pl -i sample.bam -g hg18.fa 
+	# extract chr1 positions with soft clipped reads
+	./extractSClip.pl -i sample.bam -g hg18.fa -r chr1
+
+
+=head1 REQUIRED ARGUMENTS
+
+	-i: Input bam file.
+	--ref_genome: The genome file in fa file, must be the same used to map reads.
+	
+
+=head1 OPTIONS
+
+	-r: The range of positions need to be extracted. Format: chr1 or chr1:500-5000.
+	-o: The output directory, default is current directory.
+	--scratch: use scracth space, default off.
+	--rmdup: remove PCR dumplicate reads, default on, use --normdup to turn it off.
+	--lq_cutoff: low quality cutoff value, default 20. 
+	--min_pct_id: minimum percent identify for the aligned high qual part,default 90.
+	--min_pct_hq: minimum percent high quality for soft clipped part, default 80.
+	--print_read: individual soft-clipped read will be printed, default off.
+	-h, --help: The help page.
+	--man: Print the man page.
+	--usage: Print usage information.
+	-v, --version: print version information.
+
+
+=head1 DESCRIPTION
+
+This is a program to extract all soft-clipped positions such that for each position
+a list of requirements need to be satisfied. More specifically, the orientaion of 
+pair-end read should be satisfied, the minimum percent identify of aligned part 
+need to be satisfied, and the minimum percent of hiqh quality soft-clipped part 
+should be satisfied.
+
+
+=head1 DEPENDENCIES
+
+The program depend on several packages:
+1. Bioperl perl module.
+2. Bio::DB::Sam, version 1.5 or later, it requires samtools lib installed.
+
+=head1 BUGS AND LIMITATIONS
+
+There are no known bugs in this module, but the method is limitted to bam file 
+that has soft-clipping cigar string generated.Please report problems to 
+Jianmin Wang  (Jianmin.Wang@stjude.org)
+Patches are welcome.
+
+=head1 AUTHOR
+
+Jianmin Wang (Jianmin.Wang@stjude.org)
+
+
+=head1 LICENCE AND COPYRIGHT
+
+Copyright (c) 2010 by St. Jude Children's Research Hospital.
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 2 of the License, or
+(at your option) any later version. 
+
+This program is distributed in the hope that it will be useful, 
+but WITHOUT ANY WARRANTY; without even the implied warranty of 
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
author	jjohnson
date	Wed, 08 Feb 2012 16:59:24 -0500
parents
children