Mercurial > repos > jjohnson > crest
diff CREST.pl @ 0:acc8d8bfeb9a
Uploaded
author | jjohnson |
---|---|
date | Wed, 08 Feb 2012 16:59:24 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CREST.pl Wed Feb 08 16:59:24 2012 -0500 @@ -0,0 +1,1232 @@ +#!/usr/bin/perl -w +#use lib '/home/jwang2/AssembleTest/Detect/nonSJsrc/dev'; +use strict; +use Carp; +use Getopt::Long; +use English; +use Pod::Usage; +use Data::Dumper; +use Bio::DB::Sam; +use Bio::DB::Sam::Constants; +use Bio::SearchIO; +use Bio::SeqIO; +use File::Temp qw/ tempfile tempdir /; +use File::Spec; +use File::Path; +use File::Copy; +use File::Basename; +use List::MoreUtils qw/ uniq /; +use Cwd; +use SCValidator qw($lowqual_cutoff $min_percent_id $min_percent_hq LEFT_CLIP RIGHT_CLIP); +use SVUtil qw($use_scratch $work_dir clip_fa_file prepare_reads parse_range get_input_bam + find_smallest_cover get_work_dir is_PCR_dup read_fa_file get_sclip_reads); +use StructVar; +require SVExtTools; + +use Transcript; +use Gene; +use GeneModel; + +my $debug = 1; +$min_percent_id = 90; +# input/output +my ( $out_dir, $out_prefix, $range, $bam_d, $bam_g, $sclip_file); +my $out_suffix = "predSV.txt"; +my $read_len = 100; +my $ref_genome ; + +my $hetero_factor = 0.4; +my $triger_p_value = 0.05; + +#RNASeq support +my $RNASeq = 0; +my $gene_model_file; +my $gm_format = "REFFLAT"; +my $gm; +my $max_sclip_reads = 50; # we will consider the position if we have enough sclipped reads +my $min_one_side_reads = 5; +my $min_pct_sclip_reads = 5; # we require at least 5% reads have softclipping + +# external programs related variable +# 1. cap3 related variables +my $cap3 = "cap3"; +my $cap3_options=" -h 70 -y 10 > /dev/null"; + +# 2 blat related variables, using blat server and blat standalone +my $blat_client_exe = "gfClient"; +my $blat_client_options = ' -out=psl -nohead -minIdentity=95 -maxIntron=5'; +my $target_genome = "hg18.2bit"; +my $dir_2bit = "/"; +my $blat_server = "sjblat"; +my $blat_port = 50000; +my $blat_exe = "blat"; +my $blat_options = " -tileSize=7 -stepSize=1 -out=psl -minScore=15 -noHead -maxIntron=1 "; + +$use_scratch = 0; +my $paired = 1; + +my $rescue = 1; +# other options +my ($min_sclip_reads, $max_repetitive_cover, $min_sclip_len ); +my $min_hit_reads; +my $rmdup; +my $min_clip_len; +my ($max_score_diff, $max_num_hits); +my ($min_dist_diff, $min_hit_len) = (15, 15); + +#SV filter related parameters +my $max_bp_dist = 15; +my $min_percent_cons_of_read = 0.75; +my $germline_seq_width = 100; +my $germline_search_width = 50; +my $rm_tandem_repeat = 1; #tandem repeat mediated events +my $tr_max_indel_size = 100; #tandem repeat mediated indel size +my $tr_min_size = 2; +my $tr_max_size = 8; #tandem repeat max_size of single repeat +my $tr_min_num = 4; #tandem repeat minimum number of occurence + +# verification pupuse +my $verify_file; +my $sensitive; +my $cluster_size; + +# common help options +my ( $help, $man, $version, $usage ); +my $optionOK = GetOptions( + # SV validation related parameters + 'max_bp_dist=i' => \$max_bp_dist, + 'min_percent_cons_of_read=f' => \$min_percent_cons_of_read, + 'germline_seq_width=i' => \$germline_seq_width, + 'germline_search_width=i' => \$germline_search_width, + # tandem repeat related parameters + 'rm_tandem_repeat!' => \$rm_tandem_repeat, + 'tr_max_indel_size=i' => \$tr_max_indel_size, + 'tr_min_size=i' => \$tr_min_size, + 'tr_max_size=i' => \$tr_max_size, + 'tr_min_num=i' => \$tr_min_num, + 'hetero_factor=f' => \$hetero_factor, + 'triger_p_value=f' => \$triger_p_value, + + # input/output + #'s|sample=s' => \$sample, + 'd|input_d=s' => \$bam_d, + 'g|inpug_g=s' => \$bam_g, + 'f|sclipfile=s' => \$sclip_file, + 'p|prefix=s' => \$out_prefix, + 'o|out_dir=s' => \$out_dir, + 'l|read_len=i' => \$read_len, + 'ref_genome=s' => \$ref_genome, + 'v|verify_file=s' => \$verify_file, + 'sensitive' => \$sensitive, + 'paired!' => \$paired, + 'rescue!' => \$rescue, + 'cluster_size=i' => \$cluster_size, + # external programs location and options + 'scratch!' => \$use_scratch, + 'cap3=s' => \$cap3, + 'cap3opt=s' => \$cap3_options, + 'blatclient=s' => \$blat_client_exe, + 'blatclientopt=s' => \$blat_client_options, + 'blat=s' => \$blat_exe, + 't|target_genome=s' => \$target_genome, + 'blatopt=s' => \$blat_options, + 'blatserver=s' => \$blat_server, + 'blatport=i' => \$blat_port, + '2bitdir=s' => \$dir_2bit, + #RNAseq support + 'RNASeq' => \$RNASeq, + 'genemodel=s' => \$gene_model_file, + 'gmformat=s' => \$gm_format, + #other related parameters + 'r|range=s' => \$range, + 'max_score_diff=i' => \$max_score_diff, + 'm|min_sclip_reads=i' => \$min_sclip_reads, + 'min_one_side_reads=i' => \$min_one_side_reads, + 'max_rep_cover=i' => \$max_repetitive_cover, + 'min_sclip_len=i' => \$min_sclip_len, + 'min_hit_len=i' => \$min_hit_len, + 'min_dist_diff=i' => \$min_dist_diff, + 'min_hit_reads=i' => \$min_hit_reads, + 'rmdup!' => \$rmdup, + + # soft_clipping related parameters (from SVDector package) + 'min_percent_id=i' => \$min_percent_id, + 'min_percent_hq=i' => \$SCValidator::min_percent_hq, + 'lowqual_cutoff=i' => \$lowqual_cutoff, + + # common help parameters + 'h|help|?' => \$help, + 'man' => \$man, + 'usage' => \$usage, + 'version' => \$version, +); + +pod2usage(-verbose=>2) if($man or $usage); +pod2usage(1) if($help or $version ); + +my $start_dir = getcwd; + +# figure out input file +if(!$bam_d) { + pod2usage(1); + croak "You need specify input bam file(s)"; +} + +if(!$sclip_file) { + pod2usage(1); + croak "You need to specify the softclipping file"; +} +$sclip_file = File::Spec->rel2abs($sclip_file); + +if(!$ref_genome) { + pod2usage(1); + croak "You need to specify the reference genome used by bam file"; +} + +croak "The file you sepcified does not exist" unless ( + -e $sclip_file && -e $bam_d && -e $ref_genome && -e $target_genome); +my $input_base; +$bam_d = File::Spec->rel2abs($bam_d); +$input_base = fileparse($bam_d); +$bam_g = File::Spec->rel2abs($bam_g) if($bam_g); + +#RNASeq support +if($RNASeq) { + croak "You need specify the input gene model file" unless ($gene_model_file); + StructVar->add_RNASeq_filter(); + $min_sclip_reads = 10 unless $min_sclip_reads; + $max_repetitive_cover = 5000 unless $max_repetitive_cover; + $min_sclip_len = 20 unless $min_clip_len; + $max_score_diff = 5 unless $max_score_diff; + $max_num_hits = 3 unless $max_num_hits; + $min_hit_reads = 5 unless $min_hit_reads; + $cluster_size = 5 unless $cluster_size; +} +else { + $min_sclip_reads = 3 unless $min_sclip_reads; + $max_repetitive_cover = 500 unless $max_repetitive_cover; + $min_sclip_len = 20 unless $min_clip_len; + $max_score_diff = 5 unless $max_score_diff; + $max_num_hits = 10 unless $max_num_hits; + $min_hit_reads = 1 unless $min_hit_reads; + $cluster_size = 1 unless $cluster_size; +} + +if($gene_model_file) { + $gm = GeneModel->new if($gene_model_file); + $gm->from_file($gene_model_file, $gm_format); + StructVar->gene_model($gm); +} + +# set up the external programs and validators +# Those variable will be global +my $assembler = Assembler->new( + -PRG => $cap3, + -OPTIONS => $cap3_options +); + +my $mapper = Mapper->new( + -PRG => join(' ', ($blat_client_exe, $blat_server, $blat_port)), + -OPTIONS => $blat_client_options, + -BIT2_DIR => $dir_2bit, + -MAX_SCORE_DIFF => $max_score_diff, + -MAX_NUM_HITS => $max_num_hits, +); + +my $aligner = Aligner->new( + -PRG => $blat_exe, + -OPTIONS => $blat_options, +); + +StructVar->assembler($assembler); +StructVar->read_len($read_len); +StructVar->aligner($aligner); +StructVar->mapper($mapper); +StructVar->RNASeq(1) if($RNASeq); +StructVar->genome($target_genome); +StructVar->remove_filter("tandem_repeat") unless($rm_tandem_repeat); +StructVar->tr_max_indel_size($tr_max_indel_size); +StructVar->tr_min_size($tr_min_size); +StructVar->tr_max_size($tr_max_size); +StructVar->tr_min_num($tr_min_num); +StructVar->max_bp_dist($max_bp_dist) if($max_bp_dist); +StructVar->germline_seq_width($germline_seq_width) if($germline_seq_width); +StructVar->germline_search_width($germline_search_width) if($germline_search_width); + + +my $validator = SCValidator->new(); +$validator->remove_validator('strand_validator') if(!$paired); + +#setup output and working directory +$out_dir = getcwd if(!$out_dir); +mkdir $out_dir if(!-e $out_dir || ! -d $out_dir); +$work_dir = get_work_dir(-SCRATCH => $use_scratch); +$work_dir = $out_dir if(!$work_dir); +$use_scratch = undef if($work_dir eq $out_dir); # don't erase the out_dir +chdir($work_dir); + +# figure out output prefix +if(!$out_prefix) { + $out_prefix = $input_base; +} + +my $sam_d = Bio::DB::Sam->new( -bam => $bam_d, -fasta => $ref_genome); +StructVar->sam_d($sam_d); +my $sam_g; +if($bam_g) { + $sam_g = Bio::DB::Sam->new( -bam => $bam_g); + StructVar->sam_g($sam_g); +} + +#my $output_file = File::Spec->catfile($outdir, $out_prefix . $out_suffix); +my $output_file; +$output_file = join('.', $out_prefix, $out_suffix); +$output_file = File::Spec->catfile($out_dir, $output_file); + +# the softclip file is sorted, so no need to re-sort it +open my $SCLIP, "<$sclip_file" or croak "can't open $sclip_file:$OS_ERROR"; +my %sclip; +my $pre_p; +while( my $line = <$SCLIP> ) { + chomp $line; + my ($chr, $pos, $ort, $cover, $C) = split /\t/, $line; + if( ! exists($sclip{$chr})) { + $sclip{$chr} = []; + $pre_p = $pos; + } + $C = 30 unless $C; + if($pos < $pre_p) { + print STDERR "The input file is not sorted!"; + exit(1); + } + push @{$sclip{$chr}}, [$pos, $ort, $cover, $C]; +} +close $SCLIP; + +my @final_SV; +my @s_cover = @{find_smallest_cover($min_sclip_reads, $hetero_factor, $triger_p_value)}; +if($range) { + @final_SV = detect_range_SVs($sam_d, $range, \%sclip, \@s_cover); +} +else { + foreach my $chr (keys %sclip) { + my @tmpSV = detect_range_SVs($sam_d, $chr, \%sclip, \@s_cover); + @final_SV = @tmpSV unless (@final_SV); + foreach my $sv (@tmpSV) { + push @final_SV, $sv if($sv && ! is_dup_SV(\@final_SV, $sv)); + } + } +} +undef %sclip; +open my $OUT, ">$output_file" or croak "can't open $output_file:$OS_ERROR"; +foreach my $SV (@final_SV) { + if($SV->filter) { + print $OUT $SV->to_string ; + if($RNASeq) { + print $OUT "\t", join("\t", @{$SV->get_genes}); + } + print $OUT "\n"; + } +} +chdir $start_dir; +exit(0); + +sub detect_range_SVs { + my ($sam_d, $range, $r_sclips, $r_scover) = @_; + my ($chr, $start, $end) = parse_range($range); + my ($tid) = $sam_d->header->parse_region($chr); + my $chr_len = $sam_d->header->target_len->[$tid]; + my $r_range_sclips = search_sclip($r_sclips, $range); + return unless $r_range_sclips; + my @scover = @{$r_scover}; + my @rtn; + + my ($f_tree, $r_tree); + if($RNASeq) { + my $full_chr = $chr; + my ($gm_chr) = $gm->get_all_chr; + $full_chr = "chr" . $chr if($chr !~ m/chr/ && $gm_chr =~ /chr/); + ($f_tree, $r_tree) = ($gm->sub_model($full_chr, "+"), $gm->sub_model($full_chr, "-")) if($RNASeq); + } + my $n = scalar @{$r_range_sclips}; + push @{$r_range_sclips}, [$r_range_sclips->[$n-1][0] + $cluster_size + 1, '+', 1, 50]; + push @{$r_range_sclips}, [$r_range_sclips->[$n-1][0] + $cluster_size + 1, '-', 1, 50]; + $n = $n + 2; + + # try to save the 1 base off problem of soft-clipping + my (@ss1, $p1, $c1); + my (@ss2, $p2, $c2); + my ($n1, $n2, $cover1, $cover2) = (0, 0, 0, 0); + for( my $i = 0; $i < $n; $i++) { + my $s = $r_range_sclips->[$i]; +# print join("\t", @{$s}), "\n" if($debug); + my $clip = $s->[1] eq '+' ? RIGHT_CLIP : LEFT_CLIP; + next if($s->[0] >= $chr_len ); + if($clip == RIGHT_CLIP) { + $p1 = $c1 = $s->[0] unless $p1; + if($s->[0] < $c1) { + print STDERR "The soft-clipping file has problem! It's either not sorted or + the file has mutliple parts for a chromsome!"; + last; + } + if($s->[0] - $c1 < $cluster_size) { + $n1 += $s->[2]; + $cover1 = $s->[3] if($cover1 < $s->[3]); + push @ss1, $s; + if($s->[0] < $c1) { + $p1 += $s->[0]; + $c1 = $p1 / scalar @ss1; + } + next; + } + } + else { + $p2 = $c2 = $s->[0] unless $p2; + if($s->[0] < $c2) { + print STDERR "The soft-clipping file has problem! It's either not sorted or + the file has mutliple parts for a chromsome!"; + last; + } + if($s->[0] - $p2 < $cluster_size) { + $n2 += $s->[2]; + $cover2 = $s->[3] if($cover2 < $s->[3]); + push @ss2, $s; + if($s->[0] < $c2) { + $p2 += $s->[0]; + $c2 = $p2 / scalar @ss2; + } + next; + } + } + + my @cc = $clip == LEFT_CLIP ? @ss2 : @ss1; + my $pos = $clip == LEFT_CLIP ? $ss2[$#ss2]->[0] : $ss1[0]->[0]; + my ($n_s, $cover_s) = $clip == LEFT_CLIP ? ($n2, $cover2) : ($n1, $cover1); + my $tmp_range = $clip == LEFT_CLIP ? [$ss2[0]->[0], $ss2[$#ss2]->[0] + $cluster_size] : + [$ss1[0]->[0] - $cluster_size, $ss1[$#ss1]->[0]]; + + if($clip == LEFT_CLIP) { + @ss2 = (); $p2 = $c2 = $s->[0]; + $n2 = $s->[2]; + $cover2 = $s->[3]; + push @ss2, $s; + } + else { + @ss1 = (); $p1 = $c1 = $s->[0]; + $n1 = $s->[2]; + $cover1 = $s->[3]; + push @ss1, $s; + } + + my @s_reads; + if($RNASeq) { +# print $n_s, "\n"; + next if($n_s < $min_sclip_reads && $cover_s > $s_cover[$n_s] ) ; + next if($n_s < $max_sclip_reads && $n_s * 100 <= $cover_s * $min_pct_sclip_reads); + my @f_genes = $f_tree->intersect($tmp_range); + my @r_genes = $r_tree->intersect($tmp_range); + next if(scalar @f_genes == 0 && scalar @r_genes == 0); + } + else { + if($n_s < $min_sclip_reads) { #too few covered + if($cover_s > $s_cover[$n_s]) { + next if(!$sensitive); + foreach my $c (@cc) { + next if($c->[0] >= $chr_len); + push @s_reads, get_sclip_reads(-SAM => $sam_d, + -CHR =>$chr, + -START => $c->[0], + -END => $c->[0], + -CLIP => $clip, + -MINCLIPLEN => 0, + -VALIDATOR => $validator, + -PAIRED => $paired, + -RMDUP => $rmdup, + -EXTRA => abs($c->[0] - $pos) ); + } + } + } + } + if(! @s_reads) { + foreach my $c (@cc) { + next if($c->[0] >= $chr_len); + push @s_reads, get_sclip_reads(-SAM => $sam_d, + -CHR =>$chr, + -START => $c->[0], + -END => $c->[0], + -CLIP => $clip, + -MINCLIPLEN => 0, + -VALIDATOR => $validator, + -PAIRED => $paired, + -RMDUP => $rmdup, + -EXTRA => abs($c->[0] - $pos) ); + } + } + my $l = 0; + foreach my $r (@s_reads) { + my $len = length $r->{seq}; + $l = $len if($l < $len); + } + next if ($l < $min_sclip_len); + print STDERR join("\t", ($chr, $pos, $clip == LEFT_CLIP ? "-" : "+", scalar @s_reads)), "\n"; + my @SV = detect_SV( + -SAM => $sam_d, + -CHR => $chr, + -POS => $pos, + -ORT => $clip == LEFT_CLIP ? "-" : "+", + -SCLIP => \%sclip, + -COVER => $cover_s, + -SREADS => \@s_reads); + foreach my $sv (@SV) { + $sv->{type} = $sv->type; + push @rtn, $sv if($sv && ! is_dup_SV(\@rtn, $sv)); + } + } + + if($RNASeq) { + push @rtn, find_del_SVs($sam_d, $chr, $start, $end); + } + return @rtn; +} + +sub is_dup_SV { + my($r_SVs, $sv) = @_; + foreach my $s (@{$r_SVs}) { + return 1 + if( $s->first_bp->{pos} == $sv->second_bp->{pos} && + $s->first_bp->{chr} eq $sv->second_bp->{chr} && + $s->second_bp->{pos} == $sv->first_bp->{pos} && + $s->second_bp->{chr} eq $sv->first_bp->{chr} && + $s->{type} == $sv->{type} ); + return 1 + if( $s->first_bp->{pos} == $sv->first_bp->{pos} && + $s->first_bp->{chr} eq $sv->first_bp->{chr} && + $s->second_bp->{pos} == $sv->second_bp->{pos} && + $s->second_bp->{chr} eq $sv->second_bp->{chr} && + $s->{type} == $sv->{type} ); + + } + return; +} + +sub search_sclip { + my ($r_sclip, $range) = @_; + my ($chr, $start, $end) = parse_range($range); + return $r_sclip->{$chr} if(!$start); + my $start_i = bin_search($r_sclip->{$chr}, $start); + my $end_i = bin_search($r_sclip->{$chr}, $end); + my @tmp = @{$r_sclip->{$chr}}; + if($start_i <= $end_i) { + @tmp = @tmp[$start_i .. $end_i]; + if($start_i == $end_i) { + return undef if($tmp[0]->[0] < $start || $tmp[0]->[0] > $end); + } + return \@tmp; + } + else { + return undef; + } +} + +sub bin_search { + my ($a, $p) = @_; + my ($s, $e) = (0, scalar(@{$a})-1); + my $m = int( ($s + $e)/2); + while(1) { + return $s if($a->[$s][0] >= $p); + return $e if($a->[$e][0] <= $p); + return $m if($a->[$m][0] == $p || ($a->[$m-1][0] < $p && $a->[$m+1][0] > $p)); + if($a->[$m][0] > $p) { + $e = $m; + } + else { + $s = $m; + } + $m = int( ($s+$e)/2 ); + } +} + +sub count_coverage { + my ($sam, $chr, $pos, $clip) = @_; + if($rmdup && !$RNASeq) { + my @pairs; + my $seg = $sam->segment(-seq_id => $chr, -start => $pos, -end => $pos); + my $n = 0; + return 0 unless $seg; + my $itr = $seg->features(-iterator => 1); + while( my $a = $itr->next_seq) { + next unless($a->start && $a->end); #why unmapped reads here? + my $sclip_len = 0; + if($clip) { + my @cigar_array = @{$a->cigar_array}; + #$sclip_len = $1 if($a->cigar_str =~ m/S(\d+)$/ && $clip == RIGHT_CLIP); + $sclip_len = $cigar_array[0]->[1] if($cigar_array[0]->[0] eq 'S' && $clip == RIGHT_CLIP); + #$sclip_len = $1 if($a->cigar_str =~ m/^S(\d+)/ && $clip == LEFT_CLIP); + $sclip_len = $cigar_array[$#cigar_array]->[1] if($cigar_array[$#cigar_array]->[0] eq 'S' && $clip == RIGHT_CLIP); + } + next if(@pairs > 0 && is_PCR_dup($a, \@pairs, $sclip_len)); + $n++; + return $n if( $n > $max_repetitive_cover); + push @pairs, [$a->start, $a->end, $a->mate_start ? $a->mate_start : 0, + $a->mate_end ? $a->mate_end : 0, $sclip_len]; + } + return $n; + } + else{ + my ($c) = $sam->features(-type => 'coverage', -seq_id=> $chr, + -start => $pos, -end => $pos); + return 0 unless $c; + my @c_d = $c->coverage; + return $c_d[0]; + } +} + +sub detect_SV { + my %param = @_; + my @s_reads = @{$param{-SREADS}}; + my($sam, $chr, $ort, $r_sclip, $coverage) = ( $param{-SAM}, + $param{-CHR}, $param{-ORT}, $param{-SCLIP}, $param{-COVER}); + my $clip = $ort eq '+' ? RIGHT_CLIP : LEFT_CLIP; + return if($coverage > $max_repetitive_cover); + my $fa_name = prepare_reads(\@s_reads, $clip); + my($contig_file, $sclip_count, $contig_reads) = $assembler->run($fa_name); + return if(!$contig_file or !(-e $contig_file)); + + $contig_file = clip_fa_file($contig_file, $clip); + my $contig_seqs = read_fa_file($contig_file); + if(scalar @s_reads == 1) { + $contig_file = clip_fa_file($fa_name, $clip); + $contig_seqs = read_fa_file($contig_file); + my @seq_names = keys %{$contig_seqs}; + $sclip_count->{$seq_names[0]} = 1; + } + + my @SV; + my $where = ($clip == LEFT_CLIP ? "right" : "left"); + my $mapping = $mapper->run( -QUERY => $contig_file ); + my (%reads, %quals, %s_lens, %pos); + foreach my $r (@s_reads) { + $reads{$r->{name}} = $r->{full_seq}; + $quals{$r->{name}} = $r->{full_qual}; + $s_lens{$r->{name}} = length($r->{seq}); + $pos{$r->{name}} = $r->{pos}; + } + + foreach my $s (keys(%{$mapping})) { + next if($sclip_count->{$s} < $min_sclip_reads); + + # try to find a better mapping + my @tmp_reads; + my $selected_read; + my @tmp_pos; + foreach my $n (@{$contig_reads->{$s}}) { + push @tmp_pos, $pos{$n}; + } + @tmp_pos = uniq @tmp_pos; + my $real_pos = $clip == LEFT_CLIP ? $tmp_pos[$#tmp_pos] : $tmp_pos[0]; + + my $n_new_SV = 0; + my $tmp_bp; + foreach my $t (@{$mapping->{$s}}) { + my $qort = $t->{qstrand}; + my $t_pos = $t->{tstart}; + $t_pos = $t->{tend} if( ($qort eq "+" && $clip == LEFT_CLIP) || + ($qort eq '-' && $clip == RIGHT_CLIP)); + if($t->{tchr} eq $chr && (abs($t_pos - $real_pos) < $min_dist_diff)) { # the soft clipping is incorrect + @SV = @SV[0 .. ($#SV - $n_new_SV + 1)] if($n_new_SV > 0); + last; + } + my $first_bp = {}; + $first_bp->{sc_reads} = $sclip_count->{$s}; + $first_bp->{$where . "_chr"} = $chr; + $first_bp->{$where . "_pos"} = $real_pos; + $first_bp->{all_pos} = \@tmp_pos; + $first_bp->{$where . "_ort"} = "+"; + $first_bp->{pos} = $real_pos; + $first_bp->{cover} = $coverage; + $first_bp->{chr} = $chr; + my $new_where = ($where eq "right" ? "left" : "right"); + + $first_bp->{$new_where . "_chr"} = $t->{tchr}; + $first_bp->{$new_where . "_pos"} = $t_pos; + $first_bp->{$new_where . "_ort"} = $qort; + $first_bp->{sc_seq} = $contig_seqs->{$s}; + $first_bp->{reads} = $contig_reads->{$s}; + + $tmp_bp = $first_bp unless $tmp_bp; + my $second_bp = check_sclip(-SAM => $sam, -TARGET => $t, -CHR => $chr, + -POS => $real_pos, -SCLIP => $r_sclip, -CLIP => $clip); + if(@{$second_bp}) { + foreach my $tmp_bp (@{$second_bp}) { + push @SV, StructVar->new(-FIRST_BP => $first_bp, -SECOND_BP => $tmp_bp); + $n_new_SV++; + } + } + } + # save some one side good soft-clipping SV + if( $rescue == 1 && + $n_new_SV == 0 && + $sclip_count->{$s} >= $min_one_side_reads && + (scalar(@{$mapping->{$s}}) == 1 || ($mapping->{$s}[0]{perfect} == 1 && $mapping->{$s}->[1]{perfect} == 0)) && + $tmp_bp->{chr} && + length($contig_seqs->{$s}) * 0.95 < $mapping->{$s}[0]{matches} ) { + + my %second_bp = %{$tmp_bp}; + $second_bp{sc_reads} = 0; + $second_bp{sc_seq} = ''; + ($second_bp{chr}, $second_bp{pos}) = $second_bp{pos} == $second_bp{left_pos} ? + ($second_bp{right_chr}, $second_bp{right_pos}) : ($second_bp{left_chr}, $second_bp{left_pos}); + $second_bp{cover} = count_coverage($sam, $second_bp{chr}, $second_bp{pos}); + $second_bp{reads} = []; + push @SV, StructVar->new(-FIRST_BP => $tmp_bp, -SECOND_BP => \%second_bp); + } + + } + system("rm $fa_name"); system("rm $fa_name*"); + return @SV; +} + +sub get_gene_range { + my ($chr, $pos, $clip) = @_; + my $r = $clip == LEFT_CLIP ? [$pos, $pos+5] : [$pos - 5, $pos]; + my @genes; + my $tmp = $chr; + $tmp = "chr" . $tmp if($tmp !~ m/chr/); + my ($f_tree, $r_tree) = ($gm->sub_model($tmp, "+"), $gm->sub_model($tmp, "-")); + push @genes, $f_tree->intersect($r); + push @genes, $r_tree->intersect($r); + my ($s, $e); + if(scalar @genes == 0) { #no gene here + return [$pos - $read_len, $pos + $read_len]; + } + foreach my $g (@genes) { + my $start = $g->val->get_start($pos, $read_len); + my $end = $g->val->get_end($pos, $read_len); + $s = $start if(!$s || $s > $start); + $e = $end if(!$e || $e < $end); + } + return [$s, $e]; +} + +sub check_sclip { + my %arg = @_; + my ($sam, $chr, $pos, $target, $r_sclip, $clip) = + ( $arg{-SAM}, $arg{-CHR}, $arg{-POS}, $arg{-TARGET}, $arg{-SCLIP}, $arg{-CLIP} ); + + my @bp; + + # identify the searching region for partner soft cliping + my $extension; + if($RNASeq) { + $extension = 50; + } + else { + $extension = $read_len - ($target->{qend} - $target->{qstart}); + } + my ($tpos, $ort) = ($target->{tend}, $target->{qstrand}); + $tpos = $target->{tstart} if( ($clip == LEFT_CLIP && $ort eq '-') + || ($clip == RIGHT_CLIP && $ort eq '+')); + $extension = abs($tpos - $pos) - 1 + if($chr eq $target->{tchr} && abs($tpos - $pos) <= $extension); # don't consider itself + my $range =$target->{tchr} . ":"; + if($tpos < $extension) { + $range .= '1'; + } + else{ + $range .= $tpos - $extension; + } + + $range .= "-"; $range .= $tpos + $extension; + + # the orginal genome sequence where we find the soft_clipping + my $orig; + my $tmp = $chr; + #$tmp = 'chr' . $tmp if($tmp !~ m/^chr/); + $orig = $target_genome . ":" . $tmp . ":"; + my $base_pos; + if($RNASeq) { #let's do a wild guess! + my $r = get_gene_range($tmp, $pos, $clip); + return \@bp unless $r; + $base_pos = $r->[0]; + $orig = join("", ($orig, $r->[0], "-", $r->[1])); + } + else { + $base_pos = $pos - $read_len; + $orig = join("", ($orig, $pos - $read_len, "-", $pos + $read_len)); + } + + return \@bp if(!exists( $r_sclip->{$target->{tchr}})); #mapped to chrY or other minor chrs + # the soft clipping happens in highly repetitive region + # return \@bp if($coverage > $max_repetitive_cover); + + my $r_pp = search_sclip($r_sclip, $range); + return \@bp if(!$r_pp); + + my $real_pos; + my $found = 0; + my @r_reads; + my @l_reads; + foreach my $s (@{$r_pp}) { + #next if($s->[2] < $min_hit_reads); + next if($chr ne $target->{tchr} && ( + ($clip == LEFT_CLIP && $ort ne $s->[1] ) || + ($clip == RIGHT_CLIP && $ort eq $s->[1] )) ); + + next if($chr eq $target->{tchr} && ($s->[0] < $tpos - $extension + || $s->[0] > $tpos + $extension)); + my $tort = $s->[1]; + next if($s->[3] > $max_repetitive_cover); + my $tclip = $tort eq '+' ? RIGHT_CLIP : LEFT_CLIP; + my @reads = get_sclip_reads( + -SAM => $sam, + -CHR => $target->{tchr}, + -START => $s->[0], + -END => $s->[0], + -CLIP => $tclip, + -MINCLIPLEN => $min_hit_len, + -VALIDATOR => $validator, + -PAIRED => $paired, + -RMDUP => $rmdup); + next if(!@reads || scalar(@reads) == 0); +# push @r_reads, @reads if($tclip == RIGHT_CLIP); +# push @l_reads, @reads if($tclip == LEFT_CLIP); +# } + +# foreach my $tclip ( (RIGHT_CLIP, LEFT_CLIP)) { +# my $s_reads = $tclip == RIGHT_CLIP ? \@r_reads : \@l_reads; +# next if(scalar @{$s_reads} == 0); + my %count; +# my $fa_name = prepare_reads($s_reads, $tclip); + my $fa_name = prepare_reads(\@reads, $tclip); + + my($contig_file, $sclip_count, $contig_reads, $singlet_file) = $assembler->run($fa_name); + my (%reads, %quals, %s_lens, %pos); +# foreach my $r (@{$s_reads}) { + foreach my $r (@reads) { + $reads{$r->{name}} = $r->{full_seq}; + $quals{$r->{name}} = $r->{full_qual}; + $s_lens{$r->{name}} = length($r->{seq}); + $pos{$r->{name}} = $r->{pos}; + } + + $contig_file = clip_fa_file($contig_file, $tclip); + if( -s $singlet_file ) { + $singlet_file = clip_fa_file($singlet_file, $tclip); + system("cat $singlet_file >> $contig_file"); + system("rm $singlet_file"); + } + my $seqs = read_fa_file($contig_file); + my $hits = $aligner->run( -TARGET => $orig, -QUERY => $contig_file); + foreach my $t (keys %{$hits}) { + my $h = $hits->{$t}; + my $tmp_bp; + my @all_pos; + my $all_reads_name; + if(exists $contig_reads->{$t}) { + foreach my $n (@{$contig_reads->{$t}}) { + push @all_pos, $pos{$n}; + } + $all_reads_name = $contig_reads->{$t}; + @all_pos = uniq @all_pos; + @all_pos = sort {$a <=> $b} @all_pos; + + } + else { + push @all_pos, $pos{$t}; + $all_reads_name = [$t]; + } + + if(is_good_hit($h, $clip, $tclip)) { + my $hit_ort = ($h->strand('query') == 1 ? "+" : "-"); + my $real_pos = $tclip == RIGHT_CLIP ? $all_pos[0] : $all_pos[$#all_pos]; + if($tclip == RIGHT_CLIP ) { + $tmp_bp = { + left_ort => '+', + left_pos => $real_pos, + left_chr => $target->{tchr}, + right_ort => $hit_ort, + right_pos => ($hit_ort eq "+" ? $h->start("hit") : $h->end("hit")) + $base_pos, + right_chr => $chr, + } + } + else { + $tmp_bp = { + left_ort => $hit_ort, + left_chr => $chr, + left_pos => ($hit_ort eq "+" ? $h->end("hit") : $h->start("hit")) + $base_pos, + right_ort => "+", + right_chr => $target->{tchr}, + right_pos => $real_pos, + } + } + $tmp_bp->{chr} = $target->{tchr}; + $tmp_bp->{pos} = $real_pos; + $tmp_bp->{all_pos} = \@all_pos; + $tmp_bp->{sc_seq} = $seqs->{$t}; + $tmp_bp->{cover} = count_coverage($sam, $target->{tchr}, $real_pos); + $tmp_bp->{sc_reads} = exists $sclip_count->{$t} ? $sclip_count->{$t} : 1; + $tmp_bp->{reads} = exists $contig_reads->{$t} ? $contig_reads->{$t} : [$t]; + + push @bp, $tmp_bp; + } + } + system("rm $fa_name"); system("rm $fa_name*"); + } + return \@bp; +} + +# many more filter can be added here +sub is_good_hit { + my ($hit, $clip, $tclip) = @_; + + return if($hit->length_aln('query') < $min_hit_len); + return if($hit->frac_identical * 100 < $min_percent_id ); + return 1; + + # do we want to do the check? + my $ort = ($hit->start('query') < $hit->end('query') ? "+" : "-"); + my ($dist, $t_dist, $qstart, $qend); + $qstart = $ort eq '+' ? $hit->start('query') : $hit->end('query') ; + $qend = $ort eq '+' ? $hit->end('query') : $hit->start('query'); + $t_dist = $qstart; + $t_dist = $hit->query_length - $qend if($tclip == LEFT_CLIP); + +} + +# RNASeq support to identify deletions + +sub cigar2hit { + my ($s, @cigar_array) = @_; + my @pos; + my $p = $s; + foreach my $c (@cigar_array){ + my($op, $len) = @{$c}; + if($op eq 'N') { + push @pos, [$s, $p]; + $s = $p + $len; + $p = $s; + next; + } + $p += $len if($op eq 'M' || $op eq 'D'); + } + push @pos, [$s, $p]; + return \@pos; +} + +sub find_del_SVs { + my ($sam, $chr, $start, $end) = @_; + my $max_sclip_len = 0; + my $range = $chr; + if($start && $end) { + $range = $chr . ":" . $start . "-" . $end; + } + my %SV; + my $tmp = $chr; + $tmp = "chr" . $chr if($chr !~ m/chr/); + + my($f_tree, $r_tree) = ($gm->sub_model($tmp, "+"), $gm->sub_model($tmp, "-")); + + $sam->fetch($range, sub { + my $a = shift; + my $cigar_str = $a->cigar_str; +# print $a->qname, "\t", $cigar_str, "\n"; + return unless $cigar_str =~ m/N/; # the read cross two genes must have N +# return unless $a->score >= 97; + my $sv = identify_del_SV($f_tree, $a); + if($sv){ + my $tmp1 = $sv->[0] . "+" . $sv->[1]; + if(!exists($SV{$tmp1})) { + $SV{$tmp1} = {}; + $SV{$tmp1}->{num} = 0; + $SV{$tmp1}->{left} = 0; + $SV{$tmp1}->{right} = 0; + } + $SV{$tmp1}->{num}++; + $SV{$tmp1}->{left} = $sv->[5] if($SV{$tmp1}->{left} < $sv->[5]); + $SV{$tmp1}->{right} = $sv->[6] if($SV{$tmp1}->{right} < $sv->[6]); + $SV{$tmp1}->{gene_ort} = "+"; + $SV{$tmp1}->{left_gene} = $sv->[3]; + $SV{$tmp1}->{right_gene} = $sv->[4]; + push @{$SV{$tmp1}->{reads}}, $a->qname; + + } + $sv = identify_del_SV($r_tree, $a); + if($sv){ + my $tmp1 = $sv->[0] . "-" . $sv->[1]; + if(!exists($SV{$tmp1})) { + $SV{$tmp1} = {}; + $SV{$tmp1}->{num} = 0; + $SV{$tmp1}->{left} = 0; + $SV{$tmp1}->{right} = 0; + $SV{$tmp1}->{reads} = []; + } + $SV{$tmp1}->{num}++; + $SV{$tmp1}->{left} = $sv->[5] if($SV{$tmp1}->{left} < $sv->[5]); + $SV{$tmp1}->{right} = $sv->[6] if($SV{$tmp1}->{right} < $sv->[6]); + $SV{$tmp1}->{gene_ort} = "-"; + $SV{$tmp1}->{left_gene} = $sv->[3]; + $SV{$tmp1}->{right_gene} = $sv->[4]; + push @{$SV{$tmp1}->{reads}}, $a->qname; + } + } + ); + + my @rtn; + foreach my $sv (keys %SV) { + my( $pos1, $pos2 ) = $sv =~ m/(\d+)[+|-](\d+)/; + print $pos1, "\t", $pos2, "\n"; + my( $cover1, $cover2) = (count_coverage($sam, $chr, $pos1 - 1), count_coverage($sam, $chr, $pos2)); + next if($SV{$sv}->{num} < $min_sclip_reads); + my $cover = $cover1 < $cover2 ? $cover1 : $cover2; + next if($SV{$sv}->{num} * 100 < $cover * $min_pct_sclip_reads && $SV{$sv} < 50); + push @rtn, StructVar->new ( + -FIRST_BP => { left_ort => '+', + left_pos => $pos1 - 1, + left_chr => $chr, + right_ort => '+', + right_pos => $pos2, + right_chr => $chr, + chr => $chr, + cover => count_coverage($sam, $chr, $pos1 - 1), + pos => $pos1 - 1, + sc_reads => $SV{$sv}->{num}, + left_len => $SV{$sv}->{left}, + right_len => $SV{$sv}->{right}, + gene_ort => $SV{$sv}->{gene_ort}, + left_gene => $SV{$sv}->{left_gene}, + right_gene => $SV{$sv}->{right_gene}, + reads => $SV{$sv}->{reads}, + }, + -SECOND_BP => { left_ort => '+', + left_pos => $pos1 - 1, + left_chr => $chr, + right_ort => '+', + right_pos => $pos2, + right_chr => $chr, + chr => $chr, + pos => $pos2, + cover => count_coverage($sam, $chr, $pos2), + sc_reads => $SV{$sv}->{num}, + reads => [], + } + ); + } + return @rtn; +} + +sub identify_del_SV { + my ($tree, $a) = @_; + my @cigar_array = @{$a->cigar_array}; + return unless($a->start && $a->end); + return if(scalar $tree->intersect([$a->start, $a->end]) <= 1); + + my @hits = @{ cigar2hit($a->start, @cigar_array) }; + my @genes; + my $last_gene; + my @change; + my ($left, $right) = (0, 0); + for( my $i = 0; $i < scalar @hits; $i++) { + my $h = $hits[$i]; + my @g = $tree->intersect($h); + my $g_size = scalar @g; + if($g_size == 0) { +# print STDERR $a->qname, " is mapped outside of gene\n"; + next; + } + if( scalar @g == 1) { + push @genes, $g[0] unless ($last_gene && $last_gene->val->name eq $g[0]->val->name); + if($last_gene && $last_gene->val->name ne $g[0]->val->name) { + $right += ($h->[1] - $h->[0] ); + push @change, $i; + } + $left += ($h->[1] - $h->[0] ) if($right == 0); + $last_gene = $g[0]; + next; + } +# print STDERR $a->qname, " connect two genes into one?\n"; + return; + } + return if(scalar @genes <= 1); + if(scalar @genes > 2) { +# print STDERR $a->qname, "crossed more than 2 genes!\n"; + return; + } + # now we down to 2 genes + my ($left_gene, $right_gene) = @genes; + return unless $left_gene->val->overlap([$hits[$change[0]-1]->[0], $hits[$change[0]-1]->[1]]); + return unless $right_gene->val->overlap([$hits[$change[0]]->[0], $hits[$change[0]]->[1]]); + my $p = $left_gene->val->end; + print join("\t", ( $a->score, + $a->cigar_str, $left, $right, $hits[$change[0]-1]->[1], $hits[$change[0]]->[0], + $left_gene->val->name, $right_gene->val->name)), "\n"; + return [$hits[$change[0]-1]->[1], $hits[$change[0]]->[0], $a, $left_gene->val, $right_gene->val, $left, $right]; +} + +=head1 NAME + +CREST.pl - a Structure Variation detection tools using softclipping for +whole genome sequencing. + + +=head1 VERSION + +This documentation refers to CREST.pl version 0.0.1. + + +=head1 USAGE + + This program depends on several things that need to be installed and/or + specified. The program uses BioPerl and Bio::DB::Sam module to parse + the files and bam files. Also it uses Blat software suits to do genome + mapping and alignment. To make the program efficient, it also requires + a blat server setup. And the program uses CAP3 assembler. + + Identify somatic SVs from input: + CREST.pl -f somatic.cover -d tumor.bam -g germline.bam + Identify SVs from a single bam compared wtih reference genome: + CREST.pl -f sclip.cover -d sample.bam + Identify SVs from a single bam on chr1 only + CREST.pl -f sclip.cover -d sample.bam -r chr1 + +=head1 REQUIRED ARGUMENTS + + To run the program, several parameter must specified. + -d, --input_d: The input (diagnositic) bam file + -f, --sclipfile: The soft clipping information generated by extractSClip.pl + --ref_genome: The reference genome file in fa format + -t, --target_genome: The 2bit genome file used by blat server and blat + --blatserver: The blat server name or ip address + --blatport: The blat server port + +=head1 OPTIONS + + The options that can be used for the program. + -g, --input_g The germline (paired) bam file + -p, --prefix The output prefix, default is the input bam file name + -o, --out_dir Output directory, default is the working directory + -l, --read_len The read length of the sequencing data, defaut 100 + --sensitive The program will generate more SVs with higher false positive rate. + + --(no)scratch Use the scratch space, default is off. + --(no)paired Use paired reads or not, defafult is on, so change to --nopaired for unpaired reads. + --cap3 CAP3 executable position, default cap3 + --cap3opt CAP3 options, single quoted, Default ' > /dev/null' + --blatclient gfClient excutable position, default gfClient + --blatclientopt gfClient options, single quoted, default '-out=psl -nohead' + --blat blat executable potion, default balt + --blatopt blat options, single quoted, default '-tileSize=7 -stepSize=1 -out=psl -minScore=15' + + -r, --range The range where SV will be detected, using chr1:100-200 format. + --max_score_diff The maximum score difference when stopping select hit, default 10. + --min_sclip_reads Minimum number of soft clipping read to triger the procedure, default 3. + --max_rep_cover The min number of coverage to be called as repetitive and don't triger + the procedure, default 500. + --min_sclip_len The min length of soft clipping part at a position to triger the detection, + default 20. + --min_hit_len Min length of a hit for genome mapping + --min_dist_diff Min distance between the mapped position and the soft clipping position, default 20. + --(no)rmdup Remove PCR dumplicate. Default remove. + + --min_percent_id Min percentage of identity of soft clipping read mapping, default 90 + --min_percent_hq Min percentage of high quality base in soft clipping reads, default 80 + --lowqual_cutoff Low quality cutoff value, default 20. + + -h, --help, -? Help information + --man Man page. + --usage Usage information. + --version Software version. + + --(no)rm_tandem_repeat Remove tandem repeat caused SV events, default is ON. When it's on ptrfinder program + need to be on the path. + --tr_max_indel_size Maximum tandem repeat mediated INDEL events, default 100 + --tr_min_size Minimum tandem reapet size, default 2 + --tr_max_size Maximum tandem repeat size, default 8 + --tr_min_num Minimum tandem repeat number, defaut 4 + --min_percent_cons_of_read Minimum percent of consensus length of read length, default 0.75 + --max_bp_dist Maximum distance between two idenfitifed break points, default 15 + --germline_seq_width Half window width of genomic sequence around break point for germline SV filtering, + default 100 + --germline_search_width Half window width for seaching soft-clipped reads around breakpoint for germline SV + filtering, default 50. + + --hetero_factor The factor about the SV's heterogenirity and heterozygosity, default 0.4. + --triger_p_value The p-value that will triger the SV detection when number of soft-clipped reads is small, + defaut 0.05. + + --(no)rescue Use rescue mode, when it's on, the a SV with only 1 side with enough soft-clipped reads + is considered as a valid one instead of rejecting it. Default on. + --min_one_side_reads When rescure mode is on, the minimum number of soft-clipped reads on one side, default 5. + + --RNASeq RNAseq mode, default off + --genemodel A gene model file, currently only refFlat format (BED) is supported. Requried for RNASeq. + --cluster_size The soft-clipped reads within cluster_size will be considered together, default is 3, RNAseq mode + only. + + +=head1 DESCRIPTION + +This is a program designed to identify Structure Variations (SVs) using soft +clipping reads. With the improvement of next-gen sequencing techniques, +both the coverage and length of pair-end reads have been increased steady. +Many SV detection software availalbe uses pair-end information due to the +limitted read length. Now 100bp is pretty common and many reads will cross +the break points. Some mapping programs ( bwa, etc ) have the ability to +identify so called soft-clipping reads. A soft-clipping read is a read that +different part can be mapped to different genomic posiiton, but the read can +be uniquely positioned using the mate-pair position and the insert length. +So this program use those reads to do an assembly-mapping-assembly-alignment +procedure to identify potential structure variiations. + + +=head1 DIAGNOSTICS + +A list of every error and warning message that the application can generate +(even the ones that will "never happen"), with a full explanation of each +problem, one or more likely causes, and any suggested remedies. If the +application generates exit status codes (e.g., under Unix), then list the exit +status associated with each error. + +=head1 CONFIGURATION AND ENVIRONMENT + +The program is designed to use under a cluster or high performance computing +environment since it's dealing with over 100G input data. The program can be +used as highly parallellized. And the program is developped under linux/unix. + + +=head1 DEPENDENCIES + +The program depend on several packages: +1. Bioperl perl module. +2. Bio::DB::Sam, version 1.5 or later, it requires samtools lib installed. +3. blat suits, include blat, gfClient, gfServer. +4. CAP3 assembly program. + + +=head1 BUGS AND LIMITATIONS + +There are no known bugs in this module, but the method is limitted to bam file +that has soft-clipping cigar string generated.Please report problems to +Jianmin Wang (Jianmin.Wang@stjude.org) +Patches are welcome. + +=head1 AUTHOR + +Jianmin Wang (Jianmin.Wang@stjude.org) + + +=head1 LICENCE AND COPYRIGHT + +Copyright (c) 2010 by St. Jude Children's Research Hospital. + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>.