Mercurial > repos > jjohnson > crest
view CREST.pl @ 1:4f6952e0af48 default tip
CREST - add crest.loc.sample
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Wed, 08 Feb 2012 16:08:01 -0600 |
parents | acc8d8bfeb9a |
children |
line wrap: on
line source
#!/usr/bin/perl -w #use lib '/home/jwang2/AssembleTest/Detect/nonSJsrc/dev'; use strict; use Carp; use Getopt::Long; use English; use Pod::Usage; use Data::Dumper; use Bio::DB::Sam; use Bio::DB::Sam::Constants; use Bio::SearchIO; use Bio::SeqIO; use File::Temp qw/ tempfile tempdir /; use File::Spec; use File::Path; use File::Copy; use File::Basename; use List::MoreUtils qw/ uniq /; use Cwd; use SCValidator qw($lowqual_cutoff $min_percent_id $min_percent_hq LEFT_CLIP RIGHT_CLIP); use SVUtil qw($use_scratch $work_dir clip_fa_file prepare_reads parse_range get_input_bam find_smallest_cover get_work_dir is_PCR_dup read_fa_file get_sclip_reads); use StructVar; require SVExtTools; use Transcript; use Gene; use GeneModel; my $debug = 1; $min_percent_id = 90; # input/output my ( $out_dir, $out_prefix, $range, $bam_d, $bam_g, $sclip_file); my $out_suffix = "predSV.txt"; my $read_len = 100; my $ref_genome ; my $hetero_factor = 0.4; my $triger_p_value = 0.05; #RNASeq support my $RNASeq = 0; my $gene_model_file; my $gm_format = "REFFLAT"; my $gm; my $max_sclip_reads = 50; # we will consider the position if we have enough sclipped reads my $min_one_side_reads = 5; my $min_pct_sclip_reads = 5; # we require at least 5% reads have softclipping # external programs related variable # 1. cap3 related variables my $cap3 = "cap3"; my $cap3_options=" -h 70 -y 10 > /dev/null"; # 2 blat related variables, using blat server and blat standalone my $blat_client_exe = "gfClient"; my $blat_client_options = ' -out=psl -nohead -minIdentity=95 -maxIntron=5'; my $target_genome = "hg18.2bit"; my $dir_2bit = "/"; my $blat_server = "sjblat"; my $blat_port = 50000; my $blat_exe = "blat"; my $blat_options = " -tileSize=7 -stepSize=1 -out=psl -minScore=15 -noHead -maxIntron=1 "; $use_scratch = 0; my $paired = 1; my $rescue = 1; # other options my ($min_sclip_reads, $max_repetitive_cover, $min_sclip_len ); my $min_hit_reads; my $rmdup; my $min_clip_len; my ($max_score_diff, $max_num_hits); my ($min_dist_diff, $min_hit_len) = (15, 15); #SV filter related parameters my $max_bp_dist = 15; my $min_percent_cons_of_read = 0.75; my $germline_seq_width = 100; my $germline_search_width = 50; my $rm_tandem_repeat = 1; #tandem repeat mediated events my $tr_max_indel_size = 100; #tandem repeat mediated indel size my $tr_min_size = 2; my $tr_max_size = 8; #tandem repeat max_size of single repeat my $tr_min_num = 4; #tandem repeat minimum number of occurence # verification pupuse my $verify_file; my $sensitive; my $cluster_size; # common help options my ( $help, $man, $version, $usage ); my $optionOK = GetOptions( # SV validation related parameters 'max_bp_dist=i' => \$max_bp_dist, 'min_percent_cons_of_read=f' => \$min_percent_cons_of_read, 'germline_seq_width=i' => \$germline_seq_width, 'germline_search_width=i' => \$germline_search_width, # tandem repeat related parameters 'rm_tandem_repeat!' => \$rm_tandem_repeat, 'tr_max_indel_size=i' => \$tr_max_indel_size, 'tr_min_size=i' => \$tr_min_size, 'tr_max_size=i' => \$tr_max_size, 'tr_min_num=i' => \$tr_min_num, 'hetero_factor=f' => \$hetero_factor, 'triger_p_value=f' => \$triger_p_value, # input/output #'s|sample=s' => \$sample, 'd|input_d=s' => \$bam_d, 'g|inpug_g=s' => \$bam_g, 'f|sclipfile=s' => \$sclip_file, 'p|prefix=s' => \$out_prefix, 'o|out_dir=s' => \$out_dir, 'l|read_len=i' => \$read_len, 'ref_genome=s' => \$ref_genome, 'v|verify_file=s' => \$verify_file, 'sensitive' => \$sensitive, 'paired!' => \$paired, 'rescue!' => \$rescue, 'cluster_size=i' => \$cluster_size, # external programs location and options 'scratch!' => \$use_scratch, 'cap3=s' => \$cap3, 'cap3opt=s' => \$cap3_options, 'blatclient=s' => \$blat_client_exe, 'blatclientopt=s' => \$blat_client_options, 'blat=s' => \$blat_exe, 't|target_genome=s' => \$target_genome, 'blatopt=s' => \$blat_options, 'blatserver=s' => \$blat_server, 'blatport=i' => \$blat_port, '2bitdir=s' => \$dir_2bit, #RNAseq support 'RNASeq' => \$RNASeq, 'genemodel=s' => \$gene_model_file, 'gmformat=s' => \$gm_format, #other related parameters 'r|range=s' => \$range, 'max_score_diff=i' => \$max_score_diff, 'm|min_sclip_reads=i' => \$min_sclip_reads, 'min_one_side_reads=i' => \$min_one_side_reads, 'max_rep_cover=i' => \$max_repetitive_cover, 'min_sclip_len=i' => \$min_sclip_len, 'min_hit_len=i' => \$min_hit_len, 'min_dist_diff=i' => \$min_dist_diff, 'min_hit_reads=i' => \$min_hit_reads, 'rmdup!' => \$rmdup, # soft_clipping related parameters (from SVDector package) 'min_percent_id=i' => \$min_percent_id, 'min_percent_hq=i' => \$SCValidator::min_percent_hq, 'lowqual_cutoff=i' => \$lowqual_cutoff, # common help parameters 'h|help|?' => \$help, 'man' => \$man, 'usage' => \$usage, 'version' => \$version, ); pod2usage(-verbose=>2) if($man or $usage); pod2usage(1) if($help or $version ); my $start_dir = getcwd; # figure out input file if(!$bam_d) { pod2usage(1); croak "You need specify input bam file(s)"; } if(!$sclip_file) { pod2usage(1); croak "You need to specify the softclipping file"; } $sclip_file = File::Spec->rel2abs($sclip_file); if(!$ref_genome) { pod2usage(1); croak "You need to specify the reference genome used by bam file"; } croak "The file you sepcified does not exist" unless ( -e $sclip_file && -e $bam_d && -e $ref_genome && -e $target_genome); my $input_base; $bam_d = File::Spec->rel2abs($bam_d); $input_base = fileparse($bam_d); $bam_g = File::Spec->rel2abs($bam_g) if($bam_g); #RNASeq support if($RNASeq) { croak "You need specify the input gene model file" unless ($gene_model_file); StructVar->add_RNASeq_filter(); $min_sclip_reads = 10 unless $min_sclip_reads; $max_repetitive_cover = 5000 unless $max_repetitive_cover; $min_sclip_len = 20 unless $min_clip_len; $max_score_diff = 5 unless $max_score_diff; $max_num_hits = 3 unless $max_num_hits; $min_hit_reads = 5 unless $min_hit_reads; $cluster_size = 5 unless $cluster_size; } else { $min_sclip_reads = 3 unless $min_sclip_reads; $max_repetitive_cover = 500 unless $max_repetitive_cover; $min_sclip_len = 20 unless $min_clip_len; $max_score_diff = 5 unless $max_score_diff; $max_num_hits = 10 unless $max_num_hits; $min_hit_reads = 1 unless $min_hit_reads; $cluster_size = 1 unless $cluster_size; } if($gene_model_file) { $gm = GeneModel->new if($gene_model_file); $gm->from_file($gene_model_file, $gm_format); StructVar->gene_model($gm); } # set up the external programs and validators # Those variable will be global my $assembler = Assembler->new( -PRG => $cap3, -OPTIONS => $cap3_options ); my $mapper = Mapper->new( -PRG => join(' ', ($blat_client_exe, $blat_server, $blat_port)), -OPTIONS => $blat_client_options, -BIT2_DIR => $dir_2bit, -MAX_SCORE_DIFF => $max_score_diff, -MAX_NUM_HITS => $max_num_hits, ); my $aligner = Aligner->new( -PRG => $blat_exe, -OPTIONS => $blat_options, ); StructVar->assembler($assembler); StructVar->read_len($read_len); StructVar->aligner($aligner); StructVar->mapper($mapper); StructVar->RNASeq(1) if($RNASeq); StructVar->genome($target_genome); StructVar->remove_filter("tandem_repeat") unless($rm_tandem_repeat); StructVar->tr_max_indel_size($tr_max_indel_size); StructVar->tr_min_size($tr_min_size); StructVar->tr_max_size($tr_max_size); StructVar->tr_min_num($tr_min_num); StructVar->max_bp_dist($max_bp_dist) if($max_bp_dist); StructVar->germline_seq_width($germline_seq_width) if($germline_seq_width); StructVar->germline_search_width($germline_search_width) if($germline_search_width); my $validator = SCValidator->new(); $validator->remove_validator('strand_validator') if(!$paired); #setup output and working directory $out_dir = getcwd if(!$out_dir); mkdir $out_dir if(!-e $out_dir || ! -d $out_dir); $work_dir = get_work_dir(-SCRATCH => $use_scratch); $work_dir = $out_dir if(!$work_dir); $use_scratch = undef if($work_dir eq $out_dir); # don't erase the out_dir chdir($work_dir); # figure out output prefix if(!$out_prefix) { $out_prefix = $input_base; } my $sam_d = Bio::DB::Sam->new( -bam => $bam_d, -fasta => $ref_genome); StructVar->sam_d($sam_d); my $sam_g; if($bam_g) { $sam_g = Bio::DB::Sam->new( -bam => $bam_g); StructVar->sam_g($sam_g); } #my $output_file = File::Spec->catfile($outdir, $out_prefix . $out_suffix); my $output_file; $output_file = join('.', $out_prefix, $out_suffix); $output_file = File::Spec->catfile($out_dir, $output_file); # the softclip file is sorted, so no need to re-sort it open my $SCLIP, "<$sclip_file" or croak "can't open $sclip_file:$OS_ERROR"; my %sclip; my $pre_p; while( my $line = <$SCLIP> ) { chomp $line; my ($chr, $pos, $ort, $cover, $C) = split /\t/, $line; if( ! exists($sclip{$chr})) { $sclip{$chr} = []; $pre_p = $pos; } $C = 30 unless $C; if($pos < $pre_p) { print STDERR "The input file is not sorted!"; exit(1); } push @{$sclip{$chr}}, [$pos, $ort, $cover, $C]; } close $SCLIP; my @final_SV; my @s_cover = @{find_smallest_cover($min_sclip_reads, $hetero_factor, $triger_p_value)}; if($range) { @final_SV = detect_range_SVs($sam_d, $range, \%sclip, \@s_cover); } else { foreach my $chr (keys %sclip) { my @tmpSV = detect_range_SVs($sam_d, $chr, \%sclip, \@s_cover); @final_SV = @tmpSV unless (@final_SV); foreach my $sv (@tmpSV) { push @final_SV, $sv if($sv && ! is_dup_SV(\@final_SV, $sv)); } } } undef %sclip; open my $OUT, ">$output_file" or croak "can't open $output_file:$OS_ERROR"; foreach my $SV (@final_SV) { if($SV->filter) { print $OUT $SV->to_string ; if($RNASeq) { print $OUT "\t", join("\t", @{$SV->get_genes}); } print $OUT "\n"; } } chdir $start_dir; exit(0); sub detect_range_SVs { my ($sam_d, $range, $r_sclips, $r_scover) = @_; my ($chr, $start, $end) = parse_range($range); my ($tid) = $sam_d->header->parse_region($chr); my $chr_len = $sam_d->header->target_len->[$tid]; my $r_range_sclips = search_sclip($r_sclips, $range); return unless $r_range_sclips; my @scover = @{$r_scover}; my @rtn; my ($f_tree, $r_tree); if($RNASeq) { my $full_chr = $chr; my ($gm_chr) = $gm->get_all_chr; $full_chr = "chr" . $chr if($chr !~ m/chr/ && $gm_chr =~ /chr/); ($f_tree, $r_tree) = ($gm->sub_model($full_chr, "+"), $gm->sub_model($full_chr, "-")) if($RNASeq); } my $n = scalar @{$r_range_sclips}; push @{$r_range_sclips}, [$r_range_sclips->[$n-1][0] + $cluster_size + 1, '+', 1, 50]; push @{$r_range_sclips}, [$r_range_sclips->[$n-1][0] + $cluster_size + 1, '-', 1, 50]; $n = $n + 2; # try to save the 1 base off problem of soft-clipping my (@ss1, $p1, $c1); my (@ss2, $p2, $c2); my ($n1, $n2, $cover1, $cover2) = (0, 0, 0, 0); for( my $i = 0; $i < $n; $i++) { my $s = $r_range_sclips->[$i]; # print join("\t", @{$s}), "\n" if($debug); my $clip = $s->[1] eq '+' ? RIGHT_CLIP : LEFT_CLIP; next if($s->[0] >= $chr_len ); if($clip == RIGHT_CLIP) { $p1 = $c1 = $s->[0] unless $p1; if($s->[0] < $c1) { print STDERR "The soft-clipping file has problem! It's either not sorted or the file has mutliple parts for a chromsome!"; last; } if($s->[0] - $c1 < $cluster_size) { $n1 += $s->[2]; $cover1 = $s->[3] if($cover1 < $s->[3]); push @ss1, $s; if($s->[0] < $c1) { $p1 += $s->[0]; $c1 = $p1 / scalar @ss1; } next; } } else { $p2 = $c2 = $s->[0] unless $p2; if($s->[0] < $c2) { print STDERR "The soft-clipping file has problem! It's either not sorted or the file has mutliple parts for a chromsome!"; last; } if($s->[0] - $p2 < $cluster_size) { $n2 += $s->[2]; $cover2 = $s->[3] if($cover2 < $s->[3]); push @ss2, $s; if($s->[0] < $c2) { $p2 += $s->[0]; $c2 = $p2 / scalar @ss2; } next; } } my @cc = $clip == LEFT_CLIP ? @ss2 : @ss1; my $pos = $clip == LEFT_CLIP ? $ss2[$#ss2]->[0] : $ss1[0]->[0]; my ($n_s, $cover_s) = $clip == LEFT_CLIP ? ($n2, $cover2) : ($n1, $cover1); my $tmp_range = $clip == LEFT_CLIP ? [$ss2[0]->[0], $ss2[$#ss2]->[0] + $cluster_size] : [$ss1[0]->[0] - $cluster_size, $ss1[$#ss1]->[0]]; if($clip == LEFT_CLIP) { @ss2 = (); $p2 = $c2 = $s->[0]; $n2 = $s->[2]; $cover2 = $s->[3]; push @ss2, $s; } else { @ss1 = (); $p1 = $c1 = $s->[0]; $n1 = $s->[2]; $cover1 = $s->[3]; push @ss1, $s; } my @s_reads; if($RNASeq) { # print $n_s, "\n"; next if($n_s < $min_sclip_reads && $cover_s > $s_cover[$n_s] ) ; next if($n_s < $max_sclip_reads && $n_s * 100 <= $cover_s * $min_pct_sclip_reads); my @f_genes = $f_tree->intersect($tmp_range); my @r_genes = $r_tree->intersect($tmp_range); next if(scalar @f_genes == 0 && scalar @r_genes == 0); } else { if($n_s < $min_sclip_reads) { #too few covered if($cover_s > $s_cover[$n_s]) { next if(!$sensitive); foreach my $c (@cc) { next if($c->[0] >= $chr_len); push @s_reads, get_sclip_reads(-SAM => $sam_d, -CHR =>$chr, -START => $c->[0], -END => $c->[0], -CLIP => $clip, -MINCLIPLEN => 0, -VALIDATOR => $validator, -PAIRED => $paired, -RMDUP => $rmdup, -EXTRA => abs($c->[0] - $pos) ); } } } } if(! @s_reads) { foreach my $c (@cc) { next if($c->[0] >= $chr_len); push @s_reads, get_sclip_reads(-SAM => $sam_d, -CHR =>$chr, -START => $c->[0], -END => $c->[0], -CLIP => $clip, -MINCLIPLEN => 0, -VALIDATOR => $validator, -PAIRED => $paired, -RMDUP => $rmdup, -EXTRA => abs($c->[0] - $pos) ); } } my $l = 0; foreach my $r (@s_reads) { my $len = length $r->{seq}; $l = $len if($l < $len); } next if ($l < $min_sclip_len); print STDERR join("\t", ($chr, $pos, $clip == LEFT_CLIP ? "-" : "+", scalar @s_reads)), "\n"; my @SV = detect_SV( -SAM => $sam_d, -CHR => $chr, -POS => $pos, -ORT => $clip == LEFT_CLIP ? "-" : "+", -SCLIP => \%sclip, -COVER => $cover_s, -SREADS => \@s_reads); foreach my $sv (@SV) { $sv->{type} = $sv->type; push @rtn, $sv if($sv && ! is_dup_SV(\@rtn, $sv)); } } if($RNASeq) { push @rtn, find_del_SVs($sam_d, $chr, $start, $end); } return @rtn; } sub is_dup_SV { my($r_SVs, $sv) = @_; foreach my $s (@{$r_SVs}) { return 1 if( $s->first_bp->{pos} == $sv->second_bp->{pos} && $s->first_bp->{chr} eq $sv->second_bp->{chr} && $s->second_bp->{pos} == $sv->first_bp->{pos} && $s->second_bp->{chr} eq $sv->first_bp->{chr} && $s->{type} == $sv->{type} ); return 1 if( $s->first_bp->{pos} == $sv->first_bp->{pos} && $s->first_bp->{chr} eq $sv->first_bp->{chr} && $s->second_bp->{pos} == $sv->second_bp->{pos} && $s->second_bp->{chr} eq $sv->second_bp->{chr} && $s->{type} == $sv->{type} ); } return; } sub search_sclip { my ($r_sclip, $range) = @_; my ($chr, $start, $end) = parse_range($range); return $r_sclip->{$chr} if(!$start); my $start_i = bin_search($r_sclip->{$chr}, $start); my $end_i = bin_search($r_sclip->{$chr}, $end); my @tmp = @{$r_sclip->{$chr}}; if($start_i <= $end_i) { @tmp = @tmp[$start_i .. $end_i]; if($start_i == $end_i) { return undef if($tmp[0]->[0] < $start || $tmp[0]->[0] > $end); } return \@tmp; } else { return undef; } } sub bin_search { my ($a, $p) = @_; my ($s, $e) = (0, scalar(@{$a})-1); my $m = int( ($s + $e)/2); while(1) { return $s if($a->[$s][0] >= $p); return $e if($a->[$e][0] <= $p); return $m if($a->[$m][0] == $p || ($a->[$m-1][0] < $p && $a->[$m+1][0] > $p)); if($a->[$m][0] > $p) { $e = $m; } else { $s = $m; } $m = int( ($s+$e)/2 ); } } sub count_coverage { my ($sam, $chr, $pos, $clip) = @_; if($rmdup && !$RNASeq) { my @pairs; my $seg = $sam->segment(-seq_id => $chr, -start => $pos, -end => $pos); my $n = 0; return 0 unless $seg; my $itr = $seg->features(-iterator => 1); while( my $a = $itr->next_seq) { next unless($a->start && $a->end); #why unmapped reads here? my $sclip_len = 0; if($clip) { my @cigar_array = @{$a->cigar_array}; #$sclip_len = $1 if($a->cigar_str =~ m/S(\d+)$/ && $clip == RIGHT_CLIP); $sclip_len = $cigar_array[0]->[1] if($cigar_array[0]->[0] eq 'S' && $clip == RIGHT_CLIP); #$sclip_len = $1 if($a->cigar_str =~ m/^S(\d+)/ && $clip == LEFT_CLIP); $sclip_len = $cigar_array[$#cigar_array]->[1] if($cigar_array[$#cigar_array]->[0] eq 'S' && $clip == RIGHT_CLIP); } next if(@pairs > 0 && is_PCR_dup($a, \@pairs, $sclip_len)); $n++; return $n if( $n > $max_repetitive_cover); push @pairs, [$a->start, $a->end, $a->mate_start ? $a->mate_start : 0, $a->mate_end ? $a->mate_end : 0, $sclip_len]; } return $n; } else{ my ($c) = $sam->features(-type => 'coverage', -seq_id=> $chr, -start => $pos, -end => $pos); return 0 unless $c; my @c_d = $c->coverage; return $c_d[0]; } } sub detect_SV { my %param = @_; my @s_reads = @{$param{-SREADS}}; my($sam, $chr, $ort, $r_sclip, $coverage) = ( $param{-SAM}, $param{-CHR}, $param{-ORT}, $param{-SCLIP}, $param{-COVER}); my $clip = $ort eq '+' ? RIGHT_CLIP : LEFT_CLIP; return if($coverage > $max_repetitive_cover); my $fa_name = prepare_reads(\@s_reads, $clip); my($contig_file, $sclip_count, $contig_reads) = $assembler->run($fa_name); return if(!$contig_file or !(-e $contig_file)); $contig_file = clip_fa_file($contig_file, $clip); my $contig_seqs = read_fa_file($contig_file); if(scalar @s_reads == 1) { $contig_file = clip_fa_file($fa_name, $clip); $contig_seqs = read_fa_file($contig_file); my @seq_names = keys %{$contig_seqs}; $sclip_count->{$seq_names[0]} = 1; } my @SV; my $where = ($clip == LEFT_CLIP ? "right" : "left"); my $mapping = $mapper->run( -QUERY => $contig_file ); my (%reads, %quals, %s_lens, %pos); foreach my $r (@s_reads) { $reads{$r->{name}} = $r->{full_seq}; $quals{$r->{name}} = $r->{full_qual}; $s_lens{$r->{name}} = length($r->{seq}); $pos{$r->{name}} = $r->{pos}; } foreach my $s (keys(%{$mapping})) { next if($sclip_count->{$s} < $min_sclip_reads); # try to find a better mapping my @tmp_reads; my $selected_read; my @tmp_pos; foreach my $n (@{$contig_reads->{$s}}) { push @tmp_pos, $pos{$n}; } @tmp_pos = uniq @tmp_pos; my $real_pos = $clip == LEFT_CLIP ? $tmp_pos[$#tmp_pos] : $tmp_pos[0]; my $n_new_SV = 0; my $tmp_bp; foreach my $t (@{$mapping->{$s}}) { my $qort = $t->{qstrand}; my $t_pos = $t->{tstart}; $t_pos = $t->{tend} if( ($qort eq "+" && $clip == LEFT_CLIP) || ($qort eq '-' && $clip == RIGHT_CLIP)); if($t->{tchr} eq $chr && (abs($t_pos - $real_pos) < $min_dist_diff)) { # the soft clipping is incorrect @SV = @SV[0 .. ($#SV - $n_new_SV + 1)] if($n_new_SV > 0); last; } my $first_bp = {}; $first_bp->{sc_reads} = $sclip_count->{$s}; $first_bp->{$where . "_chr"} = $chr; $first_bp->{$where . "_pos"} = $real_pos; $first_bp->{all_pos} = \@tmp_pos; $first_bp->{$where . "_ort"} = "+"; $first_bp->{pos} = $real_pos; $first_bp->{cover} = $coverage; $first_bp->{chr} = $chr; my $new_where = ($where eq "right" ? "left" : "right"); $first_bp->{$new_where . "_chr"} = $t->{tchr}; $first_bp->{$new_where . "_pos"} = $t_pos; $first_bp->{$new_where . "_ort"} = $qort; $first_bp->{sc_seq} = $contig_seqs->{$s}; $first_bp->{reads} = $contig_reads->{$s}; $tmp_bp = $first_bp unless $tmp_bp; my $second_bp = check_sclip(-SAM => $sam, -TARGET => $t, -CHR => $chr, -POS => $real_pos, -SCLIP => $r_sclip, -CLIP => $clip); if(@{$second_bp}) { foreach my $tmp_bp (@{$second_bp}) { push @SV, StructVar->new(-FIRST_BP => $first_bp, -SECOND_BP => $tmp_bp); $n_new_SV++; } } } # save some one side good soft-clipping SV if( $rescue == 1 && $n_new_SV == 0 && $sclip_count->{$s} >= $min_one_side_reads && (scalar(@{$mapping->{$s}}) == 1 || ($mapping->{$s}[0]{perfect} == 1 && $mapping->{$s}->[1]{perfect} == 0)) && $tmp_bp->{chr} && length($contig_seqs->{$s}) * 0.95 < $mapping->{$s}[0]{matches} ) { my %second_bp = %{$tmp_bp}; $second_bp{sc_reads} = 0; $second_bp{sc_seq} = ''; ($second_bp{chr}, $second_bp{pos}) = $second_bp{pos} == $second_bp{left_pos} ? ($second_bp{right_chr}, $second_bp{right_pos}) : ($second_bp{left_chr}, $second_bp{left_pos}); $second_bp{cover} = count_coverage($sam, $second_bp{chr}, $second_bp{pos}); $second_bp{reads} = []; push @SV, StructVar->new(-FIRST_BP => $tmp_bp, -SECOND_BP => \%second_bp); } } system("rm $fa_name"); system("rm $fa_name*"); return @SV; } sub get_gene_range { my ($chr, $pos, $clip) = @_; my $r = $clip == LEFT_CLIP ? [$pos, $pos+5] : [$pos - 5, $pos]; my @genes; my $tmp = $chr; $tmp = "chr" . $tmp if($tmp !~ m/chr/); my ($f_tree, $r_tree) = ($gm->sub_model($tmp, "+"), $gm->sub_model($tmp, "-")); push @genes, $f_tree->intersect($r); push @genes, $r_tree->intersect($r); my ($s, $e); if(scalar @genes == 0) { #no gene here return [$pos - $read_len, $pos + $read_len]; } foreach my $g (@genes) { my $start = $g->val->get_start($pos, $read_len); my $end = $g->val->get_end($pos, $read_len); $s = $start if(!$s || $s > $start); $e = $end if(!$e || $e < $end); } return [$s, $e]; } sub check_sclip { my %arg = @_; my ($sam, $chr, $pos, $target, $r_sclip, $clip) = ( $arg{-SAM}, $arg{-CHR}, $arg{-POS}, $arg{-TARGET}, $arg{-SCLIP}, $arg{-CLIP} ); my @bp; # identify the searching region for partner soft cliping my $extension; if($RNASeq) { $extension = 50; } else { $extension = $read_len - ($target->{qend} - $target->{qstart}); } my ($tpos, $ort) = ($target->{tend}, $target->{qstrand}); $tpos = $target->{tstart} if( ($clip == LEFT_CLIP && $ort eq '-') || ($clip == RIGHT_CLIP && $ort eq '+')); $extension = abs($tpos - $pos) - 1 if($chr eq $target->{tchr} && abs($tpos - $pos) <= $extension); # don't consider itself my $range =$target->{tchr} . ":"; if($tpos < $extension) { $range .= '1'; } else{ $range .= $tpos - $extension; } $range .= "-"; $range .= $tpos + $extension; # the orginal genome sequence where we find the soft_clipping my $orig; my $tmp = $chr; #$tmp = 'chr' . $tmp if($tmp !~ m/^chr/); $orig = $target_genome . ":" . $tmp . ":"; my $base_pos; if($RNASeq) { #let's do a wild guess! my $r = get_gene_range($tmp, $pos, $clip); return \@bp unless $r; $base_pos = $r->[0]; $orig = join("", ($orig, $r->[0], "-", $r->[1])); } else { $base_pos = $pos - $read_len; $orig = join("", ($orig, $pos - $read_len, "-", $pos + $read_len)); } return \@bp if(!exists( $r_sclip->{$target->{tchr}})); #mapped to chrY or other minor chrs # the soft clipping happens in highly repetitive region # return \@bp if($coverage > $max_repetitive_cover); my $r_pp = search_sclip($r_sclip, $range); return \@bp if(!$r_pp); my $real_pos; my $found = 0; my @r_reads; my @l_reads; foreach my $s (@{$r_pp}) { #next if($s->[2] < $min_hit_reads); next if($chr ne $target->{tchr} && ( ($clip == LEFT_CLIP && $ort ne $s->[1] ) || ($clip == RIGHT_CLIP && $ort eq $s->[1] )) ); next if($chr eq $target->{tchr} && ($s->[0] < $tpos - $extension || $s->[0] > $tpos + $extension)); my $tort = $s->[1]; next if($s->[3] > $max_repetitive_cover); my $tclip = $tort eq '+' ? RIGHT_CLIP : LEFT_CLIP; my @reads = get_sclip_reads( -SAM => $sam, -CHR => $target->{tchr}, -START => $s->[0], -END => $s->[0], -CLIP => $tclip, -MINCLIPLEN => $min_hit_len, -VALIDATOR => $validator, -PAIRED => $paired, -RMDUP => $rmdup); next if(!@reads || scalar(@reads) == 0); # push @r_reads, @reads if($tclip == RIGHT_CLIP); # push @l_reads, @reads if($tclip == LEFT_CLIP); # } # foreach my $tclip ( (RIGHT_CLIP, LEFT_CLIP)) { # my $s_reads = $tclip == RIGHT_CLIP ? \@r_reads : \@l_reads; # next if(scalar @{$s_reads} == 0); my %count; # my $fa_name = prepare_reads($s_reads, $tclip); my $fa_name = prepare_reads(\@reads, $tclip); my($contig_file, $sclip_count, $contig_reads, $singlet_file) = $assembler->run($fa_name); my (%reads, %quals, %s_lens, %pos); # foreach my $r (@{$s_reads}) { foreach my $r (@reads) { $reads{$r->{name}} = $r->{full_seq}; $quals{$r->{name}} = $r->{full_qual}; $s_lens{$r->{name}} = length($r->{seq}); $pos{$r->{name}} = $r->{pos}; } $contig_file = clip_fa_file($contig_file, $tclip); if( -s $singlet_file ) { $singlet_file = clip_fa_file($singlet_file, $tclip); system("cat $singlet_file >> $contig_file"); system("rm $singlet_file"); } my $seqs = read_fa_file($contig_file); my $hits = $aligner->run( -TARGET => $orig, -QUERY => $contig_file); foreach my $t (keys %{$hits}) { my $h = $hits->{$t}; my $tmp_bp; my @all_pos; my $all_reads_name; if(exists $contig_reads->{$t}) { foreach my $n (@{$contig_reads->{$t}}) { push @all_pos, $pos{$n}; } $all_reads_name = $contig_reads->{$t}; @all_pos = uniq @all_pos; @all_pos = sort {$a <=> $b} @all_pos; } else { push @all_pos, $pos{$t}; $all_reads_name = [$t]; } if(is_good_hit($h, $clip, $tclip)) { my $hit_ort = ($h->strand('query') == 1 ? "+" : "-"); my $real_pos = $tclip == RIGHT_CLIP ? $all_pos[0] : $all_pos[$#all_pos]; if($tclip == RIGHT_CLIP ) { $tmp_bp = { left_ort => '+', left_pos => $real_pos, left_chr => $target->{tchr}, right_ort => $hit_ort, right_pos => ($hit_ort eq "+" ? $h->start("hit") : $h->end("hit")) + $base_pos, right_chr => $chr, } } else { $tmp_bp = { left_ort => $hit_ort, left_chr => $chr, left_pos => ($hit_ort eq "+" ? $h->end("hit") : $h->start("hit")) + $base_pos, right_ort => "+", right_chr => $target->{tchr}, right_pos => $real_pos, } } $tmp_bp->{chr} = $target->{tchr}; $tmp_bp->{pos} = $real_pos; $tmp_bp->{all_pos} = \@all_pos; $tmp_bp->{sc_seq} = $seqs->{$t}; $tmp_bp->{cover} = count_coverage($sam, $target->{tchr}, $real_pos); $tmp_bp->{sc_reads} = exists $sclip_count->{$t} ? $sclip_count->{$t} : 1; $tmp_bp->{reads} = exists $contig_reads->{$t} ? $contig_reads->{$t} : [$t]; push @bp, $tmp_bp; } } system("rm $fa_name"); system("rm $fa_name*"); } return \@bp; } # many more filter can be added here sub is_good_hit { my ($hit, $clip, $tclip) = @_; return if($hit->length_aln('query') < $min_hit_len); return if($hit->frac_identical * 100 < $min_percent_id ); return 1; # do we want to do the check? my $ort = ($hit->start('query') < $hit->end('query') ? "+" : "-"); my ($dist, $t_dist, $qstart, $qend); $qstart = $ort eq '+' ? $hit->start('query') : $hit->end('query') ; $qend = $ort eq '+' ? $hit->end('query') : $hit->start('query'); $t_dist = $qstart; $t_dist = $hit->query_length - $qend if($tclip == LEFT_CLIP); } # RNASeq support to identify deletions sub cigar2hit { my ($s, @cigar_array) = @_; my @pos; my $p = $s; foreach my $c (@cigar_array){ my($op, $len) = @{$c}; if($op eq 'N') { push @pos, [$s, $p]; $s = $p + $len; $p = $s; next; } $p += $len if($op eq 'M' || $op eq 'D'); } push @pos, [$s, $p]; return \@pos; } sub find_del_SVs { my ($sam, $chr, $start, $end) = @_; my $max_sclip_len = 0; my $range = $chr; if($start && $end) { $range = $chr . ":" . $start . "-" . $end; } my %SV; my $tmp = $chr; $tmp = "chr" . $chr if($chr !~ m/chr/); my($f_tree, $r_tree) = ($gm->sub_model($tmp, "+"), $gm->sub_model($tmp, "-")); $sam->fetch($range, sub { my $a = shift; my $cigar_str = $a->cigar_str; # print $a->qname, "\t", $cigar_str, "\n"; return unless $cigar_str =~ m/N/; # the read cross two genes must have N # return unless $a->score >= 97; my $sv = identify_del_SV($f_tree, $a); if($sv){ my $tmp1 = $sv->[0] . "+" . $sv->[1]; if(!exists($SV{$tmp1})) { $SV{$tmp1} = {}; $SV{$tmp1}->{num} = 0; $SV{$tmp1}->{left} = 0; $SV{$tmp1}->{right} = 0; } $SV{$tmp1}->{num}++; $SV{$tmp1}->{left} = $sv->[5] if($SV{$tmp1}->{left} < $sv->[5]); $SV{$tmp1}->{right} = $sv->[6] if($SV{$tmp1}->{right} < $sv->[6]); $SV{$tmp1}->{gene_ort} = "+"; $SV{$tmp1}->{left_gene} = $sv->[3]; $SV{$tmp1}->{right_gene} = $sv->[4]; push @{$SV{$tmp1}->{reads}}, $a->qname; } $sv = identify_del_SV($r_tree, $a); if($sv){ my $tmp1 = $sv->[0] . "-" . $sv->[1]; if(!exists($SV{$tmp1})) { $SV{$tmp1} = {}; $SV{$tmp1}->{num} = 0; $SV{$tmp1}->{left} = 0; $SV{$tmp1}->{right} = 0; $SV{$tmp1}->{reads} = []; } $SV{$tmp1}->{num}++; $SV{$tmp1}->{left} = $sv->[5] if($SV{$tmp1}->{left} < $sv->[5]); $SV{$tmp1}->{right} = $sv->[6] if($SV{$tmp1}->{right} < $sv->[6]); $SV{$tmp1}->{gene_ort} = "-"; $SV{$tmp1}->{left_gene} = $sv->[3]; $SV{$tmp1}->{right_gene} = $sv->[4]; push @{$SV{$tmp1}->{reads}}, $a->qname; } } ); my @rtn; foreach my $sv (keys %SV) { my( $pos1, $pos2 ) = $sv =~ m/(\d+)[+|-](\d+)/; print $pos1, "\t", $pos2, "\n"; my( $cover1, $cover2) = (count_coverage($sam, $chr, $pos1 - 1), count_coverage($sam, $chr, $pos2)); next if($SV{$sv}->{num} < $min_sclip_reads); my $cover = $cover1 < $cover2 ? $cover1 : $cover2; next if($SV{$sv}->{num} * 100 < $cover * $min_pct_sclip_reads && $SV{$sv} < 50); push @rtn, StructVar->new ( -FIRST_BP => { left_ort => '+', left_pos => $pos1 - 1, left_chr => $chr, right_ort => '+', right_pos => $pos2, right_chr => $chr, chr => $chr, cover => count_coverage($sam, $chr, $pos1 - 1), pos => $pos1 - 1, sc_reads => $SV{$sv}->{num}, left_len => $SV{$sv}->{left}, right_len => $SV{$sv}->{right}, gene_ort => $SV{$sv}->{gene_ort}, left_gene => $SV{$sv}->{left_gene}, right_gene => $SV{$sv}->{right_gene}, reads => $SV{$sv}->{reads}, }, -SECOND_BP => { left_ort => '+', left_pos => $pos1 - 1, left_chr => $chr, right_ort => '+', right_pos => $pos2, right_chr => $chr, chr => $chr, pos => $pos2, cover => count_coverage($sam, $chr, $pos2), sc_reads => $SV{$sv}->{num}, reads => [], } ); } return @rtn; } sub identify_del_SV { my ($tree, $a) = @_; my @cigar_array = @{$a->cigar_array}; return unless($a->start && $a->end); return if(scalar $tree->intersect([$a->start, $a->end]) <= 1); my @hits = @{ cigar2hit($a->start, @cigar_array) }; my @genes; my $last_gene; my @change; my ($left, $right) = (0, 0); for( my $i = 0; $i < scalar @hits; $i++) { my $h = $hits[$i]; my @g = $tree->intersect($h); my $g_size = scalar @g; if($g_size == 0) { # print STDERR $a->qname, " is mapped outside of gene\n"; next; } if( scalar @g == 1) { push @genes, $g[0] unless ($last_gene && $last_gene->val->name eq $g[0]->val->name); if($last_gene && $last_gene->val->name ne $g[0]->val->name) { $right += ($h->[1] - $h->[0] ); push @change, $i; } $left += ($h->[1] - $h->[0] ) if($right == 0); $last_gene = $g[0]; next; } # print STDERR $a->qname, " connect two genes into one?\n"; return; } return if(scalar @genes <= 1); if(scalar @genes > 2) { # print STDERR $a->qname, "crossed more than 2 genes!\n"; return; } # now we down to 2 genes my ($left_gene, $right_gene) = @genes; return unless $left_gene->val->overlap([$hits[$change[0]-1]->[0], $hits[$change[0]-1]->[1]]); return unless $right_gene->val->overlap([$hits[$change[0]]->[0], $hits[$change[0]]->[1]]); my $p = $left_gene->val->end; print join("\t", ( $a->score, $a->cigar_str, $left, $right, $hits[$change[0]-1]->[1], $hits[$change[0]]->[0], $left_gene->val->name, $right_gene->val->name)), "\n"; return [$hits[$change[0]-1]->[1], $hits[$change[0]]->[0], $a, $left_gene->val, $right_gene->val, $left, $right]; } =head1 NAME CREST.pl - a Structure Variation detection tools using softclipping for whole genome sequencing. =head1 VERSION This documentation refers to CREST.pl version 0.0.1. =head1 USAGE This program depends on several things that need to be installed and/or specified. The program uses BioPerl and Bio::DB::Sam module to parse the files and bam files. Also it uses Blat software suits to do genome mapping and alignment. To make the program efficient, it also requires a blat server setup. And the program uses CAP3 assembler. Identify somatic SVs from input: CREST.pl -f somatic.cover -d tumor.bam -g germline.bam Identify SVs from a single bam compared wtih reference genome: CREST.pl -f sclip.cover -d sample.bam Identify SVs from a single bam on chr1 only CREST.pl -f sclip.cover -d sample.bam -r chr1 =head1 REQUIRED ARGUMENTS To run the program, several parameter must specified. -d, --input_d: The input (diagnositic) bam file -f, --sclipfile: The soft clipping information generated by extractSClip.pl --ref_genome: The reference genome file in fa format -t, --target_genome: The 2bit genome file used by blat server and blat --blatserver: The blat server name or ip address --blatport: The blat server port =head1 OPTIONS The options that can be used for the program. -g, --input_g The germline (paired) bam file -p, --prefix The output prefix, default is the input bam file name -o, --out_dir Output directory, default is the working directory -l, --read_len The read length of the sequencing data, defaut 100 --sensitive The program will generate more SVs with higher false positive rate. --(no)scratch Use the scratch space, default is off. --(no)paired Use paired reads or not, defafult is on, so change to --nopaired for unpaired reads. --cap3 CAP3 executable position, default cap3 --cap3opt CAP3 options, single quoted, Default ' > /dev/null' --blatclient gfClient excutable position, default gfClient --blatclientopt gfClient options, single quoted, default '-out=psl -nohead' --blat blat executable potion, default balt --blatopt blat options, single quoted, default '-tileSize=7 -stepSize=1 -out=psl -minScore=15' -r, --range The range where SV will be detected, using chr1:100-200 format. --max_score_diff The maximum score difference when stopping select hit, default 10. --min_sclip_reads Minimum number of soft clipping read to triger the procedure, default 3. --max_rep_cover The min number of coverage to be called as repetitive and don't triger the procedure, default 500. --min_sclip_len The min length of soft clipping part at a position to triger the detection, default 20. --min_hit_len Min length of a hit for genome mapping --min_dist_diff Min distance between the mapped position and the soft clipping position, default 20. --(no)rmdup Remove PCR dumplicate. Default remove. --min_percent_id Min percentage of identity of soft clipping read mapping, default 90 --min_percent_hq Min percentage of high quality base in soft clipping reads, default 80 --lowqual_cutoff Low quality cutoff value, default 20. -h, --help, -? Help information --man Man page. --usage Usage information. --version Software version. --(no)rm_tandem_repeat Remove tandem repeat caused SV events, default is ON. When it's on ptrfinder program need to be on the path. --tr_max_indel_size Maximum tandem repeat mediated INDEL events, default 100 --tr_min_size Minimum tandem reapet size, default 2 --tr_max_size Maximum tandem repeat size, default 8 --tr_min_num Minimum tandem repeat number, defaut 4 --min_percent_cons_of_read Minimum percent of consensus length of read length, default 0.75 --max_bp_dist Maximum distance between two idenfitifed break points, default 15 --germline_seq_width Half window width of genomic sequence around break point for germline SV filtering, default 100 --germline_search_width Half window width for seaching soft-clipped reads around breakpoint for germline SV filtering, default 50. --hetero_factor The factor about the SV's heterogenirity and heterozygosity, default 0.4. --triger_p_value The p-value that will triger the SV detection when number of soft-clipped reads is small, defaut 0.05. --(no)rescue Use rescue mode, when it's on, the a SV with only 1 side with enough soft-clipped reads is considered as a valid one instead of rejecting it. Default on. --min_one_side_reads When rescure mode is on, the minimum number of soft-clipped reads on one side, default 5. --RNASeq RNAseq mode, default off --genemodel A gene model file, currently only refFlat format (BED) is supported. Requried for RNASeq. --cluster_size The soft-clipped reads within cluster_size will be considered together, default is 3, RNAseq mode only. =head1 DESCRIPTION This is a program designed to identify Structure Variations (SVs) using soft clipping reads. With the improvement of next-gen sequencing techniques, both the coverage and length of pair-end reads have been increased steady. Many SV detection software availalbe uses pair-end information due to the limitted read length. Now 100bp is pretty common and many reads will cross the break points. Some mapping programs ( bwa, etc ) have the ability to identify so called soft-clipping reads. A soft-clipping read is a read that different part can be mapped to different genomic posiiton, but the read can be uniquely positioned using the mate-pair position and the insert length. So this program use those reads to do an assembly-mapping-assembly-alignment procedure to identify potential structure variiations. =head1 DIAGNOSTICS A list of every error and warning message that the application can generate (even the ones that will "never happen"), with a full explanation of each problem, one or more likely causes, and any suggested remedies. If the application generates exit status codes (e.g., under Unix), then list the exit status associated with each error. =head1 CONFIGURATION AND ENVIRONMENT The program is designed to use under a cluster or high performance computing environment since it's dealing with over 100G input data. The program can be used as highly parallellized. And the program is developped under linux/unix. =head1 DEPENDENCIES The program depend on several packages: 1. Bioperl perl module. 2. Bio::DB::Sam, version 1.5 or later, it requires samtools lib installed. 3. blat suits, include blat, gfClient, gfServer. 4. CAP3 assembly program. =head1 BUGS AND LIMITATIONS There are no known bugs in this module, but the method is limitted to bam file that has soft-clipping cigar string generated.Please report problems to Jianmin Wang (Jianmin.Wang@stjude.org) Patches are welcome. =head1 AUTHOR Jianmin Wang (Jianmin.Wang@stjude.org) =head1 LICENCE AND COPYRIGHT Copyright (c) 2010 by St. Jude Children's Research Hospital. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.