view lib/detect_LTR_insertion_sites.pl @ 0:1d1b9e1b2e2f draft

Uploaded
author petr-novak
date Thu, 19 Dec 2019 10:24:45 -0500
parents
children
line wrap: on
line source

#!/usr/bin/env perl

# parses ACE files of assembled repeats and detects potential
# LTR borders/insertion sites of LTR-retroelements

# "site" is a region (size of $window) including TG or CA
# "out" is a region adjacent to the site, presumably representing insertion sites

# this is RepeatExplorer version of "detect_insertion_sites_LTRs.pl"
# -m default set to 10 

use Getopt::Std;




getopt('iowsmdrp');
if ($opt_i) {
	$infile = $opt_i;
} else {
	die "-i input_file_name missing\n";
}

if ($opt_p) {
    $db_PBS = $opt_p;
} else {
    die "-p PBS database is missing\n";
}




if ($opt_o) {
	$outfile = $opt_o;
} else {
	die "-o output_file_name missing\n";
}
if ($opt_w) {
	$window = $opt_w;
} else {
	$window = 7;
	print "window size not set, using default ($window)\n";
}
if ($opt_s) {
	$min_site_depth = $opt_s;   # minimal average read depth (over $window) required for the site
} else {
	$min_site_depth = 10;
	print "min_site_depth not set, using default ($min_site_depth)\n";
}
if ($opt_m) {
	$min_out_masked = $opt_m;  # minimal average number of masked reads outside the site (over $window)
} else {
	$min_out_masked = 10;
	print "min_out_masked not set, using default ($min_out_masked)\n";
}
if ($opt_d) {
	$min_masked_fold_diff = $opt_d;  # how many times should the proportion of masked reads "out" be higher than in "site"
} else {
	$min_masked_fold_diff = 3;
	print "min_masked_fold_diff not set, using default ($min_masked_fold_diff)\n";
}
if ($opt_x) {
	$max_char_to_masked = $opt_x;  # max fold difference between depth in "site" and masked depth "out"
} else {
	$max_char_to_masked = 10;
	print "max_char_to_masked not set, using default ($max_char_to_masked)\n"; 
}
if ($opt_r) {
	$extract_region = $opt_r;
} else {
	$extract_region = 30;
	print "extract_region not set, using default ($extract_region)\n";
}

# main
$out_table = $outfile;
$out_LTR   = "$outfile.LTR";
$out_ADJ   = "$outfile.ADJ";
open (IN,$infile) or die;
open (OUT,">$out_table") or die;
open (LTR,">$out_LTR") or die;  # LTR end regions as fasta seq; all are converetd to ....CA (so TG... regions are reverse-complemented)
open (ADJ,">$out_ADJ") or die;  # regions adjacent to LTR ends; if LTR end is rev-complemented, so is its corresponding adjacent region
print OUT "#Parameters:\n";
print OUT "#infile\t$infile\n#outfile\t$outfile\n#window\t$window\n#min_site_depth\t$min_site_depth\n";
print OUT "#min_out_masked\t$min_out_masked\n#min_masked_fold_diff\t$min_masked_fold_diff\n#max_char_to_masked\t$max_char_to_masked\n#extract_region\t$extract_region\n\n";
print OUT "CL\tcontig\tTG/CA\tposition\tsite\tsite_depth\tout_masked\tmasked_ratio_site\tmasked_ratio_out\tregion_in\tregion_out\tblast PBS\n";
print "Analyzing ACE file...\n";
$prev = 0;
while ($radek = <IN>) {
	$contig_found = &read_contig;
	if ($contig_found) {
		if ($cl > $prev) {
			$prev = $cl;
		}
		&reconstruct_assembly;
		&find_sites;
	}
}
close IN;
close OUT;
close LTR;
close ADJ;
print "Running blast against tRNA database...\n";
&add_PBS_info;    # detects similarities of sequences in ADJ to tRNA database (!!! reads ADJ and $out_table !!!)

$error = system("rm $out_table");
if ($error) {
	print "Error removing $out_table\n";
}

sub read_contig {
	my ($reads_found,$read_id);
	# global variables
	$cl = 0;
	$contig = 0;
	$cont_length = 0;
	$reads = 0;   # number of reads
	$cons = "";   # contig consensus (including gaps *)
	%read_starts = ();  # starts of reads within assembly
	%read_lengths = (); # length of reads in assembly (may contain gaps)
	%read_from = ();    # start of non-masked part of read sequence (relative to the read)
	%read_to = ();      # end of non-masked part of read sequence   
	
	do {
		if ($radek =~/^CO CL(\d+)Contig(\d+) (\d+) (\d+)/) {
			$cl = $1; $contig = $2; $cont_length = $3; $reads = $4;
			while ($radek = <IN> and length($radek) > 1) {
				chomp($radek);
				$cons .= $radek;
			}
			do {
				if ($radek =~/^AF (\S+) [UC] ([-]?\d+)/) {
					#print "$1 : $2\n";
					$read_starts{$1} = $2;
				}
			} while ($radek = <IN> and not $radek =~/^BS \d+ \d+/);
			$reads_found = 0;
			while ($reads_found < $reads) {        # expects previously specified number of reads 
				$radek = <IN>;              # expects RD lines with read ids alternate with QA lines
				if ($radek =~/^RD (\S+) (\d+)/) {
					$read_id = $1;
					$read_lengths{$read_id} = $2;
				}
				if ($radek =~/^QA (\d+) (\d+)/) {
					$read_from{$read_id} = $1;
					$read_to{$read_id} = $2;
					$reads_found++;
				}
			}
			return 1;
		}
	} while ($radek = <IN>);
	return 0;
}

sub reconstruct_assembly {
	my ($id,$min_start,$max_end,$shift,$f,$poz);
	# global variables
	@assembly_seq = ();     # sequence at each position of assembly; it corresponds to consensus, or regions
	                        # before (-) and after (+) consensus [assembly may be longer than consensus]
	@assembly_char = ();    # number of assembly positions with non-masked characters (nucleotides or gaps)
	@assembly_masked = ();  # number of masked positions
	$assembly_length = 0;   # total length, including - and + regions
	
	$min_start = 1; $max_end = 1;
	foreach $id (keys(%read_starts)) {
		if ($min_start > $read_starts{$id}) {
			$min_start = $read_starts{$id};
		}
		if ($max_end < $read_starts{$id}+$read_lengths{$id}-1) {
			$max_end = $read_starts{$id}+$read_lengths{$id}-1;
		}
	}
	if ($min_start < 1) {
		$shift = abs($min_start) +1;
		$assembly_length = $shift + $max_end;
	} else {
		$assembly_length = $max_end;
		$shift = 0;
	}
	
	for ($f=1;$f<=$shift;$f++) {
		$assembly_seq[$f] = "-";
	}
	for ($f=1;$f<=$cont_length;$f++) {
		$assembly_seq[$f+$shift] = substr($cons,$f-1,1);
	}
	for ($f=$shift+$cont_length+1;$f<=$assembly_length;$f++) {
		$assembly_seq[$f] = "+";
	}

	for ($f=1;$f<=$assembly_length;$f++) {
		$assembly_char[$f] = 0;
		$assembly_masked[$f] = 0;
	}
	foreach $id (keys(%read_starts)) {
		for ($f=1;$f<=$read_lengths{$id};$f++) {
			$poz = $read_starts{$id} + $shift + $f - 1;
			if ($f>=$read_from{$id} and $f<=$read_to{$id}) {
				$assembly_char[$poz]++;
			} else {
				$assembly_masked[$poz]++;
			}
		}
	}
}

sub revcompl {
	my ($input) = @_;
	my ($base,$seq,$f);
	
	$seq = "";
	for ($f=length($input)-1;$f>=0;$f--) {
		$base = substr($input,$f,1);
		if ($base eq "A") {
			$seq .= "T";
		} elsif ($base eq "T") {
			$seq .= "A";
		} elsif ($base eq "G") {
			$seq .= "C";
		} elsif ($base eq "C") {
			$seq .= "G";
		} elsif ($base eq "+" or $base eq "-") {
			$seq .= $base;
		} else {
			$seq .= "N";
		}
	}
	return $seq;
}

sub find_sites {
	my ($f,$site_sum_char,$site_sum_masked,$out_sum_char,$site_seq,$out_sum_masked,@TG,@CA,$pos);
	my ($masked_ratio_site,$masked_ratio_out,$site_depth,$out_masked,$region);
	
	# find positions of LTR borders (TG and CA)
	@TG = ();   # positions of "T"
	@CA = ();   # positions of "A"
	for ($f=1;$f<$assembly_length;$f++) {
		if ($assembly_seq[$f] eq "T" and $assembly_seq[$f+1] eq "G") {
			push(@TG,$f);
		}
		if ($assembly_seq[$f] eq "C" and $assembly_seq[$f+1] eq "A") {
			push(@CA,$f+1);
		}
	}
	
	foreach $pos (@TG) {
		if ($pos-$window > 0 and $pos+$window-1 <= $assembly_length) {
			$site_sum_char = 0; $site_sum_masked = 0; $site_seq = "";
			for ($f=$pos;$f<$pos+$window;$f++) {
				$site_sum_char += $assembly_char[$f];
				$site_sum_masked += $assembly_masked[$f];
				$site_seq .= $assembly_seq[$f];
			}
			$out_sum_char = 0; $out_sum_masked = 0;
			for ($f=$pos-$window;$f<$pos;$f++) {
				$out_sum_char += $assembly_char[$f];
				$out_sum_masked += $assembly_masked[$f];
			}
			$site_depth = sprintf("%0.1f",$site_sum_char/$window);   # average read (unmasked) depth over the site
			$out_masked = sprintf("%0.1f",$out_sum_masked/$window);  # average number of masked reads outside the site
			$masked_ratio_site = sprintf("%0.4f",$site_sum_masked/($site_sum_masked+$site_sum_char));
			$masked_ratio_out  = sprintf("%0.4f",$out_sum_masked/($out_sum_masked+$out_sum_char));
			if ($site_depth >= $min_site_depth and $out_masked >= $min_out_masked) {
				if ($masked_ratio_out >= ($min_masked_fold_diff * $masked_ratio_site) and $max_char_to_masked >= ($site_depth/$out_masked)) {
					print OUT "$cl\t$contig\tTG\t$pos\t$site_seq\t$site_depth\t$out_masked\t$masked_ratio_site\t$masked_ratio_out\t";
					$region = "";
					for ($f=$pos;$f<=$assembly_length;$f++) {
						if ($assembly_seq[$f] ne "*") {
							$region .= $assembly_seq[$f];
						}
						if (length($region) == $extract_region) {
							$f = $assembly_length;  # terminate cycle
						}
					}
					print OUT "$region\t";
					print LTR ">CL",$cl,"c".$contig."_TG_$pos\n";
					$region = &revcompl($region);
					print LTR "$region\n";
					$region = "";
					for ($f=$pos-1;$f>0;$f=$f-1) {
						if ($assembly_seq[$f] ne "*") {
							$region = $assembly_seq[$f].$region;
						}
						if (length($region) == $extract_region) {
							$f = 0;  # terminate cycle
						}
					}
					print OUT "$region\n";
					print ADJ ">CL",$cl,"c".$contig."_TG_$pos\n";
					$region = &revcompl($region);
					print ADJ "$region\n";
				}
			}
		}
	}
	
	foreach $pos (@CA) {
		if ($pos-$window+1 > 0 and $pos+$window <= $assembly_length) {
			$site_sum_char = 0; $site_sum_masked = 0; $site_seq = "";
			for ($f=$pos-$window+1;$f<=$pos;$f++) {
				$site_sum_char += $assembly_char[$f];
				$site_sum_masked += $assembly_masked[$f];
				$site_seq .= $assembly_seq[$f];
			}
			$out_sum_char = 0; $out_sum_masked = 0;
			for ($f=$pos+1;$f<=$pos+$window;$f++) {
				$out_sum_char += $assembly_char[$f];
				$out_sum_masked += $assembly_masked[$f];
			}
			$site_depth = sprintf("%0.1f",$site_sum_char/$window);   # average read (unmasked) depth over the site
			$out_masked = sprintf("%0.1f",$out_sum_masked/$window);  # average number of masked reads outside the site
			$masked_ratio_site = sprintf("%0.4f",$site_sum_masked/($site_sum_masked+$site_sum_char));
			$masked_ratio_out  = sprintf("%0.4f",$out_sum_masked/($out_sum_masked+$out_sum_char));
			if ($site_depth >= $min_site_depth and $out_masked >= $min_out_masked) {
				if ($masked_ratio_out >= ($min_masked_fold_diff * $masked_ratio_site) and $max_char_to_masked >= ($site_depth/$out_masked)) {
					print OUT "$cl\t$contig\tCA\t$pos\t$site_seq\t$site_depth\t$out_masked\t$masked_ratio_site\t$masked_ratio_out\t";
					$region = "";
					for ($f=$pos;$f>0;$f=$f-1) {
						if ($assembly_seq[$f] ne "*") {
							$region = $assembly_seq[$f].$region;
						}
						if (length($region) == $extract_region) {
							$f = 0;  # terminate cycle
						}
					}
					print OUT "$region\t";
					print LTR ">CL",$cl,"c".$contig."_CA_$pos\n";
					print LTR "$region\n";
					$region = "";
					for ($f=$pos+1;$f<=$assembly_length;$f++) {
						if ($assembly_seq[$f] ne "*") {
							$region .= $assembly_seq[$f];
						}
						if (length($region) == $extract_region) {
							$f = $assembly_length;  # terminate cycle
						}
					}
					print OUT "$region\n";
					print ADJ ">CL",$cl,"c".$contig."_CA_$pos\n";
					print ADJ "$region\n";
				}
			}
		}
	}
}

sub add_PBS_info {
	my ($pbs_blast_command,@pol,$rad,$prev_query,@table,$tab_length);
	
	$pbs_blast_command = "blastall -p blastn -d $db_PBS -i $out_ADJ -m 8 -b 1 -e 1 -W 7 -F F";
	
	@table = ();
	open (TAB,$out_table) or die;
	while ($rad = <TAB>) {
		push(@table,$rad);
		$tab_length++;
	}
	close TAB;
	
	open (BLAST,"$pbs_blast_command |") or die;
	$prev_query = "";
	while ($rad = <BLAST>) {
		if ($rad =~/^CL(\d+)c(\d+)_(TG|CA)_(\d+)\t\S+\t\S+\t/) {   
			if ("$1\t$2\t$3\t$4" ne $prev_query) {           # to exclude additional HSPs from the same query/subject pair
				for ($f=0;$f<$tab_length;$f++) {       
					@pol = split(/\t/,$table[$f]);
					if ($pol[0] eq "$1" and $pol[1] eq "$2" and $pol[2] eq "$3" and $pol[3] eq "$4") {
						chomp($table[$f]);
						$table[$f] .= "\t$rad";
						$f = $tab_length;  # terminate cycle
					}
				}
				$prev_query = "$1\t$2\t$3\t$4";
			}
		}
	}
	close BLAST;
	
	open (TAB_WITH_BLAST,">$out_table.with_PBS_blast.csv") or die;
	for ($f=0;$f<$tab_length;$f++) {
		print TAB_WITH_BLAST $table[$f];
	}
	close TAB_WITH_BLAST;
}