diff lib/detect_LTR_insertion_sites.pl @ 0:1d1b9e1b2e2f draft

Uploaded
author petr-novak
date Thu, 19 Dec 2019 10:24:45 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/detect_LTR_insertion_sites.pl	Thu Dec 19 10:24:45 2019 -0500
@@ -0,0 +1,390 @@
+#!/usr/bin/env perl
+
+# parses ACE files of assembled repeats and detects potential
+# LTR borders/insertion sites of LTR-retroelements
+
+# "site" is a region (size of $window) including TG or CA
+# "out" is a region adjacent to the site, presumably representing insertion sites
+
+# this is RepeatExplorer version of "detect_insertion_sites_LTRs.pl"
+# -m default set to 10 
+
+use Getopt::Std;
+
+
+
+
+getopt('iowsmdrp');
+if ($opt_i) {
+	$infile = $opt_i;
+} else {
+	die "-i input_file_name missing\n";
+}
+
+if ($opt_p) {
+    $db_PBS = $opt_p;
+} else {
+    die "-p PBS database is missing\n";
+}
+
+
+
+
+if ($opt_o) {
+	$outfile = $opt_o;
+} else {
+	die "-o output_file_name missing\n";
+}
+if ($opt_w) {
+	$window = $opt_w;
+} else {
+	$window = 7;
+	print "window size not set, using default ($window)\n";
+}
+if ($opt_s) {
+	$min_site_depth = $opt_s;   # minimal average read depth (over $window) required for the site
+} else {
+	$min_site_depth = 10;
+	print "min_site_depth not set, using default ($min_site_depth)\n";
+}
+if ($opt_m) {
+	$min_out_masked = $opt_m;  # minimal average number of masked reads outside the site (over $window)
+} else {
+	$min_out_masked = 10;
+	print "min_out_masked not set, using default ($min_out_masked)\n";
+}
+if ($opt_d) {
+	$min_masked_fold_diff = $opt_d;  # how many times should the proportion of masked reads "out" be higher than in "site"
+} else {
+	$min_masked_fold_diff = 3;
+	print "min_masked_fold_diff not set, using default ($min_masked_fold_diff)\n";
+}
+if ($opt_x) {
+	$max_char_to_masked = $opt_x;  # max fold difference between depth in "site" and masked depth "out"
+} else {
+	$max_char_to_masked = 10;
+	print "max_char_to_masked not set, using default ($max_char_to_masked)\n"; 
+}
+if ($opt_r) {
+	$extract_region = $opt_r;
+} else {
+	$extract_region = 30;
+	print "extract_region not set, using default ($extract_region)\n";
+}
+
+# main
+$out_table = $outfile;
+$out_LTR   = "$outfile.LTR";
+$out_ADJ   = "$outfile.ADJ";
+open (IN,$infile) or die;
+open (OUT,">$out_table") or die;
+open (LTR,">$out_LTR") or die;  # LTR end regions as fasta seq; all are converetd to ....CA (so TG... regions are reverse-complemented)
+open (ADJ,">$out_ADJ") or die;  # regions adjacent to LTR ends; if LTR end is rev-complemented, so is its corresponding adjacent region
+print OUT "#Parameters:\n";
+print OUT "#infile\t$infile\n#outfile\t$outfile\n#window\t$window\n#min_site_depth\t$min_site_depth\n";
+print OUT "#min_out_masked\t$min_out_masked\n#min_masked_fold_diff\t$min_masked_fold_diff\n#max_char_to_masked\t$max_char_to_masked\n#extract_region\t$extract_region\n\n";
+print OUT "CL\tcontig\tTG/CA\tposition\tsite\tsite_depth\tout_masked\tmasked_ratio_site\tmasked_ratio_out\tregion_in\tregion_out\tblast PBS\n";
+print "Analyzing ACE file...\n";
+$prev = 0;
+while ($radek = <IN>) {
+	$contig_found = &read_contig;
+	if ($contig_found) {
+		if ($cl > $prev) {
+			$prev = $cl;
+		}
+		&reconstruct_assembly;
+		&find_sites;
+	}
+}
+close IN;
+close OUT;
+close LTR;
+close ADJ;
+print "Running blast against tRNA database...\n";
+&add_PBS_info;    # detects similarities of sequences in ADJ to tRNA database (!!! reads ADJ and $out_table !!!)
+
+$error = system("rm $out_table");
+if ($error) {
+	print "Error removing $out_table\n";
+}
+
+sub read_contig {
+	my ($reads_found,$read_id);
+	# global variables
+	$cl = 0;
+	$contig = 0;
+	$cont_length = 0;
+	$reads = 0;   # number of reads
+	$cons = "";   # contig consensus (including gaps *)
+	%read_starts = ();  # starts of reads within assembly
+	%read_lengths = (); # length of reads in assembly (may contain gaps)
+	%read_from = ();    # start of non-masked part of read sequence (relative to the read)
+	%read_to = ();      # end of non-masked part of read sequence   
+	
+	do {
+		if ($radek =~/^CO CL(\d+)Contig(\d+) (\d+) (\d+)/) {
+			$cl = $1; $contig = $2; $cont_length = $3; $reads = $4;
+			while ($radek = <IN> and length($radek) > 1) {
+				chomp($radek);
+				$cons .= $radek;
+			}
+			do {
+				if ($radek =~/^AF (\S+) [UC] ([-]?\d+)/) {
+					#print "$1 : $2\n";
+					$read_starts{$1} = $2;
+				}
+			} while ($radek = <IN> and not $radek =~/^BS \d+ \d+/);
+			$reads_found = 0;
+			while ($reads_found < $reads) {        # expects previously specified number of reads 
+				$radek = <IN>;              # expects RD lines with read ids alternate with QA lines
+				if ($radek =~/^RD (\S+) (\d+)/) {
+					$read_id = $1;
+					$read_lengths{$read_id} = $2;
+				}
+				if ($radek =~/^QA (\d+) (\d+)/) {
+					$read_from{$read_id} = $1;
+					$read_to{$read_id} = $2;
+					$reads_found++;
+				}
+			}
+			return 1;
+		}
+	} while ($radek = <IN>);
+	return 0;
+}
+
+sub reconstruct_assembly {
+	my ($id,$min_start,$max_end,$shift,$f,$poz);
+	# global variables
+	@assembly_seq = ();     # sequence at each position of assembly; it corresponds to consensus, or regions
+	                        # before (-) and after (+) consensus [assembly may be longer than consensus]
+	@assembly_char = ();    # number of assembly positions with non-masked characters (nucleotides or gaps)
+	@assembly_masked = ();  # number of masked positions
+	$assembly_length = 0;   # total length, including - and + regions
+	
+	$min_start = 1; $max_end = 1;
+	foreach $id (keys(%read_starts)) {
+		if ($min_start > $read_starts{$id}) {
+			$min_start = $read_starts{$id};
+		}
+		if ($max_end < $read_starts{$id}+$read_lengths{$id}-1) {
+			$max_end = $read_starts{$id}+$read_lengths{$id}-1;
+		}
+	}
+	if ($min_start < 1) {
+		$shift = abs($min_start) +1;
+		$assembly_length = $shift + $max_end;
+	} else {
+		$assembly_length = $max_end;
+		$shift = 0;
+	}
+	
+	for ($f=1;$f<=$shift;$f++) {
+		$assembly_seq[$f] = "-";
+	}
+	for ($f=1;$f<=$cont_length;$f++) {
+		$assembly_seq[$f+$shift] = substr($cons,$f-1,1);
+	}
+	for ($f=$shift+$cont_length+1;$f<=$assembly_length;$f++) {
+		$assembly_seq[$f] = "+";
+	}
+
+	for ($f=1;$f<=$assembly_length;$f++) {
+		$assembly_char[$f] = 0;
+		$assembly_masked[$f] = 0;
+	}
+	foreach $id (keys(%read_starts)) {
+		for ($f=1;$f<=$read_lengths{$id};$f++) {
+			$poz = $read_starts{$id} + $shift + $f - 1;
+			if ($f>=$read_from{$id} and $f<=$read_to{$id}) {
+				$assembly_char[$poz]++;
+			} else {
+				$assembly_masked[$poz]++;
+			}
+		}
+	}
+}
+
+sub revcompl {
+	my ($input) = @_;
+	my ($base,$seq,$f);
+	
+	$seq = "";
+	for ($f=length($input)-1;$f>=0;$f--) {
+		$base = substr($input,$f,1);
+		if ($base eq "A") {
+			$seq .= "T";
+		} elsif ($base eq "T") {
+			$seq .= "A";
+		} elsif ($base eq "G") {
+			$seq .= "C";
+		} elsif ($base eq "C") {
+			$seq .= "G";
+		} elsif ($base eq "+" or $base eq "-") {
+			$seq .= $base;
+		} else {
+			$seq .= "N";
+		}
+	}
+	return $seq;
+}
+
+sub find_sites {
+	my ($f,$site_sum_char,$site_sum_masked,$out_sum_char,$site_seq,$out_sum_masked,@TG,@CA,$pos);
+	my ($masked_ratio_site,$masked_ratio_out,$site_depth,$out_masked,$region);
+	
+	# find positions of LTR borders (TG and CA)
+	@TG = ();   # positions of "T"
+	@CA = ();   # positions of "A"
+	for ($f=1;$f<$assembly_length;$f++) {
+		if ($assembly_seq[$f] eq "T" and $assembly_seq[$f+1] eq "G") {
+			push(@TG,$f);
+		}
+		if ($assembly_seq[$f] eq "C" and $assembly_seq[$f+1] eq "A") {
+			push(@CA,$f+1);
+		}
+	}
+	
+	foreach $pos (@TG) {
+		if ($pos-$window > 0 and $pos+$window-1 <= $assembly_length) {
+			$site_sum_char = 0; $site_sum_masked = 0; $site_seq = "";
+			for ($f=$pos;$f<$pos+$window;$f++) {
+				$site_sum_char += $assembly_char[$f];
+				$site_sum_masked += $assembly_masked[$f];
+				$site_seq .= $assembly_seq[$f];
+			}
+			$out_sum_char = 0; $out_sum_masked = 0;
+			for ($f=$pos-$window;$f<$pos;$f++) {
+				$out_sum_char += $assembly_char[$f];
+				$out_sum_masked += $assembly_masked[$f];
+			}
+			$site_depth = sprintf("%0.1f",$site_sum_char/$window);   # average read (unmasked) depth over the site
+			$out_masked = sprintf("%0.1f",$out_sum_masked/$window);  # average number of masked reads outside the site
+			$masked_ratio_site = sprintf("%0.4f",$site_sum_masked/($site_sum_masked+$site_sum_char));
+			$masked_ratio_out  = sprintf("%0.4f",$out_sum_masked/($out_sum_masked+$out_sum_char));
+			if ($site_depth >= $min_site_depth and $out_masked >= $min_out_masked) {
+				if ($masked_ratio_out >= ($min_masked_fold_diff * $masked_ratio_site) and $max_char_to_masked >= ($site_depth/$out_masked)) {
+					print OUT "$cl\t$contig\tTG\t$pos\t$site_seq\t$site_depth\t$out_masked\t$masked_ratio_site\t$masked_ratio_out\t";
+					$region = "";
+					for ($f=$pos;$f<=$assembly_length;$f++) {
+						if ($assembly_seq[$f] ne "*") {
+							$region .= $assembly_seq[$f];
+						}
+						if (length($region) == $extract_region) {
+							$f = $assembly_length;  # terminate cycle
+						}
+					}
+					print OUT "$region\t";
+					print LTR ">CL",$cl,"c".$contig."_TG_$pos\n";
+					$region = &revcompl($region);
+					print LTR "$region\n";
+					$region = "";
+					for ($f=$pos-1;$f>0;$f=$f-1) {
+						if ($assembly_seq[$f] ne "*") {
+							$region = $assembly_seq[$f].$region;
+						}
+						if (length($region) == $extract_region) {
+							$f = 0;  # terminate cycle
+						}
+					}
+					print OUT "$region\n";
+					print ADJ ">CL",$cl,"c".$contig."_TG_$pos\n";
+					$region = &revcompl($region);
+					print ADJ "$region\n";
+				}
+			}
+		}
+	}
+	
+	foreach $pos (@CA) {
+		if ($pos-$window+1 > 0 and $pos+$window <= $assembly_length) {
+			$site_sum_char = 0; $site_sum_masked = 0; $site_seq = "";
+			for ($f=$pos-$window+1;$f<=$pos;$f++) {
+				$site_sum_char += $assembly_char[$f];
+				$site_sum_masked += $assembly_masked[$f];
+				$site_seq .= $assembly_seq[$f];
+			}
+			$out_sum_char = 0; $out_sum_masked = 0;
+			for ($f=$pos+1;$f<=$pos+$window;$f++) {
+				$out_sum_char += $assembly_char[$f];
+				$out_sum_masked += $assembly_masked[$f];
+			}
+			$site_depth = sprintf("%0.1f",$site_sum_char/$window);   # average read (unmasked) depth over the site
+			$out_masked = sprintf("%0.1f",$out_sum_masked/$window);  # average number of masked reads outside the site
+			$masked_ratio_site = sprintf("%0.4f",$site_sum_masked/($site_sum_masked+$site_sum_char));
+			$masked_ratio_out  = sprintf("%0.4f",$out_sum_masked/($out_sum_masked+$out_sum_char));
+			if ($site_depth >= $min_site_depth and $out_masked >= $min_out_masked) {
+				if ($masked_ratio_out >= ($min_masked_fold_diff * $masked_ratio_site) and $max_char_to_masked >= ($site_depth/$out_masked)) {
+					print OUT "$cl\t$contig\tCA\t$pos\t$site_seq\t$site_depth\t$out_masked\t$masked_ratio_site\t$masked_ratio_out\t";
+					$region = "";
+					for ($f=$pos;$f>0;$f=$f-1) {
+						if ($assembly_seq[$f] ne "*") {
+							$region = $assembly_seq[$f].$region;
+						}
+						if (length($region) == $extract_region) {
+							$f = 0;  # terminate cycle
+						}
+					}
+					print OUT "$region\t";
+					print LTR ">CL",$cl,"c".$contig."_CA_$pos\n";
+					print LTR "$region\n";
+					$region = "";
+					for ($f=$pos+1;$f<=$assembly_length;$f++) {
+						if ($assembly_seq[$f] ne "*") {
+							$region .= $assembly_seq[$f];
+						}
+						if (length($region) == $extract_region) {
+							$f = $assembly_length;  # terminate cycle
+						}
+					}
+					print OUT "$region\n";
+					print ADJ ">CL",$cl,"c".$contig."_CA_$pos\n";
+					print ADJ "$region\n";
+				}
+			}
+		}
+	}
+}
+
+sub add_PBS_info {
+	my ($pbs_blast_command,@pol,$rad,$prev_query,@table,$tab_length);
+	
+	$pbs_blast_command = "blastall -p blastn -d $db_PBS -i $out_ADJ -m 8 -b 1 -e 1 -W 7 -F F";
+	
+	@table = ();
+	open (TAB,$out_table) or die;
+	while ($rad = <TAB>) {
+		push(@table,$rad);
+		$tab_length++;
+	}
+	close TAB;
+	
+	open (BLAST,"$pbs_blast_command |") or die;
+	$prev_query = "";
+	while ($rad = <BLAST>) {
+		if ($rad =~/^CL(\d+)c(\d+)_(TG|CA)_(\d+)\t\S+\t\S+\t/) {   
+			if ("$1\t$2\t$3\t$4" ne $prev_query) {           # to exclude additional HSPs from the same query/subject pair
+				for ($f=0;$f<$tab_length;$f++) {       
+					@pol = split(/\t/,$table[$f]);
+					if ($pol[0] eq "$1" and $pol[1] eq "$2" and $pol[2] eq "$3" and $pol[3] eq "$4") {
+						chomp($table[$f]);
+						$table[$f] .= "\t$rad";
+						$f = $tab_length;  # terminate cycle
+					}
+				}
+				$prev_query = "$1\t$2\t$3\t$4";
+			}
+		}
+	}
+	close BLAST;
+	
+	open (TAB_WITH_BLAST,">$out_table.with_PBS_blast.csv") or die;
+	for ($f=0;$f<$tab_length;$f++) {
+		print TAB_WITH_BLAST $table[$f];
+	}
+	close TAB_WITH_BLAST;
+}
+
+
+
+