#!/usr/bin/perl
#V1.0.1 added log, option parameters
use strict;
use warnings;
use Getopt::Long;
#v1.1.0 choose exclusion file
#v1.0.2 manage empty files
#v1.0.1 bug correction
#V1.0.1 added log, option parameters

my $input_pileup_file;
my $output_pileup_file;
my $input_exclusion_file;
my $log_file;

my $nb_base_covered=0;
my $nb_variant=0;
my $nb_variant_conserved=0;
my $do_exclusion=0;
my $nb_exclusion=0;
my $nb_excluded=0;
my %exclusion;
GetOptions (
"input_pileup_file=s" => \$input_pileup_file,
"input_exclusion_file=s" => \$input_exclusion_file,
"log_file=s" => \$log_file
) or die("Error in command line arguments\n");

open(IN, $input_pileup_file) or die ("Can't open $input_pileup_file\n");
if ( -z IN){
	exit(0);
}

if ($input_exclusion_file){
	open (EX,$input_exclusion_file)  or die ("Can't open $input_exclusion_file\n");
	if ( -z EX){
		
	}
	else {
		$do_exclusion=1;
		while (my $line=<EX>){
			$nb_exclusion++;
			chomp($line);
			my @fields = split(/\s+/,$line);
			if (($fields[0])&&($fields[1])){
				$exclusion{"$fields[0]\t$fields[1]"}=1;
			}
			else {
				print STDERR "Error formatting in Exclusion File\nExclusion file lines should be 'chromosome number' TAB 'position'\n$line\n",$fields[0],"\n",$fields[1],"\n";
			}
		}
		
	}
	close (EX);
}

#Extraction des variants
my $nb_line=0;
while (my $line=<IN>){
	#print $line;
	if ($line !~ /^\s*$/){
		my @fields = split(/\s+/,$line);
		if ($fields[4]){
			$nb_base_covered++;
			my $pile = $fields[4];
			$pile =~ s/\$//g; #the read start at this position
			$pile =~ s/\^.//g; #the read end at this position followed by quality char
			if ($fields[4]=~/[ATGCN]/i){ #Indel are +/-\d[ATGCatgc]+ and SNP are [ATGCatgc]+ so [ATGCN]/i detection cover indel and snp
				$nb_variant++;
				if (($do_exclusion==1)&&($exclusion{"$fields[0]\t$fields[1]"})){
					
				}
				else {
					print $line;
					$nb_variant_conserved++;
				}
			}
		}
		elsif ($fields[3]==0){
		}
		else {
			print STDERR "Error in pileup format\nPileup result expected at column 5\n$line\n"; 
		}
	}
	#print $line;
	
	
}
close(IN);

open (LF,">$log_file") or die("Can't open $log_file\n");
print LF "\n####\t Variant extraction \n";
print LF "Position covered\t:\t$nb_base_covered\n";
print LF "Variant detected\t:\t$nb_variant\n";
print LF "Variant conserved\t:\t$nb_variant_conserved\n";
close (LF);
