view rapsodyn/extractseq.pl @ 14:93e6f2af1ce2 draft

Uploaded
author mcharles
date Mon, 26 Jan 2015 18:10:52 -0500
parents 0a6c1cfe4dc8
children
line wrap: on
line source

#!/usr/bin/perl
#V1.10 manage empty files
#V1.02 Trop de pb avec nbci blast+, changment du header des fasta
#V1.01 #Ajout d'un _ a la fin du nom pour eviter les problemes avec ncbi blast+

use strict;
use warnings;
use Getopt::Long;

my $input_variant_file;
my $input_assembly_file;
my $WINDOWS_LENGTH = 50;

GetOptions (
"input_variant_file=s" => \$input_variant_file,
"input_assembly_file=s" => \$input_assembly_file,
"window_length=i" => \$WINDOWS_LENGTH
) or die("Error in command line arguments\n");

open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n");

if ( -z INV){
	print ">empty\nAAAAA";
	exit(0);
}

open(INA, $input_assembly_file) or die ("Can't open $input_assembly_file\n");

my @variant_list;


### Retrieving the assembly
my %genome;

my $current_header="";
my $current_seq="";
while (my $ligne = <INA>){
	if ($ligne =~ /^\>(.*?)\s*$/){
		if ($current_header){
			$genome{$current_header} = $current_seq;
		}
		$current_header=$1;
		$current_seq = "";
	}
	else {
		if ($ligne=~/^([ATGCNXatgcnx]+)\s*$/){
			$current_seq .= $1;
		}
		else {
			print STDERR "Erreur Parsing n°2\n$ligne\n";
		}
	}
}
#TRAITEMENT DU DERNIER
if ($current_header){
	$genome{$current_header} = $current_seq;
	undef($current_seq);
}
close (INA);


### Retrieving the variant
while (my $ligne=<INV>){
	if ($ligne !~ /^\s*$/){
		my %variant;
		my @fields = split (/\s+/,$ligne);
		$variant{"ref"}=$fields[0];
		$variant{"position"}=$fields[1];
		$variant{"baseref"}=$fields[2];
		$variant{"depth"}=$fields[3];
		$variant{"pileup"}=$fields[4];
		
		
		my $start = &max($variant{"position"} - $WINDOWS_LENGTH,1);
		my $stop = &min ($variant{"position"} + $WINDOWS_LENGTH,length($genome{$variant{"ref"}}));
		my $length = $stop-$start+1;
		
		#print $variant{"position"}," / ",length($genome{$variant{"ref"}})," / ","$start / $stop / $length \n";

		$variant{"SEQ"} = substr $genome{$variant{"ref"}},$start-1,$length;
		
		my $pileup = $variant{"pileup"};
		$pileup =~ s/\$//g; #the read start at this position
		$pileup =~ s/\^.//g; #the read end at this position
		my $descriptor = $variant{"position"}."_".$variant{"depth"}."_";
		if ($pileup=~/\+([0-9]+)([ACGTNacgtn]+)/){
			$descriptor .="I".$1."_".$2;
		}
		elsif ($pileup=~/\-([0-9]+)([ACGTNacgtn]+)/){
			$descriptor .="D".$1."_".$2;
		}
		elsif ($pileup=~/([ACGTNacgtn])/){
			$descriptor.="M1"."_".$1;
		}
		else {
			$descriptor.="?_?";
		}
		$variant{"desc"}=$descriptor;
		
		#print ">",$variant{"ref"},"_",$descriptor,"_","\n",$variant{"SEQ"},"\n";

		
		
		#V1.02 : changement du header
		#print ">",$variant{"ref"},"_",$variant{"position"},"_",$variant{"depth"},"\n",$variant{"SEQ"},"\n";
		print ">",$variant{"ref"},"_",$variant{"position"},"_",$variant{"depth"},"\n",$variant{"SEQ"},"\n";
		
		push(@variant_list,\%variant);
	}
}
close (INV);









#***********
sub min{
	my $first = shift;
	my $second = shift;
	if ($first <= $second){
		return $first;
	}
	else {
		return $second;
	}
}

sub max {
	my $first = shift;
	my $second = shift;
	if ($first >= $second){
		return $first;
	}
	else {
		return $second;
	}
}