diff rapsodyn/extractseq.pl @ 0:442a7c88b886 draft

Uploaded
author mcharles
date Wed, 10 Sep 2014 09:18:15 -0400
parents
children 1776b8ddd87e
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rapsodyn/extractseq.pl	Wed Sep 10 09:18:15 2014 -0400
@@ -0,0 +1,132 @@
+#!/usr/bin/perl
+#V1.10
+
+use strict;
+use warnings;
+use Getopt::Long;
+
+my $input_variant_file;
+my $input_assembly_file;
+my $WINDOWS_LENGTH = 50;
+
+GetOptions (
+"input_variant_file=s" => \$input_variant_file,
+"input_assembly_file=s" => \$input_assembly_file,
+"window_length=i" => \$WINDOWS_LENGTH
+) or die("Error in command line arguments\n");
+
+open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n");
+open(INA, $input_assembly_file) or die ("Can't open $input_assembly_file\n");
+
+my @variant_list;
+
+
+### Retrieving the assembly
+my %genome;
+
+my $current_header="";
+my $current_seq="";
+while (my $ligne = <INA>){
+	if ($ligne =~ /^\>(.*?)\s*$/){
+		if ($current_header){
+			$genome{$current_header} = $current_seq;
+		}
+		$current_header=$1;
+		$current_seq = "";
+	}
+	else {
+		if ($ligne=~/^([ATGCNXatgcnx]+)\s*$/){
+			$current_seq .= $1;
+		}
+		else {
+			print STDERR "Erreur Parsing n°2\n$ligne\n";
+		}
+	}
+}
+#TRAITEMENT DU DERNIER
+if ($current_header){
+	$genome{$current_header} = $current_seq;
+	undef($current_seq);
+}
+close (INA);
+
+
+### Retrieving the variant
+while (my $ligne=<INV>){
+	if ($ligne !~ /^\s*$/){
+		my %variant;
+		my @fields = split (/\s+/,$ligne);
+		$variant{"ref"}=$fields[0];
+		$variant{"position"}=$fields[1];
+		$variant{"baseref"}=$fields[2];
+		$variant{"depth"}=$fields[3];
+		$variant{"pileup"}=$fields[4];
+		
+		
+		my $start = &max($variant{"position"} - $WINDOWS_LENGTH,1);
+		my $stop = &min ($variant{"position"} + $WINDOWS_LENGTH,length($genome{$variant{"ref"}}));
+		my $length = $stop-$start+1;
+		
+		#print $variant{"position"}," / ",length($genome{$variant{"ref"}})," / ","$start / $stop / $length \n";
+
+		$variant{"SEQ"} = substr $genome{$variant{"ref"}},$start-1,$length;
+		
+		my $pileup = $variant{"pileup"};
+		$pileup =~ s/\$//g; #the read start at this position
+		$pileup =~ s/\^.//g; #the read end at this position
+		my $descriptor = $variant{"position"}."_".$variant{"depth"}."_";
+		if ($pileup=~/\+([0-9]+)([ACGTNacgtn]+)/){
+			$descriptor .="I".$1."_".$2;
+		}
+		elsif ($pileup=~/\-([0-9]+)([ACGTNacgtn]+)/){
+			$descriptor .="D".$1."_".$2;
+		}
+		elsif ($pileup=~/([ACGTNacgtn])/){
+			$descriptor.="M1"."_".$1;
+		}
+		else {
+			$descriptor.="?_?";
+		}
+		$variant{"desc"}=$descriptor;
+		
+		print ">",$variant{"ref"},"_",$descriptor,"\n",$variant{"SEQ"},"\n";
+
+		
+		
+		#print ">",$variant{"ref"},"_",$variant{"position"},"_",$variant{"depth"},"\n",$variant{"SEQ"},"\n";
+		
+		push(@variant_list,\%variant);
+	}
+}
+close (INV);
+
+
+
+
+
+
+
+
+
+#***********
+sub min{
+	my $first = shift;
+	my $second = shift;
+	if ($first <= $second){
+		return $first;
+	}
+	else {
+		return $second;
+	}
+}
+
+sub max {
+	my $first = shift;
+	my $second = shift;
+	if ($first >= $second){
+		return $first;
+	}
+	else {
+		return $second;
+	}
+}
\ No newline at end of file