0
|
1 #!/usr/bin/perl
|
10
|
2 #V1.10 manage empty files
|
|
3 #V1.02 Trop de pb avec nbci blast+, changment du header des fasta
|
7
|
4 #V1.01 #Ajout d'un _ a la fin du nom pour eviter les problemes avec ncbi blast+
|
0
|
5
|
|
6 use strict;
|
|
7 use warnings;
|
|
8 use Getopt::Long;
|
|
9
|
|
10 my $input_variant_file;
|
|
11 my $input_assembly_file;
|
|
12 my $WINDOWS_LENGTH = 50;
|
|
13
|
|
14 GetOptions (
|
|
15 "input_variant_file=s" => \$input_variant_file,
|
|
16 "input_assembly_file=s" => \$input_assembly_file,
|
|
17 "window_length=i" => \$WINDOWS_LENGTH
|
|
18 ) or die("Error in command line arguments\n");
|
|
19
|
|
20 open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n");
|
10
|
21
|
|
22 if ( -z INV){
|
|
23 print ">empty\nAAAAA";
|
|
24 exit(0);
|
|
25 }
|
|
26
|
0
|
27 open(INA, $input_assembly_file) or die ("Can't open $input_assembly_file\n");
|
|
28
|
|
29 my @variant_list;
|
|
30
|
|
31
|
|
32 ### Retrieving the assembly
|
|
33 my %genome;
|
|
34
|
|
35 my $current_header="";
|
|
36 my $current_seq="";
|
|
37 while (my $ligne = <INA>){
|
|
38 if ($ligne =~ /^\>(.*?)\s*$/){
|
|
39 if ($current_header){
|
|
40 $genome{$current_header} = $current_seq;
|
|
41 }
|
|
42 $current_header=$1;
|
|
43 $current_seq = "";
|
|
44 }
|
|
45 else {
|
|
46 if ($ligne=~/^([ATGCNXatgcnx]+)\s*$/){
|
|
47 $current_seq .= $1;
|
|
48 }
|
|
49 else {
|
|
50 print STDERR "Erreur Parsing n°2\n$ligne\n";
|
|
51 }
|
|
52 }
|
|
53 }
|
|
54 #TRAITEMENT DU DERNIER
|
|
55 if ($current_header){
|
|
56 $genome{$current_header} = $current_seq;
|
|
57 undef($current_seq);
|
|
58 }
|
|
59 close (INA);
|
|
60
|
|
61
|
|
62 ### Retrieving the variant
|
|
63 while (my $ligne=<INV>){
|
|
64 if ($ligne !~ /^\s*$/){
|
|
65 my %variant;
|
|
66 my @fields = split (/\s+/,$ligne);
|
|
67 $variant{"ref"}=$fields[0];
|
|
68 $variant{"position"}=$fields[1];
|
|
69 $variant{"baseref"}=$fields[2];
|
|
70 $variant{"depth"}=$fields[3];
|
|
71 $variant{"pileup"}=$fields[4];
|
|
72
|
|
73
|
|
74 my $start = &max($variant{"position"} - $WINDOWS_LENGTH,1);
|
|
75 my $stop = &min ($variant{"position"} + $WINDOWS_LENGTH,length($genome{$variant{"ref"}}));
|
|
76 my $length = $stop-$start+1;
|
|
77
|
|
78 #print $variant{"position"}," / ",length($genome{$variant{"ref"}})," / ","$start / $stop / $length \n";
|
|
79
|
|
80 $variant{"SEQ"} = substr $genome{$variant{"ref"}},$start-1,$length;
|
|
81
|
|
82 my $pileup = $variant{"pileup"};
|
|
83 $pileup =~ s/\$//g; #the read start at this position
|
|
84 $pileup =~ s/\^.//g; #the read end at this position
|
|
85 my $descriptor = $variant{"position"}."_".$variant{"depth"}."_";
|
|
86 if ($pileup=~/\+([0-9]+)([ACGTNacgtn]+)/){
|
|
87 $descriptor .="I".$1."_".$2;
|
|
88 }
|
|
89 elsif ($pileup=~/\-([0-9]+)([ACGTNacgtn]+)/){
|
|
90 $descriptor .="D".$1."_".$2;
|
|
91 }
|
|
92 elsif ($pileup=~/([ACGTNacgtn])/){
|
|
93 $descriptor.="M1"."_".$1;
|
|
94 }
|
|
95 else {
|
|
96 $descriptor.="?_?";
|
|
97 }
|
|
98 $variant{"desc"}=$descriptor;
|
|
99
|
10
|
100 #print ">",$variant{"ref"},"_",$descriptor,"_","\n",$variant{"SEQ"},"\n";
|
0
|
101
|
|
102
|
|
103
|
10
|
104 #V1.02 : changement du header
|
0
|
105 #print ">",$variant{"ref"},"_",$variant{"position"},"_",$variant{"depth"},"\n",$variant{"SEQ"},"\n";
|
10
|
106 print ">",$variant{"ref"},"_",$variant{"position"},"_",$variant{"depth"},"\n",$variant{"SEQ"},"\n";
|
0
|
107
|
|
108 push(@variant_list,\%variant);
|
|
109 }
|
|
110 }
|
|
111 close (INV);
|
|
112
|
|
113
|
|
114
|
|
115
|
|
116
|
|
117
|
|
118
|
|
119
|
|
120
|
|
121 #***********
|
|
122 sub min{
|
|
123 my $first = shift;
|
|
124 my $second = shift;
|
|
125 if ($first <= $second){
|
|
126 return $first;
|
|
127 }
|
|
128 else {
|
|
129 return $second;
|
|
130 }
|
|
131 }
|
|
132
|
|
133 sub max {
|
|
134 my $first = shift;
|
|
135 my $second = shift;
|
|
136 if ($first >= $second){
|
|
137 return $first;
|
|
138 }
|
|
139 else {
|
|
140 return $second;
|
|
141 }
|
6
|
142 }
|