annotate rapsodyn/extractseq.pl @ 13:827da1a9a326 draft

Uploaded
author mcharles
date Mon, 19 Jan 2015 10:38:29 -0500
parents 0a6c1cfe4dc8
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
1 #!/usr/bin/perl
10
0a6c1cfe4dc8 Uploaded
mcharles
parents: 9
diff changeset
2 #V1.10 manage empty files
0a6c1cfe4dc8 Uploaded
mcharles
parents: 9
diff changeset
3 #V1.02 Trop de pb avec nbci blast+, changment du header des fasta
7
3f7b0788a1c4 Uploaded
mcharles
parents: 6
diff changeset
4 #V1.01 #Ajout d'un _ a la fin du nom pour eviter les problemes avec ncbi blast+
0
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
5
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
6 use strict;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
7 use warnings;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
8 use Getopt::Long;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
9
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
10 my $input_variant_file;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
11 my $input_assembly_file;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
12 my $WINDOWS_LENGTH = 50;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
13
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
14 GetOptions (
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
15 "input_variant_file=s" => \$input_variant_file,
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
16 "input_assembly_file=s" => \$input_assembly_file,
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
17 "window_length=i" => \$WINDOWS_LENGTH
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
18 ) or die("Error in command line arguments\n");
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
19
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
20 open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n");
10
0a6c1cfe4dc8 Uploaded
mcharles
parents: 9
diff changeset
21
0a6c1cfe4dc8 Uploaded
mcharles
parents: 9
diff changeset
22 if ( -z INV){
0a6c1cfe4dc8 Uploaded
mcharles
parents: 9
diff changeset
23 print ">empty\nAAAAA";
0a6c1cfe4dc8 Uploaded
mcharles
parents: 9
diff changeset
24 exit(0);
0a6c1cfe4dc8 Uploaded
mcharles
parents: 9
diff changeset
25 }
0a6c1cfe4dc8 Uploaded
mcharles
parents: 9
diff changeset
26
0
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
27 open(INA, $input_assembly_file) or die ("Can't open $input_assembly_file\n");
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
28
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
29 my @variant_list;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
30
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
31
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
32 ### Retrieving the assembly
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
33 my %genome;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
34
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
35 my $current_header="";
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
36 my $current_seq="";
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
37 while (my $ligne = <INA>){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
38 if ($ligne =~ /^\>(.*?)\s*$/){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
39 if ($current_header){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
40 $genome{$current_header} = $current_seq;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
41 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
42 $current_header=$1;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
43 $current_seq = "";
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
44 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
45 else {
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
46 if ($ligne=~/^([ATGCNXatgcnx]+)\s*$/){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
47 $current_seq .= $1;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
48 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
49 else {
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
50 print STDERR "Erreur Parsing n°2\n$ligne\n";
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
51 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
52 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
53 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
54 #TRAITEMENT DU DERNIER
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
55 if ($current_header){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
56 $genome{$current_header} = $current_seq;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
57 undef($current_seq);
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
58 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
59 close (INA);
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
60
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
61
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
62 ### Retrieving the variant
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
63 while (my $ligne=<INV>){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
64 if ($ligne !~ /^\s*$/){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
65 my %variant;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
66 my @fields = split (/\s+/,$ligne);
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
67 $variant{"ref"}=$fields[0];
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
68 $variant{"position"}=$fields[1];
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
69 $variant{"baseref"}=$fields[2];
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
70 $variant{"depth"}=$fields[3];
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
71 $variant{"pileup"}=$fields[4];
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
72
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
73
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
74 my $start = &max($variant{"position"} - $WINDOWS_LENGTH,1);
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
75 my $stop = &min ($variant{"position"} + $WINDOWS_LENGTH,length($genome{$variant{"ref"}}));
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
76 my $length = $stop-$start+1;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
77
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
78 #print $variant{"position"}," / ",length($genome{$variant{"ref"}})," / ","$start / $stop / $length \n";
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
79
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
80 $variant{"SEQ"} = substr $genome{$variant{"ref"}},$start-1,$length;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
81
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
82 my $pileup = $variant{"pileup"};
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
83 $pileup =~ s/\$//g; #the read start at this position
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
84 $pileup =~ s/\^.//g; #the read end at this position
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
85 my $descriptor = $variant{"position"}."_".$variant{"depth"}."_";
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
86 if ($pileup=~/\+([0-9]+)([ACGTNacgtn]+)/){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
87 $descriptor .="I".$1."_".$2;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
88 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
89 elsif ($pileup=~/\-([0-9]+)([ACGTNacgtn]+)/){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
90 $descriptor .="D".$1."_".$2;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
91 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
92 elsif ($pileup=~/([ACGTNacgtn])/){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
93 $descriptor.="M1"."_".$1;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
94 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
95 else {
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
96 $descriptor.="?_?";
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
97 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
98 $variant{"desc"}=$descriptor;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
99
10
0a6c1cfe4dc8 Uploaded
mcharles
parents: 9
diff changeset
100 #print ">",$variant{"ref"},"_",$descriptor,"_","\n",$variant{"SEQ"},"\n";
0
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
101
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
102
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
103
10
0a6c1cfe4dc8 Uploaded
mcharles
parents: 9
diff changeset
104 #V1.02 : changement du header
0
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
105 #print ">",$variant{"ref"},"_",$variant{"position"},"_",$variant{"depth"},"\n",$variant{"SEQ"},"\n";
10
0a6c1cfe4dc8 Uploaded
mcharles
parents: 9
diff changeset
106 print ">",$variant{"ref"},"_",$variant{"position"},"_",$variant{"depth"},"\n",$variant{"SEQ"},"\n";
0
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
107
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
108 push(@variant_list,\%variant);
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
109 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
110 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
111 close (INV);
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
112
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
113
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
114
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
115
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
116
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
117
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
118
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
119
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
120
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
121 #***********
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
122 sub min{
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
123 my $first = shift;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
124 my $second = shift;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
125 if ($first <= $second){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
126 return $first;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
127 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
128 else {
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
129 return $second;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
130 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
131 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
132
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
133 sub max {
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
134 my $first = shift;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
135 my $second = shift;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
136 if ($first >= $second){
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
137 return $first;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
138 }
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
139 else {
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
140 return $second;
442a7c88b886 Uploaded
mcharles
parents:
diff changeset
141 }
6
1776b8ddd87e Uploaded
mcharles
parents: 0
diff changeset
142 }