Mercurial > repos > mcharles > rapsosnp
comparison rapsodyn/ParseBlastForUniqueMatch.pl @ 0:442a7c88b886 draft
Uploaded
author | mcharles |
---|---|
date | Wed, 10 Sep 2014 09:18:15 -0400 |
parents | |
children | 3f7b0788a1c4 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:442a7c88b886 |
---|---|
1 #!/usr/bin/perl | |
2 use strict; | |
3 use warnings; | |
4 | |
5 | |
6 my $input_variant_file = $ARGV[0]; | |
7 my $input_blast_file = $ARGV[1]; | |
8 my $window_length = $ARGV[2]; | |
9 my $nb_mismatch_max = $ARGV[3]; | |
10 | |
11 my %hash_name; | |
12 | |
13 open(INB, $input_blast_file) or die ("Can't open $input_blast_file\n"); | |
14 while (my $line =<INB>){ | |
15 my @fields = split (/\s+/,$line); | |
16 # print $#fields,"\n"; | |
17 # print $line; | |
18 # exit(0); | |
19 my $query_name; | |
20 my $query_start; | |
21 my $query_stop; | |
22 my $query_aln; | |
23 my $subject_aln; | |
24 my $compt_mismatch_5p=0; | |
25 my $compt_mismatch_3p=0; | |
26 | |
27 if ($#fields == 24){ | |
28 $query_name = $fields[0]; | |
29 $query_start = $fields[6]; | |
30 $query_stop = $fields[7]; | |
31 $query_aln = $fields[20]; | |
32 $subject_aln = $fields[21]; | |
33 } | |
34 elsif ($#fields == 4){ | |
35 $query_name = $fields[0]; | |
36 $query_start = $fields[1]; | |
37 $query_stop = $fields[2]; | |
38 $query_aln = $fields[3]; | |
39 $subject_aln = $fields[4]; | |
40 } | |
41 else { | |
42 print STDERR "Unrecongnized tabular format for blast result\nScript works with 25 column or 5 custom column(qseqid,qstart,qend,ssseq,sseq)\n"; | |
43 exit(0); | |
44 } | |
45 | |
46 | |
47 my @field_name = split (/\_/,$query_name); | |
48 my $name = $field_name[0]."_".$field_name[1]; | |
49 if ($query_name =~ /random/){ | |
50 $name .= "_".$field_name[2]; | |
51 # print $name."\n"; | |
52 | |
53 } | |
54 if (!$hash_name{$name}){ | |
55 $hash_name{$name}=0; | |
56 } | |
57 elsif ($hash_name{$name}>1){ | |
58 next; | |
59 } | |
60 # if ($query_name eq "chrUnn_random_8279117_1_M1_a"){ | |
61 # print "READ : $query_name\n$name\n"; | |
62 # print "HASH : ",$hash_name{$name},"\n"; | |
63 # # exit(0); | |
64 # } | |
65 | |
66 | |
67 | |
68 chomp($query_aln); | |
69 chomp($subject_aln); | |
70 | |
71 my $nb_gap_query=0; | |
72 | |
73 if (length($query_aln) == length($subject_aln)){ | |
74 if (length($query_aln)<$window_length-$nb_mismatch_max){ | |
75 } | |
76 else { | |
77 my @q = split(//,$query_aln); | |
78 my @s = split(//,$subject_aln); | |
79 for (my $i=0;$i<=$#q;$i++){ | |
80 my $global_idx = $query_start-1+$i-$nb_gap_query; | |
81 if ($q[$i] eq "-"){ | |
82 if ($global_idx < $window_length){ | |
83 $compt_mismatch_5p++; | |
84 } | |
85 elsif ($global_idx > $window_length){ | |
86 $compt_mismatch_3p++; | |
87 } | |
88 $nb_gap_query++; #On compte les gap dans la query pour les soustraire de l'index global | |
89 } | |
90 else { | |
91 if ($q[$i] ne $s[$i]){ | |
92 if ($global_idx < $window_length){ | |
93 $compt_mismatch_5p++; | |
94 } | |
95 elsif ($global_idx > $window_length){ | |
96 $compt_mismatch_3p++; | |
97 } | |
98 } | |
99 } | |
100 } | |
101 $compt_mismatch_5p += $query_start-1; | |
102 $compt_mismatch_3p += $window_length *2 + 1 - $query_stop; | |
103 | |
104 # for (my $i=0;$i<$window_length;$i++){ | |
105 # if ($tbl_q_aln[$i] eq "#"){ | |
106 # $compt_mismatch_5p++; | |
107 # } | |
108 # elsif ($tbl_q_aln[$i] ne $tbl_s_aln[$i]){ | |
109 # $compt_mismatch_5p++; | |
110 # } | |
111 # else { | |
112 # } | |
113 | |
114 # } | |
115 # for (my $i=$window_length+1;$i<=$window_length*2;$i++){ | |
116 # if ($tbl_q_aln[$i] eq "#"){ | |
117 # $compt_mismatch_3p++; | |
118 # } | |
119 # elsif ($tbl_q_aln[$i] ne $tbl_s_aln[$i]){ | |
120 # $compt_mismatch_3p++; | |
121 # } | |
122 # else { | |
123 # } | |
124 # } | |
125 if (($compt_mismatch_5p <= $nb_mismatch_max)||($compt_mismatch_3p <= $nb_mismatch_max)){ | |
126 $hash_name{$name}++; | |
127 } | |
128 | |
129 } | |
130 } | |
131 else { | |
132 print STDERR "incompatible subject and query alignement length\n $query_aln\n$subject_aln\n"; | |
133 } | |
134 | |
135 # if ($line=~/chrCnn_random_49828229/){ | |
136 # print $line; | |
137 # print $query_aln,"\n"; | |
138 # print $subject_aln,"\n"; | |
139 # print $compt_mismatch_5p,"\t",$compt_mismatch_3p,"\n"; | |
140 # print $hash_name{"chrCnn_random_49828229"},"\n"; | |
141 # print "\n"; | |
142 # } | |
143 | |
144 | |
145 | |
146 } | |
147 # exit(0); | |
148 | |
149 close (INB); | |
150 | |
151 open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n"); | |
152 | |
153 while (my $ligne = <INV>) { | |
154 | |
155 my @champs = split (/\s+/,$ligne); | |
156 my $header = $champs[0]."_".$champs[1]; | |
157 | |
158 if ($hash_name{$header}){ | |
159 if ($hash_name{$header}==1){ | |
160 print $ligne; | |
161 } | |
162 } | |
163 else { | |
164 #print STDERR "No blast result for ",$header,"\n"; | |
165 } | |
166 | |
167 | |
168 } | |
169 | |
170 close(INV); | |
171 | |
172 | |
173 # foreach my $key (sort keys %hash_name){ | |
174 # print $key,"\t",$hash_name{$key},"\n"; | |
175 | |
176 # } |