Mercurial > repos > plus91-technologies-pvt-ltd > softsearch
comparison 2.4/src/standalone_blat2.pl @ 13:e3609c8714fb draft
Uploaded
author | plus91-technologies-pvt-ltd |
---|---|
date | Fri, 30 May 2014 03:37:55 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
12:980fce54e60a | 13:e3609c8714fb |
---|---|
1 #!/usr/bin/perl -sw | |
2 use Getopt::Long; | |
3 sub usage(){ | |
4 print " | |
5 Usage: <VCF> -g <genome.2bit> -seq|s <seq.fa> -f genome.fa | |
6 -o out.vcf | |
7 -n contig.names | |
8 -dist how wide of a window to look for bp [50]\n | |
9 -v verbose option | |
10 Requires samtools,bedTools, and blat in your path\n; | |
11 "; | |
12 die; | |
13 } | |
14 #Initialize values | |
15 my ($blat,$genome,$tei_bed,$vntr_bed,$out_vcf,$contig_names,$contig,$fasta,$uninformative_contigs,$dist,$verbose,$bedTools,$samtools); | |
16 GetOptions ("genome|g=s" => \$genome, | |
17 "o|out:s" => \$out_vcf, | |
18 "names|n:s" => \$contig_names, | |
19 "seq|s=s" => \$contig, | |
20 "f|fasta:s" => \$fasta, | |
21 "b|bad:s" => \$uninformative_contigs, | |
22 "dist:s" => \$dist, | |
23 "v" => \$verbose | |
24 ); | |
25 #$genome="/data2/bsi/reference/db/hg19.2bit"" | |
26 #$blat="/projects/bsi/bictools/apps/alignment/blat/34/blat" ; | |
27 #TEI.bed=egrep "LINE|SINE|LTR" /data5/bsi/epibreast/m087494.couch/Reference_Data/Annotations/hg19.repeatMasker.bed >TEI.bed | |
28 #VNTR_BED=egrep "Satellite|Simple_repeat" /data5/bsi/epibreast/m087494.couch/Reference_Data/Annotations/hg19.repeatMasker.bed > VNTR.bed | |
29 | |
30 | |
31 $blat=`which blat`; | |
32 if (!$blat) {die "Your do not have BLAT in your path\n\n"} | |
33 $samtools=`which samtools`; | |
34 if (!$samtools) {die "Your do not have samtools in your path\n\n"} | |
35 $bedTools=`which sortBed`; | |
36 if (!$bedTools) {die "Your do not have bedTools in your path\n\n"} | |
37 | |
38 | |
39 if (!$dist) {$dist=50} | |
40 if (!$out_vcf) {$out_vcf="out.vcf"} | |
41 if (!$contig_names) {$contig_names="contig.names"} | |
42 if (!$uninformative_contigs) {$uninformative_contigs="uninformative.contigs"} | |
43 | |
44 if ((!$genome)||(!$contig)||(!$fasta)){&usage;die;} | |
45 | |
46 | |
47 open(VCF,"$ARGV[0]") or die "must specify VCF file\n\n"; | |
48 open(OUT_VCF,">",$out_vcf) or die "can't open the output VCF\n"; | |
49 open(CONTIG_LIST,">",$contig_names) or die "can't open the contig names\n"; | |
50 open(BAD_CONTIG_LIST,">",$uninformative_contigs) or die "can't open the contig names\n"; | |
51 #print "writing to CONTIG_LIST=$contig_names\n"; | |
52 while (<VCF>) { | |
53 if($_=~/^#/){ | |
54 if ($.==1) { | |
55 print OUT_VCF $_; | |
56 print OUT_VCF "##INFO=<ID=STRAND,Number=1,Type=String,Description=\"Strand to which assembled contig aligned\">\n"; | |
57 print OUT_VCF "##INFO=<ID=CONTIG,Number=1,Type=String,Description=\"Name of assembeled contig matching event\">\n"; | |
58 print OUT_VCF "##INFO=<ID=MECHANISM,Number=1,Type=String,Description=\"Proposed mechanism of how the event arose\">\n"; | |
59 print OUT_VCF "##INFO=<ID=INSLEN,Number=1,Type=Integer,Description=\"Length of insertion\">\n"; | |
60 print OUT_VCF "##INFO=<ID=HOM_LEN,Number=1,Type=Integer,Description=\"Length of microhomology\">\n"; | |
61 next; | |
62 } | |
63 else { | |
64 print OUT_VCF $_; | |
65 next; | |
66 } | |
67 }; | |
68 chomp; | |
69 | |
70 ##look for exact location of BP | |
71 @line=split("\t",$_); | |
72 my($left_chr,$start,$end); | |
73 | |
74 #Get left position | |
75 $left_chr=$line[0]; | |
76 $start=$line[1]-$dist; | |
77 $end=$line[1]+$dist; | |
78 | |
79 #Get right position | |
80 my ($mate_pos,@mate,$mate_chr,$mate_start,$mate_end); | |
81 $mate_pos=$line[4]; | |
82 $mate_pos=~s/[\[|\]|A-Z]//g; | |
83 #print "mate_pos=$mate_pos\n"; | |
84 @mate=split(/:/,$mate_pos); | |
85 $mate_chr=$mate[0]; $mate_pos=$mate[1]; | |
86 $mate_start=$mate_pos-$dist;$mate_end=$mate_pos+$dist; | |
87 #print "$left_chr:$start-$end\n$mate_chr:$mate_start-$mate_end\n"; | |
88 | |
89 #Run through blat | |
90 my ($result1,$result2); | |
91 my $target1=join("",$left_chr,":",$start,"-",$end); | |
92 my $target2=join("",$mate_chr,":",$mate_start,"-",$mate_end); | |
93 #print "target1=$target1\ttarget2=$target2\n";die; | |
94 $result1=get_result($target1); | |
95 $result2=get_result($target2); | |
96 | |
97 | |
98 my $NOV_INS=""; | |
99 #If there is a NOV_INS, then there shouldn't be any output, so trick the results | |
100 if ($_=~/EVENT=NOV_INS/) { | |
101 $mate_start=$start; | |
102 $NOV_INS="true"; | |
103 if (!$result1) {$result1=join("\t","0","0","0","0","0","0","0","0","+","UNKNOWN_NODE","0","0",$dist);} | |
104 if (!$result2) {$result2=join("\t","0","0","0","0","0","0","0","0","+","UNKNOWN_NODE","0","0",$dist);} | |
105 } | |
106 | |
107 #Skip over events that aren't supported | |
108 if ((!$result1)||(!$result2)){ | |
109 my @tmp1=split("\t",$result1); | |
110 my @tmp2=split("\t",$result2); | |
111 if ($tmp1[9]) {print BAD_CONTIG_LIST "$tmp1[9]\n"} | |
112 if ($tmp2[9]) {print BAD_CONTIG_LIST "$tmp2[9]\n" } | |
113 next; | |
114 } | |
115 #Parse blat results | |
116 my @result1=split("\t",$result1); | |
117 my @result2=split("\t",$result2); | |
118 if($result2[9] ne $result1[9]){print "$result2[9] != $result1[9]\n";next} | |
119 #print "@result1\n@result2\n";die; | |
120 my $pos1=$start+($result1[12]-$result1[11]); | |
121 my $pos2=$mate_start+($result2[12]-$result2[11]); | |
122 #print "$_\n$pos1\t$pos2\n"; | |
123 | |
124 ############################################################## | |
125 ### Build Classifier | |
126 | |
127 my ($QSTART1,$QEND1,$QSTART2,$QEND2,$len,$MECHANISM, $INSERTION, $DELETION, $bed_res1,$bed_res2); | |
128 $MECHANISM="UNKNOWN"; | |
129 $len="UNKNOWN"; | |
130 #Make sure the later event is second | |
131 if ($result1[11] < $result2[11]){ | |
132 $QSTART1=$result1[11]; | |
133 $QEND1=$result1[12]; | |
134 $QSTART2=$result2[11]; | |
135 $QEND2=$result2[12]; | |
136 } | |
137 else{ | |
138 $QSTART1=$result2[11]; | |
139 $QEND1=$result2[12]; | |
140 $QSTART2=$result1[11]; | |
141 $QEND2=$result1[12]; | |
142 } | |
143 #Now calculate the difference between $QEND1 and QSTART2 | |
144 if($verbose){print "QEND1=$QEND1\tQSTART2=$QSTART2\n";} | |
145 $len=$QEND1-$QSTART2; | |
146 #Check for TEI | |
147 if($_=~/MECHANISM=TEI/){$MECHANISM="TEI"} | |
148 elsif($_=~/MECHANISM=VNTR/){$MECHANISM="VNTR"} | |
149 else{ | |
150 if ($len==0) {$MECHANISM="NHEJ"} | |
151 else{ | |
152 if ($len>0){$INSERTION="true"} | |
153 if ($len<0){$DELETION="true"} | |
154 if ($INSERTION){ | |
155 if ($len>10) {$MECHANISM="FOSTES"} | |
156 else{$MECHANISM="NHEJ"} | |
157 } | |
158 elsif ($DELETION){ | |
159 if ($len>100) {$MECHANISM="NAHR"} | |
160 elsif ($len > 2){$MECHANISM="altEJ"} | |
161 else{$MECHANISM="NHEJ"} | |
162 } | |
163 } | |
164 } | |
165 | |
166 | |
167 #if ($verbose){print "@result1";print "@result2";} | |
168 | |
169 #print out VCF | |
170 ############################################################# | |
171 # create temporary variable name | |
172 ############################################################# | |
173 srand (time ^ $$ ^ unpack "%L*", `ps axww | gzip -f`); | |
174 my $random_name=join "", map { ("a".."z")[rand 26] } 1..8; | |
175 my $random_name2=join "", map { ("a".."z")[rand 26] } 1..8; | |
176 | |
177 #Get Ref Base | |
178 my ($ref_base,$alt_base,$tmp_mate_pos); | |
179 $ref_base=getBases($left_chr,$pos1,$fasta); | |
180 $alt_base=getBases($mate_chr,$pos2,$fasta);#print "ALT=$alt_base\n"; | |
181 #Substitute the new mate position and base | |
182 $tmp_mate_pos=$line[4]; | |
183 $tmp_mate_pos=~s/$mate_pos/$pos2/; | |
184 $tmp_mate_pos=~s/[A-Z]/$alt_base/; | |
185 #split apart the INFO field to adjust the ISIZE and MATEID | |
186 my $NEW_INFO=""; | |
187 my @INFO=split(/;/,$line[7]); | |
188 for (my $i=0;$i<@INFO;$i++){ | |
189 if ($INFO[$i] =~ /^ISIZE=/){ | |
190 my @tmp=split(/=/,$INFO[$i]); | |
191 $NEW_INFO.="ISIZE="; | |
192 my $new_ISZIE=$pos2-$pos1; | |
193 $NEW_INFO.=$new_ISZIE | |
194 } | |
195 elsif($INFO[$i] =~ /^MATE_ID=/){ | |
196 $NEW_INFO.=";MATE_ID=".$random_name2 . ";"; | |
197 } | |
198 else{ | |
199 $NEW_INFO.=$INFO[$i].";"; | |
200 } | |
201 } | |
202 #ADD in strand and name | |
203 $NEW_INFO.="STRAND=".$result1[8]; | |
204 $NEW_INFO.=";CONTIG=".$result1[9]; | |
205 if($MECHANISM!~/TEI|VNTR/){$NEW_INFO.=";MECHANISM=".$MECHANISM;} | |
206 $NEW_INFO.=";HOM_LEN=".$len; | |
207 #don't pring contig nage if its a novel insertion | |
208 if(!$NOV_INS){print CONTIG_LIST "$result1[9]\n";}#else{print "I'm not printing $result1[9]\n";} | |
209 print OUT_VCF "$left_chr\t$pos1\t$random_name\t$ref_base\t$tmp_mate_pos\t1000\tPASS\t$NEW_INFO\t$line[8]\t$line[9]\n"; | |
210 #Now go through and fill info in for mate | |
211 #Substitute the new mate position and base | |
212 $tmp_mate_pos=$line[4]; | |
213 $tmp_mate_pos=~s/$mate_pos/$pos1/; | |
214 $tmp_mate_pos=~s/[A-Z]/$ref_base/; | |
215 $tmp_mate_pos=~s/$mate_chr/$left_chr/; | |
216 $NEW_INFO=""; | |
217 @INFO=split(/;/,$line[7]); | |
218 for (my $i=0;$i<@INFO;$i++){ | |
219 if ($INFO[$i] =~ /^ISIZE=/){ | |
220 my @tmp=split(/=/,$INFO[$i]); | |
221 $NEW_INFO.="ISIZE="; | |
222 my $new_ISZIE=$pos2-$pos1; | |
223 $NEW_INFO.=$new_ISZIE | |
224 } | |
225 elsif($INFO[$i] =~ /^MATE_ID=/){ | |
226 $NEW_INFO.=";MATE_ID=".$random_name.";"; | |
227 } | |
228 else{ | |
229 $NEW_INFO.=$INFO[$i].";"; | |
230 } | |
231 } | |
232 #ADD in strand and name | |
233 $NEW_INFO.="STRAND=".$result2[8]; | |
234 $NEW_INFO.=";CONTIG=".$result2[9]; | |
235 if ($MECHANISM!~/TEI|VNTR/){$NEW_INFO.=";MECHANISM=".$MECHANISM;} | |
236 $NEW_INFO.=";HOM_LEN=".$len; | |
237 | |
238 #don't pring contig nage if its a novel insertion | |
239 if(!$NOV_INS){print CONTIG_LIST "$result2[9]\n";} #else{print "I'm not printing $result1[9]\n";} | |
240 print OUT_VCF "$mate_chr\t$pos2\t$random_name2\t$alt_base\t$tmp_mate_pos\t1000\tPASS\t$NEW_INFO\t$line[8]\t$line[9]\n"; | |
241 if ($verbose){print "$mate_chr\t$pos2\t$random_name2\t$alt_base\t$tmp_mate_pos\t1000\tPASS\t$NEW_INFO\t$line[8]\t$line[9]\n";} | |
242 } | |
243 close VCF; | |
244 close OUT_VCF; | |
245 close CONTIG_LIST; | |
246 close BAD_CONTIG_LIST; | |
247 sub get_result{ | |
248 my $target=($_[0]); | |
249 if($verbose){print "target=$target\n"}#;die; | |
250 my $cmd="blat $genome:$target $contig /dev/stdout -t=dna -q=dna -noHead|egrep -v \"Searched|Loaded\" |head -1"; | |
251 | |
252 if ($verbose){print "$cmd\n"} #print "$cmd\n";die; | |
253 my $result=`$cmd`; | |
254 next if (!$cmd); | |
255 return ($result); | |
256 } | |
257 sub getBases{ | |
258 my ($chr,$pos1,$fasta)=@_; | |
259 my @result=(); | |
260 if ($pos1 <0){print "$pos1 is not a valid position (likely caused by circular MT chromosome)\n";$result[1]="NA";}; | |
261 @result = `samtools faidx $fasta $chr:$pos1-$pos1`; | |
262 if(!$result[1]){$result[1]="NA"}; | |
263 chomp($result[1]); | |
264 return uc($result[1]); | |
265 } | |
266 | |
267 |