annotate 2.4/src/standalone_blat2.pl @ 13:e3609c8714fb draft

Uploaded
author plus91-technologies-pvt-ltd
date Fri, 30 May 2014 03:37:55 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
13
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
1 #!/usr/bin/perl -sw
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
2 use Getopt::Long;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
3 sub usage(){
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
4 print "
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
5 Usage: <VCF> -g <genome.2bit> -seq|s <seq.fa> -f genome.fa
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
6 -o out.vcf
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
7 -n contig.names
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
8 -dist how wide of a window to look for bp [50]\n
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
9 -v verbose option
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
10 Requires samtools,bedTools, and blat in your path\n;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
11 ";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
12 die;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
13 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
14 #Initialize values
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
15 my ($blat,$genome,$tei_bed,$vntr_bed,$out_vcf,$contig_names,$contig,$fasta,$uninformative_contigs,$dist,$verbose,$bedTools,$samtools);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
16 GetOptions ("genome|g=s" => \$genome,
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
17 "o|out:s" => \$out_vcf,
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
18 "names|n:s" => \$contig_names,
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
19 "seq|s=s" => \$contig,
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
20 "f|fasta:s" => \$fasta,
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
21 "b|bad:s" => \$uninformative_contigs,
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
22 "dist:s" => \$dist,
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
23 "v" => \$verbose
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
24 );
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
25 #$genome="/data2/bsi/reference/db/hg19.2bit""
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
26 #$blat="/projects/bsi/bictools/apps/alignment/blat/34/blat" ;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
27 #TEI.bed=egrep "LINE|SINE|LTR" /data5/bsi/epibreast/m087494.couch/Reference_Data/Annotations/hg19.repeatMasker.bed >TEI.bed
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
28 #VNTR_BED=egrep "Satellite|Simple_repeat" /data5/bsi/epibreast/m087494.couch/Reference_Data/Annotations/hg19.repeatMasker.bed > VNTR.bed
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
29
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
30
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
31 $blat=`which blat`;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
32 if (!$blat) {die "Your do not have BLAT in your path\n\n"}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
33 $samtools=`which samtools`;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
34 if (!$samtools) {die "Your do not have samtools in your path\n\n"}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
35 $bedTools=`which sortBed`;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
36 if (!$bedTools) {die "Your do not have bedTools in your path\n\n"}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
37
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
38
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
39 if (!$dist) {$dist=50}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
40 if (!$out_vcf) {$out_vcf="out.vcf"}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
41 if (!$contig_names) {$contig_names="contig.names"}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
42 if (!$uninformative_contigs) {$uninformative_contigs="uninformative.contigs"}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
43
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
44 if ((!$genome)||(!$contig)||(!$fasta)){&usage;die;}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
45
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
46
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
47 open(VCF,"$ARGV[0]") or die "must specify VCF file\n\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
48 open(OUT_VCF,">",$out_vcf) or die "can't open the output VCF\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
49 open(CONTIG_LIST,">",$contig_names) or die "can't open the contig names\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
50 open(BAD_CONTIG_LIST,">",$uninformative_contigs) or die "can't open the contig names\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
51 #print "writing to CONTIG_LIST=$contig_names\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
52 while (<VCF>) {
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
53 if($_=~/^#/){
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
54 if ($.==1) {
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
55 print OUT_VCF $_;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
56 print OUT_VCF "##INFO=<ID=STRAND,Number=1,Type=String,Description=\"Strand to which assembled contig aligned\">\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
57 print OUT_VCF "##INFO=<ID=CONTIG,Number=1,Type=String,Description=\"Name of assembeled contig matching event\">\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
58 print OUT_VCF "##INFO=<ID=MECHANISM,Number=1,Type=String,Description=\"Proposed mechanism of how the event arose\">\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
59 print OUT_VCF "##INFO=<ID=INSLEN,Number=1,Type=Integer,Description=\"Length of insertion\">\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
60 print OUT_VCF "##INFO=<ID=HOM_LEN,Number=1,Type=Integer,Description=\"Length of microhomology\">\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
61 next;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
62 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
63 else {
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
64 print OUT_VCF $_;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
65 next;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
66 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
67 };
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
68 chomp;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
69
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
70 ##look for exact location of BP
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
71 @line=split("\t",$_);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
72 my($left_chr,$start,$end);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
73
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
74 #Get left position
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
75 $left_chr=$line[0];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
76 $start=$line[1]-$dist;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
77 $end=$line[1]+$dist;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
78
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
79 #Get right position
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
80 my ($mate_pos,@mate,$mate_chr,$mate_start,$mate_end);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
81 $mate_pos=$line[4];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
82 $mate_pos=~s/[\[|\]|A-Z]//g;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
83 #print "mate_pos=$mate_pos\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
84 @mate=split(/:/,$mate_pos);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
85 $mate_chr=$mate[0]; $mate_pos=$mate[1];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
86 $mate_start=$mate_pos-$dist;$mate_end=$mate_pos+$dist;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
87 #print "$left_chr:$start-$end\n$mate_chr:$mate_start-$mate_end\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
88
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
89 #Run through blat
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
90 my ($result1,$result2);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
91 my $target1=join("",$left_chr,":",$start,"-",$end);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
92 my $target2=join("",$mate_chr,":",$mate_start,"-",$mate_end);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
93 #print "target1=$target1\ttarget2=$target2\n";die;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
94 $result1=get_result($target1);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
95 $result2=get_result($target2);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
96
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
97
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
98 my $NOV_INS="";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
99 #If there is a NOV_INS, then there shouldn't be any output, so trick the results
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
100 if ($_=~/EVENT=NOV_INS/) {
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
101 $mate_start=$start;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
102 $NOV_INS="true";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
103 if (!$result1) {$result1=join("\t","0","0","0","0","0","0","0","0","+","UNKNOWN_NODE","0","0",$dist);}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
104 if (!$result2) {$result2=join("\t","0","0","0","0","0","0","0","0","+","UNKNOWN_NODE","0","0",$dist);}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
105 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
106
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
107 #Skip over events that aren't supported
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
108 if ((!$result1)||(!$result2)){
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
109 my @tmp1=split("\t",$result1);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
110 my @tmp2=split("\t",$result2);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
111 if ($tmp1[9]) {print BAD_CONTIG_LIST "$tmp1[9]\n"}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
112 if ($tmp2[9]) {print BAD_CONTIG_LIST "$tmp2[9]\n" }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
113 next;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
114 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
115 #Parse blat results
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
116 my @result1=split("\t",$result1);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
117 my @result2=split("\t",$result2);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
118 if($result2[9] ne $result1[9]){print "$result2[9] != $result1[9]\n";next}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
119 #print "@result1\n@result2\n";die;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
120 my $pos1=$start+($result1[12]-$result1[11]);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
121 my $pos2=$mate_start+($result2[12]-$result2[11]);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
122 #print "$_\n$pos1\t$pos2\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
123
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
124 ##############################################################
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
125 ### Build Classifier
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
126
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
127 my ($QSTART1,$QEND1,$QSTART2,$QEND2,$len,$MECHANISM, $INSERTION, $DELETION, $bed_res1,$bed_res2);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
128 $MECHANISM="UNKNOWN";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
129 $len="UNKNOWN";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
130 #Make sure the later event is second
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
131 if ($result1[11] < $result2[11]){
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
132 $QSTART1=$result1[11];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
133 $QEND1=$result1[12];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
134 $QSTART2=$result2[11];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
135 $QEND2=$result2[12];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
136 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
137 else{
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
138 $QSTART1=$result2[11];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
139 $QEND1=$result2[12];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
140 $QSTART2=$result1[11];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
141 $QEND2=$result1[12];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
142 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
143 #Now calculate the difference between $QEND1 and QSTART2
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
144 if($verbose){print "QEND1=$QEND1\tQSTART2=$QSTART2\n";}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
145 $len=$QEND1-$QSTART2;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
146 #Check for TEI
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
147 if($_=~/MECHANISM=TEI/){$MECHANISM="TEI"}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
148 elsif($_=~/MECHANISM=VNTR/){$MECHANISM="VNTR"}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
149 else{
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
150 if ($len==0) {$MECHANISM="NHEJ"}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
151 else{
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
152 if ($len>0){$INSERTION="true"}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
153 if ($len<0){$DELETION="true"}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
154 if ($INSERTION){
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
155 if ($len>10) {$MECHANISM="FOSTES"}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
156 else{$MECHANISM="NHEJ"}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
157 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
158 elsif ($DELETION){
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
159 if ($len>100) {$MECHANISM="NAHR"}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
160 elsif ($len > 2){$MECHANISM="altEJ"}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
161 else{$MECHANISM="NHEJ"}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
162 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
163 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
164 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
165
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
166
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
167 #if ($verbose){print "@result1";print "@result2";}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
168
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
169 #print out VCF
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
170 #############################################################
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
171 # create temporary variable name
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
172 #############################################################
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
173 srand (time ^ $$ ^ unpack "%L*", `ps axww | gzip -f`);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
174 my $random_name=join "", map { ("a".."z")[rand 26] } 1..8;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
175 my $random_name2=join "", map { ("a".."z")[rand 26] } 1..8;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
176
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
177 #Get Ref Base
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
178 my ($ref_base,$alt_base,$tmp_mate_pos);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
179 $ref_base=getBases($left_chr,$pos1,$fasta);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
180 $alt_base=getBases($mate_chr,$pos2,$fasta);#print "ALT=$alt_base\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
181 #Substitute the new mate position and base
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
182 $tmp_mate_pos=$line[4];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
183 $tmp_mate_pos=~s/$mate_pos/$pos2/;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
184 $tmp_mate_pos=~s/[A-Z]/$alt_base/;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
185 #split apart the INFO field to adjust the ISIZE and MATEID
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
186 my $NEW_INFO="";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
187 my @INFO=split(/;/,$line[7]);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
188 for (my $i=0;$i<@INFO;$i++){
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
189 if ($INFO[$i] =~ /^ISIZE=/){
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
190 my @tmp=split(/=/,$INFO[$i]);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
191 $NEW_INFO.="ISIZE=";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
192 my $new_ISZIE=$pos2-$pos1;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
193 $NEW_INFO.=$new_ISZIE
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
194 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
195 elsif($INFO[$i] =~ /^MATE_ID=/){
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
196 $NEW_INFO.=";MATE_ID=".$random_name2 . ";";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
197 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
198 else{
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
199 $NEW_INFO.=$INFO[$i].";";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
200 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
201 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
202 #ADD in strand and name
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
203 $NEW_INFO.="STRAND=".$result1[8];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
204 $NEW_INFO.=";CONTIG=".$result1[9];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
205 if($MECHANISM!~/TEI|VNTR/){$NEW_INFO.=";MECHANISM=".$MECHANISM;}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
206 $NEW_INFO.=";HOM_LEN=".$len;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
207 #don't pring contig nage if its a novel insertion
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
208 if(!$NOV_INS){print CONTIG_LIST "$result1[9]\n";}#else{print "I'm not printing $result1[9]\n";}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
209 print OUT_VCF "$left_chr\t$pos1\t$random_name\t$ref_base\t$tmp_mate_pos\t1000\tPASS\t$NEW_INFO\t$line[8]\t$line[9]\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
210 #Now go through and fill info in for mate
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
211 #Substitute the new mate position and base
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
212 $tmp_mate_pos=$line[4];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
213 $tmp_mate_pos=~s/$mate_pos/$pos1/;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
214 $tmp_mate_pos=~s/[A-Z]/$ref_base/;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
215 $tmp_mate_pos=~s/$mate_chr/$left_chr/;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
216 $NEW_INFO="";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
217 @INFO=split(/;/,$line[7]);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
218 for (my $i=0;$i<@INFO;$i++){
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
219 if ($INFO[$i] =~ /^ISIZE=/){
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
220 my @tmp=split(/=/,$INFO[$i]);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
221 $NEW_INFO.="ISIZE=";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
222 my $new_ISZIE=$pos2-$pos1;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
223 $NEW_INFO.=$new_ISZIE
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
224 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
225 elsif($INFO[$i] =~ /^MATE_ID=/){
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
226 $NEW_INFO.=";MATE_ID=".$random_name.";";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
227 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
228 else{
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
229 $NEW_INFO.=$INFO[$i].";";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
230 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
231 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
232 #ADD in strand and name
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
233 $NEW_INFO.="STRAND=".$result2[8];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
234 $NEW_INFO.=";CONTIG=".$result2[9];
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
235 if ($MECHANISM!~/TEI|VNTR/){$NEW_INFO.=";MECHANISM=".$MECHANISM;}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
236 $NEW_INFO.=";HOM_LEN=".$len;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
237
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
238 #don't pring contig nage if its a novel insertion
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
239 if(!$NOV_INS){print CONTIG_LIST "$result2[9]\n";} #else{print "I'm not printing $result1[9]\n";}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
240 print OUT_VCF "$mate_chr\t$pos2\t$random_name2\t$alt_base\t$tmp_mate_pos\t1000\tPASS\t$NEW_INFO\t$line[8]\t$line[9]\n";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
241 if ($verbose){print "$mate_chr\t$pos2\t$random_name2\t$alt_base\t$tmp_mate_pos\t1000\tPASS\t$NEW_INFO\t$line[8]\t$line[9]\n";}
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
242 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
243 close VCF;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
244 close OUT_VCF;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
245 close CONTIG_LIST;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
246 close BAD_CONTIG_LIST;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
247 sub get_result{
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
248 my $target=($_[0]);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
249 if($verbose){print "target=$target\n"}#;die;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
250 my $cmd="blat $genome:$target $contig /dev/stdout -t=dna -q=dna -noHead|egrep -v \"Searched|Loaded\" |head -1";
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
251
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
252 if ($verbose){print "$cmd\n"} #print "$cmd\n";die;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
253 my $result=`$cmd`;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
254 next if (!$cmd);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
255 return ($result);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
256 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
257 sub getBases{
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
258 my ($chr,$pos1,$fasta)=@_;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
259 my @result=();
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
260 if ($pos1 <0){print "$pos1 is not a valid position (likely caused by circular MT chromosome)\n";$result[1]="NA";};
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
261 @result = `samtools faidx $fasta $chr:$pos1-$pos1`;
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
262 if(!$result[1]){$result[1]="NA"};
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
263 chomp($result[1]);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
264 return uc($result[1]);
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
265 }
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
266
e3609c8714fb Uploaded
plus91-technologies-pvt-ltd
parents:
diff changeset
267