Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/make_intragenic_junctions.pl @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc2ca1a3ba4 |
---|---|
1 #!/usr/bin/perl | |
2 use strict; | |
3 use warnings; | |
4 | |
5 ################################## | |
6 # make_intragenic_junctions.pl | |
7 # | |
8 # This is an extension of MFB's perl code make_exon_junctions.pl | |
9 # Generates "abnormal" junctions between exons of a gene | |
10 # In the code GeneA, GeneB should be input as same gene | |
11 # | |
12 # RV last modified 4/9/2013 | |
13 ################################## | |
14 | |
15 if ($#ARGV!=5) {die "usage is 'perl make_exon_junctions.pl GeneA GeneB ref.annotation ref.map ref.fasta junL > output'\n";} | |
16 my $geneA = shift; | |
17 my $geneB = shift; | |
18 my $annotations = shift; | |
19 my $map = shift; | |
20 my $fasta = shift; | |
21 my $overlap = shift; | |
22 | |
23 # Get transcript Ensembl IDs and orientations for GeneA and GeneB | |
24 | |
25 my @ensembA; | |
26 my @ensembB; | |
27 my @orientationA; | |
28 my @orientationB; | |
29 my @seqidA; | |
30 my @seqidB; | |
31 | |
32 my $testerA=0; my $testerB=0; | |
33 open (ANN, "<$annotations") or die "can't open Ensembl annotations\n"; | |
34 while (my $text = <ANN>) { | |
35 chomp $text; | |
36 my @line = split " ", $text; | |
37 if ($line[3] eq $geneA) {push @ensembA, $line[1]; push @orientationA, $line[4]; push @seqidA, $line[0]; $testerA++;} | |
38 if ($line[3] eq $geneB) {push @ensembB, $line[1]; push @orientationB, $line[4]; push @seqidB, $line[0]; $testerB++;} | |
39 } | |
40 close (ANN); | |
41 if ($testerA==0 || $testerB==0) {die "couldn't find one of the genes\n";} | |
42 | |
43 # Get exon lengths for each transcript for GeneA and GeneB | |
44 | |
45 my @exon_length_A; | |
46 my @exon_length_B; | |
47 | |
48 my $chr_A; | |
49 my $chr_B; | |
50 | |
51 my @exon_end_A; | |
52 my @exon_start_B; | |
53 | |
54 open (MAP, "<$map") or die "can't open Ensembl map\n"; | |
55 while (my $text = <MAP>) { | |
56 chomp $text; | |
57 my @line = split " ", $text; | |
58 my $ensID = pop (@line); | |
59 pop @line; | |
60 for (my $i=0; $i<=$#ensembA; $i++) { | |
61 if ($ensembA[$i] eq $ensID) { | |
62 $chr_A = $line[0]; | |
63 my $num_exons = ($#line+1)/3; | |
64 for (my $exonA=0; $exonA<$num_exons; $exonA++) { | |
65 my $length = $line[3*$exonA + 2] - $line[3*$exonA + 1] + 1; | |
66 if ($orientationA[$i] == 1) { | |
67 push @{$exon_length_A[$i]}, $length; | |
68 push @{$exon_end_A[$i]}, $line[3*$exonA + 2]; | |
69 } | |
70 else { | |
71 unshift @{$exon_length_A[$i]}, $length; | |
72 unshift @{$exon_end_A[$i]}, $line[3*$exonA + 1]; | |
73 } | |
74 } | |
75 } | |
76 } | |
77 for (my $i=0; $i<=$#ensembB; $i++) { | |
78 if ($ensembB[$i] eq $ensID) { | |
79 $chr_B = $line[0]; | |
80 my $num_exons = ($#line+1)/3; | |
81 for (my $exonB=0; $exonB<$num_exons; $exonB++) { | |
82 my $length = $line[3*$exonB + 2] - $line[3*$exonB + 1] + 1; | |
83 if ($orientationB[$i] == 1) { | |
84 push @{$exon_length_B[$i]}, $length; | |
85 push @{$exon_start_B[$i]}, $line[3*$exonB + 1]; | |
86 } | |
87 else { | |
88 unshift @{$exon_length_B[$i]}, $length; | |
89 unshift @{$exon_start_B[$i]}, $line[3*$exonB + 2]; | |
90 } | |
91 } | |
92 } | |
93 } | |
94 } | |
95 close (MAP); | |
96 | |
97 # Get sequence for each transcript (take reverse complement when necessary) | |
98 | |
99 my @sequenceA; | |
100 my @sequenceB; | |
101 | |
102 open (FHSEQ, "<$fasta") or die "can't open Ensembl map\n"; | |
103 my $top = <FHSEQ>; | |
104 my $readID = 0; | |
105 while (my $text = <FHSEQ>) { | |
106 chomp $text; | |
107 if ($text =~ ">") { #WTG changed to update for specific ENST ID number which might not be in the same order in the fasta as in the annotation | |
108 $text=~ s/>//; | |
109 $readID=$text; | |
110 } | |
111 else { | |
112 for (my $i=0; $i<=$#seqidA; $i++) { | |
113 if ($seqidA[$i] == $readID) { | |
114 if ($sequenceA[$i]) { | |
115 $sequenceA[$i] = $sequenceA[$i].$text; | |
116 } | |
117 else {$sequenceA[$i] = $text;} | |
118 } | |
119 } | |
120 for (my $i=0; $i<=$#seqidB; $i++) { | |
121 if ($seqidB[$i] == $readID) { | |
122 if ($sequenceB[$i]) { | |
123 $sequenceB[$i] = $sequenceB[$i].$text; | |
124 } | |
125 else {$sequenceB[$i] = $text;} | |
126 } | |
127 } | |
128 } | |
129 } | |
130 close (FHSEQ); | |
131 | |
132 for (my $txtA=0; $txtA<=$#sequenceA; $txtA++) { | |
133 if ($orientationA[$txtA]==-1) { | |
134 $sequenceA[$txtA] = rc($sequenceA[$txtA]); | |
135 } | |
136 } | |
137 for (my $txtB=0; $txtB<=$#sequenceB; $txtB++) { | |
138 if ($orientationB[$txtB]==-1) { | |
139 $sequenceB[$txtB] = rc($sequenceB[$txtB]); | |
140 } | |
141 } | |
142 | |
143 | |
144 # Print sequences for each hypothetical exon junction (for each pair of transcripts) | |
145 my %junctions; | |
146 my %junctions_known; | |
147 for (my $txtA=0; $txtA<=$#sequenceA; $txtA++) { | |
148 for (my $txtB=0; $txtB<=$#sequenceB; $txtB++) { | |
149 | |
150 my $num_exon_A = $#{$exon_length_A[$txtA]}+1; | |
151 my $num_exon_B = $#{$exon_length_B[$txtB]}+1; | |
152 | |
153 my $running_pos_A=0; | |
154 for (my $exonA=0; $exonA<$num_exon_A; $exonA++) { | |
155 $running_pos_A += $exon_length_A[$txtA][$exonA]; | |
156 my $junction_start = $exon_end_A[$txtA][$exonA]; | |
157 my $start = $running_pos_A - $overlap; | |
158 my $seqA; | |
159 if ($start >= 0) { | |
160 $seqA = substr($sequenceA[$txtA], $start, $overlap); | |
161 } | |
162 else { | |
163 $start=0; | |
164 my $tmp_length = $running_pos_A; | |
165 $seqA = substr($sequenceA[$txtA], $start, $tmp_length); | |
166 } | |
167 my $running_pos_B=0; | |
168 for (my $exonB=0; $exonB<$num_exon_B; $exonB++) { | |
169 my $junction_end = $exon_start_B[$txtB][$exonB]; | |
170 my $start = $running_pos_B; | |
171 my $seqB = substr($sequenceB[$txtB], $start, $overlap); | |
172 $running_pos_B += $exon_length_B[$txtB][$exonB]; | |
173 my $key= ">chr$chr_A\.$junction_start\.chr$chr_B\.$junction_end\n"; | |
174 my $junction = $seqA.$seqB; | |
175 #print ">chr$chr_A\.$junction_start\.chr$chr_B\.$junction_end\n"; | |
176 #print "$junction\n"; | |
177 $junctions{$key}=$junction; | |
178 } | |
179 } | |
180 } | |
181 | |
182 } | |
183 | |
184 for (my $txtA=0; $txtA<=$#sequenceA; $txtA++) { | |
185 my $num_exon_A = $#{$exon_length_A[$txtA]}; | |
186 | |
187 my $running_pos_A=0; | |
188 for (my $exonA=0; $exonA<$num_exon_A; $exonA++) { | |
189 $running_pos_A += $exon_length_A[$txtA][$exonA]; | |
190 my $junction_start = $exon_end_A[$txtA][$exonA]; | |
191 my $start = $running_pos_A - $overlap; | |
192 my $seqA; | |
193 if ($start >= 0) { | |
194 $seqA = substr($sequenceA[$txtA], $start, $overlap); | |
195 } | |
196 else { | |
197 $start=0; | |
198 my $tmp_length = $running_pos_A; | |
199 $seqA = substr($sequenceA[$txtA], $start, $tmp_length); | |
200 } | |
201 my $running_pos_B=0; | |
202 my $junction_end = $exon_start_B[$txtA][$exonA+1]; | |
203 my $start1 = $running_pos_B; | |
204 my $seqB = substr($sequenceA[$txtA], $start1, $overlap); | |
205 #$running_pos_B += $exon_length_A[$txtA][$exonA+1]; | |
206 my $key= ">chr$chr_A\.$junction_start\.chr$chr_A\.$junction_end\n"; | |
207 my $junction = $seqA.$seqB; | |
208 #print "$junction\n"; | |
209 $junctions_known{$key}=$junction; | |
210 | |
211 } | |
212 } | |
213 | |
214 my @disc; | |
215 my %count = (); | |
216 foreach my $element (keys (%junctions_known), keys (%junctions)) { $count{$element}++ } | |
217 foreach my $element (keys %count) { | |
218 if($count{$element} == 1){ | |
219 #print $element; | |
220 push @disc , $element; | |
221 } | |
222 } | |
223 | |
224 foreach my $junc(@disc){ | |
225 if($junctions_known{$junc}){print $junc,$junctions_known{$junc},"\n"; | |
226 } | |
227 else {print $junc,$junctions{$junc},"\n"; | |
228 } | |
229 | |
230 } | |
231 | |
232 | |
233 ################## | |
234 ################## | |
235 ### SUBROUTINES | |
236 ################## | |
237 ################## | |
238 | |
239 sub rc{ | |
240 my $dna = shift; | |
241 my $revcom = reverse($dna); | |
242 $revcom =~ tr/ACGTacgt/TGCAtgca/; | |
243 return $revcom; | |
244 } |