comparison pyPRADA_1.2/make_exon_junctions.pl @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc2ca1a3ba4
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
4
5 ##################################
6 # make_exon_junctions.pl
7 #
8 # Take as input Gene A and Gene B
9 # in fusion. Using Ensembl52
10 # mapping information and
11 # transcript sequences, outputs
12 # sequences for all possible
13 # exon junctions with the 3' end
14 # of Gene A fused to the 5' end
15 # of Gene B
16 #
17 # M. Berger, January 27, 2009
18 # WTG modified 7/10/2012
19 # RV last modified 4/9/2013
20 ##################################
21
22 if ($#ARGV!=5) {die "usage is 'perl make_exon_junctions.pl GeneA GeneB ref.annotation ref.map ref.fasta junL > output'\n";}
23 my $geneA = shift;
24 my $geneB = shift;
25 my $annotations = shift;
26 my $map = shift;
27 my $fasta = shift;
28 my $overlap = shift;
29
30 # Get transcript Ensembl IDs and orientations for GeneA and GeneB
31
32 my @ensembA;
33 my @ensembB;
34 my @orientationA;
35 my @orientationB;
36 my @seqidA;
37 my @seqidB;
38
39 my $testerA=0; my $testerB=0;
40 open (ANN, "<$annotations") or die "can't open Ensembl annotations\n";
41 while (my $text = <ANN>) {
42 chomp $text;
43 my @line = split " ", $text;
44 if ($line[3] eq $geneA) {push @ensembA, $line[1]; push @orientationA, $line[4]; push @seqidA, $line[0]; $testerA++;}
45 if ($line[3] eq $geneB) {push @ensembB, $line[1]; push @orientationB, $line[4]; push @seqidB, $line[0]; $testerB++;}
46 }
47 close (ANN);
48 if ($testerA==0 || $testerB==0) {die "couldn't find one of the genes\n";}
49
50 # Get exon lengths for each transcript for GeneA and GeneB
51
52 my @exon_length_A;
53 my @exon_length_B;
54
55 my $chr_A;
56 my $chr_B;
57
58 my @exon_end_A;
59 my @exon_start_B;
60
61 open (MAP, "<$map") or die "can't open Ensembl map\n";
62 while (my $text = <MAP>) {
63 chomp $text;
64 my @line = split " ", $text;
65 my $ensID = pop (@line);
66 pop @line;
67 for (my $i=0; $i<=$#ensembA; $i++) {
68 if ($ensembA[$i] eq $ensID) {
69 $chr_A = $line[0];
70 my $num_exons = ($#line+1)/3;
71 for (my $exonA=0; $exonA<$num_exons; $exonA++) {
72 my $length = $line[3*$exonA + 2] - $line[3*$exonA + 1] + 1;
73 if ($orientationA[$i] == 1) {
74 push @{$exon_length_A[$i]}, $length;
75 push @{$exon_end_A[$i]}, $line[3*$exonA + 2];
76 }
77 else {
78 unshift @{$exon_length_A[$i]}, $length;
79 unshift @{$exon_end_A[$i]}, $line[3*$exonA + 1];
80 }
81 }
82 }
83 }
84 for (my $i=0; $i<=$#ensembB; $i++) {
85 if ($ensembB[$i] eq $ensID) {
86 $chr_B = $line[0];
87 my $num_exons = ($#line+1)/3;
88 for (my $exonB=0; $exonB<$num_exons; $exonB++) {
89 my $length = $line[3*$exonB + 2] - $line[3*$exonB + 1] + 1;
90 if ($orientationB[$i] == 1) {
91 push @{$exon_length_B[$i]}, $length;
92 push @{$exon_start_B[$i]}, $line[3*$exonB + 1];
93 }
94 else {
95 unshift @{$exon_length_B[$i]}, $length;
96 unshift @{$exon_start_B[$i]}, $line[3*$exonB + 2];
97 }
98 }
99 }
100 }
101 }
102 close (MAP);
103
104 # Get sequence for each transcript (take reverse complement when necessary)
105
106 my @sequenceA;
107 my @sequenceB;
108
109 open (FHSEQ, "<$fasta") or die "can't open Ensembl map\n";
110 my $top = <FHSEQ>;
111 my $readID = 0;
112 while (my $text = <FHSEQ>) {
113 chomp $text;
114 if ($text =~ ">") { #WTG changed to update for specific ENST ID number which might not be in the same order in the fasta as in the annotation
115 $text=~ s/>//;
116 $readID=$text;
117 }
118 else {
119 for (my $i=0; $i<=$#seqidA; $i++) {
120 if ($seqidA[$i] == $readID) {
121 if ($sequenceA[$i]) {
122 $sequenceA[$i] = $sequenceA[$i].$text;
123 }
124 else {$sequenceA[$i] = $text;}
125 }
126 }
127 for (my $i=0; $i<=$#seqidB; $i++) {
128 if ($seqidB[$i] == $readID) {
129 if ($sequenceB[$i]) {
130 $sequenceB[$i] = $sequenceB[$i].$text;
131 }
132 else {$sequenceB[$i] = $text;}
133 }
134 }
135 }
136 }
137 close (FHSEQ);
138
139 for (my $txtA=0; $txtA<=$#sequenceA; $txtA++) {
140 if ($orientationA[$txtA]==-1) {
141 $sequenceA[$txtA] = rc($sequenceA[$txtA]);
142 }
143 }
144 for (my $txtB=0; $txtB<=$#sequenceB; $txtB++) {
145 if ($orientationB[$txtB]==-1) {
146 $sequenceB[$txtB] = rc($sequenceB[$txtB]);
147 }
148 }
149
150
151 # Print sequences for each hypothetical exon junction (for each pair of transcripts)
152
153
154 for (my $txtA=0; $txtA<=$#sequenceA; $txtA++) {
155 for (my $txtB=0; $txtB<=$#sequenceB; $txtB++) {
156
157 my $num_exon_A = $#{$exon_length_A[$txtA]}+1;
158 my $num_exon_B = $#{$exon_length_B[$txtB]}+1;
159
160 my $running_pos_A=0;
161 for (my $exonA=0; $exonA<$num_exon_A; $exonA++) {
162 $running_pos_A += $exon_length_A[$txtA][$exonA];
163 my $junction_start = $exon_end_A[$txtA][$exonA];
164 my $start = $running_pos_A - $overlap;
165 my $seqA;
166 if ($start >= 0) {
167 $seqA = substr($sequenceA[$txtA], $start, $overlap);
168 }
169 else {
170 $start=0;
171 my $tmp_length = $running_pos_A;
172 $seqA = substr($sequenceA[$txtA], $start, $tmp_length);
173 }
174 my $running_pos_B=0;
175 for (my $exonB=0; $exonB<$num_exon_B; $exonB++) {
176 my $junction_end = $exon_start_B[$txtB][$exonB];
177 my $start = $running_pos_B;
178 my $seqB = substr($sequenceB[$txtB], $start, $overlap);
179 $running_pos_B += $exon_length_B[$txtB][$exonB];
180 print ">chr$chr_A\.$junction_start\.chr$chr_B\.$junction_end\n";
181 my $junction = $seqA.$seqB;
182 print "$junction\n";
183 }
184 }
185 }
186 }
187
188
189
190 ##################
191 ##################
192 ### SUBROUTINES
193 ##################
194 ##################
195
196 sub rc{
197 my $dna = shift;
198 my $revcom = reverse($dna);
199 $revcom =~ tr/ACGTacgt/TGCAtgca/;
200 return $revcom;
201 }