Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/make_exon_junctions.pl @ 0:acc2ca1a3ba4
Uploaded
| author | siyuan |
|---|---|
| date | Thu, 20 Feb 2014 00:44:58 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:acc2ca1a3ba4 |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 use strict; | |
| 3 use warnings; | |
| 4 | |
| 5 ################################## | |
| 6 # make_exon_junctions.pl | |
| 7 # | |
| 8 # Take as input Gene A and Gene B | |
| 9 # in fusion. Using Ensembl52 | |
| 10 # mapping information and | |
| 11 # transcript sequences, outputs | |
| 12 # sequences for all possible | |
| 13 # exon junctions with the 3' end | |
| 14 # of Gene A fused to the 5' end | |
| 15 # of Gene B | |
| 16 # | |
| 17 # M. Berger, January 27, 2009 | |
| 18 # WTG modified 7/10/2012 | |
| 19 # RV last modified 4/9/2013 | |
| 20 ################################## | |
| 21 | |
| 22 if ($#ARGV!=5) {die "usage is 'perl make_exon_junctions.pl GeneA GeneB ref.annotation ref.map ref.fasta junL > output'\n";} | |
| 23 my $geneA = shift; | |
| 24 my $geneB = shift; | |
| 25 my $annotations = shift; | |
| 26 my $map = shift; | |
| 27 my $fasta = shift; | |
| 28 my $overlap = shift; | |
| 29 | |
| 30 # Get transcript Ensembl IDs and orientations for GeneA and GeneB | |
| 31 | |
| 32 my @ensembA; | |
| 33 my @ensembB; | |
| 34 my @orientationA; | |
| 35 my @orientationB; | |
| 36 my @seqidA; | |
| 37 my @seqidB; | |
| 38 | |
| 39 my $testerA=0; my $testerB=0; | |
| 40 open (ANN, "<$annotations") or die "can't open Ensembl annotations\n"; | |
| 41 while (my $text = <ANN>) { | |
| 42 chomp $text; | |
| 43 my @line = split " ", $text; | |
| 44 if ($line[3] eq $geneA) {push @ensembA, $line[1]; push @orientationA, $line[4]; push @seqidA, $line[0]; $testerA++;} | |
| 45 if ($line[3] eq $geneB) {push @ensembB, $line[1]; push @orientationB, $line[4]; push @seqidB, $line[0]; $testerB++;} | |
| 46 } | |
| 47 close (ANN); | |
| 48 if ($testerA==0 || $testerB==0) {die "couldn't find one of the genes\n";} | |
| 49 | |
| 50 # Get exon lengths for each transcript for GeneA and GeneB | |
| 51 | |
| 52 my @exon_length_A; | |
| 53 my @exon_length_B; | |
| 54 | |
| 55 my $chr_A; | |
| 56 my $chr_B; | |
| 57 | |
| 58 my @exon_end_A; | |
| 59 my @exon_start_B; | |
| 60 | |
| 61 open (MAP, "<$map") or die "can't open Ensembl map\n"; | |
| 62 while (my $text = <MAP>) { | |
| 63 chomp $text; | |
| 64 my @line = split " ", $text; | |
| 65 my $ensID = pop (@line); | |
| 66 pop @line; | |
| 67 for (my $i=0; $i<=$#ensembA; $i++) { | |
| 68 if ($ensembA[$i] eq $ensID) { | |
| 69 $chr_A = $line[0]; | |
| 70 my $num_exons = ($#line+1)/3; | |
| 71 for (my $exonA=0; $exonA<$num_exons; $exonA++) { | |
| 72 my $length = $line[3*$exonA + 2] - $line[3*$exonA + 1] + 1; | |
| 73 if ($orientationA[$i] == 1) { | |
| 74 push @{$exon_length_A[$i]}, $length; | |
| 75 push @{$exon_end_A[$i]}, $line[3*$exonA + 2]; | |
| 76 } | |
| 77 else { | |
| 78 unshift @{$exon_length_A[$i]}, $length; | |
| 79 unshift @{$exon_end_A[$i]}, $line[3*$exonA + 1]; | |
| 80 } | |
| 81 } | |
| 82 } | |
| 83 } | |
| 84 for (my $i=0; $i<=$#ensembB; $i++) { | |
| 85 if ($ensembB[$i] eq $ensID) { | |
| 86 $chr_B = $line[0]; | |
| 87 my $num_exons = ($#line+1)/3; | |
| 88 for (my $exonB=0; $exonB<$num_exons; $exonB++) { | |
| 89 my $length = $line[3*$exonB + 2] - $line[3*$exonB + 1] + 1; | |
| 90 if ($orientationB[$i] == 1) { | |
| 91 push @{$exon_length_B[$i]}, $length; | |
| 92 push @{$exon_start_B[$i]}, $line[3*$exonB + 1]; | |
| 93 } | |
| 94 else { | |
| 95 unshift @{$exon_length_B[$i]}, $length; | |
| 96 unshift @{$exon_start_B[$i]}, $line[3*$exonB + 2]; | |
| 97 } | |
| 98 } | |
| 99 } | |
| 100 } | |
| 101 } | |
| 102 close (MAP); | |
| 103 | |
| 104 # Get sequence for each transcript (take reverse complement when necessary) | |
| 105 | |
| 106 my @sequenceA; | |
| 107 my @sequenceB; | |
| 108 | |
| 109 open (FHSEQ, "<$fasta") or die "can't open Ensembl map\n"; | |
| 110 my $top = <FHSEQ>; | |
| 111 my $readID = 0; | |
| 112 while (my $text = <FHSEQ>) { | |
| 113 chomp $text; | |
| 114 if ($text =~ ">") { #WTG changed to update for specific ENST ID number which might not be in the same order in the fasta as in the annotation | |
| 115 $text=~ s/>//; | |
| 116 $readID=$text; | |
| 117 } | |
| 118 else { | |
| 119 for (my $i=0; $i<=$#seqidA; $i++) { | |
| 120 if ($seqidA[$i] == $readID) { | |
| 121 if ($sequenceA[$i]) { | |
| 122 $sequenceA[$i] = $sequenceA[$i].$text; | |
| 123 } | |
| 124 else {$sequenceA[$i] = $text;} | |
| 125 } | |
| 126 } | |
| 127 for (my $i=0; $i<=$#seqidB; $i++) { | |
| 128 if ($seqidB[$i] == $readID) { | |
| 129 if ($sequenceB[$i]) { | |
| 130 $sequenceB[$i] = $sequenceB[$i].$text; | |
| 131 } | |
| 132 else {$sequenceB[$i] = $text;} | |
| 133 } | |
| 134 } | |
| 135 } | |
| 136 } | |
| 137 close (FHSEQ); | |
| 138 | |
| 139 for (my $txtA=0; $txtA<=$#sequenceA; $txtA++) { | |
| 140 if ($orientationA[$txtA]==-1) { | |
| 141 $sequenceA[$txtA] = rc($sequenceA[$txtA]); | |
| 142 } | |
| 143 } | |
| 144 for (my $txtB=0; $txtB<=$#sequenceB; $txtB++) { | |
| 145 if ($orientationB[$txtB]==-1) { | |
| 146 $sequenceB[$txtB] = rc($sequenceB[$txtB]); | |
| 147 } | |
| 148 } | |
| 149 | |
| 150 | |
| 151 # Print sequences for each hypothetical exon junction (for each pair of transcripts) | |
| 152 | |
| 153 | |
| 154 for (my $txtA=0; $txtA<=$#sequenceA; $txtA++) { | |
| 155 for (my $txtB=0; $txtB<=$#sequenceB; $txtB++) { | |
| 156 | |
| 157 my $num_exon_A = $#{$exon_length_A[$txtA]}+1; | |
| 158 my $num_exon_B = $#{$exon_length_B[$txtB]}+1; | |
| 159 | |
| 160 my $running_pos_A=0; | |
| 161 for (my $exonA=0; $exonA<$num_exon_A; $exonA++) { | |
| 162 $running_pos_A += $exon_length_A[$txtA][$exonA]; | |
| 163 my $junction_start = $exon_end_A[$txtA][$exonA]; | |
| 164 my $start = $running_pos_A - $overlap; | |
| 165 my $seqA; | |
| 166 if ($start >= 0) { | |
| 167 $seqA = substr($sequenceA[$txtA], $start, $overlap); | |
| 168 } | |
| 169 else { | |
| 170 $start=0; | |
| 171 my $tmp_length = $running_pos_A; | |
| 172 $seqA = substr($sequenceA[$txtA], $start, $tmp_length); | |
| 173 } | |
| 174 my $running_pos_B=0; | |
| 175 for (my $exonB=0; $exonB<$num_exon_B; $exonB++) { | |
| 176 my $junction_end = $exon_start_B[$txtB][$exonB]; | |
| 177 my $start = $running_pos_B; | |
| 178 my $seqB = substr($sequenceB[$txtB], $start, $overlap); | |
| 179 $running_pos_B += $exon_length_B[$txtB][$exonB]; | |
| 180 print ">chr$chr_A\.$junction_start\.chr$chr_B\.$junction_end\n"; | |
| 181 my $junction = $seqA.$seqB; | |
| 182 print "$junction\n"; | |
| 183 } | |
| 184 } | |
| 185 } | |
| 186 } | |
| 187 | |
| 188 | |
| 189 | |
| 190 ################## | |
| 191 ################## | |
| 192 ### SUBROUTINES | |
| 193 ################## | |
| 194 ################## | |
| 195 | |
| 196 sub rc{ | |
| 197 my $dna = shift; | |
| 198 my $revcom = reverse($dna); | |
| 199 $revcom =~ tr/ACGTacgt/TGCAtgca/; | |
| 200 return $revcom; | |
| 201 } |
