Mercurial > repos > siyuan > prada
diff pyPRADA_1.2/make_exon_junctions.pl @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyPRADA_1.2/make_exon_junctions.pl Thu Feb 20 00:44:58 2014 -0500 @@ -0,0 +1,201 @@ +#!/usr/bin/perl +use strict; +use warnings; + +################################## +# make_exon_junctions.pl +# +# Take as input Gene A and Gene B +# in fusion. Using Ensembl52 +# mapping information and +# transcript sequences, outputs +# sequences for all possible +# exon junctions with the 3' end +# of Gene A fused to the 5' end +# of Gene B +# +# M. Berger, January 27, 2009 +# WTG modified 7/10/2012 +# RV last modified 4/9/2013 +################################## + +if ($#ARGV!=5) {die "usage is 'perl make_exon_junctions.pl GeneA GeneB ref.annotation ref.map ref.fasta junL > output'\n";} +my $geneA = shift; +my $geneB = shift; +my $annotations = shift; +my $map = shift; +my $fasta = shift; +my $overlap = shift; + +# Get transcript Ensembl IDs and orientations for GeneA and GeneB + +my @ensembA; +my @ensembB; +my @orientationA; +my @orientationB; +my @seqidA; +my @seqidB; + +my $testerA=0; my $testerB=0; +open (ANN, "<$annotations") or die "can't open Ensembl annotations\n"; +while (my $text = <ANN>) { + chomp $text; + my @line = split " ", $text; + if ($line[3] eq $geneA) {push @ensembA, $line[1]; push @orientationA, $line[4]; push @seqidA, $line[0]; $testerA++;} + if ($line[3] eq $geneB) {push @ensembB, $line[1]; push @orientationB, $line[4]; push @seqidB, $line[0]; $testerB++;} +} +close (ANN); +if ($testerA==0 || $testerB==0) {die "couldn't find one of the genes\n";} + +# Get exon lengths for each transcript for GeneA and GeneB + +my @exon_length_A; +my @exon_length_B; + +my $chr_A; +my $chr_B; + +my @exon_end_A; +my @exon_start_B; + +open (MAP, "<$map") or die "can't open Ensembl map\n"; +while (my $text = <MAP>) { + chomp $text; + my @line = split " ", $text; + my $ensID = pop (@line); + pop @line; + for (my $i=0; $i<=$#ensembA; $i++) { + if ($ensembA[$i] eq $ensID) { + $chr_A = $line[0]; + my $num_exons = ($#line+1)/3; + for (my $exonA=0; $exonA<$num_exons; $exonA++) { + my $length = $line[3*$exonA + 2] - $line[3*$exonA + 1] + 1; + if ($orientationA[$i] == 1) { + push @{$exon_length_A[$i]}, $length; + push @{$exon_end_A[$i]}, $line[3*$exonA + 2]; + } + else { + unshift @{$exon_length_A[$i]}, $length; + unshift @{$exon_end_A[$i]}, $line[3*$exonA + 1]; + } + } + } + } + for (my $i=0; $i<=$#ensembB; $i++) { + if ($ensembB[$i] eq $ensID) { + $chr_B = $line[0]; + my $num_exons = ($#line+1)/3; + for (my $exonB=0; $exonB<$num_exons; $exonB++) { + my $length = $line[3*$exonB + 2] - $line[3*$exonB + 1] + 1; + if ($orientationB[$i] == 1) { + push @{$exon_length_B[$i]}, $length; + push @{$exon_start_B[$i]}, $line[3*$exonB + 1]; + } + else { + unshift @{$exon_length_B[$i]}, $length; + unshift @{$exon_start_B[$i]}, $line[3*$exonB + 2]; + } + } + } + } +} +close (MAP); + +# Get sequence for each transcript (take reverse complement when necessary) + +my @sequenceA; +my @sequenceB; + +open (FHSEQ, "<$fasta") or die "can't open Ensembl map\n"; +my $top = <FHSEQ>; +my $readID = 0; +while (my $text = <FHSEQ>) { + chomp $text; + if ($text =~ ">") { #WTG changed to update for specific ENST ID number which might not be in the same order in the fasta as in the annotation + $text=~ s/>//; + $readID=$text; + } + else { + for (my $i=0; $i<=$#seqidA; $i++) { + if ($seqidA[$i] == $readID) { + if ($sequenceA[$i]) { + $sequenceA[$i] = $sequenceA[$i].$text; + } + else {$sequenceA[$i] = $text;} + } + } + for (my $i=0; $i<=$#seqidB; $i++) { + if ($seqidB[$i] == $readID) { + if ($sequenceB[$i]) { + $sequenceB[$i] = $sequenceB[$i].$text; + } + else {$sequenceB[$i] = $text;} + } + } + } +} +close (FHSEQ); + +for (my $txtA=0; $txtA<=$#sequenceA; $txtA++) { + if ($orientationA[$txtA]==-1) { + $sequenceA[$txtA] = rc($sequenceA[$txtA]); + } +} +for (my $txtB=0; $txtB<=$#sequenceB; $txtB++) { + if ($orientationB[$txtB]==-1) { + $sequenceB[$txtB] = rc($sequenceB[$txtB]); + } +} + + +# Print sequences for each hypothetical exon junction (for each pair of transcripts) + + +for (my $txtA=0; $txtA<=$#sequenceA; $txtA++) { + for (my $txtB=0; $txtB<=$#sequenceB; $txtB++) { + + my $num_exon_A = $#{$exon_length_A[$txtA]}+1; + my $num_exon_B = $#{$exon_length_B[$txtB]}+1; + + my $running_pos_A=0; + for (my $exonA=0; $exonA<$num_exon_A; $exonA++) { + $running_pos_A += $exon_length_A[$txtA][$exonA]; + my $junction_start = $exon_end_A[$txtA][$exonA]; + my $start = $running_pos_A - $overlap; + my $seqA; + if ($start >= 0) { + $seqA = substr($sequenceA[$txtA], $start, $overlap); + } + else { + $start=0; + my $tmp_length = $running_pos_A; + $seqA = substr($sequenceA[$txtA], $start, $tmp_length); + } + my $running_pos_B=0; + for (my $exonB=0; $exonB<$num_exon_B; $exonB++) { + my $junction_end = $exon_start_B[$txtB][$exonB]; + my $start = $running_pos_B; + my $seqB = substr($sequenceB[$txtB], $start, $overlap); + $running_pos_B += $exon_length_B[$txtB][$exonB]; + print ">chr$chr_A\.$junction_start\.chr$chr_B\.$junction_end\n"; + my $junction = $seqA.$seqB; + print "$junction\n"; + } + } + } +} + + + +################## +################## +### SUBROUTINES +################## +################## + +sub rc{ + my $dna = shift; + my $revcom = reverse($dna); + $revcom =~ tr/ACGTacgt/TGCAtgca/; + return $revcom; +}