diff pyPRADA_1.2/make_exon_junctions.pl @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyPRADA_1.2/make_exon_junctions.pl	Thu Feb 20 00:44:58 2014 -0500
@@ -0,0 +1,201 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+##################################
+# make_exon_junctions.pl
+#
+# Take as input Gene A and Gene B
+#  in fusion.  Using Ensembl52
+#  mapping information and
+#  transcript sequences, outputs
+#  sequences for all possible
+#  exon junctions with the 3' end
+#  of Gene A fused to the 5' end
+#  of Gene B
+#
+# M. Berger, January 27, 2009
+# WTG modified 7/10/2012
+# RV last modified 4/9/2013
+##################################
+
+if ($#ARGV!=5) {die "usage is 'perl make_exon_junctions.pl GeneA GeneB ref.annotation ref.map ref.fasta junL > output'\n";}
+my $geneA = shift;
+my $geneB = shift;
+my $annotations = shift;
+my $map = shift;
+my $fasta = shift;
+my $overlap = shift;
+
+# Get transcript Ensembl IDs and orientations for GeneA and GeneB
+
+my @ensembA;
+my @ensembB;
+my @orientationA;
+my @orientationB;
+my @seqidA;
+my @seqidB;
+
+my $testerA=0; my $testerB=0;
+open (ANN, "<$annotations") or die "can't open Ensembl annotations\n";
+while (my $text = <ANN>) {
+    chomp $text;
+    my @line = split " ", $text;
+    if ($line[3] eq $geneA) {push @ensembA, $line[1]; push @orientationA, $line[4]; push @seqidA, $line[0]; $testerA++;}
+    if ($line[3] eq $geneB) {push @ensembB, $line[1]; push @orientationB, $line[4]; push @seqidB, $line[0]; $testerB++;}
+}
+close (ANN);
+if ($testerA==0 || $testerB==0) {die "couldn't find one of the genes\n";}
+
+# Get exon lengths for each transcript for GeneA and GeneB
+
+my @exon_length_A;
+my @exon_length_B;
+
+my $chr_A;
+my $chr_B;
+
+my @exon_end_A;
+my @exon_start_B;
+
+open (MAP, "<$map") or die "can't open Ensembl map\n";
+while (my $text = <MAP>) {
+    chomp $text;
+    my @line = split " ", $text;
+    my $ensID = pop (@line);
+    pop @line;
+    for (my $i=0; $i<=$#ensembA; $i++) {
+	if ($ensembA[$i] eq $ensID) {
+	    $chr_A = $line[0];
+	    my $num_exons = ($#line+1)/3;
+	    for (my $exonA=0; $exonA<$num_exons; $exonA++) {
+		my $length = $line[3*$exonA + 2] - $line[3*$exonA + 1] + 1;
+		if ($orientationA[$i] == 1) {
+		    push @{$exon_length_A[$i]}, $length;
+		    push @{$exon_end_A[$i]}, $line[3*$exonA + 2];
+		}
+		else {
+		    unshift @{$exon_length_A[$i]}, $length;
+		    unshift @{$exon_end_A[$i]}, $line[3*$exonA + 1];
+		}
+	    }
+	}
+    }
+    for (my $i=0; $i<=$#ensembB; $i++) {
+	if ($ensembB[$i] eq $ensID) {
+	    $chr_B = $line[0];
+	    my $num_exons = ($#line+1)/3;
+	    for (my $exonB=0; $exonB<$num_exons; $exonB++) {
+		my $length = $line[3*$exonB + 2] - $line[3*$exonB + 1] + 1;
+		if ($orientationB[$i] == 1) {
+		    push @{$exon_length_B[$i]}, $length;
+		    push @{$exon_start_B[$i]}, $line[3*$exonB + 1];
+		}
+		else {
+		    unshift @{$exon_length_B[$i]}, $length;
+		    unshift @{$exon_start_B[$i]}, $line[3*$exonB + 2];
+		}
+	    }
+	}
+    }
+}
+close (MAP);
+
+# Get sequence for each transcript (take reverse complement when necessary)
+
+my @sequenceA;
+my @sequenceB;
+
+open (FHSEQ, "<$fasta") or die "can't open Ensembl map\n";
+my $top = <FHSEQ>;
+my $readID = 0;
+while (my $text = <FHSEQ>) {
+    chomp $text;
+    if ($text =~ ">") { #WTG changed to update for specific ENST ID number which might not be in the same order in the fasta as in the annotation
+	$text=~ s/>//;
+	$readID=$text;
+    }
+    else {
+	for (my $i=0; $i<=$#seqidA; $i++) {
+	    if ($seqidA[$i] == $readID) {
+		if ($sequenceA[$i]) {
+		    $sequenceA[$i] = $sequenceA[$i].$text;
+		}
+		else {$sequenceA[$i] = $text;}
+	    }
+	}
+	for (my $i=0; $i<=$#seqidB; $i++) {
+	    if ($seqidB[$i] == $readID) {
+		if ($sequenceB[$i]) {
+		    $sequenceB[$i] = $sequenceB[$i].$text;
+		}
+		else {$sequenceB[$i] = $text;}
+	    }
+	}
+    }
+}
+close (FHSEQ);
+
+for (my $txtA=0; $txtA<=$#sequenceA; $txtA++) {
+    if ($orientationA[$txtA]==-1) {
+	$sequenceA[$txtA] = rc($sequenceA[$txtA]);
+    }
+}
+for (my $txtB=0; $txtB<=$#sequenceB; $txtB++) {
+    if ($orientationB[$txtB]==-1) {
+	$sequenceB[$txtB] = rc($sequenceB[$txtB]);
+    }
+}
+
+
+# Print sequences for each hypothetical exon junction (for each pair of transcripts)
+
+
+for (my $txtA=0; $txtA<=$#sequenceA; $txtA++) {
+    for (my $txtB=0; $txtB<=$#sequenceB; $txtB++) {
+
+	my $num_exon_A = $#{$exon_length_A[$txtA]}+1;
+	my $num_exon_B = $#{$exon_length_B[$txtB]}+1;
+
+	my $running_pos_A=0;
+	for (my $exonA=0; $exonA<$num_exon_A; $exonA++) {
+	    $running_pos_A += $exon_length_A[$txtA][$exonA];
+	    my $junction_start = $exon_end_A[$txtA][$exonA];
+	    my $start = $running_pos_A - $overlap;
+	    my $seqA;
+	    if ($start >= 0) {
+		$seqA = substr($sequenceA[$txtA], $start, $overlap);
+	    }
+	    else {
+		$start=0;
+		my $tmp_length = $running_pos_A;
+		$seqA = substr($sequenceA[$txtA], $start, $tmp_length);
+	    }
+	    my $running_pos_B=0;
+	    for (my $exonB=0; $exonB<$num_exon_B; $exonB++) {
+		my $junction_end = $exon_start_B[$txtB][$exonB];
+		my $start = $running_pos_B;
+		my $seqB = substr($sequenceB[$txtB], $start, $overlap);
+		$running_pos_B += $exon_length_B[$txtB][$exonB];
+		print ">chr$chr_A\.$junction_start\.chr$chr_B\.$junction_end\n";
+		my $junction = $seqA.$seqB;
+		print "$junction\n";
+	    }
+	}
+    }
+}
+
+
+
+##################
+##################
+### SUBROUTINES
+##################
+##################
+
+sub rc{
+    my $dna = shift;
+    my $revcom = reverse($dna);
+    $revcom =~ tr/ACGTacgt/TGCAtgca/;
+    return $revcom;
+}