view pyPRADA_1.2/make_intragenic_junctions.pl @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
line wrap: on
line source

#!/usr/bin/perl
use strict;
use warnings;

##################################
# make_intragenic_junctions.pl
#
# This is an extension of MFB's perl code make_exon_junctions.pl
# Generates "abnormal" junctions between exons of a gene 
# In the code GeneA, GeneB should be input as same gene
#
# RV last modified 4/9/2013
##################################

if ($#ARGV!=5) {die "usage is 'perl make_exon_junctions.pl GeneA GeneB ref.annotation ref.map ref.fasta junL > output'\n";}
my $geneA = shift;
my $geneB = shift;
my $annotations = shift;
my $map = shift;
my $fasta = shift;
my $overlap = shift;

# Get transcript Ensembl IDs and orientations for GeneA and GeneB

my @ensembA;
my @ensembB;
my @orientationA;
my @orientationB;
my @seqidA;
my @seqidB;

my $testerA=0; my $testerB=0;
open (ANN, "<$annotations") or die "can't open Ensembl annotations\n";
while (my $text = <ANN>) {
    chomp $text;
    my @line = split " ", $text;
    if ($line[3] eq $geneA) {push @ensembA, $line[1]; push @orientationA, $line[4]; push @seqidA, $line[0]; $testerA++;}
    if ($line[3] eq $geneB) {push @ensembB, $line[1]; push @orientationB, $line[4]; push @seqidB, $line[0]; $testerB++;}
}
close (ANN);
if ($testerA==0 || $testerB==0) {die "couldn't find one of the genes\n";}

# Get exon lengths for each transcript for GeneA and GeneB

my @exon_length_A;
my @exon_length_B;

my $chr_A;
my $chr_B;

my @exon_end_A;
my @exon_start_B;

open (MAP, "<$map") or die "can't open Ensembl map\n";
while (my $text = <MAP>) {
    chomp $text;
    my @line = split " ", $text;
    my $ensID = pop (@line);
    pop @line;
    for (my $i=0; $i<=$#ensembA; $i++) {
	if ($ensembA[$i] eq $ensID) {
	    $chr_A = $line[0];
	    my $num_exons = ($#line+1)/3;
	    for (my $exonA=0; $exonA<$num_exons; $exonA++) {
		my $length = $line[3*$exonA + 2] - $line[3*$exonA + 1] + 1;
		if ($orientationA[$i] == 1) {
		    push @{$exon_length_A[$i]}, $length;
		    push @{$exon_end_A[$i]}, $line[3*$exonA + 2];
		}
		else {
		    unshift @{$exon_length_A[$i]}, $length;
		    unshift @{$exon_end_A[$i]}, $line[3*$exonA + 1];
		}
	    }
	}
    }
    for (my $i=0; $i<=$#ensembB; $i++) {
	if ($ensembB[$i] eq $ensID) {
	    $chr_B = $line[0];
	    my $num_exons = ($#line+1)/3;
	    for (my $exonB=0; $exonB<$num_exons; $exonB++) {
		my $length = $line[3*$exonB + 2] - $line[3*$exonB + 1] + 1;
		if ($orientationB[$i] == 1) {
		    push @{$exon_length_B[$i]}, $length;
		    push @{$exon_start_B[$i]}, $line[3*$exonB + 1];
		}
		else {
		    unshift @{$exon_length_B[$i]}, $length;
		    unshift @{$exon_start_B[$i]}, $line[3*$exonB + 2];
		}
	    }
	}
    }
}
close (MAP);

# Get sequence for each transcript (take reverse complement when necessary)

my @sequenceA;
my @sequenceB;

open (FHSEQ, "<$fasta") or die "can't open Ensembl map\n";
my $top = <FHSEQ>;
my $readID = 0;
while (my $text = <FHSEQ>) {
    chomp $text;
    if ($text =~ ">") { #WTG changed to update for specific ENST ID number which might not be in the same order in the fasta as in the annotation
	$text=~ s/>//;
	$readID=$text;
    }
    else {
	for (my $i=0; $i<=$#seqidA; $i++) {
	    if ($seqidA[$i] == $readID) {
		if ($sequenceA[$i]) {
		    $sequenceA[$i] = $sequenceA[$i].$text;
		}
		else {$sequenceA[$i] = $text;}
	    }
	}
	for (my $i=0; $i<=$#seqidB; $i++) {
	    if ($seqidB[$i] == $readID) {
		if ($sequenceB[$i]) {
		    $sequenceB[$i] = $sequenceB[$i].$text;
		}
		else {$sequenceB[$i] = $text;}
	    }
	}
    }
}
close (FHSEQ);
	
for (my $txtA=0; $txtA<=$#sequenceA; $txtA++) {
    if ($orientationA[$txtA]==-1) {
	$sequenceA[$txtA] = rc($sequenceA[$txtA]);
    }
}
for (my $txtB=0; $txtB<=$#sequenceB; $txtB++) {
    if ($orientationB[$txtB]==-1) {
	$sequenceB[$txtB] = rc($sequenceB[$txtB]);
    }
}


# Print sequences for each hypothetical exon junction (for each pair of transcripts)
my %junctions;
my %junctions_known;
for (my $txtA=0; $txtA<=$#sequenceA; $txtA++) {
    for (my $txtB=0; $txtB<=$#sequenceB; $txtB++) {

	my $num_exon_A = $#{$exon_length_A[$txtA]}+1;
	my $num_exon_B = $#{$exon_length_B[$txtB]}+1;

	my $running_pos_A=0;
	for (my $exonA=0; $exonA<$num_exon_A; $exonA++) {
	    $running_pos_A += $exon_length_A[$txtA][$exonA];
	    my $junction_start = $exon_end_A[$txtA][$exonA];
	    my $start = $running_pos_A - $overlap;
	    my $seqA;
	    if ($start >= 0) {
		$seqA = substr($sequenceA[$txtA], $start, $overlap);
	    }
	    else {
		$start=0;
		my $tmp_length = $running_pos_A;
		$seqA = substr($sequenceA[$txtA], $start, $tmp_length);
	    }
	    my $running_pos_B=0;
	    for (my $exonB=0; $exonB<$num_exon_B; $exonB++) {
		my $junction_end = $exon_start_B[$txtB][$exonB];
		my $start = $running_pos_B;
		my $seqB = substr($sequenceB[$txtB], $start, $overlap);
		$running_pos_B += $exon_length_B[$txtB][$exonB];
		my $key= ">chr$chr_A\.$junction_start\.chr$chr_B\.$junction_end\n";
		my $junction = $seqA.$seqB;
		#print ">chr$chr_A\.$junction_start\.chr$chr_B\.$junction_end\n";
		#print "$junction\n";
		$junctions{$key}=$junction;
	    }
	}
	}
	
}

for (my $txtA=0; $txtA<=$#sequenceA; $txtA++) {
my $num_exon_A = $#{$exon_length_A[$txtA]};
	
	my $running_pos_A=0;
	for (my $exonA=0; $exonA<$num_exon_A; $exonA++) {
	    $running_pos_A += $exon_length_A[$txtA][$exonA];
	    my $junction_start = $exon_end_A[$txtA][$exonA];
		my $start = $running_pos_A - $overlap;
		my $seqA;
	    if ($start >= 0) {
		$seqA = substr($sequenceA[$txtA], $start, $overlap);
	    }
	    else {
		$start=0;
		my $tmp_length = $running_pos_A;
		$seqA = substr($sequenceA[$txtA], $start, $tmp_length);
	    }
		my $running_pos_B=0;
		my $junction_end = $exon_start_B[$txtA][$exonA+1];
		my $start1 = $running_pos_B;
		my $seqB = substr($sequenceA[$txtA], $start1, $overlap);
		#$running_pos_B += $exon_length_A[$txtA][$exonA+1];
		my $key= ">chr$chr_A\.$junction_start\.chr$chr_A\.$junction_end\n";
		my $junction = $seqA.$seqB;
		#print "$junction\n";
		$junctions_known{$key}=$junction;
		
	}
}

    my @disc;
    my %count = ();
    foreach my $element (keys (%junctions_known), keys (%junctions)) { $count{$element}++ }
    foreach my $element (keys %count) {
            if($count{$element} == 1){ 
			#print $element;
            push @disc , $element;
			}
    }

foreach my $junc(@disc){
	if($junctions_known{$junc}){print  $junc,$junctions_known{$junc},"\n";
	}
	else {print  $junc,$junctions{$junc},"\n";
	}

}


##################
##################
### SUBROUTINES
##################
##################

sub rc{
    my $dna = shift;
    my $revcom = reverse($dna);
    $revcom =~ tr/ACGTacgt/TGCAtgca/;
    return $revcom;
}