Mercurial > repos > vipints > rdiff
diff rDiff/src/tools/transform_single_end_reads.m @ 0:0f80a5141704
version 0.3 uploaded
author | vipints |
---|---|
date | Thu, 14 Feb 2013 23:38:36 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rDiff/src/tools/transform_single_end_reads.m Thu Feb 14 23:38:36 2013 -0500 @@ -0,0 +1,94 @@ +function [UNIQUE_NEW_EXONS, GRAPHNODES, ORDER_OF_GRAPHNODE, EIRS_IN_SEQ] = transform_single_end_reads(gene, SEQUENCED_LENGTH) +% This function calculates all regions onto which a read may fall. +% SEQUENCED_LENGTH is the length of a read. + + SPLICINGEVENTS=gene.splicingevents; + SEQUENCE=gene.sequence; + EXONSEQUENCE=gene.exonsequence; + + NB_OF_TRANS = size(EXONSEQUENCE,1); + + NEWEXONS = 1:(length(SPLICINGEVENTS) - 1); + UNIQUE_NEW_EXONS = NEWEXONS(SEQUENCE>0); + + NB_OF_EXONS = length(UNIQUE_NEW_EXONS); + MAX_EXON_NB_TO_POS = cumsum((NEWEXONS .* SEQUENCE) > 0); + + % NB_EXONSEQUENCE is for each transcript the position in + % SPLICINGEVENTS where one of its exonic region starts and 0 if it + % is intronic + + NB_EXONSEQUENCE = EXONSEQUENCE .* repmat(NEWEXONS,NB_OF_TRANS,1); + + %%% This contains all exons + GRAPHNODES = []; + + %%% This is the array where a 1 is in column j if that EIR (Exons in + %a region covered by a read) is contained in the transcript + EIRS_IN_SEQ = []; + + CURRENT_NODE = 0;% To catch errors with non-initalized variable + + EIR_TRANSCRIPTS = cell(1,NB_OF_TRANS); + ORDER_OF_GRAPHNODE = cell(1,NB_OF_TRANS); + LENGTHS_OF_GRAPHNODE = cell(1,NB_OF_TRANS); + + SPLICING_LENGTHS = SPLICINGEVENTS(2:end) - SPLICINGEVENTS(1:end-1); + + for i = 1:NB_OF_TRANS + + %%% remove zero positions, adjust splicing positions + CURRENT_EXONS = NB_EXONSEQUENCE(i, EXONSEQUENCE(i,:) > 0); + + SPLICING_CORRECT = cumsum(SPLICING_LENGTHS .* (NB_EXONSEQUENCE(i,:) == 0)); + %SPLICING_CORRECT contains the position of SPLICINGSEQUENCE in + %the transcript when all introns are spliced out + SPLICING_CORRECT = [1, SPLICINGEVENTS(2:end) - SPLICING_CORRECT]; + + %This ensures that the end of the transcript is also in CURRENT_SPLICINGEVENTS + IDX = EXONSEQUENCE(i,:) == 1 ; + IDX(find(EXONSEQUENCE(i,:) == 1, 1, 'last') + 1) = true; + CURRENT_SPLICINGEVENTS = SPLICING_CORRECT(IDX); + if length(CURRENT_SPLICINGEVENTS) == 0 + continue + end + LASTPOS = CURRENT_SPLICINGEVENTS(end); + if LASTPOS <= SEQUENCED_LENGTH + warning('CURRENT_SPLICINGEVENTS(end) > SEQUENCED_LENGTH') + end + %assert(LASTPOS > SEQUENCED_LENGTH,'CURRENT_SPLICINGEVENTS(end) > SEQUENCED_LENGTH') + % Calculate the positions when the EIRS can change + + % Determine positions which start SEQUENCED_LENGTH positions before a splicing event + % defines a window of size SEQUENCED_LENGTH around the CURRENT_SPLICINGEVENTS + READEVENTS_START = max([CURRENT_SPLICINGEVENTS(1:end - 1) - SEQUENCED_LENGTH + 1; ones(1,length(CURRENT_SPLICINGEVENTS)-1)],[],1); + READEVENTS_END = min([CURRENT_SPLICINGEVENTS(2:end); repmat(LASTPOS - SEQUENCED_LENGTH,1,length(CURRENT_SPLICINGEVENTS(2:end)))],[],1); + + % Calculate EIRS + % CHANGE_POINTS are those points in a transcript where a EIR changes, namly the splicesites of that transcript plus and + % minus the SEQUENCED_LENGTH - the above descibed window + CHANGE_POINTS = unique([READEVENTS_START, READEVENTS_END]); + + for j = 1:(length(CHANGE_POINTS) - 1) + + POINTS_OF_INTEREST = ( READEVENTS_START(1,:) <= CHANGE_POINTS(j)) & (READEVENTS_END(1,:) > CHANGE_POINTS(j)); + + % MAX_EXON_NB_TO_POS is mapping back to the unspliced coordinates + CURRENT_EIRS = zeros(1,NB_OF_EXONS); + CURRENT_EIRS( MAX_EXON_NB_TO_POS(CURRENT_EXONS(POINTS_OF_INTEREST))) = 1; + + %%% Already seen such exon composition in sliding + %%% window? + [TEMP, CURRENT_NODE] = intersect(GRAPHNODES, CURRENT_EIRS, 'rows'); + if isempty(TEMP) + GRAPHNODES = [GRAPHNODES; CURRENT_EIRS]; %Add Key + EIRS_IN_SEQ = [EIRS_IN_SEQ, zeros(NB_OF_TRANS,1)]; + CURRENT_NODE = size(GRAPHNODES,1); + end + + EIRS_IN_SEQ(i,CURRENT_NODE) = 1; + ORDER_OF_GRAPHNODE{i} = [ORDER_OF_GRAPHNODE{i}, CURRENT_NODE]; + LENGTHS_OF_GRAPHNODE{i} = [LENGTHS_OF_GRAPHNODE{i}, [CHANGE_POINTS(j); CHANGE_POINTS(j+1)]]; + end + end + \ No newline at end of file