annotate rDiff/src/tools/transform_single_end_reads.m @ 2:233c30f91d66

updated python based GFF parsing module which will handle GTF/GFF/GFF3 file types
author vipints <vipin@cbio.mskcc.org>
date Tue, 08 Oct 2013 07:15:44 -0400
parents 0f80a5141704
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
1 function [UNIQUE_NEW_EXONS, GRAPHNODES, ORDER_OF_GRAPHNODE, EIRS_IN_SEQ] = transform_single_end_reads(gene, SEQUENCED_LENGTH)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
2 % This function calculates all regions onto which a read may fall.
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
3 % SEQUENCED_LENGTH is the length of a read.
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
4
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
5 SPLICINGEVENTS=gene.splicingevents;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
6 SEQUENCE=gene.sequence;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
7 EXONSEQUENCE=gene.exonsequence;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
8
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
9 NB_OF_TRANS = size(EXONSEQUENCE,1);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
10
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
11 NEWEXONS = 1:(length(SPLICINGEVENTS) - 1);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
12 UNIQUE_NEW_EXONS = NEWEXONS(SEQUENCE>0);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
13
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
14 NB_OF_EXONS = length(UNIQUE_NEW_EXONS);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
15 MAX_EXON_NB_TO_POS = cumsum((NEWEXONS .* SEQUENCE) > 0);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
16
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
17 % NB_EXONSEQUENCE is for each transcript the position in
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
18 % SPLICINGEVENTS where one of its exonic region starts and 0 if it
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
19 % is intronic
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
20
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
21 NB_EXONSEQUENCE = EXONSEQUENCE .* repmat(NEWEXONS,NB_OF_TRANS,1);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
22
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
23 %%% This contains all exons
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
24 GRAPHNODES = [];
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
25
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
26 %%% This is the array where a 1 is in column j if that EIR (Exons in
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
27 %a region covered by a read) is contained in the transcript
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
28 EIRS_IN_SEQ = [];
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
29
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
30 CURRENT_NODE = 0;% To catch errors with non-initalized variable
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
31
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
32 EIR_TRANSCRIPTS = cell(1,NB_OF_TRANS);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
33 ORDER_OF_GRAPHNODE = cell(1,NB_OF_TRANS);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
34 LENGTHS_OF_GRAPHNODE = cell(1,NB_OF_TRANS);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
35
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
36 SPLICING_LENGTHS = SPLICINGEVENTS(2:end) - SPLICINGEVENTS(1:end-1);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
37
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
38 for i = 1:NB_OF_TRANS
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
39
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
40 %%% remove zero positions, adjust splicing positions
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
41 CURRENT_EXONS = NB_EXONSEQUENCE(i, EXONSEQUENCE(i,:) > 0);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
42
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
43 SPLICING_CORRECT = cumsum(SPLICING_LENGTHS .* (NB_EXONSEQUENCE(i,:) == 0));
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
44 %SPLICING_CORRECT contains the position of SPLICINGSEQUENCE in
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
45 %the transcript when all introns are spliced out
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
46 SPLICING_CORRECT = [1, SPLICINGEVENTS(2:end) - SPLICING_CORRECT];
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
47
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
48 %This ensures that the end of the transcript is also in CURRENT_SPLICINGEVENTS
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
49 IDX = EXONSEQUENCE(i,:) == 1 ;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
50 IDX(find(EXONSEQUENCE(i,:) == 1, 1, 'last') + 1) = true;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
51 CURRENT_SPLICINGEVENTS = SPLICING_CORRECT(IDX);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
52 if length(CURRENT_SPLICINGEVENTS) == 0
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
53 continue
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
54 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
55 LASTPOS = CURRENT_SPLICINGEVENTS(end);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
56 if LASTPOS <= SEQUENCED_LENGTH
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
57 warning('CURRENT_SPLICINGEVENTS(end) > SEQUENCED_LENGTH')
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
58 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
59 %assert(LASTPOS > SEQUENCED_LENGTH,'CURRENT_SPLICINGEVENTS(end) > SEQUENCED_LENGTH')
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
60 % Calculate the positions when the EIRS can change
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
61
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
62 % Determine positions which start SEQUENCED_LENGTH positions before a splicing event
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
63 % defines a window of size SEQUENCED_LENGTH around the CURRENT_SPLICINGEVENTS
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
64 READEVENTS_START = max([CURRENT_SPLICINGEVENTS(1:end - 1) - SEQUENCED_LENGTH + 1; ones(1,length(CURRENT_SPLICINGEVENTS)-1)],[],1);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
65 READEVENTS_END = min([CURRENT_SPLICINGEVENTS(2:end); repmat(LASTPOS - SEQUENCED_LENGTH,1,length(CURRENT_SPLICINGEVENTS(2:end)))],[],1);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
66
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
67 % Calculate EIRS
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
68 % CHANGE_POINTS are those points in a transcript where a EIR changes, namly the splicesites of that transcript plus and
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
69 % minus the SEQUENCED_LENGTH - the above descibed window
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
70 CHANGE_POINTS = unique([READEVENTS_START, READEVENTS_END]);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
71
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
72 for j = 1:(length(CHANGE_POINTS) - 1)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
73
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
74 POINTS_OF_INTEREST = ( READEVENTS_START(1,:) <= CHANGE_POINTS(j)) & (READEVENTS_END(1,:) > CHANGE_POINTS(j));
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
75
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
76 % MAX_EXON_NB_TO_POS is mapping back to the unspliced coordinates
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
77 CURRENT_EIRS = zeros(1,NB_OF_EXONS);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
78 CURRENT_EIRS( MAX_EXON_NB_TO_POS(CURRENT_EXONS(POINTS_OF_INTEREST))) = 1;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
79
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
80 %%% Already seen such exon composition in sliding
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
81 %%% window?
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
82 [TEMP, CURRENT_NODE] = intersect(GRAPHNODES, CURRENT_EIRS, 'rows');
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
83 if isempty(TEMP)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
84 GRAPHNODES = [GRAPHNODES; CURRENT_EIRS]; %Add Key
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
85 EIRS_IN_SEQ = [EIRS_IN_SEQ, zeros(NB_OF_TRANS,1)];
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
86 CURRENT_NODE = size(GRAPHNODES,1);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
87 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
88
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
89 EIRS_IN_SEQ(i,CURRENT_NODE) = 1;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
90 ORDER_OF_GRAPHNODE{i} = [ORDER_OF_GRAPHNODE{i}, CURRENT_NODE];
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
91 LENGTHS_OF_GRAPHNODE{i} = [LENGTHS_OF_GRAPHNODE{i}, [CHANGE_POINTS(j); CHANGE_POINTS(j+1)]];
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
92 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
93 end
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
94