annotate rDiff/mex/read.cpp @ 2:233c30f91d66

updated python based GFF parsing module which will handle GTF/GFF/GFF3 file types
author vipints <vipin@cbio.mskcc.org>
date Tue, 08 Oct 2013 07:15:44 -0400
parents 0f80a5141704
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
1 /*
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
2 * This program is free software; you can redistribute it and/or modify
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
3 * it under the terms of the GNU General Public License as published by
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
4 * the Free Software Foundation; either version 3 of the License, or
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
5 * (at your option) any later version.
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
6 *
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
7 * Written (W) 2010-2011 Jonas Behr, Regina Bohnert, Gunnar Raetsch
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
8 * Copyright (C) 2010-2011 Max Planck Society
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
9 */
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
10
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
11
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
12 #include "read.h"
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
13
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
14 CRead::CRead() {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
15 read_id = NULL;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
16 sam_line = NULL;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
17 start_pos = 0;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
18 matches = 0;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
19 mismatches = 0;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
20 multiple_alignment_index = 0;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
21 strand = NULL;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
22 left = false;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
23 right = false;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
24 reverse = false;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
25 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
26
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
27 CRead::~CRead() {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
28 delete[] read_id;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
29 delete[] sam_line;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
30 delete[] strand;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
31 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
32
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
33 /*
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
34 * Augments 'coverage' array at the positions covered by the read in the queried interval.
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
35 */
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
36 void CRead::get_coverage(int p_start_pos, int p_end_pos, uint32_t* coverage)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
37 {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
38 // block1 block2
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
39 // |=====|======|============|===========|======|====|
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
40 // ^ ^ ^
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
41 // p_start_pos | p_end_pos
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
42 // start_pos
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
43 // |0000001111111111111000000000000111111100000|
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
44 // *coverage
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
45 int len = p_end_pos-p_start_pos+1;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
46 for (uint32_t n = 0; n < block_starts.size(); n++) {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
47 int32_t from, to;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
48 from = block_starts[n]+start_pos-p_start_pos;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
49 to = block_starts[n]+start_pos-p_start_pos+block_lengths[n];
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
50 if (from < 0)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
51 from = 0;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
52 if (to < 0)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
53 continue;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
54 else if (to > len)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
55 to = len;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
56 for (int bp=from; bp<to; bp++) {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
57 coverage[bp]++;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
58 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
59 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
60 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
61 int CRead::get_last_position()
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
62 {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
63 if (block_starts.size()>0) // this if for some reason zero in case of softclips
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
64 return start_pos+block_starts.back()+block_lengths.back();
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
65 return -1;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
66 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
67
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
68 /*
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
69 * Adds the column indices (= positions) covered by the read to 'reads' array in current row (= read).
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
70 * These indices can be used to build up a sparse matrix of reads x positions.
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
71 */
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
72 void CRead::get_reads_sparse(int p_start_pos, int p_end_pos, double* reads, uint32_t & reads_c, uint32_t row_idx) {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
73 int len = p_end_pos-p_start_pos+1;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
74 for (uint32_t n = 0; n < block_starts.size(); n++) {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
75 uint32_t from, to;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
76 if (block_starts[n]+start_pos-p_start_pos >= 0)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
77 from = block_starts[n]+start_pos-p_start_pos;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
78 else
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
79 from = 0;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
80 if (block_starts[n]+start_pos-p_start_pos+block_lengths[n] >= 0)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
81 to = block_starts[n]+start_pos-p_start_pos+block_lengths[n];
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
82 else
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
83 to = 0;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
84 for (int bp=from; bp<to&bp<len; bp++) {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
85 reads[reads_c] = row_idx+1; // row indices for sparse matrix
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
86 reads[reads_c+1] = bp+1; // column indices for sparse matrix
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
87 reads_c += 2;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
88 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
89 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
90 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
91
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
92 void CRead::get_acc_splice_sites(vector<int>* acc_pos)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
93 {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
94 if (strand[0]=='+')
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
95 {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
96 for (int k=1;k<block_starts.size(); k++)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
97 acc_pos->push_back(start_pos+block_starts[k]-1);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
98 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
99 else if (strand[0]=='-')
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
100 {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
101 for (int k=1;k<block_starts.size(); k++)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
102 acc_pos->push_back(start_pos+block_starts[k-1]+block_lengths[k-1]-2);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
103 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
104 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
105
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
106 void CRead::get_don_splice_sites(vector<int>* don_pos)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
107 {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
108
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
109 if (strand[0]=='+')
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
110 {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
111 for (int k=1;k<block_starts.size(); k++)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
112 don_pos->push_back(start_pos+block_starts[k-1]+block_lengths[k-1]-2);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
113 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
114 else if (strand[0]=='-')
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
115 {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
116 for (int k=1;k<block_starts.size(); k++)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
117 don_pos->push_back(start_pos+block_starts[k]-1);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
118 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
119 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
120
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
121 int CRead::min_exon_len()
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
122 {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
123 int min = 1e8;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
124 for (int k=0;k<block_starts.size(); k++)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
125 if (block_lengths[k]<min)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
126 min = block_lengths[k];
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
127 return min;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
128 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
129
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
130 int CRead::max_intron_len()
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
131 {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
132 int max = 0;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
133 for (int k=1;k<block_starts.size(); k++)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
134 if (block_starts[k]-(block_starts[k-1]+block_lengths[k-1])>max)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
135 max = block_starts[k]-(block_starts[k-1]+block_lengths[k-1]);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
136 return max;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
137 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
138
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
139 /*
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
140 * Adds start and end of introns in the read consecutively to the 'introns' vector.
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
141 */
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
142 void CRead::get_introns(vector<int>* introns)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
143 {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
144 for (int i=1; i<block_starts.size(); i++)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
145 {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
146 int istart = block_starts[i-1]+block_lengths[i-1]+start_pos;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
147 int iend = block_starts[i]+start_pos-1;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
148 introns->push_back(istart);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
149 introns->push_back(iend);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
150 //fprintf(stdout, "%i intron: %d->%d\n", i, istart, iend);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
151 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
152 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
153 void CRead::get_introns(vector<uint32_t>* intron_starts, vector<uint32_t>* intron_ends, vector<uint32_t>* block_len1, vector<uint32_t>* block_len2)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
154 {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
155 for (int i=1; i<block_starts.size(); i++)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
156 {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
157 uint32_t istart = block_starts[i-1]+block_lengths[i-1]+start_pos;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
158 uint32_t iend = block_starts[i]+start_pos-1;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
159 intron_starts->push_back(istart);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
160 intron_ends->push_back(iend);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
161 block_len1->push_back(block_lengths[i-1]) ;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
162 block_len2->push_back(block_lengths[i]) ;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
163 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
164 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
165
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
166 bool CRead::operator==(const CRead& read) const
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
167 {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
168 if (block_starts.size()!=read.block_starts.size())
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
169 return false;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
170 if (block_lengths.size()!=read.block_lengths.size())
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
171 return false;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
172 if (start_pos!=read.start_pos)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
173 return false;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
174 if (strand[0] != read.strand[0])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
175 return false;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
176 for (int i=0; i<block_starts.size(); i++)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
177 if (block_starts[i]!=read.block_starts[i])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
178 return false;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
179 for (int i=0; i<block_lengths.size(); i++)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
180 if (block_lengths[i]!=read.block_lengths[i])
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
181 return false;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
182 return true;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
183 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
184
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
185 void CRead::print()
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
186 {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
187 fprintf(stdout, "start_pos: %d\n", start_pos);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
188 fprintf(stdout, "starts:");
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
189 for (int i=0; i<block_starts.size(); i++)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
190 {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
191 fprintf(stdout, " %d", block_starts[i]);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
192 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
193 fprintf(stdout, "\n");
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
194
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
195 fprintf(stdout, "lengths:");
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
196 for (int i=0; i<block_starts.size(); i++)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
197 {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
198 fprintf(stdout, " %d", block_lengths[i]);
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
199 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
200 fprintf(stdout, "\n");
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
201 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
202
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
203 void CRead::set_strand(char s)
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
204 {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
205 delete[] strand;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
206 strand = new char [2];
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
207 strand[0] = s;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
208 strand[1] = '0';
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
209 }
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
210
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
211 int CRead::get_mismatches()
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
212 {
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
213 return mismatches ;
0f80a5141704 version 0.3 uploaded
vipints
parents:
diff changeset
214 }