Mercurial > repos > vipints > deseq_hts
comparison deseq-hts_2.0/mex/get_bam_properties.cpp @ 10:2fe512c7bfdf draft
DESeq2 version 1.0.19 added to the repo
author | vipints <vipin@cbio.mskcc.org> |
---|---|
date | Tue, 08 Oct 2013 08:15:34 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
9:e27b4f7811c2 | 10:2fe512c7bfdf |
---|---|
1 /* | |
2 * This program is free software; you can redistribute it and/or modify | |
3 * it under the terms of the GNU General Public License as published by | |
4 * the Free Software Foundation; either version 3 of the License, or | |
5 * (at your option) any later version. | |
6 * | |
7 * Written (W) 2009-2011 Regina Bohnert | |
8 * Copyright (C) 2009-2011 Max Planck Society | |
9 */ | |
10 | |
11 | |
12 #include <stdio.h> | |
13 #include <stdlib.h> | |
14 #include <signal.h> | |
15 #include <ctype.h> | |
16 #include <assert.h> | |
17 | |
18 #include <vector> | |
19 using std::vector; | |
20 #include <string> | |
21 using std::string; | |
22 #include <algorithm> | |
23 using std::find; | |
24 using std::min; | |
25 | |
26 #include <mex.h> | |
27 | |
28 | |
29 char *get_string(const mxArray *prhs); | |
30 | |
31 typedef unsigned int uint32_t; | |
32 typedef unsigned char uint8_t; | |
33 | |
34 /* | |
35 * [read_len num_reads] = get_bam_properties(fname, path_samtools, contig_name) | |
36 * | |
37 * -- input -- | |
38 * prhs[0] file name of paired reads in BAM format (sorted by read id) | |
39 * prhs[1] path to samtools | |
40 * prhs[2] contig name | |
41 * | |
42 * -- output -- | |
43 * plhs[0] length of read | |
44 * plhs[1] number of unique reads | |
45 */ | |
46 void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { | |
47 // checks for the right number of arguments | |
48 if (nrhs !=3 || nlhs > 2) { | |
49 mexErrMsgTxt("number of input and output args should be 3 and 2\nUSAGE:\n [read_len, num_reads] = get_bam_properties(fname, path_samtools, contig_name)\n"); | |
50 return; | |
51 } | |
52 | |
53 signal(SIGCHLD, SIG_IGN); // avoid zombies | |
54 | |
55 // read input arguments | |
56 char *fname = get_string(prhs[0]); | |
57 char *path_samtools = get_string(prhs[1]); | |
58 char *contig_name = get_string(prhs[2]); | |
59 char command[10000]; | |
60 | |
61 sprintf(command, "%s./samtools view %s %s 2>/dev/null", path_samtools, fname, contig_name); | |
62 //printf("%s\n", command); | |
63 | |
64 // get number of unique reads | |
65 int status; | |
66 uint32_t num_unique_reads = 0; | |
67 char command2[10000]; | |
68 sprintf(command2, "%s | cut -f 1 | sort -u | wc -l", command); | |
69 FILE* fp = popen(command2, "r"); | |
70 if (fp == NULL) { | |
71 mexErrMsgTxt("Error using popen\n"); | |
72 } | |
73 int num_scans = 1; | |
74 num_scans = fscanf(fp, "%d", &num_unique_reads); | |
75 if (num_scans != 1) { | |
76 rewind(fp); | |
77 char ret[1000]; | |
78 fgets(ret, 1000, fp); | |
79 fprintf(stdout, "%s", ret); | |
80 mexErrMsgTxt("Could not determine number of reads\n"); | |
81 } | |
82 status = pclose(fp); | |
83 //printf("%i", num_unique_reads); | |
84 | |
85 // select reads for given positions and strand | |
86 int num_rows_selected = min((int) num_unique_reads, 100); | |
87 sprintf(command, "%s | head -n %i | cut -f 1-11", command, num_rows_selected); | |
88 fp = popen(command, "r"); | |
89 if (fp == NULL) { | |
90 mexErrMsgTxt("Error using popen\n"); | |
91 } | |
92 /* SAM format | |
93 1: read id, 2: flag, 3: reference name, 4: start (1-based, incl.), 5: mapping quality, | |
94 6: CIGAR, 7: mate reference name, 8: mate start (1-based, incl.), 9: insert size, 10: read, 11: quality | |
95 12+: additional tags | |
96 */ | |
97 uint32_t read_idx = 0, row_idx = 0, num_col = 0; | |
98 uint32_t flag = 0, start_pos = 0, map_score = 0, mate_end_pos = 0, num_matches = 0, num_del = 0, num_ins = 0, ins_size = 0; | |
99 char ri [1000], read_contig_name [1000], cg [1000], mate_read_id [1000], read [1000], read_qual [1000]; | |
100 string last_read_id; | |
101 vector<uint32_t> block_lengths, block_starts; | |
102 vector<string> read_ids; | |
103 vector<string>::iterator it; | |
104 | |
105 uint32_t read_len = 0; | |
106 bool empty_line = true; | |
107 int num_rows = 0; | |
108 while(empty_line && num_rows < num_rows_selected) { | |
109 num_col = fscanf(fp, "%s\t%i\t%s\t%i\t%i\t%s\t%s\t%i\t%i\t%s\t%s", &ri, &flag, &read_contig_name, &start_pos, &map_score, &cg, &mate_read_id, &mate_end_pos, &ins_size, &read, &read_qual); | |
110 if (num_col != 11) { | |
111 mexErrMsgTxt("error reading SAM line\n"); | |
112 } | |
113 | |
114 string cigar = (string) cg; | |
115 // ignore lines with reads w/o mapping information | |
116 if (start_pos == 0 || cigar.compare("*")==0) { | |
117 continue; | |
118 } | |
119 // parse CIGAR | |
120 uint last_c = 0; | |
121 string last_str; | |
122 num_matches = 0; | |
123 char *end = NULL; | |
124 uint32_t tmp_nm = 0, tmp_nd = 0, tmp_ni = 0; | |
125 uint32_t last_block_start = 0, last_block_length = 0, last_intron_len = 0; | |
126 block_lengths.clear(); block_starts.clear(); | |
127 | |
128 for (uint c = 0; c < cigar.size(); c++) { | |
129 switch (cigar[c]) { | |
130 case 'M': | |
131 last_str = cigar.substr(last_c, c-last_c); | |
132 tmp_nm = strtoul(last_str.c_str(), &end, 10); | |
133 if (*end != '\0') | |
134 mexErrMsgTxt("error: number of mismatches\n"); | |
135 end = NULL; | |
136 last_block_length += tmp_nm; | |
137 num_matches += tmp_nm; | |
138 last_c = c + 1; | |
139 break; | |
140 case 'I': | |
141 last_str = cigar.substr(last_c, c-last_c); | |
142 tmp_ni = strtoul(last_str.c_str(), &end, 10); | |
143 if (*end != '\0') | |
144 mexErrMsgTxt("error: number of insertions\n"); | |
145 end = NULL; | |
146 num_ins += tmp_ni; | |
147 last_c = c + 1; | |
148 break; | |
149 case 'D': | |
150 last_str = cigar.substr(last_c, c-last_c); | |
151 tmp_nd = strtoul(last_str.c_str(), &end, 10); | |
152 if (*end != '\0') | |
153 mexErrMsgTxt("error: number of deletions\n"); | |
154 end = NULL; | |
155 num_del += tmp_nd; | |
156 last_block_length += tmp_nd; | |
157 last_c = c + 1; | |
158 break; | |
159 case 'N': | |
160 last_str = cigar.substr(last_c, c-last_c); | |
161 last_intron_len = strtoul(last_str.c_str(), &end, 10); | |
162 end = NULL; | |
163 last_c = c + 1; | |
164 break; | |
165 case 'S': | |
166 break; | |
167 case 'H': | |
168 break; | |
169 case 'P': | |
170 break; | |
171 default: | |
172 break; | |
173 } | |
174 if (cigar[c] == 'N' || c==cigar.size()-1) { | |
175 block_starts.push_back(last_block_start); | |
176 last_block_start = last_block_start + last_block_length + last_intron_len; | |
177 last_intron_len = 0; | |
178 block_lengths.push_back(last_block_length); | |
179 last_block_length = 0; | |
180 } | |
181 } | |
182 read_len = 0; | |
183 for (uint n = 0; n < block_lengths.size(); n++) { | |
184 read_len += block_lengths[n]; | |
185 } | |
186 empty_line = false; | |
187 } // end of stream parsing | |
188 | |
189 status = pclose(fp); | |
190 | |
191 if (empty_line) | |
192 mexErrMsgTxt("Could not determine read length\n"); | |
193 | |
194 plhs[0] = mxCreateDoubleScalar((double) read_len); | |
195 plhs[1] = mxCreateDoubleScalar((double) num_unique_reads); | |
196 | |
197 return; | |
198 } | |
199 | |
200 | |
201 char *get_string(const mxArray *prhs) { | |
202 char *buf; | |
203 int buflen; | |
204 if (!prhs) | |
205 mexErrMsgTxt("get_string called with NULL pointer arg"); | |
206 if (!mxIsChar(prhs)) | |
207 mexErrMsgTxt("input is not a string"); | |
208 if (mxGetM(prhs) != 1) | |
209 mexErrMsgTxt("input is not a row vector"); | |
210 buflen = mxGetN(prhs) + 1; | |
211 buf = (char*) malloc(buflen); | |
212 /* copy the string from prhs into buf and add terminating NULL char */ | |
213 if (mxGetString(prhs, buf, buflen)) | |
214 mexErrMsgTxt("not enough space"); | |
215 return buf; | |
216 } |