comparison deseq-hts_1.0/mex/get_bam_properties.cpp @ 0:94a108763d9e draft

deseq-hts version 1.0 wraps the DESeq 1.6.0
author vipints
date Wed, 09 May 2012 20:43:47 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:94a108763d9e
1 /*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 3 of the License, or
5 * (at your option) any later version.
6 *
7 * Written (W) 2009-2011 Regina Bohnert
8 * Copyright (C) 2009-2011 Max Planck Society
9 */
10
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <signal.h>
15 #include <ctype.h>
16 #include <assert.h>
17
18 #include <vector>
19 using std::vector;
20 #include <string>
21 using std::string;
22 #include <algorithm>
23 using std::find;
24 using std::min;
25
26 #include <mex.h>
27
28
29 char *get_string(const mxArray *prhs);
30
31 typedef unsigned int uint32_t;
32 typedef unsigned char uint8_t;
33
34 /*
35 * [read_len num_reads] = get_bam_properties(fname, path_samtools, contig_name)
36 *
37 * -- input --
38 * prhs[0] file name of paired reads in BAM format (sorted by read id)
39 * prhs[1] path to samtools
40 * prhs[2] contig name
41 *
42 * -- output --
43 * plhs[0] length of read
44 * plhs[1] number of unique reads
45 */
46 void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
47 // checks for the right number of arguments
48 if (nrhs !=3 || nlhs > 2) {
49 mexErrMsgTxt("number of input and output args should be 3 and 2\nUSAGE:\n [read_len, num_reads] = get_bam_properties(fname, path_samtools, contig_name)\n");
50 return;
51 }
52
53 signal(SIGCHLD, SIG_IGN); // avoid zombies
54
55 // read input arguments
56 char *fname = get_string(prhs[0]);
57 char *path_samtools = get_string(prhs[1]);
58 char *contig_name = get_string(prhs[2]);
59 char command[10000];
60
61 sprintf(command, "%s./samtools view %s %s 2>/dev/null", path_samtools, fname, contig_name);
62 //printf("%s\n", command);
63
64 // get number of unique reads
65 int status;
66 uint32_t num_unique_reads = 0;
67 char command2[10000];
68 sprintf(command2, "%s | cut -f 1 | sort -u | wc -l", command);
69 FILE* fp = popen(command2, "r");
70 if (fp == NULL) {
71 mexErrMsgTxt("Error using popen\n");
72 }
73 int num_scans = 1;
74 num_scans = fscanf(fp, "%d", &num_unique_reads);
75 if (num_scans != 1) {
76 rewind(fp);
77 char ret[1000];
78 fgets(ret, 1000, fp);
79 fprintf(stdout, "%s", ret);
80 mexErrMsgTxt("Could not determine number of reads\n");
81 }
82 status = pclose(fp);
83 //printf("%i", num_unique_reads);
84
85 // select reads for given positions and strand
86 int num_rows_selected = min((int) num_unique_reads, 100);
87 sprintf(command, "%s | head -n %i | cut -f 1-11", command, num_rows_selected);
88 fp = popen(command, "r");
89 if (fp == NULL) {
90 mexErrMsgTxt("Error using popen\n");
91 }
92 /* SAM format
93 1: read id, 2: flag, 3: reference name, 4: start (1-based, incl.), 5: mapping quality,
94 6: CIGAR, 7: mate reference name, 8: mate start (1-based, incl.), 9: insert size, 10: read, 11: quality
95 12+: additional tags
96 */
97 uint32_t read_idx = 0, row_idx = 0, num_col = 0;
98 uint32_t flag = 0, start_pos = 0, map_score = 0, mate_end_pos = 0, num_matches = 0, num_del = 0, num_ins = 0, ins_size = 0;
99 char ri [1000], read_contig_name [1000], cg [1000], mate_read_id [1000], read [1000], read_qual [1000];
100 string last_read_id;
101 vector<uint32_t> block_lengths, block_starts;
102 vector<string> read_ids;
103 vector<string>::iterator it;
104
105 uint32_t read_len = 0;
106 bool empty_line = true;
107 int num_rows = 0;
108 while(empty_line && num_rows < num_rows_selected) {
109 num_col = fscanf(fp, "%s\t%i\t%s\t%i\t%i\t%s\t%s\t%i\t%i\t%s\t%s", &ri, &flag, &read_contig_name, &start_pos, &map_score, &cg, &mate_read_id, &mate_end_pos, &ins_size, &read, &read_qual);
110 if (num_col != 11) {
111 mexErrMsgTxt("error reading SAM line\n");
112 }
113
114 string cigar = (string) cg;
115 // ignore lines with reads w/o mapping information
116 if (start_pos == 0 || cigar.compare("*")==0) {
117 continue;
118 }
119 // parse CIGAR
120 uint last_c = 0;
121 string last_str;
122 num_matches = 0;
123 char *end = NULL;
124 uint32_t tmp_nm = 0, tmp_nd = 0, tmp_ni = 0;
125 uint32_t last_block_start = 0, last_block_length = 0, last_intron_len = 0;
126 block_lengths.clear(); block_starts.clear();
127
128 for (uint c = 0; c < cigar.size(); c++) {
129 switch (cigar[c]) {
130 case 'M':
131 last_str = cigar.substr(last_c, c-last_c);
132 tmp_nm = strtoul(last_str.c_str(), &end, 10);
133 if (*end != '\0')
134 mexErrMsgTxt("error: number of mismatches\n");
135 end = NULL;
136 last_block_length += tmp_nm;
137 num_matches += tmp_nm;
138 last_c = c + 1;
139 break;
140 case 'I':
141 last_str = cigar.substr(last_c, c-last_c);
142 tmp_ni = strtoul(last_str.c_str(), &end, 10);
143 if (*end != '\0')
144 mexErrMsgTxt("error: number of insertions\n");
145 end = NULL;
146 num_ins += tmp_ni;
147 last_c = c + 1;
148 break;
149 case 'D':
150 last_str = cigar.substr(last_c, c-last_c);
151 tmp_nd = strtoul(last_str.c_str(), &end, 10);
152 if (*end != '\0')
153 mexErrMsgTxt("error: number of deletions\n");
154 end = NULL;
155 num_del += tmp_nd;
156 last_block_length += tmp_nd;
157 last_c = c + 1;
158 break;
159 case 'N':
160 last_str = cigar.substr(last_c, c-last_c);
161 last_intron_len = strtoul(last_str.c_str(), &end, 10);
162 end = NULL;
163 last_c = c + 1;
164 break;
165 case 'S':
166 break;
167 case 'H':
168 break;
169 case 'P':
170 break;
171 default:
172 break;
173 }
174 if (cigar[c] == 'N' || c==cigar.size()-1) {
175 block_starts.push_back(last_block_start);
176 last_block_start = last_block_start + last_block_length + last_intron_len;
177 last_intron_len = 0;
178 block_lengths.push_back(last_block_length);
179 last_block_length = 0;
180 }
181 }
182 read_len = 0;
183 for (uint n = 0; n < block_lengths.size(); n++) {
184 read_len += block_lengths[n];
185 }
186 empty_line = false;
187 } // end of stream parsing
188
189 status = pclose(fp);
190
191 if (empty_line)
192 mexErrMsgTxt("Could not determine read length\n");
193
194 plhs[0] = mxCreateDoubleScalar((double) read_len);
195 plhs[1] = mxCreateDoubleScalar((double) num_unique_reads);
196
197 return;
198 }
199
200
201 char *get_string(const mxArray *prhs) {
202 char *buf;
203 int buflen;
204 if (!prhs)
205 mexErrMsgTxt("get_string called with NULL pointer arg");
206 if (!mxIsChar(prhs))
207 mexErrMsgTxt("input is not a string");
208 if (mxGetM(prhs) != 1)
209 mexErrMsgTxt("input is not a row vector");
210 buflen = mxGetN(prhs) + 1;
211 buf = (char*) malloc(buflen);
212 /* copy the string from prhs into buf and add terminating NULL char */
213 if (mxGetString(prhs, buf, buflen))
214 mexErrMsgTxt("not enough space");
215 return buf;
216 }