0
+ − 1 /*
+ − 2 * This program is free software; you can redistribute it and/or modify
+ − 3 * it under the terms of the GNU General Public License as published by
+ − 4 * the Free Software Foundation; either version 3 of the License, or
+ − 5 * (at your option) any later version.
+ − 6 *
+ − 7 * Written (W) 2009-2011 Regina Bohnert
+ − 8 * Copyright (C) 2009-2011 Max Planck Society
+ − 9 */
+ − 10
+ − 11
+ − 12 #include <stdio.h>
+ − 13 #include <stdlib.h>
+ − 14 #include <signal.h>
+ − 15 #include <ctype.h>
+ − 16 #include <assert.h>
+ − 17
+ − 18 #include <vector>
+ − 19 using std::vector;
+ − 20 #include <string>
+ − 21 using std::string;
+ − 22 #include <algorithm>
+ − 23 using std::find;
+ − 24 using std::min;
+ − 25
+ − 26 #include <mex.h>
+ − 27
+ − 28
+ − 29 char *get_string(const mxArray *prhs);
+ − 30
+ − 31 typedef unsigned int uint32_t;
+ − 32 typedef unsigned char uint8_t;
+ − 33
+ − 34 /*
+ − 35 * [read_len num_reads] = get_bam_properties(fname, path_samtools, contig_name)
+ − 36 *
+ − 37 * -- input --
+ − 38 * prhs[0] file name of paired reads in BAM format (sorted by read id)
+ − 39 * prhs[1] path to samtools
+ − 40 * prhs[2] contig name
+ − 41 *
+ − 42 * -- output --
+ − 43 * plhs[0] length of read
+ − 44 * plhs[1] number of unique reads
+ − 45 */
+ − 46 void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+ − 47 // checks for the right number of arguments
+ − 48 if (nrhs !=3 || nlhs > 2) {
+ − 49 mexErrMsgTxt("number of input and output args should be 3 and 2\nUSAGE:\n [read_len, num_reads] = get_bam_properties(fname, path_samtools, contig_name)\n");
+ − 50 return;
+ − 51 }
+ − 52
+ − 53 signal(SIGCHLD, SIG_IGN); // avoid zombies
+ − 54
+ − 55 // read input arguments
+ − 56 char *fname = get_string(prhs[0]);
+ − 57 char *path_samtools = get_string(prhs[1]);
+ − 58 char *contig_name = get_string(prhs[2]);
+ − 59 char command[10000];
+ − 60
+ − 61 sprintf(command, "%s./samtools view %s %s 2>/dev/null", path_samtools, fname, contig_name);
+ − 62 //printf("%s\n", command);
+ − 63
+ − 64 // get number of unique reads
+ − 65 int status;
+ − 66 uint32_t num_unique_reads = 0;
+ − 67 char command2[10000];
+ − 68 sprintf(command2, "%s | cut -f 1 | sort -u | wc -l", command);
+ − 69 FILE* fp = popen(command2, "r");
+ − 70 if (fp == NULL) {
+ − 71 mexErrMsgTxt("Error using popen\n");
+ − 72 }
+ − 73 int num_scans = 1;
+ − 74 num_scans = fscanf(fp, "%d", &num_unique_reads);
+ − 75 if (num_scans != 1) {
+ − 76 rewind(fp);
+ − 77 char ret[1000];
+ − 78 fgets(ret, 1000, fp);
+ − 79 fprintf(stdout, "%s", ret);
+ − 80 mexErrMsgTxt("Could not determine number of reads\n");
+ − 81 }
+ − 82 status = pclose(fp);
+ − 83 //printf("%i", num_unique_reads);
+ − 84
+ − 85 // select reads for given positions and strand
+ − 86 int num_rows_selected = min((int) num_unique_reads, 100);
+ − 87 sprintf(command, "%s | head -n %i | cut -f 1-11", command, num_rows_selected);
+ − 88 fp = popen(command, "r");
+ − 89 if (fp == NULL) {
+ − 90 mexErrMsgTxt("Error using popen\n");
+ − 91 }
+ − 92 /* SAM format
+ − 93 1: read id, 2: flag, 3: reference name, 4: start (1-based, incl.), 5: mapping quality,
+ − 94 6: CIGAR, 7: mate reference name, 8: mate start (1-based, incl.), 9: insert size, 10: read, 11: quality
+ − 95 12+: additional tags
+ − 96 */
+ − 97 uint32_t read_idx = 0, row_idx = 0, num_col = 0;
+ − 98 uint32_t flag = 0, start_pos = 0, map_score = 0, mate_end_pos = 0, num_matches = 0, num_del = 0, num_ins = 0, ins_size = 0;
+ − 99 char ri [1000], read_contig_name [1000], cg [1000], mate_read_id [1000], read [1000], read_qual [1000];
+ − 100 string last_read_id;
+ − 101 vector<uint32_t> block_lengths, block_starts;
+ − 102 vector<string> read_ids;
+ − 103 vector<string>::iterator it;
+ − 104
+ − 105 uint32_t read_len = 0;
+ − 106 bool empty_line = true;
+ − 107 int num_rows = 0;
+ − 108 while(empty_line && num_rows < num_rows_selected) {
+ − 109 num_col = fscanf(fp, "%s\t%i\t%s\t%i\t%i\t%s\t%s\t%i\t%i\t%s\t%s", &ri, &flag, &read_contig_name, &start_pos, &map_score, &cg, &mate_read_id, &mate_end_pos, &ins_size, &read, &read_qual);
+ − 110 if (num_col != 11) {
+ − 111 mexErrMsgTxt("error reading SAM line\n");
+ − 112 }
+ − 113
+ − 114 string cigar = (string) cg;
+ − 115 // ignore lines with reads w/o mapping information
+ − 116 if (start_pos == 0 || cigar.compare("*")==0) {
+ − 117 continue;
+ − 118 }
+ − 119 // parse CIGAR
+ − 120 uint last_c = 0;
+ − 121 string last_str;
+ − 122 num_matches = 0;
+ − 123 char *end = NULL;
+ − 124 uint32_t tmp_nm = 0, tmp_nd = 0, tmp_ni = 0;
+ − 125 uint32_t last_block_start = 0, last_block_length = 0, last_intron_len = 0;
+ − 126 block_lengths.clear(); block_starts.clear();
+ − 127
+ − 128 for (uint c = 0; c < cigar.size(); c++) {
+ − 129 switch (cigar[c]) {
+ − 130 case 'M':
+ − 131 last_str = cigar.substr(last_c, c-last_c);
+ − 132 tmp_nm = strtoul(last_str.c_str(), &end, 10);
+ − 133 if (*end != '\0')
+ − 134 mexErrMsgTxt("error: number of mismatches\n");
+ − 135 end = NULL;
+ − 136 last_block_length += tmp_nm;
+ − 137 num_matches += tmp_nm;
+ − 138 last_c = c + 1;
+ − 139 break;
+ − 140 case 'I':
+ − 141 last_str = cigar.substr(last_c, c-last_c);
+ − 142 tmp_ni = strtoul(last_str.c_str(), &end, 10);
+ − 143 if (*end != '\0')
+ − 144 mexErrMsgTxt("error: number of insertions\n");
+ − 145 end = NULL;
+ − 146 num_ins += tmp_ni;
+ − 147 last_c = c + 1;
+ − 148 break;
+ − 149 case 'D':
+ − 150 last_str = cigar.substr(last_c, c-last_c);
+ − 151 tmp_nd = strtoul(last_str.c_str(), &end, 10);
+ − 152 if (*end != '\0')
+ − 153 mexErrMsgTxt("error: number of deletions\n");
+ − 154 end = NULL;
+ − 155 num_del += tmp_nd;
+ − 156 last_block_length += tmp_nd;
+ − 157 last_c = c + 1;
+ − 158 break;
+ − 159 case 'N':
+ − 160 last_str = cigar.substr(last_c, c-last_c);
+ − 161 last_intron_len = strtoul(last_str.c_str(), &end, 10);
+ − 162 end = NULL;
+ − 163 last_c = c + 1;
+ − 164 break;
+ − 165 case 'S':
+ − 166 break;
+ − 167 case 'H':
+ − 168 break;
+ − 169 case 'P':
+ − 170 break;
+ − 171 default:
+ − 172 break;
+ − 173 }
+ − 174 if (cigar[c] == 'N' || c==cigar.size()-1) {
+ − 175 block_starts.push_back(last_block_start);
+ − 176 last_block_start = last_block_start + last_block_length + last_intron_len;
+ − 177 last_intron_len = 0;
+ − 178 block_lengths.push_back(last_block_length);
+ − 179 last_block_length = 0;
+ − 180 }
+ − 181 }
+ − 182 read_len = 0;
+ − 183 for (uint n = 0; n < block_lengths.size(); n++) {
+ − 184 read_len += block_lengths[n];
+ − 185 }
+ − 186 empty_line = false;
+ − 187 } // end of stream parsing
+ − 188
+ − 189 status = pclose(fp);
+ − 190
+ − 191 if (empty_line)
+ − 192 mexErrMsgTxt("Could not determine read length\n");
+ − 193
+ − 194 plhs[0] = mxCreateDoubleScalar((double) read_len);
+ − 195 plhs[1] = mxCreateDoubleScalar((double) num_unique_reads);
+ − 196
+ − 197 return;
+ − 198 }
+ − 199
+ − 200
+ − 201 char *get_string(const mxArray *prhs) {
+ − 202 char *buf;
+ − 203 int buflen;
+ − 204 if (!prhs)
+ − 205 mexErrMsgTxt("get_string called with NULL pointer arg");
+ − 206 if (!mxIsChar(prhs))
+ − 207 mexErrMsgTxt("input is not a string");
+ − 208 if (mxGetM(prhs) != 1)
+ − 209 mexErrMsgTxt("input is not a row vector");
+ − 210 buflen = mxGetN(prhs) + 1;
+ − 211 buf = (char*) malloc(buflen);
+ − 212 /* copy the string from prhs into buf and add terminating NULL char */
+ − 213 if (mxGetString(prhs, buf, buflen))
+ − 214 mexErrMsgTxt("not enough space");
+ − 215 return buf;
+ − 216 }