view deseq-hts_1.0/mex/get_bam_properties.cpp @ 1:8ab01cc29c4b draft

Uploaded
author vipints
date Wed, 27 Jun 2012 15:35:11 -0400
parents 94a108763d9e
children
line wrap: on
line source

/*
*  This program is free software; you can redistribute it and/or modify
*  it under the terms of the GNU General Public License as published by
*  the Free Software Foundation; either version 3 of the License, or
*  (at your option) any later version.
*
*   Written (W) 2009-2011 Regina Bohnert
*   Copyright (C) 2009-2011 Max Planck Society
*/


#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <ctype.h>
#include <assert.h>

#include <vector>
  using std::vector;
#include <string>
  using std::string;
#include <algorithm>
  using std::find; 
  using std::min;

#include <mex.h>


// Forward declaration; the definition appears after mexFunction in this file.
// Returns a malloc'ed C string copied from a MATLAB char array.
char *get_string(const mxArray *prhs);

// Local fixed-width integer aliases.
// NOTE(review): these reuse the names declared by <stdint.h>; presumably no
// header pulled in above declares them with a different underlying type --
// confirm per platform.
typedef unsigned int uint32_t;
typedef unsigned char uint8_t;

/*
 * [read_len num_reads] = get_bam_properties(fname, path_samtools, contig_name)
 *
 * -- input --
 * prhs[0] file name of paired reads in BAM format (sorted by read id)
 * prhs[1] path to samtools
 * prhs[2] contig name
 *
 * -- output --
 * plhs[0] length of read
 * plhs[1] number of unique reads
*/
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
  // checks for the right number of arguments
  if (nrhs !=3 || nlhs > 2) {
    mexErrMsgTxt("number of input and output args should be 3 and 2\nUSAGE:\n     [read_len, num_reads] = get_bam_properties(fname, path_samtools, contig_name)\n");
    return;
  }

  signal(SIGCHLD, SIG_IGN); // avoid zombies

  // read input arguments
  char *fname = get_string(prhs[0]);
  char *path_samtools = get_string(prhs[1]);
  char *contig_name = get_string(prhs[2]);
  char command[10000];
  
  sprintf(command, "%s./samtools view %s %s 2>/dev/null", path_samtools, fname, contig_name);
  //printf("%s\n", command);

  // get number of unique reads
  int status;
  uint32_t num_unique_reads = 0;
  char command2[10000];
  sprintf(command2, "%s | cut -f 1 | sort -u | wc -l", command);
  FILE* fp = popen(command2, "r");
  if (fp == NULL) {
    mexErrMsgTxt("Error using popen\n");
  }
  int num_scans = 1;
  num_scans = fscanf(fp, "%d", &num_unique_reads);
  if (num_scans != 1) {
    rewind(fp);
    char ret[1000];
    fgets(ret, 1000, fp);
    fprintf(stdout, "%s", ret);
    mexErrMsgTxt("Could not determine number of reads\n");
  }
  status = pclose(fp);
  //printf("%i", num_unique_reads);
  
  // select reads for given positions and strand
  int num_rows_selected = min((int) num_unique_reads, 100);
  sprintf(command, "%s | head -n %i | cut -f 1-11", command, num_rows_selected);
  fp = popen(command, "r");
  if (fp == NULL) {
    mexErrMsgTxt("Error using popen\n");
  }
  /* SAM format
     1: read id, 2: flag, 3: reference name, 4: start (1-based, incl.), 5: mapping quality,
     6: CIGAR, 7: mate reference name, 8: mate start (1-based, incl.), 9: insert size, 10: read, 11: quality
     12+: additional tags  
  */
  uint32_t read_idx = 0, row_idx = 0, num_col = 0;
  uint32_t flag = 0, start_pos = 0, map_score = 0, mate_end_pos = 0, num_matches = 0, num_del = 0, num_ins = 0, ins_size = 0;
  char ri [1000], read_contig_name [1000], cg [1000], mate_read_id [1000], read [1000], read_qual [1000];
  string last_read_id;
  vector<uint32_t> block_lengths, block_starts;
  vector<string> read_ids;
  vector<string>::iterator it;
  
  uint32_t read_len = 0;
  bool empty_line = true;
  int num_rows = 0;
  while(empty_line && num_rows < num_rows_selected) {
    num_col = fscanf(fp, "%s\t%i\t%s\t%i\t%i\t%s\t%s\t%i\t%i\t%s\t%s", &ri, &flag, &read_contig_name, &start_pos, &map_score, &cg, &mate_read_id, &mate_end_pos, &ins_size, &read, &read_qual);
    if (num_col != 11) {
      mexErrMsgTxt("error reading SAM line\n");
    }
    
    string cigar = (string) cg;
    // ignore lines with reads w/o mapping information 
    if (start_pos == 0 || cigar.compare("*")==0) {
      continue;
    }
    // parse CIGAR
    uint last_c = 0;
    string last_str;
    num_matches = 0;
    char *end = NULL;
    uint32_t tmp_nm = 0, tmp_nd = 0, tmp_ni = 0;
    uint32_t last_block_start = 0, last_block_length = 0, last_intron_len = 0;
    block_lengths.clear(); block_starts.clear();
    
    for (uint c = 0; c < cigar.size(); c++) {
      switch (cigar[c]) {
      case 'M':
	last_str = cigar.substr(last_c, c-last_c);
	tmp_nm = strtoul(last_str.c_str(), &end, 10);
	if (*end != '\0')
	  mexErrMsgTxt("error: number of mismatches\n");
	end = NULL;
	last_block_length += tmp_nm;
	num_matches += tmp_nm;
	last_c = c + 1;
	break;
      case 'I':
	last_str = cigar.substr(last_c, c-last_c);
	tmp_ni = strtoul(last_str.c_str(), &end, 10);
	if (*end != '\0')
	  mexErrMsgTxt("error: number of insertions\n");
	end = NULL;
	num_ins += tmp_ni;
	last_c = c + 1;
	break;
      case 'D':
	last_str = cigar.substr(last_c, c-last_c);
	tmp_nd = strtoul(last_str.c_str(), &end, 10);
	if (*end != '\0')
	  mexErrMsgTxt("error: number of deletions\n");
	end = NULL;
	num_del += tmp_nd;
	last_block_length += tmp_nd;
	last_c = c + 1;
	break;
      case 'N':
	last_str = cigar.substr(last_c, c-last_c);
	last_intron_len = strtoul(last_str.c_str(), &end, 10);
	end = NULL;
	last_c = c + 1;
	break;
      case 'S':
	break;
      case 'H':
	break;
      case 'P':
	break;
      default:
	break;
      }
      if (cigar[c] == 'N' || c==cigar.size()-1) {
	block_starts.push_back(last_block_start);
	last_block_start = last_block_start + last_block_length + last_intron_len;
	last_intron_len = 0;
	block_lengths.push_back(last_block_length);
	last_block_length = 0;
      }
    }
    read_len = 0;
    for (uint n = 0; n < block_lengths.size(); n++) {
      read_len += block_lengths[n];
    }
    empty_line = false;
  } // end of stream parsing
	
  status = pclose(fp);
  
  if (empty_line) 
    mexErrMsgTxt("Could not determine read length\n");
  
  plhs[0] = mxCreateDoubleScalar((double) read_len);
  plhs[1] = mxCreateDoubleScalar((double) num_unique_reads);
  
  return;
}


/*
 * Copies a MATLAB char row vector into a newly malloc'ed, NUL-terminated
 * C string.
 *
 * prhs  MATLAB array to convert; must be a char array with exactly one row
 *
 * Returns the buffer; the caller owns it and releases it with free().
 * Raises a MATLAB error (and does not return) if prhs is NULL, is not a char
 * array, does not have exactly one row, or if allocation or copying fails.
 */
char *get_string(const mxArray *prhs) {
  if (!prhs)
    mexErrMsgTxt("get_string called with NULL pointer arg");
  if (!mxIsChar(prhs))
    mexErrMsgTxt("input is not a string");
  if (mxGetM(prhs) != 1)
    mexErrMsgTxt("input is not a row vector");

  // one extra byte for the terminating NUL; kept as size_t to avoid narrowing
  const size_t buflen = mxGetN(prhs) + 1;
  char *buf = static_cast<char*>(malloc(buflen));
  if (buf == NULL)
    mexErrMsgTxt("get_string: out of memory");

  /* copy the string from prhs into buf and add terminating NULL char */
  if (mxGetString(prhs, buf, buflen)) {
    // mexErrMsgTxt does not return, so release the buffer first
    free(buf);
    mexErrMsgTxt("not enough space");
  }
  return buf;
}