view deseq-hts_1.0/mex/get_reads.cpp @ 1:8ab01cc29c4b draft

Uploaded
author vipints
date Wed, 27 Jun 2012 15:35:11 -0400
parents 94a108763d9e
children
line wrap: on
line source

/*
*  This program is free software; you can redistribute it and/or modify
*  it under the terms of the GNU General Public License as published by
*  the Free Software Foundation; either version 3 of the License, or
*  (at your option) any later version.
*
*   Written (W) 2010-2011 Jonas Behr, Regina Bohnert, Gunnar Raetsch
*   Copyright (C) 2010-2011 Max Planck Society
*/


#include <stdio.h>
#include <string.h>
#include <signal.h>
#include <mex.h>
#include <algorithm>
#include <vector>
	using std::vector;
#include "get_reads_direct.h"
#include "mex_input.h"
#include "read.h"

#define MAXLINE 10000

/*
 * input: 
 * 1 bam file
 * 2 chromosome
 * 3 region start (1-based index)
 * 4 region end (1-based index)
 * 5 strand (either '+' or '-' or '0')
 * [6] collapse flag: if true the reads are collapsed to a coverage track
 * [7] subsample percentage: percentage of reads to be subsampled (in per mill)
 * [8] intron length filter
 * [9] exon length filter
 * [10] mismatch filter
 * [11] bool: use mapped reads for coverage
 * [12] bool: use spliced reads for coverage
 * [13] return maxminlen
 * [14] return pair coverage
 *
 * output: 
 * 1 coverage
 * [2] intron cell array
 * [3] pair coverage
 * [4] pair list
 *
 * example call: 
 * [cov introns] = get_reads('polyA_left_I+_el15_mm1_spliced.bam', 'I', 10000, 12000, '-', 1, 30);
 */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
  
  if (nrhs<5 || nrhs>14 || (nlhs<1 || nlhs>4)) {
    fprintf(stderr, "usage: [x [introns] [pair]] = get_reads(fname, chr, start, end, strand, [collapse], [subsample], [max intron length], [min exonlength], [max mismatches], [mapped], [spliced], [maxminlen], [pair]);\n");
    return; 
  }
  
  /* obligatory arguments
   * **********************/
  char *fname = get_string(prhs[0]);
	//fprintf(stdout, "arg1: %s\n", fname);
  char *chr = get_string(prhs[1]);
	//fprintf(stdout, "arg2: %s\n", chr);
  int from_pos = get_int(prhs[2]);
	//fprintf(stdout, "arg3: %d\n", from_pos);
  int to_pos = get_int(prhs[3]);
	//fprintf(stdout, "arg4: %d\n", to_pos);
  char *strand = get_string(prhs[4]);
	//fprintf(stdout, "arg5: %s\n", strand);

  if (from_pos>to_pos)
     mexErrMsgTxt("Start (arg 3) must be <= end (arg 4)\n");

  if (strand[0]!='+' && strand[0]!='-' && strand[0]!='0') 
    mexErrMsgTxt("Unknown strand (arg 5): either + or - or 0");

  /* optional arguments
   * ******************/	
  int collapse = 0;
  if (nrhs>=6)
    collapse = get_int(prhs[5]);
  
  int subsample = 1000;	
  if (nrhs>=7)
    subsample = get_int(prhs[6]);
	  
  int intron_len_filter = 1e9;
  if (nrhs>=8)
    intron_len_filter = get_int(prhs[7]);

  int exon_len_filter = -1;
  if (nrhs>=9)
    exon_len_filter = get_int(prhs[8]);

  int filter_mismatch = 1e9;
  if (nrhs>=10)
    filter_mismatch = get_int(prhs[9]);

  int mapped = 1;
  if (nrhs>=11)
    mapped = get_int(prhs[10]);

  int spliced = 1;
  if (nrhs>=12)
    spliced = get_int(prhs[11]);

  int maxminlen = 0;
  if (nrhs>=13)
    maxminlen = get_int(prhs[12]);

  int pair_cov = 0;
	if (nrhs>=14)
		pair_cov = get_int(prhs[13]);

  /* call function to get reads
   * **************************/	
  char region[MAXLINE];
  sprintf(region, "%s:%i-%i", chr, from_pos, to_pos);

  vector<CRead*> all_reads;
  
  get_reads_from_bam(fname, region, &all_reads, strand[0], subsample);
 
  /* filter reads
   * **************/	
  int left = 0;
  int right = 0;
  
  vector<CRead*> reads;
  for (int i=0; i<all_reads.size(); i++) {
    if (all_reads[i]->left)
      left++;
    if (all_reads[i]->right)
      right++;
    if (all_reads[i]->max_intron_len()<intron_len_filter && all_reads[i]->min_exon_len()>exon_len_filter && all_reads[i]->get_mismatches()<=filter_mismatch)
      reads.push_back(all_reads[i]);
  }

 
  /* prepare output
   * **************/	
  int num_rows = reads.size();
  int num_pos = to_pos-from_pos+1;
 
  if (pair_cov==1 && nlhs>=3) {
    // sort reads by read_id
    printf("\n\nleft:%i right:%i \n\n", left, right);
    sort(reads.begin(), reads.end(), CRead::compare_by_read_id);
  }
  
  // read coverages collapsed 
  if (collapse) {
    plhs[0] = mxCreateNumericMatrix(1, num_pos, mxUINT32_CLASS, mxREAL);
    uint32_t *mask_ret = (uint32_t*) mxGetData(plhs[0]);
    if (num_pos>0 && mask_ret==NULL)
      mexErrMsgTxt("Error allocating memory\n");
    if (mapped && spliced) {
      for (int i=0; i<reads.size(); i++) {
	reads[i]->get_coverage(from_pos, to_pos, mask_ret);
      }
    } else {
      for (int i=0; i<reads.size(); i++) {
	ssize_t num_exons = reads[i]->block_starts.size();
	if ((num_exons==1 && mapped) || (num_exons>1 && spliced))
	  reads[i]->get_coverage(from_pos, to_pos, mask_ret);
      }
    }
  }
  // reads not collapsed
  else {
    uint32_t nzmax = 0; // maximal number of nonzero elements 
    int len = to_pos-from_pos+1;
    for (uint i=0; i<reads.size(); i++) {
      for (uint n = 0; n < reads[i]->block_starts.size(); n++) {
	uint32_t from, to;
	if (reads[i]->block_starts[n]+reads[i]->start_pos-from_pos >= 0)
	  from = reads[i]->block_starts[n]+reads[i]->start_pos-from_pos;
	else
	  from = 0;
	if (reads[i]->block_starts[n]+reads[i]->start_pos-from_pos+reads[i]->block_lengths[n] >= 0)
	  to = reads[i]->block_starts[n]+reads[i]->start_pos-from_pos+reads[i]->block_lengths[n];
	else
	  to = 0;
	for (int bp=from; bp<to&bp<len; bp++) {
	  nzmax++;
	}
      }
    }
    // 1st row: row indices
    // 2nd row: column indices
    plhs[0] = mxCreateDoubleMatrix(2, nzmax, mxREAL);
    double *mask_ret = (double*) mxGetData(plhs[0]);
    if (nzmax>0 && mask_ret==NULL)
      mexErrMsgTxt("Error allocating memory\n");
    uint32_t mask_ret_c = 0; // counter
    for (uint i=0; i<reads.size(); i++) {
      reads[i]->get_reads_sparse(from_pos, to_pos, mask_ret, mask_ret_c, i);
    }
    if (mask_ret_c!=2*nzmax)
      mexErrMsgTxt("Error filling index arrays for sparse matrix\n");
  }
  // introns
  if (maxminlen==0 && nlhs>=2) {
      vector<int> intron_list;
      for (int i=0; i<reads.size(); i++) {
        reads[i]->get_introns(&intron_list);
      }
      
      plhs[1] = mxCreateNumericMatrix(2, intron_list.size()/2, mxUINT32_CLASS, mxREAL);
      uint32_t *p_intron_list = (uint32_t*) mxGetData(plhs[1]);
      for (int p = 0; p<intron_list.size(); p++) {
        p_intron_list[p] = intron_list[p];
      }
      intron_list.clear();	
    } else if (nlhs>=2) {
      vector<uint32_t> intron_starts;
      vector<uint32_t> intron_ends;
      vector<uint32_t> block_len1;
      vector<uint32_t> block_len2;
      for (int i=0; i<reads.size(); i++) {
        reads[i]->get_introns(&intron_starts, &intron_ends, &block_len1, &block_len2);
      }
      
      plhs[1] = mxCreateNumericMatrix(4, intron_starts.size(), mxINT32_CLASS, mxREAL);
      uint32_t *p_intron_list = (uint32_t*) mxGetData(plhs[1]);
      for (int p = 0; p<intron_starts.size(); p++) {	
        p_intron_list[4*p] = intron_starts[p];
        p_intron_list[(4*p)+1] = intron_ends[p];
        p_intron_list[(4*p)+2] = block_len1[p];
        p_intron_list[(4*p)+3] = block_len2[p];
      }   
      intron_starts.clear() ; 
      intron_ends.clear() ;
      block_len1.clear() ;
      block_len2.clear() ;
  }
  if (pair_cov==1 && nlhs>=3) {
    plhs[2] = mxCreateNumericMatrix(1, num_pos, mxUINT32_CLASS, mxREAL);
    uint32_t *p_pair_map = (uint32_t*) mxGetData(plhs[2]);
    if (num_pos>0 && p_pair_map==NULL)
      mexErrMsgTxt("Error allocating memory\n");
    
    vector<int> pair_ids;
    
    int take_cnt = 0;
    int discard_cnt = 0; 
    // find consecutive reads with the same id
    for (int i=0; i<((int) reads.size())-1; i++) {
      int j = i+1;
      while(j<reads.size() && strcmp(reads[i]->read_id, reads[j]->read_id) == 0) {
        if ((reads[i]->left && reads[j]->right) || (reads[j]->left && reads[i]->right) && (reads[i]->reverse != reads[j]->reverse)) {
	      if (reads[i]->get_last_position()==-1 || reads[j]->get_last_position()==-1)
	        break;
          if (reads[i]->get_last_position()<reads[j]->start_pos && reads[j]->start_pos-reads[i]->get_last_position()<60000) {
            int from = std::max(0, reads[i]->get_last_position()-from_pos);
            int to = std::min(num_pos-1, reads[j]->start_pos-from_pos);
            pair_ids.push_back(i);
            pair_ids.push_back(j);
            for (int k=from; k<to; k++)
              p_pair_map[k]++;
            take_cnt++;
          } else if (reads[i]->start_pos>reads[j]->get_last_position() && reads[j]->get_last_position()-reads[i]->start_pos<60000) {
            int from = std::max(0, reads[j]->get_last_position()-from_pos);
            int to = std::min(num_pos-1, reads[i]->start_pos-from_pos);
            pair_ids.push_back(i);
            pair_ids.push_back(j);
            for (int k=from; k<to; k++)
              p_pair_map[k]++;
            take_cnt++;
          } else
            discard_cnt++;
        }
        else
          discard_cnt++;
        j++;
      }
    }
    printf("take:%i, discard:%i \n", take_cnt, discard_cnt);
    
    if (nlhs>=4) {
      plhs[3] = mxCreateNumericMatrix(2, pair_ids.size()/2, mxUINT32_CLASS, mxREAL);
      uint32_t *pair_ids_ret = (uint32_t*) mxGetData(plhs[3]);
      if (pair_ids.size()>0 && pair_ids_ret==NULL)
        mexErrMsgTxt("Error allocating memory\n");
      for (int i=0; i<pair_ids.size(); i++) {
        pair_ids_ret[i] = pair_ids[i];
      }
    }
  }
  for (int i=0; i<all_reads.size(); i++)
    delete all_reads[i];
}