diff rDiff/src/tools/read_utils/get_reads.cpp @ 0:0f80a5141704

version 0.3 uploaded
author vipints
date Thu, 14 Feb 2013 23:38:36 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rDiff/src/tools/read_utils/get_reads.cpp	Thu Feb 14 23:38:36 2013 -0500
@@ -0,0 +1,346 @@
+/* written by Jonas Behr, Regina Bohnert and Gunnar Raetsch, FML Tuebingen, Germany, 2010 */
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+#include <mex.h>
+#include <algorithm>
+#include <vector>
+	using std::vector;
+#include "get_reads_direct.h"
+#include "mex_input.h"
+#include "read.h"
+
+#define MAXLINE 10000
+
+/*
+ * input: 
+ * 1 bam file
+ * 2 chromosome
+ * 3 region start (1-based index)
+ * 4 region end (1-based index)
+ * 5 strand (either '+' or '-' or '0')
+ * [6] collapse flag: if true the reads are collapsed to a coverage track
+ * [7] subsample percentage: percentage of reads to be subsampled (in per mill)
+ * [8] intron length filter
+ * [9] exon length filter
+ * [10] mismatch filter
+ * [11] bool: use mapped reads for coverage
+ * [12] bool: use spliced reads for coverage
+ * [13] return maxminlen
+ * [14] return pair coverage and pair index list
+ * [15] only_clipped 
+ * [15] switch of pair filter
+ *
+ * output: 
+ * 1 coverage
+ * [2] intron cell array
+ * [3] pair coverage
+ * [4] pair list
+ *
+ * example call: 
+ * [cov introns] = get_reads('polyA_left_I+_el15_mm1_spliced.bam', 'I', 10000, 12000, '-', 1, 30);
+ */
+void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+	
+	if (nrhs<5 || nrhs>16 || (nlhs<1 || nlhs>4)) {
+		fprintf(stderr, "usage: [x [introns] [pair]] = get_reads(fname, chr, start, end, strand, [collapse], [subsample], [max intron length], [min exonlength], [max mismatches], [mapped], [spliced], [maxminlen], [pair], [only clipped], [all pairs]);\n");
+		return; 
+	}
+	
+	/* obligatory arguments
+	 * **********************/
+	char *fname = get_string(prhs[0]);
+	//fprintf(stdout, "arg1: %s\n", fname);
+	char *chr = get_string(prhs[1]);
+	//fprintf(stdout, "arg2: %s\n", chr);
+	int from_pos = get_int(prhs[2]);
+	//fprintf(stdout, "arg3: %d\n", from_pos);
+	int to_pos = get_int(prhs[3]);
+	//fprintf(stdout, "arg4: %d\n", to_pos);
+	char *strand = get_string(prhs[4]);
+	//fprintf(stdout, "arg5: %s\n", strand);
+
+	if (from_pos>to_pos)
+		 mexErrMsgTxt("Start (arg 3) must be <= end (arg 4)\n");
+
+	if (strand[0]!='+' && strand[0]!='-' && strand[0]!='0') 
+		mexErrMsgTxt("Unknown strand (arg 5): either + or - or 0");
+
+	/* optional arguments
+	 * ******************/	
+	int collapse = 0;
+	if (nrhs>=6)
+		collapse = get_int(prhs[5]);
+	
+	int subsample = 1000;	
+	if (nrhs>=7)
+		subsample = get_int(prhs[6]);
+		
+	int intron_len_filter = 1e9;
+	if (nrhs>=8)
+		intron_len_filter = get_int(prhs[7]);
+
+	int exon_len_filter = -1;
+	if (nrhs>=9)
+		exon_len_filter = get_int(prhs[8]);
+
+	int filter_mismatch = 1e9;
+	if (nrhs>=10)
+		filter_mismatch = get_int(prhs[9]);
+
+	int mapped = 1;
+	if (nrhs>=11)
+		mapped = get_int(prhs[10]);
+
+	int spliced = 1;
+	if (nrhs>=12)
+		spliced = get_int(prhs[11]);
+
+	int maxminlen = 0;
+	if (nrhs>=13)
+		maxminlen = get_int(prhs[12]);
+
+	int pair_cov = 0;
+	if (nrhs>=14)
+		pair_cov = get_int(prhs[13]);
+
+	int only_clipped = 0;
+	if (nrhs>=15)
+		only_clipped = get_int(prhs[14]);
+
+	int no_pair_filter = 0;
+	if (nrhs>=16)
+		no_pair_filter = get_int(prhs[15]);
+
+
+	/* call function to get reads
+	 * **************************/	
+	char region[MAXLINE];
+	sprintf(region, "%s:%i-%i", chr, from_pos, to_pos);
+
+	vector<CRead*> all_reads;
+	
+	get_reads_from_bam(fname, region, &all_reads, strand[0], subsample);
+
+	for (int i=0; i<all_reads.size(); i++) {
+		all_reads[i]->strip_leftright_tag() ;
+	}
+	
+	/* filter reads
+	 * **************/	
+	int left = 0;
+	int right = 0;
+	
+	vector<CRead*> reads;
+	for (int i=0; i<all_reads.size(); i++) {
+		if (all_reads[i]->left)
+			left++;
+		if (all_reads[i]->right)
+			right++;
+		//if (all_reads[i]->max_intron_len()<intron_len_filter && all_reads[i]->min_exon_len()>exon_len_filter && all_reads[i]->get_mismatches()<=filter_mismatch && all_reads[i]->multiple_alignment_index==0)
+		if (all_reads[i]->max_intron_len()<intron_len_filter && all_reads[i]->min_exon_len()>exon_len_filter && all_reads[i]->get_mismatches()<=filter_mismatch && (only_clipped==0 || all_reads[i]->is_clipped))
+			reads.push_back(all_reads[i]);
+	}
+
+ 
+	/* prepare output
+	 * **************/	
+	int num_rows = reads.size();
+	int num_pos = to_pos-from_pos+1;
+ 
+	if (pair_cov==1 && nlhs>=3) {
+		// sort reads by read_id
+		//printf("\n\nleft:%i right:%i \n\n", left, right);
+		//printf("\nreads[0]->read_id: %s\n", reads[0]->read_id);
+		sort(reads.begin(), reads.end(), CRead::compare_by_read_id);
+	}
+	
+	// read coverages collapsed 
+	if (collapse) {
+		plhs[0] = mxCreateNumericMatrix(1, num_pos, mxUINT32_CLASS, mxREAL);
+		uint32_t *mask_ret = (uint32_t*) mxGetData(plhs[0]);
+		if (num_pos>0 && mask_ret==NULL)
+			mexErrMsgTxt("Error allocating memory\n");
+		if (mapped && spliced) {
+			for (int i=0; i<reads.size(); i++) {
+				reads[i]->get_coverage(from_pos, to_pos, mask_ret);
+			}
+		} else {
+			for (int i=0; i<reads.size(); i++) {
+				ssize_t num_exons = reads[i]->block_starts.size();
+				if ((num_exons==1 && mapped) || (num_exons>1 && spliced))
+					reads[i]->get_coverage(from_pos, to_pos, mask_ret);
+			}
+		}
+	}
+	// reads not collapsed
+	else {
+		uint32_t nzmax = 0; // maximal number of nonzero elements 
+		int len = to_pos-from_pos+1;
+		for (uint i=0; i<reads.size(); i++) 
+		{
+			ssize_t num_exons = reads[i]->block_starts.size();
+			if (!((mapped && spliced) || (num_exons==1 && mapped) || (num_exons>1 && spliced)))
+			{
+				continue;
+			}
+			for (uint n = 0; n < reads[i]->block_starts.size(); n++) 
+			{
+				uint32_t from, to;
+				if (reads[i]->block_starts[n]+reads[i]->start_pos-from_pos >= 0)
+					from = reads[i]->block_starts[n]+reads[i]->start_pos-from_pos;
+				else
+					from = 0;
+				if (reads[i]->block_starts[n]+reads[i]->start_pos-from_pos+reads[i]->block_lengths[n] >= 0)
+					to = reads[i]->block_starts[n]+reads[i]->start_pos-from_pos+reads[i]->block_lengths[n];
+				else
+					to = 0;
+				for (int bp=from; bp<to&bp<len; bp++)
+				{
+					nzmax++;
+				}
+			}
+		}
+		// 1st row: row indices
+		// 2nd row: column indices
+		plhs[0] = mxCreateDoubleMatrix(2, nzmax, mxREAL);
+		double *mask_ret = (double*) mxGetData(plhs[0]);
+		if (nzmax>0 && mask_ret==NULL)
+			mexErrMsgTxt("Error allocating memory\n");
+		uint32_t mask_ret_c = 0; // counter
+		for (uint i=0; i<reads.size(); i++) 
+		{
+			ssize_t num_exons = reads[i]->block_starts.size();
+			if (!((mapped && spliced) || (num_exons==1 && mapped) || (num_exons>1 && spliced)))
+			{
+				continue;
+			}
+			reads[i]->get_reads_sparse(from_pos, to_pos, mask_ret, mask_ret_c, i);
+		}
+		if (mask_ret_c!=2*nzmax)
+			mexErrMsgTxt("Error filling index arrays for sparse matrix\n");
+	}
+	// introns
+	if (maxminlen==0 && nlhs>=2) {
+			vector<int> intron_list;
+			for (int i=0; i<reads.size(); i++) {
+				reads[i]->get_introns(&intron_list);
+			}
+			
+			plhs[1] = mxCreateNumericMatrix(2, intron_list.size()/2, mxUINT32_CLASS, mxREAL);
+			uint32_t *p_intron_list = (uint32_t*) mxGetData(plhs[1]);
+			for (int p = 0; p<intron_list.size(); p++) {
+				p_intron_list[p] = intron_list[p];
+			}
+			intron_list.clear();	
+		} else if (nlhs>=2) {
+			vector<uint32_t> intron_starts;
+			vector<uint32_t> intron_ends;
+			vector<uint32_t> block_len1;
+			vector<uint32_t> block_len2;
+			for (int i=0; i<reads.size(); i++) {
+				reads[i]->get_introns(&intron_starts, &intron_ends, &block_len1, &block_len2);
+			}
+			
+			plhs[1] = mxCreateNumericMatrix(4, intron_starts.size(), mxINT32_CLASS, mxREAL);
+			uint32_t *p_intron_list = (uint32_t*) mxGetData(plhs[1]);
+			for (int p = 0; p<intron_starts.size(); p++) {	
+				p_intron_list[4*p] = intron_starts[p];
+				p_intron_list[(4*p)+1] = intron_ends[p];
+				p_intron_list[(4*p)+2] = block_len1[p];
+				p_intron_list[(4*p)+3] = block_len2[p];
+			}	 
+			intron_starts.clear() ; 
+			intron_ends.clear() ;
+			block_len1.clear() ;
+			block_len2.clear() ;
+	}
+	if (pair_cov==1 && nlhs>=3) {
+		plhs[2] = mxCreateNumericMatrix(1, num_pos, mxUINT32_CLASS, mxREAL);
+		uint32_t *p_pair_map = (uint32_t*) mxGetData(plhs[2]);
+		if (num_pos>0 && p_pair_map==NULL)
+			mexErrMsgTxt("Error allocating memory\n");
+		
+		vector<int> pair_ids;
+		
+		int take_cnt = 0;
+		int discard_cnt = 0; 
+		int discard_strand_cnt = 0; 
+		//printf("reads.size(): %i\n", reads.size());
+		// find consecutive reads with the same id
+		for (int i=0; i<((int) reads.size())-1; i++) 
+		{
+			//printf("reads[%i]->read_id: %s\n", i, reads[i]->read_id);
+			int j = i+1;
+			while(j<reads.size() && strcmp(reads[i]->read_id, reads[j]->read_id) == 0) 
+			{
+				if ((reads[i]->left && reads[j]->right) || (reads[j]->left && reads[i]->right) && (reads[i]->reverse != reads[j]->reverse)) 
+				{
+					if (reads[i]->get_last_position()==-1 || reads[j]->get_last_position()==-1)
+						break;
+					if (false)//(reads[i]->strand[0]=='0' && reads[j]->strand[0]=='0' )
+					{ 
+						// discard pairs without strand information
+						discard_strand_cnt++;
+					}
+					else if (reads[i]->get_last_position()<reads[j]->start_pos && reads[j]->start_pos-reads[i]->get_last_position()<60000) 
+					{
+						int from = std::max(0, reads[i]->get_last_position()-from_pos);
+						int to = std::min(num_pos-1, reads[j]->start_pos-from_pos);
+						pair_ids.push_back(i);
+						pair_ids.push_back(j);
+						for (int k=from; k<to; k++)
+							p_pair_map[k]++;
+						take_cnt++;
+					}
+					else if (reads[i]->start_pos>reads[j]->get_last_position() && reads[j]->get_last_position()-reads[i]->start_pos<60000)
+					{
+						int from = std::max(0, reads[j]->get_last_position()-from_pos);
+						int to = std::min(num_pos-1, reads[i]->start_pos-from_pos);
+						pair_ids.push_back(i);
+						pair_ids.push_back(j);
+						for (int k=from; k<to; k++)
+							p_pair_map[k]++;
+						take_cnt++;
+					}
+					else
+					{
+						if (no_pair_filter>0 && reads[i]->start_pos<reads[j]->start_pos)
+						{
+							pair_ids.push_back(i);
+							pair_ids.push_back(j);
+							take_cnt++;
+						}
+						else if (no_pair_filter>0)
+						{
+							pair_ids.push_back(j);
+							pair_ids.push_back(i);
+							take_cnt++;
+						}
+						else
+							discard_cnt++;
+						//printf("istart:%i, ilast:%i  jstart:%i, jlast: %i\n", reads[i]->start_pos, reads[i]->get_last_position(), reads[j]->start_pos, reads[j]->get_last_position());
+					}
+				}
+				else
+					discard_cnt++;
+				j++;
+			}
+		}
+		//printf("take:%i, discard:%i discard_strand_cnt:%i\n", take_cnt, discard_cnt+discard_strand_cnt, discard_strand_cnt);
+		
+		if (nlhs>=4) {
+			plhs[3] = mxCreateNumericMatrix(2, pair_ids.size()/2, mxUINT32_CLASS, mxREAL);
+			uint32_t *pair_ids_ret = (uint32_t*) mxGetData(plhs[3]);
+			if (pair_ids.size()>0 && pair_ids_ret==NULL)
+				mexErrMsgTxt("Error allocating memory\n");
+			for (int i=0; i<pair_ids.size(); i++) {
+				pair_ids_ret[i] = pair_ids[i];
+			}
+		}
+	}
+	for (int i=0; i<all_reads.size(); i++)
+		delete all_reads[i];
+}
+