diff PsiCLASS-1.0.2/samtools-0.1.19/bam_rmdupse.c @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam_rmdupse.c	Fri Mar 26 16:52:45 2021 +0000
@@ -0,0 +1,159 @@
+#include <math.h>
+#include "sam.h"
+#include "khash.h"
+#include "klist.h"
+
+#define QUEUE_CLEAR_SIZE 0x100000
+#define MAX_POS 0x7fffffff
+
+typedef struct {
+	int endpos;
+	uint32_t score:31, discarded:1;
+	bam1_t *b;
+} elem_t, *elem_p;
+#define __free_elem(p) bam_destroy1((p)->data.b)
+KLIST_INIT(q, elem_t, __free_elem)
+typedef klist_t(q) queue_t;
+
+KHASH_MAP_INIT_INT(best, elem_p)
+typedef khash_t(best) besthash_t;
+
+typedef struct {
+	uint64_t n_checked, n_removed;
+	besthash_t *left, *rght;
+} lib_aux_t;
+KHASH_MAP_INIT_STR(lib, lib_aux_t)
+
+static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib)
+{
+	khint_t k = kh_get(lib, aux, lib);
+	if (k == kh_end(aux)) {
+		int ret;
+		char *p = strdup(lib);
+		lib_aux_t *q;
+		k = kh_put(lib, aux, p, &ret);
+		q = &kh_val(aux, k);
+		q->left = kh_init(best);
+		q->rght = kh_init(best);
+		q->n_checked = q->n_removed = 0;
+		return q;
+	} else return &kh_val(aux, k);
+}
+
+static inline int sum_qual(const bam1_t *b)
+{
+	int i, q;
+	uint8_t *qual = bam1_qual(b);
+	for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i];
+	return q;
+}
+
+static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, int score)
+{
+	elem_t *p = kl_pushp(q, queue);
+	p->discarded = 0;
+	p->endpos = endpos; p->score = score;
+	if (p->b == 0) p->b = bam_init1();
+	bam_copy1(p->b, b);
+	return p;
+}
+
+static void clear_besthash(besthash_t *h, int32_t pos)
+{
+	khint_t k;
+	for (k = kh_begin(h); k != kh_end(h); ++k)
+		if (kh_exist(h, k) && kh_val(h, k)->endpos <= pos)
+			kh_del(best, h, k);
+}
+
+static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h)
+{
+	if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) {
+		khint_t k;
+		while (1) {
+			elem_t *q;
+			if (queue->head == queue->tail) break;
+			q = &kl_val(queue->head);
+			if (q->discarded) {
+				q->b->data_len = 0;
+				kl_shift(q, queue, 0);
+				continue;
+			}
+			if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break;
+			samwrite(out, q->b);
+			q->b->data_len = 0;
+			kl_shift(q, queue, 0);
+		}
+		for (k = kh_begin(h); k != kh_end(h); ++k) {
+			if (kh_exist(h, k)) {
+				clear_besthash(kh_val(h, k).left, pos);
+				clear_besthash(kh_val(h, k).rght, pos);
+			}
+		}
+	}
+}
+
+void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
+{
+	bam1_t *b;
+	queue_t *queue;
+	khint_t k;
+	int last_tid = -2;
+	khash_t(lib) *aux;
+
+	aux = kh_init(lib);
+	b = bam_init1();
+	queue = kl_init(q);
+	while (samread(in, b) >= 0) {
+		bam1_core_t *c = &b->core;
+		int endpos = bam_calend(c, bam1_cigar(b));
+		int score = sum_qual(b);
+		
+		if (last_tid != c->tid) {
+			if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux);
+			last_tid = c->tid;
+		} else dump_alignment(out, queue, c->pos, aux);
+		if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) {
+			push_queue(queue, b, endpos, score);
+		} else {
+			const char *lib;
+			lib_aux_t *q;
+			besthash_t *h;
+			uint32_t key;
+			int ret;
+			lib = bam_get_library(in->header, b);
+			q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
+			++q->n_checked;
+			h = (c->flag&BAM_FREVERSE)? q->rght : q->left;
+			key = (c->flag&BAM_FREVERSE)? endpos : c->pos;
+			k = kh_put(best, h, key, &ret);
+			if (ret == 0) { // in the hash table
+				elem_t *p = kh_val(h, k);
+				++q->n_removed;
+				if (p->score < score) {
+					if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue
+						p->discarded = 1;
+						kh_val(h, k) = push_queue(queue, b, endpos, score);
+					} else { // replace
+						p->score = score; p->endpos = endpos;
+						bam_copy1(p->b, b);
+					}
+				} // otherwise, discard the alignment
+			} else kh_val(h, k) = push_queue(queue, b, endpos, score);
+		}
+	}
+	dump_alignment(out, queue, MAX_POS, aux);
+
+	for (k = kh_begin(aux); k != kh_end(aux); ++k) {
+		if (kh_exist(aux, k)) {
+			lib_aux_t *q = &kh_val(aux, k);
+			fprintf(stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
+					(long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
+			kh_destroy(best, q->left); kh_destroy(best, q->rght);
+			free((char*)kh_key(aux, k));
+		}
+	}
+	kh_destroy(lib, aux);
+	bam_destroy1(b);
+	kl_destroy(q, queue);
+}