diff gecko/src/frags2text.c @ 1:35af401890c0 draft

Uploaded
author bitlab
date Thu, 13 Dec 2018 07:59:25 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gecko/src/frags2text.c	Thu Dec 13 07:59:25 2018 -0500
@@ -0,0 +1,311 @@
+/* 
+ *
+ * Sintax: ./frags2text fragsFILE.frags fastaX fastaY fragsFILE.txt
+ 
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include "structs.h"
+#include "commonFunctions.h"
+#include "comparisonFunctions.h"
+
+#define TAB_INSERT 70
+
+struct rIndex2 * loadReadsIndex(char * filename, uint64_t * nReads){
+	struct rIndex2 * RR;
+	uint64_t nR=0,i;
+	FILE *f;
+	uint64_t fsize;
+
+    if ((f=fopen(filename,"rb"))==NULL) terror("Could not open index input file");
+
+	fseeko(f, 0, SEEK_END);
+	fsize = ftello(f);
+	rewind(f);
+	nR = fsize/sizeof(struct rIndex2);
+
+    if ((RR =(struct rIndex2*) calloc(nR,sizeof(struct rIndex2)))==NULL) terror("Could not allocate index");
+
+	for (i=0; i<nR; i++){
+		if(0 == fread(&RR[i],sizeof(struct rIndex2),1,f)) break;
+	}
+	fclose(f);
+	(*nReads) = nR;
+	return RR;
+}
+
+void write_headers(FILE * f1, FILE * f2, FILE * fO, uint64_t pos1, uint64_t pos2, struct FragFile * f){
+	fseek(f1, pos1, SEEK_SET);
+	char c = fgetc(f1);
+	while(c != '\n'){ fprintf(fO, "%c", c); c = fgetc(f1); }
+	fprintf(fO, " ALIGNED WITH ");
+	fseek(f2, pos2, SEEK_SET);
+	c = fgetc(f2);
+	while(c != '\n'){ fprintf(fO, "%c", c); c = fgetc(f2); }
+	fprintf(fO, " LENGTH: %"PRIu64" IDENT: %"PRIu64" STRAND: %c @(%"PRIu64", %"PRIu64")\n", f->length, f->ident, f->strand, f->xStart, f->yStart);
+}
+
+void get_both_seqs(char * fastaX, char * fastaY, uint64_t iniX, uint64_t finX, uint64_t iniY, uint64_t finY, uint64_t posX, uint64_t posY, uint64_t LacX, uint64_t LacY, uint64_t lX, uint64_t lY, FILE * fO){
+	char copyX[TAB_INSERT+1];
+	char copyY[TAB_INSERT+1];
+	memset(copyX, 0x0, TAB_INSERT+1);
+	memset(copyY, 0x0, TAB_INSERT+1);
+	uint64_t atcopyX = 0;
+	uint64_t atcopyY = 0;
+	uint64_t i;
+
+	uint64_t pos_used_x, pos_used_y;
+	
+	// The X one
+	//fseek(fastaX, posX, SEEK_SET);
+	pos_used_x = posX;
+	char cX = fastaX[pos_used_x++];
+	while(cX != '\n'){ cX = fastaX[pos_used_x++]; }
+	uint64_t gpX = iniX - LacX;
+	uint64_t currX = 0, tab_counter;
+	while(currX < gpX){
+		cX = fastaX[pos_used_x++];
+		if(cX != '\n') ++currX;
+	}
+
+	// the other Y
+	//fseek(fastaY, posY, SEEK_SET);
+	pos_used_y = posY;
+	char cY = fastaY[pos_used_y++];
+	while(cY != '\n'){ cY = fastaY[pos_used_y++]; }
+	uint64_t gpY = iniY - LacY;
+	uint64_t currY = 0;
+	while(currY < gpY){
+		cY = fastaY[pos_used_y++];
+		if(cY != '\n') ++currY;
+	}
+
+
+	
+
+	// Reached the region to show
+	currX = 0;
+	currY = 0;
+	cX = fastaX[pos_used_x++];
+	cY = fastaY[pos_used_y++];
+	//fprintf(fO, "\t");
+	tab_counter = 0;
+	while(currX < lX && currY < lY){
+		
+		if(cX == 'A' || cX == 'C' || cX == 'G' || cX == 'T' || cX == 'N'){ copyX[atcopyX++] = cX; ++currX; }
+		cX = fastaX[pos_used_x++];
+		while(cX != 'A' && cX != 'C' && cX != 'G' && cX != 'T' && cX != 'N') cX = fastaX[pos_used_x++];
+
+
+		if(cY == 'A' || cY == 'C' || cY == 'G' || cY == 'T' || cY == 'N'){ copyY[atcopyY++] = cY; ++currY; }
+		cY = fastaY[pos_used_y++];
+		while(cY != 'A' && cY != 'C' && cY != 'G' && cY != 'T' && cY != 'N') cY = fastaY[pos_used_y++];
+
+		while(currX > currY){
+			if(cY == 'A' || cY == 'C' || cY == 'G' || cY == 'T' || cY == 'N'){ copyY[atcopyY++] = cY; ++currY; }
+			cY = fastaY[pos_used_y++];
+			while(cY != 'A' && cY != 'C' && cY != 'G' && cY != 'T' && cY != 'N') cY = fastaY[pos_used_y++];
+		}
+		while(currX < currY){
+			if(cX == 'A' || cX == 'C' || cX == 'G' || cX == 'T' || cX == 'N'){ copyX[atcopyX++] = cX; ++currX; }
+			cX = fastaX[pos_used_x++];
+			while(cX != 'A' && cX != 'C' && cX != 'G' && cX != 'T' && cX != 'N') cX = fastaX[pos_used_x++];
+		}
+
+		++tab_counter;
+		
+
+
+		if(tab_counter >= TAB_INSERT && currX == currY){ 
+			
+			copyX[TAB_INSERT] = '\0';
+			copyY[TAB_INSERT] = '\0';
+			fprintf(fO, "X:\t%.*s\n\t", TAB_INSERT, copyX);
+			for(i=0; i<TAB_INSERT; i++){
+				if(copyX[i] == copyY[i]) fprintf(fO, "|"); else fprintf(fO, " ");
+			}
+			fprintf(fO, "\nY:\t%.*s\n\n", TAB_INSERT, copyY);
+			tab_counter = 0;
+			atcopyX = 0; atcopyY = 0;
+		}
+	}
+	if(atcopyX > 0){
+		copyX[atcopyX] = '\0';
+		copyY[atcopyY] = '\0';
+		fprintf(fO, "X:\t%.*s\n\t", (int)atcopyX, copyX);
+		for(i=0; i<atcopyX; i++){
+			if(copyX[i] == copyY[i]) fprintf(fO, "|"); else fprintf(fO, " ");
+		}
+		fprintf(fO, "\nY:\t%.*s\n\n", (int)atcopyY, copyY);
+		atcopyX = 0; atcopyY = 0;
+	} 
+	fprintf(fO, "\n");
+}
+
+
+void get_seq_from_to(FILE * fasta, FILE * output, uint64_t ini, uint64_t fin, uint64_t pos, uint64_t Lac, uint64_t seqNum, uint64_t l, FILE * fO){
+	fseek(fasta, pos, SEEK_SET);
+	char c = fgetc(fasta);
+	while(c != '\n'){ c = fgetc(fasta); }
+	uint64_t gp = ini - Lac;
+	uint64_t curr = 0, tab_counter;
+	while(curr < gp){
+		c = fgetc(fasta);
+		if(c != '\n') ++curr;
+	}
+	// Reached the region to show
+	curr = 0;
+	c = fgetc(fasta);
+	fprintf(fO, "\t");
+	tab_counter = 0;
+	while(curr < l){
+		if(c != '\n') fprintf(fO, "%c", c);
+		c = fgetc(fasta);
+		if(c != '\n' || feof(fasta)){ ++curr; ++tab_counter; }
+		if(tab_counter == TAB_INSERT){ fprintf(fO, "\n\t"); tab_counter = 0;}
+	}
+	fprintf(fO, "\n");
+}
+
+
+void get_seq_from_to_rev(FILE * fasta, FILE * output, uint64_t ini, uint64_t fin, uint64_t pos, uint64_t Lac, uint64_t seqNum, uint64_t l, FILE * fO){
+	fseek(fasta, pos, SEEK_SET);
+	char c = fgetc(fasta);
+	while(c != '\n'){ c = fgetc(fasta); }
+	uint64_t gp = ini - Lac;
+	uint64_t curr = 0, tab_counter;
+	while(curr < gp){
+		c = fgetc(fasta);
+		if(c != '\n') ++curr;
+	}
+	// Reached the region to show
+	curr = 0;
+	c = fgetc(fasta);
+	fprintf(fO, "\t");
+	tab_counter = 0;
+	while(curr < l){
+		if(c != '\n') fprintf(fO, "%c", c);
+		c = fgetc(fasta);
+		if(c != '\n' || feof(fasta)){ ++curr; ++tab_counter; }
+		if(tab_counter == TAB_INSERT){ fprintf(fO, "\n\t"); tab_counter = 0;}
+	}
+	fprintf(fO, "\n");
+}
+
+
+
+int main(int ac, char** av) {
+	FILE* fFrags;
+	uint64_t n1, n2;
+	struct FragFile frag;
+
+	if (ac != 9)
+		terror("USE: ./frags2text fragsFILE.frags fastaX fastaY fastaYrev indexX indexY indexYrev fragsFILE.txt");
+
+	// prepared for multiple files
+	if ((fFrags = fopen(av[1], "rb")) == NULL)
+		terror("Opening Frags binary file");
+
+	// Open fastas
+
+	FILE * fX = NULL, * fY = NULL, * fYrev = NULL, * fO = NULL;
+	fX = fopen(av[2], "rt");
+	if(fX == NULL) terror("Could not open fasta X file");
+	fY = fopen(av[3], "rt");
+	if(fY == NULL) terror("Could not open fasta Y file");
+	fYrev = fopen(av[4], "rt");
+	if(fYrev == NULL) terror("Could not open fasta Y-rev file");
+
+
+	// Get file lengths
+    fseek(fX, 0, SEEK_END);
+    uint64_t aprox_lenX = ftell(fX);
+    rewind(fX);
+	char * strfastaX = (char *) malloc(aprox_lenX*sizeof(char));
+	fseek(fY, 0, SEEK_END);
+    uint64_t aprox_lenY = ftell(fY);
+    rewind(fY);
+	char * strfastaY = (char *) malloc(aprox_lenY*sizeof(char));
+	fseek(fYrev, 0, SEEK_END);
+    uint64_t aprox_lenYrev = ftell(fYrev);
+    rewind(fYrev);
+	char * strfastaYrev = (char *) malloc(aprox_lenYrev*sizeof(char));
+
+	if(strfastaX == NULL || strfastaY == NULL || strfastaYrev == NULL) terror("Could not allocate string sequences");
+
+	if(aprox_lenX != fread(strfastaX, sizeof(char), aprox_lenX, fX)) terror("Read wrong number of chars at X sequence");
+	if(aprox_lenY != fread(strfastaY, sizeof(char), aprox_lenY, fY)) terror("Read wrong number of chars at Y sequence");
+	if(aprox_lenYrev != fread(strfastaYrev, sizeof(char), aprox_lenYrev, fYrev)) terror("Read wrong number of chars at Y reversed sequence");
+	
+
+	struct rIndex2 * RI_X, * RI_Y, * RI_Yrev;
+
+	
+	uint64_t nReads_X, nReads_Y;
+	RI_X = loadReadsIndex(av[5], &nReads_X);
+	RI_Y = loadReadsIndex(av[6], &nReads_Y);
+	RI_Yrev = loadReadsIndex(av[7], &nReads_Y);
+
+	fO = fopen(av[8], "wt");
+	if(fO == NULL) terror("Could not open output alignments file");
+
+
+	readSequenceLength(&n1, fFrags);
+	readSequenceLength(&n2, fFrags);
+	
+
+	readFragment(&frag, fFrags);
+
+	// RI_X 	is the forward index for fasta file X
+	// RI_Y 	is the forward index for fasta file Y
+	// RI_Yrev 	is the reverse index for fasta file Y
+
+	
+
+	while (!feof(fFrags)) {
+		
+		//RI[id].pos is position in file of >
+		//RI[id].Lac is the sum of the reads length prior
+
+		if(frag.strand == 'f'){
+			write_headers(fX, fY, fO, RI_X[frag.seqX].pos, RI_Y[frag.seqY].pos, &frag);
+		}else{
+			write_headers(fX, fYrev, fO, RI_X[frag.seqX].pos, RI_Yrev[nReads_Y - frag.seqY - 1].pos, &frag);
+		}
+		
+
+		//get_seq_from_to(fX, fO, frag.xStart, frag.xEnd, RI_X[frag.seqX].pos, RI_X[frag.seqX].Lac, frag.seqX, frag.length, fO);
+
+
+
+		if(frag.strand == 'f'){
+			get_both_seqs(strfastaX, strfastaY, frag.xStart, frag.xEnd, frag.yStart, frag.yEnd, RI_X[frag.seqX].pos, RI_Y[frag.seqY].pos, RI_X[frag.seqX].Lac, RI_Y[frag.seqY].Lac, frag.length, frag.length, fO);
+			//get_seq_from_to(fY, fO, frag.yStart, frag.yEnd, RI_Y[frag.seqY].pos, RI_Y[frag.seqY].Lac, frag.seqY, frag.length, fO);
+		}else{
+			uint64_t seqYnew;
+			seqYnew = nReads_Y - frag.seqY - 1;
+			get_both_seqs(strfastaX, strfastaYrev, frag.xStart, frag.xEnd, frag.yStart, frag.yEnd, RI_X[frag.seqX].pos, RI_Yrev[seqYnew].pos, RI_X[frag.seqX].Lac, RI_Yrev[seqYnew].Lac, frag.length, frag.length, fO);
+			//get_seq_from_to_rev(fYrev, fO, frag.yStart, frag.yEnd, RI_Yrev[seqYnew].pos, RI_Yrev[seqYnew].Lac, seqYnew, frag.length, fO);
+		}
+
+		readFragment(&frag, fFrags);
+	}
+
+	fclose(fFrags);
+	fclose(fX);
+	fclose(fY);
+	fclose(fO);
+
+	free(RI_X);
+	free(RI_Y);
+	free(RI_Yrev);
+
+	free(strfastaX);
+	free(strfastaY);
+	free(strfastaYrev);
+
+	return 0;
+}