diff gecko/src/indexmaker.c @ 1:35af401890c0 draft

Uploaded
author bitlab
date Thu, 13 Dec 2018 07:59:25 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gecko/src/indexmaker.c	Thu Dec 13 07:59:25 2018 -0500
@@ -0,0 +1,133 @@
+
+/********************* 
+File		indexmaker.c
+Author		Bitlab
+Description	Computes a series of statistics and stores them in natural order and sorted order
+	
+USAGE		<sequences.fasta>	The fasta sequences
+			<output.index>		The output name for the two indexes
+ * ------------------------------------------*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <inttypes.h>
+#include "structs.h"
+#include "comparisonFunctions.h"
+#include "commonFunctions.h"
+
+
+void copyRead(struct rIndex2 *To, struct rIndex2 *From);
+void QsortRead(struct rIndex2 *array,uint64_t x, uint64_t y);
+
+int main(int ac, char **av) {
+
+	FILE   *f, *fOut;
+	struct rIndex2 R;//, *RR;
+	//uint64_t *positions;
+	uint64_t  c;
+	uint64_t tLen=0, tLmask=0, tN=0, nReads=0;
+	uint64_t maxRlen=0, maxRlenMasked=0, maxNonACGT=0;
+	uint64_t N=0, i;
+	//char tmp[MAXLID];
+
+	if (ac!=3) terror("USE: indexmaker <sequences.fasta> <output.index>");
+
+	
+	if ((f=fopen(av[1],"rt"))==NULL) terror("Could not open sequences fasta file");
+
+	if ((fOut=fopen(av[2],"wb"))==NULL) terror("Could not open output file");
+
+    R.Lac=0;
+    R.rLen=0;
+    c=fgetc(f);
+    while (!feof(f)) {
+       	while(c!='>') c=(char)fgetc(f); // only first time
+		R.pos=ftell(f) - 1;
+		i=0;
+        c=getc(f);
+        if(i==0 && c== ' ') c=getc(f); //Handle sequences with a ' ' after the '>'
+        while(i< MAXLID && c!='\n' && c!=' ') {
+         	R.id[i++] = c;
+          	c=getc(f);
+        }
+        R.id[i]=0x00;
+
+        while(c!='\n') c=getc(f);
+		
+		i=0;
+		R.rLen=0;
+		R.rLmasked=0;
+		R.nonACGT=0;
+       
+		while(c!='>' && !feof(f)) {
+			if (c=='N' || c=='n') R.nonACGT++; 
+			else if (c>96 && c<123) R.rLmasked++;
+		   	c=toupper(c);
+		  	if (c>64 && c<91) R.rLen++;
+		  	if (c>47 && c<58) R.rLen++; //Handling color space reads
+			
+			c=(char)fgetc(f);
+		}
+		R.rNumber = N;
+		if (R.rLen > maxRlen) maxRlen=R.rLen;
+		if (R.rLmasked > maxRlenMasked) maxRlenMasked = R.rLmasked;
+		if (R.nonACGT  > maxNonACGT) maxNonACGT = R.nonACGT;
+		
+		fwrite(&R,sizeof(struct rIndex2),1,fOut); 
+
+		N++;
+
+		R.Lac +=R.rLen;
+		tLen  +=R.rLen;
+		tN    +=R.nonACGT;
+		tLmask+=R.rLmasked;
+
+		nReads++;
+
+	}
+
+	fclose(f);
+	fclose(fOut);
+
+	fprintf(stdout,"....done\n");
+	return 0;
+}
+
+void copyRead(struct rIndex2 *To, struct rIndex2 *From){
+
+    strcpy(To->id, From->id);
+    To->rNumber = From->rNumber;
+    To->rLen    = From->rLen;
+    To->rLmasked= From->rLmasked;
+    To->nonACGT = From->nonACGT;
+    To->pos     = From->pos;      //Start position of read
+    To->Lac     = From->Lac;      // accumulated length
+}
+
+
+void QsortRead(struct rIndex2 *array,uint64_t x, uint64_t y) {
+
+	struct rIndex2 pivote, aux;
+	uint64_t x1, y1;
+
+	copyRead(&pivote, &array[(x+y)/2]);
+	x1 = x;
+	y1 = y;
+
+ 	do{
+		while (strcmp(pivote.id, array[x1].id)>0) x1++;
+		while (strcmp(pivote.id, array[y1].id)<0) y1--;
+    	if (x1 < y1) { 
+			copyRead(&aux,&array[x1]);
+			copyRead(&array[x1], &array[y1]);
+			copyRead(&array[y1], &aux);
+			x1++;
+			y1--;
+    	}else if (x1 == y1) x1++;
+   } while (x1 <=y1);
+
+  if (x<y1) QsortRead(array,x,y1);
+  if (x1<y) QsortRead(array,x1,y);
+ }