view gecko/src/w2hd.c @ 2:ac1ecf12863a draft default tip

Uploaded
author bitlab
date Thu, 13 Dec 2018 08:57:14 -0500
parents 35af401890c0
children
line wrap: on
line source

/* Creates a hash table on disk from a set of ordered words
   Syntax: w2hd  wordsSort.In  prefixNameOUT

    wordsSort is a bin file with Word-Pos-Seq
    prefixNameOUT.d2hW  : index of words-Pos-Occurrences
    prefixNameOUT.d2hP  : positions(Pos+seq)

    Feb.2011: add a new parameter: PrefixSize

    PrefixSize: defines the word-prefix size to be used to identify when two
		words are the "same"
		(note: this version takes no PrefixSize argument on the command
		line; main() passes a fixed length of 32 to wordcmp)

                              ortrelles@uma.es / Dec.2011
    ---------------------------------------------------------*/

#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <inttypes.h>
#include "structs.h"
#include "commonFunctions.h"
#include "dictionaryFunctions.h"

int main(int ac, char** av){

	char fname[1024];
	uint64_t nWords=0;
	FILE* fw, *fOut1, *fOut2;

	wentry we;
	hashentry he;
	location loc;

	if(ac!=3)terror("USE: w2hd  wordsSort.In  prefixNameOUT\n");

	if ((fw = fopen(av[1],"rb"))==NULL) terror("opening IN file");

	sprintf(fname,"%s.d2hW",av[2]);
	if ((fOut1 = fopen(fname,"wb"))==NULL) terror("opening prefix.d2hW file");
	sprintf(fname,"%s.d2hP",av[2]);
	if ((fOut2 = fopen(fname,"wb"))==NULL) terror("opening prefix.d2hP file");

	if(fread(&we,sizeof(wentry),1,fw)!=1){
		terror("empty words file");
	}
	memcpy(&he.w.b[0],&we.w.b[0],8);
	he.pos=0;
	he.num=0;
   
	while(!feof(fw)){
		  loc.pos=we.pos;
		  loc.seq=we.seq;
		  if (wordcmp(&he.w.b[0],&we.w.b[0],32)!=0) {
			 fwrite(&he,sizeof(hashentry),1,fOut1);
			 memcpy(&he.w.b[0],&we.w.b[0],8);
			 he.pos=ftell(fOut2);
			 he.num=0;
		  }

		  fwrite(&loc,sizeof(location),1,fOut2);
		  he.num++;
		  nWords++;

		  if(fread(&we,sizeof(wentry),1,fw)!=sizeof(wentry)){
			if(ferror(fw))terror("error reading words file");
		  }
	}

	fwrite(&he,sizeof(hashentry),1,fOut1);

	fprintf(stdout,"\nw2hd: %s tot words=%" PRIu64 "\n",av[1],nWords);

	fclose(fOut1);
	fclose(fOut2);
	fclose(fw);

	return 0;
}