Mercurial > repos > alvarofaure > bitlab
comparison chromeister/src/CHROMEISTER.c @ 0:7fdf47a0bae8 draft
Uploaded
author | alvarofaure |
---|---|
date | Wed, 12 Dec 2018 07:18:40 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:7fdf47a0bae8 |
---|---|
1 /********* | |
2 | |
3 File CHROMEISTER.c | |
4 Author EPW <estebanpw@uma.es> | |
5 Description Computes hits and generates a dotplot | |
6 | |
7 USAGE Usage is described by calling ./CHROMEISTER --help | |
8 | |
9 | |
10 | |
11 **********/ | |
12 | |
13 | |
14 #include <stdio.h> | |
15 #include <stdlib.h> | |
16 #include <math.h> | |
17 #include <string.h> | |
18 #include <ctype.h> | |
19 #include "structs.h" | |
20 #include "alignmentFunctions.h" | |
21 #include "commonFunctions.h" | |
22 #define STARTING_SEQS 1000 | |
23 #define PIECE_OF_DB_REALLOC 3200000 //half a gigabyte if divided by 8 bytes | |
24 #define RANGE 2 | |
25 | |
26 uint64_t custom_kmer = 32; // Defined as external in structs.h | |
27 uint64_t diffuse_z = 4; // Defined as external in structs.h | |
28 | |
29 uint64_t get_seq_len(FILE * f); | |
30 | |
31 | |
32 void init_args(int argc, char ** av, FILE ** query, FILE ** database, FILE ** out_database, uint64_t * custom_kmer, uint64_t * dimension, uint64_t * diffuse_z); | |
33 | |
34 int main(int argc, char ** av){ | |
35 | |
36 | |
37 /* | |
38 //Store positions of kmers | |
39 uint64_t n_pools_used = 0; | |
40 //Mempool_l * mp = (Mempool_l *) malloc(MAX_MEM_POOLS*sizeof(Mempool_l)); | |
41 //if(mp == NULL) terror("Could not allocate vectors for memory pools"); | |
42 Mempool_l mp[MAX_MEM_POOLS]; | |
43 init_mem_pool_llpos(&mp[n_pools_used]); | |
44 //llpos * aux; | |
45 | |
46 uint64_t n_pools_used_AVL = 0; | |
47 Mempool_AVL mp_AVL[MAX_MEM_POOLS]; | |
48 init_mem_pool_AVL(&mp_AVL[n_pools_used_AVL]); | |
49 */ | |
50 Tuple_hits * thit; | |
51 | |
52 /* | |
53 AVLTree * root = NULL; | |
54 root = insert_AVLTree(root, 10, mp_AVL, &n_pools_used_AVL, 0, mp, &n_pools_used); | |
55 | |
56 llpos * some = find_AVLTree(root, 25); | |
57 while(some != NULL){ | |
58 printf("#%"PRIu64", ", some->pos); some = some->next; | |
59 } | |
60 */ | |
61 | |
62 uint64_t i, j; | |
63 | |
64 //query to read kmers from, database to find seeds | |
65 FILE * query = NULL, * database = NULL, * out_database = NULL; | |
66 uint64_t dimension = 1000; // Default 1000 * 1000 | |
67 | |
68 | |
69 init_args(argc, av, &query, &database, &out_database, &custom_kmer, &dimension, &diffuse_z); | |
70 | |
71 | |
72 | |
73 unsigned char char_converter[91]; | |
74 char_converter[(unsigned char)'A'] = 0; | |
75 char_converter[(unsigned char)'C'] = 1; | |
76 char_converter[(unsigned char)'G'] = 2; | |
77 char_converter[(unsigned char)'T'] = 3; | |
78 | |
79 | |
80 // Variables to account for positions | |
81 // Print info | |
82 fprintf(stdout, "[INFO] Loading database\n"); | |
83 // Variables to read kmers | |
84 char c = 'N'; //Char to read character | |
85 // Current length of array and variables for the buffer | |
86 uint64_t idx = 0, r = 0; | |
87 | |
88 //Vector to read in batches | |
89 char * temp_seq_buffer = NULL; | |
90 if ((temp_seq_buffer = calloc(READBUF, sizeof(char))) == NULL) { | |
91 terror("Could not allocate memory for read buffer"); | |
92 } | |
93 | |
94 //Dimensional matrix | |
95 uint64_t ** representation = (uint64_t **) calloc(dimension+1, sizeof(uint64_t *)); | |
96 if(representation == NULL) terror("Could not allocate representation"); | |
97 for(i=0; i<dimension+1; i++){ | |
98 representation[i] = (uint64_t *) calloc(dimension+1, sizeof(uint64_t)); | |
99 if(representation[i] == NULL) terror("Could not allocate second loop representation"); | |
100 } | |
101 | |
102 /* | |
103 fseek(database, 0, SEEK_END); | |
104 uint64_t aprox_len_query = ftell(database); | |
105 uint64_t aprox_len_db = aprox_len_query; | |
106 rewind(database); | |
107 */ | |
108 uint64_t aprox_len_query = get_seq_len(database); | |
109 uint64_t aprox_len_db = aprox_len_query; | |
110 | |
111 uint64_t a_hundreth = (aprox_len_query/100); | |
112 | |
113 unsigned char curr_kmer[custom_kmer], reverse_kmer[custom_kmer]; | |
114 curr_kmer[0] = reverse_kmer[0] = '\0'; | |
115 uint64_t word_size = 0, word_size_rev = 0; | |
116 | |
117 //To hold all information related to database | |
118 uint64_t current_len = 0; | |
119 | |
120 //To force reading from the buffer | |
121 idx = READBUF + 1; | |
122 | |
123 //unsigned char aux_kmer[custom_kmer+1]; | |
124 | |
125 //Vector to store query seq | |
126 unsigned char * seq_vector_query = (unsigned char *) malloc(READBUF*sizeof(unsigned char)); | |
127 if(seq_vector_query == NULL) terror("Could not allocate memory for query vector"); | |
128 | |
129 /* | |
130 Container * ct = (Container *) calloc(1, sizeof(Container)); | |
131 if(ct == NULL) terror("Could not allocate container"); | |
132 */ | |
133 | |
134 | |
135 | |
136 Index * ctidx = (Index *) calloc(1, sizeof(Index)); | |
137 if(ctidx == NULL) terror("Could not allocate container"); | |
138 | |
139 | |
140 //begin = clock(); | |
141 | |
142 | |
143 c = buffered_fgetc(temp_seq_buffer, &idx, &r, database); | |
144 while((!feof(database) || (feof(database) && idx < r))){ | |
145 | |
146 if(c == '>'){ | |
147 | |
148 | |
149 | |
150 while(c != '\n') c = buffered_fgetc(temp_seq_buffer, &idx, &r, database); //Skip ID | |
151 | |
152 | |
153 while(c != '>' && (!feof(database) || (feof(database) && idx < r))){ //Until next id | |
154 c = buffered_fgetc(temp_seq_buffer, &idx, &r, database); | |
155 c = toupper(c); | |
156 if(c == 'A' || c == 'C' || c == 'G' || c == 'T'){ | |
157 curr_kmer[word_size] = (unsigned char) c; | |
158 if(word_size < custom_kmer) ++word_size; | |
159 ++current_len; | |
160 if(current_len % a_hundreth == 0){ | |
161 fprintf(stdout, "\r%"PRIu64"%%...", 1+100*current_len/aprox_len_query); | |
162 //printf("%"PRIu64"%%..wasted: (%e) (%e)", 1+100*pos_in_query/aprox_len_query, (double)(wasted_cycles_forward)/CLOCKS_PER_SEC, (double)(wasted_cycles_reverse)/CLOCKS_PER_SEC); | |
163 fflush(stdout); | |
164 } | |
165 | |
166 | |
167 | |
168 }else{ //It can be anything (including N, Y, X ...) | |
169 | |
170 if(c != '\n' && c != '>'){ | |
171 word_size = 0; | |
172 // data_database.sequences[pos_in_database++] = (unsigned char) 'N'; //Convert to N | |
173 ++current_len; | |
174 | |
175 } | |
176 } | |
177 //if(current_len % 1000000 == 0) printf(" curr len %" PRIu64"\n", current_len); | |
178 if(word_size == custom_kmer){ | |
179 //write to hash table | |
180 | |
181 | |
182 thit = &ctidx->table[char_converter[curr_kmer[0]]][char_converter[curr_kmer[1]]][char_converter[curr_kmer[2]]] | |
183 [char_converter[curr_kmer[3]]][char_converter[curr_kmer[4]]][char_converter[curr_kmer[5]]] | |
184 [char_converter[curr_kmer[6]]][char_converter[curr_kmer[7]]][char_converter[curr_kmer[8]]] | |
185 [char_converter[curr_kmer[9]]][char_converter[curr_kmer[10]]][char_converter[curr_kmer[11]]]; | |
186 | |
187 /* | |
188 typedef struct tuple_hits{ | |
189 int repetition; | |
190 int hit_count; | |
191 uint64_t key; | |
192 uint64_t pos; | |
193 } Tuple_hits; | |
194 */ | |
195 | |
196 if(thit->repetition == FALSE){ | |
197 // Then we can insert | |
198 thit->hit_count = 0; | |
199 thit->key = collisioned_hash(&curr_kmer[0], custom_kmer); | |
200 thit->pos = current_len; | |
201 }else{ | |
202 // Otherwise we break it | |
203 thit->repetition = TRUE; | |
204 } | |
205 | |
206 //thit->root = insert_AVLTree(thit->root, hashOfWord(&curr_kmer[0], custom_kmer, FIXED_K), mp_AVL, &n_pools_used_AVL, current_len, mp, &n_pools_used); | |
207 //thit->root = insert_AVLTree(thit->root, collisioned_hash(&curr_kmer[0], custom_kmer), mp_AVL, &n_pools_used_AVL, current_len, mp, &n_pools_used); | |
208 | |
209 | |
210 | |
211 // Non overlapping | |
212 word_size = 0; | |
213 | |
214 | |
215 // Overlapping | |
216 //memmove(&curr_kmer[0], &curr_kmer[1], custom_kmer-1); | |
217 //--word_size; | |
218 } | |
219 } | |
220 word_size = 0; | |
221 | |
222 }else{ | |
223 c = buffered_fgetc(temp_seq_buffer, &idx, &r, database); | |
224 } | |
225 | |
226 } | |
227 | |
228 | |
229 //end = clock(); | |
230 | |
231 // data_database.total_len = pos_in_database; | |
232 | |
233 //fprintf(stdout, "[INFO] Database loaded and of length %"PRIu64". Hash table building took %e seconds\n", data_database.total_len, (double)(end-begin)/CLOCKS_PER_SEC); | |
234 fprintf(stdout, "[INFO] Database loaded and of length %"PRIu64".\n", current_len); | |
235 //close database | |
236 fclose(database); | |
237 | |
238 | |
239 | |
240 //begin = clock(); | |
241 | |
242 | |
243 | |
244 | |
245 | |
246 double pixel_size_db = (double) dimension / (double) current_len; | |
247 double ratio_db = (double) current_len / dimension; | |
248 | |
249 | |
250 // Get file length | |
251 | |
252 //fseek(query, 0, SEEK_END); | |
253 //aprox_len_query = ftell(query); | |
254 //rewind(query); | |
255 aprox_len_query = get_seq_len(query); | |
256 | |
257 //uint64_t reallocs_hash_holder_table = 1; | |
258 //uint64_t n_items_hash_holder_table = aprox_len_query / 5; | |
259 | |
260 //Hash_holder * hash_holder_table = (Hash_holder *) calloc(n_items_hash_holder_table, sizeof(Hash_holder)); | |
261 //if(hash_holder_table == NULL) terror("Could not allocate hash holding table"); | |
262 | |
263 a_hundreth = (aprox_len_query/100); | |
264 double pixel_size_query = (double) dimension / (double) aprox_len_query; | |
265 double ratio_query = (double) aprox_len_query / dimension; | |
266 | |
267 | |
268 double i_r_fix = MAX(1.0, custom_kmer * pixel_size_query); | |
269 double j_r_fix = MAX(1.0, custom_kmer * pixel_size_db); | |
270 | |
271 | |
272 | |
273 fprintf(stdout, "[INFO] Ratios: Q [%e] D [%e]. Lenghts: Q [%"PRIu64"] D [%"PRIu64"]\n", ratio_query, ratio_db, aprox_len_query, current_len); | |
274 fprintf(stdout, "[INFO] Pixel size: Q [%e] D [%e].\n", pixel_size_query, pixel_size_db); | |
275 | |
276 | |
277 fprintf(stdout, "[INFO] Computing absolute hit numbers.\n"); | |
278 | |
279 | |
280 current_len = 0; | |
281 | |
282 //llpos * the_original_hit; | |
283 | |
284 //To force reading from the buffer | |
285 idx = READBUF + 1; | |
286 c = buffered_fgetc(temp_seq_buffer, &idx, &r, query); | |
287 //uint64_t c_hash_holder = 0; | |
288 | |
289 while((!feof(query) || (feof(query) && idx < r))){ | |
290 | |
291 if(c == '>'){ | |
292 word_size = 0; | |
293 word_size_rev = custom_kmer-1; | |
294 | |
295 | |
296 | |
297 | |
298 while(c != '\n'){ c = buffered_fgetc(temp_seq_buffer, &idx, &r, query); } //Skip ID | |
299 | |
300 | |
301 while(c != '>' && (!feof(query) || (feof(query) && idx < r))){ //Until next id | |
302 c = buffered_fgetc(temp_seq_buffer, &idx, &r, query); | |
303 c = toupper(c); | |
304 if(c == 'A' || c == 'C' || c == 'G' || c == 'T'){ | |
305 | |
306 ++current_len; | |
307 if(current_len % a_hundreth == 0){ | |
308 fprintf(stdout, "\r%"PRIu64"%%...", 1+100*current_len/aprox_len_query); | |
309 fflush(stdout); | |
310 } | |
311 curr_kmer[word_size] = (unsigned char) c; | |
312 ++word_size; | |
313 | |
314 switch(c){ | |
315 case ('A'): reverse_kmer[word_size_rev] = (unsigned)'T'; | |
316 break; | |
317 case ('C'): reverse_kmer[word_size_rev] = (unsigned)'G'; | |
318 break; | |
319 case ('G'): reverse_kmer[word_size_rev] = (unsigned)'C'; | |
320 break; | |
321 case ('T'): reverse_kmer[word_size_rev] = (unsigned)'A'; | |
322 break; | |
323 } | |
324 if(word_size_rev != 0) --word_size_rev; | |
325 | |
326 | |
327 | |
328 | |
329 if(word_size == custom_kmer){ | |
330 | |
331 | |
332 //hash_forward = hashOfWord(&curr_kmer[0], custom_kmer, FIXED_K); | |
333 //hash_reverse = hashOfWord(&reverse_kmer[0], custom_kmer, FIXED_K); | |
334 uint64_t hash_forward, hash_reverse; | |
335 hash_forward = collisioned_hash(&curr_kmer[0], custom_kmer); | |
336 hash_reverse = collisioned_hash(&reverse_kmer[0], custom_kmer); | |
337 | |
338 | |
339 thit = &ctidx->table[char_converter[curr_kmer[0]]][char_converter[curr_kmer[1]]][char_converter[curr_kmer[2]]] | |
340 [char_converter[curr_kmer[3]]][char_converter[curr_kmer[4]]][char_converter[curr_kmer[5]]] | |
341 [char_converter[curr_kmer[6]]][char_converter[curr_kmer[7]]][char_converter[curr_kmer[8]]] | |
342 [char_converter[curr_kmer[9]]][char_converter[curr_kmer[10]]][char_converter[curr_kmer[11]]]; | |
343 | |
344 //AVLTree * search = find_AVLTree(thit->root, hash_forward); | |
345 | |
346 if(thit->repetition == FALSE && hash_forward == thit->key){ | |
347 // Attention ::::: you were not removing the ones with count==1 earlier | |
348 thit->pos_in_y = current_len; | |
349 thit->hit_count++; | |
350 } | |
351 | |
352 thit = &ctidx->table[char_converter[reverse_kmer[0]]][char_converter[reverse_kmer[1]]][char_converter[reverse_kmer[2]]] | |
353 [char_converter[reverse_kmer[3]]][char_converter[reverse_kmer[4]]][char_converter[reverse_kmer[5]]] | |
354 [char_converter[reverse_kmer[6]]][char_converter[reverse_kmer[7]]][char_converter[reverse_kmer[8]]] | |
355 [char_converter[reverse_kmer[9]]][char_converter[reverse_kmer[10]]][char_converter[reverse_kmer[11]]]; | |
356 | |
357 if(thit->repetition == FALSE && hash_reverse == thit->key){ | |
358 // Attention ::::: you were not removing the ones with count==1 earlier | |
359 thit->pos_in_y = current_len; | |
360 thit->hit_count++; | |
361 } | |
362 | |
363 /* | |
364 if(search != NULL && search->count == 1){ //If count is two, then it is a rep | |
365 thit->hit_count += search->count; | |
366 | |
367 hash_holder_table[c_hash_holder].pos = current_len; | |
368 hash_holder_table[c_hash_holder].node = search; | |
369 hash_holder_table[c_hash_holder].th = thit; | |
370 ++c_hash_holder; | |
371 if(c_hash_holder == n_items_hash_holder_table*reallocs_hash_holder_table){ | |
372 ++reallocs_hash_holder_table; | |
373 hash_holder_table = (Hash_holder *) realloc(hash_holder_table, n_items_hash_holder_table*reallocs_hash_holder_table*sizeof(Hash_holder)); | |
374 if(hash_holder_table == NULL) terror("Could not realloc hash holder table"); | |
375 } | |
376 } | |
377 */ | |
378 | |
379 | |
380 | |
381 | |
382 | |
383 //search = find_AVLTree(thit->root, hash_reverse); | |
384 /* | |
385 if(search != NULL && search->count == 1){ //If count is two, then it is a rep | |
386 | |
387 thit->hit_count += search->count; | |
388 hash_holder_table[c_hash_holder].pos = current_len; | |
389 hash_holder_table[c_hash_holder].node = search; | |
390 hash_holder_table[c_hash_holder].th = thit; | |
391 ++c_hash_holder; | |
392 if(c_hash_holder == n_items_hash_holder_table*reallocs_hash_holder_table){ | |
393 ++reallocs_hash_holder_table; | |
394 hash_holder_table = (Hash_holder *) realloc(hash_holder_table, n_items_hash_holder_table*reallocs_hash_holder_table*sizeof(Hash_holder)); | |
395 if(hash_holder_table == NULL) terror("Could not realloc hash holder table"); | |
396 } | |
397 } | |
398 */ | |
399 | |
400 // Overlapping | |
401 | |
402 memmove(&curr_kmer[0], &curr_kmer[1], custom_kmer-1); | |
403 memmove(&reverse_kmer[1], &reverse_kmer[0], custom_kmer-1); | |
404 --word_size; | |
405 | |
406 // Non overlapping | |
407 //word_size = 0; | |
408 //word_size_rev = custom_kmer-1; | |
409 } | |
410 }else{ | |
411 if(c != '\n' && c != '>'){ | |
412 word_size = 0; | |
413 word_size_rev = custom_kmer-1; | |
414 ++current_len; | |
415 } | |
416 } | |
417 } | |
418 }else{ | |
419 c = buffered_fgetc(temp_seq_buffer, &idx, &r, query); | |
420 } | |
421 | |
422 } | |
423 | |
424 /// Out | |
425 | |
426 fprintf(stdout, "Scanning hits table.\n"); | |
427 | |
428 a_hundreth = MAX(1, TOTAL_ENTRIES/100); | |
429 uint64_t t_computed = 0; | |
430 uint64_t w0,w1,w2,w3,w4,w5,w6,w7,w8,w9,w10,w11; | |
431 for(w0=0;w0<4;w0++){ | |
432 for(w1=0;w1<4;w1++){ | |
433 for(w2=0;w2<4;w2++){ | |
434 for(w3=0;w3<4;w3++){ | |
435 for(w4=0;w4<4;w4++){ | |
436 for(w5=0;w5<4;w5++){ | |
437 for(w6=0;w6<4;w6++){ | |
438 for(w7=0;w7<4;w7++){ | |
439 for(w8=0;w8<4;w8++){ | |
440 for(w9=0;w9<4;w9++){ | |
441 for(w10=0;w10<4;w10++){ | |
442 for(w11=0;w11<4;w11++){ | |
443 | |
444 if(t_computed % a_hundreth == 0){ | |
445 fprintf(stdout, "\r%"PRIu64"%%...", 1+100*t_computed/TOTAL_ENTRIES); | |
446 fflush(stdout); | |
447 } | |
448 ++t_computed; | |
449 Tuple_hits * taux = &ctidx->table[w0][w1][w2][w3][w4][w5][w6][w7][w8][w9][w10][w11]; | |
450 if(taux->hit_count == 1){ | |
451 // We plot it | |
452 // Convert scale to representation | |
453 uint64_t redir_db = (uint64_t) (taux->pos / (ratio_db)); | |
454 uint64_t redir_query = (uint64_t) (taux->pos_in_y / (ratio_query)); | |
455 double i_r = i_r_fix; double j_r = j_r_fix; | |
456 while((uint64_t) i_r >= 1 && (uint64_t) j_r >= 1){ | |
457 if((int64_t) redir_query - (int64_t) i_r > 0 && (int64_t) redir_db - (int64_t) j_r > 0){ | |
458 representation[(int64_t) redir_query - (int64_t) i_r][(int64_t) redir_db - (int64_t) j_r]++; | |
459 }else{ | |
460 representation[redir_query][redir_db]++; | |
461 break; | |
462 } | |
463 i_r -= MIN(1.0, pixel_size_query); | |
464 j_r -= MIN(1.0, pixel_size_db); | |
465 } | |
466 } | |
467 } | |
468 } | |
469 } | |
470 } | |
471 } | |
472 } | |
473 } | |
474 } | |
475 } | |
476 } | |
477 } | |
478 } | |
479 | |
480 | |
481 //double average_hit = ((double) total_hits / (double) table_size); | |
482 //average_hit = 2.2; | |
483 | |
484 /* | |
485 //fprintf(stdout, "[INFO] Total hit count is %"PRIu64" on a size of %"PRIu64" Avg = %e.\n", total_hits, table_size, average_hit); | |
486 fprintf(stdout, "[INFO] Total hit count is %"PRIu64" on a size of %"PRIu64" E' = %e.\n", total_hits, table_size, Eprime); | |
487 | |
488 | |
489 | |
490 | |
491 | |
492 a_hundreth = MAX(1,c_hash_holder/100); | |
493 | |
494 for(current_len = 0; current_len < c_hash_holder; current_len++){ | |
495 | |
496 if(current_len % a_hundreth == 0){ | |
497 fprintf(stdout, "\r%"PRIu64"%%...", 1+100*current_len/c_hash_holder); | |
498 fflush(stdout); | |
499 } | |
500 | |
501 aux = hash_holder_table[current_len].node->next; | |
502 | |
503 //if(hash_holder_table[current_len].th->hit_count < (uint64_t) average_hit){ | |
504 if(hash_holder_table[current_len].th->hit_count < (uint64_t) Eprime){ | |
505 while(aux != NULL){ | |
506 // Convert scale to representation | |
507 uint64_t redir_db = (uint64_t) (aux->pos / (ratio_db)); | |
508 uint64_t redir_query = (uint64_t) (hash_holder_table[current_len].pos / (ratio_query)); | |
509 double i_r = i_r_fix; double j_r = j_r_fix; | |
510 while((uint64_t) i_r >= 1 && (uint64_t) j_r >= 1){ | |
511 if((int64_t) redir_query - (int64_t) i_r > 0 && (int64_t) redir_db - (int64_t) j_r > 0){ | |
512 representation[(int64_t) redir_query - (int64_t) i_r][(int64_t) redir_db - (int64_t) j_r]++; | |
513 }else{ | |
514 representation[redir_query][redir_db]++; | |
515 break; | |
516 } | |
517 i_r -= MIN(1.0, pixel_size_query); | |
518 j_r -= MIN(1.0, pixel_size_db); | |
519 } | |
520 aux = aux->next; | |
521 } | |
522 } | |
523 } | |
524 */ | |
525 | |
526 | |
527 //end = clock(); | |
528 | |
529 | |
530 | |
531 | |
532 //fprintf(stdout, "\n[INFO] Query length %"PRIu64". Hits completed. Took %e seconds\n", data_query.total_len, (double)(end-begin)/CLOCKS_PER_SEC); | |
533 fprintf(stdout, "\n[INFO] Query length %"PRIu64".\n", current_len); | |
534 | |
535 //begin = clock(); | |
536 | |
537 //reads_per_thread = (uint64_t) (floorl((long double) data_query.n_seqs / (long double) n_threads)); | |
538 | |
539 fprintf(stdout, "[INFO] Writing matrix.\n"); | |
540 | |
541 | |
542 uint64_t unique_diffuse = 0; | |
543 fprintf(out_database, "%"PRIu64"\n", aprox_len_query); | |
544 fprintf(out_database, "%"PRIu64"\n", aprox_len_db); | |
545 // And replace 2's with 1's | |
546 | |
547 for(i=0; i<dimension+1; i++){ | |
548 for(j=0; j<dimension; j++){ | |
549 fprintf(out_database, "%"PRIu64" ", representation[i][j]); | |
550 unique_diffuse += representation[i][j]; | |
551 } | |
552 fprintf(out_database, "%"PRIu64"\n", representation[i][dimension]); | |
553 unique_diffuse += representation[i][dimension]; | |
554 } | |
555 | |
556 fprintf(stdout, "[INFO] Found %"PRIu64" unique hits for z = %"PRIu64".\n", unique_diffuse, diffuse_z); | |
557 | |
558 | |
559 | |
560 //free(ct->table); | |
561 //free(hash_holder_table); | |
562 /* | |
563 for(i=0;i<=n_pools_used_AVL;i++){ | |
564 free(mp_AVL[i].base); | |
565 } | |
566 | |
567 for(i=0;i<=n_pools_used;i++){ | |
568 free(mp[i].base); | |
569 } | |
570 */ | |
571 for(i=0;i<dimension;i++){ | |
572 free(representation[i]); | |
573 } | |
574 free(representation); | |
575 if(out_database != NULL) fclose(out_database); | |
576 | |
577 return 0; | |
578 } | |
579 | |
580 void init_args(int argc, char ** av, FILE ** query, FILE ** database, FILE ** out_database, uint64_t * custom_kmer, uint64_t * dimension, uint64_t * diffuse_z){ | |
581 | |
582 int pNum = 0; | |
583 while(pNum < argc){ | |
584 if(strcmp(av[pNum], "--help") == 0){ | |
585 fprintf(stdout, "USAGE:\n"); | |
586 fprintf(stdout, " CHROMEISTER -query [query] -db [database] -out [outfile]\n"); | |
587 fprintf(stdout, "OPTIONAL:\n"); | |
588 fprintf(stdout, " -kmer [Integer: k>1 (default 32)]\n"); | |
589 fprintf(stdout, " -diffuse [Integer: z>0 (default 4)]\n"); | |
590 fprintf(stdout, " -dimension Size of the output [Integer: d>0 (default 1000)]\n"); | |
591 fprintf(stdout, " -out [File path]\n"); | |
592 fprintf(stdout, " --help Shows help for program usage\n"); | |
593 fprintf(stdout, "\n"); | |
594 fprintf(stdout, "PLEASE NOTICE: The reverse complementary is calculated for the QUERY.\n"); | |
595 exit(1); | |
596 } | |
597 if(strcmp(av[pNum], "-query") == 0){ | |
598 *query = fopen64(av[pNum+1], "rt"); | |
599 if(query==NULL) terror("Could not open query file"); | |
600 } | |
601 if(strcmp(av[pNum], "-db") == 0){ | |
602 *database = fopen64(av[pNum+1], "rt"); | |
603 if(database==NULL) terror("Could not open database file"); | |
604 } | |
605 if(strcmp(av[pNum], "-out") == 0){ | |
606 *out_database = fopen(av[pNum+1], "wt"); | |
607 if(out_database==NULL) terror("Could not open output database file"); | |
608 } | |
609 if(strcmp(av[pNum], "-kmer") == 0){ | |
610 *custom_kmer = (uint64_t) atoi(av[pNum+1]); | |
611 if(*custom_kmer < BYTES_IN_MER) terror("K-mer size must be larger than 4"); | |
612 if(*custom_kmer % BYTES_IN_MER != 0) terror("K-mer size must be a multiple of 4"); | |
613 | |
614 } | |
615 if(strcmp(av[pNum], "-diffuse") == 0){ | |
616 *diffuse_z = (uint64_t) atoi(av[pNum+1]); | |
617 if(*diffuse_z == 0 || *diffuse_z > 32) terror("Z must satisfy 0<z<=32"); | |
618 | |
619 } | |
620 | |
621 if(strcmp(av[pNum], "-dimension") == 0){ | |
622 *dimension = (uint64_t) atoi(av[pNum+1]); | |
623 if(*dimension < 1) terror("Dimension must be a positive integer"); | |
624 } | |
625 | |
626 pNum++; | |
627 } | |
628 | |
629 if(*query==NULL || *database==NULL || *out_database==NULL) terror("A query, database and output file is required"); | |
630 } | |
631 | |
632 uint64_t get_seq_len(FILE * f) { | |
633 char c = '\0'; | |
634 uint64_t l = 0; | |
635 | |
636 while(!feof(f)){ | |
637 c = getc(f); | |
638 | |
639 if(c == '>'){ | |
640 while(c != '\n') c = getc(f); | |
641 } | |
642 c = toupper(c); | |
643 if(c >= 'A' && c <= 'Z'){ | |
644 ++l; | |
645 } | |
646 } | |
647 | |
648 | |
649 rewind(f); | |
650 return l; | |
651 } |