comparison IMSAME/src/IMSAME.c @ 0:762009a91895 draft

Uploaded
author bitlab
date Sat, 15 Dec 2018 18:04:10 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:762009a91895
1 /*********
2
3 File IMSAME.c
4 Author EPW <estebanpw@uma.es>
5 Description Computes an incremental alignment on reads versus reads using n threads
6
7 USAGE Usage is described by calling ./IMSAME --help
8
9
10
11 **********/
12
13
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include <math.h>
17 #include <string.h>
18 #include <ctype.h>
19 #include <pthread.h>
20 #include "structs.h"
21 #include "alignmentFunctions.h"
22 #include "commonFunctions.h"
23
24 #define MAX(x, y) (((x) > (y)) ? (x) : (y))
25 #define MIN(x, y) (((x) <= (y)) ? (x) : (y))
26 #define STARTING_SEQS 1000
27 #define PIECE_OF_DB_REALLOC 3200000 //half a gigabyte if divided by 8 bytes
28
29
30 uint64_t custom_kmer = 12; // Defined as external in structs.h
31
32 void init_args(int argc, char ** av, FILE ** query, FILE ** database, FILE ** out_database, uint64_t * n_threads, long double * minevalue, long double * mincoverage, int * igap, int * egap, long double * minidentity, long double * window, unsigned char * full_comp, uint64_t * custom_kmer, unsigned char * hits_only, uint64_t * n_parts);
33
34 int VERBOSE_ACTIVE = 0;
35
36 int main(int argc, char ** av){
37
38
39
40
41
42 clock_t begin, end;
43
44 int error; //To tell if threads could not be launched
45 uint64_t i,j;
46
47 //query to read kmers from, database to find seeds
48 FILE * query = NULL, * database = NULL, * out_database = NULL;
49 long double minevalue = 1/powl(10, 10); //Default 1 * 10^-10
50
51 long double mincoverage = 0.5, minidentity = 0.5, window = 0.15; //Default
52 int igap = -5, egap = -2;
53 unsigned char full_comp = FALSE;
54 unsigned char hits_only = FALSE;
55
56 uint64_t n_threads = 4;
57 uint64_t n_parts = 3; // Default is 3
58
59 init_args(argc, av, &query, &database, &out_database, &n_threads, &minevalue, &mincoverage, &igap, &egap, &minidentity, &window, &full_comp, &custom_kmer, &hits_only, &n_parts);
60
61 //uint64_t reads_per_thread;
62 uint64_t sum_accepted_reads = 0;
63
64 begin = clock();
65
66 fprintf(stdout, "[INFO] Init. quick table\n");
67
68 pthread_t * threads = (pthread_t *) malloc(n_threads * sizeof(pthread_t));
69 if(threads == NULL) terror("Could not create threads");
70
71 pthread_t * loading_threads = (pthread_t *) malloc(FIXED_LOADING_THREADS * sizeof(pthread_t));
72 if(loading_threads == NULL) terror("Could not create loading threads");
73
74
75 HashTableArgs * hta = (HashTableArgs *) malloc(n_threads*sizeof(HashTableArgs));
76 if(hta == NULL) terror("Could not allocate arguments for hash table");
77
78 pthread_mutex_t lock; //The mutex to lock the queue
79 if (pthread_mutex_init(&lock, NULL) != 0) terror("Could not init mutex");
80
81 // To be used if only computing hits
82 uint64_t ** hits_table = NULL;
83
84 unsigned char ** my_x = (unsigned char **) malloc(n_threads * sizeof(unsigned char*));
85 unsigned char ** my_y = (unsigned char **) malloc(n_threads * sizeof(unsigned char*));
86
87 unsigned char ** marker_taggs = NULL; // To be used with full comparison
88
89 struct positioned_cell ** mc = (struct positioned_cell **) malloc(n_threads * sizeof(struct positioned_cell *));
90 struct cell *** table = (struct cell ***) malloc(n_threads * sizeof(struct cell **));
91 if(table == NULL) terror("Could not allocate NW table");
92 char ** reconstruct_X = (char **) malloc(n_threads * sizeof(char *));
93 char ** reconstruct_Y = (char **) malloc(n_threads * sizeof(char *));
94 if(reconstruct_Y == NULL || reconstruct_X == NULL) terror("Could not allocate output alignment sequences");
95 char ** writing_buffer_alignment = (char **) malloc(n_threads * sizeof(char*));
96 for(i=0;i<n_threads;i++){
97
98 table[i] = (struct cell **) malloc(MAX_READ_SIZE * sizeof(struct cell *));
99 for(j=0;j<MAX_READ_SIZE;j++){
100 table[i][j] = (struct cell *) malloc((1+MAX_WINDOW_SIZE)*sizeof(struct cell));
101 if(table[i][j] == NULL) terror("Could not allocate memory for second loop of table");
102 // Delete this
103 /*
104 uint64_t r;
105 for(r=0;r<MAX_WINDOW_SIZE+1;r++){
106 table[i][j][r].score = INT64_MIN;
107 table[i][j][r].xfrom = 10000000000;
108 table[i][j][r].yfrom = 10000000000;
109 }
110 */
111 }
112 mc[i] = (struct positioned_cell *) malloc(MAX_READ_SIZE * sizeof(struct positioned_cell));
113 my_x[i] = (unsigned char *) malloc(2*MAX_READ_SIZE * sizeof(unsigned char));
114 my_y[i] = (unsigned char *) malloc(2*MAX_READ_SIZE * sizeof(unsigned char));
115 reconstruct_X[i] = (char *) malloc(2*MAX_READ_SIZE * sizeof(char));
116 reconstruct_Y[i] = (char *) malloc(2*MAX_READ_SIZE * sizeof(char));
117 writing_buffer_alignment[i] = (char *) malloc(6*MAX_READ_SIZE*sizeof(char)); //6 times because of 2 times the length of the max of the read, and that happens 3 times (seqX,align,Y)
118 if(table[i] == NULL || mc[i] == NULL || my_x[i] == NULL || my_y[i] == NULL || reconstruct_X[i] == NULL || reconstruct_Y[i] == NULL || writing_buffer_alignment[i] == NULL) terror("Could not allocate buffer for alignment output");
119 }
120
121
122
123 end = clock();
124 fprintf(stdout, "[INFO] Initialization took %e seconds.\n", (double)(end-begin)/CLOCKS_PER_SEC);
125
126 //Variables to account for positions
127 //Print info
128 fprintf(stdout, "[INFO] Loading database.\n");
129 //Variables to read kmers
130 char c = 'N'; //Char to read character
131 //Current length of array and variables for the buffer
132 uint64_t idx = 0, r = 0;
133
134 //Vector to read in batches
135 char * temp_seq_buffer = NULL;
136 if ((temp_seq_buffer = calloc(READBUF, sizeof(char))) == NULL) {
137 terror("Could not allocate memory for read buffer");
138 }
139 //Vector to store database seq
140 unsigned char ** seq_vector_database = (unsigned char **) malloc(FIXED_LOADING_THREADS*sizeof(unsigned char *));
141 if(seq_vector_database == NULL) terror("Could not allocate memory for database vector");
142 uint64_t ** database_positions = (uint64_t **) malloc(FIXED_LOADING_THREADS*sizeof(uint64_t));
143 if(database_positions == NULL) terror("Could not allocate database sequences positions");
144
145 //Mempool_l * mp = (Mempool_l *) malloc(MAX_MEM_POOLS*sizeof(Mempool_l));
146 //if(mp == NULL) terror("Could not allocate vectors for memory pools");
147 //Mempool_l mp[FIXED_LOADING_THREADS][MAX_MEM_POOLS];
148
149
150 Mempool_l ** mp = (Mempool_l **) malloc(FIXED_LOADING_THREADS*sizeof(Mempool_l *));
151 if(mp == NULL) terror("Could not allocate memory pools");
152 for(i=0; i<FIXED_LOADING_THREADS; i++){
153 mp[i] = (Mempool_l *) malloc(MAX_MEM_POOLS*sizeof(Mempool_l));
154 if(mp[i] == NULL) terror("Could not allocate individual memory pools");
155 }
156
157 Mempool_AVL ** mp_AVL = (Mempool_AVL **) malloc(FIXED_LOADING_THREADS*sizeof(Mempool_AVL *));
158 if(mp_AVL == NULL) terror("Could not allocate memory AVL pools");
159 for(i=0; i<FIXED_LOADING_THREADS; i++){
160 mp_AVL[i] = (Mempool_AVL *) malloc(MAX_MEM_POOLS*sizeof(Mempool_AVL));
161 if(mp_AVL[i] == NULL) terror("Could not allocate individual memory AVL pools");
162 }
163
164
165 AVLContainer * ct_A = (AVLContainer *) calloc(1, sizeof(AVLContainer));
166 if(ct_A == NULL) terror("Could not allocate container A");
167 AVLContainer * ct_B = (AVLContainer *) calloc(1, sizeof(AVLContainer));
168 if(ct_B == NULL) terror("Could not allocate container B");
169 AVLContainer * ct_C = (AVLContainer *) calloc(1, sizeof(AVLContainer));
170 if(ct_C == NULL) terror("Could not allocate container C");
171 AVLContainer * ct_D = (AVLContainer *) calloc(1, sizeof(AVLContainer));
172 if(ct_D == NULL) terror("Could not allocate container D");
173
174
175 SeqInfo data_database[FIXED_LOADING_THREADS];
176 uint64_t full_db_n_seqs = 0;
177
178
179 unsigned char curr_kmer[custom_kmer];
180 unsigned char aux_kmer[custom_kmer+1];
181 curr_kmer[0] = '\0';
182 uint64_t word_size = 0;
183
184
185 SeqInfo data_query;
186
187 //To force reading from the buffer
188 idx = READBUF + 1;
189
190 //Vector to store query seq
191 unsigned char * seq_vector_query = (unsigned char *) malloc(READBUF*sizeof(unsigned char));
192 if(seq_vector_query == NULL) terror("Could not allocate memory for query vector");
193 uint64_t n_realloc_query = 1, pos_in_query = 0, n_seqs_query_realloc = 1;
194 uint64_t * query_positions = (uint64_t *) malloc(INITSEQS*sizeof(uint64_t));
195 if(query_positions == NULL) terror("Could not allocate query sequences positions");
196
197
198
199 // Read number of sequences and load into RAM
200 begin = clock();
201 fseek(database, 0L, SEEK_END);
202 uint64_t db_temp_size = ftell(database);
203 char * load_buffer = (char *) malloc(db_temp_size * sizeof(char));
204 if(load_buffer == NULL) terror("Could not allocate intermediate buffer for threads sequence array");
205 fseek(database, 0L, SEEK_SET);
206
207 if(db_temp_size != fread(load_buffer, sizeof(char), db_temp_size, database)) terror("Could not read full sequence");
208
209 LoadingDBArgs args_DB_load[FIXED_LOADING_THREADS];
210
211
212 args_DB_load[0].data_database = &data_database[0];
213 args_DB_load[1].data_database = &data_database[1];
214 args_DB_load[2].data_database = &data_database[2];
215 args_DB_load[3].data_database = &data_database[3];
216
217 args_DB_load[0].ct = ct_A;
218 args_DB_load[1].ct = ct_B;
219 args_DB_load[2].ct = ct_C;
220 args_DB_load[3].ct = ct_D;
221 for(i=0; i<FIXED_LOADING_THREADS; i++){
222 args_DB_load[i].read_to = 0;
223 args_DB_load[i].read_from = 0;
224 }
225
226 //uint64_t a_fourth = db_temp_size / 4;
227
228
229 //get_num_seqs_and_length(load_buffer, &full_db_n_seqs, &db_temp_size, args_DB_load);
230
231 end = clock();
232 fprintf(stdout, "[INFO] Loading into RAM took %e seconds.\n", (double)(end-begin)/CLOCKS_PER_SEC);
233 begin = clock();
234
235 /*
236 char * temp_seq_buffer;
237 SeqInfo * data_database;
238 uint64_t t_len;
239 uint64_t word_size;
240 uint64_t read_from;
241 uint64_t read_to;
242 char thread_id;
243 Mempool_l * mp;
244 uint64_t n_pools_used;
245 */
246
247 // Launch threads to process database
248
249 args_DB_load[0].thread_id = 'A';
250 args_DB_load[1].thread_id = 'B';
251 args_DB_load[2].thread_id = 'C';
252 args_DB_load[3].thread_id = 'D';
253
254
255 for(i=0; i<FIXED_LOADING_THREADS; i++){
256
257 //seq_vector_database[i] = (unsigned char *) malloc((args_DB_load[i].read_to - args_DB_load[i].read_from)*sizeof(unsigned char));
258 seq_vector_database[i] = (unsigned char *) malloc((READBUF)*sizeof(unsigned char));
259 //database_positions[i] = (uint64_t *) malloc((1+data_database[i].n_seqs)*sizeof(uint64_t));
260 database_positions[i] = (uint64_t *) malloc((INITSEQS)*sizeof(uint64_t));
261 if(seq_vector_database[i] == NULL || database_positions[i] == NULL) terror("Could not allocate memory for individual database vectors");
262 data_database[i].sequences = seq_vector_database[i];
263
264
265 //To hold all information related to database
266 args_DB_load[i].n_pools_used = 0;
267 args_DB_load[i].n_pools_used_AVL = 0;
268
269 init_mem_pool_llpos(&mp[i][args_DB_load[i].n_pools_used]);
270 init_mem_pool_AVL(&mp_AVL[i][args_DB_load[i].n_pools_used_AVL]);
271
272 data_database[i].start_pos = database_positions[i];
273
274 args_DB_load[i].n_allocs = 1;
275 args_DB_load[i].read_from = i * (db_temp_size / 4);
276 args_DB_load[i].read_to = (i+1) * (db_temp_size / 4);
277 args_DB_load[i].temp_seq_buffer = load_buffer;
278 args_DB_load[i].t_len = db_temp_size;
279 args_DB_load[i].word_size = custom_kmer;
280 args_DB_load[i].mp = mp[i];
281 args_DB_load[i].mp_AVL = mp_AVL[i];
282
283 if( 0 != (error = pthread_create(&loading_threads[i], NULL, load_input, (void *) (&args_DB_load[i])) )){
284 fprintf(stdout, "[@loading] Thread %"PRIu64" returned %d:", i, error); terror("Could not launch");
285 }
286 }
287
288 //Wait for threads to finish
289 for(i=0;i<FIXED_LOADING_THREADS;i++){
290 pthread_join(loading_threads[i], NULL);
291 }
292
293 // Deallocate memory not needed anymore
294 free(load_buffer);
295 free(loading_threads);
296
297
298
299 //fprintf(stdout, "[INFO] WARNING!!!!!!!!! USING NON OVERLAPPING MERS, WHICH IS NOT INCLUDED AS OPTION!!!! DISABLE\n");
300
301
302
303
304
305 //data_database.total_len = pos_in_database;
306 //printf("tables have %"PRIu64" %"PRIu64", %"PRIu64" %"PRIu64"\n", data_database[0].total_len, data_database[1].total_len, data_database[2].total_len, data_database[3].total_len);
307 uint64_t full_db_len = data_database[0].total_len + data_database[1].total_len + data_database[2].total_len + data_database[3].total_len;
308 full_db_n_seqs = args_DB_load[0].contained_reads + args_DB_load[1].contained_reads + args_DB_load[2].contained_reads + args_DB_load[3].contained_reads;
309
310 end = clock();
311 fprintf(stdout, "[INFO] Database loaded and of length %"PRIu64" (%"PRIu64" sequences). Hash table building took %e seconds\n", full_db_len, full_db_n_seqs, (double)(end-begin)/CLOCKS_PER_SEC);
312 //close database
313 fclose(database);
314
315 // Allocate marker tags to avoid repeting computation
316
317 marker_taggs = (unsigned char **) malloc(n_threads * sizeof(unsigned char *));
318 if(marker_taggs == NULL) terror("Could not allocate marker taggs");
319 for(i=0;i<n_threads;i++){
320 marker_taggs[i] = (unsigned char *) malloc(full_db_n_seqs);
321 if(marker_taggs[i] == NULL) terror("Could not allocate second loop of marker taggs");
322 }
323 if(hits_only == TRUE){
324 hits_table = (uint64_t **) calloc(n_threads, sizeof(uint64_t *));
325 if(hits_table == NULL) terror("Could not allocate hits table");
326 uint64_t z;
327 for(z=0;z<n_threads;z++){
328 hits_table[z] = (uint64_t *) calloc(full_db_n_seqs, sizeof(uint64_t));
329 if(hits_table[z] == NULL) terror("Could not allocate sub table of hits");
330 }
331 }
332
333
334 begin = clock();
335
336
337
338 //To force reading from the buffer
339 idx = READBUF + 1;
340
341
342 data_query.sequences = seq_vector_query;
343 data_query.start_pos = query_positions;
344 data_query.total_len = 0;
345 data_query.n_seqs = 0;
346
347
348 //Print info
349 fprintf(stdout, "[INFO] Loading query.\n");
350
351
352 c = buffered_fgetc(temp_seq_buffer, &idx, &r, query);
353 while((!feof(query) || (feof(query) && idx < r))){
354
355 if(c == '>'){
356 data_query.start_pos[data_query.n_seqs++] = pos_in_query;
357 word_size = 0;
358
359 if(pos_in_query == READBUF*n_realloc_query){
360 n_realloc_query++; data_query.sequences = (unsigned char *) realloc(data_query.sequences, READBUF*n_realloc_query*sizeof(unsigned char));
361 if(data_query.sequences == NULL) terror("Could not reallocate temporary query");
362 }
363
364 if(data_query.n_seqs == INITSEQS*n_seqs_query_realloc){
365 n_seqs_query_realloc++; data_query.start_pos = (uint64_t *) realloc(data_query.start_pos, INITSEQS*n_seqs_query_realloc*sizeof(uint64_t));
366 }
367
368
369 while(c != '\n'){ c = buffered_fgetc(temp_seq_buffer, &idx, &r, query); } //Skip ID
370
371
372 while(c != '>' && (!feof(query) || (feof(query) && idx < r))){ //Until next id
373 c = buffered_fgetc(temp_seq_buffer, &idx, &r, query);
374 c = toupper(c);
375 if(c == 'A' || c == 'C' || c == 'G' || c == 'T'){
376 data_query.sequences[pos_in_query++] = (unsigned char) c;
377 curr_kmer[word_size++] = (unsigned char) c;
378
379 if(word_size == custom_kmer){
380 memcpy(aux_kmer, curr_kmer, custom_kmer);
381 aux_kmer[custom_kmer] = '\0';
382
383 memcpy(aux_kmer, &curr_kmer[1], custom_kmer-1);
384 memcpy(curr_kmer, aux_kmer, custom_kmer-1);
385 word_size--;
386 }
387
388 if(pos_in_query == READBUF*n_realloc_query){
389 n_realloc_query++; data_query.sequences = (unsigned char *) realloc(data_query.sequences, READBUF*n_realloc_query*sizeof(unsigned char));
390 if(data_query.sequences == NULL) terror("Could not reallocate temporary query");
391 }
392 }else{
393 if(c != '\n' && c != '\r' && c != '>'){
394 word_size = 0;
395 data_query.sequences[pos_in_query++] = (unsigned char) 'N'; //Convert to N
396 if(pos_in_query == READBUF*n_realloc_query){
397 n_realloc_query++; data_query.sequences = (unsigned char *) realloc(data_query.sequences, READBUF*n_realloc_query*sizeof(unsigned char));
398 if(data_query.sequences == NULL) terror("Could not reallocate temporary query");
399 }
400 }
401 }
402 }
403 }else{
404 c = buffered_fgetc(temp_seq_buffer, &idx, &r, query);
405 }
406
407 }
408
409
410
411 end = clock();
412
413
414 data_query.total_len = pos_in_query;
415
416 fprintf(stdout, "[INFO] Query loaded and of length %"PRIu64". Took %e seconds\n", data_query.total_len, (double)(end-begin)/CLOCKS_PER_SEC);
417
418 begin = clock();
419
420 Head queue_head;
421 Queue * first_task = generate_queue(&queue_head, data_query.n_seqs, n_threads, n_parts);
422
423 /*
424 Queue * traverse = queue_head.head;
425 while(traverse != NULL){
426 printf("current_piece: %"PRIu64"-%"PRIu64"\n", traverse->r1, traverse->r2);
427 traverse = traverse->next;
428 }
429 getchar();
430 */
431
432
433
434 //reads_per_thread = (uint64_t) (floorl((long double) data_query.n_seqs / (long double) n_threads));
435
436 fprintf(stdout, "[INFO] Computing alignments.\n");
437
438
439
440 /*
441 uint64_t z;
442 for(z=0; z<POOL_SIZE; z++){
443 aux = mp[0].base + z;
444 fprintf(stdout, "%p\n", aux);
445 fflush(stdout);
446 }
447 */
448
449 // Make the full db
450 uint64_t contained_reads[FIXED_LOADING_THREADS] = {0,0,0,0};
451 uint64_t base_coordinates[FIXED_LOADING_THREADS] = {0,0,0,0};
452 //contained_reads[0] = 0;
453 //base_coordinates[0] = 0;
454 for(i=1;i<FIXED_LOADING_THREADS;i++){
455 contained_reads[i] = args_DB_load[i-1].contained_reads + contained_reads[i-1];
456 base_coordinates[i] = args_DB_load[i-1].base_coordinates + base_coordinates[i-1];
457 }
458
459 /*
460 for(i = 0; i < 4 ; i++){
461 printf("total len: %"PRIu64"\n", data_database[i].total_len);
462
463 }
464 getchar();
465 */
466 /*
467 for(i = 0; i < 4 ; i++){
468
469 printf("c:%"PRIu64" - b:%"PRIu64"\n", contained_reads[i], base_coordinates[i]);
470 }
471 getchar();
472 */
473
474
475
476
477 SeqInfo final_db;
478 final_db.sequences = (unsigned char *) malloc(full_db_len * sizeof(unsigned char));
479 if(final_db.sequences == NULL) terror ("Could not allocate final database sequences");
480 memcpy(&final_db.sequences[0], &data_database[0].sequences[0], data_database[0].total_len);
481 memcpy(&final_db.sequences[data_database[0].total_len], &data_database[1].sequences[0], data_database[1].total_len);
482 memcpy(&final_db.sequences[data_database[0].total_len+data_database[1].total_len], &data_database[2].sequences[0], data_database[2].total_len);
483 memcpy(&final_db.sequences[data_database[0].total_len+data_database[1].total_len+data_database[2].total_len], &data_database[3].sequences[0], data_database[3].total_len);
484 final_db.start_pos = (uint64_t *) malloc(full_db_n_seqs * sizeof(uint64_t));
485 if(final_db.start_pos == NULL) terror("Could not allocate final db starting positions");
486 // Copy them with offset
487 i=0;
488 while(i<data_database[0].n_seqs){ final_db.start_pos[i] = data_database[0].start_pos[i]; ++i; }
489 j=0;
490 //printf("switch\n");
491 while(j<data_database[1].n_seqs){ final_db.start_pos[i] = data_database[1].start_pos[j] + data_database[0].total_len; ++i; ++j; }
492 j=0;
493 //printf("switch\n");
494 while(j<data_database[2].n_seqs){ final_db.start_pos[i] = data_database[2].start_pos[j] + data_database[1].total_len + data_database[0].total_len; ++i; ++j; }
495 j=0;
496 //printf("switch\n");
497 while(j<data_database[3].n_seqs){ final_db.start_pos[i] = data_database[3].start_pos[j] + data_database[2].total_len + data_database[1].total_len + data_database[0].total_len; ++i; ++j; }
498
499 final_db.total_len = full_db_len;
500 final_db.n_seqs = full_db_n_seqs;
501
502 for(i=0;i<FIXED_LOADING_THREADS;i++){
503 free(data_database[i].sequences);
504 free(data_database[i].start_pos);
505 }
506
507
508 // Debug
509 /*
510 uint64_t max = 0;
511 for(i=0; i<full_db_n_seqs-2; i++){
512 //printf("%"PRIu64" - %"PRIu64" res in \n", final_db.start_pos[i], final_db.start_pos[i+1]);
513 //printf("%"PRIu64"\n", final_db.start_pos[i+2] - final_db.start_pos[i+1]);
514 if(final_db.start_pos[i+2] - final_db.start_pos[i+1] > max) max = final_db.start_pos[i+2] - final_db.start_pos[i+1];
515 //getchar();
516 }
517 printf("%"PRIu64"\n", max);
518 */
519
520
521
522
523
524 for(i=0;i<n_threads;i++){
525 hta[i].id = i;
526 hta[i].database = &final_db;
527 hta[i].query = &data_query;
528
529 //hta[i].from = i * reads_per_thread;
530 //hta[i].to = (i + 1) * reads_per_thread;
531 hta[i].container_a = ct_A;
532 hta[i].container_b = ct_B;
533 hta[i].container_c = ct_C;
534 hta[i].container_d = ct_D;
535 hta[i].contained_reads = contained_reads;
536 hta[i].base_coordinates = base_coordinates;
537 hta[i].accepted_query_reads = 0;
538 hta[i].min_e_value = minevalue;
539 hta[i].min_coverage = mincoverage;
540 hta[i].min_identity = minidentity;
541 hta[i].out = out_database;
542 hta[i].igap = igap;
543 hta[i].egap = egap;
544 hta[i].window = window;
545 hta[i].mc = mc[i];
546 hta[i].table = table[i];
547 hta[i].reconstruct_X = reconstruct_X[i];
548 hta[i].reconstruct_Y = reconstruct_Y[i];
549 hta[i].writing_buffer_alignment = writing_buffer_alignment[i];
550 hta[i].my_x = my_x[i];
551 hta[i].my_y = my_y[i];
552 hta[i].queue_head = &queue_head;
553 hta[i].lock = &lock;
554 hta[i].full_comp = full_comp;
555 hta[i].markers = marker_taggs[i];
556 if(hits_only) hta[i].hits = hits_table[i]; else hta[i].hits = NULL;
557
558
559 //if(i==n_threads-1) hta[i].to = data_query.n_seqs;
560
561 if( 0 != (error = pthread_create(&threads[i], NULL, computeAlignmentsByThread, (void *) (&hta[i])) )){
562 fprintf(stdout, "Thread %"PRIu64" returned %d:", i, error); terror("Could not launch");
563 }
564 }
565
566 //Wait for threads to finish
567 for(i=0;i<n_threads;i++){
568 pthread_join(threads[i], NULL);
569 }
570
571
572 for(i=0;i<n_threads;i++){
573 sum_accepted_reads += hta[i].accepted_query_reads;
574 }
575 // Accumulate hits just in case
576 if(hits_only == TRUE){
577 for(i=0;i<full_db_n_seqs;i++){
578 for(j=1;j<n_threads;j++){
579 hta[0].hits[i] += hta[j].hits[i];
580 }
581 if(out_database != NULL){
582
583 if(i == final_db.n_seqs - 1){
584 fprintf(out_database, "%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n", i, hta[0].hits[i], full_db_len - final_db.start_pos[i]);
585 }else{
586 fprintf(out_database, "%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n", i, hta[0].hits[i], final_db.start_pos[i+1] - final_db.start_pos[i]);
587 }
588 }
589 }
590 }
591
592
593 end = clock();
594 fprintf(stdout, "[INFO] Alignments computed in %e seconds.\n", (double)(end-begin)/CLOCKS_PER_SEC);
595 fprintf(stdout, "[INFO] %"PRIu64" reads (%"PRIu64") from the query were found in the database (%"PRIu64") at a minimum e-value of %Le and minimum coverage of %d%%.\n", sum_accepted_reads, data_query.n_seqs, final_db.n_seqs, (long double)minevalue, (int) (100*mincoverage));
596 fprintf(stdout, "[INFO] The Jaccard-index is: %Le\n", (long double)sum_accepted_reads/((final_db.n_seqs+data_query.n_seqs)-sum_accepted_reads));
597 fprintf(stdout, "[INFO] Deallocating heap memory.\n");
598
599 fclose(query);
600
601 if(out_database != NULL) fclose(out_database);
602 free(temp_seq_buffer);
603
604
605 if(hits_only == TRUE){
606 // Write hits here
607 for(i=0;i<n_threads;i++){
608 free(hits_table[i]);
609 }
610 free(hits_table);
611 }
612
613 free(final_db.sequences);
614 free(final_db.start_pos);
615 free(data_query.sequences);
616 free(data_query.start_pos);
617 free(ct_A->root);
618 free(ct_B->root);
619 free(ct_C->root);
620 free(ct_D->root);
621 //free(ct);
622 free(threads);
623 free(hta);
624
625
626
627 for(i=0;i<n_threads;i++){
628
629 for(j=0;j<MAX_READ_SIZE;j++){
630 free(table[i][j]);
631 }
632
633 free(table[i]);
634
635 free(mc[i]);
636 free(reconstruct_X[i]);
637 free(reconstruct_Y[i]);
638 free(my_x[i]);
639 free(my_y[i]);
640 free(writing_buffer_alignment[i]);
641
642 free(marker_taggs[i]);
643
644 }
645 free(marker_taggs);
646
647
648 free(table);
649 free(mc);
650 free(reconstruct_X);
651 free(reconstruct_Y);
652 free(my_y);
653 free(my_x);
654 free(writing_buffer_alignment);
655
656 pthread_mutex_destroy(&lock);
657
658
659
660 for(i=0; i<FIXED_LOADING_THREADS; i++){
661 for(j=0;j<=args_DB_load[i].n_pools_used;j++){
662 free(mp[i][j].base);
663 }
664 free(mp[i]);
665 }
666 for(i=0; i<FIXED_LOADING_THREADS; i++){
667 for(j=0;j<=args_DB_load[i].n_pools_used_AVL;j++){
668 free(mp_AVL[i][j].base);
669 }
670 free(mp_AVL[i]);
671 }
672 free(mp);
673 free(mp_AVL);
674 free(seq_vector_database);
675 free(database_positions);
676
677 //Deallocate queue (its allocated as an array)
678 free(first_task);
679
680 //free(mp);
681
682 return 0;
683 }
684
685 void init_args(int argc, char ** av, FILE ** query, FILE ** database, FILE ** out_database, uint64_t * n_threads, long double * minevalue, long double * mincoverage, int * igap, int * egap, long double * minidentity, long double * window, unsigned char * full_comp, uint64_t * custom_kmer, unsigned char * hits_only, uint64_t * n_parts){
686
687 int pNum = 0;
688 while(pNum < argc){
689 if(strcmp(av[pNum], "--verbose") == 0) VERBOSE_ACTIVE = 1;
690 if(strcmp(av[pNum], "--help") == 0){
691 fprintf(stdout, "USAGE:\n");
692 fprintf(stdout, " IMSAME -query [query] -db [database]\n");
693 fprintf(stdout, "OPTIONAL:\n");
694 fprintf(stdout, " -n_threads [Integer: 0<n_threads] (default 4)\n");
695 fprintf(stdout, " -evalue [Double: 0<=pval<1] (default: 1 * 10^-10)\n");
696 fprintf(stdout, " -coverage [Double: 0<coverage<=1 (default: 0.5)\n");
697 fprintf(stdout, " -identity [Double: 0<identity<=1 (default: 0.5)\n");
698 fprintf(stdout, " -igap [Integer: (default: 5)\n");
699 fprintf(stdout, " -egap [Integer: (default: 2)\n");
700 fprintf(stdout, " -window [Double: 0<window<=0.5 (default: 0.15)\n");
701 fprintf(stdout, " -kmer [Integer: k>1 (default 12)]\n");
702 fprintf(stdout, " -n_parts [Integer: n>0 (default 3)]\n");
703 fprintf(stdout, " -out [File path]\n");
704 fprintf(stdout, " --full Does not stop at first match and reports all equalities\n");
705 fprintf(stdout, " --verbose Turns verbose on\n");
706 fprintf(stdout, " --help Shows help for program usage\n");
707 fprintf(stdout, " --hits Compute only the non-overlapping hits\n");
708 exit(1);
709 }
710 if(strcmp(av[pNum], "--full") == 0) *full_comp = TRUE;
711 if(strcmp(av[pNum], "--hits") == 0) *hits_only = TRUE;
712 if(strcmp(av[pNum], "-n_parts") == 0) *n_parts = (uint64_t) atoi(av[pNum+1]);
713 if(strcmp(av[pNum], "-query") == 0){
714 *query = fopen64(av[pNum+1], "rt");
715 if(query==NULL) terror("Could not open query file");
716 }
717 if(strcmp(av[pNum], "-db") == 0){
718 *database = fopen64(av[pNum+1], "rt");
719 if(database==NULL) terror("Could not open database file");
720 }
721 if(strcmp(av[pNum], "-out") == 0){
722 *out_database = fopen64(av[pNum+1], "wt");
723 if(out_database==NULL) terror("Could not open output database file");
724 }
725 if(strcmp(av[pNum], "-evalue") == 0){
726 *minevalue = (long double) atof(av[pNum+1]);
727 if(*minevalue < 0) terror("Min-e-value must be larger than zero");
728 }
729 if(strcmp(av[pNum], "-window") == 0){
730 *window = (long double) atof(av[pNum+1]);
731 if(*window <= 0 || *window > 0.5) terror("Window percentage size must lie between 0<window<=0.5");
732 }
733 if(strcmp(av[pNum], "-coverage") == 0){
734 *mincoverage = (long double) atof(av[pNum+1]);
735 if(*mincoverage <= 0) terror("Min-coverage must be larger than zero");
736 }
737 if(strcmp(av[pNum], "-identity") == 0){
738 *minidentity = (long double) atof(av[pNum+1]);
739 if(*minidentity <= 0) terror("Min-identity must be larger than zero");
740 }
741 if(strcmp(av[pNum], "-igap") == 0){
742 *igap = - (atoi(av[pNum+1]));
743 }
744 if(strcmp(av[pNum], "-egap") == 0){
745 *egap = - (atoi(av[pNum+1]));
746 }
747 if(strcmp(av[pNum], "-n_threads") == 0){
748 *n_threads = (uint64_t) atoi(av[pNum+1]);
749 if(*n_threads < 0) terror("Number of threads must be larger than zero");
750 }
751 if(strcmp(av[pNum], "-kmer") == 0){
752 *custom_kmer = (uint64_t) atoi(av[pNum+1]);
753 if(*custom_kmer < 2) terror("K-mer size must be larger than 1");
754 }
755 pNum++;
756 }
757
758 if(*query==NULL || *database==NULL) terror("A query and database is required");
759 }
760