Mercurial > repos > bitlab > imsame
comparison IMSAME/src/IMSAME.c @ 0:762009a91895 draft
Uploaded
author | bitlab |
---|---|
date | Sat, 15 Dec 2018 18:04:10 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:762009a91895 |
---|---|
1 /********* | |
2 | |
3 File IMSAME.c | |
4 Author EPW <estebanpw@uma.es> | |
5 Description Computes an incremental alignment on reads versus reads using n threads | |
6 | |
7 USAGE Usage is described by calling ./IMSAME --help | |
8 | |
9 | |
10 | |
11 **********/ | |
12 | |
13 | |
14 #include <stdio.h> | |
15 #include <stdlib.h> | |
16 #include <math.h> | |
17 #include <string.h> | |
18 #include <ctype.h> | |
19 #include <pthread.h> | |
20 #include "structs.h" | |
21 #include "alignmentFunctions.h" | |
22 #include "commonFunctions.h" | |
23 | |
24 #define MAX(x, y) (((x) > (y)) ? (x) : (y)) | |
25 #define MIN(x, y) (((x) <= (y)) ? (x) : (y)) | |
26 #define STARTING_SEQS 1000 | |
27 #define PIECE_OF_DB_REALLOC 3200000 //half a gigabyte if divided by 8 bytes | |
28 | |
29 | |
30 uint64_t custom_kmer = 12; // Defined as external in structs.h | |
31 | |
32 void init_args(int argc, char ** av, FILE ** query, FILE ** database, FILE ** out_database, uint64_t * n_threads, long double * minevalue, long double * mincoverage, int * igap, int * egap, long double * minidentity, long double * window, unsigned char * full_comp, uint64_t * custom_kmer, unsigned char * hits_only, uint64_t * n_parts); | |
33 | |
34 int VERBOSE_ACTIVE = 0; | |
35 | |
36 int main(int argc, char ** av){ | |
37 | |
38 | |
39 | |
40 | |
41 | |
42 clock_t begin, end; | |
43 | |
44 int error; //To tell if threads could not be launched | |
45 uint64_t i,j; | |
46 | |
47 //query to read kmers from, database to find seeds | |
48 FILE * query = NULL, * database = NULL, * out_database = NULL; | |
49 long double minevalue = 1/powl(10, 10); //Default 1 * 10^-10 | |
50 | |
51 long double mincoverage = 0.5, minidentity = 0.5, window = 0.15; //Default | |
52 int igap = -5, egap = -2; | |
53 unsigned char full_comp = FALSE; | |
54 unsigned char hits_only = FALSE; | |
55 | |
56 uint64_t n_threads = 4; | |
57 uint64_t n_parts = 3; // Default is 3 | |
58 | |
59 init_args(argc, av, &query, &database, &out_database, &n_threads, &minevalue, &mincoverage, &igap, &egap, &minidentity, &window, &full_comp, &custom_kmer, &hits_only, &n_parts); | |
60 | |
61 //uint64_t reads_per_thread; | |
62 uint64_t sum_accepted_reads = 0; | |
63 | |
64 begin = clock(); | |
65 | |
66 fprintf(stdout, "[INFO] Init. quick table\n"); | |
67 | |
68 pthread_t * threads = (pthread_t *) malloc(n_threads * sizeof(pthread_t)); | |
69 if(threads == NULL) terror("Could not create threads"); | |
70 | |
71 pthread_t * loading_threads = (pthread_t *) malloc(FIXED_LOADING_THREADS * sizeof(pthread_t)); | |
72 if(loading_threads == NULL) terror("Could not create loading threads"); | |
73 | |
74 | |
75 HashTableArgs * hta = (HashTableArgs *) malloc(n_threads*sizeof(HashTableArgs)); | |
76 if(hta == NULL) terror("Could not allocate arguments for hash table"); | |
77 | |
78 pthread_mutex_t lock; //The mutex to lock the queue | |
79 if (pthread_mutex_init(&lock, NULL) != 0) terror("Could not init mutex"); | |
80 | |
81 // To be used if only computing hits | |
82 uint64_t ** hits_table = NULL; | |
83 | |
84 unsigned char ** my_x = (unsigned char **) malloc(n_threads * sizeof(unsigned char*)); | |
85 unsigned char ** my_y = (unsigned char **) malloc(n_threads * sizeof(unsigned char*)); | |
86 | |
87 unsigned char ** marker_taggs = NULL; // To be used with full comparison | |
88 | |
89 struct positioned_cell ** mc = (struct positioned_cell **) malloc(n_threads * sizeof(struct positioned_cell *)); | |
90 struct cell *** table = (struct cell ***) malloc(n_threads * sizeof(struct cell **)); | |
91 if(table == NULL) terror("Could not allocate NW table"); | |
92 char ** reconstruct_X = (char **) malloc(n_threads * sizeof(char *)); | |
93 char ** reconstruct_Y = (char **) malloc(n_threads * sizeof(char *)); | |
94 if(reconstruct_Y == NULL || reconstruct_X == NULL) terror("Could not allocate output alignment sequences"); | |
95 char ** writing_buffer_alignment = (char **) malloc(n_threads * sizeof(char*)); | |
96 for(i=0;i<n_threads;i++){ | |
97 | |
98 table[i] = (struct cell **) malloc(MAX_READ_SIZE * sizeof(struct cell *)); | |
99 for(j=0;j<MAX_READ_SIZE;j++){ | |
100 table[i][j] = (struct cell *) malloc((1+MAX_WINDOW_SIZE)*sizeof(struct cell)); | |
101 if(table[i][j] == NULL) terror("Could not allocate memory for second loop of table"); | |
102 // Delete this | |
103 /* | |
104 uint64_t r; | |
105 for(r=0;r<MAX_WINDOW_SIZE+1;r++){ | |
106 table[i][j][r].score = INT64_MIN; | |
107 table[i][j][r].xfrom = 10000000000; | |
108 table[i][j][r].yfrom = 10000000000; | |
109 } | |
110 */ | |
111 } | |
112 mc[i] = (struct positioned_cell *) malloc(MAX_READ_SIZE * sizeof(struct positioned_cell)); | |
113 my_x[i] = (unsigned char *) malloc(2*MAX_READ_SIZE * sizeof(unsigned char)); | |
114 my_y[i] = (unsigned char *) malloc(2*MAX_READ_SIZE * sizeof(unsigned char)); | |
115 reconstruct_X[i] = (char *) malloc(2*MAX_READ_SIZE * sizeof(char)); | |
116 reconstruct_Y[i] = (char *) malloc(2*MAX_READ_SIZE * sizeof(char)); | |
117 writing_buffer_alignment[i] = (char *) malloc(6*MAX_READ_SIZE*sizeof(char)); //6 times because of 2 times the length of the max of the read, and that happens 3 times (seqX,align,Y) | |
118 if(table[i] == NULL || mc[i] == NULL || my_x[i] == NULL || my_y[i] == NULL || reconstruct_X[i] == NULL || reconstruct_Y[i] == NULL || writing_buffer_alignment[i] == NULL) terror("Could not allocate buffer for alignment output"); | |
119 } | |
120 | |
121 | |
122 | |
123 end = clock(); | |
124 fprintf(stdout, "[INFO] Initialization took %e seconds.\n", (double)(end-begin)/CLOCKS_PER_SEC); | |
125 | |
126 //Variables to account for positions | |
127 //Print info | |
128 fprintf(stdout, "[INFO] Loading database.\n"); | |
129 //Variables to read kmers | |
130 char c = 'N'; //Char to read character | |
131 //Current length of array and variables for the buffer | |
132 uint64_t idx = 0, r = 0; | |
133 | |
134 //Vector to read in batches | |
135 char * temp_seq_buffer = NULL; | |
136 if ((temp_seq_buffer = calloc(READBUF, sizeof(char))) == NULL) { | |
137 terror("Could not allocate memory for read buffer"); | |
138 } | |
139 //Vector to store database seq | |
140 unsigned char ** seq_vector_database = (unsigned char **) malloc(FIXED_LOADING_THREADS*sizeof(unsigned char *)); | |
141 if(seq_vector_database == NULL) terror("Could not allocate memory for database vector"); | |
142 uint64_t ** database_positions = (uint64_t **) malloc(FIXED_LOADING_THREADS*sizeof(uint64_t)); | |
143 if(database_positions == NULL) terror("Could not allocate database sequences positions"); | |
144 | |
145 //Mempool_l * mp = (Mempool_l *) malloc(MAX_MEM_POOLS*sizeof(Mempool_l)); | |
146 //if(mp == NULL) terror("Could not allocate vectors for memory pools"); | |
147 //Mempool_l mp[FIXED_LOADING_THREADS][MAX_MEM_POOLS]; | |
148 | |
149 | |
150 Mempool_l ** mp = (Mempool_l **) malloc(FIXED_LOADING_THREADS*sizeof(Mempool_l *)); | |
151 if(mp == NULL) terror("Could not allocate memory pools"); | |
152 for(i=0; i<FIXED_LOADING_THREADS; i++){ | |
153 mp[i] = (Mempool_l *) malloc(MAX_MEM_POOLS*sizeof(Mempool_l)); | |
154 if(mp[i] == NULL) terror("Could not allocate individual memory pools"); | |
155 } | |
156 | |
157 Mempool_AVL ** mp_AVL = (Mempool_AVL **) malloc(FIXED_LOADING_THREADS*sizeof(Mempool_AVL *)); | |
158 if(mp_AVL == NULL) terror("Could not allocate memory AVL pools"); | |
159 for(i=0; i<FIXED_LOADING_THREADS; i++){ | |
160 mp_AVL[i] = (Mempool_AVL *) malloc(MAX_MEM_POOLS*sizeof(Mempool_AVL)); | |
161 if(mp_AVL[i] == NULL) terror("Could not allocate individual memory AVL pools"); | |
162 } | |
163 | |
164 | |
165 AVLContainer * ct_A = (AVLContainer *) calloc(1, sizeof(AVLContainer)); | |
166 if(ct_A == NULL) terror("Could not allocate container A"); | |
167 AVLContainer * ct_B = (AVLContainer *) calloc(1, sizeof(AVLContainer)); | |
168 if(ct_B == NULL) terror("Could not allocate container B"); | |
169 AVLContainer * ct_C = (AVLContainer *) calloc(1, sizeof(AVLContainer)); | |
170 if(ct_C == NULL) terror("Could not allocate container C"); | |
171 AVLContainer * ct_D = (AVLContainer *) calloc(1, sizeof(AVLContainer)); | |
172 if(ct_D == NULL) terror("Could not allocate container D"); | |
173 | |
174 | |
175 SeqInfo data_database[FIXED_LOADING_THREADS]; | |
176 uint64_t full_db_n_seqs = 0; | |
177 | |
178 | |
179 unsigned char curr_kmer[custom_kmer]; | |
180 unsigned char aux_kmer[custom_kmer+1]; | |
181 curr_kmer[0] = '\0'; | |
182 uint64_t word_size = 0; | |
183 | |
184 | |
185 SeqInfo data_query; | |
186 | |
187 //To force reading from the buffer | |
188 idx = READBUF + 1; | |
189 | |
190 //Vector to store query seq | |
191 unsigned char * seq_vector_query = (unsigned char *) malloc(READBUF*sizeof(unsigned char)); | |
192 if(seq_vector_query == NULL) terror("Could not allocate memory for query vector"); | |
193 uint64_t n_realloc_query = 1, pos_in_query = 0, n_seqs_query_realloc = 1; | |
194 uint64_t * query_positions = (uint64_t *) malloc(INITSEQS*sizeof(uint64_t)); | |
195 if(query_positions == NULL) terror("Could not allocate query sequences positions"); | |
196 | |
197 | |
198 | |
199 // Read number of sequences and load into RAM | |
200 begin = clock(); | |
201 fseek(database, 0L, SEEK_END); | |
202 uint64_t db_temp_size = ftell(database); | |
203 char * load_buffer = (char *) malloc(db_temp_size * sizeof(char)); | |
204 if(load_buffer == NULL) terror("Could not allocate intermediate buffer for threads sequence array"); | |
205 fseek(database, 0L, SEEK_SET); | |
206 | |
207 if(db_temp_size != fread(load_buffer, sizeof(char), db_temp_size, database)) terror("Could not read full sequence"); | |
208 | |
209 LoadingDBArgs args_DB_load[FIXED_LOADING_THREADS]; | |
210 | |
211 | |
212 args_DB_load[0].data_database = &data_database[0]; | |
213 args_DB_load[1].data_database = &data_database[1]; | |
214 args_DB_load[2].data_database = &data_database[2]; | |
215 args_DB_load[3].data_database = &data_database[3]; | |
216 | |
217 args_DB_load[0].ct = ct_A; | |
218 args_DB_load[1].ct = ct_B; | |
219 args_DB_load[2].ct = ct_C; | |
220 args_DB_load[3].ct = ct_D; | |
221 for(i=0; i<FIXED_LOADING_THREADS; i++){ | |
222 args_DB_load[i].read_to = 0; | |
223 args_DB_load[i].read_from = 0; | |
224 } | |
225 | |
226 //uint64_t a_fourth = db_temp_size / 4; | |
227 | |
228 | |
229 //get_num_seqs_and_length(load_buffer, &full_db_n_seqs, &db_temp_size, args_DB_load); | |
230 | |
231 end = clock(); | |
232 fprintf(stdout, "[INFO] Loading into RAM took %e seconds.\n", (double)(end-begin)/CLOCKS_PER_SEC); | |
233 begin = clock(); | |
234 | |
235 /* | |
236 char * temp_seq_buffer; | |
237 SeqInfo * data_database; | |
238 uint64_t t_len; | |
239 uint64_t word_size; | |
240 uint64_t read_from; | |
241 uint64_t read_to; | |
242 char thread_id; | |
243 Mempool_l * mp; | |
244 uint64_t n_pools_used; | |
245 */ | |
246 | |
247 // Launch threads to process database | |
248 | |
249 args_DB_load[0].thread_id = 'A'; | |
250 args_DB_load[1].thread_id = 'B'; | |
251 args_DB_load[2].thread_id = 'C'; | |
252 args_DB_load[3].thread_id = 'D'; | |
253 | |
254 | |
255 for(i=0; i<FIXED_LOADING_THREADS; i++){ | |
256 | |
257 //seq_vector_database[i] = (unsigned char *) malloc((args_DB_load[i].read_to - args_DB_load[i].read_from)*sizeof(unsigned char)); | |
258 seq_vector_database[i] = (unsigned char *) malloc((READBUF)*sizeof(unsigned char)); | |
259 //database_positions[i] = (uint64_t *) malloc((1+data_database[i].n_seqs)*sizeof(uint64_t)); | |
260 database_positions[i] = (uint64_t *) malloc((INITSEQS)*sizeof(uint64_t)); | |
261 if(seq_vector_database[i] == NULL || database_positions[i] == NULL) terror("Could not allocate memory for individual database vectors"); | |
262 data_database[i].sequences = seq_vector_database[i]; | |
263 | |
264 | |
265 //To hold all information related to database | |
266 args_DB_load[i].n_pools_used = 0; | |
267 args_DB_load[i].n_pools_used_AVL = 0; | |
268 | |
269 init_mem_pool_llpos(&mp[i][args_DB_load[i].n_pools_used]); | |
270 init_mem_pool_AVL(&mp_AVL[i][args_DB_load[i].n_pools_used_AVL]); | |
271 | |
272 data_database[i].start_pos = database_positions[i]; | |
273 | |
274 args_DB_load[i].n_allocs = 1; | |
275 args_DB_load[i].read_from = i * (db_temp_size / 4); | |
276 args_DB_load[i].read_to = (i+1) * (db_temp_size / 4); | |
277 args_DB_load[i].temp_seq_buffer = load_buffer; | |
278 args_DB_load[i].t_len = db_temp_size; | |
279 args_DB_load[i].word_size = custom_kmer; | |
280 args_DB_load[i].mp = mp[i]; | |
281 args_DB_load[i].mp_AVL = mp_AVL[i]; | |
282 | |
283 if( 0 != (error = pthread_create(&loading_threads[i], NULL, load_input, (void *) (&args_DB_load[i])) )){ | |
284 fprintf(stdout, "[@loading] Thread %"PRIu64" returned %d:", i, error); terror("Could not launch"); | |
285 } | |
286 } | |
287 | |
288 //Wait for threads to finish | |
289 for(i=0;i<FIXED_LOADING_THREADS;i++){ | |
290 pthread_join(loading_threads[i], NULL); | |
291 } | |
292 | |
293 // Deallocate memory not needed anymore | |
294 free(load_buffer); | |
295 free(loading_threads); | |
296 | |
297 | |
298 | |
299 //fprintf(stdout, "[INFO] WARNING!!!!!!!!! USING NON OVERLAPPING MERS, WHICH IS NOT INCLUDED AS OPTION!!!! DISABLE\n"); | |
300 | |
301 | |
302 | |
303 | |
304 | |
305 //data_database.total_len = pos_in_database; | |
306 //printf("tables have %"PRIu64" %"PRIu64", %"PRIu64" %"PRIu64"\n", data_database[0].total_len, data_database[1].total_len, data_database[2].total_len, data_database[3].total_len); | |
307 uint64_t full_db_len = data_database[0].total_len + data_database[1].total_len + data_database[2].total_len + data_database[3].total_len; | |
308 full_db_n_seqs = args_DB_load[0].contained_reads + args_DB_load[1].contained_reads + args_DB_load[2].contained_reads + args_DB_load[3].contained_reads; | |
309 | |
310 end = clock(); | |
311 fprintf(stdout, "[INFO] Database loaded and of length %"PRIu64" (%"PRIu64" sequences). Hash table building took %e seconds\n", full_db_len, full_db_n_seqs, (double)(end-begin)/CLOCKS_PER_SEC); | |
312 //close database | |
313 fclose(database); | |
314 | |
315 // Allocate marker tags to avoid repeting computation | |
316 | |
317 marker_taggs = (unsigned char **) malloc(n_threads * sizeof(unsigned char *)); | |
318 if(marker_taggs == NULL) terror("Could not allocate marker taggs"); | |
319 for(i=0;i<n_threads;i++){ | |
320 marker_taggs[i] = (unsigned char *) malloc(full_db_n_seqs); | |
321 if(marker_taggs[i] == NULL) terror("Could not allocate second loop of marker taggs"); | |
322 } | |
323 if(hits_only == TRUE){ | |
324 hits_table = (uint64_t **) calloc(n_threads, sizeof(uint64_t *)); | |
325 if(hits_table == NULL) terror("Could not allocate hits table"); | |
326 uint64_t z; | |
327 for(z=0;z<n_threads;z++){ | |
328 hits_table[z] = (uint64_t *) calloc(full_db_n_seqs, sizeof(uint64_t)); | |
329 if(hits_table[z] == NULL) terror("Could not allocate sub table of hits"); | |
330 } | |
331 } | |
332 | |
333 | |
334 begin = clock(); | |
335 | |
336 | |
337 | |
338 //To force reading from the buffer | |
339 idx = READBUF + 1; | |
340 | |
341 | |
342 data_query.sequences = seq_vector_query; | |
343 data_query.start_pos = query_positions; | |
344 data_query.total_len = 0; | |
345 data_query.n_seqs = 0; | |
346 | |
347 | |
348 //Print info | |
349 fprintf(stdout, "[INFO] Loading query.\n"); | |
350 | |
351 | |
352 c = buffered_fgetc(temp_seq_buffer, &idx, &r, query); | |
353 while((!feof(query) || (feof(query) && idx < r))){ | |
354 | |
355 if(c == '>'){ | |
356 data_query.start_pos[data_query.n_seqs++] = pos_in_query; | |
357 word_size = 0; | |
358 | |
359 if(pos_in_query == READBUF*n_realloc_query){ | |
360 n_realloc_query++; data_query.sequences = (unsigned char *) realloc(data_query.sequences, READBUF*n_realloc_query*sizeof(unsigned char)); | |
361 if(data_query.sequences == NULL) terror("Could not reallocate temporary query"); | |
362 } | |
363 | |
364 if(data_query.n_seqs == INITSEQS*n_seqs_query_realloc){ | |
365 n_seqs_query_realloc++; data_query.start_pos = (uint64_t *) realloc(data_query.start_pos, INITSEQS*n_seqs_query_realloc*sizeof(uint64_t)); | |
366 } | |
367 | |
368 | |
369 while(c != '\n'){ c = buffered_fgetc(temp_seq_buffer, &idx, &r, query); } //Skip ID | |
370 | |
371 | |
372 while(c != '>' && (!feof(query) || (feof(query) && idx < r))){ //Until next id | |
373 c = buffered_fgetc(temp_seq_buffer, &idx, &r, query); | |
374 c = toupper(c); | |
375 if(c == 'A' || c == 'C' || c == 'G' || c == 'T'){ | |
376 data_query.sequences[pos_in_query++] = (unsigned char) c; | |
377 curr_kmer[word_size++] = (unsigned char) c; | |
378 | |
379 if(word_size == custom_kmer){ | |
380 memcpy(aux_kmer, curr_kmer, custom_kmer); | |
381 aux_kmer[custom_kmer] = '\0'; | |
382 | |
383 memcpy(aux_kmer, &curr_kmer[1], custom_kmer-1); | |
384 memcpy(curr_kmer, aux_kmer, custom_kmer-1); | |
385 word_size--; | |
386 } | |
387 | |
388 if(pos_in_query == READBUF*n_realloc_query){ | |
389 n_realloc_query++; data_query.sequences = (unsigned char *) realloc(data_query.sequences, READBUF*n_realloc_query*sizeof(unsigned char)); | |
390 if(data_query.sequences == NULL) terror("Could not reallocate temporary query"); | |
391 } | |
392 }else{ | |
393 if(c != '\n' && c != '\r' && c != '>'){ | |
394 word_size = 0; | |
395 data_query.sequences[pos_in_query++] = (unsigned char) 'N'; //Convert to N | |
396 if(pos_in_query == READBUF*n_realloc_query){ | |
397 n_realloc_query++; data_query.sequences = (unsigned char *) realloc(data_query.sequences, READBUF*n_realloc_query*sizeof(unsigned char)); | |
398 if(data_query.sequences == NULL) terror("Could not reallocate temporary query"); | |
399 } | |
400 } | |
401 } | |
402 } | |
403 }else{ | |
404 c = buffered_fgetc(temp_seq_buffer, &idx, &r, query); | |
405 } | |
406 | |
407 } | |
408 | |
409 | |
410 | |
411 end = clock(); | |
412 | |
413 | |
414 data_query.total_len = pos_in_query; | |
415 | |
416 fprintf(stdout, "[INFO] Query loaded and of length %"PRIu64". Took %e seconds\n", data_query.total_len, (double)(end-begin)/CLOCKS_PER_SEC); | |
417 | |
418 begin = clock(); | |
419 | |
420 Head queue_head; | |
421 Queue * first_task = generate_queue(&queue_head, data_query.n_seqs, n_threads, n_parts); | |
422 | |
423 /* | |
424 Queue * traverse = queue_head.head; | |
425 while(traverse != NULL){ | |
426 printf("current_piece: %"PRIu64"-%"PRIu64"\n", traverse->r1, traverse->r2); | |
427 traverse = traverse->next; | |
428 } | |
429 getchar(); | |
430 */ | |
431 | |
432 | |
433 | |
434 //reads_per_thread = (uint64_t) (floorl((long double) data_query.n_seqs / (long double) n_threads)); | |
435 | |
436 fprintf(stdout, "[INFO] Computing alignments.\n"); | |
437 | |
438 | |
439 | |
440 /* | |
441 uint64_t z; | |
442 for(z=0; z<POOL_SIZE; z++){ | |
443 aux = mp[0].base + z; | |
444 fprintf(stdout, "%p\n", aux); | |
445 fflush(stdout); | |
446 } | |
447 */ | |
448 | |
449 // Make the full db | |
450 uint64_t contained_reads[FIXED_LOADING_THREADS] = {0,0,0,0}; | |
451 uint64_t base_coordinates[FIXED_LOADING_THREADS] = {0,0,0,0}; | |
452 //contained_reads[0] = 0; | |
453 //base_coordinates[0] = 0; | |
454 for(i=1;i<FIXED_LOADING_THREADS;i++){ | |
455 contained_reads[i] = args_DB_load[i-1].contained_reads + contained_reads[i-1]; | |
456 base_coordinates[i] = args_DB_load[i-1].base_coordinates + base_coordinates[i-1]; | |
457 } | |
458 | |
459 /* | |
460 for(i = 0; i < 4 ; i++){ | |
461 printf("total len: %"PRIu64"\n", data_database[i].total_len); | |
462 | |
463 } | |
464 getchar(); | |
465 */ | |
466 /* | |
467 for(i = 0; i < 4 ; i++){ | |
468 | |
469 printf("c:%"PRIu64" - b:%"PRIu64"\n", contained_reads[i], base_coordinates[i]); | |
470 } | |
471 getchar(); | |
472 */ | |
473 | |
474 | |
475 | |
476 | |
477 SeqInfo final_db; | |
478 final_db.sequences = (unsigned char *) malloc(full_db_len * sizeof(unsigned char)); | |
479 if(final_db.sequences == NULL) terror ("Could not allocate final database sequences"); | |
480 memcpy(&final_db.sequences[0], &data_database[0].sequences[0], data_database[0].total_len); | |
481 memcpy(&final_db.sequences[data_database[0].total_len], &data_database[1].sequences[0], data_database[1].total_len); | |
482 memcpy(&final_db.sequences[data_database[0].total_len+data_database[1].total_len], &data_database[2].sequences[0], data_database[2].total_len); | |
483 memcpy(&final_db.sequences[data_database[0].total_len+data_database[1].total_len+data_database[2].total_len], &data_database[3].sequences[0], data_database[3].total_len); | |
484 final_db.start_pos = (uint64_t *) malloc(full_db_n_seqs * sizeof(uint64_t)); | |
485 if(final_db.start_pos == NULL) terror("Could not allocate final db starting positions"); | |
486 // Copy them with offset | |
487 i=0; | |
488 while(i<data_database[0].n_seqs){ final_db.start_pos[i] = data_database[0].start_pos[i]; ++i; } | |
489 j=0; | |
490 //printf("switch\n"); | |
491 while(j<data_database[1].n_seqs){ final_db.start_pos[i] = data_database[1].start_pos[j] + data_database[0].total_len; ++i; ++j; } | |
492 j=0; | |
493 //printf("switch\n"); | |
494 while(j<data_database[2].n_seqs){ final_db.start_pos[i] = data_database[2].start_pos[j] + data_database[1].total_len + data_database[0].total_len; ++i; ++j; } | |
495 j=0; | |
496 //printf("switch\n"); | |
497 while(j<data_database[3].n_seqs){ final_db.start_pos[i] = data_database[3].start_pos[j] + data_database[2].total_len + data_database[1].total_len + data_database[0].total_len; ++i; ++j; } | |
498 | |
499 final_db.total_len = full_db_len; | |
500 final_db.n_seqs = full_db_n_seqs; | |
501 | |
502 for(i=0;i<FIXED_LOADING_THREADS;i++){ | |
503 free(data_database[i].sequences); | |
504 free(data_database[i].start_pos); | |
505 } | |
506 | |
507 | |
508 // Debug | |
509 /* | |
510 uint64_t max = 0; | |
511 for(i=0; i<full_db_n_seqs-2; i++){ | |
512 //printf("%"PRIu64" - %"PRIu64" res in \n", final_db.start_pos[i], final_db.start_pos[i+1]); | |
513 //printf("%"PRIu64"\n", final_db.start_pos[i+2] - final_db.start_pos[i+1]); | |
514 if(final_db.start_pos[i+2] - final_db.start_pos[i+1] > max) max = final_db.start_pos[i+2] - final_db.start_pos[i+1]; | |
515 //getchar(); | |
516 } | |
517 printf("%"PRIu64"\n", max); | |
518 */ | |
519 | |
520 | |
521 | |
522 | |
523 | |
524 for(i=0;i<n_threads;i++){ | |
525 hta[i].id = i; | |
526 hta[i].database = &final_db; | |
527 hta[i].query = &data_query; | |
528 | |
529 //hta[i].from = i * reads_per_thread; | |
530 //hta[i].to = (i + 1) * reads_per_thread; | |
531 hta[i].container_a = ct_A; | |
532 hta[i].container_b = ct_B; | |
533 hta[i].container_c = ct_C; | |
534 hta[i].container_d = ct_D; | |
535 hta[i].contained_reads = contained_reads; | |
536 hta[i].base_coordinates = base_coordinates; | |
537 hta[i].accepted_query_reads = 0; | |
538 hta[i].min_e_value = minevalue; | |
539 hta[i].min_coverage = mincoverage; | |
540 hta[i].min_identity = minidentity; | |
541 hta[i].out = out_database; | |
542 hta[i].igap = igap; | |
543 hta[i].egap = egap; | |
544 hta[i].window = window; | |
545 hta[i].mc = mc[i]; | |
546 hta[i].table = table[i]; | |
547 hta[i].reconstruct_X = reconstruct_X[i]; | |
548 hta[i].reconstruct_Y = reconstruct_Y[i]; | |
549 hta[i].writing_buffer_alignment = writing_buffer_alignment[i]; | |
550 hta[i].my_x = my_x[i]; | |
551 hta[i].my_y = my_y[i]; | |
552 hta[i].queue_head = &queue_head; | |
553 hta[i].lock = &lock; | |
554 hta[i].full_comp = full_comp; | |
555 hta[i].markers = marker_taggs[i]; | |
556 if(hits_only) hta[i].hits = hits_table[i]; else hta[i].hits = NULL; | |
557 | |
558 | |
559 //if(i==n_threads-1) hta[i].to = data_query.n_seqs; | |
560 | |
561 if( 0 != (error = pthread_create(&threads[i], NULL, computeAlignmentsByThread, (void *) (&hta[i])) )){ | |
562 fprintf(stdout, "Thread %"PRIu64" returned %d:", i, error); terror("Could not launch"); | |
563 } | |
564 } | |
565 | |
566 //Wait for threads to finish | |
567 for(i=0;i<n_threads;i++){ | |
568 pthread_join(threads[i], NULL); | |
569 } | |
570 | |
571 | |
572 for(i=0;i<n_threads;i++){ | |
573 sum_accepted_reads += hta[i].accepted_query_reads; | |
574 } | |
575 // Accumulate hits just in case | |
576 if(hits_only == TRUE){ | |
577 for(i=0;i<full_db_n_seqs;i++){ | |
578 for(j=1;j<n_threads;j++){ | |
579 hta[0].hits[i] += hta[j].hits[i]; | |
580 } | |
581 if(out_database != NULL){ | |
582 | |
583 if(i == final_db.n_seqs - 1){ | |
584 fprintf(out_database, "%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n", i, hta[0].hits[i], full_db_len - final_db.start_pos[i]); | |
585 }else{ | |
586 fprintf(out_database, "%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n", i, hta[0].hits[i], final_db.start_pos[i+1] - final_db.start_pos[i]); | |
587 } | |
588 } | |
589 } | |
590 } | |
591 | |
592 | |
593 end = clock(); | |
594 fprintf(stdout, "[INFO] Alignments computed in %e seconds.\n", (double)(end-begin)/CLOCKS_PER_SEC); | |
595 fprintf(stdout, "[INFO] %"PRIu64" reads (%"PRIu64") from the query were found in the database (%"PRIu64") at a minimum e-value of %Le and minimum coverage of %d%%.\n", sum_accepted_reads, data_query.n_seqs, final_db.n_seqs, (long double)minevalue, (int) (100*mincoverage)); | |
596 fprintf(stdout, "[INFO] The Jaccard-index is: %Le\n", (long double)sum_accepted_reads/((final_db.n_seqs+data_query.n_seqs)-sum_accepted_reads)); | |
597 fprintf(stdout, "[INFO] Deallocating heap memory.\n"); | |
598 | |
599 fclose(query); | |
600 | |
601 if(out_database != NULL) fclose(out_database); | |
602 free(temp_seq_buffer); | |
603 | |
604 | |
605 if(hits_only == TRUE){ | |
606 // Write hits here | |
607 for(i=0;i<n_threads;i++){ | |
608 free(hits_table[i]); | |
609 } | |
610 free(hits_table); | |
611 } | |
612 | |
613 free(final_db.sequences); | |
614 free(final_db.start_pos); | |
615 free(data_query.sequences); | |
616 free(data_query.start_pos); | |
617 free(ct_A->root); | |
618 free(ct_B->root); | |
619 free(ct_C->root); | |
620 free(ct_D->root); | |
621 //free(ct); | |
622 free(threads); | |
623 free(hta); | |
624 | |
625 | |
626 | |
627 for(i=0;i<n_threads;i++){ | |
628 | |
629 for(j=0;j<MAX_READ_SIZE;j++){ | |
630 free(table[i][j]); | |
631 } | |
632 | |
633 free(table[i]); | |
634 | |
635 free(mc[i]); | |
636 free(reconstruct_X[i]); | |
637 free(reconstruct_Y[i]); | |
638 free(my_x[i]); | |
639 free(my_y[i]); | |
640 free(writing_buffer_alignment[i]); | |
641 | |
642 free(marker_taggs[i]); | |
643 | |
644 } | |
645 free(marker_taggs); | |
646 | |
647 | |
648 free(table); | |
649 free(mc); | |
650 free(reconstruct_X); | |
651 free(reconstruct_Y); | |
652 free(my_y); | |
653 free(my_x); | |
654 free(writing_buffer_alignment); | |
655 | |
656 pthread_mutex_destroy(&lock); | |
657 | |
658 | |
659 | |
660 for(i=0; i<FIXED_LOADING_THREADS; i++){ | |
661 for(j=0;j<=args_DB_load[i].n_pools_used;j++){ | |
662 free(mp[i][j].base); | |
663 } | |
664 free(mp[i]); | |
665 } | |
666 for(i=0; i<FIXED_LOADING_THREADS; i++){ | |
667 for(j=0;j<=args_DB_load[i].n_pools_used_AVL;j++){ | |
668 free(mp_AVL[i][j].base); | |
669 } | |
670 free(mp_AVL[i]); | |
671 } | |
672 free(mp); | |
673 free(mp_AVL); | |
674 free(seq_vector_database); | |
675 free(database_positions); | |
676 | |
677 //Deallocate queue (its allocated as an array) | |
678 free(first_task); | |
679 | |
680 //free(mp); | |
681 | |
682 return 0; | |
683 } | |
684 | |
685 void init_args(int argc, char ** av, FILE ** query, FILE ** database, FILE ** out_database, uint64_t * n_threads, long double * minevalue, long double * mincoverage, int * igap, int * egap, long double * minidentity, long double * window, unsigned char * full_comp, uint64_t * custom_kmer, unsigned char * hits_only, uint64_t * n_parts){ | |
686 | |
687 int pNum = 0; | |
688 while(pNum < argc){ | |
689 if(strcmp(av[pNum], "--verbose") == 0) VERBOSE_ACTIVE = 1; | |
690 if(strcmp(av[pNum], "--help") == 0){ | |
691 fprintf(stdout, "USAGE:\n"); | |
692 fprintf(stdout, " IMSAME -query [query] -db [database]\n"); | |
693 fprintf(stdout, "OPTIONAL:\n"); | |
694 fprintf(stdout, " -n_threads [Integer: 0<n_threads] (default 4)\n"); | |
695 fprintf(stdout, " -evalue [Double: 0<=pval<1] (default: 1 * 10^-10)\n"); | |
696 fprintf(stdout, " -coverage [Double: 0<coverage<=1 (default: 0.5)\n"); | |
697 fprintf(stdout, " -identity [Double: 0<identity<=1 (default: 0.5)\n"); | |
698 fprintf(stdout, " -igap [Integer: (default: 5)\n"); | |
699 fprintf(stdout, " -egap [Integer: (default: 2)\n"); | |
700 fprintf(stdout, " -window [Double: 0<window<=0.5 (default: 0.15)\n"); | |
701 fprintf(stdout, " -kmer [Integer: k>1 (default 12)]\n"); | |
702 fprintf(stdout, " -n_parts [Integer: n>0 (default 3)]\n"); | |
703 fprintf(stdout, " -out [File path]\n"); | |
704 fprintf(stdout, " --full Does not stop at first match and reports all equalities\n"); | |
705 fprintf(stdout, " --verbose Turns verbose on\n"); | |
706 fprintf(stdout, " --help Shows help for program usage\n"); | |
707 fprintf(stdout, " --hits Compute only the non-overlapping hits\n"); | |
708 exit(1); | |
709 } | |
710 if(strcmp(av[pNum], "--full") == 0) *full_comp = TRUE; | |
711 if(strcmp(av[pNum], "--hits") == 0) *hits_only = TRUE; | |
712 if(strcmp(av[pNum], "-n_parts") == 0) *n_parts = (uint64_t) atoi(av[pNum+1]); | |
713 if(strcmp(av[pNum], "-query") == 0){ | |
714 *query = fopen64(av[pNum+1], "rt"); | |
715 if(query==NULL) terror("Could not open query file"); | |
716 } | |
717 if(strcmp(av[pNum], "-db") == 0){ | |
718 *database = fopen64(av[pNum+1], "rt"); | |
719 if(database==NULL) terror("Could not open database file"); | |
720 } | |
721 if(strcmp(av[pNum], "-out") == 0){ | |
722 *out_database = fopen64(av[pNum+1], "wt"); | |
723 if(out_database==NULL) terror("Could not open output database file"); | |
724 } | |
725 if(strcmp(av[pNum], "-evalue") == 0){ | |
726 *minevalue = (long double) atof(av[pNum+1]); | |
727 if(*minevalue < 0) terror("Min-e-value must be larger than zero"); | |
728 } | |
729 if(strcmp(av[pNum], "-window") == 0){ | |
730 *window = (long double) atof(av[pNum+1]); | |
731 if(*window <= 0 || *window > 0.5) terror("Window percentage size must lie between 0<window<=0.5"); | |
732 } | |
733 if(strcmp(av[pNum], "-coverage") == 0){ | |
734 *mincoverage = (long double) atof(av[pNum+1]); | |
735 if(*mincoverage <= 0) terror("Min-coverage must be larger than zero"); | |
736 } | |
737 if(strcmp(av[pNum], "-identity") == 0){ | |
738 *minidentity = (long double) atof(av[pNum+1]); | |
739 if(*minidentity <= 0) terror("Min-identity must be larger than zero"); | |
740 } | |
741 if(strcmp(av[pNum], "-igap") == 0){ | |
742 *igap = - (atoi(av[pNum+1])); | |
743 } | |
744 if(strcmp(av[pNum], "-egap") == 0){ | |
745 *egap = - (atoi(av[pNum+1])); | |
746 } | |
747 if(strcmp(av[pNum], "-n_threads") == 0){ | |
748 *n_threads = (uint64_t) atoi(av[pNum+1]); | |
749 if(*n_threads < 0) terror("Number of threads must be larger than zero"); | |
750 } | |
751 if(strcmp(av[pNum], "-kmer") == 0){ | |
752 *custom_kmer = (uint64_t) atoi(av[pNum+1]); | |
753 if(*custom_kmer < 2) terror("K-mer size must be larger than 1"); | |
754 } | |
755 pNum++; | |
756 } | |
757 | |
758 if(*query==NULL || *database==NULL) terror("A query and database is required"); | |
759 } | |
760 |