0
|
1 /*********
|
|
2
|
|
3 File IMSAME.c
|
|
4 Author EPW <estebanpw@uma.es>
|
|
5 Description Computes an incremental alignment on reads versus reads using n threads
|
|
6
|
|
7 USAGE Usage is described by calling ./IMSAME --help
|
|
8
|
|
9
|
|
10
|
|
11 **********/
|
|
12
|
|
13
|
|
14 #include <stdio.h>
|
|
15 #include <stdlib.h>
|
|
16 #include <math.h>
|
|
17 #include <string.h>
|
|
18 #include <ctype.h>
|
|
19 #include <pthread.h>
|
|
20 #include "structs.h"
|
|
21 #include "alignmentFunctions.h"
|
|
22 #include "commonFunctions.h"
|
|
23
|
|
24 #define MAX(x, y) (((x) > (y)) ? (x) : (y))
|
|
25 #define MIN(x, y) (((x) <= (y)) ? (x) : (y))
|
|
26 #define STARTING_SEQS 1000
|
|
27 #define PIECE_OF_DB_REALLOC 3200000 //half a gigabyte if divided by 8 bytes
|
|
28
|
|
29
|
|
30 uint64_t custom_kmer = 12; // Defined as external in structs.h
|
|
31
|
|
32 void init_args(int argc, char ** av, FILE ** query, FILE ** database, FILE ** out_database, uint64_t * n_threads, long double * minevalue, long double * mincoverage, int * igap, int * egap, long double * minidentity, long double * window, unsigned char * full_comp, uint64_t * custom_kmer, unsigned char * hits_only, uint64_t * n_parts);
|
|
33
|
|
34 int VERBOSE_ACTIVE = 0;
|
|
35
|
|
36 int main(int argc, char ** av){
|
|
37
|
|
38
|
|
39
|
|
40
|
|
41
|
|
42 clock_t begin, end;
|
|
43
|
|
44 int error; //To tell if threads could not be launched
|
|
45 uint64_t i,j;
|
|
46
|
|
47 //query to read kmers from, database to find seeds
|
|
48 FILE * query = NULL, * database = NULL, * out_database = NULL;
|
|
49 long double minevalue = 1/powl(10, 10); //Default 1 * 10^-10
|
|
50
|
|
51 long double mincoverage = 0.5, minidentity = 0.5, window = 0.15; //Default
|
|
52 int igap = -5, egap = -2;
|
|
53 unsigned char full_comp = FALSE;
|
|
54 unsigned char hits_only = FALSE;
|
|
55
|
|
56 uint64_t n_threads = 4;
|
|
57 uint64_t n_parts = 3; // Default is 3
|
|
58
|
|
59 init_args(argc, av, &query, &database, &out_database, &n_threads, &minevalue, &mincoverage, &igap, &egap, &minidentity, &window, &full_comp, &custom_kmer, &hits_only, &n_parts);
|
|
60
|
|
61 //uint64_t reads_per_thread;
|
|
62 uint64_t sum_accepted_reads = 0;
|
|
63
|
|
64 begin = clock();
|
|
65
|
|
66 fprintf(stdout, "[INFO] Init. quick table\n");
|
|
67
|
|
68 pthread_t * threads = (pthread_t *) malloc(n_threads * sizeof(pthread_t));
|
|
69 if(threads == NULL) terror("Could not create threads");
|
|
70
|
|
71 pthread_t * loading_threads = (pthread_t *) malloc(FIXED_LOADING_THREADS * sizeof(pthread_t));
|
|
72 if(loading_threads == NULL) terror("Could not create loading threads");
|
|
73
|
|
74
|
|
75 HashTableArgs * hta = (HashTableArgs *) malloc(n_threads*sizeof(HashTableArgs));
|
|
76 if(hta == NULL) terror("Could not allocate arguments for hash table");
|
|
77
|
|
78 pthread_mutex_t lock; //The mutex to lock the queue
|
|
79 if (pthread_mutex_init(&lock, NULL) != 0) terror("Could not init mutex");
|
|
80
|
|
81 // To be used if only computing hits
|
|
82 uint64_t ** hits_table = NULL;
|
|
83
|
|
84 unsigned char ** my_x = (unsigned char **) malloc(n_threads * sizeof(unsigned char*));
|
|
85 unsigned char ** my_y = (unsigned char **) malloc(n_threads * sizeof(unsigned char*));
|
|
86
|
|
87 unsigned char ** marker_taggs = NULL; // To be used with full comparison
|
|
88
|
|
89 struct positioned_cell ** mc = (struct positioned_cell **) malloc(n_threads * sizeof(struct positioned_cell *));
|
|
90 struct cell *** table = (struct cell ***) malloc(n_threads * sizeof(struct cell **));
|
|
91 if(table == NULL) terror("Could not allocate NW table");
|
|
92 char ** reconstruct_X = (char **) malloc(n_threads * sizeof(char *));
|
|
93 char ** reconstruct_Y = (char **) malloc(n_threads * sizeof(char *));
|
|
94 if(reconstruct_Y == NULL || reconstruct_X == NULL) terror("Could not allocate output alignment sequences");
|
|
95 char ** writing_buffer_alignment = (char **) malloc(n_threads * sizeof(char*));
|
|
96 for(i=0;i<n_threads;i++){
|
|
97
|
|
98 table[i] = (struct cell **) malloc(MAX_READ_SIZE * sizeof(struct cell *));
|
|
99 for(j=0;j<MAX_READ_SIZE;j++){
|
|
100 table[i][j] = (struct cell *) malloc((1+MAX_WINDOW_SIZE)*sizeof(struct cell));
|
|
101 if(table[i][j] == NULL) terror("Could not allocate memory for second loop of table");
|
|
102 // Delete this
|
|
103 /*
|
|
104 uint64_t r;
|
|
105 for(r=0;r<MAX_WINDOW_SIZE+1;r++){
|
|
106 table[i][j][r].score = INT64_MIN;
|
|
107 table[i][j][r].xfrom = 10000000000;
|
|
108 table[i][j][r].yfrom = 10000000000;
|
|
109 }
|
|
110 */
|
|
111 }
|
|
112 mc[i] = (struct positioned_cell *) malloc(MAX_READ_SIZE * sizeof(struct positioned_cell));
|
|
113 my_x[i] = (unsigned char *) malloc(2*MAX_READ_SIZE * sizeof(unsigned char));
|
|
114 my_y[i] = (unsigned char *) malloc(2*MAX_READ_SIZE * sizeof(unsigned char));
|
|
115 reconstruct_X[i] = (char *) malloc(2*MAX_READ_SIZE * sizeof(char));
|
|
116 reconstruct_Y[i] = (char *) malloc(2*MAX_READ_SIZE * sizeof(char));
|
|
117 writing_buffer_alignment[i] = (char *) malloc(6*MAX_READ_SIZE*sizeof(char)); //6 times because of 2 times the length of the max of the read, and that happens 3 times (seqX,align,Y)
|
|
118 if(table[i] == NULL || mc[i] == NULL || my_x[i] == NULL || my_y[i] == NULL || reconstruct_X[i] == NULL || reconstruct_Y[i] == NULL || writing_buffer_alignment[i] == NULL) terror("Could not allocate buffer for alignment output");
|
|
119 }
|
|
120
|
|
121
|
|
122
|
|
123 end = clock();
|
|
124 fprintf(stdout, "[INFO] Initialization took %e seconds.\n", (double)(end-begin)/CLOCKS_PER_SEC);
|
|
125
|
|
126 //Variables to account for positions
|
|
127 //Print info
|
|
128 fprintf(stdout, "[INFO] Loading database.\n");
|
|
129 //Variables to read kmers
|
|
130 char c = 'N'; //Char to read character
|
|
131 //Current length of array and variables for the buffer
|
|
132 uint64_t idx = 0, r = 0;
|
|
133
|
|
134 //Vector to read in batches
|
|
135 char * temp_seq_buffer = NULL;
|
|
136 if ((temp_seq_buffer = calloc(READBUF, sizeof(char))) == NULL) {
|
|
137 terror("Could not allocate memory for read buffer");
|
|
138 }
|
|
139 //Vector to store database seq
|
|
140 unsigned char ** seq_vector_database = (unsigned char **) malloc(FIXED_LOADING_THREADS*sizeof(unsigned char *));
|
|
141 if(seq_vector_database == NULL) terror("Could not allocate memory for database vector");
|
|
142 uint64_t ** database_positions = (uint64_t **) malloc(FIXED_LOADING_THREADS*sizeof(uint64_t));
|
|
143 if(database_positions == NULL) terror("Could not allocate database sequences positions");
|
|
144
|
|
145 //Mempool_l * mp = (Mempool_l *) malloc(MAX_MEM_POOLS*sizeof(Mempool_l));
|
|
146 //if(mp == NULL) terror("Could not allocate vectors for memory pools");
|
|
147 //Mempool_l mp[FIXED_LOADING_THREADS][MAX_MEM_POOLS];
|
|
148
|
|
149
|
|
150 Mempool_l ** mp = (Mempool_l **) malloc(FIXED_LOADING_THREADS*sizeof(Mempool_l *));
|
|
151 if(mp == NULL) terror("Could not allocate memory pools");
|
|
152 for(i=0; i<FIXED_LOADING_THREADS; i++){
|
|
153 mp[i] = (Mempool_l *) malloc(MAX_MEM_POOLS*sizeof(Mempool_l));
|
|
154 if(mp[i] == NULL) terror("Could not allocate individual memory pools");
|
|
155 }
|
|
156
|
|
157 Mempool_AVL ** mp_AVL = (Mempool_AVL **) malloc(FIXED_LOADING_THREADS*sizeof(Mempool_AVL *));
|
|
158 if(mp_AVL == NULL) terror("Could not allocate memory AVL pools");
|
|
159 for(i=0; i<FIXED_LOADING_THREADS; i++){
|
|
160 mp_AVL[i] = (Mempool_AVL *) malloc(MAX_MEM_POOLS*sizeof(Mempool_AVL));
|
|
161 if(mp_AVL[i] == NULL) terror("Could not allocate individual memory AVL pools");
|
|
162 }
|
|
163
|
|
164
|
|
165 AVLContainer * ct_A = (AVLContainer *) calloc(1, sizeof(AVLContainer));
|
|
166 if(ct_A == NULL) terror("Could not allocate container A");
|
|
167 AVLContainer * ct_B = (AVLContainer *) calloc(1, sizeof(AVLContainer));
|
|
168 if(ct_B == NULL) terror("Could not allocate container B");
|
|
169 AVLContainer * ct_C = (AVLContainer *) calloc(1, sizeof(AVLContainer));
|
|
170 if(ct_C == NULL) terror("Could not allocate container C");
|
|
171 AVLContainer * ct_D = (AVLContainer *) calloc(1, sizeof(AVLContainer));
|
|
172 if(ct_D == NULL) terror("Could not allocate container D");
|
|
173
|
|
174
|
|
175 SeqInfo data_database[FIXED_LOADING_THREADS];
|
|
176 uint64_t full_db_n_seqs = 0;
|
|
177
|
|
178
|
|
179 unsigned char curr_kmer[custom_kmer];
|
|
180 unsigned char aux_kmer[custom_kmer+1];
|
|
181 curr_kmer[0] = '\0';
|
|
182 uint64_t word_size = 0;
|
|
183
|
|
184
|
|
185 SeqInfo data_query;
|
|
186
|
|
187 //To force reading from the buffer
|
|
188 idx = READBUF + 1;
|
|
189
|
|
190 //Vector to store query seq
|
|
191 unsigned char * seq_vector_query = (unsigned char *) malloc(READBUF*sizeof(unsigned char));
|
|
192 if(seq_vector_query == NULL) terror("Could not allocate memory for query vector");
|
|
193 uint64_t n_realloc_query = 1, pos_in_query = 0, n_seqs_query_realloc = 1;
|
|
194 uint64_t * query_positions = (uint64_t *) malloc(INITSEQS*sizeof(uint64_t));
|
|
195 if(query_positions == NULL) terror("Could not allocate query sequences positions");
|
|
196
|
|
197
|
|
198
|
|
199 // Read number of sequences and load into RAM
|
|
200 begin = clock();
|
|
201 fseek(database, 0L, SEEK_END);
|
|
202 uint64_t db_temp_size = ftell(database);
|
|
203 char * load_buffer = (char *) malloc(db_temp_size * sizeof(char));
|
|
204 if(load_buffer == NULL) terror("Could not allocate intermediate buffer for threads sequence array");
|
|
205 fseek(database, 0L, SEEK_SET);
|
|
206
|
|
207 if(db_temp_size != fread(load_buffer, sizeof(char), db_temp_size, database)) terror("Could not read full sequence");
|
|
208
|
|
209 LoadingDBArgs args_DB_load[FIXED_LOADING_THREADS];
|
|
210
|
|
211
|
|
212 args_DB_load[0].data_database = &data_database[0];
|
|
213 args_DB_load[1].data_database = &data_database[1];
|
|
214 args_DB_load[2].data_database = &data_database[2];
|
|
215 args_DB_load[3].data_database = &data_database[3];
|
|
216
|
|
217 args_DB_load[0].ct = ct_A;
|
|
218 args_DB_load[1].ct = ct_B;
|
|
219 args_DB_load[2].ct = ct_C;
|
|
220 args_DB_load[3].ct = ct_D;
|
|
221 for(i=0; i<FIXED_LOADING_THREADS; i++){
|
|
222 args_DB_load[i].read_to = 0;
|
|
223 args_DB_load[i].read_from = 0;
|
|
224 }
|
|
225
|
|
226 //uint64_t a_fourth = db_temp_size / 4;
|
|
227
|
|
228
|
|
229 //get_num_seqs_and_length(load_buffer, &full_db_n_seqs, &db_temp_size, args_DB_load);
|
|
230
|
|
231 end = clock();
|
|
232 fprintf(stdout, "[INFO] Loading into RAM took %e seconds.\n", (double)(end-begin)/CLOCKS_PER_SEC);
|
|
233 begin = clock();
|
|
234
|
|
235 /*
|
|
236 char * temp_seq_buffer;
|
|
237 SeqInfo * data_database;
|
|
238 uint64_t t_len;
|
|
239 uint64_t word_size;
|
|
240 uint64_t read_from;
|
|
241 uint64_t read_to;
|
|
242 char thread_id;
|
|
243 Mempool_l * mp;
|
|
244 uint64_t n_pools_used;
|
|
245 */
|
|
246
|
|
247 // Launch threads to process database
|
|
248
|
|
249 args_DB_load[0].thread_id = 'A';
|
|
250 args_DB_load[1].thread_id = 'B';
|
|
251 args_DB_load[2].thread_id = 'C';
|
|
252 args_DB_load[3].thread_id = 'D';
|
|
253
|
|
254
|
|
255 for(i=0; i<FIXED_LOADING_THREADS; i++){
|
|
256
|
|
257 //seq_vector_database[i] = (unsigned char *) malloc((args_DB_load[i].read_to - args_DB_load[i].read_from)*sizeof(unsigned char));
|
|
258 seq_vector_database[i] = (unsigned char *) malloc((READBUF)*sizeof(unsigned char));
|
|
259 //database_positions[i] = (uint64_t *) malloc((1+data_database[i].n_seqs)*sizeof(uint64_t));
|
|
260 database_positions[i] = (uint64_t *) malloc((INITSEQS)*sizeof(uint64_t));
|
|
261 if(seq_vector_database[i] == NULL || database_positions[i] == NULL) terror("Could not allocate memory for individual database vectors");
|
|
262 data_database[i].sequences = seq_vector_database[i];
|
|
263
|
|
264
|
|
265 //To hold all information related to database
|
|
266 args_DB_load[i].n_pools_used = 0;
|
|
267 args_DB_load[i].n_pools_used_AVL = 0;
|
|
268
|
|
269 init_mem_pool_llpos(&mp[i][args_DB_load[i].n_pools_used]);
|
|
270 init_mem_pool_AVL(&mp_AVL[i][args_DB_load[i].n_pools_used_AVL]);
|
|
271
|
|
272 data_database[i].start_pos = database_positions[i];
|
|
273
|
|
274 args_DB_load[i].n_allocs = 1;
|
|
275 args_DB_load[i].read_from = i * (db_temp_size / 4);
|
|
276 args_DB_load[i].read_to = (i+1) * (db_temp_size / 4);
|
|
277 args_DB_load[i].temp_seq_buffer = load_buffer;
|
|
278 args_DB_load[i].t_len = db_temp_size;
|
|
279 args_DB_load[i].word_size = custom_kmer;
|
|
280 args_DB_load[i].mp = mp[i];
|
|
281 args_DB_load[i].mp_AVL = mp_AVL[i];
|
|
282
|
|
283 if( 0 != (error = pthread_create(&loading_threads[i], NULL, load_input, (void *) (&args_DB_load[i])) )){
|
|
284 fprintf(stdout, "[@loading] Thread %"PRIu64" returned %d:", i, error); terror("Could not launch");
|
|
285 }
|
|
286 }
|
|
287
|
|
288 //Wait for threads to finish
|
|
289 for(i=0;i<FIXED_LOADING_THREADS;i++){
|
|
290 pthread_join(loading_threads[i], NULL);
|
|
291 }
|
|
292
|
|
293 // Deallocate memory not needed anymore
|
|
294 free(load_buffer);
|
|
295 free(loading_threads);
|
|
296
|
|
297
|
|
298
|
|
299 //fprintf(stdout, "[INFO] WARNING!!!!!!!!! USING NON OVERLAPPING MERS, WHICH IS NOT INCLUDED AS OPTION!!!! DISABLE\n");
|
|
300
|
|
301
|
|
302
|
|
303
|
|
304
|
|
305 //data_database.total_len = pos_in_database;
|
|
306 //printf("tables have %"PRIu64" %"PRIu64", %"PRIu64" %"PRIu64"\n", data_database[0].total_len, data_database[1].total_len, data_database[2].total_len, data_database[3].total_len);
|
|
307 uint64_t full_db_len = data_database[0].total_len + data_database[1].total_len + data_database[2].total_len + data_database[3].total_len;
|
|
308 full_db_n_seqs = args_DB_load[0].contained_reads + args_DB_load[1].contained_reads + args_DB_load[2].contained_reads + args_DB_load[3].contained_reads;
|
|
309
|
|
310 end = clock();
|
|
311 fprintf(stdout, "[INFO] Database loaded and of length %"PRIu64" (%"PRIu64" sequences). Hash table building took %e seconds\n", full_db_len, full_db_n_seqs, (double)(end-begin)/CLOCKS_PER_SEC);
|
|
312 //close database
|
|
313 fclose(database);
|
|
314
|
|
315 // Allocate marker tags to avoid repeting computation
|
|
316
|
|
317 marker_taggs = (unsigned char **) malloc(n_threads * sizeof(unsigned char *));
|
|
318 if(marker_taggs == NULL) terror("Could not allocate marker taggs");
|
|
319 for(i=0;i<n_threads;i++){
|
|
320 marker_taggs[i] = (unsigned char *) malloc(full_db_n_seqs);
|
|
321 if(marker_taggs[i] == NULL) terror("Could not allocate second loop of marker taggs");
|
|
322 }
|
|
323 if(hits_only == TRUE){
|
|
324 hits_table = (uint64_t **) calloc(n_threads, sizeof(uint64_t *));
|
|
325 if(hits_table == NULL) terror("Could not allocate hits table");
|
|
326 uint64_t z;
|
|
327 for(z=0;z<n_threads;z++){
|
|
328 hits_table[z] = (uint64_t *) calloc(full_db_n_seqs, sizeof(uint64_t));
|
|
329 if(hits_table[z] == NULL) terror("Could not allocate sub table of hits");
|
|
330 }
|
|
331 }
|
|
332
|
|
333
|
|
334 begin = clock();
|
|
335
|
|
336
|
|
337
|
|
338 //To force reading from the buffer
|
|
339 idx = READBUF + 1;
|
|
340
|
|
341
|
|
342 data_query.sequences = seq_vector_query;
|
|
343 data_query.start_pos = query_positions;
|
|
344 data_query.total_len = 0;
|
|
345 data_query.n_seqs = 0;
|
|
346
|
|
347
|
|
348 //Print info
|
|
349 fprintf(stdout, "[INFO] Loading query.\n");
|
|
350
|
|
351
|
|
352 c = buffered_fgetc(temp_seq_buffer, &idx, &r, query);
|
|
353 while((!feof(query) || (feof(query) && idx < r))){
|
|
354
|
|
355 if(c == '>'){
|
|
356 data_query.start_pos[data_query.n_seqs++] = pos_in_query;
|
|
357 word_size = 0;
|
|
358
|
|
359 if(pos_in_query == READBUF*n_realloc_query){
|
|
360 n_realloc_query++; data_query.sequences = (unsigned char *) realloc(data_query.sequences, READBUF*n_realloc_query*sizeof(unsigned char));
|
|
361 if(data_query.sequences == NULL) terror("Could not reallocate temporary query");
|
|
362 }
|
|
363
|
|
364 if(data_query.n_seqs == INITSEQS*n_seqs_query_realloc){
|
|
365 n_seqs_query_realloc++; data_query.start_pos = (uint64_t *) realloc(data_query.start_pos, INITSEQS*n_seqs_query_realloc*sizeof(uint64_t));
|
|
366 }
|
|
367
|
|
368
|
|
369 while(c != '\n'){ c = buffered_fgetc(temp_seq_buffer, &idx, &r, query); } //Skip ID
|
|
370
|
|
371
|
|
372 while(c != '>' && (!feof(query) || (feof(query) && idx < r))){ //Until next id
|
|
373 c = buffered_fgetc(temp_seq_buffer, &idx, &r, query);
|
|
374 c = toupper(c);
|
|
375 if(c == 'A' || c == 'C' || c == 'G' || c == 'T'){
|
|
376 data_query.sequences[pos_in_query++] = (unsigned char) c;
|
|
377 curr_kmer[word_size++] = (unsigned char) c;
|
|
378
|
|
379 if(word_size == custom_kmer){
|
|
380 memcpy(aux_kmer, curr_kmer, custom_kmer);
|
|
381 aux_kmer[custom_kmer] = '\0';
|
|
382
|
|
383 memcpy(aux_kmer, &curr_kmer[1], custom_kmer-1);
|
|
384 memcpy(curr_kmer, aux_kmer, custom_kmer-1);
|
|
385 word_size--;
|
|
386 }
|
|
387
|
|
388 if(pos_in_query == READBUF*n_realloc_query){
|
|
389 n_realloc_query++; data_query.sequences = (unsigned char *) realloc(data_query.sequences, READBUF*n_realloc_query*sizeof(unsigned char));
|
|
390 if(data_query.sequences == NULL) terror("Could not reallocate temporary query");
|
|
391 }
|
|
392 }else{
|
|
393 if(c != '\n' && c != '\r' && c != '>'){
|
|
394 word_size = 0;
|
|
395 data_query.sequences[pos_in_query++] = (unsigned char) 'N'; //Convert to N
|
|
396 if(pos_in_query == READBUF*n_realloc_query){
|
|
397 n_realloc_query++; data_query.sequences = (unsigned char *) realloc(data_query.sequences, READBUF*n_realloc_query*sizeof(unsigned char));
|
|
398 if(data_query.sequences == NULL) terror("Could not reallocate temporary query");
|
|
399 }
|
|
400 }
|
|
401 }
|
|
402 }
|
|
403 }else{
|
|
404 c = buffered_fgetc(temp_seq_buffer, &idx, &r, query);
|
|
405 }
|
|
406
|
|
407 }
|
|
408
|
|
409
|
|
410
|
|
411 end = clock();
|
|
412
|
|
413
|
|
414 data_query.total_len = pos_in_query;
|
|
415
|
|
416 fprintf(stdout, "[INFO] Query loaded and of length %"PRIu64". Took %e seconds\n", data_query.total_len, (double)(end-begin)/CLOCKS_PER_SEC);
|
|
417
|
|
418 begin = clock();
|
|
419
|
|
420 Head queue_head;
|
|
421 Queue * first_task = generate_queue(&queue_head, data_query.n_seqs, n_threads, n_parts);
|
|
422
|
|
423 /*
|
|
424 Queue * traverse = queue_head.head;
|
|
425 while(traverse != NULL){
|
|
426 printf("current_piece: %"PRIu64"-%"PRIu64"\n", traverse->r1, traverse->r2);
|
|
427 traverse = traverse->next;
|
|
428 }
|
|
429 getchar();
|
|
430 */
|
|
431
|
|
432
|
|
433
|
|
434 //reads_per_thread = (uint64_t) (floorl((long double) data_query.n_seqs / (long double) n_threads));
|
|
435
|
|
436 fprintf(stdout, "[INFO] Computing alignments.\n");
|
|
437
|
|
438
|
|
439
|
|
440 /*
|
|
441 uint64_t z;
|
|
442 for(z=0; z<POOL_SIZE; z++){
|
|
443 aux = mp[0].base + z;
|
|
444 fprintf(stdout, "%p\n", aux);
|
|
445 fflush(stdout);
|
|
446 }
|
|
447 */
|
|
448
|
|
449 // Make the full db
|
|
450 uint64_t contained_reads[FIXED_LOADING_THREADS] = {0,0,0,0};
|
|
451 uint64_t base_coordinates[FIXED_LOADING_THREADS] = {0,0,0,0};
|
|
452 //contained_reads[0] = 0;
|
|
453 //base_coordinates[0] = 0;
|
|
454 for(i=1;i<FIXED_LOADING_THREADS;i++){
|
|
455 contained_reads[i] = args_DB_load[i-1].contained_reads + contained_reads[i-1];
|
|
456 base_coordinates[i] = args_DB_load[i-1].base_coordinates + base_coordinates[i-1];
|
|
457 }
|
|
458
|
|
459 /*
|
|
460 for(i = 0; i < 4 ; i++){
|
|
461 printf("total len: %"PRIu64"\n", data_database[i].total_len);
|
|
462
|
|
463 }
|
|
464 getchar();
|
|
465 */
|
|
466 /*
|
|
467 for(i = 0; i < 4 ; i++){
|
|
468
|
|
469 printf("c:%"PRIu64" - b:%"PRIu64"\n", contained_reads[i], base_coordinates[i]);
|
|
470 }
|
|
471 getchar();
|
|
472 */
|
|
473
|
|
474
|
|
475
|
|
476
|
|
477 SeqInfo final_db;
|
|
478 final_db.sequences = (unsigned char *) malloc(full_db_len * sizeof(unsigned char));
|
|
479 if(final_db.sequences == NULL) terror ("Could not allocate final database sequences");
|
|
480 memcpy(&final_db.sequences[0], &data_database[0].sequences[0], data_database[0].total_len);
|
|
481 memcpy(&final_db.sequences[data_database[0].total_len], &data_database[1].sequences[0], data_database[1].total_len);
|
|
482 memcpy(&final_db.sequences[data_database[0].total_len+data_database[1].total_len], &data_database[2].sequences[0], data_database[2].total_len);
|
|
483 memcpy(&final_db.sequences[data_database[0].total_len+data_database[1].total_len+data_database[2].total_len], &data_database[3].sequences[0], data_database[3].total_len);
|
|
484 final_db.start_pos = (uint64_t *) malloc(full_db_n_seqs * sizeof(uint64_t));
|
|
485 if(final_db.start_pos == NULL) terror("Could not allocate final db starting positions");
|
|
486 // Copy them with offset
|
|
487 i=0;
|
|
488 while(i<data_database[0].n_seqs){ final_db.start_pos[i] = data_database[0].start_pos[i]; ++i; }
|
|
489 j=0;
|
|
490 //printf("switch\n");
|
|
491 while(j<data_database[1].n_seqs){ final_db.start_pos[i] = data_database[1].start_pos[j] + data_database[0].total_len; ++i; ++j; }
|
|
492 j=0;
|
|
493 //printf("switch\n");
|
|
494 while(j<data_database[2].n_seqs){ final_db.start_pos[i] = data_database[2].start_pos[j] + data_database[1].total_len + data_database[0].total_len; ++i; ++j; }
|
|
495 j=0;
|
|
496 //printf("switch\n");
|
|
497 while(j<data_database[3].n_seqs){ final_db.start_pos[i] = data_database[3].start_pos[j] + data_database[2].total_len + data_database[1].total_len + data_database[0].total_len; ++i; ++j; }
|
|
498
|
|
499 final_db.total_len = full_db_len;
|
|
500 final_db.n_seqs = full_db_n_seqs;
|
|
501
|
|
502 for(i=0;i<FIXED_LOADING_THREADS;i++){
|
|
503 free(data_database[i].sequences);
|
|
504 free(data_database[i].start_pos);
|
|
505 }
|
|
506
|
|
507
|
|
508 // Debug
|
|
509 /*
|
|
510 uint64_t max = 0;
|
|
511 for(i=0; i<full_db_n_seqs-2; i++){
|
|
512 //printf("%"PRIu64" - %"PRIu64" res in \n", final_db.start_pos[i], final_db.start_pos[i+1]);
|
|
513 //printf("%"PRIu64"\n", final_db.start_pos[i+2] - final_db.start_pos[i+1]);
|
|
514 if(final_db.start_pos[i+2] - final_db.start_pos[i+1] > max) max = final_db.start_pos[i+2] - final_db.start_pos[i+1];
|
|
515 //getchar();
|
|
516 }
|
|
517 printf("%"PRIu64"\n", max);
|
|
518 */
|
|
519
|
|
520
|
|
521
|
|
522
|
|
523
|
|
524 for(i=0;i<n_threads;i++){
|
|
525 hta[i].id = i;
|
|
526 hta[i].database = &final_db;
|
|
527 hta[i].query = &data_query;
|
|
528
|
|
529 //hta[i].from = i * reads_per_thread;
|
|
530 //hta[i].to = (i + 1) * reads_per_thread;
|
|
531 hta[i].container_a = ct_A;
|
|
532 hta[i].container_b = ct_B;
|
|
533 hta[i].container_c = ct_C;
|
|
534 hta[i].container_d = ct_D;
|
|
535 hta[i].contained_reads = contained_reads;
|
|
536 hta[i].base_coordinates = base_coordinates;
|
|
537 hta[i].accepted_query_reads = 0;
|
|
538 hta[i].min_e_value = minevalue;
|
|
539 hta[i].min_coverage = mincoverage;
|
|
540 hta[i].min_identity = minidentity;
|
|
541 hta[i].out = out_database;
|
|
542 hta[i].igap = igap;
|
|
543 hta[i].egap = egap;
|
|
544 hta[i].window = window;
|
|
545 hta[i].mc = mc[i];
|
|
546 hta[i].table = table[i];
|
|
547 hta[i].reconstruct_X = reconstruct_X[i];
|
|
548 hta[i].reconstruct_Y = reconstruct_Y[i];
|
|
549 hta[i].writing_buffer_alignment = writing_buffer_alignment[i];
|
|
550 hta[i].my_x = my_x[i];
|
|
551 hta[i].my_y = my_y[i];
|
|
552 hta[i].queue_head = &queue_head;
|
|
553 hta[i].lock = &lock;
|
|
554 hta[i].full_comp = full_comp;
|
|
555 hta[i].markers = marker_taggs[i];
|
|
556 if(hits_only) hta[i].hits = hits_table[i]; else hta[i].hits = NULL;
|
|
557
|
|
558
|
|
559 //if(i==n_threads-1) hta[i].to = data_query.n_seqs;
|
|
560
|
|
561 if( 0 != (error = pthread_create(&threads[i], NULL, computeAlignmentsByThread, (void *) (&hta[i])) )){
|
|
562 fprintf(stdout, "Thread %"PRIu64" returned %d:", i, error); terror("Could not launch");
|
|
563 }
|
|
564 }
|
|
565
|
|
566 //Wait for threads to finish
|
|
567 for(i=0;i<n_threads;i++){
|
|
568 pthread_join(threads[i], NULL);
|
|
569 }
|
|
570
|
|
571
|
|
572 for(i=0;i<n_threads;i++){
|
|
573 sum_accepted_reads += hta[i].accepted_query_reads;
|
|
574 }
|
|
575 // Accumulate hits just in case
|
|
576 if(hits_only == TRUE){
|
|
577 for(i=0;i<full_db_n_seqs;i++){
|
|
578 for(j=1;j<n_threads;j++){
|
|
579 hta[0].hits[i] += hta[j].hits[i];
|
|
580 }
|
|
581 if(out_database != NULL){
|
|
582
|
|
583 if(i == final_db.n_seqs - 1){
|
|
584 fprintf(out_database, "%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n", i, hta[0].hits[i], full_db_len - final_db.start_pos[i]);
|
|
585 }else{
|
|
586 fprintf(out_database, "%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n", i, hta[0].hits[i], final_db.start_pos[i+1] - final_db.start_pos[i]);
|
|
587 }
|
|
588 }
|
|
589 }
|
|
590 }
|
|
591
|
|
592
|
|
593 end = clock();
|
|
594 fprintf(stdout, "[INFO] Alignments computed in %e seconds.\n", (double)(end-begin)/CLOCKS_PER_SEC);
|
|
595 fprintf(stdout, "[INFO] %"PRIu64" reads (%"PRIu64") from the query were found in the database (%"PRIu64") at a minimum e-value of %Le and minimum coverage of %d%%.\n", sum_accepted_reads, data_query.n_seqs, final_db.n_seqs, (long double)minevalue, (int) (100*mincoverage));
|
|
596 fprintf(stdout, "[INFO] The Jaccard-index is: %Le\n", (long double)sum_accepted_reads/((final_db.n_seqs+data_query.n_seqs)-sum_accepted_reads));
|
|
597 fprintf(stdout, "[INFO] Deallocating heap memory.\n");
|
|
598
|
|
599 fclose(query);
|
|
600
|
|
601 if(out_database != NULL) fclose(out_database);
|
|
602 free(temp_seq_buffer);
|
|
603
|
|
604
|
|
605 if(hits_only == TRUE){
|
|
606 // Write hits here
|
|
607 for(i=0;i<n_threads;i++){
|
|
608 free(hits_table[i]);
|
|
609 }
|
|
610 free(hits_table);
|
|
611 }
|
|
612
|
|
613 free(final_db.sequences);
|
|
614 free(final_db.start_pos);
|
|
615 free(data_query.sequences);
|
|
616 free(data_query.start_pos);
|
|
617 free(ct_A->root);
|
|
618 free(ct_B->root);
|
|
619 free(ct_C->root);
|
|
620 free(ct_D->root);
|
|
621 //free(ct);
|
|
622 free(threads);
|
|
623 free(hta);
|
|
624
|
|
625
|
|
626
|
|
627 for(i=0;i<n_threads;i++){
|
|
628
|
|
629 for(j=0;j<MAX_READ_SIZE;j++){
|
|
630 free(table[i][j]);
|
|
631 }
|
|
632
|
|
633 free(table[i]);
|
|
634
|
|
635 free(mc[i]);
|
|
636 free(reconstruct_X[i]);
|
|
637 free(reconstruct_Y[i]);
|
|
638 free(my_x[i]);
|
|
639 free(my_y[i]);
|
|
640 free(writing_buffer_alignment[i]);
|
|
641
|
|
642 free(marker_taggs[i]);
|
|
643
|
|
644 }
|
|
645 free(marker_taggs);
|
|
646
|
|
647
|
|
648 free(table);
|
|
649 free(mc);
|
|
650 free(reconstruct_X);
|
|
651 free(reconstruct_Y);
|
|
652 free(my_y);
|
|
653 free(my_x);
|
|
654 free(writing_buffer_alignment);
|
|
655
|
|
656 pthread_mutex_destroy(&lock);
|
|
657
|
|
658
|
|
659
|
|
660 for(i=0; i<FIXED_LOADING_THREADS; i++){
|
|
661 for(j=0;j<=args_DB_load[i].n_pools_used;j++){
|
|
662 free(mp[i][j].base);
|
|
663 }
|
|
664 free(mp[i]);
|
|
665 }
|
|
666 for(i=0; i<FIXED_LOADING_THREADS; i++){
|
|
667 for(j=0;j<=args_DB_load[i].n_pools_used_AVL;j++){
|
|
668 free(mp_AVL[i][j].base);
|
|
669 }
|
|
670 free(mp_AVL[i]);
|
|
671 }
|
|
672 free(mp);
|
|
673 free(mp_AVL);
|
|
674 free(seq_vector_database);
|
|
675 free(database_positions);
|
|
676
|
|
677 //Deallocate queue (its allocated as an array)
|
|
678 free(first_task);
|
|
679
|
|
680 //free(mp);
|
|
681
|
|
682 return 0;
|
|
683 }
|
|
684
|
|
685 void init_args(int argc, char ** av, FILE ** query, FILE ** database, FILE ** out_database, uint64_t * n_threads, long double * minevalue, long double * mincoverage, int * igap, int * egap, long double * minidentity, long double * window, unsigned char * full_comp, uint64_t * custom_kmer, unsigned char * hits_only, uint64_t * n_parts){
|
|
686
|
|
687 int pNum = 0;
|
|
688 while(pNum < argc){
|
|
689 if(strcmp(av[pNum], "--verbose") == 0) VERBOSE_ACTIVE = 1;
|
|
690 if(strcmp(av[pNum], "--help") == 0){
|
|
691 fprintf(stdout, "USAGE:\n");
|
|
692 fprintf(stdout, " IMSAME -query [query] -db [database]\n");
|
|
693 fprintf(stdout, "OPTIONAL:\n");
|
|
694 fprintf(stdout, " -n_threads [Integer: 0<n_threads] (default 4)\n");
|
|
695 fprintf(stdout, " -evalue [Double: 0<=pval<1] (default: 1 * 10^-10)\n");
|
|
696 fprintf(stdout, " -coverage [Double: 0<coverage<=1 (default: 0.5)\n");
|
|
697 fprintf(stdout, " -identity [Double: 0<identity<=1 (default: 0.5)\n");
|
|
698 fprintf(stdout, " -igap [Integer: (default: 5)\n");
|
|
699 fprintf(stdout, " -egap [Integer: (default: 2)\n");
|
|
700 fprintf(stdout, " -window [Double: 0<window<=0.5 (default: 0.15)\n");
|
|
701 fprintf(stdout, " -kmer [Integer: k>1 (default 12)]\n");
|
|
702 fprintf(stdout, " -n_parts [Integer: n>0 (default 3)]\n");
|
|
703 fprintf(stdout, " -out [File path]\n");
|
|
704 fprintf(stdout, " --full Does not stop at first match and reports all equalities\n");
|
|
705 fprintf(stdout, " --verbose Turns verbose on\n");
|
|
706 fprintf(stdout, " --help Shows help for program usage\n");
|
|
707 fprintf(stdout, " --hits Compute only the non-overlapping hits\n");
|
|
708 exit(1);
|
|
709 }
|
|
710 if(strcmp(av[pNum], "--full") == 0) *full_comp = TRUE;
|
|
711 if(strcmp(av[pNum], "--hits") == 0) *hits_only = TRUE;
|
|
712 if(strcmp(av[pNum], "-n_parts") == 0) *n_parts = (uint64_t) atoi(av[pNum+1]);
|
|
713 if(strcmp(av[pNum], "-query") == 0){
|
|
714 *query = fopen64(av[pNum+1], "rt");
|
|
715 if(query==NULL) terror("Could not open query file");
|
|
716 }
|
|
717 if(strcmp(av[pNum], "-db") == 0){
|
|
718 *database = fopen64(av[pNum+1], "rt");
|
|
719 if(database==NULL) terror("Could not open database file");
|
|
720 }
|
|
721 if(strcmp(av[pNum], "-out") == 0){
|
|
722 *out_database = fopen64(av[pNum+1], "wt");
|
|
723 if(out_database==NULL) terror("Could not open output database file");
|
|
724 }
|
|
725 if(strcmp(av[pNum], "-evalue") == 0){
|
|
726 *minevalue = (long double) atof(av[pNum+1]);
|
|
727 if(*minevalue < 0) terror("Min-e-value must be larger than zero");
|
|
728 }
|
|
729 if(strcmp(av[pNum], "-window") == 0){
|
|
730 *window = (long double) atof(av[pNum+1]);
|
|
731 if(*window <= 0 || *window > 0.5) terror("Window percentage size must lie between 0<window<=0.5");
|
|
732 }
|
|
733 if(strcmp(av[pNum], "-coverage") == 0){
|
|
734 *mincoverage = (long double) atof(av[pNum+1]);
|
|
735 if(*mincoverage <= 0) terror("Min-coverage must be larger than zero");
|
|
736 }
|
|
737 if(strcmp(av[pNum], "-identity") == 0){
|
|
738 *minidentity = (long double) atof(av[pNum+1]);
|
|
739 if(*minidentity <= 0) terror("Min-identity must be larger than zero");
|
|
740 }
|
|
741 if(strcmp(av[pNum], "-igap") == 0){
|
|
742 *igap = - (atoi(av[pNum+1]));
|
|
743 }
|
|
744 if(strcmp(av[pNum], "-egap") == 0){
|
|
745 *egap = - (atoi(av[pNum+1]));
|
|
746 }
|
|
747 if(strcmp(av[pNum], "-n_threads") == 0){
|
|
748 *n_threads = (uint64_t) atoi(av[pNum+1]);
|
|
749 if(*n_threads < 0) terror("Number of threads must be larger than zero");
|
|
750 }
|
|
751 if(strcmp(av[pNum], "-kmer") == 0){
|
|
752 *custom_kmer = (uint64_t) atoi(av[pNum+1]);
|
|
753 if(*custom_kmer < 2) terror("K-mer size must be larger than 1");
|
|
754 }
|
|
755 pNum++;
|
|
756 }
|
|
757
|
|
758 if(*query==NULL || *database==NULL) terror("A query and database is required");
|
|
759 }
|
|
760
|