Mercurial > repos > bitlab > imsame
comparison IMSAME/src/IMSAME.c @ 0:762009a91895 draft
Uploaded
| author | bitlab |
|---|---|
| date | Sat, 15 Dec 2018 18:04:10 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:762009a91895 |
|---|---|
| 1 /********* | |
| 2 | |
| 3 File IMSAME.c | |
| 4 Author EPW <estebanpw@uma.es> | |
| 5 Description Computes an incremental alignment on reads versus reads using n threads | |
| 6 | |
| 7 USAGE Usage is described by calling ./IMSAME --help | |
| 8 | |
| 9 | |
| 10 | |
| 11 **********/ | |
| 12 | |
| 13 | |
| 14 #include <stdio.h> | |
| 15 #include <stdlib.h> | |
| 16 #include <math.h> | |
| 17 #include <string.h> | |
| 18 #include <ctype.h> | |
| 19 #include <pthread.h> | |
| 20 #include "structs.h" | |
| 21 #include "alignmentFunctions.h" | |
| 22 #include "commonFunctions.h" | |
| 23 | |
| 24 #define MAX(x, y) (((x) > (y)) ? (x) : (y)) | |
| 25 #define MIN(x, y) (((x) <= (y)) ? (x) : (y)) | |
| 26 #define STARTING_SEQS 1000 | |
| 27 #define PIECE_OF_DB_REALLOC 3200000 //half a gigabyte if divided by 8 bytes | |
| 28 | |
| 29 | |
| 30 uint64_t custom_kmer = 12; // Defined as external in structs.h | |
| 31 | |
| 32 void init_args(int argc, char ** av, FILE ** query, FILE ** database, FILE ** out_database, uint64_t * n_threads, long double * minevalue, long double * mincoverage, int * igap, int * egap, long double * minidentity, long double * window, unsigned char * full_comp, uint64_t * custom_kmer, unsigned char * hits_only, uint64_t * n_parts); | |
| 33 | |
| 34 int VERBOSE_ACTIVE = 0; | |
| 35 | |
| 36 int main(int argc, char ** av){ | |
| 37 | |
| 38 | |
| 39 | |
| 40 | |
| 41 | |
| 42 clock_t begin, end; | |
| 43 | |
| 44 int error; //To tell if threads could not be launched | |
| 45 uint64_t i,j; | |
| 46 | |
| 47 //query to read kmers from, database to find seeds | |
| 48 FILE * query = NULL, * database = NULL, * out_database = NULL; | |
| 49 long double minevalue = 1/powl(10, 10); //Default 1 * 10^-10 | |
| 50 | |
| 51 long double mincoverage = 0.5, minidentity = 0.5, window = 0.15; //Default | |
| 52 int igap = -5, egap = -2; | |
| 53 unsigned char full_comp = FALSE; | |
| 54 unsigned char hits_only = FALSE; | |
| 55 | |
| 56 uint64_t n_threads = 4; | |
| 57 uint64_t n_parts = 3; // Default is 3 | |
| 58 | |
| 59 init_args(argc, av, &query, &database, &out_database, &n_threads, &minevalue, &mincoverage, &igap, &egap, &minidentity, &window, &full_comp, &custom_kmer, &hits_only, &n_parts); | |
| 60 | |
| 61 //uint64_t reads_per_thread; | |
| 62 uint64_t sum_accepted_reads = 0; | |
| 63 | |
| 64 begin = clock(); | |
| 65 | |
| 66 fprintf(stdout, "[INFO] Init. quick table\n"); | |
| 67 | |
| 68 pthread_t * threads = (pthread_t *) malloc(n_threads * sizeof(pthread_t)); | |
| 69 if(threads == NULL) terror("Could not create threads"); | |
| 70 | |
| 71 pthread_t * loading_threads = (pthread_t *) malloc(FIXED_LOADING_THREADS * sizeof(pthread_t)); | |
| 72 if(loading_threads == NULL) terror("Could not create loading threads"); | |
| 73 | |
| 74 | |
| 75 HashTableArgs * hta = (HashTableArgs *) malloc(n_threads*sizeof(HashTableArgs)); | |
| 76 if(hta == NULL) terror("Could not allocate arguments for hash table"); | |
| 77 | |
| 78 pthread_mutex_t lock; //The mutex to lock the queue | |
| 79 if (pthread_mutex_init(&lock, NULL) != 0) terror("Could not init mutex"); | |
| 80 | |
| 81 // To be used if only computing hits | |
| 82 uint64_t ** hits_table = NULL; | |
| 83 | |
| 84 unsigned char ** my_x = (unsigned char **) malloc(n_threads * sizeof(unsigned char*)); | |
| 85 unsigned char ** my_y = (unsigned char **) malloc(n_threads * sizeof(unsigned char*)); | |
| 86 | |
| 87 unsigned char ** marker_taggs = NULL; // To be used with full comparison | |
| 88 | |
| 89 struct positioned_cell ** mc = (struct positioned_cell **) malloc(n_threads * sizeof(struct positioned_cell *)); | |
| 90 struct cell *** table = (struct cell ***) malloc(n_threads * sizeof(struct cell **)); | |
| 91 if(table == NULL) terror("Could not allocate NW table"); | |
| 92 char ** reconstruct_X = (char **) malloc(n_threads * sizeof(char *)); | |
| 93 char ** reconstruct_Y = (char **) malloc(n_threads * sizeof(char *)); | |
| 94 if(reconstruct_Y == NULL || reconstruct_X == NULL) terror("Could not allocate output alignment sequences"); | |
| 95 char ** writing_buffer_alignment = (char **) malloc(n_threads * sizeof(char*)); | |
| 96 for(i=0;i<n_threads;i++){ | |
| 97 | |
| 98 table[i] = (struct cell **) malloc(MAX_READ_SIZE * sizeof(struct cell *)); | |
| 99 for(j=0;j<MAX_READ_SIZE;j++){ | |
| 100 table[i][j] = (struct cell *) malloc((1+MAX_WINDOW_SIZE)*sizeof(struct cell)); | |
| 101 if(table[i][j] == NULL) terror("Could not allocate memory for second loop of table"); | |
| 102 // Delete this | |
| 103 /* | |
| 104 uint64_t r; | |
| 105 for(r=0;r<MAX_WINDOW_SIZE+1;r++){ | |
| 106 table[i][j][r].score = INT64_MIN; | |
| 107 table[i][j][r].xfrom = 10000000000; | |
| 108 table[i][j][r].yfrom = 10000000000; | |
| 109 } | |
| 110 */ | |
| 111 } | |
| 112 mc[i] = (struct positioned_cell *) malloc(MAX_READ_SIZE * sizeof(struct positioned_cell)); | |
| 113 my_x[i] = (unsigned char *) malloc(2*MAX_READ_SIZE * sizeof(unsigned char)); | |
| 114 my_y[i] = (unsigned char *) malloc(2*MAX_READ_SIZE * sizeof(unsigned char)); | |
| 115 reconstruct_X[i] = (char *) malloc(2*MAX_READ_SIZE * sizeof(char)); | |
| 116 reconstruct_Y[i] = (char *) malloc(2*MAX_READ_SIZE * sizeof(char)); | |
| 117 writing_buffer_alignment[i] = (char *) malloc(6*MAX_READ_SIZE*sizeof(char)); //6 times because of 2 times the length of the max of the read, and that happens 3 times (seqX,align,Y) | |
| 118 if(table[i] == NULL || mc[i] == NULL || my_x[i] == NULL || my_y[i] == NULL || reconstruct_X[i] == NULL || reconstruct_Y[i] == NULL || writing_buffer_alignment[i] == NULL) terror("Could not allocate buffer for alignment output"); | |
| 119 } | |
| 120 | |
| 121 | |
| 122 | |
| 123 end = clock(); | |
| 124 fprintf(stdout, "[INFO] Initialization took %e seconds.\n", (double)(end-begin)/CLOCKS_PER_SEC); | |
| 125 | |
| 126 //Variables to account for positions | |
| 127 //Print info | |
| 128 fprintf(stdout, "[INFO] Loading database.\n"); | |
| 129 //Variables to read kmers | |
| 130 char c = 'N'; //Char to read character | |
| 131 //Current length of array and variables for the buffer | |
| 132 uint64_t idx = 0, r = 0; | |
| 133 | |
| 134 //Vector to read in batches | |
| 135 char * temp_seq_buffer = NULL; | |
| 136 if ((temp_seq_buffer = calloc(READBUF, sizeof(char))) == NULL) { | |
| 137 terror("Could not allocate memory for read buffer"); | |
| 138 } | |
| 139 //Vector to store database seq | |
| 140 unsigned char ** seq_vector_database = (unsigned char **) malloc(FIXED_LOADING_THREADS*sizeof(unsigned char *)); | |
| 141 if(seq_vector_database == NULL) terror("Could not allocate memory for database vector"); | |
| 142 uint64_t ** database_positions = (uint64_t **) malloc(FIXED_LOADING_THREADS*sizeof(uint64_t)); | |
| 143 if(database_positions == NULL) terror("Could not allocate database sequences positions"); | |
| 144 | |
| 145 //Mempool_l * mp = (Mempool_l *) malloc(MAX_MEM_POOLS*sizeof(Mempool_l)); | |
| 146 //if(mp == NULL) terror("Could not allocate vectors for memory pools"); | |
| 147 //Mempool_l mp[FIXED_LOADING_THREADS][MAX_MEM_POOLS]; | |
| 148 | |
| 149 | |
| 150 Mempool_l ** mp = (Mempool_l **) malloc(FIXED_LOADING_THREADS*sizeof(Mempool_l *)); | |
| 151 if(mp == NULL) terror("Could not allocate memory pools"); | |
| 152 for(i=0; i<FIXED_LOADING_THREADS; i++){ | |
| 153 mp[i] = (Mempool_l *) malloc(MAX_MEM_POOLS*sizeof(Mempool_l)); | |
| 154 if(mp[i] == NULL) terror("Could not allocate individual memory pools"); | |
| 155 } | |
| 156 | |
| 157 Mempool_AVL ** mp_AVL = (Mempool_AVL **) malloc(FIXED_LOADING_THREADS*sizeof(Mempool_AVL *)); | |
| 158 if(mp_AVL == NULL) terror("Could not allocate memory AVL pools"); | |
| 159 for(i=0; i<FIXED_LOADING_THREADS; i++){ | |
| 160 mp_AVL[i] = (Mempool_AVL *) malloc(MAX_MEM_POOLS*sizeof(Mempool_AVL)); | |
| 161 if(mp_AVL[i] == NULL) terror("Could not allocate individual memory AVL pools"); | |
| 162 } | |
| 163 | |
| 164 | |
| 165 AVLContainer * ct_A = (AVLContainer *) calloc(1, sizeof(AVLContainer)); | |
| 166 if(ct_A == NULL) terror("Could not allocate container A"); | |
| 167 AVLContainer * ct_B = (AVLContainer *) calloc(1, sizeof(AVLContainer)); | |
| 168 if(ct_B == NULL) terror("Could not allocate container B"); | |
| 169 AVLContainer * ct_C = (AVLContainer *) calloc(1, sizeof(AVLContainer)); | |
| 170 if(ct_C == NULL) terror("Could not allocate container C"); | |
| 171 AVLContainer * ct_D = (AVLContainer *) calloc(1, sizeof(AVLContainer)); | |
| 172 if(ct_D == NULL) terror("Could not allocate container D"); | |
| 173 | |
| 174 | |
| 175 SeqInfo data_database[FIXED_LOADING_THREADS]; | |
| 176 uint64_t full_db_n_seqs = 0; | |
| 177 | |
| 178 | |
| 179 unsigned char curr_kmer[custom_kmer]; | |
| 180 unsigned char aux_kmer[custom_kmer+1]; | |
| 181 curr_kmer[0] = '\0'; | |
| 182 uint64_t word_size = 0; | |
| 183 | |
| 184 | |
| 185 SeqInfo data_query; | |
| 186 | |
| 187 //To force reading from the buffer | |
| 188 idx = READBUF + 1; | |
| 189 | |
| 190 //Vector to store query seq | |
| 191 unsigned char * seq_vector_query = (unsigned char *) malloc(READBUF*sizeof(unsigned char)); | |
| 192 if(seq_vector_query == NULL) terror("Could not allocate memory for query vector"); | |
| 193 uint64_t n_realloc_query = 1, pos_in_query = 0, n_seqs_query_realloc = 1; | |
| 194 uint64_t * query_positions = (uint64_t *) malloc(INITSEQS*sizeof(uint64_t)); | |
| 195 if(query_positions == NULL) terror("Could not allocate query sequences positions"); | |
| 196 | |
| 197 | |
| 198 | |
| 199 // Read number of sequences and load into RAM | |
| 200 begin = clock(); | |
| 201 fseek(database, 0L, SEEK_END); | |
| 202 uint64_t db_temp_size = ftell(database); | |
| 203 char * load_buffer = (char *) malloc(db_temp_size * sizeof(char)); | |
| 204 if(load_buffer == NULL) terror("Could not allocate intermediate buffer for threads sequence array"); | |
| 205 fseek(database, 0L, SEEK_SET); | |
| 206 | |
| 207 if(db_temp_size != fread(load_buffer, sizeof(char), db_temp_size, database)) terror("Could not read full sequence"); | |
| 208 | |
| 209 LoadingDBArgs args_DB_load[FIXED_LOADING_THREADS]; | |
| 210 | |
| 211 | |
| 212 args_DB_load[0].data_database = &data_database[0]; | |
| 213 args_DB_load[1].data_database = &data_database[1]; | |
| 214 args_DB_load[2].data_database = &data_database[2]; | |
| 215 args_DB_load[3].data_database = &data_database[3]; | |
| 216 | |
| 217 args_DB_load[0].ct = ct_A; | |
| 218 args_DB_load[1].ct = ct_B; | |
| 219 args_DB_load[2].ct = ct_C; | |
| 220 args_DB_load[3].ct = ct_D; | |
| 221 for(i=0; i<FIXED_LOADING_THREADS; i++){ | |
| 222 args_DB_load[i].read_to = 0; | |
| 223 args_DB_load[i].read_from = 0; | |
| 224 } | |
| 225 | |
| 226 //uint64_t a_fourth = db_temp_size / 4; | |
| 227 | |
| 228 | |
| 229 //get_num_seqs_and_length(load_buffer, &full_db_n_seqs, &db_temp_size, args_DB_load); | |
| 230 | |
| 231 end = clock(); | |
| 232 fprintf(stdout, "[INFO] Loading into RAM took %e seconds.\n", (double)(end-begin)/CLOCKS_PER_SEC); | |
| 233 begin = clock(); | |
| 234 | |
| 235 /* | |
| 236 char * temp_seq_buffer; | |
| 237 SeqInfo * data_database; | |
| 238 uint64_t t_len; | |
| 239 uint64_t word_size; | |
| 240 uint64_t read_from; | |
| 241 uint64_t read_to; | |
| 242 char thread_id; | |
| 243 Mempool_l * mp; | |
| 244 uint64_t n_pools_used; | |
| 245 */ | |
| 246 | |
| 247 // Launch threads to process database | |
| 248 | |
| 249 args_DB_load[0].thread_id = 'A'; | |
| 250 args_DB_load[1].thread_id = 'B'; | |
| 251 args_DB_load[2].thread_id = 'C'; | |
| 252 args_DB_load[3].thread_id = 'D'; | |
| 253 | |
| 254 | |
| 255 for(i=0; i<FIXED_LOADING_THREADS; i++){ | |
| 256 | |
| 257 //seq_vector_database[i] = (unsigned char *) malloc((args_DB_load[i].read_to - args_DB_load[i].read_from)*sizeof(unsigned char)); | |
| 258 seq_vector_database[i] = (unsigned char *) malloc((READBUF)*sizeof(unsigned char)); | |
| 259 //database_positions[i] = (uint64_t *) malloc((1+data_database[i].n_seqs)*sizeof(uint64_t)); | |
| 260 database_positions[i] = (uint64_t *) malloc((INITSEQS)*sizeof(uint64_t)); | |
| 261 if(seq_vector_database[i] == NULL || database_positions[i] == NULL) terror("Could not allocate memory for individual database vectors"); | |
| 262 data_database[i].sequences = seq_vector_database[i]; | |
| 263 | |
| 264 | |
| 265 //To hold all information related to database | |
| 266 args_DB_load[i].n_pools_used = 0; | |
| 267 args_DB_load[i].n_pools_used_AVL = 0; | |
| 268 | |
| 269 init_mem_pool_llpos(&mp[i][args_DB_load[i].n_pools_used]); | |
| 270 init_mem_pool_AVL(&mp_AVL[i][args_DB_load[i].n_pools_used_AVL]); | |
| 271 | |
| 272 data_database[i].start_pos = database_positions[i]; | |
| 273 | |
| 274 args_DB_load[i].n_allocs = 1; | |
| 275 args_DB_load[i].read_from = i * (db_temp_size / 4); | |
| 276 args_DB_load[i].read_to = (i+1) * (db_temp_size / 4); | |
| 277 args_DB_load[i].temp_seq_buffer = load_buffer; | |
| 278 args_DB_load[i].t_len = db_temp_size; | |
| 279 args_DB_load[i].word_size = custom_kmer; | |
| 280 args_DB_load[i].mp = mp[i]; | |
| 281 args_DB_load[i].mp_AVL = mp_AVL[i]; | |
| 282 | |
| 283 if( 0 != (error = pthread_create(&loading_threads[i], NULL, load_input, (void *) (&args_DB_load[i])) )){ | |
| 284 fprintf(stdout, "[@loading] Thread %"PRIu64" returned %d:", i, error); terror("Could not launch"); | |
| 285 } | |
| 286 } | |
| 287 | |
| 288 //Wait for threads to finish | |
| 289 for(i=0;i<FIXED_LOADING_THREADS;i++){ | |
| 290 pthread_join(loading_threads[i], NULL); | |
| 291 } | |
| 292 | |
| 293 // Deallocate memory not needed anymore | |
| 294 free(load_buffer); | |
| 295 free(loading_threads); | |
| 296 | |
| 297 | |
| 298 | |
| 299 //fprintf(stdout, "[INFO] WARNING!!!!!!!!! USING NON OVERLAPPING MERS, WHICH IS NOT INCLUDED AS OPTION!!!! DISABLE\n"); | |
| 300 | |
| 301 | |
| 302 | |
| 303 | |
| 304 | |
| 305 //data_database.total_len = pos_in_database; | |
| 306 //printf("tables have %"PRIu64" %"PRIu64", %"PRIu64" %"PRIu64"\n", data_database[0].total_len, data_database[1].total_len, data_database[2].total_len, data_database[3].total_len); | |
| 307 uint64_t full_db_len = data_database[0].total_len + data_database[1].total_len + data_database[2].total_len + data_database[3].total_len; | |
| 308 full_db_n_seqs = args_DB_load[0].contained_reads + args_DB_load[1].contained_reads + args_DB_load[2].contained_reads + args_DB_load[3].contained_reads; | |
| 309 | |
| 310 end = clock(); | |
| 311 fprintf(stdout, "[INFO] Database loaded and of length %"PRIu64" (%"PRIu64" sequences). Hash table building took %e seconds\n", full_db_len, full_db_n_seqs, (double)(end-begin)/CLOCKS_PER_SEC); | |
| 312 //close database | |
| 313 fclose(database); | |
| 314 | |
| 315 // Allocate marker tags to avoid repeting computation | |
| 316 | |
| 317 marker_taggs = (unsigned char **) malloc(n_threads * sizeof(unsigned char *)); | |
| 318 if(marker_taggs == NULL) terror("Could not allocate marker taggs"); | |
| 319 for(i=0;i<n_threads;i++){ | |
| 320 marker_taggs[i] = (unsigned char *) malloc(full_db_n_seqs); | |
| 321 if(marker_taggs[i] == NULL) terror("Could not allocate second loop of marker taggs"); | |
| 322 } | |
| 323 if(hits_only == TRUE){ | |
| 324 hits_table = (uint64_t **) calloc(n_threads, sizeof(uint64_t *)); | |
| 325 if(hits_table == NULL) terror("Could not allocate hits table"); | |
| 326 uint64_t z; | |
| 327 for(z=0;z<n_threads;z++){ | |
| 328 hits_table[z] = (uint64_t *) calloc(full_db_n_seqs, sizeof(uint64_t)); | |
| 329 if(hits_table[z] == NULL) terror("Could not allocate sub table of hits"); | |
| 330 } | |
| 331 } | |
| 332 | |
| 333 | |
| 334 begin = clock(); | |
| 335 | |
| 336 | |
| 337 | |
| 338 //To force reading from the buffer | |
| 339 idx = READBUF + 1; | |
| 340 | |
| 341 | |
| 342 data_query.sequences = seq_vector_query; | |
| 343 data_query.start_pos = query_positions; | |
| 344 data_query.total_len = 0; | |
| 345 data_query.n_seqs = 0; | |
| 346 | |
| 347 | |
| 348 //Print info | |
| 349 fprintf(stdout, "[INFO] Loading query.\n"); | |
| 350 | |
| 351 | |
| 352 c = buffered_fgetc(temp_seq_buffer, &idx, &r, query); | |
| 353 while((!feof(query) || (feof(query) && idx < r))){ | |
| 354 | |
| 355 if(c == '>'){ | |
| 356 data_query.start_pos[data_query.n_seqs++] = pos_in_query; | |
| 357 word_size = 0; | |
| 358 | |
| 359 if(pos_in_query == READBUF*n_realloc_query){ | |
| 360 n_realloc_query++; data_query.sequences = (unsigned char *) realloc(data_query.sequences, READBUF*n_realloc_query*sizeof(unsigned char)); | |
| 361 if(data_query.sequences == NULL) terror("Could not reallocate temporary query"); | |
| 362 } | |
| 363 | |
| 364 if(data_query.n_seqs == INITSEQS*n_seqs_query_realloc){ | |
| 365 n_seqs_query_realloc++; data_query.start_pos = (uint64_t *) realloc(data_query.start_pos, INITSEQS*n_seqs_query_realloc*sizeof(uint64_t)); | |
| 366 } | |
| 367 | |
| 368 | |
| 369 while(c != '\n'){ c = buffered_fgetc(temp_seq_buffer, &idx, &r, query); } //Skip ID | |
| 370 | |
| 371 | |
| 372 while(c != '>' && (!feof(query) || (feof(query) && idx < r))){ //Until next id | |
| 373 c = buffered_fgetc(temp_seq_buffer, &idx, &r, query); | |
| 374 c = toupper(c); | |
| 375 if(c == 'A' || c == 'C' || c == 'G' || c == 'T'){ | |
| 376 data_query.sequences[pos_in_query++] = (unsigned char) c; | |
| 377 curr_kmer[word_size++] = (unsigned char) c; | |
| 378 | |
| 379 if(word_size == custom_kmer){ | |
| 380 memcpy(aux_kmer, curr_kmer, custom_kmer); | |
| 381 aux_kmer[custom_kmer] = '\0'; | |
| 382 | |
| 383 memcpy(aux_kmer, &curr_kmer[1], custom_kmer-1); | |
| 384 memcpy(curr_kmer, aux_kmer, custom_kmer-1); | |
| 385 word_size--; | |
| 386 } | |
| 387 | |
| 388 if(pos_in_query == READBUF*n_realloc_query){ | |
| 389 n_realloc_query++; data_query.sequences = (unsigned char *) realloc(data_query.sequences, READBUF*n_realloc_query*sizeof(unsigned char)); | |
| 390 if(data_query.sequences == NULL) terror("Could not reallocate temporary query"); | |
| 391 } | |
| 392 }else{ | |
| 393 if(c != '\n' && c != '\r' && c != '>'){ | |
| 394 word_size = 0; | |
| 395 data_query.sequences[pos_in_query++] = (unsigned char) 'N'; //Convert to N | |
| 396 if(pos_in_query == READBUF*n_realloc_query){ | |
| 397 n_realloc_query++; data_query.sequences = (unsigned char *) realloc(data_query.sequences, READBUF*n_realloc_query*sizeof(unsigned char)); | |
| 398 if(data_query.sequences == NULL) terror("Could not reallocate temporary query"); | |
| 399 } | |
| 400 } | |
| 401 } | |
| 402 } | |
| 403 }else{ | |
| 404 c = buffered_fgetc(temp_seq_buffer, &idx, &r, query); | |
| 405 } | |
| 406 | |
| 407 } | |
| 408 | |
| 409 | |
| 410 | |
| 411 end = clock(); | |
| 412 | |
| 413 | |
| 414 data_query.total_len = pos_in_query; | |
| 415 | |
| 416 fprintf(stdout, "[INFO] Query loaded and of length %"PRIu64". Took %e seconds\n", data_query.total_len, (double)(end-begin)/CLOCKS_PER_SEC); | |
| 417 | |
| 418 begin = clock(); | |
| 419 | |
| 420 Head queue_head; | |
| 421 Queue * first_task = generate_queue(&queue_head, data_query.n_seqs, n_threads, n_parts); | |
| 422 | |
| 423 /* | |
| 424 Queue * traverse = queue_head.head; | |
| 425 while(traverse != NULL){ | |
| 426 printf("current_piece: %"PRIu64"-%"PRIu64"\n", traverse->r1, traverse->r2); | |
| 427 traverse = traverse->next; | |
| 428 } | |
| 429 getchar(); | |
| 430 */ | |
| 431 | |
| 432 | |
| 433 | |
| 434 //reads_per_thread = (uint64_t) (floorl((long double) data_query.n_seqs / (long double) n_threads)); | |
| 435 | |
| 436 fprintf(stdout, "[INFO] Computing alignments.\n"); | |
| 437 | |
| 438 | |
| 439 | |
| 440 /* | |
| 441 uint64_t z; | |
| 442 for(z=0; z<POOL_SIZE; z++){ | |
| 443 aux = mp[0].base + z; | |
| 444 fprintf(stdout, "%p\n", aux); | |
| 445 fflush(stdout); | |
| 446 } | |
| 447 */ | |
| 448 | |
| 449 // Make the full db | |
| 450 uint64_t contained_reads[FIXED_LOADING_THREADS] = {0,0,0,0}; | |
| 451 uint64_t base_coordinates[FIXED_LOADING_THREADS] = {0,0,0,0}; | |
| 452 //contained_reads[0] = 0; | |
| 453 //base_coordinates[0] = 0; | |
| 454 for(i=1;i<FIXED_LOADING_THREADS;i++){ | |
| 455 contained_reads[i] = args_DB_load[i-1].contained_reads + contained_reads[i-1]; | |
| 456 base_coordinates[i] = args_DB_load[i-1].base_coordinates + base_coordinates[i-1]; | |
| 457 } | |
| 458 | |
| 459 /* | |
| 460 for(i = 0; i < 4 ; i++){ | |
| 461 printf("total len: %"PRIu64"\n", data_database[i].total_len); | |
| 462 | |
| 463 } | |
| 464 getchar(); | |
| 465 */ | |
| 466 /* | |
| 467 for(i = 0; i < 4 ; i++){ | |
| 468 | |
| 469 printf("c:%"PRIu64" - b:%"PRIu64"\n", contained_reads[i], base_coordinates[i]); | |
| 470 } | |
| 471 getchar(); | |
| 472 */ | |
| 473 | |
| 474 | |
| 475 | |
| 476 | |
| 477 SeqInfo final_db; | |
| 478 final_db.sequences = (unsigned char *) malloc(full_db_len * sizeof(unsigned char)); | |
| 479 if(final_db.sequences == NULL) terror ("Could not allocate final database sequences"); | |
| 480 memcpy(&final_db.sequences[0], &data_database[0].sequences[0], data_database[0].total_len); | |
| 481 memcpy(&final_db.sequences[data_database[0].total_len], &data_database[1].sequences[0], data_database[1].total_len); | |
| 482 memcpy(&final_db.sequences[data_database[0].total_len+data_database[1].total_len], &data_database[2].sequences[0], data_database[2].total_len); | |
| 483 memcpy(&final_db.sequences[data_database[0].total_len+data_database[1].total_len+data_database[2].total_len], &data_database[3].sequences[0], data_database[3].total_len); | |
| 484 final_db.start_pos = (uint64_t *) malloc(full_db_n_seqs * sizeof(uint64_t)); | |
| 485 if(final_db.start_pos == NULL) terror("Could not allocate final db starting positions"); | |
| 486 // Copy them with offset | |
| 487 i=0; | |
| 488 while(i<data_database[0].n_seqs){ final_db.start_pos[i] = data_database[0].start_pos[i]; ++i; } | |
| 489 j=0; | |
| 490 //printf("switch\n"); | |
| 491 while(j<data_database[1].n_seqs){ final_db.start_pos[i] = data_database[1].start_pos[j] + data_database[0].total_len; ++i; ++j; } | |
| 492 j=0; | |
| 493 //printf("switch\n"); | |
| 494 while(j<data_database[2].n_seqs){ final_db.start_pos[i] = data_database[2].start_pos[j] + data_database[1].total_len + data_database[0].total_len; ++i; ++j; } | |
| 495 j=0; | |
| 496 //printf("switch\n"); | |
| 497 while(j<data_database[3].n_seqs){ final_db.start_pos[i] = data_database[3].start_pos[j] + data_database[2].total_len + data_database[1].total_len + data_database[0].total_len; ++i; ++j; } | |
| 498 | |
| 499 final_db.total_len = full_db_len; | |
| 500 final_db.n_seqs = full_db_n_seqs; | |
| 501 | |
| 502 for(i=0;i<FIXED_LOADING_THREADS;i++){ | |
| 503 free(data_database[i].sequences); | |
| 504 free(data_database[i].start_pos); | |
| 505 } | |
| 506 | |
| 507 | |
| 508 // Debug | |
| 509 /* | |
| 510 uint64_t max = 0; | |
| 511 for(i=0; i<full_db_n_seqs-2; i++){ | |
| 512 //printf("%"PRIu64" - %"PRIu64" res in \n", final_db.start_pos[i], final_db.start_pos[i+1]); | |
| 513 //printf("%"PRIu64"\n", final_db.start_pos[i+2] - final_db.start_pos[i+1]); | |
| 514 if(final_db.start_pos[i+2] - final_db.start_pos[i+1] > max) max = final_db.start_pos[i+2] - final_db.start_pos[i+1]; | |
| 515 //getchar(); | |
| 516 } | |
| 517 printf("%"PRIu64"\n", max); | |
| 518 */ | |
| 519 | |
| 520 | |
| 521 | |
| 522 | |
| 523 | |
| 524 for(i=0;i<n_threads;i++){ | |
| 525 hta[i].id = i; | |
| 526 hta[i].database = &final_db; | |
| 527 hta[i].query = &data_query; | |
| 528 | |
| 529 //hta[i].from = i * reads_per_thread; | |
| 530 //hta[i].to = (i + 1) * reads_per_thread; | |
| 531 hta[i].container_a = ct_A; | |
| 532 hta[i].container_b = ct_B; | |
| 533 hta[i].container_c = ct_C; | |
| 534 hta[i].container_d = ct_D; | |
| 535 hta[i].contained_reads = contained_reads; | |
| 536 hta[i].base_coordinates = base_coordinates; | |
| 537 hta[i].accepted_query_reads = 0; | |
| 538 hta[i].min_e_value = minevalue; | |
| 539 hta[i].min_coverage = mincoverage; | |
| 540 hta[i].min_identity = minidentity; | |
| 541 hta[i].out = out_database; | |
| 542 hta[i].igap = igap; | |
| 543 hta[i].egap = egap; | |
| 544 hta[i].window = window; | |
| 545 hta[i].mc = mc[i]; | |
| 546 hta[i].table = table[i]; | |
| 547 hta[i].reconstruct_X = reconstruct_X[i]; | |
| 548 hta[i].reconstruct_Y = reconstruct_Y[i]; | |
| 549 hta[i].writing_buffer_alignment = writing_buffer_alignment[i]; | |
| 550 hta[i].my_x = my_x[i]; | |
| 551 hta[i].my_y = my_y[i]; | |
| 552 hta[i].queue_head = &queue_head; | |
| 553 hta[i].lock = &lock; | |
| 554 hta[i].full_comp = full_comp; | |
| 555 hta[i].markers = marker_taggs[i]; | |
| 556 if(hits_only) hta[i].hits = hits_table[i]; else hta[i].hits = NULL; | |
| 557 | |
| 558 | |
| 559 //if(i==n_threads-1) hta[i].to = data_query.n_seqs; | |
| 560 | |
| 561 if( 0 != (error = pthread_create(&threads[i], NULL, computeAlignmentsByThread, (void *) (&hta[i])) )){ | |
| 562 fprintf(stdout, "Thread %"PRIu64" returned %d:", i, error); terror("Could not launch"); | |
| 563 } | |
| 564 } | |
| 565 | |
| 566 //Wait for threads to finish | |
| 567 for(i=0;i<n_threads;i++){ | |
| 568 pthread_join(threads[i], NULL); | |
| 569 } | |
| 570 | |
| 571 | |
| 572 for(i=0;i<n_threads;i++){ | |
| 573 sum_accepted_reads += hta[i].accepted_query_reads; | |
| 574 } | |
| 575 // Accumulate hits just in case | |
| 576 if(hits_only == TRUE){ | |
| 577 for(i=0;i<full_db_n_seqs;i++){ | |
| 578 for(j=1;j<n_threads;j++){ | |
| 579 hta[0].hits[i] += hta[j].hits[i]; | |
| 580 } | |
| 581 if(out_database != NULL){ | |
| 582 | |
| 583 if(i == final_db.n_seqs - 1){ | |
| 584 fprintf(out_database, "%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n", i, hta[0].hits[i], full_db_len - final_db.start_pos[i]); | |
| 585 }else{ | |
| 586 fprintf(out_database, "%"PRIu64"\t%"PRIu64"\t%"PRIu64"\n", i, hta[0].hits[i], final_db.start_pos[i+1] - final_db.start_pos[i]); | |
| 587 } | |
| 588 } | |
| 589 } | |
| 590 } | |
| 591 | |
| 592 | |
| 593 end = clock(); | |
| 594 fprintf(stdout, "[INFO] Alignments computed in %e seconds.\n", (double)(end-begin)/CLOCKS_PER_SEC); | |
| 595 fprintf(stdout, "[INFO] %"PRIu64" reads (%"PRIu64") from the query were found in the database (%"PRIu64") at a minimum e-value of %Le and minimum coverage of %d%%.\n", sum_accepted_reads, data_query.n_seqs, final_db.n_seqs, (long double)minevalue, (int) (100*mincoverage)); | |
| 596 fprintf(stdout, "[INFO] The Jaccard-index is: %Le\n", (long double)sum_accepted_reads/((final_db.n_seqs+data_query.n_seqs)-sum_accepted_reads)); | |
| 597 fprintf(stdout, "[INFO] Deallocating heap memory.\n"); | |
| 598 | |
| 599 fclose(query); | |
| 600 | |
| 601 if(out_database != NULL) fclose(out_database); | |
| 602 free(temp_seq_buffer); | |
| 603 | |
| 604 | |
| 605 if(hits_only == TRUE){ | |
| 606 // Write hits here | |
| 607 for(i=0;i<n_threads;i++){ | |
| 608 free(hits_table[i]); | |
| 609 } | |
| 610 free(hits_table); | |
| 611 } | |
| 612 | |
| 613 free(final_db.sequences); | |
| 614 free(final_db.start_pos); | |
| 615 free(data_query.sequences); | |
| 616 free(data_query.start_pos); | |
| 617 free(ct_A->root); | |
| 618 free(ct_B->root); | |
| 619 free(ct_C->root); | |
| 620 free(ct_D->root); | |
| 621 //free(ct); | |
| 622 free(threads); | |
| 623 free(hta); | |
| 624 | |
| 625 | |
| 626 | |
| 627 for(i=0;i<n_threads;i++){ | |
| 628 | |
| 629 for(j=0;j<MAX_READ_SIZE;j++){ | |
| 630 free(table[i][j]); | |
| 631 } | |
| 632 | |
| 633 free(table[i]); | |
| 634 | |
| 635 free(mc[i]); | |
| 636 free(reconstruct_X[i]); | |
| 637 free(reconstruct_Y[i]); | |
| 638 free(my_x[i]); | |
| 639 free(my_y[i]); | |
| 640 free(writing_buffer_alignment[i]); | |
| 641 | |
| 642 free(marker_taggs[i]); | |
| 643 | |
| 644 } | |
| 645 free(marker_taggs); | |
| 646 | |
| 647 | |
| 648 free(table); | |
| 649 free(mc); | |
| 650 free(reconstruct_X); | |
| 651 free(reconstruct_Y); | |
| 652 free(my_y); | |
| 653 free(my_x); | |
| 654 free(writing_buffer_alignment); | |
| 655 | |
| 656 pthread_mutex_destroy(&lock); | |
| 657 | |
| 658 | |
| 659 | |
| 660 for(i=0; i<FIXED_LOADING_THREADS; i++){ | |
| 661 for(j=0;j<=args_DB_load[i].n_pools_used;j++){ | |
| 662 free(mp[i][j].base); | |
| 663 } | |
| 664 free(mp[i]); | |
| 665 } | |
| 666 for(i=0; i<FIXED_LOADING_THREADS; i++){ | |
| 667 for(j=0;j<=args_DB_load[i].n_pools_used_AVL;j++){ | |
| 668 free(mp_AVL[i][j].base); | |
| 669 } | |
| 670 free(mp_AVL[i]); | |
| 671 } | |
| 672 free(mp); | |
| 673 free(mp_AVL); | |
| 674 free(seq_vector_database); | |
| 675 free(database_positions); | |
| 676 | |
| 677 //Deallocate queue (its allocated as an array) | |
| 678 free(first_task); | |
| 679 | |
| 680 //free(mp); | |
| 681 | |
| 682 return 0; | |
| 683 } | |
| 684 | |
| 685 void init_args(int argc, char ** av, FILE ** query, FILE ** database, FILE ** out_database, uint64_t * n_threads, long double * minevalue, long double * mincoverage, int * igap, int * egap, long double * minidentity, long double * window, unsigned char * full_comp, uint64_t * custom_kmer, unsigned char * hits_only, uint64_t * n_parts){ | |
| 686 | |
| 687 int pNum = 0; | |
| 688 while(pNum < argc){ | |
| 689 if(strcmp(av[pNum], "--verbose") == 0) VERBOSE_ACTIVE = 1; | |
| 690 if(strcmp(av[pNum], "--help") == 0){ | |
| 691 fprintf(stdout, "USAGE:\n"); | |
| 692 fprintf(stdout, " IMSAME -query [query] -db [database]\n"); | |
| 693 fprintf(stdout, "OPTIONAL:\n"); | |
| 694 fprintf(stdout, " -n_threads [Integer: 0<n_threads] (default 4)\n"); | |
| 695 fprintf(stdout, " -evalue [Double: 0<=pval<1] (default: 1 * 10^-10)\n"); | |
| 696 fprintf(stdout, " -coverage [Double: 0<coverage<=1 (default: 0.5)\n"); | |
| 697 fprintf(stdout, " -identity [Double: 0<identity<=1 (default: 0.5)\n"); | |
| 698 fprintf(stdout, " -igap [Integer: (default: 5)\n"); | |
| 699 fprintf(stdout, " -egap [Integer: (default: 2)\n"); | |
| 700 fprintf(stdout, " -window [Double: 0<window<=0.5 (default: 0.15)\n"); | |
| 701 fprintf(stdout, " -kmer [Integer: k>1 (default 12)]\n"); | |
| 702 fprintf(stdout, " -n_parts [Integer: n>0 (default 3)]\n"); | |
| 703 fprintf(stdout, " -out [File path]\n"); | |
| 704 fprintf(stdout, " --full Does not stop at first match and reports all equalities\n"); | |
| 705 fprintf(stdout, " --verbose Turns verbose on\n"); | |
| 706 fprintf(stdout, " --help Shows help for program usage\n"); | |
| 707 fprintf(stdout, " --hits Compute only the non-overlapping hits\n"); | |
| 708 exit(1); | |
| 709 } | |
| 710 if(strcmp(av[pNum], "--full") == 0) *full_comp = TRUE; | |
| 711 if(strcmp(av[pNum], "--hits") == 0) *hits_only = TRUE; | |
| 712 if(strcmp(av[pNum], "-n_parts") == 0) *n_parts = (uint64_t) atoi(av[pNum+1]); | |
| 713 if(strcmp(av[pNum], "-query") == 0){ | |
| 714 *query = fopen64(av[pNum+1], "rt"); | |
| 715 if(query==NULL) terror("Could not open query file"); | |
| 716 } | |
| 717 if(strcmp(av[pNum], "-db") == 0){ | |
| 718 *database = fopen64(av[pNum+1], "rt"); | |
| 719 if(database==NULL) terror("Could not open database file"); | |
| 720 } | |
| 721 if(strcmp(av[pNum], "-out") == 0){ | |
| 722 *out_database = fopen64(av[pNum+1], "wt"); | |
| 723 if(out_database==NULL) terror("Could not open output database file"); | |
| 724 } | |
| 725 if(strcmp(av[pNum], "-evalue") == 0){ | |
| 726 *minevalue = (long double) atof(av[pNum+1]); | |
| 727 if(*minevalue < 0) terror("Min-e-value must be larger than zero"); | |
| 728 } | |
| 729 if(strcmp(av[pNum], "-window") == 0){ | |
| 730 *window = (long double) atof(av[pNum+1]); | |
| 731 if(*window <= 0 || *window > 0.5) terror("Window percentage size must lie between 0<window<=0.5"); | |
| 732 } | |
| 733 if(strcmp(av[pNum], "-coverage") == 0){ | |
| 734 *mincoverage = (long double) atof(av[pNum+1]); | |
| 735 if(*mincoverage <= 0) terror("Min-coverage must be larger than zero"); | |
| 736 } | |
| 737 if(strcmp(av[pNum], "-identity") == 0){ | |
| 738 *minidentity = (long double) atof(av[pNum+1]); | |
| 739 if(*minidentity <= 0) terror("Min-identity must be larger than zero"); | |
| 740 } | |
| 741 if(strcmp(av[pNum], "-igap") == 0){ | |
| 742 *igap = - (atoi(av[pNum+1])); | |
| 743 } | |
| 744 if(strcmp(av[pNum], "-egap") == 0){ | |
| 745 *egap = - (atoi(av[pNum+1])); | |
| 746 } | |
| 747 if(strcmp(av[pNum], "-n_threads") == 0){ | |
| 748 *n_threads = (uint64_t) atoi(av[pNum+1]); | |
| 749 if(*n_threads < 0) terror("Number of threads must be larger than zero"); | |
| 750 } | |
| 751 if(strcmp(av[pNum], "-kmer") == 0){ | |
| 752 *custom_kmer = (uint64_t) atoi(av[pNum+1]); | |
| 753 if(*custom_kmer < 2) terror("K-mer size must be larger than 1"); | |
| 754 } | |
| 755 pNum++; | |
| 756 } | |
| 757 | |
| 758 if(*query==NULL || *database==NULL) terror("A query and database is required"); | |
| 759 } | |
| 760 |
