Mercurial > repos > alvarofaure > bitlab
comparison chromeister/src/CHROMEISTER.c @ 1:3d1fbde7e0cc draft default tip
Deleted selected files
| author | alvarofaure | 
|---|---|
| date | Thu, 13 Dec 2018 03:41:58 -0500 | 
| parents | 7fdf47a0bae8 | 
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 0:7fdf47a0bae8 | 1:3d1fbde7e0cc | 
|---|---|
| 1 /********* | |
| 2 | |
| 3 File CHROMEISTER.c | |
| 4 Author EPW <estebanpw@uma.es> | |
| 5 Description Computes hits and generates a dotplot | |
| 6 | |
| 7 USAGE Usage is described by calling ./CHROMEISTER --help | |
| 8 | |
| 9 | |
| 10 | |
| 11 **********/ | |
| 12 | |
| 13 | |
| 14 #include <stdio.h> | |
| 15 #include <stdlib.h> | |
| 16 #include <math.h> | |
| 17 #include <string.h> | |
| 18 #include <ctype.h> | |
| 19 #include "structs.h" | |
| 20 #include "alignmentFunctions.h" | |
| 21 #include "commonFunctions.h" | |
| 22 #define STARTING_SEQS 1000 | |
| 23 #define PIECE_OF_DB_REALLOC 3200000 //half a gigabyte if divided by 8 bytes | |
| 24 #define RANGE 2 | |
| 25 | |
| 26 uint64_t custom_kmer = 32; // Defined as external in structs.h | |
| 27 uint64_t diffuse_z = 4; // Defined as external in structs.h | |
| 28 | |
| 29 uint64_t get_seq_len(FILE * f); | |
| 30 | |
| 31 | |
| 32 void init_args(int argc, char ** av, FILE ** query, FILE ** database, FILE ** out_database, uint64_t * custom_kmer, uint64_t * dimension, uint64_t * diffuse_z); | |
| 33 | |
| 34 int main(int argc, char ** av){ | |
| 35 | |
| 36 | |
| 37 /* | |
| 38 //Store positions of kmers | |
| 39 uint64_t n_pools_used = 0; | |
| 40 //Mempool_l * mp = (Mempool_l *) malloc(MAX_MEM_POOLS*sizeof(Mempool_l)); | |
| 41 //if(mp == NULL) terror("Could not allocate vectors for memory pools"); | |
| 42 Mempool_l mp[MAX_MEM_POOLS]; | |
| 43 init_mem_pool_llpos(&mp[n_pools_used]); | |
| 44 //llpos * aux; | |
| 45 | |
| 46 uint64_t n_pools_used_AVL = 0; | |
| 47 Mempool_AVL mp_AVL[MAX_MEM_POOLS]; | |
| 48 init_mem_pool_AVL(&mp_AVL[n_pools_used_AVL]); | |
| 49 */ | |
| 50 Tuple_hits * thit; | |
| 51 | |
| 52 /* | |
| 53 AVLTree * root = NULL; | |
| 54 root = insert_AVLTree(root, 10, mp_AVL, &n_pools_used_AVL, 0, mp, &n_pools_used); | |
| 55 | |
| 56 llpos * some = find_AVLTree(root, 25); | |
| 57 while(some != NULL){ | |
| 58 printf("#%"PRIu64", ", some->pos); some = some->next; | |
| 59 } | |
| 60 */ | |
| 61 | |
| 62 uint64_t i, j; | |
| 63 | |
| 64 //query to read kmers from, database to find seeds | |
| 65 FILE * query = NULL, * database = NULL, * out_database = NULL; | |
| 66 uint64_t dimension = 1000; // Default 1000 * 1000 | |
| 67 | |
| 68 | |
| 69 init_args(argc, av, &query, &database, &out_database, &custom_kmer, &dimension, &diffuse_z); | |
| 70 | |
| 71 | |
| 72 | |
| 73 unsigned char char_converter[91]; | |
| 74 char_converter[(unsigned char)'A'] = 0; | |
| 75 char_converter[(unsigned char)'C'] = 1; | |
| 76 char_converter[(unsigned char)'G'] = 2; | |
| 77 char_converter[(unsigned char)'T'] = 3; | |
| 78 | |
| 79 | |
| 80 // Variables to account for positions | |
| 81 // Print info | |
| 82 fprintf(stdout, "[INFO] Loading database\n"); | |
| 83 // Variables to read kmers | |
| 84 char c = 'N'; //Char to read character | |
| 85 // Current length of array and variables for the buffer | |
| 86 uint64_t idx = 0, r = 0; | |
| 87 | |
| 88 //Vector to read in batches | |
| 89 char * temp_seq_buffer = NULL; | |
| 90 if ((temp_seq_buffer = calloc(READBUF, sizeof(char))) == NULL) { | |
| 91 terror("Could not allocate memory for read buffer"); | |
| 92 } | |
| 93 | |
| 94 //Dimensional matrix | |
| 95 uint64_t ** representation = (uint64_t **) calloc(dimension+1, sizeof(uint64_t *)); | |
| 96 if(representation == NULL) terror("Could not allocate representation"); | |
| 97 for(i=0; i<dimension+1; i++){ | |
| 98 representation[i] = (uint64_t *) calloc(dimension+1, sizeof(uint64_t)); | |
| 99 if(representation[i] == NULL) terror("Could not allocate second loop representation"); | |
| 100 } | |
| 101 | |
| 102 /* | |
| 103 fseek(database, 0, SEEK_END); | |
| 104 uint64_t aprox_len_query = ftell(database); | |
| 105 uint64_t aprox_len_db = aprox_len_query; | |
| 106 rewind(database); | |
| 107 */ | |
| 108 uint64_t aprox_len_query = get_seq_len(database); | |
| 109 uint64_t aprox_len_db = aprox_len_query; | |
| 110 | |
| 111 uint64_t a_hundreth = (aprox_len_query/100); | |
| 112 | |
| 113 unsigned char curr_kmer[custom_kmer], reverse_kmer[custom_kmer]; | |
| 114 curr_kmer[0] = reverse_kmer[0] = '\0'; | |
| 115 uint64_t word_size = 0, word_size_rev = 0; | |
| 116 | |
| 117 //To hold all information related to database | |
| 118 uint64_t current_len = 0; | |
| 119 | |
| 120 //To force reading from the buffer | |
| 121 idx = READBUF + 1; | |
| 122 | |
| 123 //unsigned char aux_kmer[custom_kmer+1]; | |
| 124 | |
| 125 //Vector to store query seq | |
| 126 unsigned char * seq_vector_query = (unsigned char *) malloc(READBUF*sizeof(unsigned char)); | |
| 127 if(seq_vector_query == NULL) terror("Could not allocate memory for query vector"); | |
| 128 | |
| 129 /* | |
| 130 Container * ct = (Container *) calloc(1, sizeof(Container)); | |
| 131 if(ct == NULL) terror("Could not allocate container"); | |
| 132 */ | |
| 133 | |
| 134 | |
| 135 | |
| 136 Index * ctidx = (Index *) calloc(1, sizeof(Index)); | |
| 137 if(ctidx == NULL) terror("Could not allocate container"); | |
| 138 | |
| 139 | |
| 140 //begin = clock(); | |
| 141 | |
| 142 | |
| 143 c = buffered_fgetc(temp_seq_buffer, &idx, &r, database); | |
| 144 while((!feof(database) || (feof(database) && idx < r))){ | |
| 145 | |
| 146 if(c == '>'){ | |
| 147 | |
| 148 | |
| 149 | |
| 150 while(c != '\n') c = buffered_fgetc(temp_seq_buffer, &idx, &r, database); //Skip ID | |
| 151 | |
| 152 | |
| 153 while(c != '>' && (!feof(database) || (feof(database) && idx < r))){ //Until next id | |
| 154 c = buffered_fgetc(temp_seq_buffer, &idx, &r, database); | |
| 155 c = toupper(c); | |
| 156 if(c == 'A' || c == 'C' || c == 'G' || c == 'T'){ | |
| 157 curr_kmer[word_size] = (unsigned char) c; | |
| 158 if(word_size < custom_kmer) ++word_size; | |
| 159 ++current_len; | |
| 160 if(current_len % a_hundreth == 0){ | |
| 161 fprintf(stdout, "\r%"PRIu64"%%...", 1+100*current_len/aprox_len_query); | |
| 162 //printf("%"PRIu64"%%..wasted: (%e) (%e)", 1+100*pos_in_query/aprox_len_query, (double)(wasted_cycles_forward)/CLOCKS_PER_SEC, (double)(wasted_cycles_reverse)/CLOCKS_PER_SEC); | |
| 163 fflush(stdout); | |
| 164 } | |
| 165 | |
| 166 | |
| 167 | |
| 168 }else{ //It can be anything (including N, Y, X ...) | |
| 169 | |
| 170 if(c != '\n' && c != '>'){ | |
| 171 word_size = 0; | |
| 172 // data_database.sequences[pos_in_database++] = (unsigned char) 'N'; //Convert to N | |
| 173 ++current_len; | |
| 174 | |
| 175 } | |
| 176 } | |
| 177 //if(current_len % 1000000 == 0) printf(" curr len %" PRIu64"\n", current_len); | |
| 178 if(word_size == custom_kmer){ | |
| 179 //write to hash table | |
| 180 | |
| 181 | |
| 182 thit = &ctidx->table[char_converter[curr_kmer[0]]][char_converter[curr_kmer[1]]][char_converter[curr_kmer[2]]] | |
| 183 [char_converter[curr_kmer[3]]][char_converter[curr_kmer[4]]][char_converter[curr_kmer[5]]] | |
| 184 [char_converter[curr_kmer[6]]][char_converter[curr_kmer[7]]][char_converter[curr_kmer[8]]] | |
| 185 [char_converter[curr_kmer[9]]][char_converter[curr_kmer[10]]][char_converter[curr_kmer[11]]]; | |
| 186 | |
| 187 /* | |
| 188 typedef struct tuple_hits{ | |
| 189 int repetition; | |
| 190 int hit_count; | |
| 191 uint64_t key; | |
| 192 uint64_t pos; | |
| 193 } Tuple_hits; | |
| 194 */ | |
| 195 | |
| 196 if(thit->repetition == FALSE){ | |
| 197 // Then we can insert | |
| 198 thit->hit_count = 0; | |
| 199 thit->key = collisioned_hash(&curr_kmer[0], custom_kmer); | |
| 200 thit->pos = current_len; | |
| 201 }else{ | |
| 202 // Otherwise we break it | |
| 203 thit->repetition = TRUE; | |
| 204 } | |
| 205 | |
| 206 //thit->root = insert_AVLTree(thit->root, hashOfWord(&curr_kmer[0], custom_kmer, FIXED_K), mp_AVL, &n_pools_used_AVL, current_len, mp, &n_pools_used); | |
| 207 //thit->root = insert_AVLTree(thit->root, collisioned_hash(&curr_kmer[0], custom_kmer), mp_AVL, &n_pools_used_AVL, current_len, mp, &n_pools_used); | |
| 208 | |
| 209 | |
| 210 | |
| 211 // Non overlapping | |
| 212 word_size = 0; | |
| 213 | |
| 214 | |
| 215 // Overlapping | |
| 216 //memmove(&curr_kmer[0], &curr_kmer[1], custom_kmer-1); | |
| 217 //--word_size; | |
| 218 } | |
| 219 } | |
| 220 word_size = 0; | |
| 221 | |
| 222 }else{ | |
| 223 c = buffered_fgetc(temp_seq_buffer, &idx, &r, database); | |
| 224 } | |
| 225 | |
| 226 } | |
| 227 | |
| 228 | |
| 229 //end = clock(); | |
| 230 | |
| 231 // data_database.total_len = pos_in_database; | |
| 232 | |
| 233 //fprintf(stdout, "[INFO] Database loaded and of length %"PRIu64". Hash table building took %e seconds\n", data_database.total_len, (double)(end-begin)/CLOCKS_PER_SEC); | |
| 234 fprintf(stdout, "[INFO] Database loaded and of length %"PRIu64".\n", current_len); | |
| 235 //close database | |
| 236 fclose(database); | |
| 237 | |
| 238 | |
| 239 | |
| 240 //begin = clock(); | |
| 241 | |
| 242 | |
| 243 | |
| 244 | |
| 245 | |
| 246 double pixel_size_db = (double) dimension / (double) current_len; | |
| 247 double ratio_db = (double) current_len / dimension; | |
| 248 | |
| 249 | |
| 250 // Get file length | |
| 251 | |
| 252 //fseek(query, 0, SEEK_END); | |
| 253 //aprox_len_query = ftell(query); | |
| 254 //rewind(query); | |
| 255 aprox_len_query = get_seq_len(query); | |
| 256 | |
| 257 //uint64_t reallocs_hash_holder_table = 1; | |
| 258 //uint64_t n_items_hash_holder_table = aprox_len_query / 5; | |
| 259 | |
| 260 //Hash_holder * hash_holder_table = (Hash_holder *) calloc(n_items_hash_holder_table, sizeof(Hash_holder)); | |
| 261 //if(hash_holder_table == NULL) terror("Could not allocate hash holding table"); | |
| 262 | |
| 263 a_hundreth = (aprox_len_query/100); | |
| 264 double pixel_size_query = (double) dimension / (double) aprox_len_query; | |
| 265 double ratio_query = (double) aprox_len_query / dimension; | |
| 266 | |
| 267 | |
| 268 double i_r_fix = MAX(1.0, custom_kmer * pixel_size_query); | |
| 269 double j_r_fix = MAX(1.0, custom_kmer * pixel_size_db); | |
| 270 | |
| 271 | |
| 272 | |
| 273 fprintf(stdout, "[INFO] Ratios: Q [%e] D [%e]. Lenghts: Q [%"PRIu64"] D [%"PRIu64"]\n", ratio_query, ratio_db, aprox_len_query, current_len); | |
| 274 fprintf(stdout, "[INFO] Pixel size: Q [%e] D [%e].\n", pixel_size_query, pixel_size_db); | |
| 275 | |
| 276 | |
| 277 fprintf(stdout, "[INFO] Computing absolute hit numbers.\n"); | |
| 278 | |
| 279 | |
| 280 current_len = 0; | |
| 281 | |
| 282 //llpos * the_original_hit; | |
| 283 | |
| 284 //To force reading from the buffer | |
| 285 idx = READBUF + 1; | |
| 286 c = buffered_fgetc(temp_seq_buffer, &idx, &r, query); | |
| 287 //uint64_t c_hash_holder = 0; | |
| 288 | |
| 289 while((!feof(query) || (feof(query) && idx < r))){ | |
| 290 | |
| 291 if(c == '>'){ | |
| 292 word_size = 0; | |
| 293 word_size_rev = custom_kmer-1; | |
| 294 | |
| 295 | |
| 296 | |
| 297 | |
| 298 while(c != '\n'){ c = buffered_fgetc(temp_seq_buffer, &idx, &r, query); } //Skip ID | |
| 299 | |
| 300 | |
| 301 while(c != '>' && (!feof(query) || (feof(query) && idx < r))){ //Until next id | |
| 302 c = buffered_fgetc(temp_seq_buffer, &idx, &r, query); | |
| 303 c = toupper(c); | |
| 304 if(c == 'A' || c == 'C' || c == 'G' || c == 'T'){ | |
| 305 | |
| 306 ++current_len; | |
| 307 if(current_len % a_hundreth == 0){ | |
| 308 fprintf(stdout, "\r%"PRIu64"%%...", 1+100*current_len/aprox_len_query); | |
| 309 fflush(stdout); | |
| 310 } | |
| 311 curr_kmer[word_size] = (unsigned char) c; | |
| 312 ++word_size; | |
| 313 | |
| 314 switch(c){ | |
| 315 case ('A'): reverse_kmer[word_size_rev] = (unsigned)'T'; | |
| 316 break; | |
| 317 case ('C'): reverse_kmer[word_size_rev] = (unsigned)'G'; | |
| 318 break; | |
| 319 case ('G'): reverse_kmer[word_size_rev] = (unsigned)'C'; | |
| 320 break; | |
| 321 case ('T'): reverse_kmer[word_size_rev] = (unsigned)'A'; | |
| 322 break; | |
| 323 } | |
| 324 if(word_size_rev != 0) --word_size_rev; | |
| 325 | |
| 326 | |
| 327 | |
| 328 | |
| 329 if(word_size == custom_kmer){ | |
| 330 | |
| 331 | |
| 332 //hash_forward = hashOfWord(&curr_kmer[0], custom_kmer, FIXED_K); | |
| 333 //hash_reverse = hashOfWord(&reverse_kmer[0], custom_kmer, FIXED_K); | |
| 334 uint64_t hash_forward, hash_reverse; | |
| 335 hash_forward = collisioned_hash(&curr_kmer[0], custom_kmer); | |
| 336 hash_reverse = collisioned_hash(&reverse_kmer[0], custom_kmer); | |
| 337 | |
| 338 | |
| 339 thit = &ctidx->table[char_converter[curr_kmer[0]]][char_converter[curr_kmer[1]]][char_converter[curr_kmer[2]]] | |
| 340 [char_converter[curr_kmer[3]]][char_converter[curr_kmer[4]]][char_converter[curr_kmer[5]]] | |
| 341 [char_converter[curr_kmer[6]]][char_converter[curr_kmer[7]]][char_converter[curr_kmer[8]]] | |
| 342 [char_converter[curr_kmer[9]]][char_converter[curr_kmer[10]]][char_converter[curr_kmer[11]]]; | |
| 343 | |
| 344 //AVLTree * search = find_AVLTree(thit->root, hash_forward); | |
| 345 | |
| 346 if(thit->repetition == FALSE && hash_forward == thit->key){ | |
| 347 // Attention ::::: you were not removing the ones with count==1 earlier | |
| 348 thit->pos_in_y = current_len; | |
| 349 thit->hit_count++; | |
| 350 } | |
| 351 | |
| 352 thit = &ctidx->table[char_converter[reverse_kmer[0]]][char_converter[reverse_kmer[1]]][char_converter[reverse_kmer[2]]] | |
| 353 [char_converter[reverse_kmer[3]]][char_converter[reverse_kmer[4]]][char_converter[reverse_kmer[5]]] | |
| 354 [char_converter[reverse_kmer[6]]][char_converter[reverse_kmer[7]]][char_converter[reverse_kmer[8]]] | |
| 355 [char_converter[reverse_kmer[9]]][char_converter[reverse_kmer[10]]][char_converter[reverse_kmer[11]]]; | |
| 356 | |
| 357 if(thit->repetition == FALSE && hash_reverse == thit->key){ | |
| 358 // Attention ::::: you were not removing the ones with count==1 earlier | |
| 359 thit->pos_in_y = current_len; | |
| 360 thit->hit_count++; | |
| 361 } | |
| 362 | |
| 363 /* | |
| 364 if(search != NULL && search->count == 1){ //If count is two, then it is a rep | |
| 365 thit->hit_count += search->count; | |
| 366 | |
| 367 hash_holder_table[c_hash_holder].pos = current_len; | |
| 368 hash_holder_table[c_hash_holder].node = search; | |
| 369 hash_holder_table[c_hash_holder].th = thit; | |
| 370 ++c_hash_holder; | |
| 371 if(c_hash_holder == n_items_hash_holder_table*reallocs_hash_holder_table){ | |
| 372 ++reallocs_hash_holder_table; | |
| 373 hash_holder_table = (Hash_holder *) realloc(hash_holder_table, n_items_hash_holder_table*reallocs_hash_holder_table*sizeof(Hash_holder)); | |
| 374 if(hash_holder_table == NULL) terror("Could not realloc hash holder table"); | |
| 375 } | |
| 376 } | |
| 377 */ | |
| 378 | |
| 379 | |
| 380 | |
| 381 | |
| 382 | |
| 383 //search = find_AVLTree(thit->root, hash_reverse); | |
| 384 /* | |
| 385 if(search != NULL && search->count == 1){ //If count is two, then it is a rep | |
| 386 | |
| 387 thit->hit_count += search->count; | |
| 388 hash_holder_table[c_hash_holder].pos = current_len; | |
| 389 hash_holder_table[c_hash_holder].node = search; | |
| 390 hash_holder_table[c_hash_holder].th = thit; | |
| 391 ++c_hash_holder; | |
| 392 if(c_hash_holder == n_items_hash_holder_table*reallocs_hash_holder_table){ | |
| 393 ++reallocs_hash_holder_table; | |
| 394 hash_holder_table = (Hash_holder *) realloc(hash_holder_table, n_items_hash_holder_table*reallocs_hash_holder_table*sizeof(Hash_holder)); | |
| 395 if(hash_holder_table == NULL) terror("Could not realloc hash holder table"); | |
| 396 } | |
| 397 } | |
| 398 */ | |
| 399 | |
| 400 // Overlapping | |
| 401 | |
| 402 memmove(&curr_kmer[0], &curr_kmer[1], custom_kmer-1); | |
| 403 memmove(&reverse_kmer[1], &reverse_kmer[0], custom_kmer-1); | |
| 404 --word_size; | |
| 405 | |
| 406 // Non overlapping | |
| 407 //word_size = 0; | |
| 408 //word_size_rev = custom_kmer-1; | |
| 409 } | |
| 410 }else{ | |
| 411 if(c != '\n' && c != '>'){ | |
| 412 word_size = 0; | |
| 413 word_size_rev = custom_kmer-1; | |
| 414 ++current_len; | |
| 415 } | |
| 416 } | |
| 417 } | |
| 418 }else{ | |
| 419 c = buffered_fgetc(temp_seq_buffer, &idx, &r, query); | |
| 420 } | |
| 421 | |
| 422 } | |
| 423 | |
| 424 /// Out | |
| 425 | |
| 426 fprintf(stdout, "Scanning hits table.\n"); | |
| 427 | |
| 428 a_hundreth = MAX(1, TOTAL_ENTRIES/100); | |
| 429 uint64_t t_computed = 0; | |
| 430 uint64_t w0,w1,w2,w3,w4,w5,w6,w7,w8,w9,w10,w11; | |
| 431 for(w0=0;w0<4;w0++){ | |
| 432 for(w1=0;w1<4;w1++){ | |
| 433 for(w2=0;w2<4;w2++){ | |
| 434 for(w3=0;w3<4;w3++){ | |
| 435 for(w4=0;w4<4;w4++){ | |
| 436 for(w5=0;w5<4;w5++){ | |
| 437 for(w6=0;w6<4;w6++){ | |
| 438 for(w7=0;w7<4;w7++){ | |
| 439 for(w8=0;w8<4;w8++){ | |
| 440 for(w9=0;w9<4;w9++){ | |
| 441 for(w10=0;w10<4;w10++){ | |
| 442 for(w11=0;w11<4;w11++){ | |
| 443 | |
| 444 if(t_computed % a_hundreth == 0){ | |
| 445 fprintf(stdout, "\r%"PRIu64"%%...", 1+100*t_computed/TOTAL_ENTRIES); | |
| 446 fflush(stdout); | |
| 447 } | |
| 448 ++t_computed; | |
| 449 Tuple_hits * taux = &ctidx->table[w0][w1][w2][w3][w4][w5][w6][w7][w8][w9][w10][w11]; | |
| 450 if(taux->hit_count == 1){ | |
| 451 // We plot it | |
| 452 // Convert scale to representation | |
| 453 uint64_t redir_db = (uint64_t) (taux->pos / (ratio_db)); | |
| 454 uint64_t redir_query = (uint64_t) (taux->pos_in_y / (ratio_query)); | |
| 455 double i_r = i_r_fix; double j_r = j_r_fix; | |
| 456 while((uint64_t) i_r >= 1 && (uint64_t) j_r >= 1){ | |
| 457 if((int64_t) redir_query - (int64_t) i_r > 0 && (int64_t) redir_db - (int64_t) j_r > 0){ | |
| 458 representation[(int64_t) redir_query - (int64_t) i_r][(int64_t) redir_db - (int64_t) j_r]++; | |
| 459 }else{ | |
| 460 representation[redir_query][redir_db]++; | |
| 461 break; | |
| 462 } | |
| 463 i_r -= MIN(1.0, pixel_size_query); | |
| 464 j_r -= MIN(1.0, pixel_size_db); | |
| 465 } | |
| 466 } | |
| 467 } | |
| 468 } | |
| 469 } | |
| 470 } | |
| 471 } | |
| 472 } | |
| 473 } | |
| 474 } | |
| 475 } | |
| 476 } | |
| 477 } | |
| 478 } | |
| 479 | |
| 480 | |
| 481 //double average_hit = ((double) total_hits / (double) table_size); | |
| 482 //average_hit = 2.2; | |
| 483 | |
| 484 /* | |
| 485 //fprintf(stdout, "[INFO] Total hit count is %"PRIu64" on a size of %"PRIu64" Avg = %e.\n", total_hits, table_size, average_hit); | |
| 486 fprintf(stdout, "[INFO] Total hit count is %"PRIu64" on a size of %"PRIu64" E' = %e.\n", total_hits, table_size, Eprime); | |
| 487 | |
| 488 | |
| 489 | |
| 490 | |
| 491 | |
| 492 a_hundreth = MAX(1,c_hash_holder/100); | |
| 493 | |
| 494 for(current_len = 0; current_len < c_hash_holder; current_len++){ | |
| 495 | |
| 496 if(current_len % a_hundreth == 0){ | |
| 497 fprintf(stdout, "\r%"PRIu64"%%...", 1+100*current_len/c_hash_holder); | |
| 498 fflush(stdout); | |
| 499 } | |
| 500 | |
| 501 aux = hash_holder_table[current_len].node->next; | |
| 502 | |
| 503 //if(hash_holder_table[current_len].th->hit_count < (uint64_t) average_hit){ | |
| 504 if(hash_holder_table[current_len].th->hit_count < (uint64_t) Eprime){ | |
| 505 while(aux != NULL){ | |
| 506 // Convert scale to representation | |
| 507 uint64_t redir_db = (uint64_t) (aux->pos / (ratio_db)); | |
| 508 uint64_t redir_query = (uint64_t) (hash_holder_table[current_len].pos / (ratio_query)); | |
| 509 double i_r = i_r_fix; double j_r = j_r_fix; | |
| 510 while((uint64_t) i_r >= 1 && (uint64_t) j_r >= 1){ | |
| 511 if((int64_t) redir_query - (int64_t) i_r > 0 && (int64_t) redir_db - (int64_t) j_r > 0){ | |
| 512 representation[(int64_t) redir_query - (int64_t) i_r][(int64_t) redir_db - (int64_t) j_r]++; | |
| 513 }else{ | |
| 514 representation[redir_query][redir_db]++; | |
| 515 break; | |
| 516 } | |
| 517 i_r -= MIN(1.0, pixel_size_query); | |
| 518 j_r -= MIN(1.0, pixel_size_db); | |
| 519 } | |
| 520 aux = aux->next; | |
| 521 } | |
| 522 } | |
| 523 } | |
| 524 */ | |
| 525 | |
| 526 | |
| 527 //end = clock(); | |
| 528 | |
| 529 | |
| 530 | |
| 531 | |
| 532 //fprintf(stdout, "\n[INFO] Query length %"PRIu64". Hits completed. Took %e seconds\n", data_query.total_len, (double)(end-begin)/CLOCKS_PER_SEC); | |
| 533 fprintf(stdout, "\n[INFO] Query length %"PRIu64".\n", current_len); | |
| 534 | |
| 535 //begin = clock(); | |
| 536 | |
| 537 //reads_per_thread = (uint64_t) (floorl((long double) data_query.n_seqs / (long double) n_threads)); | |
| 538 | |
| 539 fprintf(stdout, "[INFO] Writing matrix.\n"); | |
| 540 | |
| 541 | |
| 542 uint64_t unique_diffuse = 0; | |
| 543 fprintf(out_database, "%"PRIu64"\n", aprox_len_query); | |
| 544 fprintf(out_database, "%"PRIu64"\n", aprox_len_db); | |
| 545 // And replace 2's with 1's | |
| 546 | |
| 547 for(i=0; i<dimension+1; i++){ | |
| 548 for(j=0; j<dimension; j++){ | |
| 549 fprintf(out_database, "%"PRIu64" ", representation[i][j]); | |
| 550 unique_diffuse += representation[i][j]; | |
| 551 } | |
| 552 fprintf(out_database, "%"PRIu64"\n", representation[i][dimension]); | |
| 553 unique_diffuse += representation[i][dimension]; | |
| 554 } | |
| 555 | |
| 556 fprintf(stdout, "[INFO] Found %"PRIu64" unique hits for z = %"PRIu64".\n", unique_diffuse, diffuse_z); | |
| 557 | |
| 558 | |
| 559 | |
| 560 //free(ct->table); | |
| 561 //free(hash_holder_table); | |
| 562 /* | |
| 563 for(i=0;i<=n_pools_used_AVL;i++){ | |
| 564 free(mp_AVL[i].base); | |
| 565 } | |
| 566 | |
| 567 for(i=0;i<=n_pools_used;i++){ | |
| 568 free(mp[i].base); | |
| 569 } | |
| 570 */ | |
| 571 for(i=0;i<dimension;i++){ | |
| 572 free(representation[i]); | |
| 573 } | |
| 574 free(representation); | |
| 575 if(out_database != NULL) fclose(out_database); | |
| 576 | |
| 577 return 0; | |
| 578 } | |
| 579 | |
| 580 void init_args(int argc, char ** av, FILE ** query, FILE ** database, FILE ** out_database, uint64_t * custom_kmer, uint64_t * dimension, uint64_t * diffuse_z){ | |
| 581 | |
| 582 int pNum = 0; | |
| 583 while(pNum < argc){ | |
| 584 if(strcmp(av[pNum], "--help") == 0){ | |
| 585 fprintf(stdout, "USAGE:\n"); | |
| 586 fprintf(stdout, " CHROMEISTER -query [query] -db [database] -out [outfile]\n"); | |
| 587 fprintf(stdout, "OPTIONAL:\n"); | |
| 588 fprintf(stdout, " -kmer [Integer: k>1 (default 32)]\n"); | |
| 589 fprintf(stdout, " -diffuse [Integer: z>0 (default 4)]\n"); | |
| 590 fprintf(stdout, " -dimension Size of the output [Integer: d>0 (default 1000)]\n"); | |
| 591 fprintf(stdout, " -out [File path]\n"); | |
| 592 fprintf(stdout, " --help Shows help for program usage\n"); | |
| 593 fprintf(stdout, "\n"); | |
| 594 fprintf(stdout, "PLEASE NOTICE: The reverse complementary is calculated for the QUERY.\n"); | |
| 595 exit(1); | |
| 596 } | |
| 597 if(strcmp(av[pNum], "-query") == 0){ | |
| 598 *query = fopen64(av[pNum+1], "rt"); | |
| 599 if(query==NULL) terror("Could not open query file"); | |
| 600 } | |
| 601 if(strcmp(av[pNum], "-db") == 0){ | |
| 602 *database = fopen64(av[pNum+1], "rt"); | |
| 603 if(database==NULL) terror("Could not open database file"); | |
| 604 } | |
| 605 if(strcmp(av[pNum], "-out") == 0){ | |
| 606 *out_database = fopen(av[pNum+1], "wt"); | |
| 607 if(out_database==NULL) terror("Could not open output database file"); | |
| 608 } | |
| 609 if(strcmp(av[pNum], "-kmer") == 0){ | |
| 610 *custom_kmer = (uint64_t) atoi(av[pNum+1]); | |
| 611 if(*custom_kmer < BYTES_IN_MER) terror("K-mer size must be larger than 4"); | |
| 612 if(*custom_kmer % BYTES_IN_MER != 0) terror("K-mer size must be a multiple of 4"); | |
| 613 | |
| 614 } | |
| 615 if(strcmp(av[pNum], "-diffuse") == 0){ | |
| 616 *diffuse_z = (uint64_t) atoi(av[pNum+1]); | |
| 617 if(*diffuse_z == 0 || *diffuse_z > 32) terror("Z must satisfy 0<z<=32"); | |
| 618 | |
| 619 } | |
| 620 | |
| 621 if(strcmp(av[pNum], "-dimension") == 0){ | |
| 622 *dimension = (uint64_t) atoi(av[pNum+1]); | |
| 623 if(*dimension < 1) terror("Dimension must be a positive integer"); | |
| 624 } | |
| 625 | |
| 626 pNum++; | |
| 627 } | |
| 628 | |
| 629 if(*query==NULL || *database==NULL || *out_database==NULL) terror("A query, database and output file is required"); | |
| 630 } | |
| 631 | |
| 632 uint64_t get_seq_len(FILE * f) { | |
| 633 char c = '\0'; | |
| 634 uint64_t l = 0; | |
| 635 | |
| 636 while(!feof(f)){ | |
| 637 c = getc(f); | |
| 638 | |
| 639 if(c == '>'){ | |
| 640 while(c != '\n') c = getc(f); | |
| 641 } | |
| 642 c = toupper(c); | |
| 643 if(c >= 'A' && c <= 'Z'){ | |
| 644 ++l; | |
| 645 } | |
| 646 } | |
| 647 | |
| 648 | |
| 649 rewind(f); | |
| 650 return l; | |
| 651 } | 
