Mercurial > repos > cstrittmatter > mitokmer
comparison kmer_read_m3.cpp @ 0:1472b4f4fbfe draft
planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
author | cstrittmatter |
---|---|
date | Fri, 12 Apr 2019 14:58:18 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1472b4f4fbfe |
---|---|
1 // newkmer_5.cpp load k-mers for all species | |
2 // test reads by SmithWaterman | |
3 // MKM 1 APR 2016 | |
4 //-std=c++0x -D__NO_INLINE__ ... at end: -lz | |
5 //#include "stdafx.h" | |
6 #include <fstream> | |
7 #include <stdlib.h> | |
8 #include <stdio.h> | |
9 #include <math.h> | |
10 #include <iostream> | |
11 #include <sstream> | |
12 #include <iomanip> | |
13 #include <ostream> | |
14 #include <string> | |
15 #include <vector> | |
16 #include <unordered_set> | |
17 #include <time.h> | |
18 //#include <sys/time.h> | |
19 #include <cstddef> | |
20 #include <cstring> | |
21 #include <algorithm> | |
22 #include <set> | |
23 #include <zlib.h> | |
24 #include <dirent.h> | |
25 using namespace std; | |
26 | |
27 const int minalign = 0; //120; | |
28 unordered_set<int> myset = { 0}; | |
29 | |
30 typedef int ptype; | |
31 typedef unsigned long long ui64; | |
32 typedef unsigned long long ktype; | |
33 typedef unsigned int vtype; | |
34 typedef unsigned short int otype; | |
35 typedef unsigned long long itype; | |
36 | |
37 const int KSIZE = 30; //kmer size | |
38 const int MAXREP = 2048; | |
39 const int REPSHIFT = 11; //(log of maxrep) | |
40 const int SAVENUM = 12; //save this many reads | |
41 const itype MAXHASH = (1 << 30); //pow2 size of hashtable | |
42 const int MAXREPROBE = 16; | |
43 const int GAPO = 11; | |
44 const int GAPX = 1; | |
45 const int MATCH = 5; | |
46 const int MISMATCH = -4; | |
47 const bool INIGAPPEN = false; | |
48 const int MAXALIGN = 1024; | |
49 const int beam = 8; | |
50 | |
51 vector<string> accession; | |
52 vector<int> targno; | |
53 int mcount, savepos, tct, num_orgs=0, save_target=0; | |
54 vector<int> gcount, ucount; | |
55 ofstream outread; | |
56 set<ktype> kmer_seen; | |
57 string fdir = ""; //directory for fasta.gz files | |
58 | |
59 const char bases[4] = { 'A', 'C', 'G', 'T' }; | |
60 const ui64 one = 1; | |
61 const ui64 two = 2; | |
62 const ui64 three = 3; | |
63 const ui64 mask = (one << (KSIZE * 2)) - 1; | |
64 const ui64 hiC = one << (KSIZE - 1) * 2; | |
65 const ui64 hiG = two << (KSIZE - 1) * 2; | |
66 const ui64 hiT = three << (KSIZE - 1) * 2; | |
67 const ui64 hi1 = one << 62; | |
68 const ui64 hi2 = two << 62; | |
69 const ui64 hi3 = three << 62; | |
70 const ui64 himask = hi1 - one; | |
71 | |
72 const unsigned BUFLEN = 0x4000; | |
73 | |
74 void error(const char* const msg) | |
75 { | |
76 cerr << msg << "\n"; | |
77 exit(255); | |
78 } | |
79 | |
80 string int2str(ktype key) | |
81 { | |
82 string sequence = ""; | |
83 for (int i = 0; i < KSIZE; ++i) | |
84 { | |
85 sequence = bases[key & 3] + sequence; | |
86 key >>= 2; | |
87 } | |
88 return sequence; | |
89 } | |
90 | |
91 class Tree1 | |
92 { | |
93 public: | |
94 | |
95 vector<vector<int>> children; | |
96 vector<int> parent; | |
97 int root; | |
98 | |
99 Tree1(int nt) | |
100 { | |
101 vector<int> blank; | |
102 root = 1; | |
103 for (int i = 0; i < nt; i++) | |
104 { | |
105 parent.push_back(root); | |
106 children.push_back(blank); | |
107 } | |
108 } | |
109 | |
110 ~Tree1() | |
111 { | |
112 } | |
113 | |
114 void add_edge(int x, int y) | |
115 { | |
116 parent[y] = x; | |
117 children[x].push_back(y); | |
118 } | |
119 | |
120 int msca(int x, int y) | |
121 { | |
122 //return most specific common ancestor of nodes x and y | |
123 set<int> ancestors; | |
124 int z; | |
125 | |
126 ancestors.insert(root); | |
127 z = x; | |
128 //cout << x << " " << y << " " << z << endl; | |
129 while (z != root) | |
130 { | |
131 ancestors.insert(z); | |
132 z = get_parent(z); | |
133 //cout << x << " " << y << " " << z << endl; | |
134 } | |
135 z = y; | |
136 if (ancestors.find(y) != ancestors.end()) | |
137 return x; //x is descendent of y | |
138 while (ancestors.find(z) == ancestors.end()) | |
139 { | |
140 z = get_parent(z); | |
141 if (z==x) //y is descendent of x | |
142 return y; | |
143 //cout << x << " " << y << " " << z << endl; | |
144 } | |
145 return z; //common ancesctor | |
146 } | |
147 | |
148 int get_parent(int x) | |
149 { | |
150 if (x != root && x > 0) | |
151 return parent[x]; | |
152 else | |
153 return root; | |
154 } | |
155 | |
156 }; | |
157 | |
158 Tree1 *taxonomy; | |
159 | |
160 class Hashtable | |
161 { | |
162 public: | |
163 | |
164 itype size; | |
165 | |
166 struct Cell | |
167 { | |
168 ktype key; | |
169 vtype value; | |
170 int org; | |
171 ptype position; | |
172 bool fstrand; | |
173 } *pHash; | |
174 | |
175 Hashtable() | |
176 { | |
177 size = 0; | |
178 pHash = new Cell[MAXHASH]; | |
179 HashClear(); | |
180 //for (int i = 1; i<MAXREPROBE; i++) | |
181 // reprobeVal[i] = reprobeVal[i - 1] + i; //i*i/2+i/2 | |
182 } | |
183 | |
184 ~Hashtable() | |
185 { | |
186 delete[] pHash; | |
187 size = 0; | |
188 } | |
189 | |
190 // from code.google.com/p/smhasher/wiki/MurmurHash3 | |
191 itype integerHash(ui64 k) | |
192 { | |
193 k ^= k >> 33; | |
194 k *= 0xff51afd7ed558ccd; | |
195 k ^= k >> 33; | |
196 k *= 0xc4ceb9fe1a85ec53; | |
197 k ^= k >> 33; | |
198 return (itype) k; | |
199 } | |
200 | |
201 void HashClear() | |
202 { | |
203 memset(pHash, 0, MAXHASH * sizeof(Cell)); | |
204 } | |
205 | |
206 int getHash(ktype key, otype &org, ptype &position, bool &fstrand) | |
207 { | |
208 //0 target=bad | |
209 //returns target if probe present in table | |
210 //sets position and fstrand | |
211 itype index, reprobe, i, hash; | |
212 | |
213 position = 0; | |
214 hash = integerHash(key); | |
215 reprobe = 0; | |
216 i = 0; | |
217 do | |
218 { | |
219 index = ((hash + reprobe) & (MAXHASH-1)); //for power of 2 size | |
220 reprobe += ++i; | |
221 if (pHash[index].value == 0) | |
222 { //empty cell | |
223 return 0; | |
224 } | |
225 else if (pHash[index].key == key) | |
226 { | |
227 position = pHash[index].position; | |
228 fstrand = pHash[index].fstrand; | |
229 org = pHash[index].org; | |
230 return (pHash[index].value); | |
231 } | |
232 } while (reprobe < MAXHASH && i < MAXREPROBE); | |
233 | |
234 return 0; | |
235 } | |
236 | |
237 void add_kmer(ktype key, vtype target, otype org, ptype position, bool fstrand) | |
238 { //add kmer to table | |
239 itype index, reprobe, i, hash; | |
240 bool notdone=true; | |
241 | |
242 hash = integerHash(key); | |
243 reprobe = 0; | |
244 i = 0; | |
245 do | |
246 { | |
247 index = ((hash + reprobe) & (MAXHASH-1)); //for power of 2 size | |
248 reprobe += ++i; | |
249 | |
250 if (pHash[index].value == 0) | |
251 { //empty cell | |
252 notdone = false; | |
253 pHash[index].key = key; | |
254 pHash[index].value = target; | |
255 pHash[index].org = org; | |
256 pHash[index].position = position; | |
257 pHash[index].fstrand = fstrand; | |
258 if (++size > MAXHASH - 32) | |
259 { | |
260 cout << "out of memory in table " << endl; | |
261 exit(1); | |
262 } | |
263 } | |
264 } while (notdone); | |
265 } | |
266 | |
267 }; //class Hashtable | |
268 Hashtable *ht; | |
269 | |
270 string process_seq(string seq) | |
271 { | |
272 //format nucleotide sequence to all caps ACTG, all else is N | |
273 string seq2; | |
274 string::iterator it1; | |
275 char base; | |
276 | |
277 seq2 = ""; | |
278 for (it1 = seq.begin(); it1 != seq.end(); ++it1) | |
279 { | |
280 base = *it1; | |
281 switch (base) | |
282 { | |
283 case 'a': | |
284 seq2 += "A"; | |
285 break; | |
286 case 'c': | |
287 seq2 += "C"; | |
288 break; | |
289 case 'g': | |
290 seq2 += "G"; | |
291 break; | |
292 case 't': | |
293 seq2 += "T"; | |
294 break; | |
295 case 'A': | |
296 seq2 += "A"; | |
297 break; | |
298 case 'C': | |
299 seq2 += "C"; | |
300 break; | |
301 case 'G': | |
302 seq2 += "G"; | |
303 break; | |
304 case 'T': | |
305 seq2 += "T"; | |
306 break; | |
307 default: | |
308 seq2 += "N"; | |
309 break; | |
310 } | |
311 } | |
312 | |
313 return seq2; | |
314 } | |
315 | |
316 string load_data3(gzFile in) | |
317 { //read gzip fasta file and extract nt sequence | |
318 char buf[BUFLEN]; | |
319 char* offset = buf; | |
320 string line; | |
321 string sequence = ""; | |
322 | |
323 while (true) | |
324 { | |
325 int err, len = sizeof(buf)-(offset-buf); | |
326 if (len == 0) error("Buffer to small for input line lengths"); | |
327 | |
328 len = gzread(in, offset, len); | |
329 | |
330 if (len == 0) break; | |
331 if (len < 0) error(gzerror(in, &err)); | |
332 | |
333 char* cur = buf; | |
334 char* end = offset+len; | |
335 | |
336 for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1) | |
337 { | |
338 line = string(cur, eol); | |
339 if (line.back() == '\r') | |
340 line.pop_back(); | |
341 if (line.length() > 0) | |
342 { | |
343 if (line.at(0) == '>') | |
344 { | |
345 sequence += "N"; //contig separator | |
346 } | |
347 else | |
348 { | |
349 sequence += process_seq(line); | |
350 } | |
351 } | |
352 //cout << line << endl; | |
353 } | |
354 | |
355 // any trailing data in [eol, end) now is a partial line | |
356 offset = copy(cur, end, buf); | |
357 } | |
358 | |
359 // trailing data without eol | |
360 line = string(buf, offset); | |
361 if (gzclose(in) != Z_OK) error("failed gzclose"); | |
362 | |
363 return sequence; | |
364 } | |
365 | |
366 int *tableM, *tableI, *tableD; | |
367 | |
368 void tableinit() | |
369 { | |
370 int i,j; | |
371 const int NINF = -2000000000; | |
372 | |
373 tableM[0] = 0; | |
374 tableI[0] = NINF; | |
375 tableD[0] = NINF; | |
376 for (i = 1; i<MAXALIGN; i++) //initialize upper row | |
377 { | |
378 j = (0 > i - beam - 1) ? 0 : i - beam - 1; | |
379 tableI[j*MAXALIGN + i] = NINF; | |
380 if (INIGAPPEN) | |
381 { | |
382 tableD[j*MAXALIGN + i] = -GAPO - (i - 1)*GAPX; | |
383 tableM[j*MAXALIGN + i] = -GAPO - (i - 1)*GAPX; | |
384 } | |
385 else | |
386 { | |
387 tableD[0 + i] = 0; | |
388 tableM[0 + i] = 0; | |
389 } | |
390 } | |
391 for (j = 1; j<MAXALIGN; j++) //init left column | |
392 { | |
393 i = (0 > j - beam - 1) ? 0 : j - beam - 1; | |
394 tableD[j*MAXALIGN + i] = NINF; | |
395 if (INIGAPPEN) | |
396 { | |
397 tableI[j*MAXALIGN + i] = -GAPO - (j - 1)*GAPX; | |
398 tableM[j*MAXALIGN + i] = -GAPO - (j - 1)*GAPX; | |
399 } | |
400 else | |
401 { | |
402 tableI[j*MAXALIGN + i] = 0; | |
403 tableM[j*MAXALIGN + i] = 0; | |
404 } | |
405 } | |
406 } | |
407 | |
408 int align(string dna1, string dna2) | |
409 { | |
410 //smith waterman alignment with beam | |
411 //returns alignment score | |
412 int b1, b2, len1, len2, tindex; | |
413 int b,bb,c,cc; | |
414 int i, j, maxval, startx, starty; | |
415 int i1, i2; | |
416 | |
417 len1 = dna1.length() + 1; | |
418 len2 = dna2.length() + 1; | |
419 | |
420 j=1; | |
421 do | |
422 { | |
423 i = (j <= beam) ? j : j - beam; | |
424 i2 = (j + beam > len1) ? len1 : j + beam; | |
425 do | |
426 { | |
427 maxval = max(max(tableM[(j-1)*MAXALIGN + i-1], tableI[(j-1)*MAXALIGN + i-1]), tableD[(j-1)*MAXALIGN + i-1]); | |
428 //copy/replace | |
429 if (dna1[i-1] != dna2[j-1]) | |
430 maxval += MISMATCH; | |
431 else maxval += MATCH; | |
432 tableM[j*MAXALIGN + i] = maxval; | |
433 | |
434 b=tableM[(j-1)*MAXALIGN + i] - GAPO; //new gap | |
435 bb = tableI[(j-1)*MAXALIGN + i] -GAPX; //extend | |
436 tableI[j*MAXALIGN + i] = (bb<b) ? b : bb; | |
437 | |
438 c=tableM[j*MAXALIGN + i-1] - GAPO; //new gap | |
439 cc = tableD[j*MAXALIGN + i-1] -GAPX; //extend | |
440 tableD[j*MAXALIGN + i] = (cc<c) ? c : cc; | |
441 } while(++i < i2); | |
442 } while(++j < len2); | |
443 | |
444 i = len1 - 1; | |
445 j = len2 - 1; //process last entry | |
446 | |
447 maxval = max(max(tableM[j*MAXALIGN + i], tableI[j*MAXALIGN + i]), tableD[j*MAXALIGN + i]); | |
448 | |
449 return maxval; | |
450 | |
451 } | |
452 | |
453 int process_read(string &sequence, string acc, int start, int stop) | |
454 { | |
455 //returns id of read, 0 = no match, updates gcount, ucount | |
456 //will do align for first minalign of each target | |
457 //will save first SAVENUM reads of each target | |
458 | |
459 char base; | |
460 int it1, it2; | |
461 string kmer, sequence2, dna1, dna1r, dna2, fname, finalkeystr; | |
462 ktype keyF, keyR, key; | |
463 int i, cpos, minscr, gpos; | |
464 bool fstrand1, fstrand2; | |
465 int readlength = stop - start + 1, readlen2; | |
466 int st2, scr, stlen2; | |
467 ktype target, final_targ=0; | |
468 ptype position; | |
469 otype org; | |
470 int maxct, totct, rtarg=0; | |
471 bool reject = false; | |
472 | |
473 cpos = 0; | |
474 keyF = 0; | |
475 keyR = 0; | |
476 gpos = 0; | |
477 minscr = 5 * sequence.length() / 2; | |
478 for (it1 = start; it1 <= stop; ++it1) | |
479 { | |
480 base = sequence.at(it1); | |
481 switch (base) | |
482 { | |
483 case 'A': | |
484 keyF = (keyF << 2) & mask; | |
485 keyR = (keyR >> 2) | hiT; | |
486 cpos++; | |
487 break; | |
488 case 'C': | |
489 keyF = ((keyF << 2) & mask) | 1; | |
490 keyR = (keyR >> 2) | hiG; | |
491 cpos++; | |
492 break; | |
493 case 'G': | |
494 keyF = ((keyF << 2) & mask) | 2; | |
495 keyR = (keyR >> 2) | hiC; | |
496 cpos++; | |
497 break; | |
498 case 'T': | |
499 keyF = ((keyF << 2) & mask) | 3; | |
500 keyR = keyR >> 2; | |
501 cpos++; | |
502 break; | |
503 case 'a': | |
504 keyF = (keyF << 2) & mask; | |
505 keyR = (keyR >> 2) | hiT; | |
506 cpos++; | |
507 break; | |
508 case 'c': | |
509 keyF = ((keyF << 2) & mask) | 1; | |
510 keyR = (keyR >> 2) | hiG; | |
511 cpos++; | |
512 break; | |
513 case 'g': | |
514 keyF = ((keyF << 2) & mask) | 2; | |
515 keyR = (keyR >> 2) | hiC; | |
516 cpos++; | |
517 break; | |
518 case 't': | |
519 keyF = ((keyF << 2) & mask) | 3; | |
520 keyR = keyR >> 2; | |
521 cpos++; | |
522 break; | |
523 default: | |
524 cpos = 0; //ambiguous character | |
525 keyF = 0; | |
526 keyR = 0; | |
527 break; | |
528 } | |
529 gpos++; | |
530 if (cpos == KSIZE) | |
531 { | |
532 key = keyF < keyR ? keyF : keyR; | |
533 target = ht->getHash(key, org, position, fstrand2); | |
534 //position is ending base, 1-based | |
535 if (target > 0 && gcount[target] < minalign && target != final_targ) | |
536 { //do alignment on initial reads of a species | |
537 fname = fdir + accession[org] + ".fasta.gz"; | |
538 sequence2 = load_data3(gzopen(fname.c_str(), "rb")); | |
539 stlen2 = sequence2.length(); | |
540 fstrand1 = (keyF < keyR); | |
541 readlen2 = readlength; | |
542 if (fstrand1 == fstrand2) | |
543 { | |
544 st2 = position - it1; | |
545 if (st2 < 0) st2 = 0; | |
546 if (st2 + readlen2 > stlen2) readlen2 = stlen2 - st2; | |
547 dna1 = sequence.substr(start, readlength); | |
548 dna2 = sequence2.substr(st2, readlen2); | |
549 } | |
550 else | |
551 { | |
552 //reverse complement dna1 | |
553 st2 = position - KSIZE + 2 + it1 - readlength; | |
554 if (st2 < 0) st2 = 0; | |
555 if (st2 + readlen2 > stlen2) readlen2 = stlen2 - st2; | |
556 //dna1 = sequence.substr(start, readlength); | |
557 dna1 = ""; | |
558 for (it2 = stop; it2 >= start; --it2) | |
559 { | |
560 base = sequence.at(it2); | |
561 switch (base) | |
562 { | |
563 case 'A': | |
564 dna1.push_back('T'); | |
565 break; | |
566 case 'C': | |
567 dna1.push_back('G'); | |
568 break; | |
569 case 'G': | |
570 dna1.push_back('C'); | |
571 break; | |
572 case 'T': | |
573 dna1.push_back('A'); | |
574 break; | |
575 default: | |
576 dna1.push_back('N'); | |
577 break; | |
578 } | |
579 } | |
580 dna2 = sequence2.substr(st2, readlen2); | |
581 } | |
582 scr = align(dna1, dna2); | |
583 if (scr < minscr) | |
584 { | |
585 rtarg = target; | |
586 target = 0; //fail | |
587 reject = true; | |
588 //cout << rtarg << ": failed " << scr << endl; | |
589 } | |
590 } | |
591 if (final_targ > 0 && target > 0) | |
592 { | |
593 final_targ = taxonomy->msca(target, final_targ); | |
594 finalkeystr = int2str(key); | |
595 } | |
596 else if (target > 0) | |
597 { | |
598 final_targ = target; | |
599 finalkeystr = int2str(key); | |
600 } | |
601 if (target > 1) | |
602 { | |
603 if (kmer_seen.find(key) == kmer_seen.end()) | |
604 { | |
605 ucount[target]++; | |
606 kmer_seen.insert(key); | |
607 } | |
608 } | |
609 cpos--; | |
610 } | |
611 } //it | |
612 /* | |
613 if (final_targ > 1 && gcount[final_targ] < SAVENUM && save_target == 0) | |
614 { | |
615 outread << ">" << final_targ << ":" << acc << endl << sequence.substr(start, stop-start+1) << endl; | |
616 outread << "[" << finalkeystr << "]" << endl; | |
617 } | |
618 if (final_targ > 1 && final_targ == save_target) | |
619 { | |
620 outread << ">" << final_targ << ":" << acc << endl << sequence.substr(start, stop-start+1) << endl; | |
621 } */ | |
622 gcount[final_targ]++; | |
623 tct++; | |
624 | |
625 return final_targ; | |
626 } | |
627 | |
628 | |
629 void process_kmer(string sequence, vtype target, otype org, ptype position, bool fstrand) | |
630 { //adds kmer from datafile into hashtable | |
631 char base; | |
632 string::iterator it1; | |
633 int cpos, gpos; | |
634 ui64 keyF; | |
635 | |
636 cpos = 0; | |
637 gpos = 0; | |
638 keyF = 0; | |
639 | |
640 for (it1 = sequence.begin(); it1 != sequence.end(); ++it1) | |
641 { | |
642 base = *it1; | |
643 switch (base) | |
644 { | |
645 case 'A': | |
646 keyF = (keyF << 2) & mask; | |
647 cpos++; | |
648 break; | |
649 case 'C': | |
650 keyF = ((keyF << 2) & mask) | 1; | |
651 cpos++; | |
652 break; | |
653 case 'G': | |
654 keyF = ((keyF << 2) & mask) | 2; | |
655 cpos++; | |
656 break; | |
657 case 'T': | |
658 keyF = ((keyF << 2) & mask) | 3; | |
659 cpos++; | |
660 break; | |
661 default: | |
662 cpos = 0; //ambiguous character | |
663 keyF = 0; | |
664 break; | |
665 } | |
666 if (cpos == KSIZE) | |
667 { | |
668 ht->add_kmer(keyF, target, org, position, fstrand); | |
669 cpos--; | |
670 } | |
671 gpos++; | |
672 } //it | |
673 | |
674 } | |
675 | |
676 void process_qual(string acc, string &seq, string &qual) | |
677 { //uses PHRED 33 scores | |
678 //to trim read and pass it to process_read | |
679 const int cutoff_qual = 17; | |
680 const char cutoff_char = 32 + cutoff_qual; | |
681 int start, stop, target, i; | |
682 int window_size = 4; | |
683 int window_val, window_cut = cutoff_qual * window_size; | |
684 string trim_seq; | |
685 | |
686 stop = seq.length()-1; | |
687 start = 0; | |
688 //trim all low qual bases from end | |
689 while (qual.at(start) < cutoff_char && start < stop) | |
690 start++; | |
691 while (qual.at(stop) < cutoff_char && stop > start) | |
692 stop--; | |
693 //trim by window | |
694 if (start < stop - window_size) | |
695 { | |
696 window_val = 0; | |
697 for (i=0; i<window_size; i++) | |
698 window_val += qual.at(start+i) - 32; | |
699 while (window_val < window_cut && start < stop - window_size) | |
700 { | |
701 window_val += qual.at(start+window_size) - qual.at(start); | |
702 start++; | |
703 } | |
704 } | |
705 if (start < stop - window_size) | |
706 { | |
707 window_val = 0; | |
708 for (i=0; i<window_size; i++) | |
709 window_val += qual.at(stop-i) - 32; | |
710 while (window_val < window_cut && start < stop - window_size) | |
711 { | |
712 window_val += qual.at(stop-window_size) - qual.at(stop); | |
713 stop--; | |
714 } | |
715 } | |
716 //trim_seq = seq.substr(start,stop-start+1); | |
717 if (stop - start >= KSIZE) | |
718 { | |
719 target = process_read(seq,acc,start,stop); | |
720 } | |
721 | |
722 } | |
723 | |
724 void process_fqgz(gzFile in) | |
725 { //read gzip fastq file and extract sequence and quality lines | |
726 //passes them to process_qual | |
727 char buf[BUFLEN]; | |
728 char* offset = buf; | |
729 string line, seq, acc; | |
730 int mod4=0; | |
731 | |
732 while (true) | |
733 { | |
734 int err, len = sizeof(buf)-(offset-buf); | |
735 if (len == 0) error("Buffer to small for input line lengths"); | |
736 | |
737 len = gzread(in, offset, len); | |
738 | |
739 if (len == 0) break; | |
740 if (len < 0) error(gzerror(in, &err)); | |
741 | |
742 char* cur = buf; | |
743 char* end = offset+len; | |
744 | |
745 for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1) | |
746 { | |
747 line = string(cur, eol); | |
748 if (line.back() == '\r') //windows | |
749 line.pop_back(); | |
750 if (line.length() > 0) | |
751 { | |
752 if (mod4 == 1) | |
753 { | |
754 seq = line; | |
755 } | |
756 else if (mod4 == 0) | |
757 { | |
758 acc = line; | |
759 } | |
760 else if (mod4 == 3) | |
761 { | |
762 process_qual(acc, seq, line); | |
763 } | |
764 mod4 = (mod4 + 1) % 4; | |
765 | |
766 } | |
767 //cout << line << endl; | |
768 } | |
769 | |
770 // any trailing data in [eol, end) now is a partial line | |
771 offset = copy(cur, end, buf); | |
772 } | |
773 | |
774 // trailing data without eol | |
775 line = string(buf, offset); | |
776 | |
777 if (gzclose(in) != Z_OK) error("failed gzclose"); | |
778 } | |
779 | |
780 void process_fagz(gzFile in) | |
781 { //read gzip fasta file and extract sequence | |
782 //passes read to process_read | |
783 char buf[BUFLEN]; | |
784 char* offset = buf; | |
785 string line; | |
786 string sequence = "", acc = ""; | |
787 int c1, c2, target; | |
788 | |
789 cout << "true" << endl; | |
790 | |
791 while (true) | |
792 { | |
793 int err, len = sizeof(buf)-(offset-buf); | |
794 if (len == 0) error("Buffer to small for input line lengths"); | |
795 | |
796 len = gzread(in, offset, len); | |
797 | |
798 if (len == 0) break; | |
799 if (len < 0) error(gzerror(in, &err)); | |
800 | |
801 char* cur = buf; | |
802 char* end = offset+len; | |
803 | |
804 for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1) | |
805 { | |
806 line = string(cur, eol); | |
807 if (line.back() == '\r') | |
808 line.pop_back(); | |
809 if (line.length() > 0) | |
810 { | |
811 if (line.at(0) == '>') | |
812 { | |
813 if (sequence.length() > KSIZE) | |
814 { // process previous sequence | |
815 target = process_read(sequence, acc, 0, sequence.length()-1); | |
816 } | |
817 sequence = ""; | |
818 acc = line.substr(1,line.length()-1); | |
819 } | |
820 else | |
821 { | |
822 sequence += line; | |
823 } | |
824 } | |
825 //cout << line << endl; | |
826 } | |
827 | |
828 // any trailing data in [eol, end) now is a partial line | |
829 offset = copy(cur, end, buf); | |
830 } | |
831 if (sequence.length() > KSIZE) | |
832 { //last one | |
833 target = process_read(sequence, acc, 0, sequence.length()-1); | |
834 } | |
835 // trailing data without eol | |
836 line = string(buf, offset); | |
837 if (gzclose(in) != Z_OK) error("failed gzclose"); | |
838 | |
839 } | |
840 | |
841 int process_gzkmers(gzFile in) | |
842 { | |
843 | |
844 char buf[BUFLEN]; | |
845 char* offset = buf; | |
846 string line; | |
847 int target, count, org, position, tct=0; | |
848 string sequence; | |
849 char strand; | |
850 bool fstrand; | |
851 | |
852 while (true) | |
853 { | |
854 int err, len = sizeof(buf)-(offset-buf); | |
855 if (len == 0) error("Buffer to small for input line lengths"); | |
856 | |
857 len = gzread(in, offset, len); | |
858 | |
859 if (len == 0) break; | |
860 if (len < 0) error(gzerror(in, &err)); | |
861 | |
862 char* cur = buf; | |
863 char* end = offset+len; | |
864 | |
865 for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1) | |
866 { | |
867 line = string(cur, eol); | |
868 if (line.back() == '\r') | |
869 line.pop_back(); | |
870 if (line.length() > 0) | |
871 { | |
872 replace(line.begin(), line.end(), ',', ' '); | |
873 istringstream ss(line); | |
874 if (ss >> sequence >> target >> org >> position >> strand >> count) | |
875 { | |
876 fstrand = (strand == 'F'); | |
877 process_kmer(sequence, target, org, position, fstrand); | |
878 tct++; | |
879 } | |
880 } | |
881 //cout << line << endl; | |
882 } | |
883 | |
884 // any trailing data in [eol, end) now is a partial line | |
885 offset = copy(cur, end, buf); | |
886 } | |
887 // trailing data without eol | |
888 line = string(buf, offset); | |
889 if (gzclose(in) != Z_OK) error("failed gzclose"); | |
890 | |
891 return tct; | |
892 | |
893 } | |
894 | |
895 void process_fq(string rname) | |
896 { //read fastq file and extract sequence and quality lines | |
897 //passes them to process_qual | |
898 string line, seq, acc= "", lseq; | |
899 ifstream fin; | |
900 int target; | |
901 int mod4=0; | |
902 | |
903 fin.open(rname); | |
904 if (fin) | |
905 { | |
906 while (getline(fin, line)) | |
907 { | |
908 if (line.back() == '\r') | |
909 line.pop_back(); | |
910 stringstream linestream(line); | |
911 linestream >> lseq; | |
912 if (lseq.length() > 0) | |
913 { | |
914 if (mod4 == 1) | |
915 { | |
916 seq = lseq; | |
917 } | |
918 else if (mod4 == 0) | |
919 { | |
920 acc = lseq; | |
921 } | |
922 else if (mod4 == 3) | |
923 { | |
924 process_qual(acc, seq, lseq); | |
925 } | |
926 mod4 = (mod4 + 1) % 4; | |
927 } | |
928 } | |
929 fin.close(); | |
930 } | |
931 } | |
932 | |
933 void process_fa(string rname) | |
934 { //read unzipped fasta file and extract sequence | |
935 //passes read to process_read | |
936 string sequence = "", line, lseq, acc= ""; | |
937 ifstream fin; | |
938 int target; | |
939 | |
940 fin.open(rname); | |
941 if (fin) | |
942 { | |
943 while (getline(fin, line)) | |
944 { | |
945 if (line.back() == '\r') | |
946 line.pop_back(); | |
947 stringstream linestream(line); | |
948 linestream >> lseq; | |
949 if (lseq[0] == '>') | |
950 { | |
951 if (sequence.length() > KSIZE) | |
952 { | |
953 target = process_read(sequence, acc, 0, sequence.length()-1); | |
954 } | |
955 sequence = ""; | |
956 acc = lseq.substr(1,lseq.length()-1); | |
957 } | |
958 else | |
959 sequence += lseq; | |
960 } | |
961 fin.close(); | |
962 if (sequence.length() > KSIZE) | |
963 { //last one | |
964 target = process_read(sequence, acc, 0, sequence.length()-1); | |
965 } | |
966 } | |
967 else | |
968 { | |
969 cout << "nark " << rname << endl; | |
970 } | |
971 } | |
972 | |
973 int main(int argc, char* argv[]) | |
974 { | |
975 int i,j,num_jobs; | |
976 vtype target; | |
977 ptype position; | |
978 char strand; | |
979 bool fstrand; | |
980 string header, jstr, fname, line, r1name, r2name = "", oname1, oname2; | |
981 int refi, targi, strni, count, org, num_targ; | |
982 ifstream fin; | |
983 string genus, species, acc, sequence, lseq, remainder; | |
984 vector<vector<string>> fnames; | |
985 vector<string> jnames; | |
986 vector<int> jcounts; | |
987 vector<string>::iterator it2; | |
988 string trname, s1, s2, s3,s4; | |
989 string argst, wdir = "", dname = "", jname = "", jfile = "", jdir = ""; | |
990 string iname = "mitochondria_data.txt"; //text file of groups for probe design | |
991 string tname = "mitochondria_tree.txt"; //edges in taxonomy tree | |
992 string pname = "mitochondria_probes.txt.gz"; | |
993 if (argc > 1) | |
994 { | |
995 for (i = 1; i < argc; i++) | |
996 { | |
997 argst = argv[i]; | |
998 if (argst == "-wdir") | |
999 { | |
1000 wdir = argv[i+1]; | |
1001 } | |
1002 if (argst == "-f1") | |
1003 { | |
1004 r1name =argv[i+1]; | |
1005 } | |
1006 if (argst == "-f2") | |
1007 { | |
1008 r2name =argv[i+1]; | |
1009 } | |
1010 } | |
1011 } | |
1012 iname = wdir + iname; | |
1013 tname = wdir + tname; | |
1014 pname = wdir + pname; | |
1015 | |
1016 ht = new Hashtable(); | |
1017 //load strain list | |
1018 | |
1019 | |
1020 cout << "r1 " << r1name << endl; | |
1021 cout << "r2 " << r2name << endl; | |
1022 cout << "wd " << wdir << endl; | |
1023 fin.open(iname); | |
1024 if (fin) | |
1025 { | |
1026 while (getline(fin, line)) | |
1027 { | |
1028 if (line.back() == '\r') | |
1029 line.pop_back(); | |
1030 if (line.length() > 1) | |
1031 { | |
1032 stringstream linestream(line); | |
1033 linestream >> targi >> acc; | |
1034 accession.push_back(acc); | |
1035 targno.push_back(targi); | |
1036 if (targi > num_targ) | |
1037 num_targ = targi; | |
1038 num_orgs++; | |
1039 } | |
1040 } | |
1041 fin.close(); | |
1042 cout << num_orgs << " strains" << endl; | |
1043 num_targ++; | |
1044 } | |
1045 taxonomy = new Tree1(num_targ); | |
1046 | |
1047 //load taxonomy tree | |
1048 fin.open(tname); | |
1049 if (fin) | |
1050 { | |
1051 while (getline(fin, line)) | |
1052 { | |
1053 if (line.back() == '\r') | |
1054 line.pop_back(); | |
1055 stringstream linestream(line); | |
1056 linestream >> i >> j; | |
1057 taxonomy->add_edge(i,j); | |
1058 } | |
1059 } | |
1060 else exit(1); | |
1061 fin.close(); | |
1062 cout << "tree loaded" << endl; | |
1063 | |
1064 //load kmer database | |
1065 tct = process_gzkmers(gzopen(pname.c_str(), "rb")); | |
1066 cout << tct << " kmers loaded" << endl; | |
1067 if (tct < 2) exit(1); | |
1068 | |
1069 //process reads | |
1070 gcount.resize(num_targ,0); | |
1071 ucount.resize(num_targ,0); | |
1072 fill(gcount.begin(), gcount.end(), 0); | |
1073 fill(ucount.begin(), ucount.end(), 0); | |
1074 kmer_seen.clear(); | |
1075 tct = 0; | |
1076 s1 = ".fastq.gz"; | |
1077 s2 = ".fasta"; | |
1078 s3 = ".fastq"; | |
1079 s4 = ".fasta.gz"; | |
1080 int slen = r1name.length(); | |
1081 char sch = r1name.at(slen-1); | |
1082 cout << slen << " : " << sch << endl; | |
1083 if (equal(s1.rbegin(), s1.rend(), r1name.rbegin())) | |
1084 { | |
1085 process_fqgz(gzopen(r1name.c_str(), "rb")); | |
1086 } | |
1087 else if (equal(s2.rbegin(), s2.rend(), r1name.rbegin())) | |
1088 { | |
1089 process_fa(r1name.c_str()); | |
1090 } | |
1091 else if (equal(s3.rbegin(), s3.rend(), r1name.rbegin())) | |
1092 { | |
1093 process_fq(r1name.c_str()); | |
1094 } | |
1095 else if (equal(s4.rbegin(), s4.rend(), r1name.rbegin())) | |
1096 { | |
1097 process_fagz(gzopen(r1name.c_str(), "rb")); | |
1098 } | |
1099 cout << tct << " reads loaded" << endl; | |
1100 if (r2name.length() > 1 && r2name != "none") | |
1101 { //second file? | |
1102 if (equal(s1.rbegin(), s1.rend(), r2name.rbegin())) | |
1103 { | |
1104 process_fqgz(gzopen(r2name.c_str(), "rb")); | |
1105 cout << tct << " reads loaded" << endl; | |
1106 } | |
1107 else if (equal(s2.rbegin(), s2.rend(), r2name.rbegin())) | |
1108 { | |
1109 process_fa(r2name.c_str()); | |
1110 } | |
1111 else if (equal(s3.rbegin(), s3.rend(), r2name.rbegin())) | |
1112 { | |
1113 process_fq(r2name.c_str()); | |
1114 } | |
1115 else if (equal(s4.rbegin(), s4.rend(), r2name.rbegin())) | |
1116 { | |
1117 process_fagz(gzopen(r2name.c_str(), "rb")); | |
1118 } | |
1119 cout << tct << " reads loaded" << endl; | |
1120 } | |
1121 | |
1122 oname2 = wdir + "result.txt"; | |
1123 ofstream out2(oname2); | |
1124 for (i=0; i<num_targ; i++) | |
1125 out2 << i << "," << gcount[i] << "," << ucount[i] << endl; | |
1126 out2.close(); | |
1127 | |
1128 delete taxonomy; | |
1129 delete ht; | |
1130 | |
1131 return 0; | |
1132 } |