comparison kmer_read_m3.cpp @ 0:1472b4f4fbfe draft

planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
author cstrittmatter
date Fri, 12 Apr 2019 14:58:18 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1472b4f4fbfe
1 // newkmer_5.cpp load k-mers for all species
2 // test reads by SmithWaterman
3 // MKM 1 APR 2016
4 //-std=c++0x -D__NO_INLINE__ ... at end: -lz
5 //#include "stdafx.h"
6 #include <fstream>
7 #include <stdlib.h>
8 #include <stdio.h>
9 #include <math.h>
10 #include <iostream>
11 #include <sstream>
12 #include <iomanip>
13 #include <ostream>
14 #include <string>
15 #include <vector>
16 #include <unordered_set>
17 #include <time.h>
18 //#include <sys/time.h>
19 #include <cstddef>
20 #include <cstring>
21 #include <algorithm>
22 #include <set>
23 #include <zlib.h>
24 #include <dirent.h>
25 using namespace std;
26
27 const int minalign = 0; //120;
28 unordered_set<int> myset = { 0};
29
30 typedef int ptype;
31 typedef unsigned long long ui64;
32 typedef unsigned long long ktype;
33 typedef unsigned int vtype;
34 typedef unsigned short int otype;
35 typedef unsigned long long itype;
36
37 const int KSIZE = 30; //kmer size
38 const int MAXREP = 2048;
39 const int REPSHIFT = 11; //(log of maxrep)
40 const int SAVENUM = 12; //save this many reads
41 const itype MAXHASH = (1 << 30); //pow2 size of hashtable
42 const int MAXREPROBE = 16;
43 const int GAPO = 11;
44 const int GAPX = 1;
45 const int MATCH = 5;
46 const int MISMATCH = -4;
47 const bool INIGAPPEN = false;
48 const int MAXALIGN = 1024;
49 const int beam = 8;
50
51 vector<string> accession;
52 vector<int> targno;
53 int mcount, savepos, tct, num_orgs=0, save_target=0;
54 vector<int> gcount, ucount;
55 ofstream outread;
56 set<ktype> kmer_seen;
57 string fdir = ""; //directory for fasta.gz files
58
59 const char bases[4] = { 'A', 'C', 'G', 'T' };
60 const ui64 one = 1;
61 const ui64 two = 2;
62 const ui64 three = 3;
63 const ui64 mask = (one << (KSIZE * 2)) - 1;
64 const ui64 hiC = one << (KSIZE - 1) * 2;
65 const ui64 hiG = two << (KSIZE - 1) * 2;
66 const ui64 hiT = three << (KSIZE - 1) * 2;
67 const ui64 hi1 = one << 62;
68 const ui64 hi2 = two << 62;
69 const ui64 hi3 = three << 62;
70 const ui64 himask = hi1 - one;
71
72 const unsigned BUFLEN = 0x4000;
73
74 void error(const char* const msg)
75 {
76 cerr << msg << "\n";
77 exit(255);
78 }
79
80 string int2str(ktype key)
81 {
82 string sequence = "";
83 for (int i = 0; i < KSIZE; ++i)
84 {
85 sequence = bases[key & 3] + sequence;
86 key >>= 2;
87 }
88 return sequence;
89 }
90
91 class Tree1
92 {
93 public:
94
95 vector<vector<int>> children;
96 vector<int> parent;
97 int root;
98
99 Tree1(int nt)
100 {
101 vector<int> blank;
102 root = 1;
103 for (int i = 0; i < nt; i++)
104 {
105 parent.push_back(root);
106 children.push_back(blank);
107 }
108 }
109
110 ~Tree1()
111 {
112 }
113
114 void add_edge(int x, int y)
115 {
116 parent[y] = x;
117 children[x].push_back(y);
118 }
119
120 int msca(int x, int y)
121 {
122 //return most specific common ancestor of nodes x and y
123 set<int> ancestors;
124 int z;
125
126 ancestors.insert(root);
127 z = x;
128 //cout << x << " " << y << " " << z << endl;
129 while (z != root)
130 {
131 ancestors.insert(z);
132 z = get_parent(z);
133 //cout << x << " " << y << " " << z << endl;
134 }
135 z = y;
136 if (ancestors.find(y) != ancestors.end())
137 return x; //x is descendent of y
138 while (ancestors.find(z) == ancestors.end())
139 {
140 z = get_parent(z);
141 if (z==x) //y is descendent of x
142 return y;
143 //cout << x << " " << y << " " << z << endl;
144 }
145 return z; //common ancesctor
146 }
147
148 int get_parent(int x)
149 {
150 if (x != root && x > 0)
151 return parent[x];
152 else
153 return root;
154 }
155
156 };
157
158 Tree1 *taxonomy;
159
160 class Hashtable
161 {
162 public:
163
164 itype size;
165
166 struct Cell
167 {
168 ktype key;
169 vtype value;
170 int org;
171 ptype position;
172 bool fstrand;
173 } *pHash;
174
175 Hashtable()
176 {
177 size = 0;
178 pHash = new Cell[MAXHASH];
179 HashClear();
180 //for (int i = 1; i<MAXREPROBE; i++)
181 // reprobeVal[i] = reprobeVal[i - 1] + i; //i*i/2+i/2
182 }
183
184 ~Hashtable()
185 {
186 delete[] pHash;
187 size = 0;
188 }
189
190 // from code.google.com/p/smhasher/wiki/MurmurHash3
191 itype integerHash(ui64 k)
192 {
193 k ^= k >> 33;
194 k *= 0xff51afd7ed558ccd;
195 k ^= k >> 33;
196 k *= 0xc4ceb9fe1a85ec53;
197 k ^= k >> 33;
198 return (itype) k;
199 }
200
201 void HashClear()
202 {
203 memset(pHash, 0, MAXHASH * sizeof(Cell));
204 }
205
206 int getHash(ktype key, otype &org, ptype &position, bool &fstrand)
207 {
208 //0 target=bad
209 //returns target if probe present in table
210 //sets position and fstrand
211 itype index, reprobe, i, hash;
212
213 position = 0;
214 hash = integerHash(key);
215 reprobe = 0;
216 i = 0;
217 do
218 {
219 index = ((hash + reprobe) & (MAXHASH-1)); //for power of 2 size
220 reprobe += ++i;
221 if (pHash[index].value == 0)
222 { //empty cell
223 return 0;
224 }
225 else if (pHash[index].key == key)
226 {
227 position = pHash[index].position;
228 fstrand = pHash[index].fstrand;
229 org = pHash[index].org;
230 return (pHash[index].value);
231 }
232 } while (reprobe < MAXHASH && i < MAXREPROBE);
233
234 return 0;
235 }
236
237 void add_kmer(ktype key, vtype target, otype org, ptype position, bool fstrand)
238 { //add kmer to table
239 itype index, reprobe, i, hash;
240 bool notdone=true;
241
242 hash = integerHash(key);
243 reprobe = 0;
244 i = 0;
245 do
246 {
247 index = ((hash + reprobe) & (MAXHASH-1)); //for power of 2 size
248 reprobe += ++i;
249
250 if (pHash[index].value == 0)
251 { //empty cell
252 notdone = false;
253 pHash[index].key = key;
254 pHash[index].value = target;
255 pHash[index].org = org;
256 pHash[index].position = position;
257 pHash[index].fstrand = fstrand;
258 if (++size > MAXHASH - 32)
259 {
260 cout << "out of memory in table " << endl;
261 exit(1);
262 }
263 }
264 } while (notdone);
265 }
266
267 }; //class Hashtable
268 Hashtable *ht;
269
270 string process_seq(string seq)
271 {
272 //format nucleotide sequence to all caps ACTG, all else is N
273 string seq2;
274 string::iterator it1;
275 char base;
276
277 seq2 = "";
278 for (it1 = seq.begin(); it1 != seq.end(); ++it1)
279 {
280 base = *it1;
281 switch (base)
282 {
283 case 'a':
284 seq2 += "A";
285 break;
286 case 'c':
287 seq2 += "C";
288 break;
289 case 'g':
290 seq2 += "G";
291 break;
292 case 't':
293 seq2 += "T";
294 break;
295 case 'A':
296 seq2 += "A";
297 break;
298 case 'C':
299 seq2 += "C";
300 break;
301 case 'G':
302 seq2 += "G";
303 break;
304 case 'T':
305 seq2 += "T";
306 break;
307 default:
308 seq2 += "N";
309 break;
310 }
311 }
312
313 return seq2;
314 }
315
316 string load_data3(gzFile in)
317 { //read gzip fasta file and extract nt sequence
318 char buf[BUFLEN];
319 char* offset = buf;
320 string line;
321 string sequence = "";
322
323 while (true)
324 {
325 int err, len = sizeof(buf)-(offset-buf);
326 if (len == 0) error("Buffer to small for input line lengths");
327
328 len = gzread(in, offset, len);
329
330 if (len == 0) break;
331 if (len < 0) error(gzerror(in, &err));
332
333 char* cur = buf;
334 char* end = offset+len;
335
336 for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1)
337 {
338 line = string(cur, eol);
339 if (line.back() == '\r')
340 line.pop_back();
341 if (line.length() > 0)
342 {
343 if (line.at(0) == '>')
344 {
345 sequence += "N"; //contig separator
346 }
347 else
348 {
349 sequence += process_seq(line);
350 }
351 }
352 //cout << line << endl;
353 }
354
355 // any trailing data in [eol, end) now is a partial line
356 offset = copy(cur, end, buf);
357 }
358
359 // trailing data without eol
360 line = string(buf, offset);
361 if (gzclose(in) != Z_OK) error("failed gzclose");
362
363 return sequence;
364 }
365
366 int *tableM, *tableI, *tableD;
367
368 void tableinit()
369 {
370 int i,j;
371 const int NINF = -2000000000;
372
373 tableM[0] = 0;
374 tableI[0] = NINF;
375 tableD[0] = NINF;
376 for (i = 1; i<MAXALIGN; i++) //initialize upper row
377 {
378 j = (0 > i - beam - 1) ? 0 : i - beam - 1;
379 tableI[j*MAXALIGN + i] = NINF;
380 if (INIGAPPEN)
381 {
382 tableD[j*MAXALIGN + i] = -GAPO - (i - 1)*GAPX;
383 tableM[j*MAXALIGN + i] = -GAPO - (i - 1)*GAPX;
384 }
385 else
386 {
387 tableD[0 + i] = 0;
388 tableM[0 + i] = 0;
389 }
390 }
391 for (j = 1; j<MAXALIGN; j++) //init left column
392 {
393 i = (0 > j - beam - 1) ? 0 : j - beam - 1;
394 tableD[j*MAXALIGN + i] = NINF;
395 if (INIGAPPEN)
396 {
397 tableI[j*MAXALIGN + i] = -GAPO - (j - 1)*GAPX;
398 tableM[j*MAXALIGN + i] = -GAPO - (j - 1)*GAPX;
399 }
400 else
401 {
402 tableI[j*MAXALIGN + i] = 0;
403 tableM[j*MAXALIGN + i] = 0;
404 }
405 }
406 }
407
408 int align(string dna1, string dna2)
409 {
410 //smith waterman alignment with beam
411 //returns alignment score
412 int b1, b2, len1, len2, tindex;
413 int b,bb,c,cc;
414 int i, j, maxval, startx, starty;
415 int i1, i2;
416
417 len1 = dna1.length() + 1;
418 len2 = dna2.length() + 1;
419
420 j=1;
421 do
422 {
423 i = (j <= beam) ? j : j - beam;
424 i2 = (j + beam > len1) ? len1 : j + beam;
425 do
426 {
427 maxval = max(max(tableM[(j-1)*MAXALIGN + i-1], tableI[(j-1)*MAXALIGN + i-1]), tableD[(j-1)*MAXALIGN + i-1]);
428 //copy/replace
429 if (dna1[i-1] != dna2[j-1])
430 maxval += MISMATCH;
431 else maxval += MATCH;
432 tableM[j*MAXALIGN + i] = maxval;
433
434 b=tableM[(j-1)*MAXALIGN + i] - GAPO; //new gap
435 bb = tableI[(j-1)*MAXALIGN + i] -GAPX; //extend
436 tableI[j*MAXALIGN + i] = (bb<b) ? b : bb;
437
438 c=tableM[j*MAXALIGN + i-1] - GAPO; //new gap
439 cc = tableD[j*MAXALIGN + i-1] -GAPX; //extend
440 tableD[j*MAXALIGN + i] = (cc<c) ? c : cc;
441 } while(++i < i2);
442 } while(++j < len2);
443
444 i = len1 - 1;
445 j = len2 - 1; //process last entry
446
447 maxval = max(max(tableM[j*MAXALIGN + i], tableI[j*MAXALIGN + i]), tableD[j*MAXALIGN + i]);
448
449 return maxval;
450
451 }
452
453 int process_read(string &sequence, string acc, int start, int stop)
454 {
455 //returns id of read, 0 = no match, updates gcount, ucount
456 //will do align for first minalign of each target
457 //will save first SAVENUM reads of each target
458
459 char base;
460 int it1, it2;
461 string kmer, sequence2, dna1, dna1r, dna2, fname, finalkeystr;
462 ktype keyF, keyR, key;
463 int i, cpos, minscr, gpos;
464 bool fstrand1, fstrand2;
465 int readlength = stop - start + 1, readlen2;
466 int st2, scr, stlen2;
467 ktype target, final_targ=0;
468 ptype position;
469 otype org;
470 int maxct, totct, rtarg=0;
471 bool reject = false;
472
473 cpos = 0;
474 keyF = 0;
475 keyR = 0;
476 gpos = 0;
477 minscr = 5 * sequence.length() / 2;
478 for (it1 = start; it1 <= stop; ++it1)
479 {
480 base = sequence.at(it1);
481 switch (base)
482 {
483 case 'A':
484 keyF = (keyF << 2) & mask;
485 keyR = (keyR >> 2) | hiT;
486 cpos++;
487 break;
488 case 'C':
489 keyF = ((keyF << 2) & mask) | 1;
490 keyR = (keyR >> 2) | hiG;
491 cpos++;
492 break;
493 case 'G':
494 keyF = ((keyF << 2) & mask) | 2;
495 keyR = (keyR >> 2) | hiC;
496 cpos++;
497 break;
498 case 'T':
499 keyF = ((keyF << 2) & mask) | 3;
500 keyR = keyR >> 2;
501 cpos++;
502 break;
503 case 'a':
504 keyF = (keyF << 2) & mask;
505 keyR = (keyR >> 2) | hiT;
506 cpos++;
507 break;
508 case 'c':
509 keyF = ((keyF << 2) & mask) | 1;
510 keyR = (keyR >> 2) | hiG;
511 cpos++;
512 break;
513 case 'g':
514 keyF = ((keyF << 2) & mask) | 2;
515 keyR = (keyR >> 2) | hiC;
516 cpos++;
517 break;
518 case 't':
519 keyF = ((keyF << 2) & mask) | 3;
520 keyR = keyR >> 2;
521 cpos++;
522 break;
523 default:
524 cpos = 0; //ambiguous character
525 keyF = 0;
526 keyR = 0;
527 break;
528 }
529 gpos++;
530 if (cpos == KSIZE)
531 {
532 key = keyF < keyR ? keyF : keyR;
533 target = ht->getHash(key, org, position, fstrand2);
534 //position is ending base, 1-based
535 if (target > 0 && gcount[target] < minalign && target != final_targ)
536 { //do alignment on initial reads of a species
537 fname = fdir + accession[org] + ".fasta.gz";
538 sequence2 = load_data3(gzopen(fname.c_str(), "rb"));
539 stlen2 = sequence2.length();
540 fstrand1 = (keyF < keyR);
541 readlen2 = readlength;
542 if (fstrand1 == fstrand2)
543 {
544 st2 = position - it1;
545 if (st2 < 0) st2 = 0;
546 if (st2 + readlen2 > stlen2) readlen2 = stlen2 - st2;
547 dna1 = sequence.substr(start, readlength);
548 dna2 = sequence2.substr(st2, readlen2);
549 }
550 else
551 {
552 //reverse complement dna1
553 st2 = position - KSIZE + 2 + it1 - readlength;
554 if (st2 < 0) st2 = 0;
555 if (st2 + readlen2 > stlen2) readlen2 = stlen2 - st2;
556 //dna1 = sequence.substr(start, readlength);
557 dna1 = "";
558 for (it2 = stop; it2 >= start; --it2)
559 {
560 base = sequence.at(it2);
561 switch (base)
562 {
563 case 'A':
564 dna1.push_back('T');
565 break;
566 case 'C':
567 dna1.push_back('G');
568 break;
569 case 'G':
570 dna1.push_back('C');
571 break;
572 case 'T':
573 dna1.push_back('A');
574 break;
575 default:
576 dna1.push_back('N');
577 break;
578 }
579 }
580 dna2 = sequence2.substr(st2, readlen2);
581 }
582 scr = align(dna1, dna2);
583 if (scr < minscr)
584 {
585 rtarg = target;
586 target = 0; //fail
587 reject = true;
588 //cout << rtarg << ": failed " << scr << endl;
589 }
590 }
591 if (final_targ > 0 && target > 0)
592 {
593 final_targ = taxonomy->msca(target, final_targ);
594 finalkeystr = int2str(key);
595 }
596 else if (target > 0)
597 {
598 final_targ = target;
599 finalkeystr = int2str(key);
600 }
601 if (target > 1)
602 {
603 if (kmer_seen.find(key) == kmer_seen.end())
604 {
605 ucount[target]++;
606 kmer_seen.insert(key);
607 }
608 }
609 cpos--;
610 }
611 } //it
612 /*
613 if (final_targ > 1 && gcount[final_targ] < SAVENUM && save_target == 0)
614 {
615 outread << ">" << final_targ << ":" << acc << endl << sequence.substr(start, stop-start+1) << endl;
616 outread << "[" << finalkeystr << "]" << endl;
617 }
618 if (final_targ > 1 && final_targ == save_target)
619 {
620 outread << ">" << final_targ << ":" << acc << endl << sequence.substr(start, stop-start+1) << endl;
621 } */
622 gcount[final_targ]++;
623 tct++;
624
625 return final_targ;
626 }
627
628
629 void process_kmer(string sequence, vtype target, otype org, ptype position, bool fstrand)
630 { //adds kmer from datafile into hashtable
631 char base;
632 string::iterator it1;
633 int cpos, gpos;
634 ui64 keyF;
635
636 cpos = 0;
637 gpos = 0;
638 keyF = 0;
639
640 for (it1 = sequence.begin(); it1 != sequence.end(); ++it1)
641 {
642 base = *it1;
643 switch (base)
644 {
645 case 'A':
646 keyF = (keyF << 2) & mask;
647 cpos++;
648 break;
649 case 'C':
650 keyF = ((keyF << 2) & mask) | 1;
651 cpos++;
652 break;
653 case 'G':
654 keyF = ((keyF << 2) & mask) | 2;
655 cpos++;
656 break;
657 case 'T':
658 keyF = ((keyF << 2) & mask) | 3;
659 cpos++;
660 break;
661 default:
662 cpos = 0; //ambiguous character
663 keyF = 0;
664 break;
665 }
666 if (cpos == KSIZE)
667 {
668 ht->add_kmer(keyF, target, org, position, fstrand);
669 cpos--;
670 }
671 gpos++;
672 } //it
673
674 }
675
676 void process_qual(string acc, string &seq, string &qual)
677 { //uses PHRED 33 scores
678 //to trim read and pass it to process_read
679 const int cutoff_qual = 17;
680 const char cutoff_char = 32 + cutoff_qual;
681 int start, stop, target, i;
682 int window_size = 4;
683 int window_val, window_cut = cutoff_qual * window_size;
684 string trim_seq;
685
686 stop = seq.length()-1;
687 start = 0;
688 //trim all low qual bases from end
689 while (qual.at(start) < cutoff_char && start < stop)
690 start++;
691 while (qual.at(stop) < cutoff_char && stop > start)
692 stop--;
693 //trim by window
694 if (start < stop - window_size)
695 {
696 window_val = 0;
697 for (i=0; i<window_size; i++)
698 window_val += qual.at(start+i) - 32;
699 while (window_val < window_cut && start < stop - window_size)
700 {
701 window_val += qual.at(start+window_size) - qual.at(start);
702 start++;
703 }
704 }
705 if (start < stop - window_size)
706 {
707 window_val = 0;
708 for (i=0; i<window_size; i++)
709 window_val += qual.at(stop-i) - 32;
710 while (window_val < window_cut && start < stop - window_size)
711 {
712 window_val += qual.at(stop-window_size) - qual.at(stop);
713 stop--;
714 }
715 }
716 //trim_seq = seq.substr(start,stop-start+1);
717 if (stop - start >= KSIZE)
718 {
719 target = process_read(seq,acc,start,stop);
720 }
721
722 }
723
724 void process_fqgz(gzFile in)
725 { //read gzip fastq file and extract sequence and quality lines
726 //passes them to process_qual
727 char buf[BUFLEN];
728 char* offset = buf;
729 string line, seq, acc;
730 int mod4=0;
731
732 while (true)
733 {
734 int err, len = sizeof(buf)-(offset-buf);
735 if (len == 0) error("Buffer to small for input line lengths");
736
737 len = gzread(in, offset, len);
738
739 if (len == 0) break;
740 if (len < 0) error(gzerror(in, &err));
741
742 char* cur = buf;
743 char* end = offset+len;
744
745 for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1)
746 {
747 line = string(cur, eol);
748 if (line.back() == '\r') //windows
749 line.pop_back();
750 if (line.length() > 0)
751 {
752 if (mod4 == 1)
753 {
754 seq = line;
755 }
756 else if (mod4 == 0)
757 {
758 acc = line;
759 }
760 else if (mod4 == 3)
761 {
762 process_qual(acc, seq, line);
763 }
764 mod4 = (mod4 + 1) % 4;
765
766 }
767 //cout << line << endl;
768 }
769
770 // any trailing data in [eol, end) now is a partial line
771 offset = copy(cur, end, buf);
772 }
773
774 // trailing data without eol
775 line = string(buf, offset);
776
777 if (gzclose(in) != Z_OK) error("failed gzclose");
778 }
779
780 void process_fagz(gzFile in)
781 { //read gzip fasta file and extract sequence
782 //passes read to process_read
783 char buf[BUFLEN];
784 char* offset = buf;
785 string line;
786 string sequence = "", acc = "";
787 int c1, c2, target;
788
789 cout << "true" << endl;
790
791 while (true)
792 {
793 int err, len = sizeof(buf)-(offset-buf);
794 if (len == 0) error("Buffer to small for input line lengths");
795
796 len = gzread(in, offset, len);
797
798 if (len == 0) break;
799 if (len < 0) error(gzerror(in, &err));
800
801 char* cur = buf;
802 char* end = offset+len;
803
804 for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1)
805 {
806 line = string(cur, eol);
807 if (line.back() == '\r')
808 line.pop_back();
809 if (line.length() > 0)
810 {
811 if (line.at(0) == '>')
812 {
813 if (sequence.length() > KSIZE)
814 { // process previous sequence
815 target = process_read(sequence, acc, 0, sequence.length()-1);
816 }
817 sequence = "";
818 acc = line.substr(1,line.length()-1);
819 }
820 else
821 {
822 sequence += line;
823 }
824 }
825 //cout << line << endl;
826 }
827
828 // any trailing data in [eol, end) now is a partial line
829 offset = copy(cur, end, buf);
830 }
831 if (sequence.length() > KSIZE)
832 { //last one
833 target = process_read(sequence, acc, 0, sequence.length()-1);
834 }
835 // trailing data without eol
836 line = string(buf, offset);
837 if (gzclose(in) != Z_OK) error("failed gzclose");
838
839 }
840
841 int process_gzkmers(gzFile in)
842 {
843
844 char buf[BUFLEN];
845 char* offset = buf;
846 string line;
847 int target, count, org, position, tct=0;
848 string sequence;
849 char strand;
850 bool fstrand;
851
852 while (true)
853 {
854 int err, len = sizeof(buf)-(offset-buf);
855 if (len == 0) error("Buffer to small for input line lengths");
856
857 len = gzread(in, offset, len);
858
859 if (len == 0) break;
860 if (len < 0) error(gzerror(in, &err));
861
862 char* cur = buf;
863 char* end = offset+len;
864
865 for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1)
866 {
867 line = string(cur, eol);
868 if (line.back() == '\r')
869 line.pop_back();
870 if (line.length() > 0)
871 {
872 replace(line.begin(), line.end(), ',', ' ');
873 istringstream ss(line);
874 if (ss >> sequence >> target >> org >> position >> strand >> count)
875 {
876 fstrand = (strand == 'F');
877 process_kmer(sequence, target, org, position, fstrand);
878 tct++;
879 }
880 }
881 //cout << line << endl;
882 }
883
884 // any trailing data in [eol, end) now is a partial line
885 offset = copy(cur, end, buf);
886 }
887 // trailing data without eol
888 line = string(buf, offset);
889 if (gzclose(in) != Z_OK) error("failed gzclose");
890
891 return tct;
892
893 }
894
895 void process_fq(string rname)
896 { //read fastq file and extract sequence and quality lines
897 //passes them to process_qual
898 string line, seq, acc= "", lseq;
899 ifstream fin;
900 int target;
901 int mod4=0;
902
903 fin.open(rname);
904 if (fin)
905 {
906 while (getline(fin, line))
907 {
908 if (line.back() == '\r')
909 line.pop_back();
910 stringstream linestream(line);
911 linestream >> lseq;
912 if (lseq.length() > 0)
913 {
914 if (mod4 == 1)
915 {
916 seq = lseq;
917 }
918 else if (mod4 == 0)
919 {
920 acc = lseq;
921 }
922 else if (mod4 == 3)
923 {
924 process_qual(acc, seq, lseq);
925 }
926 mod4 = (mod4 + 1) % 4;
927 }
928 }
929 fin.close();
930 }
931 }
932
933 void process_fa(string rname)
934 { //read unzipped fasta file and extract sequence
935 //passes read to process_read
936 string sequence = "", line, lseq, acc= "";
937 ifstream fin;
938 int target;
939
940 fin.open(rname);
941 if (fin)
942 {
943 while (getline(fin, line))
944 {
945 if (line.back() == '\r')
946 line.pop_back();
947 stringstream linestream(line);
948 linestream >> lseq;
949 if (lseq[0] == '>')
950 {
951 if (sequence.length() > KSIZE)
952 {
953 target = process_read(sequence, acc, 0, sequence.length()-1);
954 }
955 sequence = "";
956 acc = lseq.substr(1,lseq.length()-1);
957 }
958 else
959 sequence += lseq;
960 }
961 fin.close();
962 if (sequence.length() > KSIZE)
963 { //last one
964 target = process_read(sequence, acc, 0, sequence.length()-1);
965 }
966 }
967 else
968 {
969 cout << "nark " << rname << endl;
970 }
971 }
972
973 int main(int argc, char* argv[])
974 {
975 int i,j,num_jobs;
976 vtype target;
977 ptype position;
978 char strand;
979 bool fstrand;
980 string header, jstr, fname, line, r1name, r2name = "", oname1, oname2;
981 int refi, targi, strni, count, org, num_targ;
982 ifstream fin;
983 string genus, species, acc, sequence, lseq, remainder;
984 vector<vector<string>> fnames;
985 vector<string> jnames;
986 vector<int> jcounts;
987 vector<string>::iterator it2;
988 string trname, s1, s2, s3,s4;
989 string argst, wdir = "", dname = "", jname = "", jfile = "", jdir = "";
990 string iname = "mitochondria_data.txt"; //text file of groups for probe design
991 string tname = "mitochondria_tree.txt"; //edges in taxonomy tree
992 string pname = "mitochondria_probes.txt.gz";
993 if (argc > 1)
994 {
995 for (i = 1; i < argc; i++)
996 {
997 argst = argv[i];
998 if (argst == "-wdir")
999 {
1000 wdir = argv[i+1];
1001 }
1002 if (argst == "-f1")
1003 {
1004 r1name =argv[i+1];
1005 }
1006 if (argst == "-f2")
1007 {
1008 r2name =argv[i+1];
1009 }
1010 }
1011 }
1012 iname = wdir + iname;
1013 tname = wdir + tname;
1014 pname = wdir + pname;
1015
1016 ht = new Hashtable();
1017 //load strain list
1018
1019
1020 cout << "r1 " << r1name << endl;
1021 cout << "r2 " << r2name << endl;
1022 cout << "wd " << wdir << endl;
1023 fin.open(iname);
1024 if (fin)
1025 {
1026 while (getline(fin, line))
1027 {
1028 if (line.back() == '\r')
1029 line.pop_back();
1030 if (line.length() > 1)
1031 {
1032 stringstream linestream(line);
1033 linestream >> targi >> acc;
1034 accession.push_back(acc);
1035 targno.push_back(targi);
1036 if (targi > num_targ)
1037 num_targ = targi;
1038 num_orgs++;
1039 }
1040 }
1041 fin.close();
1042 cout << num_orgs << " strains" << endl;
1043 num_targ++;
1044 }
1045 taxonomy = new Tree1(num_targ);
1046
1047 //load taxonomy tree
1048 fin.open(tname);
1049 if (fin)
1050 {
1051 while (getline(fin, line))
1052 {
1053 if (line.back() == '\r')
1054 line.pop_back();
1055 stringstream linestream(line);
1056 linestream >> i >> j;
1057 taxonomy->add_edge(i,j);
1058 }
1059 }
1060 else exit(1);
1061 fin.close();
1062 cout << "tree loaded" << endl;
1063
1064 //load kmer database
1065 tct = process_gzkmers(gzopen(pname.c_str(), "rb"));
1066 cout << tct << " kmers loaded" << endl;
1067 if (tct < 2) exit(1);
1068
1069 //process reads
1070 gcount.resize(num_targ,0);
1071 ucount.resize(num_targ,0);
1072 fill(gcount.begin(), gcount.end(), 0);
1073 fill(ucount.begin(), ucount.end(), 0);
1074 kmer_seen.clear();
1075 tct = 0;
1076 s1 = ".fastq.gz";
1077 s2 = ".fasta";
1078 s3 = ".fastq";
1079 s4 = ".fasta.gz";
1080 int slen = r1name.length();
1081 char sch = r1name.at(slen-1);
1082 cout << slen << " : " << sch << endl;
1083 if (equal(s1.rbegin(), s1.rend(), r1name.rbegin()))
1084 {
1085 process_fqgz(gzopen(r1name.c_str(), "rb"));
1086 }
1087 else if (equal(s2.rbegin(), s2.rend(), r1name.rbegin()))
1088 {
1089 process_fa(r1name.c_str());
1090 }
1091 else if (equal(s3.rbegin(), s3.rend(), r1name.rbegin()))
1092 {
1093 process_fq(r1name.c_str());
1094 }
1095 else if (equal(s4.rbegin(), s4.rend(), r1name.rbegin()))
1096 {
1097 process_fagz(gzopen(r1name.c_str(), "rb"));
1098 }
1099 cout << tct << " reads loaded" << endl;
1100 if (r2name.length() > 1 && r2name != "none")
1101 { //second file?
1102 if (equal(s1.rbegin(), s1.rend(), r2name.rbegin()))
1103 {
1104 process_fqgz(gzopen(r2name.c_str(), "rb"));
1105 cout << tct << " reads loaded" << endl;
1106 }
1107 else if (equal(s2.rbegin(), s2.rend(), r2name.rbegin()))
1108 {
1109 process_fa(r2name.c_str());
1110 }
1111 else if (equal(s3.rbegin(), s3.rend(), r2name.rbegin()))
1112 {
1113 process_fq(r2name.c_str());
1114 }
1115 else if (equal(s4.rbegin(), s4.rend(), r2name.rbegin()))
1116 {
1117 process_fagz(gzopen(r2name.c_str(), "rb"));
1118 }
1119 cout << tct << " reads loaded" << endl;
1120 }
1121
1122 oname2 = wdir + "result.txt";
1123 ofstream out2(oname2);
1124 for (i=0; i<num_targ; i++)
1125 out2 << i << "," << gcount[i] << "," << ucount[i] << endl;
1126 out2.close();
1127
1128 delete taxonomy;
1129 delete ht;
1130
1131 return 0;
1132 }