Mercurial > repos > cstrittmatter > mitokmer

diff kmer_read_m3.cpp @ 0:1472b4f4fbfe draft
planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
author: cstrittmatter
date: Fri, 12 Apr 2019 14:58:18 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kmer_read_m3.cpp	Fri Apr 12 14:58:18 2019 -0400
@@ -0,0 +1,1132 @@
+// newkmer_5.cpp load k-mers for all species
+// test reads by SmithWaterman
+// MKM 1 APR 2016
+//-std=c++0x -D__NO_INLINE__   ... at end: -lz
+//#include "stdafx.h"
+#include <fstream>
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <iostream>
+#include <sstream>
+#include <iomanip>
+#include <ostream>
+#include <string>
+#include <vector>
+#include <unordered_set>
+#include <time.h>
+//#include <sys/time.h>
+#include <cstddef>
+#include <cstring>
+#include <algorithm>
+#include <set>
+#include <zlib.h>
+#include <dirent.h>
+using namespace std;
+
+const int minalign = 0; //120;
+unordered_set<int> myset = { 0};
+
+typedef int ptype;
+typedef unsigned long long ui64;
+typedef unsigned long long ktype;
+typedef unsigned int vtype;
+typedef unsigned short int otype;
+typedef unsigned long long itype;
+
+const int KSIZE = 30; //kmer size
+const int MAXREP = 2048;
+const int REPSHIFT = 11; //(log of maxrep)
+const int SAVENUM = 12; //save this many reads
+const itype MAXHASH = (1 << 30); //pow2 size of hashtable
+const int MAXREPROBE = 16;
+const int GAPO = 11;
+const int GAPX = 1;
+const int MATCH = 5;
+const int MISMATCH = -4;
+const bool INIGAPPEN = false;
+const int MAXALIGN = 1024;
+const int beam = 8;
+
+vector<string> accession;
+vector<int> targno;
+int mcount, savepos, tct, num_orgs=0, save_target=0;
+vector<int> gcount, ucount;
+ofstream outread;
+set<ktype> kmer_seen;
+string fdir = ""; //directory for fasta.gz files
+
+const char bases[4] = { 'A', 'C', 'G', 'T' };
+const ui64 one = 1;
+const ui64 two = 2;
+const ui64 three = 3;
+const ui64 mask = (one << (KSIZE * 2)) - 1;
+const ui64 hiC = one << (KSIZE - 1) * 2;
+const ui64 hiG = two << (KSIZE - 1) * 2;
+const ui64 hiT = three << (KSIZE - 1) * 2;
+const ui64 hi1 = one << 62;
+const ui64 hi2 = two << 62;
+const ui64 hi3 = three << 62;
+const ui64 himask = hi1 - one;
+
+const unsigned BUFLEN = 0x4000;
+
+void error(const char* const msg)
+{
+    cerr << msg << "\n";
+    exit(255);
+}
+
+string int2str(ktype key)
+{
+	string sequence = "";
+	for (int i = 0; i < KSIZE; ++i)
+	{
+		sequence = bases[key & 3] + sequence;
+		key >>= 2;
+	}
+	return sequence;
+}
+
+class Tree1
+{
+public:
+
+	vector<vector<int>> children;
+	vector<int> parent;
+	int root;
+
+	Tree1(int nt)
+	{
+		vector<int> blank;
+		root = 1;
+		for (int i = 0; i < nt; i++)
+		{
+			parent.push_back(root);
+			children.push_back(blank);
+		}
+	}
+
+	~Tree1()
+	{
+	}
+
+	void add_edge(int x, int y)
+	{
+		parent[y] = x;
+		children[x].push_back(y);
+	}
+
+	int msca(int x, int y)
+	{
+		//return most specific common ancestor of nodes x and y
+		set<int> ancestors;
+		int z;
+
+		ancestors.insert(root);
+		z = x;
+		//cout << x << " " << y << " " << z << endl;
+		while (z != root)
+		{
+			ancestors.insert(z);
+			z = get_parent(z);
+		//cout << x << " " << y << " " << z << endl;
+		}
+		z = y;
+		if (ancestors.find(y) != ancestors.end())
+			return x; //x is descendent of y
+		while (ancestors.find(z) == ancestors.end())
+		{
+			z = get_parent(z);
+			if (z==x) //y is descendent of x
+				return y;
+		//cout << x << " " << y << " " << z << endl;
+		}
+		return z; //common ancesctor
+	}
+
+	int get_parent(int x)
+	{
+		if (x != root && x > 0)
+			return parent[x];
+		else
+			return root;
+	}
+
+};
+
+Tree1 *taxonomy;
+
+class Hashtable
+{
+public:
+
+	itype size;
+
+	struct Cell
+	{
+		ktype key;
+		vtype value;
+		int org;
+		ptype position;
+ 		bool fstrand;
+	} *pHash;
+
+	Hashtable()
+	{
+		size = 0;
+		pHash = new Cell[MAXHASH];
+		HashClear();
+		//for (int i = 1; i<MAXREPROBE; i++)
+		//	reprobeVal[i] = reprobeVal[i - 1] + i; //i*i/2+i/2
+	}
+
+	~Hashtable()
+	{
+		delete[] pHash;
+		size = 0;
+	}
+
+	// from code.google.com/p/smhasher/wiki/MurmurHash3
+	itype integerHash(ui64 k)
+	{
+		k ^= k >> 33;
+		k *= 0xff51afd7ed558ccd;
+		k ^= k >> 33;
+		k *= 0xc4ceb9fe1a85ec53;
+		k ^= k >> 33;
+		return (itype) k;
+	}
+
+	void HashClear()
+	{
+		memset(pHash, 0, MAXHASH * sizeof(Cell));
+	}
+
+	int getHash(ktype key, otype &org, ptype &position, bool &fstrand)
+	{ 
+		//0 target=bad
+		//returns target if probe present in table
+		//sets position and fstrand
+		itype index, reprobe, i, hash;
+
+  		position = 0;
+		hash = integerHash(key);
+		reprobe = 0;
+		i = 0;
+		do
+		{
+			index = ((hash + reprobe) & (MAXHASH-1)); //for power of 2 size
+			reprobe += ++i;
+			if (pHash[index].value == 0)
+			{ //empty cell
+				return 0;
+			}
+			else if (pHash[index].key == key)
+			{
+				position = pHash[index].position;
+				fstrand = pHash[index].fstrand;
+				org = pHash[index].org;
+				return (pHash[index].value);
+			}
+		} while (reprobe < MAXHASH && i < MAXREPROBE);
+
+		return 0;
+	}
+
+	void add_kmer(ktype key, vtype target, otype org, ptype position, bool fstrand)
+	{ //add kmer to table
+		itype index, reprobe, i, hash;
+		bool notdone=true;
+
+		hash = integerHash(key);
+		reprobe = 0;
+		i = 0;
+		do
+		{
+			index = ((hash + reprobe) & (MAXHASH-1)); //for power of 2 size
+			reprobe += ++i;
+
+			if (pHash[index].value == 0)
+			{ //empty cell
+				notdone = false;
+				pHash[index].key = key;
+				pHash[index].value = target;
+				pHash[index].org = org;
+				pHash[index].position = position;
+				pHash[index].fstrand = fstrand;
+				if (++size > MAXHASH - 32)
+				{
+					cout << "out of memory in table " << endl;
+					exit(1);
+				}
+			}
+		} while (notdone);
+	}
+
+}; //class Hashtable
+Hashtable *ht;
+
+string process_seq(string seq)
+{  
+	//format nucleotide sequence to all caps ACTG, all else is N
+	string seq2;
+	string::iterator it1;
+	char base;
+
+	seq2 = "";
+	for (it1 = seq.begin(); it1 != seq.end(); ++it1)
+	{
+		base = *it1;
+		switch (base)
+		{
+		case 'a':
+			seq2 += "A";
+			break;
+		case 'c':
+			seq2 += "C";
+			break;
+		case 'g':
+			seq2 += "G";
+			break;
+		case 't':
+			seq2 += "T";
+			break;
+		case 'A':
+			seq2 += "A";
+			break;
+		case 'C':
+			seq2 += "C";
+			break;
+		case 'G':
+			seq2 += "G";
+			break;
+		case 'T':
+			seq2 += "T";
+			break;
+		default:
+			seq2 += "N";
+			break;
+		}
+	}
+
+	return seq2;
+}
+
+string load_data3(gzFile in)
+{ //read gzip fasta file and extract nt sequence
+	char buf[BUFLEN];
+	char* offset = buf;
+	string line;
+        string sequence = "";
+
+	while (true) 
+	{
+		int err, len = sizeof(buf)-(offset-buf);
+		if (len == 0) error("Buffer to small for input line lengths");
+
+ 		len = gzread(in, offset, len);
+
+		if (len == 0) break;    
+		if (len <  0) error(gzerror(in, &err));
+
+		char* cur = buf;
+		char* end = offset+len;
+
+		for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1)
+ 		{
+			line = string(cur, eol);
+			if (line.back() == '\r')
+				line.pop_back();
+			if (line.length() > 0)
+			{
+				if (line.at(0) == '>')
+				{
+					sequence += "N"; //contig separator
+				}
+				else
+				{
+					sequence += process_seq(line);
+				}				
+			}
+			//cout << line << endl;
+		}
+
+		// any trailing data in [eol, end) now is a partial line
+		offset = copy(cur, end, buf);
+	}
+
+	// trailing data without eol
+	line = string(buf, offset);
+	if (gzclose(in) != Z_OK) error("failed gzclose");
+
+	return sequence;
+}
+
+int *tableM, *tableI, *tableD;
+
+void tableinit()
+{
+	int i,j;
+	const int NINF = -2000000000;
+
+	tableM[0] = 0;
+	tableI[0] = NINF;
+	tableD[0] = NINF;
+	for (i = 1; i<MAXALIGN; i++) //initialize upper row
+	{
+		j = (0 > i - beam - 1) ? 0 : i - beam - 1;
+		tableI[j*MAXALIGN  + i] = NINF;
+		if (INIGAPPEN)
+		{
+			tableD[j*MAXALIGN  + i] = -GAPO - (i - 1)*GAPX;
+			tableM[j*MAXALIGN  + i] = -GAPO - (i - 1)*GAPX;
+		}
+		else
+		{
+			tableD[0 + i] = 0;
+			tableM[0 + i] = 0;
+		}
+	}
+	for (j = 1; j<MAXALIGN; j++) //init left column
+	{
+		i = (0 > j - beam - 1) ? 0 : j - beam - 1;
+		tableD[j*MAXALIGN + i] = NINF;
+		if (INIGAPPEN)
+		{
+			tableI[j*MAXALIGN + i] = -GAPO - (j - 1)*GAPX;
+			tableM[j*MAXALIGN + i] = -GAPO - (j - 1)*GAPX;
+		}
+		else
+		{
+			tableI[j*MAXALIGN + i] = 0;
+			tableM[j*MAXALIGN + i] = 0;
+		}
+	}
+}
+
+int align(string dna1, string dna2)
+{
+	//smith waterman alignment with beam
+	//returns alignment score
+	int b1, b2, len1, len2, tindex;
+	int b,bb,c,cc;
+	int i, j, maxval, startx, starty;
+	int i1, i2;
+
+	len1 = dna1.length() + 1;
+	len2 = dna2.length() + 1;
+
+	j=1;
+	do
+	{
+		i = (j <= beam) ? j : j - beam;
+		i2 = (j + beam > len1) ? len1 : j + beam;
+		do
+		{
+			maxval = max(max(tableM[(j-1)*MAXALIGN + i-1], tableI[(j-1)*MAXALIGN + i-1]), tableD[(j-1)*MAXALIGN + i-1]);
+			//copy/replace
+                        if (dna1[i-1] != dna2[j-1])
+                            maxval += MISMATCH;
+			else maxval += MATCH;
+			tableM[j*MAXALIGN + i] = maxval;
+
+			b=tableM[(j-1)*MAXALIGN + i] - GAPO; //new gap
+			bb = tableI[(j-1)*MAXALIGN + i] -GAPX; //extend
+			tableI[j*MAXALIGN + i] = (bb<b) ? b : bb;
+
+			c=tableM[j*MAXALIGN + i-1] - GAPO; //new gap
+			cc = tableD[j*MAXALIGN + i-1] -GAPX; //extend
+			tableD[j*MAXALIGN + i] = (cc<c) ? c : cc;	
+		} while(++i < i2);
+	} while(++j < len2);
+
+	i = len1 - 1;
+	j = len2 - 1;   //process last entry
+
+	maxval = max(max(tableM[j*MAXALIGN + i], tableI[j*MAXALIGN + i]), tableD[j*MAXALIGN + i]);
+
+	return maxval;
+
+}
+
+int process_read(string &sequence, string acc, int start, int stop)
+{
+	//returns id of read, 0 = no match, updates gcount, ucount
+	//will do align for first minalign of each target
+	//will save first SAVENUM reads of each target
+
+	char base;
+	int it1, it2;
+	string kmer, sequence2, dna1, dna1r, dna2, fname, finalkeystr;
+	ktype keyF, keyR, key;
+	int i, cpos, minscr, gpos;
+	bool fstrand1, fstrand2;
+	int readlength = stop - start + 1, readlen2;
+	int st2, scr, stlen2;
+	ktype target, final_targ=0;
+	ptype position;
+	otype org;
+	int maxct, totct, rtarg=0;
+	bool reject = false;
+
+	cpos = 0;
+	keyF = 0;
+	keyR = 0;
+	gpos = 0;
+	minscr = 5 * sequence.length() / 2;
+	for (it1 = start; it1 <= stop; ++it1)
+	{
+		base = sequence.at(it1);
+		switch (base)
+		{
+		case 'A':
+			keyF = (keyF << 2) & mask;
+			keyR = (keyR >> 2) | hiT;
+			cpos++;
+			break;
+		case 'C':
+			keyF = ((keyF << 2) & mask) | 1;
+			keyR = (keyR >> 2) | hiG;
+			cpos++;
+			break;
+		case 'G':
+			keyF = ((keyF << 2) & mask) | 2;
+			keyR = (keyR >> 2) | hiC;
+			cpos++;
+			break;
+		case 'T':
+			keyF = ((keyF << 2) & mask) | 3;
+			keyR = keyR >> 2;
+			cpos++;
+			break;
+		case 'a':
+			keyF = (keyF << 2) & mask;
+			keyR = (keyR >> 2) | hiT;
+			cpos++;
+			break;
+		case 'c':
+			keyF = ((keyF << 2) & mask) | 1;
+			keyR = (keyR >> 2) | hiG;
+			cpos++;
+			break;
+		case 'g':
+			keyF = ((keyF << 2) & mask) | 2;
+			keyR = (keyR >> 2) | hiC;
+			cpos++;
+			break;
+		case 't':
+			keyF = ((keyF << 2) & mask) | 3;
+			keyR = keyR >> 2;
+			cpos++;
+			break;
+		default:
+			cpos = 0; //ambiguous character
+			keyF = 0;
+			keyR = 0;
+			break;
+		}
+		gpos++;
+		if (cpos == KSIZE)
+		{
+			key = keyF < keyR ? keyF : keyR;
+			target = ht->getHash(key, org, position, fstrand2);
+			//position is ending base, 1-based
+			if (target > 0 && gcount[target] < minalign && target != final_targ)
+			{ //do alignment on initial reads of a species
+				fname = fdir + accession[org] + ".fasta.gz";
+				sequence2 = load_data3(gzopen(fname.c_str(), "rb"));
+				stlen2 = sequence2.length();
+				fstrand1 = (keyF < keyR);
+				readlen2 = readlength;
+				if (fstrand1 == fstrand2)
+				{
+					st2 = position - it1;
+					if (st2 < 0) st2 = 0;
+					if (st2 + readlen2 > stlen2) readlen2 = stlen2 - st2;
+					dna1 = sequence.substr(start, readlength);
+					dna2 = sequence2.substr(st2, readlen2);
+				}
+				else
+				{
+					//reverse complement dna1
+					st2 = position - KSIZE + 2 + it1 - readlength;
+					if (st2 < 0) st2 = 0;
+					if (st2 + readlen2 > stlen2) readlen2 = stlen2 - st2;
+					//dna1 = sequence.substr(start, readlength);
+					dna1 = "";
+					for (it2 = stop; it2 >= start; --it2)
+					{
+						base = sequence.at(it2);
+						switch (base)
+						{
+						case 'A':
+							dna1.push_back('T');
+							break;
+						case 'C':
+							dna1.push_back('G');
+							break;
+						case 'G':
+							dna1.push_back('C');
+							break;
+						case 'T':
+							dna1.push_back('A');
+							break;
+						default:
+							dna1.push_back('N');
+							break;
+						}
+					}
+					dna2 = sequence2.substr(st2, readlen2);
+				}
+				scr = align(dna1, dna2);
+				if (scr < minscr)
+				{
+					rtarg = target;
+					target = 0; //fail
+					reject = true;
+					//cout << rtarg << ": failed " << scr << endl;
+				}
+			}
+			if (final_targ > 0 && target > 0)
+			{
+				final_targ = taxonomy->msca(target, final_targ);
+				finalkeystr =  int2str(key);
+			}
+			else if (target > 0)
+			{
+				final_targ = target;
+				finalkeystr =  int2str(key);
+			}
+			if (target > 1)
+			{
+				if (kmer_seen.find(key) == kmer_seen.end())
+				{
+					ucount[target]++;
+					kmer_seen.insert(key);
+				}
+			}
+			cpos--;
+		}
+	} //it
+/*
+        if (final_targ > 1 && gcount[final_targ] < SAVENUM && save_target == 0)
+	{
+ 		outread << ">" << final_targ << ":" << acc << endl << sequence.substr(start, stop-start+1) << endl;
+		outread << "[" << finalkeystr << "]" << endl;
+	}
+        if (final_targ > 1 && final_targ == save_target)
+	{
+ 		outread << ">" << final_targ << ":" << acc << endl << sequence.substr(start, stop-start+1) << endl;
+	} */
+	gcount[final_targ]++;
+	tct++; 
+
+	return final_targ;
+}
+
+
+void process_kmer(string sequence, vtype target, otype org, ptype position, bool fstrand)
+{	//adds kmer from datafile into hashtable
+	char base;
+	string::iterator it1;
+	int cpos, gpos;
+	ui64 keyF;
+
+	cpos = 0;
+	gpos = 0;
+	keyF = 0;
+
+	for (it1 = sequence.begin(); it1 != sequence.end(); ++it1)
+	{
+		base = *it1;
+		switch (base)
+		{
+		case 'A':
+			keyF = (keyF << 2) & mask;
+			cpos++;
+			break;
+		case 'C':
+			keyF = ((keyF << 2) & mask) | 1;
+			cpos++;
+			break;
+		case 'G':
+			keyF = ((keyF << 2) & mask) | 2;
+			cpos++;
+			break;
+		case 'T':
+			keyF = ((keyF << 2) & mask) | 3;
+			cpos++;
+			break;
+		default:
+			cpos = 0; //ambiguous character
+			keyF = 0;
+			break;
+		}
+		if (cpos == KSIZE)
+		{
+			ht->add_kmer(keyF, target, org, position, fstrand);
+			cpos--;
+		}
+		gpos++;
+	} //it
+
+}
+
+void process_qual(string acc, string &seq, string &qual)
+{ 	//uses PHRED 33 scores
+	//to trim read and pass it to process_read
+	const int cutoff_qual = 17;
+	const char cutoff_char = 32 + cutoff_qual;
+	int start, stop, target, i;
+	int window_size = 4;	
+	int window_val, window_cut = cutoff_qual * window_size;
+	string trim_seq;
+
+	stop = seq.length()-1;
+	start = 0;
+	//trim all low qual bases from end
+	while (qual.at(start) < cutoff_char && start < stop)
+		start++;
+	while (qual.at(stop) < cutoff_char && stop > start)
+		stop--;
+	//trim by window
+	if (start < stop - window_size)
+	{
+		window_val = 0;
+		for (i=0; i<window_size; i++)
+			window_val += qual.at(start+i) - 32;
+		while (window_val < window_cut && start < stop - window_size)
+		{
+			window_val += qual.at(start+window_size) - qual.at(start);
+			start++;
+		}
+	}
+	if (start < stop - window_size)
+	{
+		window_val = 0;
+		for (i=0; i<window_size; i++)
+			window_val += qual.at(stop-i) - 32;
+		while (window_val < window_cut && start < stop - window_size)
+		{
+			window_val += qual.at(stop-window_size) - qual.at(stop);
+			stop--;
+		}
+	}
+	//trim_seq = seq.substr(start,stop-start+1);
+	if (stop - start >= KSIZE)
+	{
+		target = process_read(seq,acc,start,stop);
+	}
+
+}
+
+void process_fqgz(gzFile in)
+{ 	//read gzip fastq file and extract sequence and quality lines
+	//passes them to process_qual
+	char buf[BUFLEN];
+	char* offset = buf;
+	string line, seq, acc;
+	int mod4=0;
+
+	while (true) 
+	{
+		int err, len = sizeof(buf)-(offset-buf);
+		if (len == 0) error("Buffer to small for input line lengths");
+
+ 		len = gzread(in, offset, len);
+
+		if (len == 0) break;    
+		if (len <  0) error(gzerror(in, &err));
+
+		char* cur = buf;
+		char* end = offset+len;
+
+		for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1)
+ 		{
+			line = string(cur, eol);
+			if (line.back() == '\r') //windows
+				line.pop_back();
+			if (line.length() > 0)
+			{
+				if (mod4 == 1)
+				{
+					seq = line;
+				}
+				else if (mod4 == 0)
+				{
+					acc = line;
+				}
+				else if (mod4 == 3)
+				{
+					process_qual(acc, seq, line);
+				}
+				mod4 = (mod4 + 1) % 4;	
+				
+			}
+			//cout << line << endl;
+		}
+
+		// any trailing data in [eol, end) now is a partial line
+		offset = copy(cur, end, buf);
+	}
+
+	// trailing data without eol
+	line = string(buf, offset);
+
+	if (gzclose(in) != Z_OK) error("failed gzclose");
+}
+
+void process_fagz(gzFile in)
+{ 	//read gzip fasta file and extract sequence
+	//passes read to process_read
+	char buf[BUFLEN];
+	char* offset = buf;
+	string line;
+        string sequence = "", acc = "";
+        int c1, c2, target;
+
+	cout << "true" << endl;
+
+	while (true) 
+	{
+		int err, len = sizeof(buf)-(offset-buf);
+		if (len == 0) error("Buffer to small for input line lengths");
+
+ 		len = gzread(in, offset, len);
+
+		if (len == 0) break;    
+		if (len <  0) error(gzerror(in, &err));
+
+		char* cur = buf;
+		char* end = offset+len;
+
+		for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1)
+ 		{
+			line = string(cur, eol);
+			if (line.back() == '\r')
+				line.pop_back();
+			if (line.length() > 0)
+			{
+				if (line.at(0) == '>')
+				{
+					if (sequence.length() > KSIZE)
+					{ // process previous sequence
+						target = process_read(sequence, acc, 0, sequence.length()-1);
+					}
+					sequence = "";
+					acc = line.substr(1,line.length()-1);
+				}
+				else
+				{
+					sequence += line;
+				}				
+			}
+			//cout << line << endl;
+		}
+
+		// any trailing data in [eol, end) now is a partial line
+		offset = copy(cur, end, buf);
+	}
+	if (sequence.length() > KSIZE)
+	{ //last one
+		target = process_read(sequence, acc, 0, sequence.length()-1);
+	}
+	// trailing data without eol
+	line = string(buf, offset);
+	if (gzclose(in) != Z_OK) error("failed gzclose");
+
+}
+
+int process_gzkmers(gzFile in)
+{
+
+	char buf[BUFLEN];
+	char* offset = buf;
+	string line;
+	int target, count, org, position, tct=0;
+	string sequence;
+	char strand;
+	bool fstrand;
+
+	while (true) 
+	{
+		int err, len = sizeof(buf)-(offset-buf);
+		if (len == 0) error("Buffer to small for input line lengths");
+
+ 		len = gzread(in, offset, len);
+
+		if (len == 0) break;    
+		if (len <  0) error(gzerror(in, &err));
+
+		char* cur = buf;
+		char* end = offset+len;
+
+		for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1)
+ 		{
+			line = string(cur, eol);
+			if (line.back() == '\r')
+				line.pop_back();
+			if (line.length() > 0)
+			{
+				replace(line.begin(), line.end(), ',', ' ');
+				istringstream ss(line);
+				if (ss >> sequence >> target >> org >> position >> strand >> count)
+				{
+					fstrand = (strand == 'F');
+					process_kmer(sequence, target, org, position, fstrand);
+					tct++;
+				}				
+			}
+			//cout << line << endl;
+		}
+
+		// any trailing data in [eol, end) now is a partial line
+		offset = copy(cur, end, buf);
+	}
+	// trailing data without eol
+	line = string(buf, offset);
+	if (gzclose(in) != Z_OK) error("failed gzclose");
+
+	return tct;
+
+}
+
+void process_fq(string rname)
+{ 	//read fastq file and extract sequence and quality lines
+	//passes them to process_qual
+	string line, seq, acc= "", lseq;
+	ifstream fin;
+	int target;
+	int mod4=0;
+
+	fin.open(rname);
+	if (fin)
+	{
+		while (getline(fin, line))
+		{
+			if (line.back() == '\r')
+				line.pop_back();
+			stringstream linestream(line);
+			linestream >> lseq;
+			if (lseq.length() > 0)
+			{
+				if (mod4 == 1)
+				{
+					seq = lseq;
+				}
+				else if (mod4 == 0)
+				{
+					acc = lseq;
+				}
+				else if (mod4 == 3)
+				{
+					process_qual(acc, seq, lseq);
+				}
+				mod4 = (mod4 + 1) % 4;	
+			}
+		}
+		fin.close();
+	}
+}
+
+void process_fa(string rname)
+{	//read unzipped fasta file and extract sequence
+	//passes read to process_read
+	string sequence = "", line, lseq, acc= "";
+	ifstream fin;
+	int target;
+
+	fin.open(rname);
+	if (fin)
+	{
+		while (getline(fin, line))
+		{
+			if (line.back() == '\r')
+				line.pop_back();
+			stringstream linestream(line);
+			linestream >> lseq;
+			if (lseq[0] == '>')
+			{
+				if (sequence.length() > KSIZE)
+				{
+					target = process_read(sequence, acc, 0, sequence.length()-1);
+				}
+				sequence = "";
+				acc = lseq.substr(1,lseq.length()-1);
+			}
+			else
+				sequence += lseq;
+		}
+		fin.close();
+		if (sequence.length() > KSIZE)
+		{ //last one
+			target = process_read(sequence, acc, 0, sequence.length()-1);
+		}
+	}
+	else
+	{
+		cout << "nark " << rname << endl;
+	}
+}
+
+int main(int argc, char* argv[])
+{
+	int i,j,num_jobs;
+	vtype target;
+	ptype position;
+	char strand;
+	bool fstrand;
+	string header, jstr, fname, line, r1name, r2name = "", oname1, oname2;
+	int refi, targi, strni, count, org, num_targ;
+	ifstream fin;
+	string genus, species, acc, sequence, lseq, remainder;
+	vector<vector<string>> fnames;
+	vector<string> jnames;
+	vector<int> jcounts;
+	vector<string>::iterator it2;
+	string trname, s1, s2, s3,s4;
+	string argst, wdir = "", dname = "", jname = "", jfile = "", jdir = "";
+	string iname = "mitochondria_data.txt";   //text file of groups for probe design
+	string tname = "mitochondria_tree.txt";   //edges in taxonomy tree
+	string pname = "mitochondria_probes.txt.gz";
+ 	if (argc > 1) 
+	{
+		for (i = 1; i < argc; i++)
+		{
+			argst = argv[i];
+    			if (argst == "-wdir")
+			{
+				wdir = argv[i+1];
+			}
+    			if (argst == "-f1")
+			{
+				r1name =argv[i+1];
+			}
+    			if (argst == "-f2")
+			{
+				r2name =argv[i+1];
+			}
+ 		}
+	}
+	iname = wdir + iname; 
+	tname = wdir + tname;  
+	pname = wdir + pname;
+
+	ht = new Hashtable();
+	//load strain list
+
+
+	cout << "r1 " << r1name << endl;
+	cout << "r2 " << r2name << endl;
+	cout << "wd " << wdir << endl;
+	fin.open(iname);
+	if (fin)
+	{
+		while (getline(fin, line))
+		{
+			if (line.back() == '\r')
+				line.pop_back();
+			if (line.length() > 1)
+			{
+			stringstream linestream(line);
+			linestream >> targi >> acc;
+			accession.push_back(acc);
+			targno.push_back(targi);
+			if (targi > num_targ)
+				num_targ = targi;
+			num_orgs++;
+			}
+		}
+		fin.close();
+		cout << num_orgs << " strains" << endl;
+		num_targ++;
+	}
+	taxonomy = new Tree1(num_targ);
+
+	//load taxonomy tree
+	fin.open(tname);
+	if (fin)
+	{
+		while (getline(fin, line))
+		{
+			if (line.back() == '\r')
+				line.pop_back();
+			stringstream linestream(line);
+			linestream >> i >> j;
+			taxonomy->add_edge(i,j);
+		}
+	}
+	else exit(1);
+	fin.close();
+	cout << "tree loaded" << endl;
+
+	//load kmer database
+	tct = process_gzkmers(gzopen(pname.c_str(), "rb"));
+	cout << tct << " kmers loaded" << endl;
+	if (tct < 2) exit(1);
+
+	//process reads
+	gcount.resize(num_targ,0);
+	ucount.resize(num_targ,0);
+	fill(gcount.begin(), gcount.end(), 0);
+	fill(ucount.begin(), ucount.end(), 0);
+        kmer_seen.clear();
+	tct = 0;
+	s1 = ".fastq.gz";
+	s2 = ".fasta";
+	s3 = ".fastq";
+        s4 = ".fasta.gz";
+	int slen = r1name.length();
+	char sch = r1name.at(slen-1);
+	cout << slen << " : " << sch << endl;
+	if (equal(s1.rbegin(), s1.rend(), r1name.rbegin()))
+	{
+		process_fqgz(gzopen(r1name.c_str(), "rb"));
+	}
+	else if (equal(s2.rbegin(), s2.rend(), r1name.rbegin()))
+	{
+		process_fa(r1name.c_str());
+	}
+	else if (equal(s3.rbegin(), s3.rend(), r1name.rbegin()))
+	{
+		process_fq(r1name.c_str());
+	}
+	else if (equal(s4.rbegin(), s4.rend(), r1name.rbegin()))
+	{
+		process_fagz(gzopen(r1name.c_str(), "rb"));
+	}
+	cout << tct << " reads loaded" << endl;
+	if (r2name.length() > 1 && r2name != "none")
+	{ 	//second file?
+	if (equal(s1.rbegin(), s1.rend(), r2name.rbegin()))
+	{
+		process_fqgz(gzopen(r2name.c_str(), "rb"));
+		cout << tct << " reads loaded" << endl;
+	}
+	else if (equal(s2.rbegin(), s2.rend(), r2name.rbegin()))
+	{
+		process_fa(r2name.c_str());
+	}
+	else if (equal(s3.rbegin(), s3.rend(), r2name.rbegin()))
+	{
+		process_fq(r2name.c_str());
+	}
+	else if (equal(s4.rbegin(), s4.rend(), r2name.rbegin()))
+	{
+		process_fagz(gzopen(r2name.c_str(), "rb"));
+	}
+	cout << tct << " reads loaded" << endl;
+	}
+
+	oname2 = wdir + "result.txt";
+	ofstream out2(oname2);
+	for (i=0; i<num_targ; i++)
+		out2 << i << "," << gcount[i] << "," << ucount[i] << endl;
+	out2.close();
+
+	delete taxonomy;
+	delete ht;
+
+return 0;
+}
author	cstrittmatter
date	Fri, 12 Apr 2019 14:58:18 -0400
parents
children