Mercurial > repos > cstrittmatter > mitokmer

// newkmer_5.cpp load k-mers for all species
// test reads by SmithWaterman
// MKM 1 APR 2016
//-std=c++0x -D__NO_INLINE__   ... at end: -lz
//#include "stdafx.h"
#include <fstream>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <iostream>
#include <sstream>
#include <iomanip>
#include <ostream>
#include <string>
#include <vector>
#include <unordered_set>
#include <time.h>
//#include <sys/time.h>
#include <cstddef>
#include <cstring>
#include <algorithm>
#include <set>
#include <zlib.h>
#include <dirent.h>
using namespace std;

const int minalign = 0; //120;
unordered_set<int> myset = { 0};

typedef int ptype;
typedef unsigned long long ui64;
typedef unsigned long long ktype;
typedef unsigned int vtype;
typedef unsigned short int otype;
typedef unsigned long long itype;

const int KSIZE = 30; //kmer size
const int MAXREP = 2048;
const int REPSHIFT = 11; //(log of maxrep)
const int SAVENUM = 12; //save this many reads
const itype MAXHASH = (1 << 30); //pow2 size of hashtable
const int MAXREPROBE = 16;
const int GAPO = 11;
const int GAPX = 1;
const int MATCH = 5;
const int MISMATCH = -4;
const bool INIGAPPEN = false;
const int MAXALIGN = 1024;
const int beam = 8;

vector<string> accession;
vector<int> targno;
int mcount, savepos, tct, num_orgs=0, save_target=0;
vector<int> gcount, ucount;
ofstream outread;
set<ktype> kmer_seen;
string fdir = ""; //directory for fasta.gz files

const char bases[4] = { 'A', 'C', 'G', 'T' };
const ui64 one = 1;
const ui64 two = 2;
const ui64 three = 3;
const ui64 mask = (one << (KSIZE * 2)) - 1;
const ui64 hiC = one << (KSIZE - 1) * 2;
const ui64 hiG = two << (KSIZE - 1) * 2;
const ui64 hiT = three << (KSIZE - 1) * 2;
const ui64 hi1 = one << 62;
const ui64 hi2 = two << 62;
const ui64 hi3 = three << 62;
const ui64 himask = hi1 - one;

const unsigned BUFLEN = 0x4000;

void error(const char* const msg)
{
    cerr << msg << "\n";
    exit(255);
}

string int2str(ktype key)
{
	string sequence = "";
	for (int i = 0; i < KSIZE; ++i)
	{
		sequence = bases[key & 3] + sequence;
		key >>= 2;
	}
	return sequence;
}

class Tree1
{
public:

	vector<vector<int>> children;
	vector<int> parent;
	int root;

	Tree1(int nt)
	{
		vector<int> blank;
		root = 1;
		for (int i = 0; i < nt; i++)
		{
			parent.push_back(root);
			children.push_back(blank);
		}
	}

	~Tree1()
	{
	}

	void add_edge(int x, int y)
	{
		parent[y] = x;
		children[x].push_back(y);
	}

	int msca(int x, int y)
	{
		//return most specific common ancestor of nodes x and y
		set<int> ancestors;
		int z;

		ancestors.insert(root);
		z = x;
		//cout << x << " " << y << " " << z << endl;
		while (z != root)
		{
			ancestors.insert(z);
			z = get_parent(z);
		//cout << x << " " << y << " " << z << endl;
		}
		z = y;
		if (ancestors.find(y) != ancestors.end())
			return x; //x is descendent of y
		while (ancestors.find(z) == ancestors.end())
		{
			z = get_parent(z);
			if (z==x) //y is descendent of x
				return y;
		//cout << x << " " << y << " " << z << endl;
		}
		return z; //common ancesctor
	}

	int get_parent(int x)
	{
		if (x != root && x > 0)
			return parent[x];
		else
			return root;
	}

};

Tree1 *taxonomy;

class Hashtable
{
public:

	itype size;

	struct Cell
	{
		ktype key;
		vtype value;
		int org;
		ptype position;
 		bool fstrand;
	} *pHash;

	Hashtable()
	{
		size = 0;
		pHash = new Cell[MAXHASH];
		HashClear();
		//for (int i = 1; i<MAXREPROBE; i++)
		//	reprobeVal[i] = reprobeVal[i - 1] + i; //i*i/2+i/2
	}

	~Hashtable()
	{
		delete[] pHash;
		size = 0;
	}

	// from code.google.com/p/smhasher/wiki/MurmurHash3
	itype integerHash(ui64 k)
	{
		k ^= k >> 33;
		k *= 0xff51afd7ed558ccd;
		k ^= k >> 33;
		k *= 0xc4ceb9fe1a85ec53;
		k ^= k >> 33;
		return (itype) k;
	}

	void HashClear()
	{
		memset(pHash, 0, MAXHASH * sizeof(Cell));
	}

	int getHash(ktype key, otype &org, ptype &position, bool &fstrand)
	{
		//0 target=bad
		//returns target if probe present in table
		//sets position and fstrand
		itype index, reprobe, i, hash;

  		position = 0;
		hash = integerHash(key);
		reprobe = 0;
		i = 0;
		do
		{
			index = ((hash + reprobe) & (MAXHASH-1)); //for power of 2 size
			reprobe += ++i;
			if (pHash[index].value == 0)
			{ //empty cell
				return 0;
			}
			else if (pHash[index].key == key)
			{
				position = pHash[index].position;
				fstrand = pHash[index].fstrand;
				org = pHash[index].org;
				return (pHash[index].value);
			}
		} while (reprobe < MAXHASH && i < MAXREPROBE);

		return 0;
	}

	void add_kmer(ktype key, vtype target, otype org, ptype position, bool fstrand)
	{ //add kmer to table
		itype index, reprobe, i, hash;
		bool notdone=true;

		hash = integerHash(key);
		reprobe = 0;
		i = 0;
		do
		{
			index = ((hash + reprobe) & (MAXHASH-1)); //for power of 2 size
			reprobe += ++i;

			if (pHash[index].value == 0)
			{ //empty cell
				notdone = false;
				pHash[index].key = key;
				pHash[index].value = target;
				pHash[index].org = org;
				pHash[index].position = position;
				pHash[index].fstrand = fstrand;
				if (++size > MAXHASH - 32)
				{
					cout << "out of memory in table " << endl;
					exit(1);
				}
			}
		} while (notdone);
	}

}; //class Hashtable
Hashtable *ht;

string process_seq(string seq)
{
	//format nucleotide sequence to all caps ACTG, all else is N
	string seq2;
	string::iterator it1;
	char base;

	seq2 = "";
	for (it1 = seq.begin(); it1 != seq.end(); ++it1)
	{
		base = *it1;
		switch (base)
		{
		case 'a':
			seq2 += "A";
			break;
		case 'c':
			seq2 += "C";
			break;
		case 'g':
			seq2 += "G";
			break;
		case 't':
			seq2 += "T";
			break;
		case 'A':
			seq2 += "A";
			break;
		case 'C':
			seq2 += "C";
			break;
		case 'G':
			seq2 += "G";
			break;
		case 'T':
			seq2 += "T";
			break;
		default:
			seq2 += "N";
			break;
		}
	}

	return seq2;
}

string load_data3(gzFile in)
{ //read gzip fasta file and extract nt sequence
	char buf[BUFLEN];
	char* offset = buf;
	string line;
        string sequence = "";

	while (true)
	{
		int err, len = sizeof(buf)-(offset-buf);
		if (len == 0) error("Buffer to small for input line lengths");

 		len = gzread(in, offset, len);

		if (len == 0) break;
		if (len <  0) error(gzerror(in, &err));

		char* cur = buf;
		char* end = offset+len;

		for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1)
 		{
			line = string(cur, eol);
			if (line.back() == '\r')
				line.pop_back();
			if (line.length() > 0)
			{
				if (line.at(0) == '>')
				{
					sequence += "N"; //contig separator
				}
				else
				{
					sequence += process_seq(line);
				}
			}
			//cout << line << endl;
		}

		// any trailing data in [eol, end) now is a partial line
		offset = copy(cur, end, buf);
	}

	// trailing data without eol
	line = string(buf, offset);
	if (gzclose(in) != Z_OK) error("failed gzclose");

	return sequence;
}

int *tableM, *tableI, *tableD;

void tableinit()
{
	int i,j;
	const int NINF = -2000000000;

	tableM[0] = 0;
	tableI[0] = NINF;
	tableD[0] = NINF;
	for (i = 1; i<MAXALIGN; i++) //initialize upper row
	{
		j = (0 > i - beam - 1) ? 0 : i - beam - 1;
		tableI[j*MAXALIGN  + i] = NINF;
		if (INIGAPPEN)
		{
			tableD[j*MAXALIGN  + i] = -GAPO - (i - 1)*GAPX;
			tableM[j*MAXALIGN  + i] = -GAPO - (i - 1)*GAPX;
		}
		else
		{
			tableD[0 + i] = 0;
			tableM[0 + i] = 0;
		}
	}
	for (j = 1; j<MAXALIGN; j++) //init left column
	{
		i = (0 > j - beam - 1) ? 0 : j - beam - 1;
		tableD[j*MAXALIGN + i] = NINF;
		if (INIGAPPEN)
		{
			tableI[j*MAXALIGN + i] = -GAPO - (j - 1)*GAPX;
			tableM[j*MAXALIGN + i] = -GAPO - (j - 1)*GAPX;
		}
		else
		{
			tableI[j*MAXALIGN + i] = 0;
			tableM[j*MAXALIGN + i] = 0;
		}
	}
}

int align(string dna1, string dna2)
{
	//smith waterman alignment with beam
	//returns alignment score
	int b1, b2, len1, len2, tindex;
	int b,bb,c,cc;
	int i, j, maxval, startx, starty;
	int i1, i2;

	len1 = dna1.length() + 1;
	len2 = dna2.length() + 1;

	j=1;
	do
	{
		i = (j <= beam) ? j : j - beam;
		i2 = (j + beam > len1) ? len1 : j + beam;
		do
		{
			maxval = max(max(tableM[(j-1)*MAXALIGN + i-1], tableI[(j-1)*MAXALIGN + i-1]), tableD[(j-1)*MAXALIGN + i-1]);
			//copy/replace
                        if (dna1[i-1] != dna2[j-1])
                            maxval += MISMATCH;
			else maxval += MATCH;
			tableM[j*MAXALIGN + i] = maxval;

			b=tableM[(j-1)*MAXALIGN + i] - GAPO; //new gap
			bb = tableI[(j-1)*MAXALIGN + i] -GAPX; //extend
			tableI[j*MAXALIGN + i] = (bb<b) ? b : bb;

			c=tableM[j*MAXALIGN + i-1] - GAPO; //new gap
			cc = tableD[j*MAXALIGN + i-1] -GAPX; //extend
			tableD[j*MAXALIGN + i] = (cc<c) ? c : cc;
		} while(++i < i2);
	} while(++j < len2);

	i = len1 - 1;
	j = len2 - 1;   //process last entry

	maxval = max(max(tableM[j*MAXALIGN + i], tableI[j*MAXALIGN + i]), tableD[j*MAXALIGN + i]);

	return maxval;

}

int process_read(string &sequence, string acc, int start, int stop)
{
	//returns id of read, 0 = no match, updates gcount, ucount
	//will do align for first minalign of each target
	//will save first SAVENUM reads of each target

	char base;
	int it1, it2;
	string kmer, sequence2, dna1, dna1r, dna2, fname, finalkeystr;
	ktype keyF, keyR, key;
	int i, cpos, minscr, gpos;
	bool fstrand1, fstrand2;
	int readlength = stop - start + 1, readlen2;
	int st2, scr, stlen2;
	ktype target, final_targ=0;
	ptype position;
	otype org;
	int maxct, totct, rtarg=0;
	bool reject = false;

	cpos = 0;
	keyF = 0;
	keyR = 0;
	gpos = 0;
	minscr = 5 * sequence.length() / 2;
	for (it1 = start; it1 <= stop; ++it1)
	{
		base = sequence.at(it1);
		switch (base)
		{
		case 'A':
			keyF = (keyF << 2) & mask;
			keyR = (keyR >> 2) | hiT;
			cpos++;
			break;
		case 'C':
			keyF = ((keyF << 2) & mask) | 1;
			keyR = (keyR >> 2) | hiG;
			cpos++;
			break;
		case 'G':
			keyF = ((keyF << 2) & mask) | 2;
			keyR = (keyR >> 2) | hiC;
			cpos++;
			break;
		case 'T':
			keyF = ((keyF << 2) & mask) | 3;
			keyR = keyR >> 2;
			cpos++;
			break;
		case 'a':
			keyF = (keyF << 2) & mask;
			keyR = (keyR >> 2) | hiT;
			cpos++;
			break;
		case 'c':
			keyF = ((keyF << 2) & mask) | 1;
			keyR = (keyR >> 2) | hiG;
			cpos++;
			break;
		case 'g':
			keyF = ((keyF << 2) & mask) | 2;
			keyR = (keyR >> 2) | hiC;
			cpos++;
			break;
		case 't':
			keyF = ((keyF << 2) & mask) | 3;
			keyR = keyR >> 2;
			cpos++;
			break;
		default:
			cpos = 0; //ambiguous character
			keyF = 0;
			keyR = 0;
			break;
		}
		gpos++;
		if (cpos == KSIZE)
		{
			key = keyF < keyR ? keyF : keyR;
			target = ht->getHash(key, org, position, fstrand2);
			//position is ending base, 1-based
			if (target > 0 && gcount[target] < minalign && target != final_targ)
			{ //do alignment on initial reads of a species
				fname = fdir + accession[org] + ".fasta.gz";
				sequence2 = load_data3(gzopen(fname.c_str(), "rb"));
				stlen2 = sequence2.length();
				fstrand1 = (keyF < keyR);
				readlen2 = readlength;
				if (fstrand1 == fstrand2)
				{
					st2 = position - it1;
					if (st2 < 0) st2 = 0;
					if (st2 + readlen2 > stlen2) readlen2 = stlen2 - st2;
					dna1 = sequence.substr(start, readlength);
					dna2 = sequence2.substr(st2, readlen2);
				}
				else
				{
					//reverse complement dna1
					st2 = position - KSIZE + 2 + it1 - readlength;
					if (st2 < 0) st2 = 0;
					if (st2 + readlen2 > stlen2) readlen2 = stlen2 - st2;
					//dna1 = sequence.substr(start, readlength);
					dna1 = "";
					for (it2 = stop; it2 >= start; --it2)
					{
						base = sequence.at(it2);
						switch (base)
						{
						case 'A':
							dna1.push_back('T');
							break;
						case 'C':
							dna1.push_back('G');
							break;
						case 'G':
							dna1.push_back('C');
							break;
						case 'T':
							dna1.push_back('A');
							break;
						default:
							dna1.push_back('N');
							break;
						}
					}
					dna2 = sequence2.substr(st2, readlen2);
				}
				scr = align(dna1, dna2);
				if (scr < minscr)
				{
					rtarg = target;
					target = 0; //fail
					reject = true;
					//cout << rtarg << ": failed " << scr << endl;
				}
			}
			if (final_targ > 0 && target > 0)
			{
				final_targ = taxonomy->msca(target, final_targ);
				finalkeystr =  int2str(key);
			}
			else if (target > 0)
			{
				final_targ = target;
				finalkeystr =  int2str(key);
			}
			if (target > 1)
			{
				if (kmer_seen.find(key) == kmer_seen.end())
				{
					ucount[target]++;
					kmer_seen.insert(key);
				}
			}
			cpos--;
		}
	} //it
/*
        if (final_targ > 1 && gcount[final_targ] < SAVENUM && save_target == 0)
	{
 		outread << ">" << final_targ << ":" << acc << endl << sequence.substr(start, stop-start+1) << endl;
		outread << "[" << finalkeystr << "]" << endl;
	}
        if (final_targ > 1 && final_targ == save_target)
	{
 		outread << ">" << final_targ << ":" << acc << endl << sequence.substr(start, stop-start+1) << endl;
	} */
	gcount[final_targ]++;
	tct++;

	return final_targ;
}


void process_kmer(string sequence, vtype target, otype org, ptype position, bool fstrand)
{	//adds kmer from datafile into hashtable
	char base;
	string::iterator it1;
	int cpos, gpos;
	ui64 keyF;

	cpos = 0;
	gpos = 0;
	keyF = 0;

	for (it1 = sequence.begin(); it1 != sequence.end(); ++it1)
	{
		base = *it1;
		switch (base)
		{
		case 'A':
			keyF = (keyF << 2) & mask;
			cpos++;
			break;
		case 'C':
			keyF = ((keyF << 2) & mask) | 1;
			cpos++;
			break;
		case 'G':
			keyF = ((keyF << 2) & mask) | 2;
			cpos++;
			break;
		case 'T':
			keyF = ((keyF << 2) & mask) | 3;
			cpos++;
			break;
		default:
			cpos = 0; //ambiguous character
			keyF = 0;
			break;
		}
		if (cpos == KSIZE)
		{
			ht->add_kmer(keyF, target, org, position, fstrand);
			cpos--;
		}
		gpos++;
	} //it

}

void process_qual(string acc, string &seq, string &qual)
{ 	//uses PHRED 33 scores
	//to trim read and pass it to process_read
	const int cutoff_qual = 17;
	const char cutoff_char = 32 + cutoff_qual;
	int start, stop, target, i;
	int window_size = 4;
	int window_val, window_cut = cutoff_qual * window_size;
	string trim_seq;

	stop = seq.length()-1;
	start = 0;
	//trim all low qual bases from end
	while (qual.at(start) < cutoff_char && start < stop)
		start++;
	while (qual.at(stop) < cutoff_char && stop > start)
		stop--;
	//trim by window
	if (start < stop - window_size)
	{
		window_val = 0;
		for (i=0; i<window_size; i++)
			window_val += qual.at(start+i) - 32;
		while (window_val < window_cut && start < stop - window_size)
		{
			window_val += qual.at(start+window_size) - qual.at(start);
			start++;
		}
	}
	if (start < stop - window_size)
	{
		window_val = 0;
		for (i=0; i<window_size; i++)
			window_val += qual.at(stop-i) - 32;
		while (window_val < window_cut && start < stop - window_size)
		{
			window_val += qual.at(stop-window_size) - qual.at(stop);
			stop--;
		}
	}
	//trim_seq = seq.substr(start,stop-start+1);
	if (stop - start >= KSIZE)
	{
		target = process_read(seq,acc,start,stop);
	}

}

void process_fqgz(gzFile in)
{ 	//read gzip fastq file and extract sequence and quality lines
	//passes them to process_qual
	char buf[BUFLEN];
	char* offset = buf;
	string line, seq, acc;
	int mod4=0;

	while (true)
	{
		int err, len = sizeof(buf)-(offset-buf);
		if (len == 0) error("Buffer to small for input line lengths");

 		len = gzread(in, offset, len);

		if (len == 0) break;
		if (len <  0) error(gzerror(in, &err));

		char* cur = buf;
		char* end = offset+len;

		for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1)
 		{
			line = string(cur, eol);
			if (line.back() == '\r') //windows
				line.pop_back();
			if (line.length() > 0)
			{
				if (mod4 == 1)
				{
					seq = line;
				}
				else if (mod4 == 0)
				{
					acc = line;
				}
				else if (mod4 == 3)
				{
					process_qual(acc, seq, line);
				}
				mod4 = (mod4 + 1) % 4;

			}
			//cout << line << endl;
		}

		// any trailing data in [eol, end) now is a partial line
		offset = copy(cur, end, buf);
	}

	// trailing data without eol
	line = string(buf, offset);

	if (gzclose(in) != Z_OK) error("failed gzclose");
}

void process_fagz(gzFile in)
{ 	//read gzip fasta file and extract sequence
	//passes read to process_read
	char buf[BUFLEN];
	char* offset = buf;
	string line;
        string sequence = "", acc = "";
        int c1, c2, target;

	cout << "true" << endl;

	while (true)
	{
		int err, len = sizeof(buf)-(offset-buf);
		if (len == 0) error("Buffer to small for input line lengths");

 		len = gzread(in, offset, len);

		if (len == 0) break;
		if (len <  0) error(gzerror(in, &err));

		char* cur = buf;
		char* end = offset+len;

		for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1)
 		{
			line = string(cur, eol);
			if (line.back() == '\r')
				line.pop_back();
			if (line.length() > 0)
			{
				if (line.at(0) == '>')
				{
					if (sequence.length() > KSIZE)
					{ // process previous sequence
						target = process_read(sequence, acc, 0, sequence.length()-1);
					}
					sequence = "";
					acc = line.substr(1,line.length()-1);
				}
				else
				{
					sequence += line;
				}
			}
			//cout << line << endl;
		}

		// any trailing data in [eol, end) now is a partial line
		offset = copy(cur, end, buf);
	}
	if (sequence.length() > KSIZE)
	{ //last one
		target = process_read(sequence, acc, 0, sequence.length()-1);
	}
	// trailing data without eol
	line = string(buf, offset);
	if (gzclose(in) != Z_OK) error("failed gzclose");

}

int process_gzkmers(gzFile in)
{

	char buf[BUFLEN];
	char* offset = buf;
	string line;
	int target, count, org, position, tct=0;
	string sequence;
	char strand;
	bool fstrand;

	while (true)
	{
		int err, len = sizeof(buf)-(offset-buf);
		if (len == 0) error("Buffer to small for input line lengths");

 		len = gzread(in, offset, len);

		if (len == 0) break;
		if (len <  0) error(gzerror(in, &err));

		char* cur = buf;
		char* end = offset+len;

		for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1)
 		{
			line = string(cur, eol);
			if (line.back() == '\r')
				line.pop_back();
			if (line.length() > 0)
			{
				replace(line.begin(), line.end(), ',', ' ');
				istringstream ss(line);
				if (ss >> sequence >> target >> org >> position >> strand >> count)
				{
					fstrand = (strand == 'F');
					process_kmer(sequence, target, org, position, fstrand);
					tct++;
				}
			}
			//cout << line << endl;
		}

		// any trailing data in [eol, end) now is a partial line
		offset = copy(cur, end, buf);
	}
	// trailing data without eol
	line = string(buf, offset);
	if (gzclose(in) != Z_OK) error("failed gzclose");

	return tct;

}

void process_fq(string rname)
{ 	//read fastq file and extract sequence and quality lines
	//passes them to process_qual
	string line, seq, acc= "", lseq;
	ifstream fin;
	int target;
	int mod4=0;

	fin.open(rname);
	if (fin)
	{
		while (getline(fin, line))
		{
			if (line.back() == '\r')
				line.pop_back();
			stringstream linestream(line);
			linestream >> lseq;
			if (lseq.length() > 0)
			{
				if (mod4 == 1)
				{
					seq = lseq;
				}
				else if (mod4 == 0)
				{
					acc = lseq;
				}
				else if (mod4 == 3)
				{
					process_qual(acc, seq, lseq);
				}
				mod4 = (mod4 + 1) % 4;
			}
		}
		fin.close();
	}
}

void process_fa(string rname)
{	//read unzipped fasta file and extract sequence
	//passes read to process_read
	string sequence = "", line, lseq, acc= "";
	ifstream fin;
	int target;

	fin.open(rname);
	if (fin)
	{
		while (getline(fin, line))
		{
			if (line.back() == '\r')
				line.pop_back();
			stringstream linestream(line);
			linestream >> lseq;
			if (lseq[0] == '>')
			{
				if (sequence.length() > KSIZE)
				{
					target = process_read(sequence, acc, 0, sequence.length()-1);
				}
				sequence = "";
				acc = lseq.substr(1,lseq.length()-1);
			}
			else
				sequence += lseq;
		}
		fin.close();
		if (sequence.length() > KSIZE)
		{ //last one
			target = process_read(sequence, acc, 0, sequence.length()-1);
		}
	}
	else
	{
		cout << "nark " << rname << endl;
	}
}

int main(int argc, char* argv[])
{
	int i,j,num_jobs;
	vtype target;
	ptype position;
	char strand;
	bool fstrand;
	string header, jstr, fname, line, r1name, r2name = "", oname1, oname2;
	int refi, targi, strni, count, org, num_targ;
	ifstream fin;
	string genus, species, acc, sequence, lseq, remainder;
	vector<vector<string>> fnames;
	vector<string> jnames;
	vector<int> jcounts;
	vector<string>::iterator it2;
	string trname, s1, s2, s3,s4;
	string argst, wdir = "", dname = "", jname = "", jfile = "", jdir = "";
	string iname = "mitochondria_data.txt";   //text file of groups for probe design
	string tname = "mitochondria_tree.txt";   //edges in taxonomy tree
	string pname = "mitochondria_probes.txt.gz";
 	if (argc > 1)
	{
		for (i = 1; i < argc; i++)
		{
			argst = argv[i];
    			if (argst == "-wdir")
			{
				wdir = argv[i+1];
			}
    			if (argst == "-f1")
			{
				r1name =argv[i+1];
			}
    			if (argst == "-f2")
			{
				r2name =argv[i+1];
			}
 		}
	}
	iname = wdir + iname;
	tname = wdir + tname;
	pname = wdir + pname;

	ht = new Hashtable();
	//load strain list


	cout << "r1 " << r1name << endl;
	cout << "r2 " << r2name << endl;
	cout << "wd " << wdir << endl;
	fin.open(iname);
	if (fin)
	{
		while (getline(fin, line))
		{
			if (line.back() == '\r')
				line.pop_back();
			if (line.length() > 1)
			{
			stringstream linestream(line);
			linestream >> targi >> acc;
			accession.push_back(acc);
			targno.push_back(targi);
			if (targi > num_targ)
				num_targ = targi;
			num_orgs++;
			}
		}
		fin.close();
		cout << num_orgs << " strains" << endl;
		num_targ++;
	}
	taxonomy = new Tree1(num_targ);

	//load taxonomy tree
	fin.open(tname);
	if (fin)
	{
		while (getline(fin, line))
		{
			if (line.back() == '\r')
				line.pop_back();
			stringstream linestream(line);
			linestream >> i >> j;
			taxonomy->add_edge(i,j);
		}
	}
	else exit(1);
	fin.close();
	cout << "tree loaded" << endl;

	//load kmer database
	tct = process_gzkmers(gzopen(pname.c_str(), "rb"));
	cout << tct << " kmers loaded" << endl;
	if (tct < 2) exit(1);

	//process reads
	gcount.resize(num_targ,0);
	ucount.resize(num_targ,0);
	fill(gcount.begin(), gcount.end(), 0);
	fill(ucount.begin(), ucount.end(), 0);
        kmer_seen.clear();
	tct = 0;
	s1 = ".fastq.gz";
	s2 = ".fasta";
	s3 = ".fastq";
        s4 = ".fasta.gz";
	int slen = r1name.length();
	char sch = r1name.at(slen-1);
	cout << slen << " : " << sch << endl;
	if (equal(s1.rbegin(), s1.rend(), r1name.rbegin()))
	{
		process_fqgz(gzopen(r1name.c_str(), "rb"));
	}
	else if (equal(s2.rbegin(), s2.rend(), r1name.rbegin()))
	{
		process_fa(r1name.c_str());
	}
	else if (equal(s3.rbegin(), s3.rend(), r1name.rbegin()))
	{
		process_fq(r1name.c_str());
	}
	else if (equal(s4.rbegin(), s4.rend(), r1name.rbegin()))
	{
		process_fagz(gzopen(r1name.c_str(), "rb"));
	}
	cout << tct << " reads loaded" << endl;
	if (r2name.length() > 1 && r2name != "none")
	{ 	//second file?
	if (equal(s1.rbegin(), s1.rend(), r2name.rbegin()))
	{
		process_fqgz(gzopen(r2name.c_str(), "rb"));
		cout << tct << " reads loaded" << endl;
	}
	else if (equal(s2.rbegin(), s2.rend(), r2name.rbegin()))
	{
		process_fa(r2name.c_str());
	}
	else if (equal(s3.rbegin(), s3.rend(), r2name.rbegin()))
	{
		process_fq(r2name.c_str());
	}
	else if (equal(s4.rbegin(), s4.rend(), r2name.rbegin()))
	{
		process_fagz(gzopen(r2name.c_str(), "rb"));
	}
	cout << tct << " reads loaded" << endl;
	}

	oname2 = wdir + "result.txt";
	ofstream out2(oname2);
	for (i=0; i<num_targ; i++)
		out2 << i << "," << gcount[i] << "," << ucount[i] << endl;
	out2.close();

	delete taxonomy;
	delete ht;

return 0;
}
author	cstrittmatter
date	Sun, 28 Apr 2019 22:07:50 -0400
parents	1472b4f4fbfe
children