annotate kmer_read_m3.cpp @ 3:c9d98f5bc240 draft

planemo upload commit 003cdb83fd17248ef57959332d58a3c96311332a-dirty
author cstrittmatter
date Sun, 28 Apr 2019 00:42:24 -0400
parents 1472b4f4fbfe
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1 // newkmer_5.cpp load k-mers for all species
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
2 // test reads by SmithWaterman
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
3 // MKM 1 APR 2016
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
4 //-std=c++0x -D__NO_INLINE__ ... at end: -lz
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
5 //#include "stdafx.h"
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
6 #include <fstream>
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
7 #include <stdlib.h>
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
8 #include <stdio.h>
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
9 #include <math.h>
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
10 #include <iostream>
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
11 #include <sstream>
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
12 #include <iomanip>
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
13 #include <ostream>
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
14 #include <string>
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
15 #include <vector>
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
16 #include <unordered_set>
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
17 #include <time.h>
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
18 //#include <sys/time.h>
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
19 #include <cstddef>
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
20 #include <cstring>
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
21 #include <algorithm>
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
22 #include <set>
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
23 #include <zlib.h>
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
24 #include <dirent.h>
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
25 using namespace std;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
26
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
27 const int minalign = 0; //120;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
28 unordered_set<int> myset = { 0};
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
29
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
30 typedef int ptype;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
31 typedef unsigned long long ui64;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
32 typedef unsigned long long ktype;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
33 typedef unsigned int vtype;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
34 typedef unsigned short int otype;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
35 typedef unsigned long long itype;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
36
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
37 const int KSIZE = 30; //kmer size
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
38 const int MAXREP = 2048;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
39 const int REPSHIFT = 11; //(log of maxrep)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
40 const int SAVENUM = 12; //save this many reads
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
41 const itype MAXHASH = (1 << 30); //pow2 size of hashtable
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
42 const int MAXREPROBE = 16;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
43 const int GAPO = 11;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
44 const int GAPX = 1;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
45 const int MATCH = 5;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
46 const int MISMATCH = -4;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
47 const bool INIGAPPEN = false;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
48 const int MAXALIGN = 1024;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
49 const int beam = 8;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
50
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
51 vector<string> accession;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
52 vector<int> targno;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
53 int mcount, savepos, tct, num_orgs=0, save_target=0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
54 vector<int> gcount, ucount;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
55 ofstream outread;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
56 set<ktype> kmer_seen;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
57 string fdir = ""; //directory for fasta.gz files
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
58
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
59 const char bases[4] = { 'A', 'C', 'G', 'T' };
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
60 const ui64 one = 1;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
61 const ui64 two = 2;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
62 const ui64 three = 3;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
63 const ui64 mask = (one << (KSIZE * 2)) - 1;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
64 const ui64 hiC = one << (KSIZE - 1) * 2;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
65 const ui64 hiG = two << (KSIZE - 1) * 2;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
66 const ui64 hiT = three << (KSIZE - 1) * 2;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
67 const ui64 hi1 = one << 62;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
68 const ui64 hi2 = two << 62;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
69 const ui64 hi3 = three << 62;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
70 const ui64 himask = hi1 - one;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
71
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
72 const unsigned BUFLEN = 0x4000;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
73
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
74 void error(const char* const msg)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
75 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
76 cerr << msg << "\n";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
77 exit(255);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
78 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
79
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
80 string int2str(ktype key)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
81 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
82 string sequence = "";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
83 for (int i = 0; i < KSIZE; ++i)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
84 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
85 sequence = bases[key & 3] + sequence;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
86 key >>= 2;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
87 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
88 return sequence;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
89 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
90
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
91 class Tree1
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
92 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
93 public:
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
94
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
95 vector<vector<int>> children;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
96 vector<int> parent;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
97 int root;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
98
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
99 Tree1(int nt)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
100 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
101 vector<int> blank;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
102 root = 1;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
103 for (int i = 0; i < nt; i++)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
104 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
105 parent.push_back(root);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
106 children.push_back(blank);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
107 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
108 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
109
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
110 ~Tree1()
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
111 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
112 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
113
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
114 void add_edge(int x, int y)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
115 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
116 parent[y] = x;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
117 children[x].push_back(y);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
118 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
119
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
120 int msca(int x, int y)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
121 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
122 //return most specific common ancestor of nodes x and y
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
123 set<int> ancestors;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
124 int z;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
125
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
126 ancestors.insert(root);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
127 z = x;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
128 //cout << x << " " << y << " " << z << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
129 while (z != root)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
130 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
131 ancestors.insert(z);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
132 z = get_parent(z);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
133 //cout << x << " " << y << " " << z << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
134 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
135 z = y;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
136 if (ancestors.find(y) != ancestors.end())
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
137 return x; //x is descendent of y
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
138 while (ancestors.find(z) == ancestors.end())
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
139 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
140 z = get_parent(z);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
141 if (z==x) //y is descendent of x
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
142 return y;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
143 //cout << x << " " << y << " " << z << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
144 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
145 return z; //common ancesctor
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
146 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
147
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
148 int get_parent(int x)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
149 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
150 if (x != root && x > 0)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
151 return parent[x];
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
152 else
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
153 return root;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
154 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
155
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
156 };
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
157
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
158 Tree1 *taxonomy;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
159
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
160 class Hashtable
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
161 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
162 public:
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
163
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
164 itype size;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
165
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
166 struct Cell
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
167 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
168 ktype key;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
169 vtype value;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
170 int org;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
171 ptype position;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
172 bool fstrand;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
173 } *pHash;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
174
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
175 Hashtable()
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
176 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
177 size = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
178 pHash = new Cell[MAXHASH];
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
179 HashClear();
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
180 //for (int i = 1; i<MAXREPROBE; i++)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
181 // reprobeVal[i] = reprobeVal[i - 1] + i; //i*i/2+i/2
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
182 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
183
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
184 ~Hashtable()
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
185 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
186 delete[] pHash;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
187 size = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
188 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
189
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
190 // from code.google.com/p/smhasher/wiki/MurmurHash3
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
191 itype integerHash(ui64 k)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
192 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
193 k ^= k >> 33;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
194 k *= 0xff51afd7ed558ccd;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
195 k ^= k >> 33;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
196 k *= 0xc4ceb9fe1a85ec53;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
197 k ^= k >> 33;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
198 return (itype) k;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
199 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
200
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
201 void HashClear()
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
202 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
203 memset(pHash, 0, MAXHASH * sizeof(Cell));
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
204 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
205
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
206 int getHash(ktype key, otype &org, ptype &position, bool &fstrand)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
207 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
208 //0 target=bad
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
209 //returns target if probe present in table
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
210 //sets position and fstrand
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
211 itype index, reprobe, i, hash;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
212
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
213 position = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
214 hash = integerHash(key);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
215 reprobe = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
216 i = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
217 do
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
218 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
219 index = ((hash + reprobe) & (MAXHASH-1)); //for power of 2 size
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
220 reprobe += ++i;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
221 if (pHash[index].value == 0)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
222 { //empty cell
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
223 return 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
224 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
225 else if (pHash[index].key == key)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
226 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
227 position = pHash[index].position;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
228 fstrand = pHash[index].fstrand;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
229 org = pHash[index].org;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
230 return (pHash[index].value);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
231 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
232 } while (reprobe < MAXHASH && i < MAXREPROBE);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
233
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
234 return 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
235 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
236
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
237 void add_kmer(ktype key, vtype target, otype org, ptype position, bool fstrand)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
238 { //add kmer to table
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
239 itype index, reprobe, i, hash;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
240 bool notdone=true;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
241
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
242 hash = integerHash(key);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
243 reprobe = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
244 i = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
245 do
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
246 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
247 index = ((hash + reprobe) & (MAXHASH-1)); //for power of 2 size
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
248 reprobe += ++i;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
249
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
250 if (pHash[index].value == 0)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
251 { //empty cell
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
252 notdone = false;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
253 pHash[index].key = key;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
254 pHash[index].value = target;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
255 pHash[index].org = org;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
256 pHash[index].position = position;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
257 pHash[index].fstrand = fstrand;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
258 if (++size > MAXHASH - 32)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
259 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
260 cout << "out of memory in table " << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
261 exit(1);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
262 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
263 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
264 } while (notdone);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
265 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
266
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
267 }; //class Hashtable
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
268 Hashtable *ht;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
269
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
270 string process_seq(string seq)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
271 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
272 //format nucleotide sequence to all caps ACTG, all else is N
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
273 string seq2;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
274 string::iterator it1;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
275 char base;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
276
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
277 seq2 = "";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
278 for (it1 = seq.begin(); it1 != seq.end(); ++it1)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
279 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
280 base = *it1;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
281 switch (base)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
282 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
283 case 'a':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
284 seq2 += "A";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
285 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
286 case 'c':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
287 seq2 += "C";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
288 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
289 case 'g':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
290 seq2 += "G";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
291 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
292 case 't':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
293 seq2 += "T";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
294 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
295 case 'A':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
296 seq2 += "A";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
297 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
298 case 'C':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
299 seq2 += "C";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
300 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
301 case 'G':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
302 seq2 += "G";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
303 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
304 case 'T':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
305 seq2 += "T";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
306 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
307 default:
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
308 seq2 += "N";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
309 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
310 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
311 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
312
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
313 return seq2;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
314 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
315
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
316 string load_data3(gzFile in)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
317 { //read gzip fasta file and extract nt sequence
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
318 char buf[BUFLEN];
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
319 char* offset = buf;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
320 string line;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
321 string sequence = "";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
322
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
323 while (true)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
324 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
325 int err, len = sizeof(buf)-(offset-buf);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
326 if (len == 0) error("Buffer to small for input line lengths");
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
327
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
328 len = gzread(in, offset, len);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
329
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
330 if (len == 0) break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
331 if (len < 0) error(gzerror(in, &err));
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
332
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
333 char* cur = buf;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
334 char* end = offset+len;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
335
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
336 for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
337 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
338 line = string(cur, eol);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
339 if (line.back() == '\r')
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
340 line.pop_back();
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
341 if (line.length() > 0)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
342 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
343 if (line.at(0) == '>')
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
344 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
345 sequence += "N"; //contig separator
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
346 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
347 else
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
348 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
349 sequence += process_seq(line);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
350 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
351 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
352 //cout << line << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
353 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
354
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
355 // any trailing data in [eol, end) now is a partial line
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
356 offset = copy(cur, end, buf);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
357 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
358
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
359 // trailing data without eol
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
360 line = string(buf, offset);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
361 if (gzclose(in) != Z_OK) error("failed gzclose");
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
362
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
363 return sequence;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
364 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
365
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
366 int *tableM, *tableI, *tableD;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
367
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
368 void tableinit()
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
369 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
370 int i,j;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
371 const int NINF = -2000000000;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
372
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
373 tableM[0] = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
374 tableI[0] = NINF;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
375 tableD[0] = NINF;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
376 for (i = 1; i<MAXALIGN; i++) //initialize upper row
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
377 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
378 j = (0 > i - beam - 1) ? 0 : i - beam - 1;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
379 tableI[j*MAXALIGN + i] = NINF;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
380 if (INIGAPPEN)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
381 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
382 tableD[j*MAXALIGN + i] = -GAPO - (i - 1)*GAPX;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
383 tableM[j*MAXALIGN + i] = -GAPO - (i - 1)*GAPX;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
384 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
385 else
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
386 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
387 tableD[0 + i] = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
388 tableM[0 + i] = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
389 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
390 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
391 for (j = 1; j<MAXALIGN; j++) //init left column
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
392 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
393 i = (0 > j - beam - 1) ? 0 : j - beam - 1;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
394 tableD[j*MAXALIGN + i] = NINF;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
395 if (INIGAPPEN)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
396 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
397 tableI[j*MAXALIGN + i] = -GAPO - (j - 1)*GAPX;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
398 tableM[j*MAXALIGN + i] = -GAPO - (j - 1)*GAPX;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
399 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
400 else
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
401 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
402 tableI[j*MAXALIGN + i] = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
403 tableM[j*MAXALIGN + i] = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
404 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
405 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
406 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
407
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
408 int align(string dna1, string dna2)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
409 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
410 //smith waterman alignment with beam
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
411 //returns alignment score
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
412 int b1, b2, len1, len2, tindex;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
413 int b,bb,c,cc;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
414 int i, j, maxval, startx, starty;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
415 int i1, i2;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
416
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
417 len1 = dna1.length() + 1;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
418 len2 = dna2.length() + 1;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
419
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
420 j=1;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
421 do
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
422 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
423 i = (j <= beam) ? j : j - beam;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
424 i2 = (j + beam > len1) ? len1 : j + beam;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
425 do
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
426 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
427 maxval = max(max(tableM[(j-1)*MAXALIGN + i-1], tableI[(j-1)*MAXALIGN + i-1]), tableD[(j-1)*MAXALIGN + i-1]);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
428 //copy/replace
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
429 if (dna1[i-1] != dna2[j-1])
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
430 maxval += MISMATCH;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
431 else maxval += MATCH;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
432 tableM[j*MAXALIGN + i] = maxval;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
433
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
434 b=tableM[(j-1)*MAXALIGN + i] - GAPO; //new gap
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
435 bb = tableI[(j-1)*MAXALIGN + i] -GAPX; //extend
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
436 tableI[j*MAXALIGN + i] = (bb<b) ? b : bb;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
437
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
438 c=tableM[j*MAXALIGN + i-1] - GAPO; //new gap
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
439 cc = tableD[j*MAXALIGN + i-1] -GAPX; //extend
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
440 tableD[j*MAXALIGN + i] = (cc<c) ? c : cc;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
441 } while(++i < i2);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
442 } while(++j < len2);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
443
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
444 i = len1 - 1;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
445 j = len2 - 1; //process last entry
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
446
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
447 maxval = max(max(tableM[j*MAXALIGN + i], tableI[j*MAXALIGN + i]), tableD[j*MAXALIGN + i]);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
448
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
449 return maxval;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
450
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
451 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
452
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
453 int process_read(string &sequence, string acc, int start, int stop)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
454 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
455 //returns id of read, 0 = no match, updates gcount, ucount
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
456 //will do align for first minalign of each target
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
457 //will save first SAVENUM reads of each target
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
458
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
459 char base;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
460 int it1, it2;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
461 string kmer, sequence2, dna1, dna1r, dna2, fname, finalkeystr;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
462 ktype keyF, keyR, key;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
463 int i, cpos, minscr, gpos;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
464 bool fstrand1, fstrand2;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
465 int readlength = stop - start + 1, readlen2;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
466 int st2, scr, stlen2;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
467 ktype target, final_targ=0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
468 ptype position;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
469 otype org;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
470 int maxct, totct, rtarg=0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
471 bool reject = false;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
472
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
473 cpos = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
474 keyF = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
475 keyR = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
476 gpos = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
477 minscr = 5 * sequence.length() / 2;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
478 for (it1 = start; it1 <= stop; ++it1)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
479 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
480 base = sequence.at(it1);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
481 switch (base)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
482 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
483 case 'A':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
484 keyF = (keyF << 2) & mask;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
485 keyR = (keyR >> 2) | hiT;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
486 cpos++;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
487 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
488 case 'C':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
489 keyF = ((keyF << 2) & mask) | 1;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
490 keyR = (keyR >> 2) | hiG;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
491 cpos++;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
492 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
493 case 'G':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
494 keyF = ((keyF << 2) & mask) | 2;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
495 keyR = (keyR >> 2) | hiC;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
496 cpos++;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
497 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
498 case 'T':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
499 keyF = ((keyF << 2) & mask) | 3;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
500 keyR = keyR >> 2;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
501 cpos++;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
502 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
503 case 'a':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
504 keyF = (keyF << 2) & mask;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
505 keyR = (keyR >> 2) | hiT;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
506 cpos++;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
507 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
508 case 'c':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
509 keyF = ((keyF << 2) & mask) | 1;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
510 keyR = (keyR >> 2) | hiG;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
511 cpos++;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
512 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
513 case 'g':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
514 keyF = ((keyF << 2) & mask) | 2;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
515 keyR = (keyR >> 2) | hiC;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
516 cpos++;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
517 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
518 case 't':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
519 keyF = ((keyF << 2) & mask) | 3;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
520 keyR = keyR >> 2;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
521 cpos++;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
522 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
523 default:
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
524 cpos = 0; //ambiguous character
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
525 keyF = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
526 keyR = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
527 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
528 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
529 gpos++;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
530 if (cpos == KSIZE)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
531 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
532 key = keyF < keyR ? keyF : keyR;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
533 target = ht->getHash(key, org, position, fstrand2);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
534 //position is ending base, 1-based
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
535 if (target > 0 && gcount[target] < minalign && target != final_targ)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
536 { //do alignment on initial reads of a species
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
537 fname = fdir + accession[org] + ".fasta.gz";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
538 sequence2 = load_data3(gzopen(fname.c_str(), "rb"));
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
539 stlen2 = sequence2.length();
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
540 fstrand1 = (keyF < keyR);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
541 readlen2 = readlength;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
542 if (fstrand1 == fstrand2)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
543 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
544 st2 = position - it1;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
545 if (st2 < 0) st2 = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
546 if (st2 + readlen2 > stlen2) readlen2 = stlen2 - st2;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
547 dna1 = sequence.substr(start, readlength);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
548 dna2 = sequence2.substr(st2, readlen2);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
549 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
550 else
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
551 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
552 //reverse complement dna1
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
553 st2 = position - KSIZE + 2 + it1 - readlength;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
554 if (st2 < 0) st2 = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
555 if (st2 + readlen2 > stlen2) readlen2 = stlen2 - st2;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
556 //dna1 = sequence.substr(start, readlength);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
557 dna1 = "";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
558 for (it2 = stop; it2 >= start; --it2)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
559 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
560 base = sequence.at(it2);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
561 switch (base)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
562 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
563 case 'A':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
564 dna1.push_back('T');
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
565 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
566 case 'C':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
567 dna1.push_back('G');
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
568 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
569 case 'G':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
570 dna1.push_back('C');
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
571 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
572 case 'T':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
573 dna1.push_back('A');
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
574 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
575 default:
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
576 dna1.push_back('N');
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
577 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
578 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
579 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
580 dna2 = sequence2.substr(st2, readlen2);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
581 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
582 scr = align(dna1, dna2);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
583 if (scr < minscr)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
584 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
585 rtarg = target;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
586 target = 0; //fail
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
587 reject = true;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
588 //cout << rtarg << ": failed " << scr << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
589 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
590 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
591 if (final_targ > 0 && target > 0)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
592 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
593 final_targ = taxonomy->msca(target, final_targ);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
594 finalkeystr = int2str(key);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
595 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
596 else if (target > 0)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
597 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
598 final_targ = target;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
599 finalkeystr = int2str(key);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
600 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
601 if (target > 1)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
602 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
603 if (kmer_seen.find(key) == kmer_seen.end())
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
604 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
605 ucount[target]++;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
606 kmer_seen.insert(key);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
607 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
608 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
609 cpos--;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
610 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
611 } //it
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
612 /*
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
613 if (final_targ > 1 && gcount[final_targ] < SAVENUM && save_target == 0)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
614 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
615 outread << ">" << final_targ << ":" << acc << endl << sequence.substr(start, stop-start+1) << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
616 outread << "[" << finalkeystr << "]" << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
617 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
618 if (final_targ > 1 && final_targ == save_target)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
619 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
620 outread << ">" << final_targ << ":" << acc << endl << sequence.substr(start, stop-start+1) << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
621 } */
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
622 gcount[final_targ]++;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
623 tct++;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
624
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
625 return final_targ;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
626 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
627
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
628
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
629 void process_kmer(string sequence, vtype target, otype org, ptype position, bool fstrand)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
630 { //adds kmer from datafile into hashtable
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
631 char base;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
632 string::iterator it1;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
633 int cpos, gpos;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
634 ui64 keyF;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
635
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
636 cpos = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
637 gpos = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
638 keyF = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
639
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
640 for (it1 = sequence.begin(); it1 != sequence.end(); ++it1)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
641 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
642 base = *it1;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
643 switch (base)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
644 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
645 case 'A':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
646 keyF = (keyF << 2) & mask;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
647 cpos++;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
648 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
649 case 'C':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
650 keyF = ((keyF << 2) & mask) | 1;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
651 cpos++;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
652 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
653 case 'G':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
654 keyF = ((keyF << 2) & mask) | 2;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
655 cpos++;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
656 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
657 case 'T':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
658 keyF = ((keyF << 2) & mask) | 3;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
659 cpos++;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
660 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
661 default:
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
662 cpos = 0; //ambiguous character
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
663 keyF = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
664 break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
665 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
666 if (cpos == KSIZE)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
667 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
668 ht->add_kmer(keyF, target, org, position, fstrand);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
669 cpos--;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
670 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
671 gpos++;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
672 } //it
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
673
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
674 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
675
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
676 void process_qual(string acc, string &seq, string &qual)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
677 { //uses PHRED 33 scores
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
678 //to trim read and pass it to process_read
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
679 const int cutoff_qual = 17;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
680 const char cutoff_char = 32 + cutoff_qual;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
681 int start, stop, target, i;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
682 int window_size = 4;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
683 int window_val, window_cut = cutoff_qual * window_size;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
684 string trim_seq;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
685
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
686 stop = seq.length()-1;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
687 start = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
688 //trim all low qual bases from end
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
689 while (qual.at(start) < cutoff_char && start < stop)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
690 start++;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
691 while (qual.at(stop) < cutoff_char && stop > start)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
692 stop--;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
693 //trim by window
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
694 if (start < stop - window_size)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
695 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
696 window_val = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
697 for (i=0; i<window_size; i++)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
698 window_val += qual.at(start+i) - 32;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
699 while (window_val < window_cut && start < stop - window_size)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
700 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
701 window_val += qual.at(start+window_size) - qual.at(start);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
702 start++;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
703 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
704 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
705 if (start < stop - window_size)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
706 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
707 window_val = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
708 for (i=0; i<window_size; i++)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
709 window_val += qual.at(stop-i) - 32;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
710 while (window_val < window_cut && start < stop - window_size)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
711 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
712 window_val += qual.at(stop-window_size) - qual.at(stop);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
713 stop--;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
714 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
715 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
716 //trim_seq = seq.substr(start,stop-start+1);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
717 if (stop - start >= KSIZE)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
718 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
719 target = process_read(seq,acc,start,stop);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
720 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
721
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
722 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
723
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
724 void process_fqgz(gzFile in)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
725 { //read gzip fastq file and extract sequence and quality lines
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
726 //passes them to process_qual
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
727 char buf[BUFLEN];
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
728 char* offset = buf;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
729 string line, seq, acc;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
730 int mod4=0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
731
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
732 while (true)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
733 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
734 int err, len = sizeof(buf)-(offset-buf);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
735 if (len == 0) error("Buffer to small for input line lengths");
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
736
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
737 len = gzread(in, offset, len);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
738
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
739 if (len == 0) break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
740 if (len < 0) error(gzerror(in, &err));
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
741
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
742 char* cur = buf;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
743 char* end = offset+len;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
744
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
745 for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
746 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
747 line = string(cur, eol);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
748 if (line.back() == '\r') //windows
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
749 line.pop_back();
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
750 if (line.length() > 0)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
751 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
752 if (mod4 == 1)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
753 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
754 seq = line;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
755 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
756 else if (mod4 == 0)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
757 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
758 acc = line;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
759 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
760 else if (mod4 == 3)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
761 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
762 process_qual(acc, seq, line);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
763 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
764 mod4 = (mod4 + 1) % 4;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
765
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
766 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
767 //cout << line << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
768 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
769
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
770 // any trailing data in [eol, end) now is a partial line
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
771 offset = copy(cur, end, buf);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
772 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
773
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
774 // trailing data without eol
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
775 line = string(buf, offset);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
776
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
777 if (gzclose(in) != Z_OK) error("failed gzclose");
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
778 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
779
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
780 void process_fagz(gzFile in)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
781 { //read gzip fasta file and extract sequence
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
782 //passes read to process_read
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
783 char buf[BUFLEN];
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
784 char* offset = buf;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
785 string line;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
786 string sequence = "", acc = "";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
787 int c1, c2, target;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
788
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
789 cout << "true" << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
790
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
791 while (true)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
792 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
793 int err, len = sizeof(buf)-(offset-buf);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
794 if (len == 0) error("Buffer to small for input line lengths");
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
795
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
796 len = gzread(in, offset, len);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
797
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
798 if (len == 0) break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
799 if (len < 0) error(gzerror(in, &err));
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
800
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
801 char* cur = buf;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
802 char* end = offset+len;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
803
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
804 for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
805 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
806 line = string(cur, eol);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
807 if (line.back() == '\r')
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
808 line.pop_back();
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
809 if (line.length() > 0)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
810 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
811 if (line.at(0) == '>')
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
812 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
813 if (sequence.length() > KSIZE)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
814 { // process previous sequence
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
815 target = process_read(sequence, acc, 0, sequence.length()-1);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
816 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
817 sequence = "";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
818 acc = line.substr(1,line.length()-1);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
819 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
820 else
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
821 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
822 sequence += line;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
823 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
824 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
825 //cout << line << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
826 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
827
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
828 // any trailing data in [eol, end) now is a partial line
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
829 offset = copy(cur, end, buf);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
830 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
831 if (sequence.length() > KSIZE)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
832 { //last one
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
833 target = process_read(sequence, acc, 0, sequence.length()-1);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
834 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
835 // trailing data without eol
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
836 line = string(buf, offset);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
837 if (gzclose(in) != Z_OK) error("failed gzclose");
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
838
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
839 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
840
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
841 int process_gzkmers(gzFile in)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
842 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
843
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
844 char buf[BUFLEN];
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
845 char* offset = buf;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
846 string line;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
847 int target, count, org, position, tct=0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
848 string sequence;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
849 char strand;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
850 bool fstrand;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
851
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
852 while (true)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
853 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
854 int err, len = sizeof(buf)-(offset-buf);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
855 if (len == 0) error("Buffer to small for input line lengths");
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
856
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
857 len = gzread(in, offset, len);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
858
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
859 if (len == 0) break;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
860 if (len < 0) error(gzerror(in, &err));
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
861
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
862 char* cur = buf;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
863 char* end = offset+len;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
864
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
865 for (char* eol; (cur<end) && (eol = find(cur, end, '\n')) < end; cur = eol + 1)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
866 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
867 line = string(cur, eol);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
868 if (line.back() == '\r')
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
869 line.pop_back();
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
870 if (line.length() > 0)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
871 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
872 replace(line.begin(), line.end(), ',', ' ');
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
873 istringstream ss(line);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
874 if (ss >> sequence >> target >> org >> position >> strand >> count)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
875 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
876 fstrand = (strand == 'F');
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
877 process_kmer(sequence, target, org, position, fstrand);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
878 tct++;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
879 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
880 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
881 //cout << line << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
882 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
883
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
884 // any trailing data in [eol, end) now is a partial line
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
885 offset = copy(cur, end, buf);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
886 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
887 // trailing data without eol
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
888 line = string(buf, offset);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
889 if (gzclose(in) != Z_OK) error("failed gzclose");
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
890
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
891 return tct;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
892
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
893 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
894
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
895 void process_fq(string rname)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
896 { //read fastq file and extract sequence and quality lines
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
897 //passes them to process_qual
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
898 string line, seq, acc= "", lseq;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
899 ifstream fin;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
900 int target;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
901 int mod4=0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
902
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
903 fin.open(rname);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
904 if (fin)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
905 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
906 while (getline(fin, line))
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
907 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
908 if (line.back() == '\r')
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
909 line.pop_back();
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
910 stringstream linestream(line);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
911 linestream >> lseq;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
912 if (lseq.length() > 0)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
913 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
914 if (mod4 == 1)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
915 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
916 seq = lseq;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
917 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
918 else if (mod4 == 0)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
919 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
920 acc = lseq;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
921 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
922 else if (mod4 == 3)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
923 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
924 process_qual(acc, seq, lseq);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
925 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
926 mod4 = (mod4 + 1) % 4;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
927 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
928 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
929 fin.close();
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
930 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
931 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
932
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
933 void process_fa(string rname)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
934 { //read unzipped fasta file and extract sequence
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
935 //passes read to process_read
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
936 string sequence = "", line, lseq, acc= "";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
937 ifstream fin;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
938 int target;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
939
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
940 fin.open(rname);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
941 if (fin)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
942 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
943 while (getline(fin, line))
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
944 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
945 if (line.back() == '\r')
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
946 line.pop_back();
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
947 stringstream linestream(line);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
948 linestream >> lseq;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
949 if (lseq[0] == '>')
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
950 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
951 if (sequence.length() > KSIZE)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
952 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
953 target = process_read(sequence, acc, 0, sequence.length()-1);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
954 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
955 sequence = "";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
956 acc = lseq.substr(1,lseq.length()-1);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
957 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
958 else
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
959 sequence += lseq;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
960 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
961 fin.close();
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
962 if (sequence.length() > KSIZE)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
963 { //last one
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
964 target = process_read(sequence, acc, 0, sequence.length()-1);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
965 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
966 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
967 else
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
968 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
969 cout << "nark " << rname << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
970 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
971 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
972
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
973 int main(int argc, char* argv[])
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
974 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
975 int i,j,num_jobs;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
976 vtype target;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
977 ptype position;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
978 char strand;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
979 bool fstrand;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
980 string header, jstr, fname, line, r1name, r2name = "", oname1, oname2;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
981 int refi, targi, strni, count, org, num_targ;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
982 ifstream fin;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
983 string genus, species, acc, sequence, lseq, remainder;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
984 vector<vector<string>> fnames;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
985 vector<string> jnames;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
986 vector<int> jcounts;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
987 vector<string>::iterator it2;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
988 string trname, s1, s2, s3,s4;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
989 string argst, wdir = "", dname = "", jname = "", jfile = "", jdir = "";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
990 string iname = "mitochondria_data.txt"; //text file of groups for probe design
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
991 string tname = "mitochondria_tree.txt"; //edges in taxonomy tree
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
992 string pname = "mitochondria_probes.txt.gz";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
993 if (argc > 1)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
994 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
995 for (i = 1; i < argc; i++)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
996 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
997 argst = argv[i];
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
998 if (argst == "-wdir")
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
999 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1000 wdir = argv[i+1];
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1001 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1002 if (argst == "-f1")
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1003 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1004 r1name =argv[i+1];
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1005 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1006 if (argst == "-f2")
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1007 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1008 r2name =argv[i+1];
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1009 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1010 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1011 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1012 iname = wdir + iname;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1013 tname = wdir + tname;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1014 pname = wdir + pname;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1015
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1016 ht = new Hashtable();
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1017 //load strain list
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1018
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1019
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1020 cout << "r1 " << r1name << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1021 cout << "r2 " << r2name << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1022 cout << "wd " << wdir << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1023 fin.open(iname);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1024 if (fin)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1025 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1026 while (getline(fin, line))
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1027 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1028 if (line.back() == '\r')
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1029 line.pop_back();
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1030 if (line.length() > 1)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1031 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1032 stringstream linestream(line);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1033 linestream >> targi >> acc;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1034 accession.push_back(acc);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1035 targno.push_back(targi);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1036 if (targi > num_targ)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1037 num_targ = targi;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1038 num_orgs++;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1039 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1040 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1041 fin.close();
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1042 cout << num_orgs << " strains" << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1043 num_targ++;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1044 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1045 taxonomy = new Tree1(num_targ);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1046
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1047 //load taxonomy tree
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1048 fin.open(tname);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1049 if (fin)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1050 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1051 while (getline(fin, line))
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1052 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1053 if (line.back() == '\r')
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1054 line.pop_back();
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1055 stringstream linestream(line);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1056 linestream >> i >> j;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1057 taxonomy->add_edge(i,j);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1058 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1059 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1060 else exit(1);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1061 fin.close();
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1062 cout << "tree loaded" << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1063
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1064 //load kmer database
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1065 tct = process_gzkmers(gzopen(pname.c_str(), "rb"));
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1066 cout << tct << " kmers loaded" << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1067 if (tct < 2) exit(1);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1068
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1069 //process reads
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1070 gcount.resize(num_targ,0);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1071 ucount.resize(num_targ,0);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1072 fill(gcount.begin(), gcount.end(), 0);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1073 fill(ucount.begin(), ucount.end(), 0);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1074 kmer_seen.clear();
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1075 tct = 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1076 s1 = ".fastq.gz";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1077 s2 = ".fasta";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1078 s3 = ".fastq";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1079 s4 = ".fasta.gz";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1080 int slen = r1name.length();
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1081 char sch = r1name.at(slen-1);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1082 cout << slen << " : " << sch << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1083 if (equal(s1.rbegin(), s1.rend(), r1name.rbegin()))
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1084 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1085 process_fqgz(gzopen(r1name.c_str(), "rb"));
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1086 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1087 else if (equal(s2.rbegin(), s2.rend(), r1name.rbegin()))
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1088 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1089 process_fa(r1name.c_str());
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1090 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1091 else if (equal(s3.rbegin(), s3.rend(), r1name.rbegin()))
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1092 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1093 process_fq(r1name.c_str());
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1094 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1095 else if (equal(s4.rbegin(), s4.rend(), r1name.rbegin()))
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1096 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1097 process_fagz(gzopen(r1name.c_str(), "rb"));
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1098 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1099 cout << tct << " reads loaded" << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1100 if (r2name.length() > 1 && r2name != "none")
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1101 { //second file?
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1102 if (equal(s1.rbegin(), s1.rend(), r2name.rbegin()))
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1103 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1104 process_fqgz(gzopen(r2name.c_str(), "rb"));
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1105 cout << tct << " reads loaded" << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1106 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1107 else if (equal(s2.rbegin(), s2.rend(), r2name.rbegin()))
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1108 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1109 process_fa(r2name.c_str());
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1110 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1111 else if (equal(s3.rbegin(), s3.rend(), r2name.rbegin()))
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1112 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1113 process_fq(r2name.c_str());
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1114 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1115 else if (equal(s4.rbegin(), s4.rend(), r2name.rbegin()))
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1116 {
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1117 process_fagz(gzopen(r2name.c_str(), "rb"));
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1118 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1119 cout << tct << " reads loaded" << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1120 }
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1121
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1122 oname2 = wdir + "result.txt";
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1123 ofstream out2(oname2);
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1124 for (i=0; i<num_targ; i++)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1125 out2 << i << "," << gcount[i] << "," << ucount[i] << endl;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1126 out2.close();
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1127
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1128 delete taxonomy;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1129 delete ht;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1130
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1131 return 0;
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1132 }