18
|
1 #include <algorithm>
|
|
2 #include <set>
|
|
3 #include <vector>
|
|
4 #include <sstream>
|
|
5 #include "inputFileParser.hpp"
|
|
6
|
|
// Number of intervals buffered in memory for one chromosome before the buffer
// is flushed to a temporary file (see the size check in addToList).
namespace {
const unsigned int MAX_SIZE = 10000;
}
|
|
8
|
|
9 InputFileParser::InputFileParser(string inputFileName, string outputFileName): inputFileName(inputFileName), outputFileName(outputFileName) {
|
|
10 outputFilePrefix = outputFileName.substr(0, outputFileName.find_last_of('.'));
|
|
11 }
|
|
12
|
|
13
|
|
14 void InputFileParser::parse() {
|
|
15 ifstream file;
|
|
16 string line;
|
|
17 file.open(inputFileName.c_str());
|
|
18 if (file.is_open()) {
|
|
19 GenomicInterval genomicInterval;
|
|
20 while (file.good()) {
|
|
21 getline(file, line);
|
|
22 if (line.size() > 0) {
|
|
23 genomicInterval.parseFromLine(line);
|
|
24 addToList(genomicInterval);
|
|
25 }
|
|
26 }
|
|
27 syncFiles();
|
|
28 file.close();
|
|
29 }
|
|
30 else {
|
|
31 cout << "Unable to open file" << inputFileName;
|
|
32 }
|
|
33 merge();
|
|
34 }
|
|
35
|
|
36
|
|
37 void InputFileParser::addToList(GenomicInterval &genomicInterval) {
|
|
38 Interval interval (genomicInterval);
|
|
39 IntervalsType *intervals;
|
|
40 SortedIntervalsTypes::iterator iter = sortedIntervals.find(genomicInterval.chromosome);
|
|
41 if (iter == sortedIntervals.end()) {
|
|
42 intervals = new IntervalsType;
|
|
43 sortedIntervals[genomicInterval.chromosome] = intervals;
|
|
44 }
|
|
45 else {
|
|
46 intervals = iter->second;
|
|
47 }
|
|
48 //cout << "pushing " << interval.start << "-" << interval.end << endl;
|
|
49 intervals->push_back(&interval);
|
|
50 if (intervals->size() >= MAX_SIZE) {
|
|
51 writeTmpFile(genomicInterval.chromosome);
|
|
52 }
|
|
53 }
|
|
54
|
|
55
|
|
56 void InputFileParser::writeTmpFile(string &chromosome) {
|
|
57 SortedIntervalsTypes::iterator iter = sortedIntervals.find(chromosome);
|
|
58 IntervalsType *intervals = iter->second;
|
|
59
|
|
60 sort(intervals->begin(), intervals->end());
|
|
61 string fileName = getTmpName(chromosome);
|
|
62 ofstream file(fileName.c_str(), ios::out | ios::binary);
|
|
63 for (unsigned i = 0; i < intervals->size(); i++) {
|
|
64 cout << "writing " << (*intervals)[i]->start << "-" << (*intervals)[i]->end << endl;
|
|
65 (*intervals)[i]->writeBinary(file);
|
|
66 }
|
|
67 file.close();
|
|
68 ++counter[chromosome];
|
|
69
|
|
70 sortedIntervals[chromosome] = NULL;
|
|
71 delete intervals;
|
|
72 }
|
|
73
|
|
74
|
|
75 void InputFileParser::syncFiles() {
|
|
76 for (SortedIntervalsTypes::iterator iter = sortedIntervals.begin(); iter != sortedIntervals.end(); iter++) {
|
|
77 string chromosome = iter->first;
|
|
78 writeTmpFile(chromosome);
|
|
79 }
|
|
80 }
|
|
81
|
|
82
|
|
83 string InputFileParser::getTmpName(const string &chromosome, unsigned int i) {
|
|
84 stringstream s;
|
|
85 s << outputFilePrefix << outputFilePrefix << "_tmp_" << chromosome << "_" << i << ".tmp";
|
|
86 return s.str();
|
|
87 }
|
|
88
|
|
89
|
|
90 string InputFileParser::getTmpName(const string &chromosome) {
|
|
91 return getTmpName(chromosome, counter[chromosome]);
|
|
92 }
|
|
93
|
|
94
|
|
95 void InputFileParser::merge() {
|
|
96 ofstream outputFile(outputFileName.c_str());
|
|
97 for (SortedIntervalsTypes::iterator iter = sortedIntervals.begin(); iter != sortedIntervals.end(); iter++) {
|
|
98 merge(iter->first, outputFile);
|
|
99 }
|
|
100 }
|
|
101
|
|
102
|
|
103 void InputFileParser::merge(const string &chromosome, ofstream &outputFile) {
|
|
104 ifstream *files = new ifstream[counter[chromosome]];
|
|
105 set<NumberIntervalType *> intervals;
|
|
106 for (unsigned int i = 0; i < counter[chromosome]; i++) {
|
|
107 string fileName = getTmpName(chromosome, i);
|
|
108 files[i].open(fileName.c_str());
|
|
109 }
|
|
110 for (unsigned int i = 0; i < counter[chromosome]; i++) {
|
|
111 if (files[i].good()) {
|
|
112 Interval interval;
|
|
113 interval.parseBinary(files[i]);
|
|
114 NumberIntervalType ni = NumberIntervalType(&interval, i);
|
|
115 intervals.insert(&ni);
|
|
116 }
|
|
117 }
|
|
118 while (! intervals.empty()) {
|
|
119 NumberIntervalType *ni = *intervals.begin();
|
|
120 GenomicInterval gi(chromosome, ni->first->start, ni->first->end);
|
|
121 outputFile << gi;
|
|
122 intervals.erase(intervals.begin());
|
|
123 if (files[ni->second].good()) {
|
|
124 Interval interval;
|
|
125 interval.parseBinary(files[ni->second]);
|
|
126 NumberIntervalType nni = NumberIntervalType(&interval, ni->second);
|
|
127 intervals.insert(&nni);
|
|
128 }
|
|
129 }
|
|
130 for (unsigned int i = 0; i < counter[chromosome]; i++) {
|
|
131 files[i].close();
|
|
132 }
|
|
133 delete[] files;
|
|
134 }
|