annotate SMART/Java/Python/Cpp/inputFileParser.cpp @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
18
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
1 #include <algorithm>
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
2 #include <set>
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
3 #include <vector>
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
4 #include <sstream>
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
5 #include "inputFileParser.hpp"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
6
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
7 static const unsigned int MAX_SIZE = 10000;
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
8
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
9 InputFileParser::InputFileParser(string inputFileName, string outputFileName): inputFileName(inputFileName), outputFileName(outputFileName) {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
10 outputFilePrefix = outputFileName.substr(0, outputFileName.find_last_of('.'));
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
11 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
12
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
13
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
14 void InputFileParser::parse() {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
15 ifstream file;
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
16 string line;
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
17 file.open(inputFileName.c_str());
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
18 if (file.is_open()) {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
19 GenomicInterval genomicInterval;
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
20 while (file.good()) {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
21 getline(file, line);
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
22 if (line.size() > 0) {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
23 genomicInterval.parseFromLine(line);
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
24 addToList(genomicInterval);
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
25 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
26 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
27 syncFiles();
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
28 file.close();
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
29 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
30 else {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
31 cout << "Unable to open file" << inputFileName;
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
32 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
33 merge();
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
34 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
35
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
36
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
37 void InputFileParser::addToList(GenomicInterval &genomicInterval) {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
38 Interval interval (genomicInterval);
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
39 IntervalsType *intervals;
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
40 SortedIntervalsTypes::iterator iter = sortedIntervals.find(genomicInterval.chromosome);
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
41 if (iter == sortedIntervals.end()) {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
42 intervals = new IntervalsType;
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
43 sortedIntervals[genomicInterval.chromosome] = intervals;
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
44 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
45 else {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
46 intervals = iter->second;
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
47 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
48 //cout << "pushing " << interval.start << "-" << interval.end << endl;
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
49 intervals->push_back(&interval);
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
50 if (intervals->size() >= MAX_SIZE) {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
51 writeTmpFile(genomicInterval.chromosome);
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
52 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
53 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
54
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
55
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
56 void InputFileParser::writeTmpFile(string &chromosome) {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
57 SortedIntervalsTypes::iterator iter = sortedIntervals.find(chromosome);
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
58 IntervalsType *intervals = iter->second;
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
59
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
60 sort(intervals->begin(), intervals->end());
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
61 string fileName = getTmpName(chromosome);
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
62 ofstream file(fileName.c_str(), ios::out | ios::binary);
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
63 for (unsigned i = 0; i < intervals->size(); i++) {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
64 cout << "writing " << (*intervals)[i]->start << "-" << (*intervals)[i]->end << endl;
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
65 (*intervals)[i]->writeBinary(file);
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
66 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
67 file.close();
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
68 ++counter[chromosome];
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
69
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
70 sortedIntervals[chromosome] = NULL;
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
71 delete intervals;
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
72 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
73
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
74
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
75 void InputFileParser::syncFiles() {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
76 for (SortedIntervalsTypes::iterator iter = sortedIntervals.begin(); iter != sortedIntervals.end(); iter++) {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
77 string chromosome = iter->first;
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
78 writeTmpFile(chromosome);
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
79 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
80 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
81
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
82
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
83 string InputFileParser::getTmpName(const string &chromosome, unsigned int i) {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
84 stringstream s;
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
85 s << outputFilePrefix << outputFilePrefix << "_tmp_" << chromosome << "_" << i << ".tmp";
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
86 return s.str();
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
87 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
88
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
89
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
90 string InputFileParser::getTmpName(const string &chromosome) {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
91 return getTmpName(chromosome, counter[chromosome]);
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
92 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
93
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
94
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
95 void InputFileParser::merge() {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
96 ofstream outputFile(outputFileName.c_str());
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
97 for (SortedIntervalsTypes::iterator iter = sortedIntervals.begin(); iter != sortedIntervals.end(); iter++) {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
98 merge(iter->first, outputFile);
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
99 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
100 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
101
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
102
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
103 void InputFileParser::merge(const string &chromosome, ofstream &outputFile) {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
104 ifstream *files = new ifstream[counter[chromosome]];
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
105 set<NumberIntervalType *> intervals;
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
106 for (unsigned int i = 0; i < counter[chromosome]; i++) {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
107 string fileName = getTmpName(chromosome, i);
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
108 files[i].open(fileName.c_str());
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
109 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
110 for (unsigned int i = 0; i < counter[chromosome]; i++) {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
111 if (files[i].good()) {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
112 Interval interval;
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
113 interval.parseBinary(files[i]);
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
114 NumberIntervalType ni = NumberIntervalType(&interval, i);
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
115 intervals.insert(&ni);
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
116 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
117 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
118 while (! intervals.empty()) {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
119 NumberIntervalType *ni = *intervals.begin();
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
120 GenomicInterval gi(chromosome, ni->first->start, ni->first->end);
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
121 outputFile << gi;
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
122 intervals.erase(intervals.begin());
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
123 if (files[ni->second].good()) {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
124 Interval interval;
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
125 interval.parseBinary(files[ni->second]);
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
126 NumberIntervalType nni = NumberIntervalType(&interval, ni->second);
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
127 intervals.insert(&nni);
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
128 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
129 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
130 for (unsigned int i = 0; i < counter[chromosome]; i++) {
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
131 files[i].close();
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
132 }
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
133 delete[] files;
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
134 }