comparison SMART/Java/Python/Cpp/inputFileParser.cpp @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
comparison
equal deleted inserted replaced
17:b0e8584489e6 18:94ab73e8a190
1 #include <algorithm>
2 #include <set>
3 #include <vector>
4 #include <sstream>
5 #include "inputFileParser.hpp"
6
// Number of buffered intervals per chromosome at which addToList() flushes
// that chromosome's buffer to a temporary file through writeTmpFile().
7 static const unsigned int MAX_SIZE = 10000;
8
9 InputFileParser::InputFileParser(string inputFileName, string outputFileName): inputFileName(inputFileName), outputFileName(outputFileName) {
10 outputFilePrefix = outputFileName.substr(0, outputFileName.find_last_of('.'));
11 }
12
13
14 void InputFileParser::parse() {
15 ifstream file;
16 string line;
17 file.open(inputFileName.c_str());
18 if (file.is_open()) {
19 GenomicInterval genomicInterval;
20 while (file.good()) {
21 getline(file, line);
22 if (line.size() > 0) {
23 genomicInterval.parseFromLine(line);
24 addToList(genomicInterval);
25 }
26 }
27 syncFiles();
28 file.close();
29 }
30 else {
31 cout << "Unable to open file" << inputFileName;
32 }
33 merge();
34 }
35
36
37 void InputFileParser::addToList(GenomicInterval &genomicInterval) {
38 Interval interval (genomicInterval);
39 IntervalsType *intervals;
40 SortedIntervalsTypes::iterator iter = sortedIntervals.find(genomicInterval.chromosome);
41 if (iter == sortedIntervals.end()) {
42 intervals = new IntervalsType;
43 sortedIntervals[genomicInterval.chromosome] = intervals;
44 }
45 else {
46 intervals = iter->second;
47 }
48 //cout << "pushing " << interval.start << "-" << interval.end << endl;
49 intervals->push_back(&interval);
50 if (intervals->size() >= MAX_SIZE) {
51 writeTmpFile(genomicInterval.chromosome);
52 }
53 }
54
55
56 void InputFileParser::writeTmpFile(string &chromosome) {
57 SortedIntervalsTypes::iterator iter = sortedIntervals.find(chromosome);
58 IntervalsType *intervals = iter->second;
59
60 sort(intervals->begin(), intervals->end());
61 string fileName = getTmpName(chromosome);
62 ofstream file(fileName.c_str(), ios::out | ios::binary);
63 for (unsigned i = 0; i < intervals->size(); i++) {
64 cout << "writing " << (*intervals)[i]->start << "-" << (*intervals)[i]->end << endl;
65 (*intervals)[i]->writeBinary(file);
66 }
67 file.close();
68 ++counter[chromosome];
69
70 sortedIntervals[chromosome] = NULL;
71 delete intervals;
72 }
73
74
75 void InputFileParser::syncFiles() {
76 for (SortedIntervalsTypes::iterator iter = sortedIntervals.begin(); iter != sortedIntervals.end(); iter++) {
77 string chromosome = iter->first;
78 writeTmpFile(chromosome);
79 }
80 }
81
82
83 string InputFileParser::getTmpName(const string &chromosome, unsigned int i) {
84 stringstream s;
85 s << outputFilePrefix << outputFilePrefix << "_tmp_" << chromosome << "_" << i << ".tmp";
86 return s.str();
87 }
88
89
90 string InputFileParser::getTmpName(const string &chromosome) {
91 return getTmpName(chromosome, counter[chromosome]);
92 }
93
94
95 void InputFileParser::merge() {
96 ofstream outputFile(outputFileName.c_str());
97 for (SortedIntervalsTypes::iterator iter = sortedIntervals.begin(); iter != sortedIntervals.end(); iter++) {
98 merge(iter->first, outputFile);
99 }
100 }
101
102
103 void InputFileParser::merge(const string &chromosome, ofstream &outputFile) {
104 ifstream *files = new ifstream[counter[chromosome]];
105 set<NumberIntervalType *> intervals;
106 for (unsigned int i = 0; i < counter[chromosome]; i++) {
107 string fileName = getTmpName(chromosome, i);
108 files[i].open(fileName.c_str());
109 }
110 for (unsigned int i = 0; i < counter[chromosome]; i++) {
111 if (files[i].good()) {
112 Interval interval;
113 interval.parseBinary(files[i]);
114 NumberIntervalType ni = NumberIntervalType(&interval, i);
115 intervals.insert(&ni);
116 }
117 }
118 while (! intervals.empty()) {
119 NumberIntervalType *ni = *intervals.begin();
120 GenomicInterval gi(chromosome, ni->first->start, ni->first->end);
121 outputFile << gi;
122 intervals.erase(intervals.begin());
123 if (files[ni->second].good()) {
124 Interval interval;
125 interval.parseBinary(files[ni->second]);
126 NumberIntervalType nni = NumberIntervalType(&interval, ni->second);
127 intervals.insert(&nni);
128 }
129 }
130 for (unsigned int i = 0; i < counter[chromosome]; i++) {
131 files[i].close();
132 }
133 delete[] files;
134 }