Mercurial > repos > yufei-luo > s_mart
comparison SMART/Java/Python/Cpp/inputFileParser.cpp @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
17:b0e8584489e6 | 18:94ab73e8a190 |
---|---|
1 #include <algorithm> | |
2 #include <set> | |
3 #include <vector> | |
4 #include <sstream> | |
5 #include "inputFileParser.hpp" | |
6 | |
// Number of intervals buffered for a single chromosome at which
// InputFileParser::addToList() flushes the buffer to a temporary file.
static const unsigned int MAX_SIZE = 10000U;
8 | |
9 InputFileParser::InputFileParser(string inputFileName, string outputFileName): inputFileName(inputFileName), outputFileName(outputFileName) { | |
10 outputFilePrefix = outputFileName.substr(0, outputFileName.find_last_of('.')); | |
11 } | |
12 | |
13 | |
14 void InputFileParser::parse() { | |
15 ifstream file; | |
16 string line; | |
17 file.open(inputFileName.c_str()); | |
18 if (file.is_open()) { | |
19 GenomicInterval genomicInterval; | |
20 while (file.good()) { | |
21 getline(file, line); | |
22 if (line.size() > 0) { | |
23 genomicInterval.parseFromLine(line); | |
24 addToList(genomicInterval); | |
25 } | |
26 } | |
27 syncFiles(); | |
28 file.close(); | |
29 } | |
30 else { | |
31 cout << "Unable to open file" << inputFileName; | |
32 } | |
33 merge(); | |
34 } | |
35 | |
36 | |
37 void InputFileParser::addToList(GenomicInterval &genomicInterval) { | |
38 Interval interval (genomicInterval); | |
39 IntervalsType *intervals; | |
40 SortedIntervalsTypes::iterator iter = sortedIntervals.find(genomicInterval.chromosome); | |
41 if (iter == sortedIntervals.end()) { | |
42 intervals = new IntervalsType; | |
43 sortedIntervals[genomicInterval.chromosome] = intervals; | |
44 } | |
45 else { | |
46 intervals = iter->second; | |
47 } | |
48 //cout << "pushing " << interval.start << "-" << interval.end << endl; | |
49 intervals->push_back(&interval); | |
50 if (intervals->size() >= MAX_SIZE) { | |
51 writeTmpFile(genomicInterval.chromosome); | |
52 } | |
53 } | |
54 | |
55 | |
56 void InputFileParser::writeTmpFile(string &chromosome) { | |
57 SortedIntervalsTypes::iterator iter = sortedIntervals.find(chromosome); | |
58 IntervalsType *intervals = iter->second; | |
59 | |
60 sort(intervals->begin(), intervals->end()); | |
61 string fileName = getTmpName(chromosome); | |
62 ofstream file(fileName.c_str(), ios::out | ios::binary); | |
63 for (unsigned i = 0; i < intervals->size(); i++) { | |
64 cout << "writing " << (*intervals)[i]->start << "-" << (*intervals)[i]->end << endl; | |
65 (*intervals)[i]->writeBinary(file); | |
66 } | |
67 file.close(); | |
68 ++counter[chromosome]; | |
69 | |
70 sortedIntervals[chromosome] = NULL; | |
71 delete intervals; | |
72 } | |
73 | |
74 | |
75 void InputFileParser::syncFiles() { | |
76 for (SortedIntervalsTypes::iterator iter = sortedIntervals.begin(); iter != sortedIntervals.end(); iter++) { | |
77 string chromosome = iter->first; | |
78 writeTmpFile(chromosome); | |
79 } | |
80 } | |
81 | |
82 | |
83 string InputFileParser::getTmpName(const string &chromosome, unsigned int i) { | |
84 stringstream s; | |
85 s << outputFilePrefix << outputFilePrefix << "_tmp_" << chromosome << "_" << i << ".tmp"; | |
86 return s.str(); | |
87 } | |
88 | |
89 | |
90 string InputFileParser::getTmpName(const string &chromosome) { | |
91 return getTmpName(chromosome, counter[chromosome]); | |
92 } | |
93 | |
94 | |
95 void InputFileParser::merge() { | |
96 ofstream outputFile(outputFileName.c_str()); | |
97 for (SortedIntervalsTypes::iterator iter = sortedIntervals.begin(); iter != sortedIntervals.end(); iter++) { | |
98 merge(iter->first, outputFile); | |
99 } | |
100 } | |
101 | |
102 | |
103 void InputFileParser::merge(const string &chromosome, ofstream &outputFile) { | |
104 ifstream *files = new ifstream[counter[chromosome]]; | |
105 set<NumberIntervalType *> intervals; | |
106 for (unsigned int i = 0; i < counter[chromosome]; i++) { | |
107 string fileName = getTmpName(chromosome, i); | |
108 files[i].open(fileName.c_str()); | |
109 } | |
110 for (unsigned int i = 0; i < counter[chromosome]; i++) { | |
111 if (files[i].good()) { | |
112 Interval interval; | |
113 interval.parseBinary(files[i]); | |
114 NumberIntervalType ni = NumberIntervalType(&interval, i); | |
115 intervals.insert(&ni); | |
116 } | |
117 } | |
118 while (! intervals.empty()) { | |
119 NumberIntervalType *ni = *intervals.begin(); | |
120 GenomicInterval gi(chromosome, ni->first->start, ni->first->end); | |
121 outputFile << gi; | |
122 intervals.erase(intervals.begin()); | |
123 if (files[ni->second].good()) { | |
124 Interval interval; | |
125 interval.parseBinary(files[ni->second]); | |
126 NumberIntervalType nni = NumberIntervalType(&interval, ni->second); | |
127 intervals.insert(&nni); | |
128 } | |
129 } | |
130 for (unsigned int i = 0; i < counter[chromosome]; i++) { | |
131 files[i].close(); | |
132 } | |
133 delete[] files; | |
134 } |