Mercurial > repos > yufei-luo > s_mart
comparison SMART/Java/Python/Cpp/inputFileParser.cpp @ 18:94ab73e8a190
Uploaded
| author | m-zytnicki |
|---|---|
| date | Mon, 29 Apr 2013 03:20:15 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 17:b0e8584489e6 | 18:94ab73e8a190 |
|---|---|
| 1 #include <algorithm> | |
| 2 #include <set> | |
| 3 #include <vector> | |
| 4 #include <sstream> | |
| 5 #include "inputFileParser.hpp" | |
| 6 | |
| 7 static const unsigned int MAX_SIZE = 10000; /* per-chromosome buffer size at which addToList() flushes the buffer to a temporary file */ | |
| 8 | |
| 9 InputFileParser::InputFileParser(string inputFileName, string outputFileName): inputFileName(inputFileName), outputFileName(outputFileName) { | |
| 10 outputFilePrefix = outputFileName.substr(0, outputFileName.find_last_of('.')); | |
| 11 } | |
| 12 | |
| 13 | |
| 14 void InputFileParser::parse() { | |
| 15 ifstream file; | |
| 16 string line; | |
| 17 file.open(inputFileName.c_str()); | |
| 18 if (file.is_open()) { | |
| 19 GenomicInterval genomicInterval; | |
| 20 while (file.good()) { | |
| 21 getline(file, line); | |
| 22 if (line.size() > 0) { | |
| 23 genomicInterval.parseFromLine(line); | |
| 24 addToList(genomicInterval); | |
| 25 } | |
| 26 } | |
| 27 syncFiles(); | |
| 28 file.close(); | |
| 29 } | |
| 30 else { | |
| 31 cout << "Unable to open file" << inputFileName; | |
| 32 } | |
| 33 merge(); | |
| 34 } | |
| 35 | |
| 36 | |
| 37 void InputFileParser::addToList(GenomicInterval &genomicInterval) { | |
| 38 Interval interval (genomicInterval); | |
| 39 IntervalsType *intervals; | |
| 40 SortedIntervalsTypes::iterator iter = sortedIntervals.find(genomicInterval.chromosome); | |
| 41 if (iter == sortedIntervals.end()) { | |
| 42 intervals = new IntervalsType; | |
| 43 sortedIntervals[genomicInterval.chromosome] = intervals; | |
| 44 } | |
| 45 else { | |
| 46 intervals = iter->second; | |
| 47 } | |
| 48 //cout << "pushing " << interval.start << "-" << interval.end << endl; | |
| 49 intervals->push_back(&interval); | |
| 50 if (intervals->size() >= MAX_SIZE) { | |
| 51 writeTmpFile(genomicInterval.chromosome); | |
| 52 } | |
| 53 } | |
| 54 | |
| 55 | |
| 56 void InputFileParser::writeTmpFile(string &chromosome) { | |
| 57 SortedIntervalsTypes::iterator iter = sortedIntervals.find(chromosome); | |
| 58 IntervalsType *intervals = iter->second; | |
| 59 | |
| 60 sort(intervals->begin(), intervals->end()); | |
| 61 string fileName = getTmpName(chromosome); | |
| 62 ofstream file(fileName.c_str(), ios::out | ios::binary); | |
| 63 for (unsigned i = 0; i < intervals->size(); i++) { | |
| 64 cout << "writing " << (*intervals)[i]->start << "-" << (*intervals)[i]->end << endl; | |
| 65 (*intervals)[i]->writeBinary(file); | |
| 66 } | |
| 67 file.close(); | |
| 68 ++counter[chromosome]; | |
| 69 | |
| 70 sortedIntervals[chromosome] = NULL; | |
| 71 delete intervals; | |
| 72 } | |
| 73 | |
| 74 | |
| 75 void InputFileParser::syncFiles() { | |
| 76 for (SortedIntervalsTypes::iterator iter = sortedIntervals.begin(); iter != sortedIntervals.end(); iter++) { | |
| 77 string chromosome = iter->first; | |
| 78 writeTmpFile(chromosome); | |
| 79 } | |
| 80 } | |
| 81 | |
| 82 | |
| 83 string InputFileParser::getTmpName(const string &chromosome, unsigned int i) { | |
| 84 stringstream s; | |
| 85 s << outputFilePrefix << outputFilePrefix << "_tmp_" << chromosome << "_" << i << ".tmp"; | |
| 86 return s.str(); | |
| 87 } | |
| 88 | |
| 89 | |
| 90 string InputFileParser::getTmpName(const string &chromosome) { | |
| 91 return getTmpName(chromosome, counter[chromosome]); | |
| 92 } | |
| 93 | |
| 94 | |
| 95 void InputFileParser::merge() { | |
| 96 ofstream outputFile(outputFileName.c_str()); | |
| 97 for (SortedIntervalsTypes::iterator iter = sortedIntervals.begin(); iter != sortedIntervals.end(); iter++) { | |
| 98 merge(iter->first, outputFile); | |
| 99 } | |
| 100 } | |
| 101 | |
| 102 | |
| 103 void InputFileParser::merge(const string &chromosome, ofstream &outputFile) { | |
| 104 ifstream *files = new ifstream[counter[chromosome]]; | |
| 105 set<NumberIntervalType *> intervals; | |
| 106 for (unsigned int i = 0; i < counter[chromosome]; i++) { | |
| 107 string fileName = getTmpName(chromosome, i); | |
| 108 files[i].open(fileName.c_str()); | |
| 109 } | |
| 110 for (unsigned int i = 0; i < counter[chromosome]; i++) { | |
| 111 if (files[i].good()) { | |
| 112 Interval interval; | |
| 113 interval.parseBinary(files[i]); | |
| 114 NumberIntervalType ni = NumberIntervalType(&interval, i); | |
| 115 intervals.insert(&ni); | |
| 116 } | |
| 117 } | |
| 118 while (! intervals.empty()) { | |
| 119 NumberIntervalType *ni = *intervals.begin(); | |
| 120 GenomicInterval gi(chromosome, ni->first->start, ni->first->end); | |
| 121 outputFile << gi; | |
| 122 intervals.erase(intervals.begin()); | |
| 123 if (files[ni->second].good()) { | |
| 124 Interval interval; | |
| 125 interval.parseBinary(files[ni->second]); | |
| 126 NumberIntervalType nni = NumberIntervalType(&interval, ni->second); | |
| 127 intervals.insert(&nni); | |
| 128 } | |
| 129 } | |
| 130 for (unsigned int i = 0; i < counter[chromosome]; i++) { | |
| 131 files[i].close(); | |
| 132 } | |
| 133 delete[] files; | |
| 134 } |
