Mercurial > repos > yufei-luo > s_mart
diff SMART/Java/Python/Cpp/inputFileParser.cpp @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/Cpp/inputFileParser.cpp Mon Apr 29 03:20:15 2013 -0400 @@ -0,0 +1,134 @@ +#include <algorithm> +#include <set> +#include <vector> +#include <sstream> +#include "inputFileParser.hpp" + +static const unsigned int MAX_SIZE = 10000; + +InputFileParser::InputFileParser(string inputFileName, string outputFileName): inputFileName(inputFileName), outputFileName(outputFileName) { + outputFilePrefix = outputFileName.substr(0, outputFileName.find_last_of('.')); +} + + +void InputFileParser::parse() { + ifstream file; + string line; + file.open(inputFileName.c_str()); + if (file.is_open()) { + GenomicInterval genomicInterval; + while (file.good()) { + getline(file, line); + if (line.size() > 0) { + genomicInterval.parseFromLine(line); + addToList(genomicInterval); + } + } + syncFiles(); + file.close(); + } + else { + cout << "Unable to open file" << inputFileName; + } + merge(); +} + + +void InputFileParser::addToList(GenomicInterval &genomicInterval) { + Interval interval (genomicInterval); + IntervalsType *intervals; + SortedIntervalsTypes::iterator iter = sortedIntervals.find(genomicInterval.chromosome); + if (iter == sortedIntervals.end()) { + intervals = new IntervalsType; + sortedIntervals[genomicInterval.chromosome] = intervals; + } + else { + intervals = iter->second; + } + //cout << "pushing " << interval.start << "-" << interval.end << endl; + intervals->push_back(&interval); + if (intervals->size() >= MAX_SIZE) { + writeTmpFile(genomicInterval.chromosome); + } +} + + +void InputFileParser::writeTmpFile(string &chromosome) { + SortedIntervalsTypes::iterator iter = sortedIntervals.find(chromosome); + IntervalsType *intervals = iter->second; + + sort(intervals->begin(), intervals->end()); + string fileName = getTmpName(chromosome); + ofstream file(fileName.c_str(), ios::out | ios::binary); + for (unsigned i = 0; i < intervals->size(); i++) { + cout << "writing " << (*intervals)[i]->start << "-" << (*intervals)[i]->end << endl; + (*intervals)[i]->writeBinary(file); + } + file.close(); + ++counter[chromosome]; + + sortedIntervals[chromosome] = NULL; + delete intervals; +} + + +void InputFileParser::syncFiles() { + for (SortedIntervalsTypes::iterator iter = sortedIntervals.begin(); iter != sortedIntervals.end(); iter++) { + string chromosome = iter->first; + writeTmpFile(chromosome); + } +} + + +string InputFileParser::getTmpName(const string &chromosome, unsigned int i) { + stringstream s; + s << outputFilePrefix << outputFilePrefix << "_tmp_" << chromosome << "_" << i << ".tmp"; + return s.str(); +} + + +string InputFileParser::getTmpName(const string &chromosome) { + return getTmpName(chromosome, counter[chromosome]); +} + + +void InputFileParser::merge() { + ofstream outputFile(outputFileName.c_str()); + for (SortedIntervalsTypes::iterator iter = sortedIntervals.begin(); iter != sortedIntervals.end(); iter++) { + merge(iter->first, outputFile); + } +} + + +void InputFileParser::merge(const string &chromosome, ofstream &outputFile) { + ifstream *files = new ifstream[counter[chromosome]]; + set<NumberIntervalType *> intervals; + for (unsigned int i = 0; i < counter[chromosome]; i++) { + string fileName = getTmpName(chromosome, i); + files[i].open(fileName.c_str()); + } + for (unsigned int i = 0; i < counter[chromosome]; i++) { + if (files[i].good()) { + Interval interval; + interval.parseBinary(files[i]); + NumberIntervalType ni = NumberIntervalType(&interval, i); + intervals.insert(&ni); + } + } + while (! intervals.empty()) { + NumberIntervalType *ni = *intervals.begin(); + GenomicInterval gi(chromosome, ni->first->start, ni->first->end); + outputFile << gi; + intervals.erase(intervals.begin()); + if (files[ni->second].good()) { + Interval interval; + interval.parseBinary(files[ni->second]); + NumberIntervalType nni = NumberIntervalType(&interval, ni->second); + intervals.insert(&nni); + } + } + for (unsigned int i = 0; i < counter[chromosome]; i++) { + files[i].close(); + } + delete[] files; +}