Mercurial > repos > yufei-luo > s_mart
view SMART/Java/Python/Cpp/inputFileParser.cpp @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
line wrap: on
line source
#include <algorithm> #include <set> #include <vector> #include <sstream> #include "inputFileParser.hpp" static const unsigned int MAX_SIZE = 10000; InputFileParser::InputFileParser(string inputFileName, string outputFileName): inputFileName(inputFileName), outputFileName(outputFileName) { outputFilePrefix = outputFileName.substr(0, outputFileName.find_last_of('.')); } void InputFileParser::parse() { ifstream file; string line; file.open(inputFileName.c_str()); if (file.is_open()) { GenomicInterval genomicInterval; while (file.good()) { getline(file, line); if (line.size() > 0) { genomicInterval.parseFromLine(line); addToList(genomicInterval); } } syncFiles(); file.close(); } else { cout << "Unable to open file" << inputFileName; } merge(); } void InputFileParser::addToList(GenomicInterval &genomicInterval) { Interval interval (genomicInterval); IntervalsType *intervals; SortedIntervalsTypes::iterator iter = sortedIntervals.find(genomicInterval.chromosome); if (iter == sortedIntervals.end()) { intervals = new IntervalsType; sortedIntervals[genomicInterval.chromosome] = intervals; } else { intervals = iter->second; } //cout << "pushing " << interval.start << "-" << interval.end << endl; intervals->push_back(&interval); if (intervals->size() >= MAX_SIZE) { writeTmpFile(genomicInterval.chromosome); } } void InputFileParser::writeTmpFile(string &chromosome) { SortedIntervalsTypes::iterator iter = sortedIntervals.find(chromosome); IntervalsType *intervals = iter->second; sort(intervals->begin(), intervals->end()); string fileName = getTmpName(chromosome); ofstream file(fileName.c_str(), ios::out | ios::binary); for (unsigned i = 0; i < intervals->size(); i++) { cout << "writing " << (*intervals)[i]->start << "-" << (*intervals)[i]->end << endl; (*intervals)[i]->writeBinary(file); } file.close(); ++counter[chromosome]; sortedIntervals[chromosome] = NULL; delete intervals; } void InputFileParser::syncFiles() { for (SortedIntervalsTypes::iterator iter = sortedIntervals.begin(); iter != sortedIntervals.end(); iter++) { string chromosome = iter->first; writeTmpFile(chromosome); } } string InputFileParser::getTmpName(const string &chromosome, unsigned int i) { stringstream s; s << outputFilePrefix << outputFilePrefix << "_tmp_" << chromosome << "_" << i << ".tmp"; return s.str(); } string InputFileParser::getTmpName(const string &chromosome) { return getTmpName(chromosome, counter[chromosome]); } void InputFileParser::merge() { ofstream outputFile(outputFileName.c_str()); for (SortedIntervalsTypes::iterator iter = sortedIntervals.begin(); iter != sortedIntervals.end(); iter++) { merge(iter->first, outputFile); } } void InputFileParser::merge(const string &chromosome, ofstream &outputFile) { ifstream *files = new ifstream[counter[chromosome]]; set<NumberIntervalType *> intervals; for (unsigned int i = 0; i < counter[chromosome]; i++) { string fileName = getTmpName(chromosome, i); files[i].open(fileName.c_str()); } for (unsigned int i = 0; i < counter[chromosome]; i++) { if (files[i].good()) { Interval interval; interval.parseBinary(files[i]); NumberIntervalType ni = NumberIntervalType(&interval, i); intervals.insert(&ni); } } while (! intervals.empty()) { NumberIntervalType *ni = *intervals.begin(); GenomicInterval gi(chromosome, ni->first->start, ni->first->end); outputFile << gi; intervals.erase(intervals.begin()); if (files[ni->second].good()) { Interval interval; interval.parseBinary(files[ni->second]); NumberIntervalType nni = NumberIntervalType(&interval, ni->second); intervals.insert(&nni); } } for (unsigned int i = 0; i < counter[chromosome]; i++) { files[i].close(); } delete[] files; }