diff SMART/Java/Python/Cpp/inputFileParser.cpp @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/Cpp/inputFileParser.cpp	Mon Apr 29 03:20:15 2013 -0400
@@ -0,0 +1,134 @@
+#include <algorithm>
+#include <set>
+#include <vector>
+#include <sstream>
+#include "inputFileParser.hpp"
+
+static const unsigned int MAX_SIZE = 10000; // number of buffered intervals of one chromosome at which addToList() flushes them to a tmp file
+
+// Stores the input and output file names and derives outputFilePrefix from
+// the output name: everything before its last '.', or the whole name when it
+// contains no '.'.
+InputFileParser::InputFileParser(string inputFileName, string outputFileName): inputFileName(inputFileName), outputFileName(outputFileName) {
+    // npos (no dot found) makes substr() keep the complete name.
+    const string::size_type lastDot = outputFileName.rfind('.');
+    outputFilePrefix = outputFileName.substr(0, lastDot);
+}
+
+
+// Reads the input file line by line, turns every non-empty line into a
+// GenomicInterval and hands it to addToList(). Once the whole file has been
+// read, the remaining buffers are flushed with syncFiles().
+// merge() is called in every case, including when the input file could not
+// be opened (an error is then reported on the error stream).
+void InputFileParser::parse() {
+    ifstream file(inputFileName.c_str());
+    if (file.is_open()) {
+        GenomicInterval genomicInterval;
+        string line;
+        // The stream is tested after each read, so the loop ends on the
+        // first getline() that fails.
+        while (getline(file, line)) {
+            if (! line.empty()) {
+                genomicInterval.parseFromLine(line);
+                addToList(genomicInterval);
+            }
+        }
+        file.close();
+        syncFiles();
+    }
+    else {
+        // Keep the file name separated from the message and end the line.
+        cerr << "Unable to open file " << inputFileName << endl;
+    }
+    merge();
+}
+
+
+// Ordering used to sort buffered intervals before they are written:
+// by start, then by end. Sorting the bare pointers would order them by
+// address instead of by coordinates.
+static bool intervalPointerLess(const Interval *first, const Interval *second) {
+    if (first->start < second->start) {
+        return true;
+    }
+    if (second->start < first->start) {
+        return false;
+    }
+    return first->end < second->end;
+}
+
+
+// Buffers a copy of the given interval in the list of its chromosome and
+// flushes that list to a tmp file once it holds MAX_SIZE intervals.
+// The buffered Interval objects are heap-allocated here and released by
+// writeTmpFile().
+void InputFileParser::addToList(GenomicInterval &genomicInterval) {
+    // operator[] yields a NULL slot for a chromosome seen for the first time;
+    // the slot is also NULL after writeTmpFile() has flushed the chromosome.
+    IntervalsType *&intervals = sortedIntervals[genomicInterval.chromosome];
+    if (intervals == NULL) {
+        intervals = new IntervalsType;
+    }
+    // The list stores pointers, so the interval must outlive this call:
+    // storing the address of a local variable would leave a dangling pointer.
+    intervals->push_back(new Interval(genomicInterval));
+    if (intervals->size() >= MAX_SIZE) {
+        writeTmpFile(genomicInterval.chromosome);
+    }
+}
+
+
+// Sorts the buffered intervals of a chromosome, writes them in binary form
+// to the next tmp file of that chromosome, increments the chromosome's file
+// counter and releases the buffer (the map slot is left NULL).
+// Does nothing when the chromosome has no buffer, so it can be called for a
+// chromosome that has already been flushed.
+void InputFileParser::writeTmpFile(string &chromosome) {
+    SortedIntervalsTypes::iterator iter = sortedIntervals.find(chromosome);
+    if (iter == sortedIntervals.end() || iter->second == NULL) {
+        return;
+    }
+    IntervalsType *intervals = iter->second;
+
+    sort(intervals->begin(), intervals->end(), intervalPointerLess);
+    string fileName = getTmpName(chromosome);
+    ofstream file(fileName.c_str(), ios::out | ios::binary);
+    if (! file.is_open()) {
+        cerr << "Unable to open file " << fileName << endl;
+    }
+    for (unsigned i = 0; i < intervals->size(); i++) {
+        Interval *interval = (*intervals)[i];
+        cout << "writing " << interval->start << "-" << interval->end << endl;
+        interval->writeBinary(file);
+        // The interval was allocated by addToList() and is not needed anymore.
+        delete interval;
+    }
+    file.close();
+    ++counter[chromosome];
+
+    iter->second = NULL;
+    delete intervals;
+}
+
+
+// Writes the intervals still buffered for each chromosome to a tmp file.
+void InputFileParser::syncFiles() {
+    for (SortedIntervalsTypes::iterator iter = sortedIntervals.begin(); iter != sortedIntervals.end(); ++iter) {
+        // A slot is reset to NULL once its buffer has been written; there is
+        // nothing left to flush and the list must not be dereferenced.
+        if (iter->second == NULL) {
+            continue;
+        }
+        // writeTmpFile() takes a non-const reference, hence the copy of the key.
+        string chromosome = iter->first;
+        writeTmpFile(chromosome);
+    }
+}
+
+
+// Returns the name of the i-th tmp file of a chromosome:
+// <outputFilePrefix>_tmp_<chromosome>_<i>.tmp
+string InputFileParser::getTmpName(const string &chromosome, unsigned int i) {
+    stringstream s;
+    // The prefix is written exactly once: repeating a prefix that contains a
+    // directory part produces a path inside a directory that does not exist.
+    s << outputFilePrefix << "_tmp_" << chromosome << "_" << i << ".tmp";
+    return s.str();
+}
+
+
+// Returns the tmp file name numbered with the current value of the
+// chromosome's file counter.
+string InputFileParser::getTmpName(const string &chromosome) {
+    const unsigned int fileIndex = counter[chromosome];
+    return getTmpName(chromosome, fileIndex);
+}
+
+
+// Opens the output file and, for every chromosome key of sortedIntervals
+// (in the container's iteration order), appends the merged content of that
+// chromosome's tmp files.
+void InputFileParser::merge() {
+    ofstream outputFile(outputFileName.c_str());
+    SortedIntervalsTypes::iterator chromosomeIter = sortedIntervals.begin();
+    while (chromosomeIter != sortedIntervals.end()) {
+        merge(chromosomeIter->first, outputFile);
+        chromosomeIter++;
+    }
+}
+
+
+namespace {
+
+// Pending record of one tmp file during the merge: the interval itself
+// (held by value, so nothing points to a local variable) and the index of
+// the file it was read from.
+struct MergeEntry {
+    Interval interval;
+    unsigned int fileIndex;
+};
+
+// Orders merge entries by interval start, then end. The file index breaks
+// ties so that identical intervals coming from different files are all kept.
+struct MergeEntryLess {
+    bool operator()(const MergeEntry &first, const MergeEntry &second) const {
+        if (first.interval.start < second.interval.start) {
+            return true;
+        }
+        if (second.interval.start < first.interval.start) {
+            return false;
+        }
+        if (first.interval.end < second.interval.end) {
+            return true;
+        }
+        if (second.interval.end < first.interval.end) {
+            return false;
+        }
+        return first.fileIndex < second.fileIndex;
+    }
+};
+
+// Reads the next interval of a tmp file; returns false when none could be read.
+// NOTE(review): assumes parseBinary() leaves the stream in a failed state
+// when no complete record is available (e.g. at end of file) - confirm.
+bool readNextInterval(ifstream &file, Interval &interval) {
+    if (! file.good()) {
+        return false;
+    }
+    interval.parseBinary(file);
+    // The state is only known after the read attempt: testing good() before
+    // reading does not detect the end of the file.
+    return ! file.fail();
+}
+
+}
+
+
+// Merges the tmp files of one chromosome into outputFile.
+// One pending interval per tmp file is kept in an ordered set; the smallest
+// one is written as a GenomicInterval and replaced by the next interval of
+// the file it came from, until every file is exhausted.
+//   chromosome: chromosome whose tmp files (0 .. counter[chromosome]-1) are read
+//   outputFile: stream receiving the intervals
+void InputFileParser::merge(const string &chromosome, ofstream &outputFile) {
+    const unsigned int fileCount = counter[chromosome];
+    ifstream *files = new ifstream[fileCount];
+    for (unsigned int i = 0; i < fileCount; i++) {
+        string fileName = getTmpName(chromosome, i);
+        // The tmp files are written in binary mode; read them the same way.
+        files[i].open(fileName.c_str(), ios::in | ios::binary);
+    }
+
+    set<MergeEntry, MergeEntryLess> pending;
+    for (unsigned int i = 0; i < fileCount; i++) {
+        MergeEntry entry;
+        entry.fileIndex = i;
+        if (readNextInterval(files[i], entry.interval)) {
+            pending.insert(entry);
+        }
+    }
+    while (! pending.empty()) {
+        // Copy the smallest entry before erasing it from the set.
+        const MergeEntry smallest = *pending.begin();
+        pending.erase(pending.begin());
+        GenomicInterval gi(chromosome, smallest.interval.start, smallest.interval.end);
+        outputFile << gi;
+
+        MergeEntry next;
+        next.fileIndex = smallest.fileIndex;
+        if (readNextInterval(files[next.fileIndex], next.interval)) {
+            pending.insert(next);
+        }
+    }
+
+    for (unsigned int i = 0; i < fileCount; i++) {
+        files[i].close();
+    }
+    delete[] files;
+}