diff carpet-src-1/tools/CARPET/com_uni.cpp @ 0:cdd489d98766

Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author matces
date Tue, 07 Jun 2011 16:50:41 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/carpet-src-1/tools/CARPET/com_uni.cpp	Tue Jun 07 16:50:41 2011 -0400
@@ -0,0 +1,352 @@
+/*
+* Copyright 2009 Matteo Cesaroni, Lucilla Luzi
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU Lesser General Public License as published by
+* the Free Software Foundation; either version 3 of the License, or (at your
+* option) any later version.
+* 
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+
+*/
+
+#include <iostream>	
+#include <fstream>
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <sstream>
+#include <deque>
+#include <map>
+#include <ctime> 
+#include <cstdlib>
+
+using namespace std;
+
+// Splits `str` on any character contained in `delimiters` and appends the
+// resulting pieces to `tokens` (existing contents of `tokens` are kept).
+// Consecutive delimiters are treated as one separator, so no empty token is
+// ever produced; leading and trailing delimiters are ignored.
+inline void Tokenize(const string& str,
+			  vector<string>& tokens,
+			  const string& delimiters = " ")
+{
+    // Start of the current token: first character that is not a delimiter.
+    string::size_type tokenStart = str.find_first_not_of(delimiters, 0);
+
+    while (tokenStart != string::npos)
+    {
+        // One past the end of the current token (npos if it runs to the end).
+        const string::size_type tokenEnd = str.find_first_of(delimiters, tokenStart);
+        if (tokenEnd == string::npos)
+        {
+            // Last token: take everything that is left and stop.
+            tokens.push_back(str.substr(tokenStart));
+            break;
+        }
+        tokens.push_back(str.substr(tokenStart, tokenEnd - tokenStart));
+        // Jump over the delimiter run that follows the token.
+        tokenStart = str.find_first_not_of(delimiters, tokenEnd);
+    }
+}
+
+
+// One interval record as read from a tab-separated input line.
+// The integer members are zero-initialised so that copying a Probe whose
+// `file` tag was never assigned does not read an indeterminate value.
+struct Probe {
+	int inizioprobe;	// interval start (4th field of the input line)
+	int fineprobe;		// interval end (5th field of the input line)
+	string score;		// 6th field, kept as text
+	string campo1;		// 2nd field, copied through to the output
+	string campo2;		// 3rd field, copied through to the output
+	string strand;		// 7th field
+	string index;		// 9th field
+	int file;		// 1 or 2: which input file the record came from (0 = not set)
+
+	Probe() : inizioprobe(0), fineprobe(0), file(0) {}
+};
+
+// Ordering used with std::sort: probes are ordered by start coordinate and,
+// for equal starts, by end coordinate.
+// Two probes with identical start and end compare as "not less" in both
+// directions. std::sort requires a strict weak ordering, i.e. comp(a, a) must
+// be false; returning true for equal elements is undefined behaviour.
+struct Comparatore2 {
+	bool operator()(const Probe& s1, const Probe& s2) const {
+		if (s1.inizioprobe != s2.inizioprobe) {
+			return s1.inizioprobe < s2.inizioprobe;
+		}
+		return s1.fineprobe < s2.fineprobe;
+	}
+};
+
+// ---------------------------------------------------------------------------
+// File-local helpers used by main().
+// ---------------------------------------------------------------------------
+
+// Returns overlapLen expressed as an integer percentage of probeLen, using
+// truncating division. A zero-length probe cannot be divided by: it is
+// reported as 100, or as -1 when overlapLen is negative.
+static int OverlapPercent(int overlapLen, int probeLen) {
+	if (probeLen == 0) {
+		return overlapLen < 0 ? -1 : 100;
+	}
+	// Widen before multiplying so that long intervals do not overflow int.
+	return static_cast<int>(static_cast<long long>(overlapLen) * 100 / probeLen);
+}
+
+// Parses one tab-separated record into `chrom` and `probe`, tagging the probe
+// with `fileId`. Returns false (leaving the outputs untouched) for empty
+// lines, lines starting with '#', and lines with fewer than nine fields.
+static bool ParseProbeLine(const string& line, int fileId, string& chrom, Probe& probe) {
+	if (line.empty() || line[0] == '#') {
+		return false;
+	}
+	vector<string> fields;
+	Tokenize(line, fields, "\t");
+	if (fields.size() < 9) {
+		// Field 9 is read below; a shorter line would index out of bounds.
+		cerr << "Skipping line with fewer than 9 fields: " << line << endl;
+		return false;
+	}
+	chrom = fields[0];
+	probe.campo1 = fields[1];
+	probe.campo2 = fields[2];
+	probe.inizioprobe = atoi(fields[3].c_str());
+	probe.fineprobe = atoi(fields[4].c_str());
+	probe.score = fields[5];
+	probe.strand = fields[6];
+	probe.index = fields[8];
+	probe.file = fileId;
+	return true;
+}
+
+// Reads every usable record of `path` and appends it to table[chromosome].
+// Returns false if the file cannot be opened.
+static bool LoadProbes(const char* path, int fileId, map<string, vector<Probe> >& table) {
+	ifstream input(path);
+	if (!input) {
+		cerr << "Cannot open input file: " << path << endl;
+		return false;
+	}
+	string line;
+	string chrom;
+	Probe probe;
+	while (getline(input, line)) {
+		if (ParseProbeLine(line, fileId, chrom, probe)) {
+			table[chrom].push_back(probe);
+		}
+	}
+	return true;
+}
+
+// Writes a probe unchanged: chrom, campo1, campo2, start, end, score, strand, ".", index.
+static void WriteProbe(ostream& out, const string& chrom, const Probe& p) {
+	out << chrom << "\t" << p.campo1 << "\t" << p.campo2 << "\t" << p.inizioprobe << "\t" << p.fineprobe
+	    << "\t" << p.score << "\t" << p.strand << "\t.\t" << p.index << '\n';
+}
+
+// Writes a probe that was not merged with any other one in "union" mode.
+static void WriteUnionUnique(ostream& out, const string& chrom, const Probe& p) {
+	out << chrom << "\tfile_" << p.file << "\tunique\t" << p.inizioprobe << "\t" << p.fineprobe
+	    << "\t" << p.score << "\t" << p.strand << "\t.\t" << p.index << '\n';
+}
+
+// Non-concatenating "common" / "unique" mode.
+// For every probe of `seq`, looks for one probe of `annotation` on the same
+// chromosome that lies within `dist_t` of it. In "common" mode a hit is
+// written together with the overlap percentage; in "unique" mode probes
+// without a hit are written unchanged.
+// Both tables are sorted in place, and annotation end coordinates are
+// replaced by their running maximum.
+static void ReportPairwise(const string& choice, int dist_t,
+		map<string, vector<Probe> >& seq,
+		map<string, vector<Probe> >& annotation,
+		ostream& resfile) {
+	for (map<string, vector<Probe> >::iterator itseq = seq.begin(); itseq != seq.end(); ++itseq) {
+		const string& chrom = itseq->first;
+		vector<Probe>& seq_chr = itseq->second;
+
+		// find() instead of operator[] so that no empty entry is inserted.
+		map<string, vector<Probe> >::iterator itanno = annotation.find(chrom);
+		if (itanno == annotation.end() || itanno->second.empty()) {
+			// Nothing to compare against: every probe is unique, none is common.
+			if (choice == "unique") {
+				for (size_t i = 0; i < seq_chr.size(); ++i) {
+					WriteProbe(resfile, chrom, seq_chr[i]);
+				}
+			}
+			continue;
+		}
+		vector<Probe>& anno_chr = itanno->second;
+
+		sort(seq_chr.begin(), seq_chr.end(), Comparatore2());
+		sort(anno_chr.begin(), anno_chr.end(), Comparatore2());
+
+		// Replace every end coordinate by the largest end seen so far, so the
+		// ends are non-decreasing like the starts.
+		int finefine = 0;
+		for (size_t i = 0; i < anno_chr.size(); ++i) {
+			if (anno_chr[i].fineprobe <= finefine) {
+				anno_chr[i].fineprobe = finefine;
+			}
+			else {
+				finefine = anno_chr[i].fineprobe;
+			}
+		}
+
+		for (size_t i = 0; i < seq_chr.size(); ++i) {
+			const Probe& probe = seq_chr[i];
+			const int key = probe.inizioprobe - dist_t;
+			int start_array = 0;
+			int fine_array = static_cast<int>(anno_chr.size());
+			int pos = 1;
+			bool trovato = false;
+
+			// Bisection over the sorted annotations; stops at the first hit.
+			while (pos > 0) {
+				pos = (fine_array - start_array) / 2;
+				const int position = start_array + pos;
+				const Probe& anno = anno_chr[position];
+
+				if (key < anno.inizioprobe) {
+					fine_array = position;
+				}
+				if (key > anno.inizioprobe) {
+					start_array = position;
+				}
+				if (key <= anno.fineprobe && (probe.fineprobe + dist_t) >= anno.inizioprobe) {
+					if (choice == "common") {
+						const int probeLen = probe.fineprobe - probe.inizioprobe;
+						int overlap = 0;
+						// The four tests are evaluated in order; when several hold
+						// (equal coordinates) the last one decides the value.
+						if (probe.inizioprobe <= anno.inizioprobe && probe.fineprobe <= anno.fineprobe) {
+							overlap = OverlapPercent(probe.fineprobe - anno.inizioprobe, probeLen);
+						}
+						if (probe.inizioprobe <= anno.inizioprobe && probe.fineprobe >= anno.fineprobe) {
+							overlap = OverlapPercent(anno.fineprobe - anno.inizioprobe, probeLen);
+						}
+						if (probe.inizioprobe >= anno.inizioprobe && probe.fineprobe >= anno.fineprobe) {
+							overlap = OverlapPercent(anno.fineprobe - probe.inizioprobe, probeLen);
+						}
+						if (probe.inizioprobe >= anno.inizioprobe && probe.fineprobe <= anno.fineprobe) {
+							overlap = 100;
+						}
+						// Hits that exist only because of the flank have no real overlap.
+						if (overlap < 0) {
+							overlap = -1;
+						}
+						resfile << chrom << "\t" << probe.campo1 << "\t" << probe.campo2 << "\t" << probe.inizioprobe << "\t" << probe.fineprobe
+						        << "\t" << overlap << "\t" << probe.strand << "\t.\t" << "ValueA:" << probe.score << "~" << "ValueB:" << anno.score << '\n';
+					}
+					trovato = true;
+					break;
+				}
+				if (key == anno.inizioprobe) {
+					// Neither bound moved and there was no hit: stop instead of
+					// probing the same position forever.
+					break;
+				}
+			}
+			if (choice == "unique" && !trovato) {
+				WriteProbe(resfile, chrom, probe);
+			}
+		}
+	}
+}
+
+// "union" mode and concatenating "common" mode.
+// `seq` holds the probes of both input files (tagged 1 / 2). Per chromosome
+// the probes are sorted and consecutive probes lying within `dist_t` of the
+// first probe of a run are merged into one region.
+//  - union:  merged regions are written as "common", single probes as "unique".
+//  - common with concatenate == "yes": merged regions are written only when
+//    they contain probes from both files.
+static void ReportMerged(const string& choice, const string& concatenate, int dist_t,
+		map<string, vector<Probe> >& seq, ostream& resfile) {
+	for (map<string, vector<Probe> >::iterator itseq = seq.begin(); itseq != seq.end(); ++itseq) {
+		const string& chrom = itseq->first;
+		vector<Probe>& seq_chr = itseq->second;
+		sort(seq_chr.begin(), seq_chr.end(), Comparatore2());
+
+		const int count = static_cast<int>(seq_chr.size());
+		for (int i = 0; i < count; i++) {
+			const int inizio = seq_chr[i].inizioprobe;
+			int fine = seq_chr[i].fineprobe;
+			bool file_t = (seq_chr[i].file == 1);	// region contains a probe of file 1
+			bool file_t2 = (seq_chr[i].file == 2);	// region contains a probe of file 2
+			bool merged = false;
+			int z = 1;				// number of probes in the region
+
+			// The last probe has no successor, so the inner loop never runs for it.
+			if (i == count - 1 && choice == "union") {
+				WriteUnionUnique(resfile, chrom, seq_chr[i]);
+			}
+
+			for (int y = i + 1; y < count; y++) {
+				const Probe& next = seq_chr[y];
+				if ((inizio - dist_t) <= next.fineprobe && (fine + dist_t) >= next.inizioprobe) {
+					if (next.file == 1) {
+						file_t = true;
+					}
+					if (next.file == 2) {
+						file_t2 = true;
+					}
+					if (next.fineprobe > fine) {
+						fine = next.fineprobe;
+					}
+					merged = true;
+					i++;	// the outer loop must not start a new region at a merged probe
+					z++;
+				}
+				if (next.inizioprobe > fine || y == count - 1) {
+					// The region is complete: report it and go back to the outer loop.
+					const Probe& last = seq_chr[i];
+					if (choice == "union" && !merged) {
+						WriteUnionUnique(resfile, chrom, last);
+					}
+					if (choice == "union" && merged) {
+						resfile << chrom << "\tcommon\tcommon\t" << inizio << "\t" << fine << "\t" << z << "\t.\t.\tcommon" << '\n';
+					}
+					if (choice == "common" && merged && file_t && file_t2 && concatenate == "yes") {
+						resfile << chrom << "\t" << last.campo1 << "\t" << last.campo2 << "\t" << inizio << "\t" << fine
+						        << "\t" << z << "\t" << last.strand << "\t.\t" << last.index << '\n';
+					}
+					break;
+				}
+			}
+		}
+	}
+}
+
+// Entry point.
+//   argv[1]  flank distance added on both sides of a probe when testing overlap
+//   argv[2]  operation: "common", "unique" or "union"
+//   argv[3]  concatenate: "yes" or "no"
+//   argv[4]  first input file (tab-separated, at least 9 fields per line)
+//   argv[5]  second input file (same layout)
+//   argv[6]  output file
+// A one-line summary of the parameters is printed on standard output.
+// Returns 1 when arguments are missing or a file cannot be opened, 0 otherwise.
+int main (int argc, char * const argv[]) {
+
+	if (argc < 7) {
+		cerr << "Usage: " << (argc > 0 ? argv[0] : "com_uni")
+		     << " <flank> <common|unique|union> <yes|no> <file_1> <file_2> <output>" << endl;
+		return 1;
+	}
+
+	const int dist_t = atoi(argv[1]);
+	const string choice = argv[2];
+	const string concatenate = argv[3];
+	const string name_out = argv[6];
+
+	ofstream resfile(name_out.c_str());
+	if (!resfile) {
+		cerr << "Cannot open output file: " << name_out << endl;
+		return 1;
+	}
+
+	if (concatenate=="no" && choice!="union"){
+
+		if (choice=="common"){
+			cout<<"flank="<<dist_t<<" , type="<<choice<<" , col6=%overlap, concatenate="<<concatenate;
+		}
+		if (choice=="unique"){
+			cout<<"flank="<<dist_t<<" , type="<<choice<<" , col6=score o p-value";
+		}
+
+		map<string, vector<Probe> > seq;
+		map<string, vector<Probe> > annotation;
+		if (!LoadProbes(argv[4], 1, seq) || !LoadProbes(argv[5], 2, annotation)) {
+			return 1;
+		}
+		ReportPairwise(choice, dist_t, seq, annotation, resfile);
+	}
+
+	if (concatenate=="yes" || choice == "union"){
+
+		cout<<"flank="<<dist_t<<" , type="<<choice<<" , col6=#overlaping regions, concatenate="<<concatenate;
+
+		// Both files go into the same table; the file tag tells them apart.
+		map<string, vector<Probe> > seq;
+		if (!LoadProbes(argv[4], 1, seq) || !LoadProbes(argv[5], 2, seq)) {
+			return 1;
+		}
+		ReportMerged(choice, concatenate, dist_t, seq, resfile);
+	}
+
+	return 0;
+}
+