view carpet-src-1/tools/CARPET/com_uni.cpp @ 0:cdd489d98766

Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author matces
date Tue, 07 Jun 2011 16:50:41 -0400
parents
children
line wrap: on
line source

/*
* Copyright 2009 Matteo Cesaroni, Lucilla Luzi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 3 of the License, or (at your
* option) any later version.
* 
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.

*/

#include <iostream>	
#include <fstream>
#include <string>
#include <vector>
#include <algorithm>
#include <sstream>
#include <deque>
#include <map>
#include <ctime> 
#include <cstdlib>

using namespace std;

// Splits `str` into the maximal runs of characters that do not occur in
// `delimiters` and appends every run to `tokens`.
//   str        - text to split
//   tokens     - receives the pieces; entries already present are kept
//   delimiters - set of separator characters (each character separates)
// Adjacent separators are collapsed, so an empty piece is never produced.
inline void Tokenize(const string& str,
			  vector<string>& tokens,
			  const string& delimiters = " ")
{
    // Start of the first piece, i.e. the first character that is not a separator.
    string::size_type tokenStart = str.find_first_not_of(delimiters, 0);

    while (tokenStart != string::npos)
    {
        // The piece runs up to (not including) the next separator.
        const string::size_type tokenEnd = str.find_first_of(delimiters, tokenStart);
        if (tokenEnd == string::npos)
        {
            // No separator left: the remainder of the string is the last piece.
            tokens.push_back(str.substr(tokenStart));
            break;
        }
        tokens.push_back(str.substr(tokenStart, tokenEnd - tokenStart));

        // Jump over the whole run of separators to the start of the next piece.
        tokenStart = str.find_first_not_of(delimiters, tokenEnd);
    }
}


// One interval record as filled in by main() from a tab-separated input line.
// The numeric members are zero-initialised so that a Probe is always safe to
// copy, even when a code path does not assign every member.
struct Probe {
	int inizioprobe;	// interval start (input field 3, parsed with atoi)
	int fineprobe;		// interval end   (input field 4, parsed with atoi)
	string score;		// input field 5, kept as text
	string campo1;		// input field 1, kept as text
	string campo2;		// input field 2, kept as text
	string strand;		// input field 6, kept as text
	string index;		// input field 8, kept as text
	int file;		// 1 or 2: which input file the record came from; 0 when unset

	Probe() : inizioprobe(0), fineprobe(0), file(0) {}
};

// Ordering used with std::sort: ascending by start, ties broken by ascending
// end.  Two probes with identical start and end compare as equivalent (the
// functor returns false both ways); std::sort requires such a strict weak
// ordering, and returning true for equal elements is undefined behaviour.
struct Comparatore2 {
    bool operator()(const Probe& s1, const Probe& s2) const {
		if (s1.inizioprobe != s2.inizioprobe) {
			return s1.inizioprobe < s2.inizioprobe;
		}
		return s1.fineprobe < s2.fineprobe;
	}
};

int main (int argc, char * const argv[]) {

	string concatenate=argv[3];
	int dist_t=atoi(argv[1]);
	string choice=argv[2];
	
	string name_out= argv[6];
	ofstream resfile;
	resfile.open (name_out.c_str());
	
	

	if (concatenate=="no" && choice!="union"){
		
		//print "flank=$win , type=$type col6=%overlap, concatenate=$concatenate";
		if (choice=="common"){
			cout<<"flank="<<dist_t<<" , type="<<choice<<" , col6=%overlap, concatenate="<<concatenate;
		}
		if (choice=="unique"){
			cout<<"flank="<<dist_t<<" , type="<<choice<<" , col6=score o p-value";
		}
		
		string line;
		Probe thisprobe;
		Probe thisanno;
		int overlap=0;
		vector<string> arraypro;
		map<string, vector<Probe> > seq;
		map<string, vector<Probe> > annotation;
		map<string, vector<Probe> >::iterator itseq;
	
		ifstream seque_file(argv[4]);
		while (getline(seque_file, line)) {
			string s4;
			s4.assign(line, 0, 1);
			if (line=="" || s4=="#"){
				continue;
			}
			arraypro.clear();
			Tokenize(line, arraypro, "\t");
			string chr2 =  (arraypro[0].c_str());
			thisprobe.inizioprobe=atoi(arraypro[3].c_str());
			thisprobe.fineprobe=atoi(arraypro[4].c_str());
			thisprobe.campo1=(arraypro[1].c_str());
			thisprobe.campo2=(arraypro[2].c_str());
			thisprobe.score=(arraypro[5].c_str());
			thisprobe.strand=(arraypro[6].c_str());
			thisprobe.index=(arraypro[8].c_str());
			seq[chr2].push_back(thisprobe);
		}
	
	
		ifstream anno_file(argv[5]);
		while (getline(anno_file, line)) {
			string s4;
			s4.assign(line, 0, 1);
			if (line=="" || s4=="#"){
				continue;
			}
			arraypro.clear();
			Tokenize(line, arraypro, "\t");
			string chr3=  (arraypro[0].c_str());
			thisanno.inizioprobe=atoi(arraypro[3].c_str());
			thisanno.fineprobe=atoi(arraypro[4].c_str());
			thisanno.campo1=(arraypro[1].c_str());
			thisanno.campo2=(arraypro[2].c_str());
			thisanno.score=(arraypro[5].c_str());
			thisanno.strand=(arraypro[6].c_str());
			thisanno.index=(arraypro[8].c_str());
			annotation[chr3].push_back(thisanno);
		}
  

		for ( itseq=seq.begin() ; itseq != seq.end(); itseq++ ){
	
			vector <Probe> seq_chr = (*itseq).second;
			vector <Probe> anno_chr = annotation[(*itseq).first];
			if(anno_chr.size()==0 && choice=="unique"){
				for (int i=0; i<seq_chr.size();i++){
					resfile<<(*itseq).first<<"\t"<<seq_chr[i].campo1<<"\t"<<seq_chr[i].campo2<<"\t"<<seq_chr[i].inizioprobe<<"\t"<<seq_chr[i].fineprobe<<"\t"<<seq_chr[i].score<<"\t"<<seq_chr[i].strand<<"\t.\t"<<seq_chr[i].index<<endl;
				}
				continue;
			}
			if(anno_chr.size()==0 && choice=="common"){
				continue;
			}
			sort (seq_chr.begin(),seq_chr.end(),Comparatore2());
			sort (anno_chr.begin(),anno_chr.end(),Comparatore2());
			
			int finefine=0;
			
			for (int i=0; i<anno_chr.size();i++){
				if(anno_chr[i].fineprobe<=finefine){
					anno_chr[i].fineprobe=finefine;
				}
				if(anno_chr[i].fineprobe>finefine){
					finefine=anno_chr[i].fineprobe;
				}
			}
		
			for (int i=0; i<seq_chr.size();i++){
				int start_array=0;
				int fine_array=anno_chr.size();
				int pos=1;
				int trovato=0;
				
				while (pos>0){
					pos=(fine_array-start_array)/2;
					int position=start_array+pos;
				
					if((seq_chr[i].inizioprobe-dist_t)<anno_chr[position].inizioprobe){
						fine_array=position;
					}
					if((seq_chr[i].inizioprobe-dist_t)>anno_chr[position].inizioprobe){
						start_array=position;
					}
					if((seq_chr[i].inizioprobe-dist_t)<=anno_chr[position].fineprobe && (seq_chr[i].fineprobe+dist_t)>=anno_chr[position].inizioprobe){
						if (choice=="common"){
							if (seq_chr[i].inizioprobe<=anno_chr[position].inizioprobe && seq_chr[i].fineprobe<=anno_chr[position].fineprobe){
								overlap=(seq_chr[i].fineprobe-anno_chr[position].inizioprobe)*100/(seq_chr[i].fineprobe-seq_chr[i].inizioprobe);
							}
							if (seq_chr[i].inizioprobe<=anno_chr[position].inizioprobe && seq_chr[i].fineprobe>=anno_chr[position].fineprobe){
								overlap=(anno_chr[position].fineprobe-anno_chr[position].inizioprobe)*100/(seq_chr[i].fineprobe-seq_chr[i].inizioprobe);
							}
							if (seq_chr[i].inizioprobe>=anno_chr[position].inizioprobe && seq_chr[i].fineprobe>=anno_chr[position].fineprobe){
								overlap=(anno_chr[position].fineprobe-seq_chr[i].inizioprobe)*100/(seq_chr[i].fineprobe-seq_chr[i].inizioprobe);
							}
							if (seq_chr[i].inizioprobe>=anno_chr[position].inizioprobe && seq_chr[i].fineprobe<=anno_chr[position].fineprobe){
								overlap=100;
							}
							if (overlap<0){
								overlap=-1;
							}
							resfile<<(*itseq).first<<"\t"<<seq_chr[i].campo1<<"\t"<<seq_chr[i].campo2<<"\t"<<seq_chr[i].inizioprobe<<"\t"<<seq_chr[i].fineprobe<<"\t"<<overlap<<"\t"<<seq_chr[i].strand<<"\t.\t"<<"ValueA:"<<seq_chr[i].score<<"~"<<"ValueB:"<<anno_chr[position].score<<endl;
						}
						trovato=1;
						break;
					}
				}
				if (choice=="unique" && trovato==0){
					resfile<<(*itseq).first<<"\t"<<seq_chr[i].campo1<<"\t"<<seq_chr[i].campo2<<"\t"<<seq_chr[i].inizioprobe<<"\t"<<seq_chr[i].fineprobe<<"\t"<<seq_chr[i].score<<"\t"<<seq_chr[i].strand<<"\t.\t"<<seq_chr[i].index<<endl;
				}
			}
		}
	}
	
	if (concatenate=="yes" || choice == "union"){
		
		cout<<"flank="<<dist_t<<" , type="<<choice<<" , col6=#overlaping regions, concatenate="<<concatenate;
		
		string line;
		Probe thisprobe;
		Probe thisanno;
		vector<string> arraypro;
		map<string, vector<Probe> > seq;
		map<string, vector<Probe> > annotation;
		map<string, vector<Probe> >::iterator itseq;
		string concatenate=argv[3];
		int dist_t=atoi(argv[1]);
		string choice=argv[2];
	
		ifstream seque_file(argv[4]);
		while (getline(seque_file, line)) {
			string s4;
			s4.assign(line, 0, 1);
			if (line=="" || s4=="#"){
				continue;
			}
			arraypro.clear();
			Tokenize(line, arraypro, "\t");
			string chr2 =  (arraypro[0].c_str());
			thisprobe.inizioprobe=atoi(arraypro[3].c_str());
			thisprobe.fineprobe=atoi(arraypro[4].c_str());
			thisprobe.campo2=(arraypro[2].c_str());
			thisprobe.campo1=(arraypro[1].c_str());
			thisprobe.score=(arraypro[5].c_str());
			thisprobe.index=(arraypro[8].c_str());
			thisprobe.strand=(arraypro[6].c_str());
			thisprobe.file=1;
			seq[chr2].push_back(thisprobe);
		}
	
		ifstream anno_file(argv[5]);
		while (getline(anno_file, line)) {
			string s4;
			s4.assign(line, 0, 1);
			if (line=="" || s4=="#"){
				continue;
			}
			arraypro.clear();
			Tokenize(line, arraypro, "\t");
			string chr3=  (arraypro[0].c_str());
			thisanno.inizioprobe=atoi(arraypro[3].c_str());
			thisanno.fineprobe=atoi(arraypro[4].c_str());
			thisanno.campo2=(arraypro[2].c_str());
			thisanno.campo1=(arraypro[1].c_str());
			thisanno.score=(arraypro[5].c_str());
			thisanno.index=(arraypro[8].c_str());
			thisanno.strand=(arraypro[6].c_str());
			thisanno.file=2;
			seq[chr3].push_back(thisanno);
		}
    
		int inizio;
		int fine;
		string annot;
		int overlap;
		int inizio_ann;
		int fine_ann;
	
		for ( itseq=seq.begin() ; itseq != seq.end(); itseq++ ){
	
			vector <Probe> seq_chr = (*itseq).second;
			sort (seq_chr.begin(),seq_chr.end(),Comparatore2());
		
			for (int i=0; i<seq_chr.size();i++){
				inizio = seq_chr[i].inizioprobe;
				fine=seq_chr[i].fineprobe;
				int file_t=0;
				int file_t2=0;
				int entrato=1;
				int z=1;
				if(seq_chr[i].file==1){
					file_t=1;
				}
				if(seq_chr[i].file==2){
					file_t2=1;
				}
				if(i==(seq_chr.size()-1)){
					if (choice=="union"){
						resfile<<(*itseq).first<<"\tfile_"<<seq_chr[i].file<<"\tunique\t"<<seq_chr[i].inizioprobe<<"\t"<<seq_chr[i].fineprobe<<"\t"<<seq_chr[i].score<<"\t"<<seq_chr[i].strand<<"\t.\t"<<seq_chr[i].index<<endl;
					}
				}
				
				//cout<<"x"<<(*itseq).first<<"\t"<<seq_chr[i].inizioprobe<<"\t"<<seq_chr[i].fineprobe<<"\t"<<seq_chr[i].file<<endl;
				for (int y=i+1; y<seq_chr.size(); y++){
					if((inizio-dist_t)<=seq_chr[y].fineprobe && (fine+dist_t)>=seq_chr[y].inizioprobe){
						if(seq_chr[y].file==1){
							file_t=1;
						}
						if(seq_chr[y].file==2){
							file_t2=1;
						}
						if(seq_chr[y].fineprobe>fine){
							fine=seq_chr[y].fineprobe;
						}
						entrato=2;
						i++;
						z++;
					}
					if(seq_chr[y].inizioprobe>fine || y==seq_chr.size()-1){
						if (choice == "union" && entrato==1){
							resfile<<(*itseq).first<<"\tfile_"<<seq_chr[i].file<<"\tunique\t"<<inizio<<"\t"<<fine<<"\t"<<seq_chr[i].score<<"\t"<<seq_chr[i].strand<<"\t.\t"<<seq_chr[i].index<<endl;
						}
						if (choice == "union" && entrato==2){
							resfile<<(*itseq).first<<"\tcommon\tcommon\t"<<inizio<<"\t"<<fine<<"\t"<<z<<"\t.\t.\tcommon"<<endl;
						}
						if (choice == "common" && entrato==2 && file_t == 1 && file_t2 == 1 && concatenate=="yes"){
							resfile<<(*itseq).first<<"\t"<<seq_chr[i].campo1<<"\t"<<seq_chr[i].campo2<<"\t"<<inizio<<"\t"<<fine<<"\t"<<z<<"\t"<<seq_chr[i].strand<<"\t.\t"<<seq_chr[i].index<<endl;
						}
						break;
					}
				}
			}
		}
	}
}