Mercurial > repos > matces > carpet_toolsuite
diff carpet-src-1/tools/CARPET/com_uni.cpp @ 0:cdd489d98766
Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author | matces |
---|---|
date | Tue, 07 Jun 2011 16:50:41 -0400 |
parents | |
children |
line wrap: on
line diff
/*
* Copyright 2009 Matteo Cesaroni, Lucilla Luzi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 3 of the License, or (at your
* option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.

*/

#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <algorithm>
#include <sstream>
#include <deque>
#include <map>
#include <ctime>
#include <cstdlib>
#include <cstddef>

using namespace std;

/*
 * Splits `str` on any character contained in `delimiters` and appends the
 * pieces to `tokens` (the vector is NOT cleared first).
 *
 * Runs of consecutive delimiters are treated as a single separator, so empty
 * fields are never produced: "a\t\tb" yields {"a", "b"}.
 */
inline void Tokenize(const string& str,
                     vector<string>& tokens,
                     const string& delimiters = " ")
{
    // Skip delimiters at beginning.
    string::size_type lastPos = str.find_first_not_of(delimiters, 0);
    // Find first delimiter after the first token.
    string::size_type pos = str.find_first_of(delimiters, lastPos);

    while (string::npos != pos || string::npos != lastPos)
    {
        // Found a token, add it to the vector.
        tokens.push_back(str.substr(lastPos, pos - lastPos));
        // Skip delimiters. Note the "not_of"
        lastPos = str.find_first_not_of(delimiters, pos);
        // Find the delimiter that ends the next token.
        pos = str.find_first_of(delimiters, lastPos);
    }
}


/*
 * One interval read from an input file.
 *
 * Fields are filled from the tab-separated columns of a line (0-based):
 *   [1] campo1, [2] campo2, [3] inizioprobe (start), [4] fineprobe (end),
 *   [5] score, [6] strand, [8] index.  Column [0] is used as the grouping key
 *   (named chr* in the code) and column [7] is ignored.
 * NOTE(review): the layout looks GFF-like -- confirm against the tool's
 * documented input format.
 *
 * `file` records which input the interval came from (1 = first, 2 = second).
 */
struct Probe {
    int inizioprobe;
    int fineprobe;
    string score;
    string campo1;
    string campo2;
    string strand;
    string index;
    int file;

    // Give the integer members a defined value even when a caller does not
    // set all of them.
    Probe() : inizioprobe(0), fineprobe(0), file(0) {}
};

/*
 * Orders intervals by start, then by end.
 *
 * This must be a strict weak ordering: two intervals with identical start and
 * end compare as "not less" in both directions.  Returning true for equal
 * elements makes std::sort / std::stable_sort undefined behavior.
 */
struct Comparatore2 {
    bool operator()(const Probe& s1, const Probe& s2) const {
        if (s1.inizioprobe != s2.inizioprobe) {
            return s1.inizioprobe < s2.inizioprobe;
        }
        return s1.fineprobe < s2.fineprobe;
    }
};

// Intervals grouped by the value of their first column.
typedef map<string, vector<Probe> > ProbeMap;

// Highest column index read is 8, so a usable line has at least 9 tokens.
static const size_t kRequiredFields = 9;

// Writes one tab-separated 9-column output row; the 8th column is always ".".
template <typename Col6>
static void WriteRow(ostream& out,
                     const string& chr,
                     const string& col2,
                     const string& col3,
                     int start,
                     int end,
                     const Col6& col6,
                     const string& strand,
                     const string& attributes)
{
    out << chr << '\t' << col2 << '\t' << col3 << '\t' << start << '\t' << end
        << '\t' << col6 << '\t' << strand << "\t.\t" << attributes << '\n';
}

// Builds the "file_<n>" label used in the second column of union output.
static string FileLabel(int file)
{
    ostringstream label;
    label << "file_" << file;
    return label.str();
}

/*
 * Reads every data line of `path` into `dest`, keyed by the first column.
 *
 * Empty lines and lines starting with '#' are ignored.  Lines with fewer than
 * kRequiredFields tokens are skipped (indexing them would read past the end of
 * the token vector) and reported once on stderr.  Each stored interval is
 * tagged with `file_tag`.
 *
 * Returns false (after printing a message to stderr) if the file cannot be
 * opened.
 */
static bool LoadProbes(const char* path, int file_tag, ProbeMap& dest)
{
    ifstream in(path);
    if (!in) {
        cerr << "com_uni: cannot open input file '" << path << "'" << endl;
        return false;
    }

    string line;
    vector<string> fields;
    size_t skipped = 0;

    while (getline(in, line)) {
        if (line.empty() || line[0] == '#') {
            continue;
        }
        fields.clear();
        Tokenize(line, fields, "\t");
        if (fields.size() < kRequiredFields) {
            ++skipped;
            continue;
        }

        Probe probe;
        probe.inizioprobe = atoi(fields[3].c_str());
        probe.fineprobe = atoi(fields[4].c_str());
        probe.campo1 = fields[1];
        probe.campo2 = fields[2];
        probe.score = fields[5];
        probe.strand = fields[6];
        probe.index = fields[8];
        probe.file = file_tag;
        dest[fields[0]].push_back(probe);
    }

    if (skipped > 0) {
        cerr << "com_uni: skipped " << skipped << " line(s) with fewer than "
             << kRequiredFields << " tab-separated fields in '" << path << "'"
             << endl;
    }
    return true;
}

// Returns numerator*100/length, computed in 64 bits so that long intervals do
// not overflow.  A zero-length probe cannot be expressed as a percentage: a
// negative numerator maps to -1 (the "no real overlap" marker), anything else
// to 0.
static long long ScaledRatio(long long numerator, long long length)
{
    if (length == 0) {
        return (numerator < 0) ? -1 : 0;
    }
    return numerator * 100 / length;
}

/*
 * Percentage of `probe` covered by `anno`, or -1 when the computed value is
 * negative (the two intervals only meet through the flank distance).
 *
 * The four tests are evaluated in sequence and a later match overrides an
 * earlier one.  Note that `anno.fineprobe` is the value AFTER the running
 * maximum normalisation done in ReportPairwise, not necessarily the end
 * coordinate read from the file.
 */
static int OverlapPercent(const Probe& probe, const Probe& anno)
{
    const long long probe_start = probe.inizioprobe;
    const long long probe_end = probe.fineprobe;
    const long long anno_start = anno.inizioprobe;
    const long long anno_end = anno.fineprobe;
    const long long probe_len = probe_end - probe_start;

    long long overlap = 0;
    if (probe_start <= anno_start && probe_end <= anno_end) {
        overlap = ScaledRatio(probe_end - anno_start, probe_len);
    }
    if (probe_start <= anno_start && probe_end >= anno_end) {
        overlap = ScaledRatio(anno_end - anno_start, probe_len);
    }
    if (probe_start >= anno_start && probe_end >= anno_end) {
        overlap = ScaledRatio(anno_end - probe_start, probe_len);
    }
    if (probe_start >= anno_start && probe_end <= anno_end) {
        overlap = 100;
    }
    if (overlap < 0) {
        overlap = -1;
    }
    return static_cast<int>(overlap);
}

/*
 * Non-concatenating "common" / "unique" report.
 *
 * For every interval of `seq`, looks for one interval of `annotation` with the
 * same key that lies within `dist_t` of it:
 *   - choice == "common": a row is written for intervals that have a match;
 *     column 6 is the overlap percentage and the last column carries both
 *     scores ("ValueA:<seq score>~ValueB:<annotation score>").
 *   - choice == "unique": a row is written for intervals that have no match.
 *
 * Both maps are sorted in place, and the end coordinates of the annotation
 * intervals are replaced by their running maximum so that they are
 * non-decreasing, which is what the bisection below relies on.
 */
static void ReportPairwise(ostream& out,
                           const string& choice,
                           int dist_t,
                           ProbeMap& seq,
                           ProbeMap& annotation)
{
    const bool want_common = (choice == "common");
    const bool want_unique = (choice == "unique");

    for (ProbeMap::iterator itseq = seq.begin(); itseq != seq.end(); ++itseq) {
        const string& chr = itseq->first;
        vector<Probe>& seq_chr = itseq->second;

        // find() instead of operator[]: do not create empty map entries.
        ProbeMap::iterator itanno = annotation.find(chr);
        if (itanno == annotation.end() || itanno->second.empty()) {
            // Nothing to compare against: every interval is unique.  They are
            // written in input order, before any sorting.
            if (want_unique) {
                for (size_t i = 0; i < seq_chr.size(); ++i) {
                    const Probe& probe = seq_chr[i];
                    WriteRow(out, chr, probe.campo1, probe.campo2,
                             probe.inizioprobe, probe.fineprobe, probe.score,
                             probe.strand, probe.index);
                }
            }
            // Also skips the search for any other `choice`, which would
            // otherwise index an empty vector.
            continue;
        }
        vector<Probe>& anno_chr = itanno->second;

        stable_sort(seq_chr.begin(), seq_chr.end(), Comparatore2());
        stable_sort(anno_chr.begin(), anno_chr.end(), Comparatore2());

        // Make annotation end coordinates non-decreasing (running maximum,
        // starting from 0).
        int finefine = 0;
        for (size_t k = 0; k < anno_chr.size(); ++k) {
            if (anno_chr[k].fineprobe <= finefine) {
                anno_chr[k].fineprobe = finefine;
            } else {
                finefine = anno_chr[k].fineprobe;
            }
        }

        for (size_t i = 0; i < seq_chr.size(); ++i) {
            const Probe& probe = seq_chr[i];
            const int target = probe.inizioprobe - dist_t;

            size_t lo = 0;
            size_t hi = anno_chr.size();
            size_t half = 1;
            bool found = false;

            // Bisection on the annotation start coordinate.  The body also
            // runs for half == 0 so that the element at `lo` is examined
            // before giving up.
            while (half > 0) {
                half = (hi - lo) / 2;
                const size_t position = lo + half;
                const Probe& anno = anno_chr[position];

                if (target < anno.inizioprobe) {
                    hi = position;
                } else if (target > anno.inizioprobe) {
                    lo = position;
                }

                if (target <= anno.fineprobe &&
                    (probe.fineprobe + dist_t) >= anno.inizioprobe) {
                    if (want_common) {
                        WriteRow(out, chr, probe.campo1, probe.campo2,
                                 probe.inizioprobe, probe.fineprobe,
                                 OverlapPercent(probe, anno), probe.strand,
                                 "ValueA:" + probe.score + "~ValueB:" + anno.score);
                    }
                    found = true;
                    break;
                }

                // Equal starts but no overlap (possible with a negative flank
                // or an inverted interval): neither bound moved, so stop
                // rather than loop forever.
                if (target == anno.inizioprobe) {
                    break;
                }
            }

            if (want_unique && !found) {
                WriteRow(out, chr, probe.campo1, probe.campo2,
                         probe.inizioprobe, probe.fineprobe, probe.score,
                         probe.strand, probe.index);
            }
        }
    }
}

/*
 * "union" / concatenating report over the intervals of BOTH inputs, which the
 * caller has loaded into the single map `seq` (tagged file 1 / file 2).
 *
 * Within each key the intervals are sorted and then chained: starting from
 * interval i, each following interval that lies within `dist_t` of the
 * current region extends it.  When the chain ends:
 *   - choice == "union": a "file_<n> / unique" row is written for a region
 *     made of a single interval, a "common" row (column 6 = number of chained
 *     intervals) for a region made of several;
 *   - choice == "common" with concatenate == "yes": a row is written only for
 *     regions that chain intervals from both files.
 *
 * The outer index is advanced once per chained interval, so chained intervals
 * are not visited again as region starts.
 */
static void ReportMerged(ostream& out,
                         const string& choice,
                         const string& concatenate,
                         int dist_t,
                         ProbeMap& seq)
{
    const bool is_union = (choice == "union");
    const bool is_common_concat = (choice == "common" && concatenate == "yes");

    for (ProbeMap::iterator itseq = seq.begin(); itseq != seq.end(); ++itseq) {
        const string& chr = itseq->first;
        vector<Probe>& seq_chr = itseq->second;
        // Stable so that intervals with identical coordinates keep their
        // load order (first file before second file).
        stable_sort(seq_chr.begin(), seq_chr.end(), Comparatore2());

        const size_t count = seq_chr.size();
        for (size_t i = 0; i < count; i++) {
            const int inizio = seq_chr[i].inizioprobe;
            int fine = seq_chr[i].fineprobe;
            bool from_first = (seq_chr[i].file == 1);
            bool from_second = (seq_chr[i].file == 2);
            bool merged = false;
            int z = 1;  // number of intervals chained into this region

            // The last interval has no successor, so the inner loop below
            // never runs for it; emit it here.
            if (i == count - 1 && is_union) {
                WriteRow(out, chr, FileLabel(seq_chr[i].file), "unique",
                         seq_chr[i].inizioprobe, seq_chr[i].fineprobe,
                         seq_chr[i].score, seq_chr[i].strand, seq_chr[i].index);
            }

            for (size_t y = i + 1; y < count; y++) {
                if ((inizio - dist_t) <= seq_chr[y].fineprobe &&
                    (fine + dist_t) >= seq_chr[y].inizioprobe) {
                    if (seq_chr[y].file == 1) {
                        from_first = true;
                    }
                    if (seq_chr[y].file == 2) {
                        from_second = true;
                    }
                    if (seq_chr[y].fineprobe > fine) {
                        fine = seq_chr[y].fineprobe;
                    }
                    merged = true;
                    i++;  // consume the chained interval in the outer loop too
                    z++;
                }
                if (seq_chr[y].inizioprobe > fine || y == count - 1) {
                    // `i` may have advanced: the descriptive columns come from
                    // the interval the outer index currently points at.
                    const Probe& current = seq_chr[i];
                    if (is_union && !merged) {
                        WriteRow(out, chr, FileLabel(current.file), "unique",
                                 inizio, fine, current.score, current.strand,
                                 current.index);
                    }
                    if (is_union && merged) {
                        WriteRow(out, chr, "common", "common", inizio, fine, z,
                                 ".", "common");
                    }
                    if (is_common_concat && merged && from_first && from_second) {
                        WriteRow(out, chr, current.campo1, current.campo2,
                                 inizio, fine, z, current.strand, current.index);
                    }
                    break;
                }
            }
        }
    }
}

/*
 * Usage: <program> <flank> <type> <concatenate> <input1> <input2> <output>
 *   argv[1]  flank distance (integer) added on both sides when matching
 *   argv[2]  type: "common", "unique" or "union"
 *   argv[3]  concatenate: "yes" or "no"
 *   argv[4]  first input file
 *   argv[5]  second input file
 *   argv[6]  output file
 *
 * A one-line summary of the parameters is written to stdout (without a
 * trailing newline); result rows go to the output file.
 * Returns 0 on success, 1 on bad arguments or when a file cannot be opened
 * or written.
 */
int main (int argc, char * const argv[]) {

    if (argc < 7) {
        cerr << "usage: " << (argc > 0 ? argv[0] : "com_uni")
             << " <flank> <common|unique|union> <yes|no>"
             << " <input1> <input2> <output>" << endl;
        return 1;
    }

    const int dist_t = atoi(argv[1]);
    const string choice = argv[2];
    const string concatenate = argv[3];
    const string name_out = argv[6];

    ofstream resfile(name_out.c_str());
    if (!resfile) {
        cerr << "com_uni: cannot open output file '" << name_out << "'" << endl;
        return 1;
    }

    if (concatenate=="no" && choice!="union"){
        if (choice=="common"){
            cout<<"flank="<<dist_t<<" , type="<<choice<<" , col6=%overlap, concatenate="<<concatenate;
        }
        if (choice=="unique"){
            cout<<"flank="<<dist_t<<" , type="<<choice<<" , col6=score o p-value";
        }

        ProbeMap seq;
        ProbeMap annotation;
        if (!LoadProbes(argv[4], 1, seq) || !LoadProbes(argv[5], 2, annotation)) {
            return 1;
        }
        ReportPairwise(resfile, choice, dist_t, seq, annotation);
    }

    if (concatenate=="yes" || choice == "union"){
        cout<<"flank="<<dist_t<<" , type="<<choice<<" , col6=#overlaping regions, concatenate="<<concatenate;

        // Both inputs go into the same map; Probe::file tells them apart.
        ProbeMap seq;
        if (!LoadProbes(argv[4], 1, seq) || !LoadProbes(argv[5], 2, seq)) {
            return 1;
        }
        ReportMerged(resfile, choice, concatenate, dist_t, seq);
    }

    resfile.flush();
    if (!resfile) {
        cerr << "com_uni: error while writing '" << name_out << "'" << endl;
        return 1;
    }
    return 0;
}