# HG changeset patch # User calkan # Date 1329838187 18000 # Node ID 7b3dc85dc7fd9a93295a1714721b2a46ef0322bd Uploaded mrfast source tarball diff -r 000000000000 -r 7b3dc85dc7fd mrfast-2.1.0.4/CommandLineParser.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.4/CommandLineParser.c Tue Feb 21 10:29:47 2012 -0500 @@ -0,0 +1,388 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu +*/ + + +#include +#include +#include +#include +#include +#include "Common.h" +#include "CommandLineParser.h" + +int uniqueMode=1; +int indexingMode; +int searchingMode; +int pairedEndMode; +int pairedEndDiscordantMode; +int transChromosal=0; +int pairedEndProfilingMode; +int seqCompressed; +int outCompressed; +int cropSize = 0; +int progressRep = 0; +int minPairEndedDistance=-1; +int maxPairEndedDistance=-1; +int minPairEndedDiscordantDistance=-1; +int maxPairEndedDiscordantDistance=-1; +int bestMode; +int nosamMode; +char *seqFile1; +char *seqFile2; +char *mappingOutput = "output"; +char *mappingOutputPath = ""; +char *unmappedOutput = "unmapped"; +char fileName[1000][2][FILE_NAME_LENGTH]; +int fileCnt; +int maxOEAOutput=1000; +int maxDiscordantOutput=10000; +unsigned char errThreshold=2; +unsigned char maxHits=0; +unsigned char WINDOW_SIZE = 12; +unsigned int CONTIG_SIZE; +unsigned int CONTIG_MAX_SIZE; + +void printHelp(); + +int parseCommandLine (int argc, char *argv[]) +{ + + int o; + int index; + char *fastaFile = NULL; + char *batchFile = NULL ; + int batchMode = 0; + + static struct option longOptions[] = + { + {"pe", no_argument, &pairedEndMode, 1}, + {"discordant-vh", no_argument, &pairedEndDiscordantMode, 1}, + {"trans", no_argument, &transChromosal , 1}, + {"profile", no_argument, &pairedEndProfilingMode, 1}, + {"seqcomp", no_argument, &seqCompressed, 1}, + {"outcomp", no_argument, &outCompressed, 1}, + {"progress", no_argument, &progressRep, 1}, + {"best", no_argument, &bestMode, 1}, + {"index", required_argument, 0, 'i'}, + {"search", required_argument, 0, 's'}, + {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'v'}, + {"seq", required_argument, 0, 'x'}, + {"seq1", required_argument, 0, 'x'}, + {"seq2", required_argument, 0, 'y'}, + {"ws", required_argument, 0, 'w'}, + {"min", required_argument, 0, 'l'}, + {"max", required_argument, 0, 'm'}, + {"crop", required_argument, 0, 'c'}, + {"maxoea", required_argument, 0, 'a'}, + {"maxdis", required_argument, 0, 'd'}, + {"nosam", no_argument, &nosamMode, 1}, + {0, 0, 0, 0}, + }; + + while ( (o = getopt_long ( argc, argv, "bhvn:e:o:u:i:s:x:y:w:l:m:c:a:d:", longOptions, &index)) != -1 ) + { + switch (o) + { + case 'a': + maxOEAOutput = atoi(optarg); + break; + case 'd': + maxDiscordantOutput = atoi(optarg); + break; + case 'i': + indexingMode = 1; + fastaFile = optarg; + break; + case 's': + searchingMode = 1; + fastaFile = optarg; + break; + case 'b': + batchMode = 1; + break; + case 'c': + cropSize = atoi(optarg); + break; + case 'w': + WINDOW_SIZE = atoi(optarg); + break; + case 'x': + seqFile1 = optarg; + break; + case 'y': + seqFile2 = optarg; + break; + case 'u': + unmappedOutput = optarg; + break; + case 'o': + mappingOutput = getMem(FILE_NAME_LENGTH); + mappingOutputPath = getMem(FILE_NAME_LENGTH); + stripPath (optarg, &mappingOutputPath, &mappingOutput); + break; + case 'n': + maxHits = atoi(optarg); + break; + case 'e': + errThreshold = atoi(optarg); + break; + case 'l': + minPairEndedDistance = atoi(optarg); + break; + case 'm': + maxPairEndedDistance = atoi(optarg); + break; + case 'h': + printHelp(); + return 0; + break; + case 'v': + fprintf(stdout, "%s.%s\n", versionNumber, versionNumberF); + return 0; + break; + /* case '?': + fprintf(stderr, "Unknown parameter: %s\n", longOptions[index].name); + abort(); + break;*/ + } + + } + if (indexingMode + searchingMode != 1) + { + fprintf(stdout, "ERROR: Indexing / Searching mode should be selected\n"); + return 0; + } + + if (WINDOW_SIZE > 15 || WINDOW_SIZE < 11) + { + fprintf(stdout, "ERROR: Window size should be in [12..15]\n"); + return 0; + } + + + if ( indexingMode ) + { + CONTIG_SIZE = 15000000; + CONTIG_MAX_SIZE = 40000000; + + if (batchMode) + { + batchFile = fastaFile; + fastaFile = NULL; + } + + if (batchFile == NULL && fastaFile == NULL) + { + fprintf(stdout, "ERROR: Reference(s) should be indicated for indexing\n"); + return 0; + } + + if (pairedEndDiscordantMode) + { + fprintf(stdout, "ERROR: --discordant cannot be used in indexing mode. \n"); + return 0; + } + + } + + + if ( searchingMode ) + { + CONTIG_SIZE = 300000000; + CONTIG_MAX_SIZE = 300000000; + + + if (batchMode) + { + batchFile = fastaFile; + fastaFile = NULL; + } + + if (batchFile == NULL && fastaFile == NULL) + { + fprintf(stdout, "ERROR: Index File(s) should be indiciated for searching\n"); + return 0; + } + + if (seqFile1 == NULL && seqFile2 == NULL) + { + fprintf(stdout, "ERROR: Please indicate a sequence file for searching.\n"); + return 0; + } + + + if (!pairedEndMode && seqFile2 != NULL) + { + fprintf(stdout, "ERROR: Second File can be indicated in pairedend mode\n"); + return 0; + } + + if (pairedEndMode && (minPairEndedDistance <0 || maxPairEndedDistance < 0 || minPairEndedDistance > maxPairEndedDistance)) + { + fprintf(stdout, "ERROR: Please enter a valid range for pairedend sequences.\n"); + return 0; + } + + if (pairedEndMode && seqFile1 == NULL) + { + fprintf(stdout, "ERROR: Please indicate the first file for pairedend search.\n"); + return 0; + } + + if (!pairedEndMode && pairedEndDiscordantMode) + { + fprintf(stdout, "ERROR: --discordant should be used with --pe"); + return 0; + } + + if (!pairedEndMode && pairedEndProfilingMode) + { + fprintf(stdout, "ERROR: --profile should be used with --pe"); + return 0; + } + } + + int i = 0; + + + if (batchMode) + { + FILE *fp = fileOpen(batchFile, "r"); + + if (fp == NULL) + return 0; + + fileCnt = 0; + + while ( fgets(fileName[fileCnt][0], FILE_NAME_LENGTH, fp)) + { + for (i = strlen(fileName[fileCnt][0])-1; i>=0; i--) + if ( !isspace(fileName[fileCnt][0][i])) + break; + fileName[fileCnt][0][i+1] = '\0'; + + if (strcmp(fileName[fileCnt][0], "") != 0) + { + sprintf(fileName[fileCnt][1], "%s.index", fileName[fileCnt][0]); + fileCnt++; + } + } + } + else + { + sprintf(fileName[fileCnt][0], "%s", fastaFile); + sprintf(fileName[fileCnt][1], "%s.index", fileName[fileCnt][0]); + fileCnt++; + } + + + if (pairedEndProfilingMode) + { + + minPairEndedDistance = 0; + maxPairEndedDistance = 300000000; + + } + + if (pairedEndDiscordantMode) + { + minPairEndedDiscordantDistance = minPairEndedDistance; + maxPairEndedDiscordantDistance = maxPairEndedDistance; + + minPairEndedDistance = 0; + maxPairEndedDistance = 300000000; + } + + return 1; +} + + +void printHelp() +{ + char *errorType; + if (mrFAST) + { + fprintf(stdout,"mrFAST : Micro-Read Fast Alignment Search Tool.\n\n"); + fprintf(stdout,"Usage: mrfast [options]\n\n"); + errorType="edit distance"; + } + else + { + fprintf(stdout,"mrsFAST : Micro-Read Substitutions (only) Fast Alignment Search Tool.\n\n"); + fprintf(stdout,"mrsFAST is a cache oblivious read mapping tool. mrsFAST capable of mapping\n"); + fprintf(stdout,"single and paired end reads to the reference genome. Bisulfite treated \n"); + fprintf(stdout,"sequences are not supported in this version. By default mrsFAST reports \n"); + fprintf(stdout,"the output in SAM format.\n\n"); + fprintf(stdout,"Usage: mrsFAST [options]\n\n"); + errorType="hamming distance"; + } + + fprintf(stdout,"General Options:\n"); + fprintf(stdout," -v|--version\t\tCurrent Version.\n"); + fprintf(stdout," -h\t\t\tShows the help file.\n"); + fprintf(stdout,"\n\n"); + + fprintf(stdout,"Indexing Options:\n"); + fprintf(stdout," --index [file]\t\tGenerate an index from the specified fasta file. \n"); + fprintf(stdout," -b\t\t\tIndicates the indexing will be done in batch mode.\n\t\t\tThe file specified in --index should contain the \n\t\t\tlist of fasta files.\n"); + fprintf(stdout," --ws [int]\t\tSet window size for indexing (default:12 max:14).\n"); + fprintf(stdout,"\n\n"); + + fprintf(stdout,"Searching Options:\n"); + fprintf(stdout," --search [file]\tSearch in the specified genome. Provide the path to the fasta file. \n\t\t\tIndex file should be in the same directory.\n"); + fprintf(stdout," -b\t\t\tIndicates the mapping will be done in batch mode. \n\t\t\tThe file specified in --search should contain the \n\t\t\tlist of fasta files.\n"); + fprintf(stdout," --pe \t\t\tSearch will be done in Paired-End mode.\n"); + fprintf(stdout," --seq [file]\t\tInput sequences in fasta/fastq format [file]. If \n\t\t\tpaired end reads are interleaved, use this option.\n"); + fprintf(stdout," --seq1 [file]\t\tInput sequences in fasta/fastq format [file] (First \n\t\t\tfile). Use this option to indicate the first file of \n\t\t\tpaired end reads. \n"); + fprintf(stdout," --seq2 [file]\t\tInput sequences in fasta/fastq format [file] (Second \n\t\t\tfile). Use this option to indicate the second file of \n\t\t\tpaired end reads. \n"); + fprintf(stdout," -o [file]\t\tOutput of the mapped sequences. The default is \"output\".\n"); + fprintf(stdout," -u [file]\t\tSave unmapped sequences in fasta/fastq format.\n"); + fprintf(stdout," --best \t\tOnly the best mapping from all the possible mapping is returned.\n"); + fprintf(stdout," --seqcomp \t\tIndicates that the input sequences are compressed (gz).\n"); + fprintf(stdout," --outcomp \t\tIndicates that output file should be compressed (gz).\n"); + fprintf(stdout," -e [int]\t\tMaximum allowed %s (default 2).\n", errorType); + fprintf(stdout," --min [int]\t\tMin distance allowed between a pair of end sequences.\n"); + fprintf(stdout," --max [int]\t\tMax distance allowed between a pair of end sequences.\n"); + + fprintf(stdout," --maxoea [int]\t\tMax number of One End Anchored (OEA) returned for each read pair. We recommend 100 or above for NovelSeq use.\n"); + fprintf(stdout," --maxdis [int]\t\tMax number of discordant map locations returned for each read pair. We recommend 300 or above for VariationHunter use.\n"); +} diff -r 000000000000 -r 7b3dc85dc7fd mrfast-2.1.0.4/CommandLineParser.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.4/CommandLineParser.h Tue Feb 21 10:29:47 2012 -0500 @@ -0,0 +1,49 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu +*/ + + + +#ifndef __COMMAND_LINE_PARSER__ +#define __COMMAND_LINE_PARSER__ + +int parseCommandLine (int argc, char *argv[]); + +#endif diff -r 000000000000 -r 7b3dc85dc7fd mrfast-2.1.0.4/Common.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.4/Common.c Tue Feb 21 10:29:47 2012 -0500 @@ -0,0 +1,184 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu + */ + + + +#include +#include +#include +#include +#include +#include "Common.h" + + +unsigned short SEQ_LENGTH = 0; +long long memUsage = 0; +/**********************************************/ +FILE *fileOpen(char *fileName, char *mode) +{ + FILE *fp; + fp = fopen (fileName, mode); + if (fp == NULL) + { + fprintf(stdout, "Error: Cannot Open the file %s\n", fileName); + fflush(stdout); + exit(0); + } + return fp; +} +/**********************************************/ +gzFile fileOpenGZ(char *fileName, char *mode) +{ + gzFile gzfp; + gzfp = gzopen (fileName, mode); + if (gzfp == Z_NULL) + { + fprintf(stdout, "Error: Cannot Open the file %s\n", fileName); + fflush(stdout); + exit(0); + } + return gzfp; +} +/**********************************************/ +double getTime(void) +{ + struct timeval t; + gettimeofday(&t, NULL); + return t.tv_sec+t.tv_usec/1000000.0; +} + +/**********************************************/ +char reverseCompleteChar(char c) +{ + char ret; + switch (c) + { + case 'A': + ret = 'T'; + break; + case 'T': + ret = 'A'; + break; + case 'C': + ret = 'G'; + break; + case 'G': + ret = 'C'; + break; + default: + ret = 'N'; + break; + } + return ret; +} +/**********************************************/ +void reverseComplete (char *seq, char *rcSeq , int length) +{ + int i; + for (i=0; i=0; i--) + { + if (full[i]=='/') + { + pos = i; + break; + } + + } + + if (pos != -1) + { + sprintf(*fileName, "%s%c", (full+pos+1), '\0'); + full[pos+1]='\0'; + sprintf(*path,"%s%c", full, '\0'); + } + else + { + sprintf(*fileName, "%s%c", full, '\0'); + sprintf(*path,"%c", '\0'); + } +} diff -r 000000000000 -r 7b3dc85dc7fd mrfast-2.1.0.4/Common.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.4/Common.h Tue Feb 21 10:29:47 2012 -0500 @@ -0,0 +1,111 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu + */ + + + +#ifndef __COMMON__ +#define __COMMON__ + +#include + +#define SEQ_MAX_LENGTH 300 // Seq Max Length +#define CONTIG_OVERLAP 200 // No. of characters overlapped between contings +#define CONTIG_NAME_SIZE 200 // Contig name max size +#define FILE_NAME_LENGTH 400 // Filename Max Length +#define DISCORDANT_CUT_OFF 800 +#define MAX_OPEN_FILE 600 +#define MAX_TRANS_CHROMOSAL_OUTPUT 50 +#define MAX_OEA_OUT 500 + +extern unsigned int CONTIG_SIZE; +extern unsigned int CONTIG_MAX_SIZE; + + +extern unsigned char WINDOW_SIZE ; // WINDOW SIZE for indexing/searching +extern unsigned short SEQ_LENGTH; // Sequence(read) length + +extern char *versionNumber; +extern char *versionNumberF; +extern unsigned char mrFAST; + + +extern int maxOEAOutput; +extern int maxDiscordantOutput; +extern int uniqueMode; +extern int indexingMode; +extern int searchingMode; +extern int pairedEndMode; +extern int pairedEndDiscordantMode; +extern int transChromosal; +extern int pairedEndProfilingMode; +extern int bestMode; +extern int nosamMode; +extern int seqCompressed; +extern int outCompressed; +extern int cropSize; +extern int progressRep; +extern char *seqFile1; +extern char *seqFile2; +extern char *seqUnmapped; +extern char *mappingOutput; +extern char *mappingOutputPath; +extern char *unmappedOutput; +extern unsigned char seqFastq; +extern unsigned char errThreshold; +extern unsigned char maxHits; +extern int minPairEndedDiscordantDistance; +extern int maxPairEndedDiscordantDistance; +extern int minPairEndedDistance; +extern int maxPairEndedDistance; +extern char fileName[1000][2][FILE_NAME_LENGTH]; +extern int fileCnt; +extern long long memUsage; + +FILE * fileOpen(char *fileName, char *mode); +gzFile fileOpenGZ(char *fileName, char *mode); +double getTime(void); +void reverseComplete (char *seq, char *rcSeq , int length); +void * getMem(size_t size); +void freeMem(void * ptr, size_t size); +double getMemUsage(); +void reverse (char *seq, char *rcSeq , int length); +void stripPath(char *full, char **path, char **fileName); +#endif diff -r 000000000000 -r 7b3dc85dc7fd mrfast-2.1.0.4/HashTable.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.4/HashTable.c Tue Feb 21 10:29:47 2012 -0500 @@ -0,0 +1,472 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu +*/ + + + +#include +#include +#include +#include +#include "Common.h" +#include "RefGenome.h" +#include "HashTable.h" +/**********************************************/ +FILE *_ih_fp = NULL; +IHashTable *_ih_hashTable = NULL; +int _ih_maxHashTableSize = 0; +char *_ih_refGen = NULL; +char *_ih_refGenName = NULL; +long long _ih_memUsage = 0; +int _ih_refGenOff = 0; +/**********************************************/ + +int hashVal(char *seq) +{ + int i=0; + int val=0, numericVal=0; + + while(i0) + { + tmpSize = cnt; + } + else + { + tmp = fread(&tmpSize, sizeof(tmpSize), 1, _ih_fp); + } + + _ih_hashTable[hv].locs = getMem( sizeof(unsigned int)* (tmpSize+1) ); + _ih_hashTable[hv].locs[0] = tmpSize; + +/* for (j=1; j<=tmpSize; j++) + tmp = fread(&(_ih_hashTable[hv].locs[j]), sizeof(_ih_hashTable[hv].locs[j]), 1, _ih_fp); +*/ + + if(tmpSize != 0) + { + tmp = fread(&(_ih_hashTable[hv].locs[index]), sizeof(_ih_hashTable[hv].locs[index]), 1, _ih_fp); + index++; + } + + for (j=2; j<=tmpSize; j++) + { + tmp = fread(&tmpValue, sizeof(_ih_hashTable[hv].locs[index-1]), 1, _ih_fp); + if(abs(tmpValue-_ih_hashTable[hv].locs[index-1]) > errThreshould) + { + _ih_hashTable[hv].locs[index] = tmpValue; + index++; + } + } + _ih_hashTable[hv].locs[0] = index-1; + } + *loadTime = getTime()-startTime; + return 1; +} +/**********************************************/ +unsigned int *getIHashTableCandidates(int hv) +{ + if ( hv != -1 ) + return _ih_hashTable[hv].locs; + else + return NULL; +} +/**********************************************/ +/**********************************************/ +/**********************************************/ +void configHashTable() +{ + if (WINDOW_SIZE <= 15) + { + generateHashTable = &generateIHashTable; + loadHashTable = &loadIHashTable; + finalizeLoadingHashTable = &finalizeLoadingIHashTable; + getCandidates = &getIHashTableCandidates; + } + else + { + + + } +} +/**********************************************/ +int initLoadingHashTable(char *fileName) +{ + int i; + unsigned char bsIndex; + int tmp; + + _ih_fp = fileOpen(fileName, "r"); + + if (_ih_fp == NULL) + return 0; + + tmp = fread(&bsIndex, sizeof(bsIndex), 1, _ih_fp); + + if (tmp == 0){ + fprintf(stderr, "Read error while inilializing hash table loading.\n"); + exit(0); + } + + if (bsIndex) + { + fprintf(stdout, "Error: Wrong Type of Index indicated"); + return 0; + } + + tmp = fread(&WINDOW_SIZE, sizeof(WINDOW_SIZE), 1, _ih_fp); + + configHashTable(); + + if (_ih_maxHashTableSize != pow(4, WINDOW_SIZE)) + { + + if (_ih_hashTable != NULL) + { + freeIHashTableContent(_ih_hashTable, _ih_maxHashTableSize); + freeMem(_ih_hashTable, sizeof(IHashTable)* _ih_maxHashTableSize); + freeMem(_ih_refGen, strlen(_ih_refGen)+1) ; + freeMem(_ih_refGenName, strlen(_ih_refGenName)+1); + } + + _ih_maxHashTableSize = pow(4, WINDOW_SIZE); + + _ih_hashTable = getMem (sizeof(IHashTable) * _ih_maxHashTableSize); + for (i=0; i<_ih_maxHashTableSize; i++) + _ih_hashTable[i].locs = NULL; + _ih_refGen = getMem(1); + _ih_refGen[0]='\0'; + _ih_refGenName = getMem(1); + _ih_refGenName[0] = '\0'; + } + + return 1; +} +/**********************************************/ +char *getRefGenome() +{ + return _ih_refGen; + +} +/**********************************************/ +char *getRefGenomeName() +{ + return _ih_refGenName; + +} +/**********************************************/ +int getRefGenomeOffset() +{ + return _ih_refGenOff; + +} +/**********************************************/ +HashTable *getHashTable() +{ + return NULL; +} diff -r 000000000000 -r 7b3dc85dc7fd mrfast-2.1.0.4/HashTable.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.4/HashTable.h Tue Feb 21 10:29:47 2012 -0500 @@ -0,0 +1,71 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu + */ + + + +#ifndef __HASH_TABLE__ +#define __HASH_TABLE__ + +typedef struct HashTable +{ + long long hv; + int *locs; +} HashTable; + +typedef struct +{ + unsigned int *locs; +} IHashTable; + +int hashVal(char *seq); +void configHashTable(); +char *getRefGenome(); +char *getRefGenomeName(); +int getRefGenomeOffset(); +int initLoadingHashTable(char *fileName); +HashTable *getHashTable(); + +void (*generateHashTable)(char *fileName, char *indexName); +int (*loadHashTable)(double *loadTime, int errThreshould); +void (*finalizeLoadingHashTable)(); +unsigned int *(*getCandidates)(int hv); + +#endif diff -r 000000000000 -r 7b3dc85dc7fd mrfast-2.1.0.4/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.4/Makefile Tue Feb 21 10:29:47 2012 -0500 @@ -0,0 +1,18 @@ +CC=gcc +CFLAGS = -c -O3 -Wall +LDFLAGS = -lz -lm +SOURCES = baseFAST.c CommandLineParser.c Common.c HashTable.c MrFAST.c Output.c Reads.c RefGenome.c +OBJECTS = $(SOURCES:.c=.o) +EXECUTABLE = mrfast + + +all: $(SOURCES) $(EXECUTABLE) + rm -rf *.o + +$(EXECUTABLE): $(OBJECTS) + $(CC) $(OBJECTS) -o $@ $(LDFLAGS) + +.c.o: + $(CC) $(CFLAGS) $< -o $@ +clean: + rm -f *.o *~ #* diff -r 000000000000 -r 7b3dc85dc7fd mrfast-2.1.0.4/MrFAST.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.4/MrFAST.c Tue Feb 21 10:29:47 2012 -0500 @@ -0,0 +1,8148 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* +Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan +Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu +*/ + +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "Common.h" +#include "Reads.h" +#include "HashTable.h" +#include "Output.h" +#include "MrFAST.h" +#include "RefGenome.h" + + +#define min(a,b) ((a)>(b)?(b):(a)) +#define min3(a,b,c) ((a)>(b)?(b>c?c:b):(a>c?c:a)) +#define CHARCODE(a) (a=='A' ? 0 : (a=='C' ? 1 : (a=='G' ? 2 : (a=='T' ? 3 : 4)))) + +#define MAX_REF_SIZE 18 + + +float calculateScore(int index, char *seq, char *qual, char *md); +unsigned char mrFAST = 1; +char *versionNumberF="0.4"; + +long long verificationCnt = 0; +long long mappingCnt = 0; +long long mappedSeqCnt = 0; +long long completedSeqCnt = 0; +char *mappingOutput; +/**********************************************/ +char *_msf_refGen = NULL; +int _msf_refGenLength = 0; +int _msf_refGenOffset = 0; +char *_msf_refGenName = NULL; + +int _msf_refGenBeg; +int _msf_refGenEnd; + +IHashTable *_msf_hashTable = NULL; + +int *_msf_samplingLocs; +int *_msf_samplingLocsEnds; +int _msf_samplingLocsSize; + +Read *_msf_seqList; +int _msf_seqListSize; + +Pair *_msf_sort_seqList = NULL; +int *_msf_map_sort_seqList; + +ReadIndexTable *_msf_rIndex = NULL; +int _msf_rIndexSize; +int _msf_rIndexMax; + +SAM _msf_output; + +OPT_FIELDS *_msf_optionalFields; + +char *_msf_op; + +int *_msf_verifiedLocs = NULL; + +char _msf_numbers[200][3]; +char _msf_cigar[5]; + +MappingInfo *_msf_mappingInfo; + +int *_msf_seqHits; +int _msf_openFiles = 0; +int _msf_maxLSize=0; +int _msf_maxRSize=0; + +BestFullMappingInfo *bestHitMappingInfo; + +/*************************/ +int _msf_maxFile=0; +char _msf_fileName[4000][200][2][FILE_NAME_LENGTH]; +int _msf_fileCount[4000]; + +char *_msf_readHasConcordantMapping; //boolean if a read has concordant mapping :D + +int *_msf_oeaMapping; +int *_msf_discordantMapping; + +FILE *bestConcordantFILE; +FILE *bestDiscordantFILE; + +int counter = 0; + +int scoreF[200][200]; +int scoreB[200][200]; + +int score[200][200]; +int direction1[200][200]; +int direction2[200][200]; + +__m128i MASK; + +int lookUpTable[15625][15625]; + +/**************************************************Methods***************************************************/ +int smallEditDistanceF(char *a, int lena, char *b, int lenb) +{ + int matrix[20][20]; + int i = 0; + int j = 0; + + for(i = 0; i <= lena; i++) + { + matrix[0][i] = i; + } + + for(i = 0; i <= lenb; i++) + { + matrix[i][0] = i; + } + + + for(i = 1; i <= lenb; i++) + { + for(j = 1; j <= lena; j++) + { + matrix[i][j] = min3(matrix[i-1][j-1]+ (a[j-1] != b[i-1]),matrix[i][j-1]+1 ,matrix[i-1][j]+1); + } + } + return (matrix[lenb][lena]>errThreshold?-1:matrix[lenb][lena]); +} + +int smallEditDistanceB(char *a, int lena, char *b, int lenb) +{ + int matrix[20][20]; + int i = 0; + int j = 0; + + for(i = 0; i <= lena; i++) + { + matrix[0][i] = i; + } + + for(i = 0; i <= lenb; i++) + { + matrix[i][0] = i; + } + + + for(i = 1; i <= lenb; i++) + { + for(j = 1; j <= lena; j++) + { + matrix[i][j] = min3(matrix[i-1][j-1]+ (*(a-j+1) != *(b-i+1)),matrix[i][j-1]+1 ,matrix[i-1][j]+1); + } + } + + return (matrix[lenb][lena]>errThreshold?-1:matrix[lenb][lena]); +} + +char fastEditDistance(int per1, int per2) +{ + + int i = 0; + int j = 0; + + char str1[7]; + char str2[7]; + + int val1 = per1; + int val2 = per2; + + int index = 0; + int mod = 0; + + int matrix[7][7]; + + int min = 20; + + while(index < 6) + { + mod = val1%5; + str1[5-index] = (mod==0 ? 'A':(mod==1?'C':mod==2?'G':(mod==3)?'T':'N')); + val1 = val1 /5; + index++; + } + + str1[6] = '\0'; + + index = 0; + while(index < 6) + { + mod=val2%5; + str2[5-index] = (mod==0 ? 'A':(mod==1?'C':mod==2?'G':(mod==3)?'T':'N')); + val2 = val2 / 5; + index++; + } + str2[6] = '\0'; + + for(i = 0; i < 7; i++) + { + matrix[0][i] = i; + matrix[i][0] = i; + } + + for(i = 1; i < 7; i++) + { + for(j = 1; j < 7; j++) + { + matrix[i][j] = min3(matrix[i-1][j-1]+ (str1[i-1] != str2[j-1]),matrix[i][j-1]+1 ,matrix[i-1][j]+1); + } + } + + for(i = 0; i < 7; i++) + { + if(matrix[i][6] < min) + min = matrix[i][6]; + } + + for(i = 0; i < 7; i++) + { + if(matrix[6][i] < min) + min = matrix[6][i]; + } + return min; +} + +void initLookUpTable() +{ + int i = 0; + + MASK = _mm_insert_epi16(MASK,1,0); + MASK = _mm_insert_epi16(MASK,1,1); + MASK = _mm_insert_epi16(MASK,1,2); + MASK = _mm_insert_epi16(MASK,1,3); + MASK = _mm_insert_epi16(MASK,1,4); + MASK = _mm_insert_epi16(MASK,0,5); + MASK = _mm_insert_epi16(MASK,0,6); + MASK = _mm_insert_epi16(MASK,0,7); + + for(i = 0 ; i < errThreshold + 1; i++) + { + scoreF[0][i] = i; + scoreF[i][0] = i; + } + + for(i = 0 ; i < errThreshold + 1; i++) + { + scoreB[0][i] = i; + scoreB[i][0] = i; + } + + +} + +int backwardEditDistanceSSE2Odd(char *a, int lena, char *b,int lenb) +{ + if(lenb == 0 || lena == 0) + return 0; + + int i = 0; + int j = 0; + int k = 0; + + + int e = errThreshold; + + char flag = 0; + + int minError = 2*e; + + __m128i R0, R1; + __m128i Diag; + __m128i Side1, Side2; + __m128i Down1, Down2; + __m128i Error; + __m128i tmp; + + /* initialize */ + R0 = _mm_setzero_si128 (); + R1 = _mm_setzero_si128 (); + Diag = _mm_setzero_si128 (); + Side1 = _mm_setzero_si128 (); + Side2 = _mm_setzero_si128 (); + Down1 = _mm_setzero_si128 (); + Down2 = _mm_setzero_si128 (); + Error = _mm_setzero_si128 (); + tmp = _mm_setzero_si128 (); + /* end initialize */ + + if(lenb <= e) + { + return smallEditDistanceB(a,lena,b,lenb); + } + + + R1 = _mm_xor_si128(R1, R1); + R0 = _mm_xor_si128(R0, R0); + + Diag = _mm_xor_si128(Diag, Diag); + Side1 = _mm_xor_si128(Side1, Side1); + Down1 = _mm_xor_si128(Down1, Down1); + + Diag = _mm_insert_epi16(Diag,2*e,0); + + Side1 = _mm_insert_epi16(Side1,1,0); + Side1 = _mm_insert_epi16(Side1,2*e,1); + + Down1 = _mm_insert_epi16(Down1,2*e,0); + Down1 = _mm_insert_epi16(Down1,1,1); + Down1 = _mm_insert_epi16(Down1,2*e,2); + + R0 = _mm_insert_epi16(R0,0,0); + + R1 = _mm_insert_epi16(R1,1,0); + R1 = _mm_insert_epi16(R1,1,1); + + for(i=2; i <= e; i++) + { + //set side + Side1 = _mm_slli_si128(Side1,2); + Side1 = _mm_insert_epi16(Side1,1,0); + + Down1 = _mm_insert_epi16(Down1,1,0); + Down1 = _mm_slli_si128(Down1,2); + Down1 = _mm_insert_epi16(Down1,2*e,0); + + Diag = _mm_xor_si128(Diag, Diag); + if( i%2 == 0) + { + Diag = _mm_insert_epi16(Diag,2*e,0); + + for(j=1;j<=i-1;j++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-(i/2-1+(i/2-j))) != *(a-(i/2-1-(i/2-j))),0); + } + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, 2*e,0); + + R0 = _mm_min_epi16(R1+Side1, _mm_slli_si128(R0,2)+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down1); + } + + else + { + Diag = _mm_insert_epi16(Diag,2*e,0); + for(j=i/2-1;j>=-i/2;j--) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-((i+1)/2+j-1)) != *(a-((i-1)/2-j-1)),0); + } + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, 2*e,0); + + R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); + } + } + Error = _mm_xor_si128(Error, Error); + Side2 = _mm_xor_si128(Side2, Side2); + Down2 = _mm_xor_si128(Down2, Down2); + Down1 = _mm_xor_si128(Down1, Down1); + + Error = _mm_insert_epi16(Error,e,0); + Side1 = _mm_insert_epi16(Side2,2*e,0); + Side2 = _mm_insert_epi16(Side2,2*e,0); + Down1 = _mm_insert_epi16(Down1,2*e,0); + + + for(j=0; j < e; j++) + { + Side2 = _mm_slli_si128(Side2, 2); + Side2 = _mm_insert_epi16(Side2,1,0); + + Side1 = _mm_slli_si128(Side1, 2); + Side1 = _mm_insert_epi16(Side1,1,0); + + Down1 = _mm_slli_si128(Down1, 2); + Down1 = _mm_insert_epi16(Down1,1,0); + + Down2 = _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,1,0); + + Error = _mm_slli_si128(Error, 2); + Error = _mm_insert_epi16(Error, e, 0); + } + + Down2= _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,2*e,0); + + for(; i <= 2*lenb-(e-1);i++) + { + flag = 0; + Diag = _mm_xor_si128(Diag, Diag); + if( i%2 == 0) + { + for(j=e/2;j>=-e/2;j--) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-(i/2-1+j)) != *(a-(i/2-1-j)),0); + } + + R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag); + R0 = _mm_min_epi16(R0, R1+Down1); + + + if(_mm_extract_epi16(R0,0) <= e) + flag = 1; + tmp = _mm_srli_si128(R0,2); + for(j=0; j <= e;j++) + { + if(_mm_extract_epi16(tmp,0) <= e) + flag = 1; + tmp = _mm_srli_si128(tmp,2); + } + + if(flag == 0) + return -1; + + if(i == 2*lenb-e) + { + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = _mm_extract_epi16(tmp,0); + } + + } + + else + { + for(j=e/2;j>=-e/2-1;j--) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-((i+1)/2+j-1)) != *(a-((i)/2-j-1)),0); + } + + // printf("@%d %d %d %d\n", _mm_extract_epi16(Diag,0), _mm_extract_epi16(Diag,1), _mm_extract_epi16(Diag,2), + // _mm_extract_epi16(Diag,3)); + + R1 = _mm_min_epi16(R0+Side2, R1+Diag); + + // printf("#~%d %d %d %d\n", _mm_extract_epi16(R1,0), _mm_extract_epi16(R1,1), _mm_extract_epi16(R1,2), + // _mm_extract_epi16(R1,3)); + + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); + + // printf("$%d %d %d %d\n", _mm_extract_epi16(Side2,0), _mm_extract_epi16(Side2,1), _mm_extract_epi16(Side2,2), + // _mm_extract_epi16(Side2,3)); + + // printf("#%d %d %d %d\n", _mm_extract_epi16(R1,0), _mm_extract_epi16(R1,1), _mm_extract_epi16(R1,2), + // _mm_extract_epi16(R1,3)); + + + + if(i >= 2*lenb-e) + { + tmp = _mm_srli_si128(R1,2); + for(k=0; k < e-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + } + } + + //first cell + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-3)) != *(a-lena), 0); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-2)) != *(a-(lena-1)), 1); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-1)) != *(a-(lena-2)), 2); + Diag = _mm_insert_epi16(Diag, 2*e, 3); + R1 = _mm_min_epi16(R0+Side2, R1+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); + + minError = min(minError, _mm_extract_epi16(R1,2)); + + //second cell + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-2)) != *(a-(lena)), 0); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-1)) != *(a-(lena-1)), 1); + Diag = _mm_insert_epi16(Diag, 2*e, 2); + + R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag); + R0 = _mm_min_epi16(R0, R1+Down1); + + minError = min(minError, _mm_extract_epi16(R0,1)); + + //third cell + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-2)) != *(a-(lena+1)), 0); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-1)) != *(a-(lena)), 1); + Diag = _mm_insert_epi16(Diag, 2*e, 2); + + R1 = _mm_min_epi16(R0+Side2, R1+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); + + minError = min(minError, _mm_extract_epi16(R1,1)); + + //forth + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-1)) != *(a-(lena+1)), 0); + Diag = _mm_insert_epi16(Diag, 2*e, 1); + + R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag); + R0 = _mm_min_epi16(R0, R1+Down1); + + minError = min(minError, _mm_extract_epi16(R0,0)); + + //fifth + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-1)) != *(a-(lena+2)), 0); + Diag = _mm_insert_epi16(Diag, 2*e, 1); + + R1 = _mm_min_epi16(R0+Side2, R1+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); + + minError = min(minError, _mm_extract_epi16(R0,0)); + + if(minError > e) + return -1; + return minError; +} + +int backwardEditDistanceSSE2G(char *a, int lena, char *b,int lenb) +{ + if(lenb == 0 || lena == 0) + return 0; + + int i = 0; + int j = 0; + int k = 0; + + + int e = errThreshold; + + char flag = 0; + + int minError = 2*e; + + __m128i R0, R1; + __m128i Diag; + __m128i Side1, Side2; + __m128i Down1, Down2; + __m128i Error; + __m128i tmp; + + /* initialize */ + R0 = _mm_setzero_si128 (); + R1 = _mm_setzero_si128 (); + Diag = _mm_setzero_si128 (); + Side1 = _mm_setzero_si128 (); + Side2 = _mm_setzero_si128 (); + Down1 = _mm_setzero_si128 (); + Down2 = _mm_setzero_si128 (); + Error = _mm_setzero_si128 (); + tmp = _mm_setzero_si128 (); + /* end initialize */ + + if(lenb <= e) + { + return smallEditDistanceB(a,lena,b,lenb); + } + + + R1 = _mm_xor_si128(R1, R1); + R0 = _mm_xor_si128(R0, R0); + + Diag = _mm_xor_si128(Diag, Diag); + Side1 = _mm_xor_si128(Side1, Side1); + Down1 = _mm_xor_si128(Down1, Down1); + + Diag = _mm_insert_epi16(Diag,2*e,0); + + Side1 = _mm_insert_epi16(Side1,1,0); + Side1 = _mm_insert_epi16(Side1,2*e,1); + + Down1 = _mm_insert_epi16(Down1,2*e,0); + Down1 = _mm_insert_epi16(Down1,1,1); + Down1 = _mm_insert_epi16(Down1,2*e,2); + + R0 = _mm_insert_epi16(R0,0,0); + + R1 = _mm_insert_epi16(R1,1,0); + R1 = _mm_insert_epi16(R1,1,1); + + for(i=2; i <= e; i++) + { + //set side + Side1 = _mm_slli_si128(Side1,2); + Side1 = _mm_insert_epi16(Side1,1,0); + + Down1 = _mm_insert_epi16(Down1,1,0); + Down1 = _mm_slli_si128(Down1,2); + Down1 = _mm_insert_epi16(Down1,2*e,0); + + Diag = _mm_xor_si128(Diag, Diag); + if( i%2 == 0) + { + Diag = _mm_insert_epi16(Diag,2*e,0); + + for(j=1;j<=i-1;j++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-(i/2-1+(i/2-j))) != *(a-(i/2-1-(i/2-j))),0); + } + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, 2*e,0); + + R0 = _mm_min_epi16(R1+Side1, _mm_slli_si128(R0,2)+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down1); + } + + else + { + Diag = _mm_insert_epi16(Diag,2*e,0); + for(j=i/2-1;j>=-i/2;j--) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-((i+1)/2+j-1)) != *(a-((i-1)/2-j-1)),0); + } + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, 2*e,0); + + R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); + } + } + Error = _mm_xor_si128(Error, Error); + Side2 = _mm_xor_si128(Side2, Side2); + Down2 = _mm_xor_si128(Down2, Down2); + Down1 = _mm_xor_si128(Down1, Down1); + + Error = _mm_insert_epi16(Error,e,0); + Side2 = _mm_insert_epi16(Side2,2*e,0); + Down1 = _mm_insert_epi16(Down1,2*e,0); + + + for(j=0; j < e; j++) + { + Side2 = _mm_slli_si128(Side2, 2); + Side2 = _mm_insert_epi16(Side2,1,0); + + Down1 = _mm_slli_si128(Down1, 2); + Down1 = _mm_insert_epi16(Down1,1,0); + + Down2 = _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,1,0); + + Error = _mm_slli_si128(Error, 2); + Error = _mm_insert_epi16(Error, e, 0); + } + + Down2= _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,2*e,0); + + for(; i <= 2*lenb-(e-1);i++) + { + flag = 0; + Diag = _mm_xor_si128(Diag, Diag); + if( i%2 == 0) + { + for(j=e/2;j>=-e/2;j--) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-(i/2-1+j)) != *(a-(i/2-1-j)),0); + } + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + if(_mm_extract_epi16(R0,0) <= e) + flag = 1; + tmp = _mm_srli_si128(R0,2); + for(j=0; j <= e;j++) + { + if(_mm_extract_epi16(tmp,0) <= e) + flag = 1; + tmp = _mm_srli_si128(tmp,2); + } + + if(flag == 0) + return -1; + + if(i == 2*lenb-e) + { + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = _mm_extract_epi16(tmp,0); + } + + } + + else + { + for(j=-e/2+1;j<=e/2;j++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-((i+1)/2-j-1)) != *(a-((i-1)/2+j-1)),0); + } + + R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); + R1 = _mm_min_epi16(R1, R0+Down1); + + + if(i >= 2*lenb-e) + { + tmp = _mm_srli_si128(R1,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + } + } + + j=0; + int tmpE = e; + for(;j<2*(e-2)+1;j++) + { + + Diag = _mm_xor_si128(Diag, Diag); + //set the first element + if(j==0) + { + for( k=0;k<=e-1;k++ ) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0); + } + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + + tmpE--; + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + else if(j%2 == 0) + { + for(k=0;k e) + return -1; + return minError; +} + +inline int backwardEditDistanceSSE2Extention(char *a, int lena, char *b,int lenb) +{ + if(lenb == 0 || lena == 0) + return 0; + + int i = 0; + int j = 0; + int k = 0; + + int i0; + int i1; + int i2; + int i4; + int i5; + + int e = 4; + int mismatch = errThreshold; + + int minError = 2*errThreshold; + int index = 0; + int tmpValue = 0; + + if(lenb <= e) + { + return smallEditDistanceB(a,lena,b,lenb); + } + + + __m128i R0, R1; + __m128i Diag; + __m128i Side1, Side2; + __m128i Down1, Down2; + __m128i tmp; + __m128i SeqA, SeqB; + __m128i Result; + + /* initialize */ + R0 = _mm_setzero_si128 (); + R1 = _mm_setzero_si128 (); + Diag = _mm_setzero_si128 (); + Side1 = _mm_setzero_si128 (); + Side2 = _mm_setzero_si128 (); + Down1 = _mm_setzero_si128 (); + Down2 = _mm_setzero_si128 (); + SeqA = _mm_setzero_si128 (); + SeqB = _mm_setzero_si128 (); + Result = _mm_setzero_si128 (); + /* end initialize */ + + R1 = _mm_xor_si128(R1, R1); + R0 = _mm_xor_si128(R0, R0); + + Diag = _mm_xor_si128(Diag, Diag); + Diag = _mm_insert_epi16(Diag,minError,0); + + i0 = (a[0] != b[0]); + i1 = min(i0, ( *(a-1)!=*b) )+1; + i2 = min(i0,( a[0] != *(b-1) ) )+1; + + i0 = min3( i0+ ( *(a-1)!=*(b-1) ),i1+1,i2+1); + i4 = min(i1, ( *(a-2)!=b[0] )+1)+1; + i5 = min(i2, (a[0] != *(b-2))+1)+1; + + R1 = _mm_insert_epi16(R1, 3, 0); + R1 = _mm_insert_epi16(R1, i1, 1); + R1 = _mm_insert_epi16(R1, i2, 2); + R1 = _mm_insert_epi16(R1, 3, 3); + + + R0 = _mm_insert_epi16(R0, 4, 0); + R0 = _mm_insert_epi16(R0, i4, 1); + R0 = _mm_insert_epi16(R0, i0, 2); + R0 = _mm_insert_epi16(R0, i5, 3); + R0 = _mm_insert_epi16(R0, 4, 4); + + + Side2 = _mm_xor_si128(Side2, Side2); + Down2 = _mm_xor_si128(Down2, Down2); + Down1 = _mm_xor_si128(Down1, Down1); + Side1 = _mm_xor_si128(Side1, Side1); + + Side2 = _mm_insert_epi16(Side2,minError,0); + Down1 = _mm_insert_epi16(Down1,minError,0); + + Side1 = _mm_insert_epi16(Side1,1,0); + + index = 0; + for(j=0; j < e; j++) + { + Side2 = _mm_slli_si128(Side2, 2); + Side2 = _mm_insert_epi16(Side2,1,0); + + Down1 = _mm_slli_si128(Down1, 2); + Down1 = _mm_insert_epi16(Down1,1,0); + + Down2 = _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,1,0); + + Side1 = _mm_slli_si128(Side1, 2); + Side1 = _mm_insert_epi16(Side1,1,0); + + SeqA = _mm_slli_si128(SeqA, 2); + SeqB = _mm_slli_si128(SeqB, 2); + SeqA = _mm_insert_epi16(SeqA,*(a-index),0); + SeqB = _mm_insert_epi16(SeqB,*(b-index),0); + index++; + } + + Down2= _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,minError,0); + + index = 4; + i = 5; + + int loopEnd = 2*lenb-(e-1); + for(; i <= loopEnd ;i++) + { + + Diag = _mm_xor_si128(Diag, Diag); + if( i%2 == 0) + { + SeqA = _mm_slli_si128(SeqA, 2); + SeqB = _mm_slli_si128(SeqB, 2); + SeqA = _mm_insert_epi16(SeqA,*(a-(index)),0); + SeqB = _mm_insert_epi16(SeqB,*(b-(index)),0); + + index++; + + tmp = _mm_shufflelo_epi16(SeqB,27); + tmp = _mm_slli_si128(tmp, 2); + tmpValue = _mm_extract_epi16(tmp, 5); + tmp = _mm_insert_epi16(tmp, tmpValue, 0); + + Result = _mm_cmpeq_epi16(SeqA, tmp); + Diag = _mm_andnot_si128(Result, MASK); + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + if(_mm_extract_epi16(R0, 0) > errThreshold && _mm_extract_epi16(R0, 1) > errThreshold && _mm_extract_epi16(R0, 2) > errThreshold + && _mm_extract_epi16(R0, 3) > errThreshold && _mm_extract_epi16(R0, 4) > errThreshold && _mm_extract_epi16(R1, 0) > errThreshold + && _mm_extract_epi16(R1, 1) > errThreshold && _mm_extract_epi16(R1, 2) > errThreshold && _mm_extract_epi16(R1, 3) > errThreshold) + return -1; + + if(i == 2*lenb-e) + { + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = _mm_extract_epi16(tmp,0); + } + + } + + else + { + Result = _mm_cmpeq_epi16(SeqA, _mm_shufflelo_epi16(SeqB,27)); + Diag = _mm_andnot_si128(Result, MASK); + + R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); + R1 = _mm_min_epi16(R1, R0+Down1); + + + if(i >= 2*lenb-e) + { + tmp = _mm_srli_si128(R1,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + } + + + } + + j=0; + int tmpE = e; + for(;j<2*(e-2)+1;j++) + { + + Diag = _mm_xor_si128(Diag, Diag); + //set the first element + if(j==0) + { + for( k=0;k<=e-1;k++ ) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0); + } + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + tmpE--; + + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + else if(j%2 == 0) + { + for(k=0;k mismatch) + return -1; + return minError; +} + +int backwardEditDistance4SSE2(char *a, int lena, char *b,int lenb) +{ + if(lenb == 0 || lena == 0) + return 0; + + int i = 0; + int j = 0; + int k = 0; + + int i0; + int i1; + int i2; + int i4; + int i5; + + int e = errThreshold; + + int minError = 2*e; + int index = 0; + int tmpValue = 0; + + if(lenb <= e) + { + return smallEditDistanceB(a,lena,b,lenb); + } + + __m128i R0, R1; + __m128i Diag; + __m128i Side1, Side2; + __m128i Down1, Down2; + __m128i tmp; + __m128i SeqA, SeqB; + __m128i Result; + + /* initialize */ + R0 = _mm_setzero_si128 (); + R1 = _mm_setzero_si128 (); + Diag = _mm_setzero_si128 (); + Side1 = _mm_setzero_si128 (); + Side2 = _mm_setzero_si128 (); + Down1 = _mm_setzero_si128 (); + Down2 = _mm_setzero_si128 (); + SeqA = _mm_setzero_si128 (); + SeqB = _mm_setzero_si128 (); + Result = _mm_setzero_si128 (); + /* end initialize */ + + R1 = _mm_xor_si128(R1, R1); + R0 = _mm_xor_si128(R0, R0); + + Diag = _mm_xor_si128(Diag, Diag); + Diag = _mm_insert_epi16(Diag,2*e,0); + + i0 = (a[0] != b[0]); + i1 = min(i0, ( *(a-1)!=*b) )+1; + i2 = min(i0,( a[0] != *(b-1) ) )+1; + + i0 = min3( i0+ ( *(a-1)!=*(b-1) ),i1+1,i2+1); + i4 = min(i1, ( *(a-2)!=b[0] )+1)+1; + i5 = min(i2, (a[0] != *(b-2))+1)+1; + + R1 = _mm_insert_epi16(R1, 3, 0); + R1 = _mm_insert_epi16(R1, i1, 1); + R1 = _mm_insert_epi16(R1, i2, 2); + R1 = _mm_insert_epi16(R1, 3, 3); + + + R0 = _mm_insert_epi16(R0, 4, 0); + R0 = _mm_insert_epi16(R0, i4, 1); + R0 = _mm_insert_epi16(R0, i0, 2); + R0 = _mm_insert_epi16(R0, i5, 3); + R0 = _mm_insert_epi16(R0, 4, 4); + + Side2 = _mm_xor_si128(Side2, Side2); + Down2 = _mm_xor_si128(Down2, Down2); + Down1 = _mm_xor_si128(Down1, Down1); + Side1 = _mm_xor_si128(Side1, Side1); + + Side2 = _mm_insert_epi16(Side2,2*e,0); + Down1 = _mm_insert_epi16(Down1,2*e,0); + + Side1 = _mm_insert_epi16(Side1,1,0); + + index = 0; + for(j=0; j < e; j++) + { + Side2 = _mm_slli_si128(Side2, 2); + Side2 = _mm_insert_epi16(Side2,1,0); + + Down1 = _mm_slli_si128(Down1, 2); + Down1 = _mm_insert_epi16(Down1,1,0); + + Down2 = _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,1,0); + + Side1 = _mm_slli_si128(Side1, 2); + Side1 = _mm_insert_epi16(Side1,1,0); + + SeqA = _mm_slli_si128(SeqA, 2); + SeqB = _mm_slli_si128(SeqB, 2); + SeqA = _mm_insert_epi16(SeqA,*(a-index),0); + SeqB = _mm_insert_epi16(SeqB,*(b-index),0); + index++; + } + + Down2= _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,2*e,0); + + index = 4; + i = 5; + int loopEnd = 2*lenb-(e-1); + for(; i <= loopEnd ;i++) + { + + Diag = _mm_xor_si128(Diag, Diag); + if( i%2 == 0) + { + SeqA = _mm_slli_si128(SeqA, 2); + SeqB = _mm_slli_si128(SeqB, 2); + SeqA = _mm_insert_epi16(SeqA,*(a-(index)),0); + SeqB = _mm_insert_epi16(SeqB,*(b-(index)),0); + + index++; + + tmp = _mm_shufflelo_epi16(SeqB,27); + tmp = _mm_slli_si128(tmp, 2); + tmpValue = _mm_extract_epi16(tmp, 5); + tmp = _mm_insert_epi16(tmp, tmpValue, 0); + + Result = _mm_cmpeq_epi16(SeqA, tmp); + Diag = _mm_andnot_si128(Result, MASK); + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + //tmp = _mm_sub_epi16(Error, R0); + //i0 = _mm_movemask_epi8(tmp); + + if( _mm_extract_epi16(R0, 0) > e && _mm_extract_epi16(R0, 1) > e && _mm_extract_epi16(R0, 2) > e + && _mm_extract_epi16(R0, 3) > e && _mm_extract_epi16(R0, 4) > e && _mm_extract_epi16(R1, 0) > e && + _mm_extract_epi16(R1, 1) > e && _mm_extract_epi16(R1, 2) > e && _mm_extract_epi16(R1, 3) > e ) + return -1; + + if(i == 2*lenb-e) + { + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = _mm_extract_epi16(tmp,0); + } + + } + + else + { + Result = _mm_cmpeq_epi16(SeqA, _mm_shufflelo_epi16(SeqB,27)); + Diag = _mm_andnot_si128(Result, MASK); + + R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); + R1 = _mm_min_epi16(R1, R0+Down1); + + if(i >= 2*lenb-e) + { + tmp = _mm_srli_si128(R1,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + } + + + } + + j=0; + + int tmpE = e; + + for(;j<2*(e-2)+1;j++) + { + + Diag = _mm_xor_si128(Diag, Diag); + //set the first element + if(j==0) + { + for( k=0;k<=e-1;k++ ) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0); + } + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + tmpE--; + + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + else if(j%2 == 0) + { + for(k=0;k e) + return -1; + return minError; +} + +inline int forwardEditDistanceSSE2Extention(char *a, int lena, char *b,int lenb) +{ + if(lenb == 0 || lena == 0) + return 0; + + int i = 0; + int j = 0; + int k = 0; + + int i0=0; + int i1=0; + int i2=0; + int i4=0; + int i5=0; + + int mismatch = errThreshold; + int e = 4; + + int minError = 4*mismatch+1; + int index = 0; + int tmpValue = 0; + + if(lenb <= e) + { + return smallEditDistanceF(a,lena,b,lenb); + } + + + register __m128i R0, R1; + __m128i Diag; + __m128i Side1, Side2; + __m128i Down1, Down2; + __m128i tmp; + register __m128i SeqA, SeqB; + __m128i Result; + + __m128i tmpSeqA; + __m128i tmpSeqB; + + /* initialize */ + R0 = _mm_setzero_si128 (); + R1 = _mm_setzero_si128 (); + Diag = _mm_setzero_si128 (); + Side1 = _mm_setzero_si128 (); + Side2 = _mm_setzero_si128 (); + Down1 = _mm_setzero_si128 (); + Down2 = _mm_setzero_si128 (); + SeqA = _mm_setzero_si128 (); + SeqB = _mm_setzero_si128 (); + Result = _mm_setzero_si128 (); + /* end initialize */ + + + R1 = _mm_xor_si128(R1, R1); + R0 = _mm_xor_si128(R0, R0); + + Diag = _mm_xor_si128(Diag, Diag); + Diag = _mm_insert_epi16(Diag,minError,0); + + i0 = (a[0] != b[0]); + i1 = min(i0, (a[1]!=b[0]))+1; + i2 = min(i0,(a[0]!=b[1]))+1; + + i0 = min3(i0+(a[1]!=b[1]),i1+1,i2+1); + i4 = min(i1, (a[2]!=b[0])+1)+1; + i5 = min(i2, (a[0]!=b[2])+1)+1; + + R1 = _mm_insert_epi16(R1, 3, 0); + R1 = _mm_insert_epi16(R1, i1, 1); + R1 = _mm_insert_epi16(R1, i2, 2); + R1 = _mm_insert_epi16(R1, 3, 3); + + R0 = _mm_insert_epi16(R0, 4, 0); + R0 = _mm_insert_epi16(R0, i4, 1); + R0 = _mm_insert_epi16(R0, i0, 2); + R0 = _mm_insert_epi16(R0, i5, 3); + R0 = _mm_insert_epi16(R0, 4, 4); + + Side2 = _mm_xor_si128(Side2, Side2); + Down2 = _mm_xor_si128(Down2, Down2); + Down1 = _mm_xor_si128(Down1, Down1); + Side1 = _mm_xor_si128(Side1, Side1); + + Side2 = _mm_insert_epi16(Side2,minError,0); + Down1 = _mm_insert_epi16(Down1,minError,0); + + Side1 = _mm_insert_epi16(Side1,1,0); + + index = 0; + for(j=0; j < e; j++) + { + Side2 = _mm_slli_si128(Side2, 2); + Side2 = _mm_insert_epi16(Side2,1,0); + + Down1 = _mm_slli_si128(Down1, 2); + Down1 = _mm_insert_epi16(Down1,1,0); + + Down2 = _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,1,0); + + Side1 = _mm_slli_si128(Side1, 2); + Side1 = _mm_insert_epi16(Side1,1,0); + + SeqA = _mm_slli_si128(SeqA, 2); + SeqB = _mm_slli_si128(SeqB, 2); + SeqA = _mm_insert_epi16(SeqA,a[index],0); + SeqB = _mm_insert_epi16(SeqB,b[index],0); + index++; + } + + Down2= _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,minError,0); + + index = 4; + i = 5; + + int loopEnd = 2*lenb-(e-1); + for(; i <= loopEnd ;i++) + { + if( i%2 == 0) + { + tmpSeqA = _mm_slli_si128(SeqA, 2); + tmpSeqB = _mm_slli_si128(SeqB, 2); + SeqA = _mm_insert_epi16(tmpSeqA,a[index],0); + SeqB = _mm_insert_epi16(tmpSeqB,b[index],0); + + index++; + + tmp = _mm_shufflelo_epi16(SeqB,27); + tmp = _mm_slli_si128(tmp, 2); + tmpValue = _mm_extract_epi16(tmp, 5); + tmp = _mm_insert_epi16(tmp, tmpValue, 0); + + Result = _mm_cmpeq_epi16(SeqA, tmp); + Diag = _mm_andnot_si128(Result, MASK); + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + if(_mm_extract_epi16(R0, 0) > errThreshold && _mm_extract_epi16(R0, 1) > errThreshold && _mm_extract_epi16(R0, 2) > errThreshold + && _mm_extract_epi16(R0, 3) > errThreshold && _mm_extract_epi16(R0, 4) > errThreshold && + _mm_extract_epi16(R1, 0) > errThreshold && _mm_extract_epi16(R1, 1) > errThreshold && + _mm_extract_epi16(R1, 2) > errThreshold && _mm_extract_epi16(R1, 3) > errThreshold) + return -1; + + if(i == 2*lenb-e) + { + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = _mm_extract_epi16(tmp,0); + } + + } + + else + { + Result = _mm_cmpeq_epi16(SeqA, _mm_shufflelo_epi16(SeqB,27)); + Diag = _mm_andnot_si128(Result, MASK); + + R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); + R1 = _mm_min_epi16(R1, R0+Down1); + + if(i >= 2*lenb-e) + { + tmp = _mm_srli_si128(R1,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + } + } + + j=0; + int tmpE = e; + for(;j<2*(e-2)+1;j++) + { + + Diag = _mm_xor_si128(Diag, Diag); + //set the first element + if(j==0) + { + for( k=0;k<=e-1;k++ ) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0); + } + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + tmpE--; + + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + else if(j%2 == 0) + { + for(k=0;k mismatch) + return -1; + return minError; +} + + + +int forwardEditDistance4SSE2(char *a, int lena, char *b,int lenb) +{ + if(lenb == 0 || lena == 0) + return 0; + + int i = 0; + int j = 0; + int k = 0; + + int i0=0; + int i1=0; + int i2=0; + int i4=0; + int i5=0; + + int e = errThreshold; + + int minError = 2*e; + int index = 0; + int tmpValue = 0; + + if(lenb <= e) + { + return smallEditDistanceF(a,lena,b,lenb); + } + + + register __m128i R0, R1; + __m128i Diag; + __m128i Side1, Side2; + __m128i Down1, Down2; + __m128i tmp; + register __m128i SeqA, SeqB; + __m128i Result; + + __m128i tmpSeqA; + __m128i tmpSeqB; + + /* initialize */ + R0 = _mm_setzero_si128 (); + R1 = _mm_setzero_si128 (); + Diag = _mm_setzero_si128 (); + Side1 = _mm_setzero_si128 (); + Side2 = _mm_setzero_si128 (); + Down1 = _mm_setzero_si128 (); + Down2 = _mm_setzero_si128 (); + SeqA = _mm_setzero_si128 (); + SeqB = _mm_setzero_si128 (); + Result = _mm_setzero_si128 (); + /* end initialize */ + + R1 = _mm_xor_si128(R1, R1); + R0 = _mm_xor_si128(R0, R0); + + Diag = _mm_xor_si128(Diag, Diag); + Diag = _mm_insert_epi16(Diag,2*e,0); + + i0 = (a[0] != b[0]); + i1 = min(i0, (a[1]!=b[0]))+1; + i2 = min(i0,(a[0]!=b[1]))+1; + + i0 = min3(i0+(a[1]!=b[1]),i1+1,i2+1); + i4 = min(i1, (a[2]!=b[0])+1)+1; + i5 = min(i2, (a[0]!=b[2])+1)+1; + + R1 = _mm_insert_epi16(R1, 3, 0); + R1 = _mm_insert_epi16(R1, i1, 1); + R1 = _mm_insert_epi16(R1, i2, 2); + R1 = _mm_insert_epi16(R1, 3, 3); + + R0 = _mm_insert_epi16(R0, 4, 0); + R0 = _mm_insert_epi16(R0, i4, 1); + R0 = _mm_insert_epi16(R0, i0, 2); + R0 = _mm_insert_epi16(R0, i5, 3); + R0 = _mm_insert_epi16(R0, 4, 4); + + Side2 = _mm_xor_si128(Side2, Side2); + Down2 = _mm_xor_si128(Down2, Down2); + Down1 = _mm_xor_si128(Down1, Down1); + Side1 = _mm_xor_si128(Side1, Side1); + + Side2 = _mm_insert_epi16(Side2,2*e,0); + Down1 = _mm_insert_epi16(Down1,2*e,0); + + Side1 = _mm_insert_epi16(Side1,1,0); + + index = 0; + for(j=0; j < e; j++) + { + Side2 = _mm_slli_si128(Side2, 2); + Side2 = _mm_insert_epi16(Side2,1,0); + + Down1 = _mm_slli_si128(Down1, 2); + Down1 = _mm_insert_epi16(Down1,1,0); + + Down2 = _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,1,0); + + Side1 = _mm_slli_si128(Side1, 2); + Side1 = _mm_insert_epi16(Side1,1,0); + + SeqA = _mm_slli_si128(SeqA, 2); + SeqB = _mm_slli_si128(SeqB, 2); + SeqA = _mm_insert_epi16(SeqA,a[index],0); + SeqB = _mm_insert_epi16(SeqB,b[index],0); + index++; + } + + Down2= _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,2*e,0); + + index = 4; + i = 5; + + int loopEnd = 2*lenb-(e-1); + for(; i <= loopEnd ;i++) + { + //Diag = _mm_xor_si128(Diag, Diag); + if( i%2 == 0) + { + tmpSeqA = _mm_slli_si128(SeqA, 2); + tmpSeqB = _mm_slli_si128(SeqB, 2); + SeqA = _mm_insert_epi16(tmpSeqA,a[index],0); + SeqB = _mm_insert_epi16(tmpSeqB,b[index],0); + + index++; + + tmp = _mm_shufflelo_epi16(SeqB,27); + tmp = _mm_slli_si128(tmp, 2); + tmpValue = _mm_extract_epi16(tmp, 5); + tmp = _mm_insert_epi16(tmp, tmpValue, 0); + + Result = _mm_cmpeq_epi16(SeqA, tmp); + Diag = _mm_andnot_si128(Result, MASK); + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + if(_mm_extract_epi16(R0, 0) > e && _mm_extract_epi16(R0, 1) > e && _mm_extract_epi16(R0, 2) > e + && _mm_extract_epi16(R0, 3) > e && _mm_extract_epi16(R0, 4) > e && _mm_extract_epi16(R1, 0) > e && + _mm_extract_epi16(R1, 1) > e && _mm_extract_epi16(R1, 2) > e && _mm_extract_epi16(R1, 3) > e) + return -1; + + if(i == 2*lenb-e) + { + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = _mm_extract_epi16(tmp,0); + } + + } + + else + { + Result = _mm_cmpeq_epi16(SeqA, _mm_shufflelo_epi16(SeqB,27)); + Diag = _mm_andnot_si128(Result, MASK); + + R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); + R1 = _mm_min_epi16(R1, R0+Down1); + + if(i >= 2*lenb-e) + { + tmp = _mm_srli_si128(R1,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + } + + + } + j=0; + int tmpE = e; + for(;j<2*(e-2)+1;j++) + { + + Diag = _mm_xor_si128(Diag, Diag); + //set the first element + if(j==0) + { + for( k=0;k<=e-1;k++ ) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0); + } + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + tmpE--; + + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + else if(j%2 == 0) + { + for(k=0;k e) + return -1; + return minError; +} + +int forwardEditDistanceSSE2Odd(char *a, int lena, char *b,int lenb) +{ + if(lenb == 0 || lena == 0) + return 0; + + int i = 0; + int j = 0; + int k = 0; + + int e = errThreshold; + + int minError = 2*e; + + char flag = 0; + + if(lenb <= e) + { + return smallEditDistanceF(a,lena,b,lenb); + } + + + __m128i R0, R1; + __m128i Diag; + __m128i Side1, Side2; + __m128i Down1, Down2; + __m128i Error; + __m128i tmp; + + /* initialize */ + R0 = _mm_setzero_si128 (); + R1 = _mm_setzero_si128 (); + Diag = _mm_setzero_si128 (); + Side1 = _mm_setzero_si128 (); + Side2 = _mm_setzero_si128 (); + Down1 = _mm_setzero_si128 (); + Down2 = _mm_setzero_si128 (); + Error = _mm_setzero_si128 (); + tmp = _mm_setzero_si128 (); + /* end initialize */ + + R1 = _mm_xor_si128(R1, R1); + R0 = _mm_xor_si128(R0, R0); + + Diag = _mm_xor_si128(Diag, Diag); + Side1 = _mm_xor_si128(Side1, Side1); + Down1 = _mm_xor_si128(Down1, Down1); + + Diag = _mm_insert_epi16(Diag,2*e,0); + + Side1 = _mm_insert_epi16(Side1,1,0); + Side1 = _mm_insert_epi16(Side1,2*e,1); + + Down1 = _mm_insert_epi16(Down1,2*e,0); + Down1 = _mm_insert_epi16(Down1,1,1); + Down1 = _mm_insert_epi16(Down1,2*e,2); + + R0 = _mm_insert_epi16(R0,0,0); + + R1 = _mm_insert_epi16(R1,1,0); + R1 = _mm_insert_epi16(R1,1,1); + + for(i=2; i <= e; i++) + { + //set side + Side1 = _mm_slli_si128(Side1,2); + Side1 = _mm_insert_epi16(Side1,1,0); + + Down1 = _mm_insert_epi16(Down1,1,0); + Down1 = _mm_slli_si128(Down1,2); + Down1 = _mm_insert_epi16(Down1,2*e,0); + + Diag = _mm_xor_si128(Diag, Diag); + if( i%2 == 0) + { + Diag = _mm_insert_epi16(Diag,2*e,0); + + for(j=1;j<=i-1;j++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[i/2-1+(i/2-j)] != a[i/2-1-(i/2-j)],0); + } + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, 2*e,0); + + R0 = _mm_min_epi16(R1+Side1, _mm_slli_si128(R0,2)+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down1); + + } + + else + { + Diag = _mm_insert_epi16(Diag,2*e,0); + for(j=i/2-1;j>=-i/2;j--) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[(i+1)/2+j-1] != a[(i-1)/2-j-1],0); + } + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, 2*e,0); + + R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); + + } + } + Error = _mm_xor_si128(Error, Error); + Side2 = _mm_xor_si128(Side2, Side2); + Side1 = _mm_xor_si128(Side1, Side1); + Down2 = _mm_xor_si128(Down2, Down2); + Down1 = _mm_xor_si128(Down1, Down1); + + + Error = _mm_insert_epi16(Error,e,0); + Side2 = _mm_insert_epi16(Side2,2*e,0); + Side1 = _mm_insert_epi16(Side2,2*e,0); + Down1 = _mm_insert_epi16(Down1,2*e,0); + + + for(j=0; j < e; j++) + { + Side2 = _mm_slli_si128(Side2, 2); + Side2 = _mm_insert_epi16(Side2,1,0); + + Side1 = _mm_slli_si128(Side1, 2); + Side1 = _mm_insert_epi16(Side1,1,0); + + Down1 = _mm_slli_si128(Down1, 2); + Down1 = _mm_insert_epi16(Down1,1,0); + + Down2 = _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,1,0); + + Error = _mm_slli_si128(Error, 2); + Error = _mm_insert_epi16(Error, e, 0); + } + + Down2= _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,2*e,0); + + for(; i <= 2*lenb-(e-1);i++) + { + flag = 0; + Diag = _mm_xor_si128(Diag, Diag); + if( i%2 == 0) + { + for(j=e/2;j>=-e/2;j--) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[i/2-1+j] != a[i/2-1-j],0); + } + + + R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag); + R0 = _mm_min_epi16(R0, R1+Down1); + + if(_mm_extract_epi16(R0,0) <= e) + flag = 1; + + tmp = _mm_srli_si128(R0,2); + for(j=0; j < e-1;j++) + { + if(_mm_extract_epi16(tmp,0) <= e) + flag = 1; + tmp = _mm_srli_si128(tmp,2); + } + // printf("#%d %d %d\n", _mm_extract_epi16(R0,0), _mm_extract_epi16(R0,1), _mm_extract_epi16(R0,2)); + if(flag == 0) + return -1; + + if(i == 2*lenb-(e-1)) + { + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = _mm_extract_epi16(tmp,0); + } + + } + + else + { + for(j=e/2;j>=-e/2-1;j--) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[(i+1)/2+j-1] != a[(i)/2-j-1],0); + } + + R1 = _mm_min_epi16(R0+Side2, R1+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); + + //printf("#%d %d %d %d\n", _mm_extract_epi16(R1,0), _mm_extract_epi16(R1,1), _mm_extract_epi16(R1,2), + // _mm_extract_epi16(R1,3)); + + if(i >= 2*lenb-e) + { + tmp = _mm_srli_si128(R1,2); + for(k=0; k < e-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + } + } + + //first cell + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, b[lenb-3] != a[lena], 0); + Diag = _mm_insert_epi16(Diag, b[lenb-2] != a[lena-1], 1); + Diag = _mm_insert_epi16(Diag, b[lenb-1] != a[lena-2], 2); + Diag = _mm_insert_epi16(Diag, 2*e, 3); + R1 = _mm_min_epi16(R0+Side2, R1+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); + + + minError = min(minError, _mm_extract_epi16(R1,2)); + + //second cell + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, b[lenb-2] != a[lena], 0); + Diag = _mm_insert_epi16(Diag, b[lenb-1] != a[lena-1], 1); + Diag = _mm_insert_epi16(Diag, 2*e, 2); + + R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag); + R0 = _mm_min_epi16(R0, R1+Down1); + + + minError = min(minError, _mm_extract_epi16(R0,1)); + + //third cell + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, b[lenb-2] != a[lena+1], 0); + Diag = _mm_insert_epi16(Diag, b[lenb-1] != a[lena], 1); + Diag = _mm_insert_epi16(Diag, 2*e, 2); + + R1 = _mm_min_epi16(R0+Side2, R1+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); + + + minError = min(minError, _mm_extract_epi16(R1,1)); + + //forth + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, b[lenb-1] != a[lena+1], 0); + Diag = _mm_insert_epi16(Diag, 2*e, 1); + + R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag); + R0 = _mm_min_epi16(R0, R1+Down1); + + minError = min(minError, _mm_extract_epi16(R0,0)); + + //fifth + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, b[lenb-1] != a[lena+2], 0); + Diag = _mm_insert_epi16(Diag, 2*e, 1); + + R1 = _mm_min_epi16(R0+Side2, R1+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); + + + minError = min(minError, _mm_extract_epi16(R1,0)); + + if(minError > e) + return -1; + return minError; + +} + +int forwardEditDistanceSSE2G(char *a, int lena, char *b,int lenb) +{ + if(lenb == 0 || lena == 0) + return 0; + + int i = 0; + int j = 0; + int k = 0; + + int e = errThreshold; + + int minError = 2*e; + + char flag = 0; + + if(lenb <= e) + { + return smallEditDistanceF(a,lena,b,lenb); + } + + + __m128i R0, R1; + __m128i Diag; + __m128i Side1, Side2; + __m128i Down1, Down2; + __m128i Error; + __m128i tmp; + + /* initialize */ + R0 = _mm_setzero_si128 (); + R1 = _mm_setzero_si128 (); + Diag = _mm_setzero_si128 (); + Side1 = _mm_setzero_si128 (); + Side2 = _mm_setzero_si128 (); + Down1 = _mm_setzero_si128 (); + Down2 = _mm_setzero_si128 (); + Error = _mm_setzero_si128 (); + tmp = _mm_setzero_si128 (); + /* end initialize */ + + R1 = _mm_xor_si128(R1, R1); + R0 = _mm_xor_si128(R0, R0); + + Diag = _mm_xor_si128(Diag, Diag); + Side1 = _mm_xor_si128(Side1, Side1); + Down1 = _mm_xor_si128(Down1, Down1); + + Diag = _mm_insert_epi16(Diag,2*e,0); + + Side1 = _mm_insert_epi16(Side1,1,0); + Side1 = _mm_insert_epi16(Side1,2*e,1); + + Down1 = _mm_insert_epi16(Down1,2*e,0); + Down1 = _mm_insert_epi16(Down1,1,1); + Down1 = _mm_insert_epi16(Down1,2*e,2); + + R0 = _mm_insert_epi16(R0,0,0); + + R1 = _mm_insert_epi16(R1,1,0); + R1 = _mm_insert_epi16(R1,1,1); + + for(i=2; i <= e; i++) + { + //set side + Side1 = _mm_slli_si128(Side1,2); + Side1 = _mm_insert_epi16(Side1,1,0); + + Down1 = _mm_insert_epi16(Down1,1,0); + Down1 = _mm_slli_si128(Down1,2); + Down1 = _mm_insert_epi16(Down1,2*e,0); + + Diag = _mm_xor_si128(Diag, Diag); + if( i%2 == 0) + { + Diag = _mm_insert_epi16(Diag,2*e,0); + + for(j=1;j<=i-1;j++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[i/2-1+(i/2-j)] != a[i/2-1-(i/2-j)],0); + } + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, 2*e,0); + + R0 = _mm_min_epi16(R1+Side1, _mm_slli_si128(R0,2)+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down1); + } + + else + { + Diag = _mm_insert_epi16(Diag,2*e,0); + for(j=i/2-1;j>=-i/2;j--) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[(i+1)/2+j-1] != a[(i-1)/2-j-1],0); + } + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, 2*e,0); + + R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); + } + } + Error = _mm_xor_si128(Error, Error); + Side2 = _mm_xor_si128(Side2, Side2); + Down2 = _mm_xor_si128(Down2, Down2); + Down1 = _mm_xor_si128(Down1, Down1); + + Error = _mm_insert_epi16(Error,e,0); + Side2 = _mm_insert_epi16(Side2,2*e,0); + Down1 = _mm_insert_epi16(Down1,2*e,0); + + + for(j=0; j < e; j++) + { + Side2 = _mm_slli_si128(Side2, 2); + Side2 = _mm_insert_epi16(Side2,1,0); + + Down1 = _mm_slli_si128(Down1, 2); + Down1 = _mm_insert_epi16(Down1,1,0); + + Down2 = _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,1,0); + + Error = _mm_slli_si128(Error, 2); + Error = _mm_insert_epi16(Error, e, 0); + } + + Down2= _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,2*e,0); + + for(; i <= 2*lenb-(e-1);i++) + { + flag = 0; + Diag = _mm_xor_si128(Diag, Diag); + if( i%2 == 0) + { + for(j=e/2;j>=-e/2;j--) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[i/2-1+j] != a[i/2-1-j],0); + } + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + + if(_mm_extract_epi16(R0,0) <= e) + flag = 1; + + tmp = _mm_srli_si128(R0,2); + for(j=0; j < e-1;j++) + { + if(_mm_extract_epi16(tmp,0) <= e) + flag = 1; + tmp = _mm_srli_si128(tmp,2); + } + + + if(flag == 0) + return -1; + + if(i == 2*lenb-e) + { + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = _mm_extract_epi16(tmp,0); + } + + } + + else + { + for(j=-e/2+1;j<=e/2;j++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[(i+1)/2-j-1] != a[(i-1)/2+j-1],0); + } + + R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); + R1 = _mm_min_epi16(R1, R0+Down1); + + if(i >= 2*lenb-e) + { + tmp = _mm_srli_si128(R1,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + } + } + + j=0; + int tmpE = e; + for(;j<2*(e-2)+1;j++) + { + + Diag = _mm_xor_si128(Diag, Diag); + //set the first element + if(j==0) + { + for( k=0;k<=e-1;k++ ) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0); + } + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + tmpE--; + + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + else if(j%2 == 0) + { + for(k=0;k e) + return -1; + return minError; +} + + +int forwardEditDistance2SSE2(char *a, int lena, char *b,int lenb) +{ + if(lenb == 0 || lena == 0) + return 0; + + + + int i0 = 0; + int i1 = 0; + + + int error; //0: if the two character are equal 1: if not + + int i = 0; //loop index + + int e = 2; //error bound + + int totalError = 0; + + __m128i R0; + __m128i R1; + + __m128i Side1, Side2,Side; //side matrix + __m128i Down1, Down2,Down; //down matrix + __m128i Diag; + + __m128i tmp; + __m128i ERROR_REACH; + + /* initialize */ + R0 = _mm_setzero_si128 (); + R1 = _mm_setzero_si128 (); + Diag = _mm_setzero_si128 (); + Side1 = _mm_setzero_si128 (); + Side2 = _mm_setzero_si128 (); + Down1 = _mm_setzero_si128 (); + Down2 = _mm_setzero_si128 (); + Side = _mm_setzero_si128 (); + Down = _mm_setzero_si128 (); + tmp = _mm_setzero_si128 (); + ERROR_REACH = _mm_setzero_si128 (); + /* end initialize */ + + + if(lenb <= e) + { + return smallEditDistanceF(a,lena,b,lenb); + } + + ERROR_REACH = _mm_set_epi16(0,0,0,0,0,e,e,e); + + R0 = _mm_insert_epi16(R0,0,0); + + R1 = _mm_insert_epi16(R1,1,0); + R1 = _mm_insert_epi16(R1,1,1); + +// error = ((a[0]) != (b[0])); + + Diag = _mm_set_epi16(0,0,0,0,0,2*e,((a[0]) != (b[0])),2*e); + Side1 = _mm_set_epi16(0,0,0,0,0,2*e,1,1); + Side2 = _mm_set_epi16(0,0,0,0,0,1,1,2*e); + Down1 = _mm_set_epi16(0,0,0,0,0,2*e,1,1); + Down2 = _mm_set_epi16(0,0,0,0,0,1,1,2*e); + + tmp = _mm_slli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,tmp+Down2); + + for (i = 3; i < 2*lena; i++) + { + if(i % 2 ==1) + { + + Diag = _mm_xor_si128(Diag, Diag); + error = ((a[(i+1)/2-1]) != (b[(i-1)/2-1])); + Diag = _mm_insert_epi16(Diag,error,0); + error = ((a[(i-1)/2-1]) != (b[(i+1)/2-1])); + Diag = _mm_insert_epi16(Diag,error,1); +// Diag = _mm_set_epi16(0, 0, 0, 0, 0, 0, ((a[(i-1)/2-1]) != (b[(i+1)/2-1])) ,((a[(i+1)/2-1]) != (b[(i-1)/2-1]))); + + + tmp = _mm_srli_si128(R0,2); + + R1 = _mm_min_epi16(tmp+Side1, R1+Diag); + R1 = _mm_min_epi16(R1,R0+Down1); + + if(i > 2 * lenb - 2) + { + i1 = _mm_extract_epi16(R1, 1); + totalError = min(totalError, i1); + } + } + + else if(i % 2 == 0) + { + error = ((a[i/2]) != (b[i/2-2])); + Diag = _mm_insert_epi16(Diag,error,0); + error = ((a[i/2-1]) != (b[i/2-1])); + Diag = _mm_insert_epi16(Diag,error,1); + error = ((a[i/2-2]) != (b[i/2])); + Diag = _mm_insert_epi16(Diag,error,2); + + // Diag = _mm_set_epi16(0, 0, 0, 0, 0, ((a[i/2-2]) != (b[i/2])) , ((a[i/2-1]) != (b[i/2-1])) , ((a[i/2]) != (b[i/2-2])) ); + + tmp = _mm_slli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,tmp+Down2); + + tmp = _mm_sub_epi16(ERROR_REACH, R0); + i0 = _mm_movemask_epi8(tmp); + + if(i0 == 63 && _mm_extract_epi16(R1,0) > errThreshold && _mm_extract_epi16(R1,1) > errThreshold && i < 2 * lenb - 2) + return -1; + if(i == 2 * lenb - 2) { + totalError = _mm_extract_epi16(R0, 2); + } + } + } + + Down1 = _mm_insert_epi16(Down1,2*e,0); + + //fill the first part of the error + error = ((a[i/2]) != (b[i/2-2])); + Diag = _mm_insert_epi16(Diag,error,0); + error = ((a[i/2-1]) != (b[i/2-1])); + Diag = _mm_insert_epi16(Diag,error,1); + Diag = _mm_insert_epi16(Diag,2*e,2); +// Diag = _mm_set_epi16(0, 0, 0, 0, 0, 2*e , ((a[i/2-1]) != (b[i/2-1])) , ((a[i/2]) != (b[i/2-2])) ); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,_mm_slli_si128(R1,2)+Down1); + +// i0 = _mm_extract_epi16(R0, 0); + i1 = _mm_extract_epi16(R0, 1); + + totalError = min(totalError, i1); + + //fill the second part of the error + i++; + + Diag = _mm_xor_si128(Diag, Diag); + Diag = _mm_insert_epi16(Diag,2*e,0); + error = ((a[i/2]) != (b[lenb-1])); + Diag = _mm_insert_epi16(Diag,error,1); + Diag = _mm_insert_epi16(Diag,2*e,2); +// Diag = _mm_set_epi16(0, 0, 0, 0, 0, 2*e , ((a[i/2]) != (b[lenb-1])) , 2*e ); + + + R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); + R1 = _mm_min_epi16(R1,_mm_slli_si128(R0,2)+Down1); + +// i0 = _mm_extract_epi16(R1, 0); + i1 = _mm_extract_epi16(R1, 1); + + totalError = min(totalError, i1); + //fill the last the last element of the matrix + i++; + + Diag = _mm_xor_si128(Diag, Diag); + error = ((a[i/2]) != (b[lenb-1])); + Diag = _mm_insert_epi16(Diag,error,0); + +// Diag = _mm_set_epi16(0, 0, 0, 0, 0, 0 , 0 , ((a[i/2]) != (b[lenb-1])) ); + + + Down = _mm_insert_epi16(Down,1,0); + + Side = _mm_insert_epi16(Side,1,0); + + tmp = _mm_srli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Down, _mm_srli_si128(R0,2)+Diag); + R0 = _mm_min_epi16(R0,tmp+Side); + + i0 = _mm_extract_epi16(R0, 0); + + totalError = min(totalError, i0); + + if(totalError > e) + return -1; + + return totalError; + +} + +int backwardEditDistance2SSE2(char *a, int lena, char *b,int lenb) +{ + if(lenb == 0 || lena == 0) + return 0; + + int i0 = 0; + int i1 = 0; + + int error; //0: if the two character are equal 1: if not + + int i = 0; //loop index + + int e = 2; //error bound + + int totalError = 0; + + __m128i R0; + __m128i R1; + + __m128i Side1, Side2,Side; //side matrix + __m128i Down1, Down2,Down; //down matrix + __m128i Diag; //diag matrix + + __m128i tmp; + __m128i ERROR_REACH; + + /* initialize */ + R0 = _mm_setzero_si128 (); + R1 = _mm_setzero_si128 (); + Diag = _mm_setzero_si128 (); + Side1 = _mm_setzero_si128 (); + Side2 = _mm_setzero_si128 (); + Side = _mm_setzero_si128 (); + Down1 = _mm_setzero_si128 (); + Down2 = _mm_setzero_si128 (); + Down = _mm_setzero_si128 (); + ERROR_REACH = _mm_setzero_si128 (); + tmp = _mm_setzero_si128 (); + /* end initialize */ + + if(lenb <= e) + { + return smallEditDistanceB(a,lena,b,lenb); + } + + + ERROR_REACH = _mm_set_epi16(0,0,0,0,0,e,e,e); + + R0 = _mm_insert_epi16(R0,0,0); + + R1 = _mm_insert_epi16(R1,1,0); + R1 = _mm_insert_epi16(R1,1,1); + + error = ((a[0]) != (b[0])); + + Diag = _mm_insert_epi16(Diag,2*e,0); + Diag = _mm_insert_epi16(Diag,error,1); + Diag = _mm_insert_epi16(Diag,2*e,2); + + Side1 = _mm_insert_epi16(Side1,1,0); + Side1 = _mm_insert_epi16(Side1,1,1); + Side1 = _mm_insert_epi16(Side1,2*e,2); + + Side2 = _mm_insert_epi16(Side2,2*e,0); + Side2 = _mm_insert_epi16(Side2,1,1); + Side2 = _mm_insert_epi16(Side2,1,2); + + Down1 = _mm_insert_epi16(Down1,1,0); + Down1 = _mm_insert_epi16(Down1,1,1); + Down1 = _mm_insert_epi16(Down1,2*e,2); + + Down2 = _mm_insert_epi16(Down2,2*e,0); + Down2 = _mm_insert_epi16(Down2,1,1); + Down2 = _mm_insert_epi16(Down2,1,2); + + tmp = _mm_slli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,tmp+Down2); + +// printf("%d %d %d\n", _mm_extract_epi16(R0,0), _mm_extract_epi16(R0,1), _mm_extract_epi16(R0,2)); + for (i = 3; i < 2*lena; i++) + { + if(i % 2 ==1) + { + Diag = _mm_sub_epi8(Diag, Diag); + error = ( *(a-((i+1)/2-1)) != *(b-((i-1)/2-1)) ); + Diag = _mm_insert_epi16(Diag,error,0); + error = ( *(a-((i-1)/2-1)) != *(b-((i+1)/2-1)) ); + Diag = _mm_insert_epi16(Diag,error,1); + //printf("#%d #%d\n", _mm_extract_epi16(Diag,0), _mm_extract_epi16(Diag,1)); + tmp = _mm_srli_si128(R0,2); + + R1 = _mm_min_epi16(tmp+Side1, R1+Diag); + R1 = _mm_min_epi16(R1,R0+Down1); + + if(i > 2 * lenb - 2) { + i1 = _mm_extract_epi16(R1, 1); + totalError = min(totalError, i1); + } +// printf("%d %d\n", _mm_extract_epi16(R1,0), _mm_extract_epi16(R1,1)); + } + + else if(i % 2 == 0) + { + error = ( *(a-(i/2)) != *(b-(i/2-2)) ); + Diag = _mm_insert_epi16(Diag,error,0); + error = ( *(a-(i/2-1)) != *(b-(i/2-1)) ); + Diag = _mm_insert_epi16(Diag,error,1); + error = ( *(a-(i/2-2)) != *(b-(i/2))); + Diag = _mm_insert_epi16(Diag,error,2); + + tmp = _mm_slli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,tmp+Down2); + + tmp = _mm_sub_epi16(ERROR_REACH, R0); + i0 = _mm_movemask_epi8(tmp); + + if(i0 == 63 && _mm_extract_epi16(R1,0) > errThreshold && _mm_extract_epi16(R1,1) > errThreshold && i < 2 * lenb - 2) + return -1; + + if(i == 2 * lenb - 2) { + totalError = _mm_extract_epi16(R0, 2); + } + } + } + Down1 = _mm_insert_epi16(Down1,2*e,0); + + //fill the first part of the error + error = ( *(a-(i/2)) != *(b-(i/2-2)) ); + Diag = _mm_insert_epi16(Diag,error,0); + error = ( *(a-(i/2-1)) != *(b-(i/2-1)) ); + Diag = _mm_insert_epi16(Diag,error,1); + Diag = _mm_insert_epi16(Diag,2*e,2); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,_mm_slli_si128(R1,2)+Down1); + + i0 = _mm_extract_epi16(R0, 0); + i1 = _mm_extract_epi16(R0, 1); + + totalError = min(totalError, i1); + + //fill the second part of the error + i++; + Diag = _mm_sub_epi8(Diag, Diag); + Diag = _mm_insert_epi16(Diag,2*e,0); + error = ( *(a-(i/2)) != *(b-(lenb-1)) ); + Diag = _mm_insert_epi16(Diag,error,1); + Diag = _mm_insert_epi16(Diag,2*e,2); + + R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); + R1 = _mm_min_epi16(R1,_mm_slli_si128(R0,2)+Down1); + + i0 = _mm_extract_epi16(R1, 0); + i1 = _mm_extract_epi16(R1, 1); + + totalError = min(totalError, i1); + + //fill the last the last element of the matrix + i++; + Diag = _mm_sub_epi8(Diag, Diag); + error = ( *(a-(i/2)) != *(b-(lenb-1)) ); + Diag = _mm_insert_epi16(Diag,error,0); + + Down = _mm_insert_epi16(Down,1,0); + + Side = _mm_insert_epi16(Side,1,0); + + tmp = _mm_srli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Down, _mm_srli_si128(R0,2)+Diag); + R0 = _mm_min_epi16(R0,tmp+Side); + + i0 = _mm_extract_epi16(R0, 0); + + totalError = min(totalError, i0); + + if(totalError > e || totalError == 0) + return -1; + return totalError; +} + +void initBestMapping(int totalReadNumber) +{ + int i = 0; + bestHitMappingInfo = getMem(totalReadNumber * sizeof(BestFullMappingInfo)); + for(i = 0; i < totalReadNumber; i++) { + bestHitMappingInfo[i].loc = -1; + } +} + + +void finalizeBestSingleMapping() +{ + int i = 0; + char *_tmpQual, *_tmpSeq; + char rqual[SEQ_LENGTH + 1]; + rqual[SEQ_LENGTH]='\0'; + + for(i = 0; i < _msf_seqListSize; i++) + { + if(_msf_seqList[i].hits[0] != 0) + { + if (bestHitMappingInfo[i].dir) + { + reverse(_msf_seqList[i].qual, rqual, SEQ_LENGTH); + _tmpQual = rqual; + _tmpSeq = _msf_seqList[i].rseq; + } + else + { + _tmpQual = _msf_seqList[i].qual; + _tmpSeq = _msf_seqList[i].seq; + } + + + _msf_output.QNAME = _msf_seqList[i].name; + _msf_output.FLAG = 16 * bestHitMappingInfo[i].dir; + _msf_output.RNAME = bestHitMappingInfo[i].chr; + + _msf_output.POS = bestHitMappingInfo[i].loc; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = bestHitMappingInfo[i].cigar ; + _msf_output.MRNAME = "*"; + _msf_output.MPOS = 0; + _msf_output.ISIZE = 0; + + + _msf_output.SEQ = _tmpSeq; + _msf_output.QUAL = _tmpQual; + + _msf_output.optSize = 2; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = bestHitMappingInfo[i].err; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = bestHitMappingInfo[i].md; + + output(_msf_output); + } + } + freeMem(bestHitMappingInfo, _msf_seqListSize * sizeof(FullMappingInfo)); +} +/**********************************************/ +int compare (const void *a, const void *b) +{ + return ((Pair *)a)->hv - ((Pair *)b)->hv; + /*char *s1 = ((Pair *)a)->hv; + char *s2 = ((Pair *)b)->hv; + int i = 0; + + int diff = 0; + int sign = 0; + + for(i = 0; i < SEQ_LENGTH; i++) + { + diff += (s1[i] != s2[i]); + if(s1[i] > s2[i]) + sign++; + else if(s1[i] < s2[i]) + sign--; + } + + return diff*sign;*/ +// return strncmp(s1, s2,SEQ_LENGTH); + +} +/**********************************************/ +void preProcessReads() +{ + int i = 0; + + _msf_sort_seqList = getMem(_msf_seqListSize * sizeof(Pair)); + for(i = 0; i < _msf_seqListSize; i++) + { + _msf_sort_seqList[i].hv = hashVal(_msf_seqList[i].seq); + + _msf_sort_seqList[i].readNumber = i; + } + + qsort(_msf_sort_seqList, _msf_seqListSize, sizeof(Pair), compare); + + /* + for(i = 0; i < _msf_seqListSize; i++) + { + //printf("%s\n", _msf_sort_seqList[i].hv); + } + */ + + _msf_map_sort_seqList = getMem(_msf_seqListSize * sizeof(int)); + + for(i = 0; i < _msf_seqListSize; i++) + _msf_map_sort_seqList[_msf_seqList[i].readNumber] = i; + +} +/**********************************************/ + +int verifySingleEnd(int index, char* seq, int offset) +{ + int curOff = 0; + int i; + + char *ref; + + int err; + int errCnt =0; + int errCntOff = 0; + int NCntOff = 0; + + ref = _msf_refGen + index - 1; + + verificationCnt++; + + for (i = 0; i < SEQ_LENGTH; i++) + { + err = *ref != *seq; + errCnt += err; + if (errCnt > errThreshold) + { + + return -1; + } + + if (i >= _msf_samplingLocs[curOff] && i <= _msf_samplingLocsEnds[curOff]) + { + errCntOff += err; + NCntOff += (*seq == 'N'); + } + else if (curOff < _msf_samplingLocsSize && i>=_msf_samplingLocs[curOff+1]) + { + + if (errCntOff == 0 && NCntOff == 0 && offset > curOff) + { + return -1; + } + + errCntOff = 0; + NCntOff = 0; + curOff++; + + if ( i >= _msf_samplingLocs[curOff]) + { + errCntOff += err; + NCntOff += (*seq == 'N'); + } + } + + ref++; + seq++; + } + return errCnt; +} + +/*********************************************/ +void initFAST(Read *seqList, int seqListSize, int *samplingLocs, int samplingLocsSize, char *genFileName) +{ + int i; + + if (_msf_optionalFields == NULL) + { + _msf_op = getMem(SEQ_LENGTH); + if (pairedEndMode) + { + _msf_optionalFields = getMem(8*sizeof(OPT_FIELDS)); + } + else + { + _msf_optionalFields = getMem(2*sizeof(OPT_FIELDS)); + } + + for (i=0; i<200;i++) + { + sprintf(_msf_numbers[i],"%d%c",i, '\0'); + } + sprintf(_msf_cigar, "%dM", SEQ_LENGTH); + } + + if (_msf_samplingLocsEnds == NULL) + { + _msf_samplingLocs = samplingLocs; + _msf_samplingLocsSize = samplingLocsSize; + + _msf_samplingLocsEnds = getMem(sizeof(int)*_msf_samplingLocsSize); + for (i=0; i<_msf_samplingLocsSize; i++) + { + _msf_samplingLocsEnds[i]=_msf_samplingLocs[i]+WINDOW_SIZE-1; + } + + _msf_seqList = seqList; + _msf_seqListSize = seqListSize; + + preProcessReads(); + + _msf_oeaMapping = getMem(_msf_seqListSize * sizeof(int)); + for(i = 0; i < _msf_seqListSize; i++) + { + _msf_oeaMapping[i] = 0; + } + + _msf_discordantMapping = getMem(_msf_seqListSize * sizeof(int)); + for(i = 0; i < _msf_seqListSize; i++) + { + _msf_discordantMapping[i] = 0; + } + + } + + if (_msf_refGenName == NULL) + { + _msf_refGenName = getMem(4*SEQ_LENGTH); + } + _msf_refGen = getRefGenome(); + _msf_refGenLength = strlen(_msf_refGen); + + _msf_refGenOffset = getRefGenomeOffset(); + snprintf(_msf_refGenName, 4*SEQ_LENGTH,"%s%c", getRefGenomeName(), '\0'); + _msf_refGenName[strlen(getRefGenomeName())] = '\0'; + + + if (_msf_verifiedLocs != NULL){ + freeMem(_msf_verifiedLocs, sizeof(int) * (_msf_refGenLength+1)); + } + + _msf_verifiedLocs = (int *) getMem(sizeof(int)*(_msf_refGenLength+1)); + + for (i=0; i<=_msf_refGenLength; i++) + _msf_verifiedLocs[i] = _msf_seqListSize*10+1; + + + + if (pairedEndMode && _msf_seqHits == NULL) + { + + _msf_mappingInfo = getMem(seqListSize * sizeof (MappingInfo)); + + for (i=0; i 0 ? ((rIndex > seqLength) ? seqLength - errThreshold :rIndex - errThreshold) : 1 ); + tempDown = ((rIndex >= seqLength-errThreshold ) ? seqLength+1 :rIndex + errThreshold + 1); + for(i = tempUp ; i < tempDown ; i++) + { + errorString = (*(ref+rIndex-1) == *(seq+i-1)); + + upValue = score[i-1][rIndex]+1; + diagValue = score[i-1][rIndex-1]+ !errorString; + sideValue = score[i][rIndex-1]+1; + + if(i != tempUp && i != tempDown-1) + score[i][rIndex] = min3(sideValue, diagValue , upValue); + + else if( (i == ((rIndex - errThreshold) > 0 ? rIndex - errThreshold : 1)) && rIndex <= seqLength ) + score[i][rIndex] = min(sideValue, diagValue); + else if(rIndex > seqLength && (i == seqLength - errThreshold) ) + score[i][rIndex] = sideValue; + else + score[i][rIndex] = min(diagValue , upValue); + + if(i == tempUp) + error = score[i][rIndex]; + else if(error > score[i][rIndex]) + error = score[i][rIndex]; + } + rIndex++; + } + + min = score[seqLength][seqLength+errThreshold]; + minIndex = seqLength + errThreshold; + + // Find the Best error for all the possible ways. + for(i = 1; i <= 2*errThreshold; i++) + { + if(min >= score[seqLength][seqLength+errThreshold-i] && seqLength+errThreshold-i > 0) + { + min = score[seqLength][seqLength+errThreshold-i]; + minIndex = seqLength+errThreshold-i; + } + } + + error = score[seqLength][minIndex]; + + directionIndex = seqLength; + rIndex = minIndex; + while(directionIndex != 0 || rIndex != 0) + { + + if(rIndex == 0) + { + if(score[directionIndex][rIndex] - score[directionIndex-1][rIndex] == 1) + { + matrix[size] = *(seq+directionIndex-1); + size++; + matrix[size] = 'I'; + directionIndex--; + } + } + else if(directionIndex == 0) + { + if(score[directionIndex][rIndex] - score[directionIndex][rIndex-1] == 1) + { + matrix[size] = *(ref+rIndex-1); + size++; + matrix[size] = 'D'; + rIndex--; + } + } + else if(directionIndex-rIndex == errThreshold) + { + if(score[directionIndex][rIndex] - score[directionIndex-1][rIndex] == 1) + { + matrix[size] = *(seq+directionIndex-1); + size++; + matrix[size] = 'I'; + directionIndex--; + } + else if( score[directionIndex][rIndex] - score[directionIndex-1][rIndex-1] == 1 ) + { + matrix[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrix[size] = 'M'; + rIndex--; + directionIndex--; + } + + } + else if(rIndex - directionIndex == errThreshold) + { + if(score[directionIndex][rIndex] - score[directionIndex][rIndex-1] == 1) + { + matrix[size] = *(ref+rIndex-1); + size++; + matrix[size] = 'D'; + rIndex--; + } + else if( score[directionIndex][rIndex] - score[directionIndex-1][rIndex-1] == 1 ) + { + matrix[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrix[size] = 'M'; + rIndex--; + directionIndex--; + } + } + else + { + if(score[directionIndex][rIndex] - score[directionIndex-1][rIndex] == 1 && directionIndex != 0) + { + matrix[size] = *(seq+directionIndex-1); + size++; + matrix[size] = 'I'; + directionIndex--; + } + else if(score[directionIndex][rIndex] - score[directionIndex][rIndex-1] == 1 && rIndex != 0) + { + matrix[size] = *(ref+rIndex-1); + size++; + matrix[size] = 'D'; + rIndex--; + } + else if( score[directionIndex][rIndex] - score[directionIndex-1][rIndex-1] == 1 ) + { + matrix[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrix[size] = 'M'; + rIndex--; + directionIndex--; + } + } + size++; + } + + matrix[size] = '\0'; + + char returnString[200]; + + returnString[0] = '\0'; + reverse(matrix, returnString, size); + sprintf(matrix, "%s", returnString); + + return error; +} + +/* + Will apply the Levenshtein Dynamic programming. + in both right and left direction as long as the + threshould error is reached or end of string length + +*/ +int msfHashVal(char *seq) +{ + int i=0; + int val=0, numericVal=0; + + while(i<6) + { + switch (seq[i]) + { + case 'A': + numericVal = 0; + break; + case 'C': + numericVal = 1; + break; + case 'G' : + numericVal = 2; + break; + case 'T': + numericVal = 3; + break; + default: + return -1; + break; + } + val = (val << 2)|numericVal; + i++; + } + return val; +} + + + +int verifySingleEndEditDistance2(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, char *matrix, int *map_location, short *seqHashValue) +{ + int i = 0; + + char * ref; + char * tempref; + + int rIndex = 0; //reference Index + + int e = errThreshold; + int error = 0; + int error1 = 0; + int totalError = 0; + + + /* + 1: Up + 2: Side + 3: Diagnoal Match + 4: Diagnoal Mismatch + */ + + + int minIndex1 = 0; + int minIndex2 = 0; + + + int directionIndex = 0; + + int size = 0; + + int startIndex1 = 0; + + rIndex = 1; + + + char matrixR[200]; + char matrixL[200]; + + ref = _msf_refGen + refIndex - 1; + tempref = _msf_refGen + refIndex - 1; + + int jumpIndex = 0; + + if(rSeqLength != 0) + { + error1 = forwardEditDistance2SSE2(ref+segLength+jumpIndex, rSeqLength-jumpIndex, rSeq+jumpIndex, rSeqLength-jumpIndex); + if(error1 == -1) + return -1; + } + + + if(lSeqLength != 0) + { + error = backwardEditDistance2SSE2(ref-1, lSeqLength, lSeq+lSeqLength-1, lSeqLength); + if(error == -1) + { + return -1; + } + } + + matrixL[0] = '\0'; + matrixR[0] = '\0'; + + + ref = _msf_refGen + refIndex - 1; + + if(error1+error > errThreshold) + return -1; + + ref = _msf_refGen + refIndex - 1; + + rIndex = startIndex1+1; + + int i0 = 0; + int i1 = 0; + int i2 = 0; + + __m128i R0; + __m128i R1; + + __m128i Side1, Side2,Side; //side matrix + __m128i Down1, Down2,Down; //down matrix + __m128i Diag; // + + __m128i tmp; + + /* initialize */ + R0 = _mm_setzero_si128 (); + R1 = _mm_setzero_si128 (); + Diag = _mm_setzero_si128 (); + Side1 = _mm_setzero_si128 (); + Side2 = _mm_setzero_si128 (); + Down1 = _mm_setzero_si128 (); + Down2 = _mm_setzero_si128 (); + Down = _mm_setzero_si128 (); + Side = _mm_setzero_si128 (); + tmp = _mm_setzero_si128 (); + /* end initialize */ + + int mismatch[3] = {0,0,0}; + + if(lSeqLength != 0) + { + char *a; + char *b; + + a = ref-1; + b = lSeq+lSeqLength-1; + + R0 = _mm_insert_epi16(R0,0,0); + + score[0][0] = 0; + + R1 = _mm_insert_epi16(R1,1,0); + R1 = _mm_insert_epi16(R1,1,1); + + score[1][0] = 1; + direction1[1][0] = 1; + score[0][1] = 1; + direction1[0][1] = 2; + + mismatch[0] = ((a[0]) != (b[0])); + + Diag = _mm_insert_epi16(Diag,2*e,0); + Diag = _mm_insert_epi16(Diag,mismatch[0],1); + Diag = _mm_insert_epi16(Diag,2*e,2); + + Side1 = _mm_insert_epi16(Side1,1,0); + Side1 = _mm_insert_epi16(Side1,1,1); + Side1 = _mm_insert_epi16(Side1,2*e,2); + + Side2 = _mm_insert_epi16(Side2,2*e,0); + Side2 = _mm_insert_epi16(Side2,1,1); + Side2 = _mm_insert_epi16(Side2,1,2); + + Down1 = _mm_insert_epi16(Down1,1,0); + Down1 = _mm_insert_epi16(Down1,1,1); + Down1 = _mm_insert_epi16(Down1,2*e,2); + + Down2 = _mm_insert_epi16(Down2,2*e,0); + Down2 = _mm_insert_epi16(Down2,1,1); + Down2 = _mm_insert_epi16(Down2,1,2); + + tmp = _mm_slli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,tmp+Down2); + + i0 = _mm_extract_epi16(R0, 0); + i1 = _mm_extract_epi16(R0, 1); + i2 = _mm_extract_epi16(R0, 2); + + score[0][2] = i0; + score[1][1] = i1; + score[2][0] = i2; + + direction1[0][2] = 2; + direction1[1][1] = ((mismatch[0] == 0)? 3 : 4); + direction1[2][0] = 1; + + for (i = 3; i < 2*lSeqLength; i++) + { + if(i % 2 ==1) + { + Diag = _mm_sub_epi8(Diag, Diag); + mismatch[0] = ( *(a-((i+1)/2-1)) != *(b-((i-1)/2-1)) ); + Diag = _mm_insert_epi16(Diag,mismatch[0],0); + mismatch[1] = ( *(a-((i-1)/2-1)) != *(b-((i+1)/2-1)) ); + Diag = _mm_insert_epi16(Diag,mismatch[1],1); + + tmp = _mm_srli_si128(R0,2); + + R1 = _mm_min_epi16(tmp+Side1, R1+Diag); + R1 = _mm_min_epi16(R1,R0+Down1); + + i0 = _mm_extract_epi16(R1, 0); + i1 = _mm_extract_epi16(R1, 1); + + score[i/2][i/2+1] = i0; + score[i/2+1][i/2] = i1; + + direction1[i/2][i/2+1] = (score[i/2][i/2+1]==score[i/2-1][i/2] && mismatch[0] == 0) ? 3 : + (score[i/2][i/2+1]-score[i/2-1][i/2+1]==1)? 1 : + (score[i/2][i/2+1]-score[i/2][i/2]==1) ? 2 : 4; + + direction1[i/2+1][i/2] = (score[i/2+1][i/2]==score[i/2][i/2-1] && mismatch[1] == 0) ? 3 : + (score[i/2+1][i/2]-score[i/2][i/2]==1) ? 1 : + (score[i/2+1][i/2]-score[i/2+1][i/2-1]==1)? 2 : 4; + + if(i > 2 * lSeqLength - 2) + { + error = min(error, i1); + if(error == i1) + minIndex1 = i-lSeqLength; + } + } + + else if(i % 2 == 0) + { + mismatch[0] = ( *(a-(i/2)) != *(b-(i/2-2)) ); + Diag = _mm_insert_epi16(Diag,mismatch[0],0); + mismatch[1] = ( *(a-(i/2-1)) != *(b-(i/2-1)) ); + Diag = _mm_insert_epi16(Diag,mismatch[1],1); + mismatch[2] = ( *(a-(i/2-2)) != *(b-(i/2)) ); + Diag = _mm_insert_epi16(Diag,mismatch[2],2); + + tmp = _mm_slli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,tmp+Down2); + + i0 = _mm_extract_epi16(R0, 0); + i1 = _mm_extract_epi16(R0, 1); + i2 = _mm_extract_epi16(R0, 2); + + score[i/2-1][i/2+1] = i0; + score[i/2][i/2] = i1; + score[i/2+1][i/2-1] = i2; + + direction1[i/2-1][i/2+1] = (score[i/2-1][i/2+1]==score[i/2-2][i/2] && mismatch[0] == 0) ? 3 : (score[i/2-1][i/2+1]-score[i/2-1][i/2]==1) ? 2 : 4; + + direction1[i/2][i/2] = (score[i/2][i/2]==score[i/2-1][i/2-1] && mismatch[1] == 0) ? 3 : + (score[i/2][i/2]-score[i/2-1][i/2]==1) ? 1 : + (score[i/2][i/2]-score[i/2][i/2-1]==1) ? 2 : 4; + + direction1[i/2+1][i/2-1] = (score[i/2+1][i/2-1]==score[i/2][i/2-2] && mismatch[2] == 0) ? 3 : + (score[i/2+1][i/2-1]-score[i/2][i/2-1]==1) ? 1 : 4; + + if( (i/2) % segLength == 0 && i1 == 0) // the segment has been processed no need to process it again + { + return -1; + } + + if(i == 2 * lSeqLength - 2) + { + error = i2; + minIndex1 = i-lSeqLength; + } + } + } + + Down1 = _mm_insert_epi16(Down1,2*e,0); + + //fill the first part of the error + mismatch[0] = ( *(a-(i/2)) != *(b-(i/2-2)) ); + Diag = _mm_insert_epi16(Diag,mismatch[0],0); + mismatch[1] = ( *(a-(i/2-1)) !=*(b-(i/2-1)) ); + Diag = _mm_insert_epi16(Diag,mismatch[1],1); + Diag = _mm_insert_epi16(Diag,2*e,2); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,_mm_slli_si128(R1,2)+Down1); + + i0 = _mm_extract_epi16(R0, 0); + i1 = _mm_extract_epi16(R0, 1); + + error = min(error, i1); + if(error == i1) + minIndex1 = i-lSeqLength; + + score[i/2-1][i/2+1] = i0; + score[i/2][i/2] = i1; + + direction1[i/2-1][i/2+1] = (score[i/2-1][i/2+1]==score[i/2-2][i/2] && mismatch[0] == 0) ? 3 : + (score[i/2-1][i/2+1]-score[i/2-1][i/2]) ? 2 : 4; + + direction1[i/2][i/2] = (score[i/2][i/2]==score[i/2-1][i/2-1] && mismatch[1] == 0) ? 3 : + (score[i/2][i/2]-score[i/2-1][i/2]==1) ? 1 : + (score[i/2][i/2]-score[i/2][i/2-1]==1)? 2 : 4; + + //fill the second part of the error + i++; + Diag = _mm_sub_epi8(Diag, Diag); + Diag = _mm_insert_epi16(Diag,2*e,0); + mismatch[0] = ( *(a-(i/2)) != *(b-(lSeqLength-1)) ); + Diag = _mm_insert_epi16(Diag,mismatch[0],1); + Diag = _mm_insert_epi16(Diag,2*e,2); + + R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); + R1 = _mm_min_epi16(R1,_mm_slli_si128(R0,2)+Down1); + + i0 = _mm_extract_epi16(R1, 0); + i1 = _mm_extract_epi16(R1, 1); + + error = min(error, i1); + if(error == i1) + minIndex1 = i-lSeqLength; + + score[i/2-1][i/2+2] = i0; + score[i/2][i/2+1] = i1; + + direction1[i/2-1][i/2+2] = (score[i/2-1][i/2+2]==score[i/2-2][i/2+1] && mismatch[0] == 0) ? 3 : + (score[i/2-1][i/2+2]-score[i/2-1][i/2+1]==1) ? 2 : 4; + + direction1[i/2][i/2+1] = (score[i/2][i/2+1]==score[i/2-1][i/2]) ? 3 : + (score[i/2][i/2+1]-score[i/2-1][i/2+1]==1)? 1 : + (score[i/2][i/2+1]-score[i/2][i/2]==1)? 2 : 4; + + //fill the last the last element of the matrix + i++; + Diag = _mm_sub_epi8(Diag, Diag); + mismatch[0] = ( *(a-(i/2)) != *(b-(lSeqLength-1)) ); + Diag = _mm_insert_epi16(Diag,mismatch[0],0); + + Down = _mm_insert_epi16(Down,1,0); + + Side = _mm_insert_epi16(Side,1,0); + + tmp = _mm_srli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Down, R0+Diag); + R0 = _mm_min_epi16(R0,tmp+Side); + + i0 = _mm_extract_epi16(R0, 0); + + error = min(error, i0); + if(error == 0) + return -1; + if(error == i0) + minIndex1 = i-lSeqLength; + if(mismatch[0] == 0) + direction1[lSeqLength][lSeqLength+errThreshold] = 3; + else + { + if(score[lSeqLength][lSeqLength+errThreshold] - score[lSeqLength][lSeqLength+errThreshold-1] == 1) + direction1[lSeqLength][lSeqLength+errThreshold] = 2; + else if(score[lSeqLength][lSeqLength+errThreshold] - score[lSeqLength-1][lSeqLength+errThreshold] == 1) + direction1[lSeqLength][lSeqLength+errThreshold] = 1; + else + direction1[lSeqLength][lSeqLength+errThreshold] = 4; + } + } + error1 = error; + error = 0; + + directionIndex = lSeqLength; + rIndex = minIndex1; + + + *map_location = ((lSeqLength == 0) ? refIndex : refIndex - rIndex) ; + + ref = ref + segLength; + + if(rSeqLength <= e) + { + char *a; + char *b; + + int tmp_index = 0; + + a = ref; + b = rSeq; + + for(tmp_index = 0; tmp_index < rSeqLength; tmp_index++) + { + matrixR[tmp_index] = (a[tmp_index]==b[tmp_index]) ? 'M' : a[tmp_index] ; + } + matrixR[tmp_index] = '\0'; + } + else if(rSeqLength != 0 && rSeqLength >= e) + { + char *a; + char *b; + + a = ref; + b = rSeq; + + R0 = _mm_sub_epi8(R0, R0); + R1 = _mm_sub_epi8(R1, R1); + + R0 = _mm_insert_epi16(R0,0,0); + + score[0][0] = 0; + + R1 = _mm_insert_epi16(R1,1,0); + R1 = _mm_insert_epi16(R1,1,1); + + score[1][0] = 1; + direction2[1][0] = 1; + score[0][1] = 1; + direction2[0][1] = 2; + + mismatch[0] = ((a[0]) != (b[0])); + + Diag = _mm_insert_epi16(Diag,2*e,0); + Diag = _mm_insert_epi16(Diag,mismatch[0],1); + Diag = _mm_insert_epi16(Diag,2*e,2); + + Side1 = _mm_insert_epi16(Side1,1,0); + Side1 = _mm_insert_epi16(Side1,1,1); + Side1 = _mm_insert_epi16(Side1,2*e,2); + + Side2 = _mm_insert_epi16(Side2,2*e,0); + Side2 = _mm_insert_epi16(Side2,1,1); + Side2 = _mm_insert_epi16(Side2,1,2); + + Down1 = _mm_insert_epi16(Down1,1,0); + Down1 = _mm_insert_epi16(Down1,1,1); + Down1 = _mm_insert_epi16(Down1,2*e,2); + + Down2 = _mm_insert_epi16(Down2,2*e,0); + Down2 = _mm_insert_epi16(Down2,1,1); + Down2 = _mm_insert_epi16(Down2,1,2); + + tmp = _mm_slli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,tmp+Down2); + + i0 = _mm_extract_epi16(R0, 0); + i1 = _mm_extract_epi16(R0, 1); + i2 = _mm_extract_epi16(R0, 2); + + score[0][2] = i0; + score[1][1] = i1; + score[2][0] = i2; + + direction2[0][2] = 2; + direction2[1][1] = ((mismatch[0] == 0)? 3 : 4); + direction2[2][0] = 1; + + + for (i = 3; i < 2*rSeqLength; i++) + { + if(i % 2 ==1) + { + Diag = _mm_sub_epi8(Diag, Diag); + mismatch[0] = ((a[(i+1)/2-1]) != (b[(i-1)/2-1])); + Diag = _mm_insert_epi16(Diag,mismatch[0],0); + mismatch[1] = ((a[(i-1)/2-1]) != (b[(i+1)/2-1])); + Diag = _mm_insert_epi16(Diag,mismatch[1],1); + + tmp = _mm_srli_si128(R0,2); + + R1 = _mm_min_epi16(tmp+Side1, R1+Diag); + R1 = _mm_min_epi16(R1,R0+Down1); + + i0 = _mm_extract_epi16(R1, 0); + i1 = _mm_extract_epi16(R1, 1); + + score[i/2][i/2+1] = i0; + score[i/2+1][i/2] = i1; + + direction2[i/2][i/2+1] = (score[i/2][i/2+1]==score[i/2-1][i/2] && mismatch[0] == 0) ? 3 : + (score[i/2][i/2+1]-score[i/2-1][i/2+1]==1)? 1 : + (score[i/2][i/2+1]-score[i/2][i/2]==1) ? 2 : 4; + + direction2[i/2+1][i/2] = (score[i/2+1][i/2]==score[i/2][i/2-1] && mismatch[1] == 0) ? 3 : + (score[i/2+1][i/2]-score[i/2][i/2]==1) ? 1 : + (score[i/2+1][i/2]-score[i/2+1][i/2-1]==1)? 2 : 4; + + + if(i > 2 * rSeqLength - 2) + { + error = min(error, i1); + if(error == i1) + minIndex2 = i-rSeqLength; + } + } + + else if(i % 2 == 0) + { + mismatch[0] = ((a[i/2]) != (b[i/2-2])); + Diag = _mm_insert_epi16(Diag,mismatch[0],0); + mismatch[1] = ((a[i/2-1]) != (b[i/2-1])); + Diag = _mm_insert_epi16(Diag,mismatch[1],1); + mismatch[2] = ((a[i/2-2]) != (b[i/2])); + Diag = _mm_insert_epi16(Diag,mismatch[2],2); + + tmp = _mm_slli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,tmp+Down2); + + i0 = _mm_extract_epi16(R0, 0); + i1 = _mm_extract_epi16(R0, 1); + i2 = _mm_extract_epi16(R0, 2); + + score[i/2-1][i/2+1] = i0; + score[i/2][i/2] = i1; + score[i/2+1][i/2-1] = i2; + + direction2[i/2-1][i/2+1] = (score[i/2-1][i/2+1]==score[i/2-2][i/2] && mismatch[0] == 0) ? 3 : + (score[i/2-1][i/2+1]-score[i/2-1][i/2]==1) ? 2 : 4; + + direction2[i/2][i/2] = (score[i/2][i/2]==score[i/2-1][i/2-1] && mismatch[1] == 0) ? 3 : + (score[i/2][i/2]-score[i/2-1][i/2]==1) ? 1 : + (score[i/2][i/2]-score[i/2][i/2-1]==1) ? 2 : 4; + + direction2[i/2+1][i/2-1] = (score[i/2+1][i/2-1]==score[i/2][i/2-2] && mismatch[2]==0) ? 3 : + (score[i/2+1][i/2-1]-score[i/2][i/2-1]==1) ? 1 : 4; + + + if(i == 2 * rSeqLength - 2) + { + error = i2; + minIndex2 = i-rSeqLength; + } + } + } + + Down1 = _mm_insert_epi16(Down1,2*e,0); + + //fill the first part of the error + mismatch[0] = ((a[i/2]) != (b[i/2-2])); + Diag = _mm_insert_epi16(Diag,mismatch[0],0); + mismatch[1] = ((a[i/2-1]) != (b[i/2-1])); + Diag = _mm_insert_epi16(Diag,mismatch[1],1); + Diag = _mm_insert_epi16(Diag,2*e,2); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,_mm_slli_si128(R1,2)+Down1); + + i0 = _mm_extract_epi16(R0, 0); + i1 = _mm_extract_epi16(R0, 1); + + error = min(error, i1); + if(error == i1) + minIndex2 = i-rSeqLength; + + score[i/2-1][i/2+1] = i0; + score[i/2][i/2] = i1; + + direction2[i/2-1][i/2+1] = (score[i/2-1][i/2+1]==score[i/2-2][i/2] && mismatch[0] == 0) ? 3 : + (score[i/2-1][i/2+1]-score[i/2-1][i/2]==1) ? 2 : 4; + + direction2[i/2][i/2] = (score[i/2][i/2]==score[i/2-1][i/2-1] && mismatch[1] == 0) ? 3 : + (score[i/2][i/2]-score[i/2-1][i/2]==1) ? 1 : + (score[i/2][i/2]-score[i/2][i/2-1]==1)? 2 : 4; + + + //fill the second part of the error + i++; + Diag = _mm_sub_epi8(Diag, Diag); + Diag = _mm_insert_epi16(Diag,2*e,0); + mismatch[0] = ((a[i/2]) != (b[rSeqLength-1])); + Diag = _mm_insert_epi16(Diag,mismatch[0],1); + Diag = _mm_insert_epi16(Diag,2*e,2); + + R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); + R1 = _mm_min_epi16(R1,_mm_slli_si128(R0,2)+Down1); + + i0 = _mm_extract_epi16(R1, 0); + i1 = _mm_extract_epi16(R1, 1); + + error = min(error, i1); + if(error == i1) + minIndex2 = i-rSeqLength; + + score[i/2-1][i/2+2] = i0; + score[i/2][i/2+1] = i1; + + direction2[i/2-1][i/2+2] = (score[i/2-1][i/2+2]==score[i/2-2][i/2+1] && mismatch[0] == 0) ? 3 : + (score[i/2-1][i/2+2]-score[i/2-1][i/2+1]==1) ? 2 : 3; + + direction2[i/2][i/2+1] = (score[i/2][i/2+1]==score[i/2-1][i/2] && mismatch[0] == 0) ? 3 : + (score[i/2][i/2+1]-score[i/2-1][i/2+1]==1)? 1 : + (score[i/2][i/2+1]-score[i/2][i/2]==1)? 2 : 4; + + + //fill the last the last element of the matrix + i++; + Diag = _mm_sub_epi8(Diag, Diag); + mismatch[0] = ((a[i/2]) != (b[rSeqLength-1])); + Diag = _mm_insert_epi16(Diag,mismatch[0],0); + + Down = _mm_sub_epi8(Down, Down); + Down = _mm_insert_epi16(Down,1,0); + + Side = _mm_sub_epi8(Side, Side); + Side = _mm_insert_epi16(Side,1,0); + + tmp = _mm_srli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Down, R0+Diag); + R0 = _mm_min_epi16(R0,tmp+Side); + + i0 = _mm_extract_epi16(R0, 0); + + error = min(error, i0); + if(error == i0) + minIndex2 = i-rSeqLength; + + if(mismatch[0] == 0) + direction2[rSeqLength][rSeqLength+errThreshold] = 3; + else + { + if(score[rSeqLength][rSeqLength+errThreshold] - score[rSeqLength][rSeqLength+errThreshold-1] == 1) + direction2[lSeqLength][lSeqLength+errThreshold] = 2; + else if(score[rSeqLength][rSeqLength+errThreshold] - score[rSeqLength-1][rSeqLength+errThreshold] == 1) + direction2[rSeqLength][rSeqLength+errThreshold] = 1; + else + direction2[rSeqLength][rSeqLength+errThreshold] = 4; + } + + } + + totalError = error1 + error; + + size = 0; + directionIndex = rSeqLength; + rIndex = minIndex2; + + + if(rSeqLength > e) + { + while(directionIndex != 0 || rIndex != 0) + { + + if(direction2[directionIndex][rIndex] == 3) + { + matrixR[size] = 'M'; + rIndex--; + directionIndex--; + } + else if(direction2[directionIndex][rIndex] == 4) + { + matrixR[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else if(direction2[directionIndex][rIndex] == 2) + { + matrixR[size] = *(ref+rIndex-1); + size++; + matrixR[size] = 'D'; + rIndex--; + } + else + { + matrixR[size] = *(rSeq+directionIndex-1); + size++; + matrixR[size] = 'I'; + directionIndex--; + } + size++; + } + matrixR[size] = '\0'; + } + size = 0; + directionIndex = lSeqLength; + rIndex = minIndex1; + + while(directionIndex != 0 || rIndex != 0) + { + + if(direction1[directionIndex][rIndex] == 3) + { + matrixL[size] = 'M'; + rIndex--; + directionIndex--; + } + else if(direction1[directionIndex][rIndex] == 4) + { + matrixL[size] = *(tempref-rIndex); + rIndex--; + directionIndex--; + } + else if(direction1[directionIndex][rIndex] == 2) + { + matrixL[size] = 'D'; + size++; + matrixL[size] = *(tempref-rIndex); + rIndex--; + } + else + { + matrixL[size] = 'I'; + size++; + matrixL[size] = *(lSeq+lSeqLength-directionIndex); + directionIndex--; + } + + size++; + } + + matrixL[size] = '\0'; + + char middle[200]; + middle[0] = '\0'; + + for(i = 0; i < segLength; i++) + middle[i] = 'M'; + middle[segLength] = '\0'; + + char rmatrixR[200]; + + reverse(matrixR, rmatrixR, strlen(matrixR)); + + sprintf(matrix, "%s%s%s", matrixL, middle, rmatrixR); + + return totalError; +} + +int verifySingleEndEditDistance4(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, char *matrix, int *map_location, short *seqHashValue) +{ + + int i = 0; + + char * ref; + char * tempref; + + int rIndex = 0; //reference Index + + int error = 0; + int error1 = 0; + + int error2 = 0; + int error3 = 0; + int totalError = 0; + int errorSegment = 0; + + int ERROR_BOUND = errThreshold; + + + /* + 1: Up + 2: Side + 3: Diagnoal Match + 4: Diagnoal Mismatch + */ + + int min = 0; + int minIndex1 = 0; + int minIndex2 = 0; + + int directionIndex = 0; + + + int size = 0; + + ref = _msf_refGen + refIndex - 1; + tempref = _msf_refGen + refIndex - 1; + + + if(lSeqLength != 0) + { + error3 = backwardEditDistance4SSE2(ref-1, lSeqLength, lSeq+lSeqLength-1, lSeqLength); + if(error3 == -1 || error3 == 0){ + return -1; + } + } + + if(rSeqLength != 0) + { + error2 = forwardEditDistance4SSE2(ref+segLength, rSeqLength, rSeq, rSeqLength); + if(error2 == -1) + return -1; + } + + if(error2 + error3 > errThreshold) + return -1; + + rIndex = 1; + + int prevError = 0; + + int tempUp = 0; + int tempDown = 0; + + int errorString = 0; + + int upValue; + int diagValue; + int sideValue; + + while(rIndex <= lSeqLength+errThreshold && lSeqLength != 0) + { + tempUp = ((rIndex - ERROR_BOUND) > 0 ? ((rIndex > lSeqLength) ? lSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1 ); + tempDown = ((rIndex >= lSeqLength-ERROR_BOUND ) ? lSeqLength+1 :rIndex + ERROR_BOUND + 1); + for(i = tempUp ; i < tempDown ; i++) + { + errorString = (*(ref-rIndex) == *(lSeq+lSeqLength-i)); + + upValue = scoreB[i-1][rIndex]+1; + diagValue = scoreB[i-1][rIndex-1]+ !errorString; + sideValue = scoreB[i][rIndex-1]+1; + + if(i != tempUp && i != tempDown-1) + scoreB[i][rIndex] = min3(sideValue, diagValue , upValue); + + else if( (i == ((rIndex - ERROR_BOUND) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= lSeqLength ) + scoreB[i][rIndex] = min(sideValue, diagValue); + else if(rIndex > lSeqLength && (i == lSeqLength - ERROR_BOUND) ) + scoreB[i][rIndex] = sideValue; + else + scoreB[i][rIndex] = min(diagValue , upValue); + + if(i == tempUp) + error = scoreB[i][rIndex]; + else if(error > scoreB[i][rIndex]) + error = scoreB[i][rIndex]; + } + if(rIndex <= lSeqLength) + { + errorSegment = error-prevError; + } + rIndex++; + } + + if(lSeqLength != 0) + { + min = scoreB[lSeqLength][lSeqLength+errThreshold]; + minIndex1 = lSeqLength + errThreshold; + + // Find the Best error for all the possible ways. + for(i = 1; i <= 2*errThreshold; i++) + { + if(min >= scoreB[lSeqLength][lSeqLength+errThreshold-i] && lSeqLength+errThreshold-i > 0) + { + min = scoreB[lSeqLength][lSeqLength+errThreshold-i]; + minIndex1 = lSeqLength+errThreshold-i; + } + } + error = scoreB[lSeqLength][minIndex1]; + } + + error1 = error; + + error = 0; + errorSegment = 0; + + directionIndex = lSeqLength; + rIndex = minIndex1; + + + *map_location = ((lSeqLength == 0) ? refIndex : refIndex - rIndex) ; + + ref = ref + segLength; + + if(rSeqLength != 0) + { + rIndex = 1; + while(rIndex <= rSeqLength+errThreshold-error1) + { + tempUp = (rIndex - ERROR_BOUND) > 0 ? ((rIndex > rSeqLength) ? rSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1; + tempDown = ((rIndex >= rSeqLength- ERROR_BOUND ) ? rSeqLength+1 :rIndex + ERROR_BOUND + 1); + for(i = tempUp; i < tempDown ; i++) + { + errorString = (*(ref+rIndex-1) == *(rSeq+i-1)); + + upValue = scoreF[i-1][rIndex]+1; + diagValue = scoreF[i-1][rIndex-1]+ !errorString; + sideValue = scoreF[i][rIndex-1]+1; + + if(i != tempUp && i != tempDown-1) + scoreF[i][rIndex] = min3(sideValue, diagValue , upValue); + else if( (i == ((rIndex - ERROR_BOUND ) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= rSeqLength ) + scoreF[i][rIndex] = min(sideValue, diagValue); + else if(rIndex > rSeqLength && (i == rSeqLength - ERROR_BOUND) ) + scoreF[i][rIndex] = sideValue; + else + scoreF[i][rIndex] = min(diagValue , upValue); + + if(i == tempUp) + error = scoreF[i][rIndex]; + if(error > scoreF[i][rIndex]) + error = scoreF[i][rIndex]; + } + if(rIndex <= rSeqLength) + { + errorSegment = error; + } + + rIndex++; + } + + min = scoreF[rSeqLength][rSeqLength+errThreshold-error1]; + minIndex2 = rSeqLength + errThreshold-error1; + + // Find the Best error for all the possible ways. + for(i = 1; i <= 2*(errThreshold-error1); i++) + { + if(min > scoreF[rSeqLength][rSeqLength+errThreshold-error1-i] && rSeqLength+errThreshold-error1-i > 0) + { + min = scoreF[rSeqLength][rSeqLength+errThreshold-error1-i]; + minIndex2 = rSeqLength+errThreshold-error1-i; + } + } + error = scoreF[rSeqLength][minIndex2]; + } + + totalError = error + error1; + + if(errThreshold > 4) + printf("ERROR in errorThreshold.\n"); + + + if(totalError != error2 + error3 && totalError > errThreshold) + { + printf("ErrorF=%d, ErrorB=%d Error=%d Error=%d\n", error2,error3,error1,error); + + scanf("%d", &i); + } + + char matrixR[200]; + char matrixL[200]; + + matrixR[0] = '\0'; + matrixL[0] = '\0'; + + size = 0; + directionIndex = rSeqLength; + rIndex = minIndex2; + + while(directionIndex != 0 || rIndex != 0) + { + if(directionIndex-rIndex == errThreshold) + { + if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1) + { + matrixR[size] = *(rSeq+directionIndex-1); + size++; + matrixR[size] = 'I'; + directionIndex--; + } + else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) + { + matrixR[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrixR[size] = 'M'; + rIndex--; + directionIndex--; + } + + } + else if(rIndex - directionIndex == errThreshold) + { + if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1) + { + matrixR[size] = *(ref+rIndex-1); + size++; + matrixR[size] = 'D'; + rIndex--; + } + else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) + { + matrixR[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrixR[size] = 'M'; + rIndex--; + directionIndex--; + } + } + else + { + if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1 && directionIndex != 0) + { + matrixR[size] = *(rSeq+directionIndex-1); + size++; + matrixR[size] = 'I'; + directionIndex--; + } + else if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1 && rIndex != 0) + { + matrixR[size] = *(ref+rIndex-1); + size++; + matrixR[size] = 'D'; + rIndex--; + } + else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) + { + matrixR[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrixR[size] = 'M'; + rIndex--; + directionIndex--; + } + } + size++; + } + matrixR[size] = '\0'; + + size = 0; + directionIndex = lSeqLength; + rIndex = minIndex1; + + + while(directionIndex != 0 || rIndex != 0) + { + if(directionIndex-rIndex == errThreshold) + { + if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1) + { + matrixL[size] = 'I'; + size++; + matrixL[size] = *(lSeq+lSeqLength-directionIndex); + directionIndex--; + } + else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) + { + matrixL[size] = *(tempref-rIndex); + rIndex--; + directionIndex--; + } + else + { + matrixL[size] = 'M'; + rIndex--; + directionIndex--; + } + + } + else if(rIndex - directionIndex == errThreshold) + { + if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1) + { + matrixL[size] = 'D'; + size++; + matrixL[size] = *(tempref-rIndex); + rIndex--; + } + else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) + { + matrixL[size] = *(tempref-rIndex); + rIndex--; + directionIndex--; + } + else + { + matrixL[size] = 'M'; + rIndex--; + directionIndex--; + } + } + else + { + if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1 && directionIndex != 0) + { + matrixL[size] = 'I'; + size++; + matrixL[size] = *(lSeq+lSeqLength-directionIndex); + directionIndex--; + } + else if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1 && rIndex != 0) + { + matrixL[size] = 'D'; + size++; + matrixL[size] = *(tempref-rIndex); + rIndex--; + } + else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) + { + matrixL[size] = *(tempref-rIndex); + rIndex--; + directionIndex--; + } + else + { + matrixL[size] = 'M'; + rIndex--; + directionIndex--; + } + } + + size++; + } + + matrixL[size] = '\0'; + char middle[200]; + middle[0] = '\0'; + + for(i = 0; i < segLength; i++) + middle[i] = 'M'; + middle[segLength] = '\0'; + + char rmatrixR[200]; + + reverse(matrixR, rmatrixR, strlen(matrixR)); + + sprintf(matrix, "%s%s%s", matrixL, middle, rmatrixR); + + return totalError; + +} + +int verifySingleEndEditDistanceExtention(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, + char *matrix, int *map_location, short *seqHashValue) +{ + int i = 0; + + char * ref; + char * tempref; + + int rIndex = 0; //reference Index + + int error = 0; + int error1 = 0; + + int error2 = 0; + int error3 = 0; + int totalError = 0; + int errorSegment = 0; + + int ERROR_BOUND = min(4, errThreshold); + + + /* + 1: Up + 2: Side + 3: Diagnoal Match + 4: Diagnoal Mismatch + */ + + int min = 0; + int minIndex1 = 0; + int minIndex2 = 0; + + int directionIndex = 0; + + + int size = 0; + + ref = _msf_refGen + refIndex - 1; + tempref = _msf_refGen + refIndex - 1; + + + if(lSeqLength != 0) + { + error3 = backwardEditDistanceSSE2Extention(ref-1, lSeqLength, lSeq+lSeqLength-1, lSeqLength); + if(error3 == -1){ + return -1; + } + } + + if(rSeqLength != 0) + { + error2 = forwardEditDistanceSSE2Extention(ref+segLength, rSeqLength, rSeq, rSeqLength); + if(error2 == -1) + return -1; + } + + if(error2 + error3 > errThreshold) + return -1; + + rIndex = 1; + + int prevError = 0; + + int tempUp = 0; + int tempDown = 0; + + int errorString = 0; + + int upValue; + int diagValue; + int sideValue; + if(lSeqLength > ERROR_BOUND) + { + while(rIndex <= lSeqLength+ERROR_BOUND && lSeqLength != 0) + { + tempUp = ((rIndex - ERROR_BOUND) > 0 ? ((rIndex > lSeqLength) ? lSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1 ); + tempDown = ((rIndex >= lSeqLength-ERROR_BOUND ) ? lSeqLength+1 :rIndex + ERROR_BOUND + 1); + for(i = tempUp ; i < tempDown ; i++) + { + errorString = (*(ref-rIndex) == *(lSeq+lSeqLength-i)); + + upValue = scoreB[i-1][rIndex]+1; + diagValue = scoreB[i-1][rIndex-1]+ !errorString; + sideValue = scoreB[i][rIndex-1]+1; + + if(i != tempUp && i != tempDown-1) + scoreB[i][rIndex] = min3(sideValue, diagValue , upValue); + + else if( (i == ((rIndex - ERROR_BOUND) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= lSeqLength ) + scoreB[i][rIndex] = min(sideValue, diagValue); + else if(rIndex > lSeqLength && (i == lSeqLength - ERROR_BOUND) ) + scoreB[i][rIndex] = sideValue; + else + scoreB[i][rIndex] = min(diagValue , upValue); + + if(i == tempUp) + error = scoreB[i][rIndex]; + else if(error > scoreB[i][rIndex]) + error = scoreB[i][rIndex]; + } + if(rIndex <= lSeqLength) + { + errorSegment = error-prevError; + } + rIndex++; + } + + if(lSeqLength != 0) + { + min = scoreB[lSeqLength][lSeqLength+ERROR_BOUND]; + minIndex1 = lSeqLength + ERROR_BOUND; + + // Find the Best error for all the possible ways. + for(i = 1; i <= 2*ERROR_BOUND; i++) + { + if(min >= scoreB[lSeqLength][lSeqLength+ERROR_BOUND-i] && lSeqLength+ERROR_BOUND-i > 0) + { + min = scoreB[lSeqLength][lSeqLength+ERROR_BOUND-i]; + minIndex1 = lSeqLength+ERROR_BOUND-i; + } + } + error = scoreB[lSeqLength][minIndex1]; + } + } + else + { + int j = 0; + for(i = 1; i <= lSeqLength; i++) + { + for(j = 1; j <= lSeqLength; j++) + { + scoreB[i][j] = min3(scoreB[i-1][j-1]+ (*(ref-j) != *(lSeq+lSeqLength-i) ),scoreB[i][j-1]+1 ,scoreB[i-1][j]+1); + } + } + error = scoreB[lSeqLength][lSeqLength]; + minIndex1 = lSeqLength; + + } + error1 = error; + + error = 0; + errorSegment = 0; + + directionIndex = lSeqLength; + rIndex = minIndex1; + + *map_location = ((lSeqLength == 0) ? refIndex : refIndex - rIndex) ; + + ref = ref + segLength; + + if(rSeqLength != 0 && rSeqLength > ERROR_BOUND) + { + ERROR_BOUND = min(ERROR_BOUND, rSeqLength); + + if(rSeqLength == ERROR_BOUND) + { + for(i=0; i < 2*ERROR_BOUND; i++) + scoreF[0][i] = i; + } + + rIndex = 1; + while(rIndex <= rSeqLength+ERROR_BOUND) + { + tempUp = (rIndex - ERROR_BOUND) > 0 ? ((rIndex > rSeqLength) ? rSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1; + tempDown = ((rIndex >= rSeqLength- ERROR_BOUND ) ? rSeqLength+1 :rIndex + ERROR_BOUND + 1); + for(i = tempUp; i < tempDown ; i++) + { + errorString = (*(ref+rIndex-1) == *(rSeq+i-1)); + upValue = scoreF[i-1][rIndex]+1; + diagValue = scoreF[i-1][rIndex-1]+ !errorString; + sideValue = scoreF[i][rIndex-1]+1; + + if(i != tempUp && i != tempDown-1) + scoreF[i][rIndex] = min3(sideValue, diagValue , upValue); + else if( (i == ((rIndex - ERROR_BOUND ) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= rSeqLength ) + scoreF[i][rIndex] = min(sideValue, diagValue); + else if(rIndex > rSeqLength && (i == rSeqLength - ERROR_BOUND) ) + scoreF[i][rIndex] = sideValue; + else + scoreF[i][rIndex] = min(diagValue , upValue); + + if(i == tempUp) + error = scoreF[i][rIndex]; + if(error > scoreF[i][rIndex]) + error = scoreF[i][rIndex]; + } + if(rIndex <= rSeqLength) + { + errorSegment = error; + } + rIndex++; + } + min = scoreF[rSeqLength][rSeqLength+ERROR_BOUND]; + minIndex2 = rSeqLength + ERROR_BOUND; + + // Find the Best error for all the possible ways. + for(i = 1; i <= 2*ERROR_BOUND; i++) + { + if(min > scoreF[rSeqLength][rSeqLength+ERROR_BOUND-i] && rSeqLength+ERROR_BOUND-i > 0) + { + min = scoreF[rSeqLength][rSeqLength+ERROR_BOUND-i]; + minIndex2 = rSeqLength+ERROR_BOUND-i; + } + } + error = scoreF[rSeqLength][minIndex2]; + } + else + { + int j = 0; + for(i = 1; i <= rSeqLength; i++) + { + for(j = 1; j <= rSeqLength; j++) + { + scoreF[i][j] = min3(scoreF[i-1][j-1]+ (*(ref+j-1) != *(rSeq+i-1) ),scoreF[i][j-1]+1 ,scoreF[i-1][j]+1); + } + } + error = scoreF[rSeqLength][rSeqLength]; + minIndex2 = rSeqLength; + } + + totalError = error + error1; + + if(totalError != error2+error3) + { + for(i = 0; i < lSeqLength; i++) + printf("%c", *(tempref-1-i)); + printf("\n"); + for(i = 0; i < lSeqLength; i++) + printf("%c", *(lSeq+i)); + printf("\n"); + + for(i = 0; i < rSeqLength; i++) + printf("%c", *(tempref+segLength+i)); + printf("\n"); + + for(i = 0; i < rSeqLength; i++) + printf("%c", *(rSeq+i)); + printf("\n"); + + printf("ERROR=%d\n", totalError); + printf("ERROR_SSE=%d\n", error3+error2); + + printf("ERROR_SSE_back=%d E_SSE_forw=%d\n", error3, error2); + printf("ERROR_back=%d E_forw=%d\n", error1, error); + + } + + char matrixR[200]; + char matrixL[200]; + + matrixR[0] = '\0'; + matrixL[0] = '\0'; + + size = 0; + directionIndex = rSeqLength; + rIndex = minIndex2; + + + while(directionIndex != 0 || rIndex != 0) + { + if(directionIndex-rIndex == errThreshold) + { + if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1) + { + matrixR[size] = *(rSeq+directionIndex-1); + size++; + matrixR[size] = 'I'; + directionIndex--; + } + else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) + { + matrixR[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrixR[size] = 'M'; + rIndex--; + directionIndex--; + } + + } + else if(rIndex - directionIndex == errThreshold) + { + if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1) + { + matrixR[size] = *(ref+rIndex-1); + size++; + matrixR[size] = 'D'; + rIndex--; + } + else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) + { + matrixR[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrixR[size] = 'M'; + rIndex--; + directionIndex--; + } + } + else + { + if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1 && directionIndex != 0) + { + matrixR[size] = *(rSeq+directionIndex-1); + size++; + matrixR[size] = 'I'; + directionIndex--; + } + else if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1 && rIndex != 0) + { + matrixR[size] = *(ref+rIndex-1); + size++; + matrixR[size] = 'D'; + rIndex--; + } + else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) + { + matrixR[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrixR[size] = 'M'; + rIndex--; + directionIndex--; + } + } + size++; + } + matrixR[size] = '\0'; + + size = 0; + directionIndex = lSeqLength; + rIndex = minIndex1; + + + while(directionIndex != 0 || rIndex != 0) + { + if(directionIndex-rIndex == errThreshold) + { + if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1) + { + matrixL[size] = 'I'; + size++; + matrixL[size] = *(lSeq+lSeqLength-directionIndex); + directionIndex--; + } + else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) + { + matrixL[size] = *(tempref-rIndex); + rIndex--; + directionIndex--; + } + else + { + matrixL[size] = 'M'; + rIndex--; + directionIndex--; + } + + } + else if(rIndex - directionIndex == errThreshold) + { + if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1) + { + matrixL[size] = 'D'; + size++; + matrixL[size] = *(tempref-rIndex); + rIndex--; + } + else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) + { + matrixL[size] = *(tempref-rIndex); + rIndex--; + directionIndex--; + } + else + { + matrixL[size] = 'M'; + rIndex--; + directionIndex--; + } + } + else + { + if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1 && directionIndex != 0) + { + matrixL[size] = 'I'; + size++; + matrixL[size] = *(lSeq+lSeqLength-directionIndex); + directionIndex--; + } + else if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1 && rIndex != 0) + { + matrixL[size] = 'D'; + size++; + matrixL[size] = *(tempref-rIndex); + rIndex--; + } + else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) + { + matrixL[size] = *(tempref-rIndex); + rIndex--; + directionIndex--; + } + else + { + matrixL[size] = 'M'; + rIndex--; + directionIndex--; + } + } + size++; + } + matrixL[size] = '\0'; + + char middle[200]; + middle[0] = '\0'; + for(i = 0; i < segLength; i++) + middle[i] = 'M'; + middle[segLength] = '\0'; + + char rmatrixR[200]; + + reverse(matrixR, rmatrixR, strlen(matrixR)); + + sprintf(matrix, "%s%s%s", matrixL, middle, rmatrixR); + + + return totalError; + +} + + +int verifySingleEndEditDistance(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, char *matrix, int *map_location, short *seqHashValue) +{ + + int i = 0; + + char * ref; + char * tempref; + + int rIndex = 0; //reference Index + + int error = 0; + int error1 = 0; + + int error2 = 0; + int error3 = 0; + + int totalError = 0; + int errorSegment = 0; + + int ERROR_BOUND = errThreshold; + + /* + 1: Up + 2: Side + 3: Diagnoal Match + 4: Diagnoal Mismatch + */ + + int min = 0; + int minIndex1 = 0; + int minIndex2 = 0; + + int directionIndex = 0; + + + int size = 0; + + ref = _msf_refGen + refIndex - 1; + tempref = _msf_refGen + refIndex - 1; + + + if(rSeqLength != 0) + { + if(errThreshold %2 == 1) + error2 = forwardEditDistanceSSE2Odd(ref+segLength, rSeqLength, rSeq, rSeqLength); + else + error2 = forwardEditDistanceSSE2G(ref+segLength, rSeqLength, rSeq, rSeqLength); + if(error2 == -1) + return -1; + } + + if(lSeqLength != 0) + { + if(errThreshold % 2 == 1) + error3 = backwardEditDistanceSSE2Odd(ref-1, lSeqLength, lSeq+lSeqLength-1, lSeqLength); + else + error3 = backwardEditDistanceSSE2G(ref-1, lSeqLength, lSeq+lSeqLength-1, lSeqLength); + if(error3 == -1 || error3 == 0){ + return -1; + } + } + + if(error3 + error2 > errThreshold) + return -1; + + for(i = 0 ; i < errThreshold + 1; i++) + { + scoreB[0][i] = i; + scoreB[i][0] = i; + } + + rIndex = 1; + int prevError = 0; + + int tempUp = 0; + int tempDown = 0; + + int errorString = 0; + + int upValue; + int diagValue; + int sideValue; + + while(rIndex <= lSeqLength+errThreshold && lSeqLength != 0) + { + tempUp = ((rIndex - ERROR_BOUND) > 0 ? ((rIndex > lSeqLength) ? lSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1 ); + tempDown = ((rIndex >= lSeqLength-ERROR_BOUND ) ? lSeqLength+1 :rIndex + ERROR_BOUND + 1); + for(i = tempUp ; i < tempDown ; i++) + { + errorString = (*(ref-rIndex) == *(lSeq+lSeqLength-i)); + + upValue = scoreB[i-1][rIndex]+1; + diagValue = scoreB[i-1][rIndex-1]+ !errorString; + sideValue = scoreB[i][rIndex-1]+1; + + if(i != tempUp && i != tempDown-1) + scoreB[i][rIndex] = min3(sideValue, diagValue , upValue); + + else if( (i == ((rIndex - ERROR_BOUND) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= lSeqLength ) + scoreB[i][rIndex] = min(sideValue, diagValue); + else if(rIndex > lSeqLength && (i == lSeqLength - ERROR_BOUND) ) + scoreB[i][rIndex] = sideValue; + else + scoreB[i][rIndex] = min(diagValue , upValue); + + if(i == tempUp) + error = scoreB[i][rIndex]; + else if(error > scoreB[i][rIndex]) + error = scoreB[i][rIndex]; + } + if(rIndex <= lSeqLength) + { + errorSegment = error-prevError; + } + rIndex++; + } + if(lSeqLength != 0) + { + min = scoreB[lSeqLength][lSeqLength+errThreshold]; + minIndex1 = lSeqLength + errThreshold; + + // Find the Best error for all the possible ways. + for(i = 1; i <= 2*errThreshold; i++) + { + if(min >= scoreB[lSeqLength][lSeqLength+errThreshold-i] && lSeqLength+errThreshold-i > 0) + { + min = scoreB[lSeqLength][lSeqLength+errThreshold-i]; + minIndex1 = lSeqLength+errThreshold-i; + } + } + error = scoreB[lSeqLength][minIndex1]; + } + + error1 = error; + + error = 0; + errorSegment = 0; + + directionIndex = lSeqLength; + rIndex = minIndex1; + + *map_location = ((lSeqLength == 0) ? refIndex : refIndex - rIndex) ; + + ref = ref + segLength; + + if(rSeqLength != 0) + { + for(i = 0 ; i < errThreshold + 1; i++) + { + scoreF[0][i] = i; + scoreF[i][0] = i; + } + + + rIndex = 1; + while(rIndex <= rSeqLength+errThreshold-error1) + { + tempUp = (rIndex - ERROR_BOUND) > 0 ? ((rIndex > rSeqLength) ? rSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1; + tempDown = ((rIndex >= rSeqLength- ERROR_BOUND ) ? rSeqLength+1 :rIndex + ERROR_BOUND + 1); + for(i = tempUp; i < tempDown ; i++) + { + errorString = (*(ref+rIndex-1) == *(rSeq+i-1)); + + upValue = scoreF[i-1][rIndex]+1; + diagValue = scoreF[i-1][rIndex-1]+ !errorString; + sideValue = scoreF[i][rIndex-1]+1; + + if(i != tempUp && i != tempDown-1) + scoreF[i][rIndex] = min3(sideValue, diagValue , upValue); + else if( (i == ((rIndex - ERROR_BOUND ) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= rSeqLength ) + scoreF[i][rIndex] = min(sideValue, diagValue); + else if(rIndex > rSeqLength && (i == rSeqLength - ERROR_BOUND) ) + scoreF[i][rIndex] = sideValue; + else + scoreF[i][rIndex] = min(diagValue , upValue); + + if(i == tempUp) + error = scoreF[i][rIndex]; + if(error > scoreF[i][rIndex]) + error = scoreF[i][rIndex]; + } + if(rIndex <= rSeqLength) + { + errorSegment = error; + } + rIndex++; + } + + min = scoreF[rSeqLength][rSeqLength+errThreshold-error1]; + minIndex2 = rSeqLength + errThreshold-error1; + + // Find the Best error for all the possible ways. + for(i = 1; i <= 2*(errThreshold-error1); i++) + { + if(min > scoreF[rSeqLength][rSeqLength+errThreshold-error1-i] && rSeqLength+errThreshold-error1-i > 0) + { + min = scoreF[rSeqLength][rSeqLength+errThreshold-error1-i]; + minIndex2 = rSeqLength+errThreshold-error1-i; + } + } + error = scoreF[rSeqLength][minIndex2]; + } + + totalError = error + error1; + + + if(totalError != error2 + error3 && totalError > errThreshold) + { + for(i = 0; i < lSeqLength; i++) + printf("%c", *(tempref-1-i)); + printf("\n"); + for(i = 0; i < lSeqLength; i++) + printf("%c", *(lSeq+i)); + printf("\n"); + + for(i = 0; i < rSeqLength; i++) + printf("%c", *(tempref+segLength+i)); + printf("\n"); + + for(i = 0; i < rSeqLength; i++) + printf("%c", *(rSeq+i)); + printf("\n"); + + + printf("SSEF=%d SSEB%d\n", error2, error3); + printf("F=%d B=%d\n", error, error1); + scanf("%d", &i); + } + + char matrixR[200]; + char matrixL[200]; + + matrixR[0] = '\0'; + matrixL[0] = '\0'; + + size = 0; + directionIndex = rSeqLength; + rIndex = minIndex2; + + while(directionIndex != 0 || rIndex != 0) + { + if(directionIndex-rIndex == errThreshold) + { + if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1) + { + matrixR[size] = *(rSeq+directionIndex-1); + size++; + matrixR[size] = 'I'; + directionIndex--; + } + else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) + { + matrixR[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrixR[size] = 'M'; + rIndex--; + directionIndex--; + } + + } + else if(rIndex - directionIndex == errThreshold) + { + if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1) + { + matrixR[size] = *(ref+rIndex-1); + size++; + matrixR[size] = 'D'; + rIndex--; + } + else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) + { + matrixR[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrixR[size] = 'M'; + rIndex--; + directionIndex--; + } + } + else + { + if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1 && directionIndex != 0) + { + matrixR[size] = *(rSeq+directionIndex-1); + size++; + matrixR[size] = 'I'; + directionIndex--; + } + else if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1 && rIndex != 0) + { + matrixR[size] = *(ref+rIndex-1); + size++; + matrixR[size] = 'D'; + rIndex--; + } + else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) + { + matrixR[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrixR[size] = 'M'; + rIndex--; + directionIndex--; + } + } + size++; + } + matrixR[size] = '\0'; + + size = 0; + directionIndex = lSeqLength; + rIndex = minIndex1; + + + while(directionIndex != 0 || rIndex != 0) + { + if(directionIndex-rIndex == errThreshold) + { + if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1) + { + matrixL[size] = 'I'; + size++; + matrixL[size] = *(lSeq+lSeqLength-directionIndex); + directionIndex--; + } + else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) + { + matrixL[size] = *(tempref-rIndex); + rIndex--; + directionIndex--; + } + else + { + matrixL[size] = 'M'; + rIndex--; + directionIndex--; + } + + } + else if(rIndex - directionIndex == errThreshold) + { + if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1) + { + matrixL[size] = 'D'; + size++; + matrixL[size] = *(tempref-rIndex); + rIndex--; + } + else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) + { + matrixL[size] = *(tempref-rIndex); + rIndex--; + directionIndex--; + } + else + { + matrixL[size] = 'M'; + rIndex--; + directionIndex--; + } + } + else + { + if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1 && directionIndex != 0) + { + matrixL[size] = 'I'; + size++; + matrixL[size] = *(lSeq+lSeqLength-directionIndex); + directionIndex--; + } + else if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1 && rIndex != 0) + { + matrixL[size] = 'D'; + size++; + matrixL[size] = *(tempref-rIndex); + rIndex--; + } + else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) + { + matrixL[size] = *(tempref-rIndex); + rIndex--; + directionIndex--; + } + else + { + matrixL[size] = 'M'; + rIndex--; + directionIndex--; + } + } + size++; + } + matrixL[size] = '\0'; + char middle[200]; + middle[0] = '\0'; + for(i = 0; i < segLength; i++) + middle[i] = 'M'; + middle[segLength] = '\0'; + + char rmatrixR[200]; + + reverse(matrixR, rmatrixR, strlen(matrixR)); + + sprintf(matrix, "%s%s%s", matrixL, middle, rmatrixR); + + return totalError; +} + + +int addCigarSize(int cnt){ + if (cnt<10) return 1; + else if (cnt < 100) return 2; + return 3; +} + +/* + Generate Cigar from the back tracking matrix +*/ +void generateCigar(char *matrix, int matrixLength, char *cigar) +{ + int i = 0; + + int counterM=0; + int counterI=0; + int counterD=0; + + int cigarSize = 0; + + cigar[0] = '\0'; + + while(i < matrixLength) + { + if(matrix[i]=='M') + { + counterM++; + if(counterI != 0) + { + sprintf(cigar, "%s%dI", cigar, counterI); + cigarSize += addCigarSize(counterI) + 1; + cigar[cigarSize] = '\0'; + counterI=0; + } + else if(counterD != 0) + { + sprintf(cigar, "%s%dD", cigar, counterD); + cigarSize += addCigarSize(counterD) + 1; + cigar[cigarSize] = '\0'; + counterD=0; + } + } + else if(matrix[i] == 'I') + { + if(counterM != 0) + { + sprintf(cigar, "%s%dM", cigar, counterM); + cigarSize += addCigarSize(counterM) + 1; + cigar[cigarSize] = '\0'; + counterM = 0; + } + else if(counterD != 0) + { + sprintf(cigar, "%s%dD", cigar, counterD); + cigarSize += addCigarSize(counterD) + 1; + cigar[cigarSize] = '\0'; + counterD=0; + } + counterI++; + i++; + + } + else if (matrix[i] == 'D') + { + if(counterM != 0) + { + sprintf(cigar, "%s%dM", cigar, counterM); + cigarSize += addCigarSize(counterM) + 1; + cigar[cigarSize] = '\0'; + counterM = 0; + } + else if(counterI != 0) + { + sprintf(cigar, "%s%dI", cigar, counterI); + cigarSize += addCigarSize(counterI) + 1; + cigar[cigarSize] = '\0'; + counterI=0; + } + + counterD++; + i++; + + } + else + { + counterM++; + if(counterI != 0) + { + sprintf(cigar, "%s%dI", cigar, counterI); + cigarSize += addCigarSize(counterI) + 1; + cigar[cigarSize] = '\0'; + counterI=0; + } + else if(counterD != 0) + { + sprintf(cigar, "%s%dD", cigar, counterD); + cigarSize += addCigarSize(counterD) + 1; + cigar[cigarSize] = '\0'; + counterD=0; + } + } + i++; + } + + if(counterM != 0) + { + sprintf(cigar, "%s%dM", cigar, counterM); + cigarSize += addCigarSize(counterM) + 1; + cigar[cigarSize] = '\0'; + counterM = 0; + } + else if(counterI != 0) + { + sprintf(cigar, "%s%dI", cigar, counterI); + cigarSize += addCigarSize(counterI) + 1; + cigar[cigarSize] = '\0'; + counterI = 0; + } + else if(counterD != 0) + { + sprintf(cigar, "%s%dD", cigar, counterD); + cigarSize += addCigarSize(counterD) + 1; + cigar[cigarSize] = '\0'; + counterD = 0; + } + + cigar[cigarSize] = '\0'; +} + +/* + Creates the Cigar output from the mismatching positions format [0-9]+(([ACTGN]|\^[ACTGN]+)[0-9]+)* +*/ +void generateCigarFromMD(char *mismatch, int mismatchLength, char *cigar) +{ + int i = 0; + int j = 0; + + int start = 0; + int cigarSize = 0; + + cigar[0] = '\0'; + + while(i < mismatchLength) + { + if(mismatch[i] >= '0' && mismatch[i] <= '9') + { + start = i; + + while(mismatch[i] >= '0' && mismatch[i] <= '9' && i < mismatchLength) + i++; + + int value = atoi(mismatch+start); + for(j = 0; j < value-1; j++) + { + cigar[cigarSize] = 'M'; + cigarSize++; + } + cigar[cigarSize] = 'M'; + } + else if(mismatch[i] == '^') + { + cigar[cigarSize] = 'I'; + i++; + } + else if(mismatch[i] == '\'') + { + cigar[cigarSize] = 'D'; + i++; + } + else + { + cigar[cigarSize] = 'M'; + cigarSize++; + } + cigarSize++; + i++; + } + cigar[cigarSize] = '\0'; +} + +void generateSNPSAM(char *matrix, int matrixLength, char *outputSNP) +{ + + int i = 0; + + int counterM = 0; + int counterD = 0; + + char delete[100]; + + int snpSize = 0; + + outputSNP[0] = '\0'; + delete[0] = '\0'; + + + while(i < matrixLength) + { + if(matrix[i]=='M') + { + counterM++; + if(counterD != 0) + { + delete[counterD] = '\0'; + counterD=0; + sprintf(outputSNP, "%s^%s", outputSNP,delete); + snpSize += strlen(delete) + 1; + outputSNP[snpSize] = '\0'; + delete[0] = '\0'; + } + } + else if(matrix[i] == 'D') + { + if(counterM != 0) + { + sprintf(outputSNP, "%s%d", outputSNP, counterM); + snpSize += addCigarSize(counterM); + outputSNP[snpSize] = '\0'; + counterM=0; + delete[counterD] = matrix[i+1]; + i++; + counterD++; + } + else if(counterD != 0) + { + delete[counterD] = matrix[i+1]; + counterD++; + i++; + } + else + { + delete[counterD] = matrix[i+1]; + counterD++; + i++; + } + } + else if(matrix[i] == 'I') + { + if(counterM != 0) + { + // sprintf(outputSNP, "%s%d\0", outputSNP, counterM); + //counterM++; + } + else if(counterD != 0) + { + delete[counterD] = '\0'; + sprintf(outputSNP, "%s^%s", outputSNP, delete); + snpSize += strlen(delete) + 1; + outputSNP[snpSize] = '\0'; + counterD = 0; + delete[0] = '\0'; + } + i++; + + } + else + { + if(counterM != 0) + { + sprintf(outputSNP, "%s%d", outputSNP, counterM); + snpSize += addCigarSize(counterM); + outputSNP[snpSize] = '\0'; + counterM = 0; + } + if(counterD != 0) + { + delete[counterD] = '\0'; + counterD=0; + sprintf(outputSNP, "%s^%s", outputSNP, delete); + snpSize += strlen(delete) + 1; + outputSNP[snpSize] = '\0'; + delete[0] = '\0'; + } + sprintf(outputSNP,"%s%c",outputSNP,matrix[i]); + snpSize += 1; + outputSNP[snpSize] = '\0'; + } + i++; + } + + if(counterM != 0) + { + sprintf(outputSNP, "%s%d", outputSNP, counterM); + snpSize += addCigarSize(counterM); + outputSNP[snpSize] = '\0'; + counterM = 0; + } + else if(counterD != 0) + { + delete[counterD] = '\0'; + sprintf(outputSNP, "%s^%s", outputSNP, delete); + snpSize += strlen(delete) + 1; + outputSNP[snpSize] = '\0'; + counterD = 0; + } + + outputSNP[snpSize] = '\0'; +} +/**********************************************/ + +/* + direction = 0 forward + 1 backward + +*/ + +void mapSingleEndSeq(unsigned int *l1, int s1, int readNumber, int readSegment, int direction) +{ + int j = 0; + int z = 0; + int *locs = (int *) l1; + char *_tmpSeq, *_tmpQual; + char rqual[SEQ_LENGTH+1]; + rqual[SEQ_LENGTH]='\0'; + + int genLoc = 0; + int leftSeqLength = 0; + int rightSeqLength = 0; + int middleSeqLength = 0; + + char matrix[200]; + char editString[200]; + char cigar[MAX_CIGAR_SIZE]; + + short *_tmpHashValue; + + if (direction) + { + reverse(_msf_seqList[readNumber].qual, rqual, SEQ_LENGTH); + _tmpQual = rqual; + _tmpSeq = _msf_seqList[readNumber].rseq; + _tmpHashValue = _msf_seqList[readNumber].rhashValue; + } + else + { + _tmpQual = _msf_seqList[readNumber].qual; + _tmpSeq = _msf_seqList[readNumber].seq; + _tmpHashValue = _msf_seqList[readNumber].hashValue; + } + + int readId = 2*readNumber+direction; + for (z=0; z _msf_refGenEnd || + _msf_verifiedLocs[genLoc-_msf_samplingLocs[o]] == readId || + _msf_verifiedLocs[genLoc-_msf_samplingLocs[o]] == -readId + ) + continue; + int err = -1; + + + map_location = 0; + + leftSeqLength = _msf_samplingLocs[o]; + middleSeqLength = WINDOW_SIZE; + a = leftSeqLength + middleSeqLength; + rightSeqLength = SEQ_LENGTH - a; + + if(errThreshold == 2) + err = verifySingleEndEditDistance2(genLoc, _tmpSeq, leftSeqLength, + _tmpSeq + a, rightSeqLength, + middleSeqLength, matrix, &map_location, _tmpHashValue); + else if(errThreshold == 4) + err = verifySingleEndEditDistance4(genLoc, _tmpSeq, leftSeqLength, + _tmpSeq + a, rightSeqLength, + middleSeqLength, matrix, &map_location, _tmpHashValue); + else if(errThreshold ==3) + err = verifySingleEndEditDistance(genLoc, _tmpSeq, leftSeqLength, + _tmpSeq + a, rightSeqLength, + middleSeqLength, matrix, &map_location, _tmpHashValue); + /*else if(errThreshold == 6) + err = verifySingleEndEditDistance(genLoc, _tmpSeq, leftSeqLength, + _tmpSeq + a, rightSeqLength, + middleSeqLength, matrix, &map_location, _tmpHashValue); + */ + else + err = verifySingleEndEditDistanceExtention(genLoc, _tmpSeq, leftSeqLength, + _tmpSeq + a, rightSeqLength, + middleSeqLength, matrix, &map_location, _tmpHashValue); + + if(err != -1) + { + generateSNPSAM(matrix, strlen(matrix), editString); + generateCigar(matrix, strlen(matrix), cigar); + } + + if(err != -1 && !bestMode) + { + + mappingCnt++; + + int j = 0; + int k = 0; + for(k = 0; k < readSegment+1; k++) + { + for(j = -errThreshold ; j <= errThreshold; j++) + { + if(genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j >= _msf_refGenBeg && + genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j <= _msf_refGenEnd) + _msf_verifiedLocs[genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j] = readId; + } + } + _msf_seqList[readNumber].hits[0]++; + + _msf_output.QNAME = _msf_seqList[readNumber].name; + _msf_output.FLAG = 16 * direction; + _msf_output.RNAME = _msf_refGenName; + _msf_output.POS = map_location + _msf_refGenOffset; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = cigar; + _msf_output.MRNAME = "*"; + _msf_output.MPOS = 0; + _msf_output.ISIZE = 0; + _msf_output.SEQ = _tmpSeq; + _msf_output.QUAL = _tmpQual; + + _msf_output.optSize = 2; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = err; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = editString; + + output(_msf_output); + + + if (_msf_seqList[readNumber].hits[0] == 1) + { + mappedSeqCnt++; + } + + if ( maxHits == 0 ) + { + _msf_seqList[readNumber].hits[0] = 2; + } + + + if ( maxHits!=0 && _msf_seqList[readNumber].hits[0] == maxHits) + { + completedSeqCnt++; + break; + } + + } + else if(err != -1 && bestMode) + { + mappingCnt++; + _msf_seqList[readNumber].hits[0]++; + + if (_msf_seqList[readNumber].hits[0] == 1) + { + mappedSeqCnt++; + } + + if ( maxHits == 0 ) + { + _msf_seqList[readNumber].hits[0] = 2; + } + + if(err < bestHitMappingInfo[readNumber].err || bestHitMappingInfo[readNumber].loc == -1) + { + setFullMappingInfo(readNumber, map_location + _msf_refGenOffset, direction, err, 0, editString, _msf_refGenName, cigar ); + } + } + else + { + for(j = -errThreshold ; j <= errThreshold; j++) + { + if(genLoc+j > _msf_refGenBeg && + genLoc+j < _msf_refGenEnd) + _msf_verifiedLocs[genLoc+j] = -readId; + } + } + } +} + + +int mapAllSingleEndSeq() +{ + int i = 0; + int j = 0; + int k = 0; + + + unsigned int *locs = NULL; + + + int prev_hash = 0; + + for(i = 0; i < _msf_seqListSize; i++) + { + for(j = 0; j < _msf_samplingLocsSize; j++) + { + k = _msf_sort_seqList[i].readNumber; +// if(j != 0) +// if(strncmp(_msf_seqList[k].seq+_msf_samplingLocs[j], _msf_seqList[k].seq+_msf_samplingLocs[j-1], segSize) == 0) +// continue; +// if(prev_hash == hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j])) +// continue; + locs = getCandidates ( hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j])); + if ( locs != NULL) + { + mapSingleEndSeq(locs+1, locs[0],k ,j, 0); + } + prev_hash = hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j]); + } + } + i = 0; + + for(i = 0; i < _msf_seqListSize; i++) + { + for(j = 0; j < _msf_samplingLocsSize; j++) + { + k = _msf_sort_seqList[i].readNumber; + +// if(j != 0) +// if(strncmp(_msf_seqList[k].rseq+_msf_samplingLocs[j], _msf_seqList[k].rseq+_msf_samplingLocs[j-1], segSize) == 0) +// continue; +// if(prev_hash == hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j])) +// continue; + locs = getCandidates ( hashVal(_msf_seqList[k].rseq+_msf_samplingLocs[j])); + if ( locs != NULL) + { + mapSingleEndSeq(locs+1, locs[0],k ,j, 1); + } + prev_hash = hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j]); + } + } + return 1; +} + + +/**********************************************/ +/**********************************************/ +/**********************************************/ +/**********************************************/ +/**********************************************/ +int compareOut (const void *a, const void *b) +{ + FullMappingInfo *aInfo = (FullMappingInfo *)a; + FullMappingInfo *bInfo = (FullMappingInfo *)b; + return aInfo->loc - bInfo->loc; +} + + + +/**********************************************/ + +/* + direction 0: Forward + 1: Reverse +*/ + +void mapPairEndSeqList(unsigned int *l1, int s1, int readNumber, int readSegment, int direction) +{ + int z = 0; + int *locs = (int *) l1; + char *_tmpSeq; + + char rqual[SEQ_LENGTH+1]; + + char matrix[200]; + char editString[200]; + char cigar[MAX_CIGAR_SIZE]; + + short *_tmpHashValue; + + int leftSeqLength = 0; + int middleSeqLength = 0; + int rightSeqLength =0; + int a = 0; + + rqual[SEQ_LENGTH]='\0'; + + + int r = readNumber; + + char d = (direction==1)?-1:1; + + if (d==-1) + { + _tmpSeq = _msf_seqList[readNumber].rseq; + _tmpHashValue = _msf_seqList[r].rhashValue; + } + else + { + _tmpSeq = _msf_seqList[readNumber].seq; + _tmpHashValue = _msf_seqList[r].hashValue; + } + + int readId = 2*readNumber+direction; + for (z=0; z _msf_refGenEnd || + _msf_verifiedLocs[genLoc-_msf_samplingLocs[o]] == readId || _msf_verifiedLocs[genLoc-_msf_samplingLocs[o]] == -readId) + continue; + + if(errThreshold == 2) + err = verifySingleEndEditDistance2(genLoc, _tmpSeq, leftSeqLength, + _tmpSeq + a, rightSeqLength, + middleSeqLength, matrix, &map_location, _tmpHashValue); + else if(errThreshold == 4) + err = verifySingleEndEditDistance4(genLoc, _tmpSeq, leftSeqLength, + _tmpSeq + a, rightSeqLength, + middleSeqLength, matrix, &map_location, _tmpHashValue); + else if(errThreshold ==3) + err = verifySingleEndEditDistance(genLoc, _tmpSeq, leftSeqLength, + _tmpSeq + a, rightSeqLength, + middleSeqLength, matrix, &map_location, _tmpHashValue); + /*else if(errThreshold == 6) + err = verifySingleEndEditDistance(genLoc, _tmpSeq, leftSeqLength, + _tmpSeq + a, rightSeqLength, + middleSeqLength, matrix, &map_location, _tmpHashValue);*/ + else + err = verifySingleEndEditDistanceExtention(genLoc, _tmpSeq, leftSeqLength, + _tmpSeq + a, rightSeqLength, + middleSeqLength, matrix, &map_location, _tmpHashValue); + + + if (err != -1) + { + int j = 0; + int k = 0; + + for(k = 0; k < readSegment+1; k++) + { + for(j = -errThreshold ; j <= errThreshold; j++) + { + if(genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j >= _msf_refGenBeg && + genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j <= _msf_refGenEnd) + _msf_verifiedLocs[genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j] = readId; + } + } + + + generateSNPSAM(matrix, strlen(matrix), editString); + generateCigar(matrix, strlen(matrix), cigar); + + MappingLocations *parent = NULL; + MappingLocations *child = _msf_mappingInfo[r].next; + + genLoc = map_location + _msf_refGenOffset; + int i = 0; + for (i=0; i<(_msf_mappingInfo[r].size/MAP_CHUNKS); i++) + { + parent = child; + child = child->next; + } + + if (child==NULL) + { + MappingLocations *tmp = getMem(sizeof(MappingLocations)); + + tmp->next = NULL; + tmp->loc[0]=genLoc * d; + tmp->err[0]=err; + + tmp->cigarSize[0] = strlen(cigar); + sprintf(tmp->cigar[0],"%s", cigar); + + tmp->mdSize[0] = strlen(editString); + sprintf(tmp->md[0],"%s", editString); + + if (parent == NULL) + _msf_mappingInfo[r].next = tmp; + else + parent->next = tmp; + } + else + { + if(strlen(cigar) > SEQ_LENGTH || strlen(editString) > SEQ_LENGTH) + { + printf("ERROR in %d read size(After mapping) exceedes cigar=%d md =%d cigar=%s md =%s\n", r, (int)strlen(cigar), (int)strlen(editString), cigar, editString); + } + + child->loc[_msf_mappingInfo[r].size % MAP_CHUNKS] = genLoc * d; + child->err[_msf_mappingInfo[r].size % MAP_CHUNKS] = err; + + child->cigarSize[_msf_mappingInfo[r].size % MAP_CHUNKS] = strlen(cigar); + sprintf(child->cigar[_msf_mappingInfo[r].size % MAP_CHUNKS],"%s",cigar); + + child->mdSize[_msf_mappingInfo[r].size % MAP_CHUNKS] = strlen(editString); + sprintf(child->md[_msf_mappingInfo[r].size % MAP_CHUNKS],"%s",editString); + } + _msf_mappingInfo[r].size++; + + } + else + { + _msf_verifiedLocs[genLoc] = -readId; + } + + } +} + +/**********************************************/ +void mapPairedEndSeq() +{ + int i = 0; + int j = 0; + int k = 0; + + unsigned int *locs = NULL; + while ( i < _msf_seqListSize ) + { + for(j = 0; j < _msf_samplingLocsSize; j++) + { + k = _msf_sort_seqList[i].readNumber; + locs = getCandidates ( hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j])); + if ( locs != NULL) + { + mapPairEndSeqList(locs+1, locs[0],k ,j, 0); + } + } + i++; + } + i = 0; + + while ( i < _msf_seqListSize ) + { + for(j = 0; j < _msf_samplingLocsSize; j++) + { + k = _msf_sort_seqList[i].readNumber; + locs = getCandidates ( hashVal(_msf_seqList[k].rseq+_msf_samplingLocs[j])); + if ( locs != NULL) + { + mapPairEndSeqList(locs+1, locs[0],k ,j, 1); + } + } + + i++; + } + char fname1[FILE_NAME_LENGTH]; + char fname2[FILE_NAME_LENGTH]; + MappingLocations *cur; + int tmpOut; + int lmax=0, rmax=0; + + sprintf(fname1, "%s__%s__%s__%d__1.tmp",mappingOutputPath, _msf_refGenName, mappingOutput, _msf_openFiles); + sprintf(fname2, "%s__%s__%s__%d__2.tmp",mappingOutputPath, _msf_refGenName, mappingOutput, _msf_openFiles); + + FILE* out; + FILE* out1 = fileOpen(fname1, "w"); + FILE* out2 = fileOpen(fname2, "w"); + + _msf_openFiles++; + + for (i=0; i<_msf_seqListSize; i++) + { + + if (i%2==0) + { + out = out1; + + if (lmax < _msf_mappingInfo[i].size) + { + lmax = _msf_mappingInfo[i].size; + } + } + else + { + out = out2; + if (rmax < _msf_mappingInfo[i].size) + { + rmax = _msf_mappingInfo[i].size; + } + } + + tmpOut = fwrite(&(_msf_mappingInfo[i].size), sizeof(int), 1, out); + if (_msf_mappingInfo[i].size > 0) + { + cur = _msf_mappingInfo[i].next; + for (j=0; j < _msf_mappingInfo[i].size; j++) + { + if ( j>0 && j%MAP_CHUNKS==0) + { + cur = cur->next; + } + if(cur->cigarSize[j % MAP_CHUNKS] > SEQ_LENGTH || cur->mdSize[j % MAP_CHUNKS] > SEQ_LENGTH) + { + printf("ERROR in %d read size exceeds cigar=%d md =%d cigar=%s md =%s\n", i, cur->cigarSize[j % MAP_CHUNKS], cur->mdSize[j % MAP_CHUNKS], cur->cigar[j % MAP_CHUNKS], cur->md[j % MAP_CHUNKS]); + } + + tmpOut = fwrite(&(cur->loc[j % MAP_CHUNKS]), sizeof(int), 1, out); + + tmpOut = fwrite(&(cur->err[j % MAP_CHUNKS]), sizeof(int), 1, out); + + tmpOut = fwrite(&(cur->cigarSize[j % MAP_CHUNKS]), sizeof(int), 1, out); + tmpOut = fwrite((cur->cigar[j % MAP_CHUNKS]), sizeof(char), (cur->cigarSize[j % MAP_CHUNKS]), out); + + tmpOut = fwrite(&(cur->mdSize[j % MAP_CHUNKS]), sizeof(int), 1, out); + tmpOut = fwrite((cur->md[j % MAP_CHUNKS]), sizeof(char), (cur->mdSize[j % MAP_CHUNKS]), out); + + } + _msf_mappingInfo[i].size = 0; + //_msf_mappingInfo[i].next = NULL; + } + } + + _msf_maxLSize += lmax; + _msf_maxRSize += rmax; + + fclose(out1); + fclose(out2); + +} + +void outputPairFullMappingInfo(FILE *fp, int readNumber) +{ + + char *seq1, *seq2, *rseq1, *rseq2, *qual1, *qual2; + char rqual1[SEQ_LENGTH+1], rqual2[SEQ_LENGTH+1]; + + rqual1[SEQ_LENGTH] = rqual2[SEQ_LENGTH] = '\0'; + + seq1 = _msf_seqList[readNumber*2].seq; + rseq1 = _msf_seqList[readNumber*2].rseq; + qual1 = _msf_seqList[readNumber*2].qual; + + reverse(_msf_seqList[readNumber*2].qual, rqual1, SEQ_LENGTH); + + seq2 = _msf_seqList[readNumber*2+1].seq; + rseq2 = _msf_seqList[readNumber*2+1].rseq; + qual2 = _msf_seqList[readNumber*2+1].qual; + + reverse(_msf_seqList[readNumber*2+1].qual, rqual2, SEQ_LENGTH); + + + if(bestHitMappingInfo[readNumber*2].loc == -1 && bestHitMappingInfo[readNumber*2+1].loc == -1) + return; + else + { + + char *seq; + char *qual; + char d1; + char d2; + int isize; + int proper=0; + // ISIZE CALCULATION + // The distance between outer edges + isize = abs(bestHitMappingInfo[readNumber*2].loc - bestHitMappingInfo[readNumber*2+1].loc)+SEQ_LENGTH - 2; + + if (bestHitMappingInfo[readNumber*2].loc - bestHitMappingInfo[readNumber*2+1].loc > 0) + { + isize *= -1; + } + d1 = (bestHitMappingInfo[readNumber*2].dir == -1)?1:0; + d2 = (bestHitMappingInfo[readNumber*2+1].dir == -1)?1:0; + + if ( d1 ) + { + seq = rseq1; + qual = rqual1; + } + else + { + seq = seq1; + qual = qual1; + } + if ( (bestHitMappingInfo[readNumber*2].loc < bestHitMappingInfo[readNumber*2+1].loc && !d1 && d2) || + (bestHitMappingInfo[readNumber*2].loc > bestHitMappingInfo[readNumber*2+1].loc && d1 && !d2) ) + { + proper = 2; + } + else + { + proper = 0; + } + + _msf_output.POS = bestHitMappingInfo[readNumber*2].loc; + _msf_output.MPOS = bestHitMappingInfo[readNumber*2+1].loc; + _msf_output.FLAG = 1+proper+16*d1+32*d2+64; + _msf_output.ISIZE = isize; + _msf_output.SEQ = seq, + _msf_output.QUAL = qual; + _msf_output.QNAME = _msf_seqList[readNumber*2].name; + _msf_output.RNAME = bestHitMappingInfo[readNumber*2].chr; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = bestHitMappingInfo[readNumber*2].cigar; + _msf_output.MRNAME = "="; + + _msf_output.optSize = 2; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = bestHitMappingInfo[readNumber*2].err; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = bestHitMappingInfo[readNumber*2].md; + + outputSAM(fp, _msf_output); + output(_msf_output); + + if ( d2 ) + { + seq = rseq2; + qual = rqual2; + } + else + { + seq = seq2; + qual = qual2; + } + + _msf_output.POS = bestHitMappingInfo[readNumber*2+1].loc; + _msf_output.MPOS = bestHitMappingInfo[readNumber*2].loc; + _msf_output.FLAG = 1+proper+16*d2+32*d1+128; + _msf_output.ISIZE = -isize; + _msf_output.SEQ = seq, + _msf_output.QUAL = qual; + _msf_output.QNAME = _msf_seqList[readNumber*2].name; + _msf_output.RNAME = bestHitMappingInfo[readNumber*2].chr; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = bestHitMappingInfo[readNumber*2+1].cigar; + _msf_output.MRNAME = "="; + + _msf_output.optSize = 2; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = bestHitMappingInfo[readNumber*2+1].err; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = bestHitMappingInfo[readNumber*2+1].md; + + outputSAM(fp, _msf_output); + output(_msf_output); + } +} + + +/* +Find the closet one to the c + @return 0: if the x1 is closer to c + 1: if the x2 is closer to c + 2: if both distance are equal + -1: if error +*/ +int findNearest(int x1, int x2, int c) +{ + + if (abs(x1 - c) > abs(x2 - c) ) + return 0; + else if ( abs(x1 - c) < abs(x2 - c) ) + return 1; + else if ( abs(x1 - c) == abs(x2 - c) ) + return 2; + else + return -1; +} + +void initBestConcordantDiscordant(int readNumber) +{ + char bestConcordantFileName[FILE_NAME_LENGTH]; + char bestDiscordantFileName[FILE_NAME_LENGTH]; + + //OPEN THE BEST CONCORDANT FILE + //BEGIN{Farhad Hormozdiari} + sprintf(bestConcordantFileName, "%s%s__BEST.CONCORDANT", mappingOutputPath, mappingOutput); + bestConcordantFILE = fileOpen(bestConcordantFileName, "w"); + //END{Farhad Hormozdiari} + + + //OPEN THE BEST DISCORDANT FILE + //BEGIN{Farhad Hormozdiari} + sprintf(bestDiscordantFileName, "%s%s__BEST.DISCORDANT", mappingOutputPath, mappingOutput); + bestDiscordantFILE = fileOpen(bestDiscordantFileName, "w"); + //END{Farhad Hormozdiari} + + initBestMapping(readNumber); +} + +void finalizeBestConcordantDiscordant() +{ + int i = 0; + + for(i = 0; i<_msf_seqListSize/2; i++) + { + if(_msf_readHasConcordantMapping[i]==1) + outputPairFullMappingInfo(bestConcordantFILE, i); + else + outputPairFullMappingInfo(bestDiscordantFILE, i); + } + + fclose(bestConcordantFILE); + fclose(bestDiscordantFILE); + + freeMem(bestHitMappingInfo, _msf_seqListSize * sizeof(FullMappingInfo)); +} + +void setFullMappingInfo(int readNumber, int loc, int dir, int err, int score, char *md, char * refName, char *cigar) +{ + bestHitMappingInfo[readNumber].loc = loc; + bestHitMappingInfo[readNumber].dir = dir; + bestHitMappingInfo[readNumber].err = err; + bestHitMappingInfo[readNumber].score = score; + + strncpy(bestHitMappingInfo[readNumber].md, md, strlen(md)); + strncpy(bestHitMappingInfo[readNumber].chr, refName, strlen(refName)); + strncpy(bestHitMappingInfo[readNumber].cigar, cigar, strlen(cigar)); +} + + +void setPairFullMappingInfo(int readNumber, FullMappingInfo mi1, FullMappingInfo mi2) +{ + + bestHitMappingInfo[readNumber*2].loc = mi1.loc; + bestHitMappingInfo[readNumber*2].dir = mi1.dir; + bestHitMappingInfo[readNumber*2].err = mi1.err; + bestHitMappingInfo[readNumber*2].score = mi1.score; + snprintf(bestHitMappingInfo[readNumber*2].chr, MAX_REF_SIZE, "%s", _msf_refGenName); + + strncpy(bestHitMappingInfo[readNumber*2].md, mi1.md, strlen(mi1.md)); + strncpy(bestHitMappingInfo[readNumber*2].cigar, mi1.cigar, strlen(mi1.cigar)); + + /* + sprintf(bestHitMappingInfo[readNumber*2].md, "%s\0", mi1.md); + sprintf(bestHitMappingInfo[readNumber*2].cigar, "%s\0", mi1.cigar); + */ + + + bestHitMappingInfo[readNumber*2+1].loc = mi2.loc; + bestHitMappingInfo[readNumber*2+1].dir = mi2.dir; + bestHitMappingInfo[readNumber*2+1].err = mi2.err; + bestHitMappingInfo[readNumber*2+1].score = mi2.score; + + snprintf(bestHitMappingInfo[readNumber*2+1].chr, MAX_REF_SIZE, "%s", _msf_refGenName); + + /* + sprintf(bestHitMappingInfo[readNumber*2+1].md, "%s\0", mi2.md); + sprintf(bestHitMappingInfo[readNumber*2+1].cigar, "%s\0", mi2.cigar); + */ + + strncpy(bestHitMappingInfo[readNumber*2+1].md, mi2.md, strlen(mi2.md)); + strncpy(bestHitMappingInfo[readNumber*2+1].cigar, mi2.cigar, strlen(mi2.cigar)); + +} + +/**********************************************/ +void outputPairedEnd() +{ + int i = 0; + + char cigar[MAX_CIGAR_SIZE]; + + int tmpOut; + + loadRefGenome(&_msf_refGen, &_msf_refGenName, &tmpOut); + + FILE* in1[_msf_openFiles]; + FILE* in2[_msf_openFiles]; + + char fname1[_msf_openFiles][FILE_NAME_LENGTH]; + char fname2[_msf_openFiles][FILE_NAME_LENGTH]; + + // discordant + FILE *out=NULL, *out1=NULL; + + char fname3[FILE_NAME_LENGTH]; + char fname4[FILE_NAME_LENGTH]; + + int meanDistanceMapping = 0; + + char *rqual1; + char *rqual2; + + rqual1 = getMem((SEQ_LENGTH+1)*sizeof(char)); + rqual2 = getMem((SEQ_LENGTH+1)*sizeof(char)); + + if (pairedEndDiscordantMode) + { + sprintf(fname3, "%s__%s__disc", mappingOutputPath, mappingOutput); + sprintf(fname4, "%s__%s__oea", mappingOutputPath, mappingOutput); + out = fileOpen(fname3, "a"); + out1 = fileOpen(fname4, "a"); + } + + FullMappingInfo *mi1 = getMem(sizeof(FullMappingInfo) * _msf_maxLSize); + FullMappingInfo *mi2 = getMem(sizeof(FullMappingInfo) * _msf_maxRSize); + + _msf_fileCount[_msf_maxFile] = 0; + for (i=0; i<_msf_openFiles; i++) + { + sprintf(fname1[i], "%s__%s__%s__%d__1.tmp", mappingOutputPath, _msf_refGenName, mappingOutput, i); + sprintf(_msf_fileName[_msf_maxFile][_msf_fileCount[_msf_maxFile]][0], "%s", fname1[i]); + + sprintf(fname2[i], "%s__%s__%s__%d__2.tmp", mappingOutputPath, _msf_refGenName, mappingOutput, i); + sprintf(_msf_fileName[_msf_maxFile][_msf_fileCount[_msf_maxFile]][1], "%s", fname2[i]); + + in1[i] = fileOpen(fname1[i], "r"); + in2[i] = fileOpen(fname2[i], "r"); + _msf_fileCount[_msf_maxFile]++; + } + _msf_maxFile++; + + int size; + int j, k; + int size1, size2; + + meanDistanceMapping = (pairedEndDiscordantMode==1)? (minPairEndedDiscordantDistance+maxPairEndedDiscordantDistance)/2 + SEQ_LENGTH + : (minPairEndedDistance + maxPairEndedDistance) / 2 + SEQ_LENGTH; + + for (i=0; i<_msf_seqListSize/2; i++) + { + size1 = size2 = 0; + for (j=0; j<_msf_openFiles; j++) + { + tmpOut = fread(&size, sizeof(int), 1, in1[j]); + if ( size > 0 ) + { + for (k=0; k 0 ) + { + for (k=0; k= rl) + { + if ( (mi1[j].loc < mi2[k].loc && mi1[j].dir==1 && mi2[k].dir == -1) || + (mi1[j].loc > mi2[k].loc && mi1[j].dir==-1 && mi2[k].dir == 1) ) + { + _msf_seqList[i*2].hits[0]=1; + _msf_seqList[i*2+1].hits[0]=1; + + if(nosamMode != 0) + { + size1=0; + size2=0; + } + + break; + } + } + k++; + } + } + + _msf_seqHits[i*2] += size1; + _msf_seqHits[i*2+1] += size2; + + + if (_msf_seqHits[i*2+1] * _msf_seqHits[i*2] > DISCORDANT_CUT_OFF && nosamMode != 0) + { + _msf_seqList[i*2].hits[0]=1; + _msf_seqList[i*2+1].hits[0]=1; + size1=0; + size2=0; + } + + + + + int tmp = 0; + int rNo = 0; + int loc = 0; + int err = 0; + float sc = 0; + char l = 0; + + //write the OEA data + if(_msf_seqHits[i*2] == 0 ) + { + for(k = 0;k < size2 && _msf_oeaMapping[i*2+1] < maxOEAOutput ;k++) + { + rNo = i*2+1; + loc = mi2[k].loc*mi2[k].dir; + err = mi2[k].err; + sc = mi2[k].score; + + l = strlen(_msf_refGenName); + + tmp = fwrite(&rNo, sizeof(int), 1, out1); + + tmp = fwrite(&l, sizeof(char), 1, out1); + tmp = fwrite(_msf_refGenName, sizeof(char), l, out1); + + tmp = fwrite(&loc, sizeof(int), 1, out1); + tmp = fwrite(&err, sizeof(int), 1, out1); + tmp = fwrite(&sc, sizeof(float), 1, out1); + + if(mi2[k].cigarSize > SEQ_LENGTH || mi2[k].cigarSize <= 0) + printf("ERROR CIGAR size=%d %s\n", mi2[k].cigarSize, _msf_seqList[i*2+1].seq); + + tmp = fwrite (&(mi2[k].cigarSize), sizeof(int), 1, out1); + tmp = fwrite ((mi2[k].cigar), sizeof(char), mi2[k].cigarSize, out1); + + tmp = fwrite (&(mi2[k].mdSize), sizeof(int), 1, out1); + tmp = fwrite ((mi2[k].md), sizeof(char), mi2[k].mdSize, out1); + + _msf_oeaMapping[i*2+1]++; + } + } + if(_msf_seqHits[i*2+1] == 0) + { + for(j = 0;j < size1 && _msf_oeaMapping[i*2] < maxOEAOutput;j++) + { + rNo = i*2; + loc = mi1[j].loc*mi1[j].dir; + err = mi1[j].err; + sc = mi1[j].score; + + l = strlen(_msf_refGenName); + + tmp = fwrite(&rNo, sizeof(int), 1, out1); + + tmp = fwrite(&l, sizeof(char), 1, out1); + tmp = fwrite(_msf_refGenName, sizeof(char), l, out1); + + tmp = fwrite(&loc, sizeof(int), 1, out1); + tmp = fwrite(&err, sizeof(int), 1, out1); + tmp = fwrite(&sc, sizeof(float), 1, out1); + + if(mi1[j].cigarSize > SEQ_LENGTH || mi1[j].cigarSize <= 0 ) + printf("ERROR %d %s\n", mi1[j].cigarSize, _msf_seqList[i*2+1].seq); + + tmp = fwrite (&(mi1[j].cigarSize), sizeof(int), 1, out1); + tmp = fwrite ((mi1[j].cigar), sizeof(char), mi1[j].cigarSize, out1); + + tmp = fwrite (&(mi1[j].mdSize), sizeof(int), 1, out1); + tmp = fwrite ((mi1[j].md), sizeof(char), mi1[j].mdSize, out1); + + _msf_oeaMapping[i*2]++; + } + } + } + + char *seq1, *seq2, *rseq1, *rseq2, *qual1, *qual2; + + + + + rqual1[SEQ_LENGTH] = '\0'; + rqual2[SEQ_LENGTH] = '\0'; + rqual1[0] = '\0'; + rqual2[0] = '\0'; + + + + seq1 = _msf_seqList[i*2].seq; + rseq1 = _msf_seqList[i*2].rseq; + qual1 = _msf_seqList[i*2].qual; + + + + strncpy(rqual1, _msf_seqList[i*2].qual, SEQ_LENGTH); + + seq2 = _msf_seqList[i*2+1].seq; + rseq2 = _msf_seqList[i*2+1].rseq; + qual2 = _msf_seqList[i*2+1].qual; + + + strncpy(rqual2, _msf_seqList[i*2+1].qual, SEQ_LENGTH); + + if (pairedEndDiscordantMode) + { + for (k=0; k= minPairEndedDiscordantDistance && + mi2[k].loc-mi1[j].loc <= maxPairEndedDiscordantDistance && + mi1[j].dir > 0 && mi2[k].dir < 0 ) + + || + + (mi1[j].loc-mi2[k].loc >= minPairEndedDiscordantDistance && + mi1[j].loc-mi2[k].loc <= maxPairEndedDiscordantDistance && + mi1[j].dir < 0 && mi2[k].dir > 0) + ) + { + //POSSIBLE CONCORDANT + if(_msf_readHasConcordantMapping[i] == 0) + { + setPairFullMappingInfo(i, mi1[j], mi2[k]); + _msf_readHasConcordantMapping[i] = 1; + _msf_seqList[i*2].hits[0] = 1; + _msf_seqList[i*2+1].hits[0] = 1; + } + else + { + if(bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err >= mi1[j].err + mi2[k].err) + { + + if( bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err == + mi1[j].err + mi2[k].err && + findNearest(abs(bestHitMappingInfo[i*2+1].loc - bestHitMappingInfo[i*2].loc), + abs(mi2[k].loc - mi1[j].loc), + meanDistanceMapping + ) == 0 ) + { + continue; + } + setPairFullMappingInfo(i, mi1[j], mi2[k]); + } + } + } + //DISCORDANT TO TEMP FILE FOR POST PROCESSIING + else if(_msf_readHasConcordantMapping[i] == 0 && + _msf_seqHits[i*2] != 0 && + _msf_seqHits[i*2+1] != 0) + { + + int tmp; + int rNo = i; + int loc = mi1[j].loc*mi1[j].dir; + int err = mi1[j].err; + float sc = mi1[j].score; + + char l = strlen(_msf_refGenName); + + if(_msf_discordantMapping[i*2] < maxDiscordantOutput) + { + + tmp = fwrite(&rNo, sizeof(int), 1, out); + + tmp = fwrite(&l, sizeof(char), 1, out); + tmp = fwrite(_msf_refGenName, sizeof(char), l, out); + + tmp = fwrite(&loc, sizeof(int), 1, out); + tmp = fwrite(&err, sizeof(int), 1, out); + tmp = fwrite(&sc, sizeof(float), 1, out); + + tmp = fwrite (&(mi1[j].cigarSize), sizeof(int), 1, out); + tmp = fwrite ((mi1[j].cigar), sizeof(char), mi1[j].cigarSize, out); + + tmp = fwrite (&(mi1[j].mdSize), sizeof(int), 1, out); + tmp = fwrite ((mi1[j].md), sizeof(char), mi1[j].mdSize, out); + + + loc = mi2[k].loc*mi2[k].dir; + err = mi2[k].err; + sc = mi2[k].score; + + tmp = fwrite(&loc, sizeof(int), 1, out); + tmp = fwrite(&err, sizeof(int), 1, out); + tmp = fwrite(&sc, sizeof(float), 1, out); + + tmp = fwrite (&(mi2[k].cigarSize), sizeof(int), 1, out); + tmp = fwrite ((mi2[k].cigar), sizeof(char), mi2[k].cigarSize, out); + + tmp = fwrite (&(mi2[k].mdSize), sizeof(int), 1, out); + tmp = fwrite ((mi2[k].md), sizeof(char), mi2[k].mdSize, out); + + + _msf_discordantMapping[i*2]++; + } + //SET THE BEST DISCORDANT + //BEGIN {Farhad Hormozdiari} + if( bestHitMappingInfo[i*2].loc == -1 && + bestHitMappingInfo[i*2+1].loc == -1 && + _msf_readHasConcordantMapping[i] == 0) + { + setPairFullMappingInfo(i, mi1[j], mi2[k]); + _msf_seqList[i*2].hits[0] = 1; + _msf_seqList[i*2+1].hits[0] = 1; + } + else if( bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err >= mi1[j].err + mi2[k].err + && _msf_readHasConcordantMapping[i] == 0) + { + if(bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err == mi1[j].err + mi2[k].err && + findNearest( abs(bestHitMappingInfo[i*2+1].loc - bestHitMappingInfo[i*2].loc), + abs(mi1[j].loc - mi2[k].loc), + meanDistanceMapping + ) == 0 + ) + { + continue; + } + setPairFullMappingInfo(i, mi1[j], mi2[k]); + } + //END {Farhad Hormozdiari} + } + } + } + } + else + { + for (j=0; j= minPairEndedDistance && + mi2[k].loc-mi1[j].loc <= maxPairEndedDistance && + mi1[j].dir > 0 && mi2[k].dir < 0) + || + (mi1[j].loc-mi2[k].loc >= minPairEndedDistance && + mi1[j].loc-mi2[k].loc <= maxPairEndedDistance && + mi1[j].dir < 0 && mi2[k].dir > 0) + ) + { + char *seq; + char *qual; + char d1; + char d2; + int isize; + int proper=0; + // ISIZE CALCULATION + // The distance between outer edges + isize = abs(mi1[j].loc - mi2[k].loc)+SEQ_LENGTH-2; + if (mi1[j].loc - mi2[k].loc > 0) + { + isize *= -1; + } + + d1 = (mi1[j].dir == -1)?1:0; + d2 = (mi2[k].dir == -1)?1:0; + + //SET THE READ HAS CONCORDANT MAPPING + _msf_readHasConcordantMapping[i] = 1; + + if ( d1 ) + { + seq = rseq1; + qual = rqual1; + } + else + { + seq = seq1; + qual = qual1; + } + + if ((mi1[j].loc < mi2[k].loc && !d1 && d2) || + (mi1[j].loc > mi2[k].loc && d1 && !d2) ) + { + proper = 2; + } + else + { + proper = 0; + } + + + _msf_output.POS = mi1[j].loc; + _msf_output.MPOS = mi2[k].loc; + _msf_output.FLAG = 1+proper+16*d1+32*d2+64; + _msf_output.ISIZE = isize; + _msf_output.SEQ = seq, + _msf_output.QUAL = qual; + _msf_output.QNAME = _msf_seqList[i*2].name; + _msf_output.RNAME = _msf_refGenName; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = cigar; + _msf_output.MRNAME = "="; + + _msf_output.optSize = 2; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = mi1[j].err; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = mi1[j].md; + + if(!bestMode) + output(_msf_output); + + if ( d2 ) + { + seq = rseq2; + qual = rqual2; + } + else + { + seq = seq2; + qual = qual2; + } + + _msf_output.POS = mi2[k].loc; + _msf_output.MPOS = mi1[j].loc; + _msf_output.FLAG = 1+proper+16*d2+32*d1+128; + _msf_output.ISIZE = -isize; + _msf_output.SEQ = seq, + _msf_output.QUAL = qual; + _msf_output.QNAME = _msf_seqList[i*2].name; + _msf_output.RNAME = _msf_refGenName; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = cigar; + _msf_output.MRNAME = "="; + + _msf_output.optSize = 2; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = mi2[k].err;; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = mi2[k].md; + + if(!bestMode) + output(_msf_output); + //SET THE BEST CONCORDANT + //BEGIN {Farhad Hormozdiari} + if(bestHitMappingInfo[i*2].loc == -1 && bestHitMappingInfo[i*2+1].loc == -1) + { + setPairFullMappingInfo(i, mi1[j], mi2[k]); + } + else + { + if(bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err >= mi1[j].err + mi2[k].err) + { + + if( bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err == mi1[j].err + mi2[k].err && + findNearest(abs(bestHitMappingInfo[i*2+1].loc - bestHitMappingInfo[i*2].loc), + abs(mi2[k].loc - mi1[j].loc), + meanDistanceMapping + ) == 0 ) + { + continue; + } + setPairFullMappingInfo(i, mi1[j], mi2[k]); + } + } + //END {Farhad Hormozdiari} + } + } + } + + } + } + + freeMem(rqual1, 0); + freeMem(rqual2, 0); + + if (pairedEndDiscordantMode) + { + fclose(out); + fclose(out1); + } + + for (i=0; i<_msf_openFiles; i++) + { + fclose(in1[i]); + fclose(in2[i]); + + unlink(fname1[i]); + unlink(fname2[i]); + } + + freeMem(mi1, sizeof(FullMappingInfo)*_msf_maxLSize); + freeMem(mi2, sizeof(FullMappingInfo)*_msf_maxRSize); + + _msf_openFiles = 0; +} + +/**********************************************/ +/**********************************************/ +/**********************************************/ +/**********************************************/ +float str2int(char *str, int index1, int index2) +{ + char tmp[200]; + strncpy(tmp, &str[index1], index2-index1); + tmp[index2-index1] = '\0'; + return atol(tmp); +} + +float calculateScore(int index, char *seq, char *qual,char *md) +{ + int i; + int j; + char *ref; + char *ver; + + ref = _msf_refGen + index-1; + ver = seq; + float score = 1; + + char tmp[200]; + int value = 0; + int end = 0; + int index1 = 0; + int index2 = 0; + + i=0; + while(1) + { + + if(i>=strlen(md)) + break; + + index1 = i; + + while(md[i] >='0' && md[i]<='9') + { + i++; + } + + index2 = i; + + value = str2int(md, index1,index2); + + if(md[i]=='M') + { + for(j=0;j= '0' && cigar[i] <= '9') + { + start = i; + + while(cigar[i] >= '0' && cigar[i] <= '9' && i < cigar_size) + i++; + + int value = matoi(cigar, start, i); + for(j = 0; j < value; j++) + { + if(cigar[i] == 'M') + matrix[size] = 'M'; + else if(cigar[i] == 'D') + matrix[size] ='D'; + else if(cigar[i] == 'I') + matrix[size] = 'I'; + size++; + } + } + i++; + } + matrix[size] = '\0'; +} + + + +void convertMDToMatrix(char *md, int md_size, char * matrix) +{ + int i = 0; + int j = 0; + + int start = 0; + int size = 0; + + matrix[0] = '\0'; + + while(i < md_size) + { + if(md[i] >= '0' && md[i] <= '9') + { + start = i; + + while(md[i] >= '0' && md[i] <= '9' && i < md_size) + i++; + + int value = matoi(md, start, i); + for(j = 0; j < value; j++) + { + matrix[size] = 'M'; + size++; + } + i--; + } + else if(md[i] == '^') + { + matrix[size] = 'D'; + size++; + } + else + { + matrix[size] = md[i]; + size++; + } + //size++; + i++; + } + matrix[size] = '\0'; +} + + +void convertMDCigarToMatrix(char *cigar, int cigar_size, char *md, int md_size, char *matrix) +{ + int i = 0; + int j = 0; + + int size = 0; + + char tmp1[200]; + char tmp2[200]; + convertMDToMatrix(md,md_size, tmp2); + + convertCigarToMatrix(cigar, cigar_size,tmp1); + + + + while(i < strlen(tmp1)) + { + if(tmp1[i]=='M') + { + if(j < strlen(tmp2)) + { + if(tmp2[j]=='M') + { + matrix[size]='M'; + size++; + } + if(tmp2[j]!='M') + { + matrix[size]=tmp2[j]; + size++; + } + } + else + { + matrix[size]='M'; + size++; + } + } + else if(tmp1[i] == 'D') + { + matrix[size]='D'; + size++; + j++; + matrix[size]=tmp2[j]; + size++; + + } + else if(tmp1[i] == 'I') + { + matrix[size]='I'; + size++; + } + + i++; + if(j < strlen(tmp2)) + j++; + } + + if(strlen(tmp1)) + + matrix[size] = '\0'; + +} + +void convertInsertion(char * in_matrix, char * seq, char *out_matrix) +{ + int i = 0; + int j = 0; + int size = 0; + + while( i < strlen(in_matrix)) + { + if(in_matrix[i] == 'M') + { + out_matrix[size] = 'M'; + size++; + j++; + } + else if(in_matrix[i] == 'D') + { + out_matrix[size] = 'D'; + size++; + + i++; + j++; + + out_matrix[size] = seq[j]; + j++; + size++; + } + else if(in_matrix[i] == 'I') + { + out_matrix[size] = 'I'; + size++; + out_matrix[size] = seq[j]; + size++; + j++; + } + else + { + out_matrix[size] = in_matrix[i]; + size++; + j++; + } + i++; + } + out_matrix[size] = '\0'; +} + +/**********************************************/ +void outputPairedEndDiscPP() +{ + char tmp_matrix1[200]; + char tmp_matrix2[200]; + + char matrix1[200]; + char matrix2[200]; + + char cigar1[200]; + char editString1[200]; + + char cigar2[200]; + char editString2[200]; + + char seq1[SEQ_LENGTH+1]; + char qual1[SEQ_LENGTH+1]; + + char seq2[SEQ_LENGTH+1]; + char qual2[SEQ_LENGTH+1]; + + char genName[SEQ_LENGTH]; + char fname1[FILE_NAME_LENGTH]; + char fname2[FILE_NAME_LENGTH]; + char l; + int l_size; + int loc1, loc2; + int err1, err2; + char dir1, dir2; + float sc1, sc2, lsc=0; + int flag = 0; + int rNo,lrNo = -1; + int tmp; + FILE *in, *out; + + sprintf(fname1, "%s__%s__disc", mappingOutputPath, mappingOutput); + sprintf(fname2, "%s%s_DIVET.vh", mappingOutputPath, mappingOutput); + + in = fileOpen(fname1, "r"); + out = fileOpen(fname2, "w"); + + if (in != NULL) + { + flag = fread(&rNo, sizeof(int), 1, in); + } + else + { + flag = 0; + } + + seq1[SEQ_LENGTH] = '\0'; + qual1[SEQ_LENGTH] = '\0'; + + seq2[SEQ_LENGTH] = '\0'; + qual2[SEQ_LENGTH] = '\0'; + + while (flag) + { + tmp = fread(&l, sizeof(char), 1, in); + tmp = fread(genName, sizeof(char), l, in); + genName[(int)l]='\0'; + tmp = fread(&loc1, sizeof(int), 1, in); + tmp = fread(&err1, sizeof(int), 1, in); + tmp = fread(&sc1, sizeof(float), 1, in); + + //tmp = fwrite (&(mi2[k].cigarSize), sizeof(int), 1, out); + + tmp = fread(&l_size, sizeof(int), 1, in); + tmp = fread(cigar1, sizeof(char), l_size, in); + cigar1[(int)l_size]='\0'; + //tmp = fwrite ((mi2[k].cigar), sizeof(char), mi2[k].cigarSize, out); + + //tmp = fwrite (&(mi2[k].mdSize), sizeof(int), 1, out); + tmp = fread(&l_size, sizeof(int), 1, in); + tmp = fread(editString1, sizeof(char), l_size, in); + editString1[(int)l_size]='\0'; + //tmp = fwrite ((mi2[k].md), sizeof(char), mi2[k].mdSize, out); + + tmp = fread(&loc2, sizeof(int), 1, in); + tmp = fread(&err2, sizeof(int), 1, in); + tmp = fread(&sc2, sizeof(float), 1, in); + + tmp = fread(&l_size, sizeof(int), 1, in); + tmp = fread(cigar2, sizeof(char), l_size, in); + cigar2[(int)l_size]='\0'; + + tmp = fread(&l_size, sizeof(int), 1, in); + tmp = fread(editString2, sizeof(char), l_size, in); + editString2[(int)l_size]='\0'; + + convertMDCigarToMatrix(cigar1, strlen(cigar1), editString1, strlen(editString1), tmp_matrix1); + convertMDCigarToMatrix(cigar2, strlen(cigar2), editString2, strlen(editString2), tmp_matrix2); + + + if(_msf_readHasConcordantMapping[rNo] == 0) + { + + dir1 = dir2 = 'F'; + + strncpy(seq1, _msf_seqList[rNo*2].seq, SEQ_LENGTH); + strncpy(seq2, _msf_seqList[rNo*2+1].seq, SEQ_LENGTH); + + if (loc1 < 0) + { + dir1 = 'R'; + loc1 = -loc1; + + strncpy(seq1, _msf_seqList[rNo*2].rseq, SEQ_LENGTH); + } + + if (loc2 < 0) + { + dir2 = 'R'; + loc2 = -loc2; + + strncpy(seq2, _msf_seqList[rNo*2+1].rseq, SEQ_LENGTH); + } + + convertInsertion(tmp_matrix1, seq1, matrix1); + convertInsertion(tmp_matrix2, seq2, matrix2); + + + if (rNo != lrNo) + { + int j; + for (j=0; j= maxPairEndedDiscordantDistance ) + { + event = 'D'; + } + else + { + event = 'I'; + } + } + else if (loc2 < loc1) + { + if (dir2 == 'R' && dir1 == 'F') + { + event = 'E'; + } + else if ( loc1 - loc2 >= maxPairEndedDiscordantDistance ) + { + event = 'D'; + } + else + { + event = 'I'; + } + } + } + _msf_seqList[rNo*2].hits[0] = 2; + if(event != 'E') + fprintf(out, "%s\t%s\t%d\t%d\t%c\t%d\t%d\t%c\t%c\t%d\t%0.0f\t%e\n", + _msf_seqList[rNo*2].name, genName, loc1, (loc1+SEQ_LENGTH-1), dir1, + loc2, (loc2+SEQ_LENGTH-1), dir2, event, (err1+err2), lsc, sc1*sc2); + + } + flag = fread(&rNo, sizeof(int), 1, in); + } + + fclose(in); + fclose(out); + + unlink(fname1); +} + +void finalizeOEAReads(char *fileName) +{ + FILE *fp_out1; + FILE * in; + + char genName[SEQ_LENGTH]; + + char fname1[FILE_NAME_LENGTH]; + char fname2[FILE_NAME_LENGTH]; + + char l=0; + int loc1=0; + + int err1; + + char d; + + float sc1=0; + int flag = 0; + int rNo=-1; + int tmp=0; + + int cigarSize = 0; + int mdSize = 0; + + char cigar[SEQ_LENGTH+1]; + char md[SEQ_LENGTH+1]; + + char *seq1, *seq2, *qual1, *qual2; + char *rqual1, *rqual2; + + seq1=NULL; seq2=NULL; qual1=NULL; qual2=NULL; + + rqual1 = getMem(200*sizeof(char)); + rqual2 = getMem(200*sizeof(char)); + + rqual1[0] = '\0'; + rqual2[0] = '\0'; + + sprintf(fname1, "%s%s_OEA", mappingOutputPath, mappingOutput); + + fp_out1 = fileOpen(fname1, "w"); + + in = NULL; + if (pairedEndDiscordantMode){ + sprintf(fname2, "%s__%s__oea", mappingOutputPath, mappingOutput); + + in = fileOpen(fname2, "r"); + } + + + if (in != NULL) + { + flag = fread(&rNo, sizeof(int), 1, in); + } + else + { + flag = 0; + } + + while (flag) + { + cigar[0] = '\0'; + md[0] = '\0'; + + tmp = fread(&l, sizeof(char), 1, in); + tmp = fread(genName, sizeof(char), l, in); + + genName[(int)l]='\0'; + + + tmp = fread(&loc1, sizeof(int), 1, in); + tmp = fread(&err1, sizeof(int), 1, in); + tmp = fread(&sc1, sizeof(float), 1, in); + + tmp = fread (&cigarSize, sizeof(int), 1, in); + tmp = fread (cigar, sizeof(char), cigarSize, in); + + cigar[cigarSize] = '\0'; + + tmp = fread (&mdSize, sizeof(int), 1, in); + tmp = fread (md, sizeof(char), mdSize, in); + md[mdSize] = '\0'; + + d = 1; + + if(loc1 < 0) + { + d = -1; + loc1 *= -1; + + seq1 = _msf_seqList[rNo].rseq; + reverse(_msf_seqList[rNo].qual, rqual1, SEQ_LENGTH); + rqual1[SEQ_LENGTH] = '\0'; + } + else + { + seq1 = _msf_seqList[rNo].seq; + qual1 = _msf_seqList[rNo].qual; + } + + if(rNo % 2 == 0) + { + seq2 = _msf_seqList[rNo+1].seq; + qual2 = _msf_seqList[rNo+1].qual; + } + else + { + seq2 = _msf_seqList[rNo-1].seq; + qual2 = _msf_seqList[rNo-1].qual; + } + + if(_msf_seqHits[rNo] != 0 && _msf_seqHits[(rNo%2==0)?rNo+1:rNo-1] == 0) + { + _msf_output.POS = loc1; + _msf_output.MPOS = 0; + _msf_output.FLAG = (rNo % 2 ==0)? 1+4+32*d+128 : 1+8+16*d+64 ; + _msf_output.ISIZE = 0; + _msf_output.SEQ = seq1; + _msf_output.QUAL = qual1; + _msf_output.QNAME = _msf_seqList[rNo].name; + _msf_output.RNAME = genName; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = cigar; + _msf_output.MRNAME = "="; + + + _msf_output.optSize = 4; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = err1; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = md; + + + + //for the OEA reads + _msf_optionalFields[2].tag = "NS"; + _msf_optionalFields[2].type = 'Z'; + _msf_optionalFields[2].sVal = seq2; + + + _msf_optionalFields[3].tag = "NQ"; + _msf_optionalFields[3].type = 'Z'; + _msf_optionalFields[3].sVal = qual2; + + outputSAM(fp_out1, _msf_output); + + _msf_seqList[rNo].hits[0] = -1; + _msf_seqList[(rNo%2==0)?rNo+1:rNo-1].hits[0] = -1; + } + flag = fread(&rNo, sizeof(int), 1, in); + } + + freeMem(rqual1, 0); + freeMem(rqual2, 0); + + unlink(fname2); + + fclose(fp_out1); +} + +/* + +void outputOEA(char *fileName1, FILE * fp_out, int readSegment) +{ + int i = 0; + int j = 0; + + char *index; + + int size1 = 0; + + FILE *fp1; + + char geneFileName1[FILE_NAME_LENGTH]; + + char matrix[200]; + char cigar[MAX_CIGAR_SIZE]; + char editString[200]; + + FullMappingInfoLink *miL = getMem(_msf_seqListSize * sizeof(FullMappingInfoLink)); + + if(fileName1 != NULL) + { + + fp1 = fileOpen(fileName1, "r"); + + index = strstr(fileName1, "__"); + strncpy(geneFileName1, index + 2 * sizeof(char), strstr(index + 2, "__") - index - 2); + geneFileName1[strstr(index + 2, "__") - index - 2] = '\0'; + + for(i = 0; i < _msf_seqListSize / 2; i++) + { + fread(&size1, sizeof(int), 1, fp1); + + miL[i].mi = getMem(size1 * sizeof(FullMappingInfo) ); + + miL[i].size = size1; + + for(j = 0; j < size1; j++) + { + fread(&(miL[i].mi[j].loc), sizeof(int), 1, fp1); + + fread (&(miL[i].mi[j].err), sizeof(int), 1, fp1); + + fread (&(miL[i].mi[j].cigarSize), sizeof(int), 1, fp1); + fread ((miL[i].mi[j].cigar), sizeof(char), miL[i].mi[j].cigarSize+1, fp1); + + fread (&(miL[i].mi[j].mdSize), sizeof(int), 1, fp1); + fread ((miL[i].mi[j].md), sizeof(char), miL[i].mi[j].mdSize+1, fp1); + + miL[i].mi[j].dir = 1; + if(miL[i].mi[j].loc < 1) + { + miL[i].mi[j].loc *= -1; + miL[i].mi[j].dir = -1; + } + } + + int tmpSize = (readSegment==0) ? _msf_seqHits[i*2+1] : _msf_seqHits[i*2]; + + if(_msf_seqHits[i*2+readSegment] == 0 && size1 != 0 && _msf_oeaMapping[i*2+(readSegment == 0 ? 1: 0)] <= maxOEAOutput) + { + int d1 = 0; + + char *seq, *qual; + char *seq1, *seq2, *rseq1, *rseq2, *qual1, *qual2; + char rqual1[SEQ_LENGTH+1], rqual2[SEQ_LENGTH+1]; + + rqual1[SEQ_LENGTH] = rqual2[SEQ_LENGTH] = '\0'; + seq1 = _msf_seqList[i*2].seq; + rseq1 = _msf_seqList[i*2].rseq; + qual1 = _msf_seqList[i*2].qual; + reverse(_msf_seqList[i*2].qual, rqual1, SEQ_LENGTH); + + seq2 = _msf_seqList[i*2+1].seq; + rseq2 = _msf_seqList[i*2+1].rseq; + qual2 = _msf_seqList[i*2+1].qual; + reverse(_msf_seqList[i*2+1].qual, rqual2, SEQ_LENGTH); + + for(j = 0; j < size1 && _msf_oeaMapping[i*2+(readSegment == 0 ? 1: 0)] <= maxOEAOutput; j++) + { + d1 = (miL[i].mi[j].dir == -1)?1:0; + + if(readSegment == 0) + { + if ( d1 ) + { + seq = rseq2; + qual = rqual2; + } + else + { + seq = seq2; + qual = qual2; + } + } + else + { + if ( d1 ) + { + seq = rseq1; + qual = rqual1; + } + else + { + seq = seq1; + qual = qual1; + } + } + + _msf_oeaMapping[i*2+(readSegment == 0 ? 1: 0)]++; + + _msf_output.POS = (readSegment==1)?miL[i].mi[j].loc:0; + _msf_output.MPOS = (readSegment==0)?miL[i].mi[j].loc:0; + _msf_output.FLAG = (readSegment==0)? 1+4+32*d1+128 : 1+8+16*d1+64 ; + _msf_output.ISIZE = 0; + _msf_output.SEQ = seq, + _msf_output.QUAL = qual; + _msf_output.QNAME = _msf_seqList[i*2+(readSegment==0?1:0)].name; + _msf_output.RNAME = geneFileName1; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = miL[i].mi[j].cigar; + _msf_output.MRNAME = "="; + //_msf_output.NSEQ = (readSegment == 0)?seq1:seq2; + //_msf_output.NQUAL = (readSegment == 0)?qual1:qual2; + + + _msf_output.optSize = 4; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = miL[i].mi[j].err; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = miL[i].mi[j].md; + + + + //for the OEA reads + _msf_optionalFields[2].tag = "NS"; + _msf_optionalFields[2].type = 'Z'; + _msf_optionalFields[2].sVal = (readSegment == 0)?seq1:seq2; + + + _msf_optionalFields[3].tag = "NQ"; + _msf_optionalFields[3].type = 'Z'; + _msf_optionalFields[3].sVal = (readSegment == 0)?qual1:qual2; + + outputSAM(fp_out, _msf_output); + + } + } + } + + } + + for(i = 0; i < _msf_seqListSize / 2; i++) + { + freeMem(miL[i].mi, miL[i].size * sizeof(FullMappingInfo)); + } + + freeMem(miL, _msf_seqListSize * sizeof(FullMappingInfoLink)); + + fclose(fp1); +} + +void finalizeOEAReads(char *fileName) +{ + + int i = 0; + int k = 0; + + FILE *fp_out1; + char fname1[200]; + + _msf_oeaMapping = getMem(_msf_seqListSize * sizeof(int)); + for(i = 0; i < _msf_seqListSize; i++) + { + _msf_oeaMapping[i] = 0; + } + + sprintf(fname1, "%s%s_OEA", mappingOutputPath, mappingOutput); + + fp_out1 = fileOpen(fname1, "w"); + for(i = 0; i < _msf_maxFile; i++) + { + for(k = 0; k < _msf_fileCount[i]; k++) + { + outputOEA(_msf_fileName[i][k][1], fp_out1, 0); + }// for k + } //for i + + + for(i = 0; i < _msf_maxFile; i++) + { + for(k = 0; k < _msf_fileCount[i]; k++) + { + outputOEA(_msf_fileName[i][k][0], fp_out1, 1); + }// for k + } //for i + + fclose(fp_out1); + +} +*/ + +void outputTransChromosal(char *fileName1, char *fileName2, FILE * fp_out) +{ + int i = 0; + int j = 0; + int k = 0; + + char *index; + + int size1 = 0; + int size2 = 0; + + FILE *fp1 = NULL; + FILE *fp2 = NULL; + + char geneFileName1[FILE_NAME_LENGTH]; + char geneFileName2[FILE_NAME_LENGTH]; + + FullMappingInfoLink *miL = getMem(_msf_seqListSize * sizeof(FullMappingInfoLink)); + FullMappingInfoLink *miR = getMem(_msf_seqListSize * sizeof(FullMappingInfoLink)); + + + if(fileName1 != NULL && fileName2 != NULL) + { + + fp1 = fileOpen(fileName1, "r"); + fp2 = fileOpen(fileName2, "r"); + + index = strstr(fileName1, "__"); + strncpy(geneFileName1, index + 2 * sizeof(char), strstr(index + 2, "__") - index - 2); + geneFileName1[strstr(index + 2, "__") - index - 2] = '\0'; + + index = strstr(fileName2, "__"); + strncpy(geneFileName2, index + 2 * sizeof(char), strstr(index + 2, "__") - index - 2); + geneFileName2[strstr(index + 2, "__") - index - 2] = '\0'; + + + for(i = 0; i < _msf_seqListSize / 2; i++) + { + fread(&size1, sizeof(int), 1, fp1); + fread(&size2, sizeof(int), 1, fp2); + + miL[i].mi = getMem(size1 * sizeof(FullMappingInfo) ); + miR[i].mi = getMem(size2 * sizeof(FullMappingInfo) ); + + miL[i].size = size1; + miR[i].size = size2; + + for(j = 0; j < size1; j++) + { + fread(&(miL[i].mi[j].loc), sizeof(int), 1, fp1); + + fread (&(miL[i].mi[j].err), sizeof(int), 1, fp1); + + fread (&(miL[i].mi[j].cigarSize), sizeof(int), 1, fp1); + fread ((miL[i].mi[j].cigar), sizeof(char), miL[i].mi[j].cigarSize+1, fp1); + + fread (&(miL[i].mi[j].mdSize), sizeof(int), 1, fp1); + fread ((miL[i].mi[j].md), sizeof(char), miL[i].mi[j].mdSize+1, fp1); + + miL[i].mi[j].dir = 1; + if(miL[i].mi[j].loc < 1) + { + miL[i].mi[j].loc *= -1; + miL[i].mi[j].dir = -1; + } + } + for(k = 0; k < size2; k++) + { + fread(&(miR[i].mi[k].loc), sizeof(int), 1, fp2); + + fread (&(miR[i].mi[k].err), sizeof(int), 1, fp2); + + fread (&(miR[i].mi[k].cigarSize), sizeof(int), 1, fp2); + fread ((miR[i].mi[k].cigar), sizeof(char), miR[i].mi[k].cigarSize+1, fp2); + + fread (&(miR[i].mi[k].mdSize), sizeof(int), 1, fp2); + fread ((miR[i].mi[k].md), sizeof(char), miR[i].mi[k].mdSize+1, fp2); + + miR[i].mi[k].dir = 1; + if(miR[i].mi[k].loc < 1) + { + miR[i].mi[k].loc *= -1; + miR[i].mi[k].dir = -1; + } + } + if(_msf_readHasConcordantMapping[i] == 0 && size1 != 0 && size2 != 0 && (size1 * size2 < MAX_TRANS_CHROMOSAL_OUTPUT)) + { + int d1 = 0; + int d2 = 0; + char *seq, *qual; + char *seq1, *seq2, *rseq1, *rseq2, *qual1, *qual2; + char rqual1[SEQ_LENGTH+1], rqual2[SEQ_LENGTH+1]; + rqual1[SEQ_LENGTH] = rqual2[SEQ_LENGTH] = '\0'; + seq1 = _msf_seqList[i*2].seq; + rseq1 = _msf_seqList[i*2].rseq; + qual1 = _msf_seqList[i*2].qual; + reverse(_msf_seqList[i*2].qual, rqual1, SEQ_LENGTH); + + seq2 = _msf_seqList[i*2+1].seq; + rseq2 = _msf_seqList[i*2+1].rseq; + qual2 = _msf_seqList[i*2+1].qual; + reverse(_msf_seqList[i*2+1].qual, rqual2, SEQ_LENGTH); + + for(j = 0; j < size1; j++) + { + d1 = (miL[i].mi[j].dir == -1)?1:0; + + if ( d1 ) + { + seq = rseq1; + qual = rqual1; + } + else + { + seq = seq1; + qual = qual1; + } + + for(k = 0; k < size2; k++) + { + + d2 = (miR[i].mi[k].dir == -1)?1:0; + + _msf_output.POS = miL[i].mi[j].loc; + _msf_output.MPOS = miR[i].mi[k].loc; + _msf_output.FLAG = 0; + _msf_output.ISIZE = 0; + _msf_output.SEQ = seq, + _msf_output.QUAL = qual; + _msf_output.QNAME = _msf_seqList[i*2].name; + _msf_output.RNAME = geneFileName1; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = miL[i].mi[j].cigar; + _msf_output.MRNAME = "="; + + _msf_output.optSize = 2; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = miL[i].mi[j].err; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = miL[i].mi[j].md; + + + if ( d2 ) + { + seq = rseq2; + qual = rqual2; + } + else + { + seq = seq2; + qual = qual2; + } + + outputSAM(fp_out, _msf_output); + + + _msf_output.POS = miR[i].mi[k].loc; + _msf_output.MPOS = miL[i].mi[j].loc; + _msf_output.FLAG = 0; + _msf_output.ISIZE = 0; + _msf_output.SEQ = seq, + _msf_output.QUAL = qual; + _msf_output.QNAME = _msf_seqList[i*2+1].name; + _msf_output.RNAME = geneFileName2; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = miR[i].mi[k].cigar; + _msf_output.MRNAME = "="; + + _msf_output.optSize = 2; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = miR[i].mi[k].err; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = miR[i].mi[k].md; + + outputSAM(fp_out, _msf_output); + + } + } + } + } + + } + + for(i = 0; i < _msf_seqListSize / 2; i++) + { + freeMem(miL[i].mi, miL[i].size * sizeof(FullMappingInfo)); + freeMem(miR[i].mi, miR[i].size * sizeof(FullMappingInfo)); + } + + freeMem(miL, _msf_seqListSize * sizeof(FullMappingInfoLink)); + freeMem(miR, _msf_seqListSize * sizeof(FullMappingInfoLink)); + + fclose(fp1); + fclose(fp2); +} + +/* + if flag is 1 it will output all the possible trans chromsal mapping + otherwise only tmp file will be delete + +*/ + +void outputAllTransChromosal(int flag) +{ + + int i = 0; + int j = 0; + int k = 0; + int l = 0; + + FILE *fp_out = NULL; + char fname1[200]; + + if(flag) + { + fp_out = fileOpen(fname1, "w"); + + sprintf(fname1, "%s%s_TRANSCHROMOSOMAL", mappingOutputPath, mappingOutput); + + // for(i = 0; i < _msf_maxFile; i++) + // { + i = 0; + for(j = i+1; j < _msf_maxFile; j++) + { + if(i != j) + { + for(k = 0; k < _msf_fileCount[i]; k++) + { + for(l = 0; l < _msf_fileCount[j]; l++) + { + outputTransChromosal(_msf_fileName[i][k][0], _msf_fileName[j][l][1], fp_out); + }// for l + }// for k + }// if + }// for j + // } //for i + } + + for(i = 0; i < _msf_maxFile; i++) + { + for(j = 0; j < _msf_fileCount[i]; j++) + { + unlink(_msf_fileName[i][j][0]); + unlink(_msf_fileName[i][j][1]); + } + } + if(flag) + fclose(fp_out); +} diff -r 000000000000 -r 7b3dc85dc7fd mrfast-2.1.0.4/MrFAST.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.4/MrFAST.h Tue Feb 21 10:29:47 2012 -0500 @@ -0,0 +1,209 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu + */ + + + +#ifndef __MR_FAST__ +#define __MR_FAST__ + +#include "Reads.h" + +#define MAP_CHUNKS 15 +#define MAX_CIGAR_SIZE 100 + + +// Pair is used to pre-processing and making the read index table +typedef struct +{ + int hv; + //char hv[50]; + int readNumber; +} Pair; + +typedef struct +{ + int hv; + unsigned int *seqInfo; +} ReadIndexTable; + + +typedef struct +{ + int loc; + char dir; + int err; + float score; + char md[MAX_CIGAR_SIZE]; + char chr[20]; + char cigar[MAX_CIGAR_SIZE]; + int cigarSize; + int mdSize; +} FullMappingInfo; + +typedef struct +{ + int loc; + char dir; + int err; + float score; + char md[MAX_CIGAR_SIZE]; + char chr[20]; + char cigar[MAX_CIGAR_SIZE]; + int cigarSize; + int mdSize; +} BestFullMappingInfo; + +typedef struct lc +{ + char md[MAP_CHUNKS][MAX_CIGAR_SIZE]; + int mdSize[MAP_CHUNKS]; + + char cigar[MAP_CHUNKS][MAX_CIGAR_SIZE]; + int cigarSize[MAP_CHUNKS]; + + int err[MAP_CHUNKS]; + int loc[MAP_CHUNKS]; + struct lc *next; +} MappingLocations; + +typedef struct inf +{ + int size; + MappingLocations *next; +} MappingInfo; + +typedef struct +{ + FILE * fp; + char name[400]; +} FILE_STRUCT; + +typedef struct +{ + FullMappingInfo *mi; + int size; +} FullMappingInfoLink; + + +typedef struct +{ + char readString[200]; + char ref[200]; + int err; + char matrix[200]; +} extraCaching; + +extern long long verificationCnt; +extern long long mappingCnt; +extern long long mappedSeqCnt; +extern long long completedSeqCnt; + +void initFAST( Read *seqList, + int seqListSize, + int *samplingLocs, + int samplingLocsSize, + char *fileName); + +void initVerifiedLocs(); +void initLookUpTable(); +void initBestMapping(); +void initBestConcordantDiscordant(int readNumber); + +void finalizeFAST(); +void finalizeBestSingleMapping(); +void finalizeBestConcordantDiscordant(); +void finalizeOEAReads(char *); + + +int mapAllSingleEndSeq(); +void mapSingleEndSeq(unsigned int *l1, int s1, int readNumber, int readSegment, int direction); +void mapPairedEndSeqList(unsigned int *l1, int s1, int readNumber, int readSegment, int direction); + +void mapPairedEndSeq(); + +void outputPairedEnd(); +void outputPairedEndDiscPP(); + + +void outputPairFullMappingInfo(FILE *fp, int readNumber); +void setPairFullMappingInfo(int readNumber, FullMappingInfo mi1, FullMappingInfo mi2); +void setFullMappingInfo(int readNumber, int loc, int dir, int err, int score, char *md, char * refName, char *cigar); + +void outputAllTransChromosal(); +void outputTransChromosal(char *fileName1, char *fileName2, FILE * fp_out); + +void generateSNPSAM(char *matrix, int matrixLength, char *outputSNP); +void generateCigar(char *matrix, int matrixLength, char *cigar); +void generateCigarFromMD(char *mistmatch, int mismatchLength, char *cigar); + +int msfHashVal(char *seq); + +int backwardEditDistance2SSE2(char *a, int lena, char *b,int lenb); +int forwardEditDistance2SSE2(char *a, int lena, char *b,int lenb); + +int forwardEditDistanceSSE2G(char *a, int lena, char *b,int lenb); +int backwardEditDistanceSSE2G(char *a, int lena, char *b,int lenb); + +int forwardEditDistance4SSE2(char *a, int lena, char *b,int lenb); +int backwardEditDistance4SSE2(char *a, int lena, char *b,int lenb); + +int forwardEditDistanceSSE2Extention(char *a, int lena, char *b,int lenb); +int backwardEditDistanceSSE2Extention(char *a, int lena, char *b,int lenb); + + +/***********************************/ + +int editDistance(int refIndex, char *seq, int seqLength, char *matrix); + +int verifySingleEndEditDistance(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, + char *matrix, int *map_location, short *seqHashValue); + +int verifySingleEndEditDistance2(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, + char *matrix, int *map_location, short *seqHashValue); + +int verifySingleEndEditDistance4(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, + char *matrix, int *map_location, short *seqHashValue); + +int verifySingleEndEditDistanceExtention(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, + char *matrix, int *map_location, short *seqHashValue); + +#endif diff -r 000000000000 -r 7b3dc85dc7fd mrfast-2.1.0.4/Output.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.4/Output.c Tue Feb 21 10:29:47 2012 -0500 @@ -0,0 +1,231 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu +*/ + + +#include +#include +#include +#include +#include "Common.h" +#include "Output.h" + +FILE *_out_fp; +gzFile _out_gzfp; + +char buffer[300000]; +int bufferSize = 0; + + +void finalizeGZOutput() +{ + gzclose(_out_gzfp); +} + +void finalizeTXOutput() +{ + fclose(_out_fp); +} + + +void gzOutputQ(SAM map) +{ + gzprintf(_out_gzfp, "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", + map.QNAME, + map.FLAG, + map.RNAME, + map.POS, + map.MAPQ, + map.CIGAR, + map.MRNAME, + map.MPOS, + map.ISIZE, + map.SEQ, + map.QUAL); + + int i; + + for ( i = 0; i < map.optSize; i++) + { + switch (map.optFields[i].type) + { + case 'A': + gzprintf(_out_gzfp, "\t%s:%c:%c", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].cVal); + break; + case 'i': + gzprintf(_out_gzfp, "\t%s:%c:%d", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].iVal); + break; + case 'f': + gzprintf(_out_gzfp, "\t%s:%c:%f", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].fVal); + break; + case 'Z': + case 'H': + gzprintf(_out_gzfp, "\t%s:%c:%s", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].sVal); + break; + } + } + gzprintf(_out_gzfp, "\n"); +} + +void outputSAM(FILE *fp, SAM map) +{ + fprintf(fp, "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", + map.QNAME, + map.FLAG, + map.RNAME, + map.POS, + map.MAPQ, + map.CIGAR, + map.MRNAME, + map.MPOS, + map.ISIZE, + map.SEQ, + map.QUAL); + + + int i; + + for ( i = 0; i < map.optSize; i++) + { + switch (map.optFields[i].type) + { + case 'A': + fprintf(fp, "\t%s:%c:%c", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].cVal); + break; + case 'i': + fprintf(fp, "\t%s:%c:%d", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].iVal); + break; + case 'f': + fprintf(fp, "\t%s:%c:%f", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].fVal); + break; + case 'Z': + case 'H': + fprintf(fp, "\t%s:%c:%s", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].sVal); + break; + } + } + + fprintf(fp, "\n"); + +} + +void outputQ(SAM map) +{ + + fprintf(_out_fp, "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", + map.QNAME, + map.FLAG, + map.RNAME, + map.POS, + map.MAPQ, + map.CIGAR, + map.MRNAME, + map.MPOS, + map.ISIZE, + map.SEQ, + map.QUAL); + + + int i; + + for ( i = 0; i < map.optSize; i++) + { + switch (map.optFields[i].type) + { + case 'A': + fprintf(_out_fp, "\t%s:%c:%c", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].cVal); + break; + case 'i': + fprintf(_out_fp, "\t%s:%c:%d", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].iVal); + break; + case 'f': + fprintf(_out_fp, "\t%s:%c:%f", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].fVal); + break; + case 'Z': + case 'H': + fprintf(_out_fp, "\t%s:%c:%s", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].sVal); + break; + } + } + + fprintf(_out_fp, "\n"); +} + +int initOutput ( char *fileName, int compressed) +{ + if (compressed) + { + char newFileName[strlen(fileName)+4]; + sprintf(newFileName, "%s.gz", fileName); + + _out_gzfp = fileOpenGZ(newFileName, "w1f"); + if (_out_gzfp == Z_NULL) + { + return 0; + } + + finalizeOutput = &finalizeGZOutput; + + output = &gzOutputQ; + } + else + { + _out_fp = fileOpen(fileName, "w"); + if (_out_fp == NULL) + { + return 0; + } + + finalizeOutput = &finalizeTXOutput; + output = &outputQ; + } + buffer[0] = '\0'; + return 1; +} + +FILE* getOutputFILE() +{ + if(_out_fp != NULL) + return _out_fp; + else if(_out_gzfp != NULL) + return _out_gzfp; + else + return NULL; +} diff -r 000000000000 -r 7b3dc85dc7fd mrfast-2.1.0.4/Output.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.4/Output.h Tue Feb 21 10:29:47 2012 -0500 @@ -0,0 +1,85 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu + */ + + + +#ifndef __OUTPUT__ +#define __OUTPUT__ + +#define FORWARD 0 +#define REVERSE 1 + +typedef struct +{ + char *tag; + char type; + char cVal; + int iVal; + float fVal; + char *sVal; +} OPT_FIELDS; + +typedef struct +{ + char *QNAME; + short FLAG; + char *RNAME; + int POS; + unsigned char MAPQ; + char *CIGAR; + char *MRNAME; + int MPOS; + int ISIZE; + char *SEQ; + char *QUAL; + + int optSize; + OPT_FIELDS *optFields; +} SAM; + +int initOutput(char *fileName, int compressed); +void (*finalizeOutput)(); +void (*output)(SAM map); +void outputSAM(FILE *fp, SAM map); + +FILE * getOutputFILE(); + +#endif diff -r 000000000000 -r 7b3dc85dc7fd mrfast-2.1.0.4/Reads.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.4/Reads.c Tue Feb 21 10:29:47 2012 -0500 @@ -0,0 +1,773 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu + */ + + + +#include +#include +#include +#include +#include +#include "Common.h" +#include "Reads.h" + +#define CHARCODE(a) (a=='A' ? 0 : (a=='C' ? 1 : (a=='G' ? 2 : (a=='T' ? 3 : 4)))) + +FILE *_r_fp1; +FILE *_r_fp2; +gzFile _r_gzfp1; +gzFile _r_gzfp2; +Read *_r_seq; +int _r_seqCnt; +int *_r_samplingLocs; + +/**********************************************/ +char *(*readFirstSeq)(char *); +char *(*readSecondSeq)(char *); +/**********************************************/ +char *readFirstSeqTXT( char *seq ) +{ + return fgets(seq, SEQ_MAX_LENGTH, _r_fp1); +} + +/**********************************************/ +char *readSecondSeqTXT( char *seq ) +{ + return fgets(seq, SEQ_MAX_LENGTH, _r_fp2); +} +/**********************************************/ +char *readFirstSeqGZ( char *seq ) +{ + return gzgets(_r_gzfp1, seq, SEQ_MAX_LENGTH); +} + +/**********************************************/ +char *readSecondSeqGZ( char *seq ) +{ + return gzgets(_r_gzfp2, seq, SEQ_MAX_LENGTH); +} +/**********************************************/ +int toCompareRead(const void * elem1, const void * elem2) +{ + return strcmp(((Read *)elem1)->seq, ((Read *)elem2)->seq); +} +/**********************************************/ +int readAllReads(char *fileName1, + char *fileName2, + int compressed, + unsigned char *fastq, + unsigned char pairedEnd, + Read **seqList, + unsigned int *seqListSize) +{ + double startTime=getTime(); + + char seq1[SEQ_MAX_LENGTH]; + char rseq1[SEQ_MAX_LENGTH]; + char name1[SEQ_MAX_LENGTH]; + char qual1[SEQ_MAX_LENGTH]; + char seq2[SEQ_MAX_LENGTH]; + char rseq2[SEQ_MAX_LENGTH]; + char name2[SEQ_MAX_LENGTH]; + char qual2[SEQ_MAX_LENGTH]; + + char dummy[SEQ_MAX_LENGTH]; + char ch; + int err1, err2; + int nCnt; + int discarded = 0; + int seqCnt = 0; + int maxCnt = 0; + int i; + Read *list = NULL; + + int clipped = 0; + + if (!compressed) + { + _r_fp1 = fileOpen( fileName1, "r"); + + if (_r_fp1 == NULL) + { + return 0; + } + + ch = fgetc(_r_fp1); + + if ( pairedEnd && fileName2 != NULL ) + { + _r_fp2 = fileOpen ( fileName2, "r" ); + if (_r_fp2 == NULL) + { + return 0; + } + } + else + { + _r_fp2 = _r_fp1; + } + + readFirstSeq = &readFirstSeqTXT; + readSecondSeq = &readSecondSeqTXT; + } + else + { + + _r_gzfp1 = fileOpenGZ (fileName1, "r"); + + if (_r_gzfp1 == NULL) + { + return 0; + } + + ch = gzgetc(_r_gzfp1); + + if ( pairedEnd && fileName2 != NULL ) + { + _r_gzfp2 = fileOpenGZ ( fileName2, "r" ); + if (_r_gzfp2 == NULL) + { + return 0; + } + } + else + { + _r_gzfp2 = _r_gzfp1; + } + + readFirstSeq = &readFirstSeqGZ; + readSecondSeq = &readSecondSeqGZ; + } + + if (ch == '>') + *fastq = 0; + else + *fastq = 1; + + // Counting the number of lines in the file + while (readFirstSeq(dummy)) maxCnt++; + + if (!compressed) + { + rewind(_r_fp1); + } + else + { + gzrewind(_r_gzfp1); + } + + // Calculating the Maximum # of sequences + if (*fastq) + { + maxCnt /= 4; + } + else + { + maxCnt /= 2; + } + + if (pairedEnd && fileName2 != NULL ) + maxCnt *= 2; + + list = getMem(sizeof(Read)*maxCnt); + + while( readFirstSeq(name1) ) + { + err1 = 0; + err2 = 0; + readFirstSeq(seq1); + name1[strlen(name1)-1] = '\0'; + + if ( *fastq ) + { + readFirstSeq(dummy); + readFirstSeq(qual1); + qual1[strlen(qual1)-1] = '\0'; + } + else + { + sprintf(qual1, "*"); + } + + // Cropping + if (cropSize > 0) + { + seq1[cropSize] = '\0'; + if ( *fastq ) + qual1[cropSize] = '\0'; + } + + + nCnt = 0; + for (i=0; i errThreshold) + { + err1 = 1; + } + + // Reading the second seq of pair-ends + if (pairedEnd) + { + readSecondSeq(name2); + readSecondSeq(seq2); + name2[strlen(name2)-1] = '\0'; + + + if ( *fastq ) + { + readSecondSeq(dummy); + readSecondSeq(qual2); + + qual2[strlen(qual2)-1] = '\0'; + } + else + { + sprintf(qual2, "*"); + } + + + // Cropping + if (cropSize > 0) + { + seq2[cropSize] = '\0'; + if ( *fastq ) + qual2[cropSize] = '\0'; + } + + + nCnt = 0; + for (i=0; i errThreshold) + { + err2 = 1; + } + + + if (strlen(seq1) < strlen(seq2)) { + seq2[strlen(seq1)] = '\0'; + if ( *fastq ) + qual2[strlen(seq1)] = '\0'; + if (!clipped) clipped = 2; + } + else if (strlen(seq1) > strlen(seq2)){ + seq1[strlen(seq2)] = '\0'; + if ( *fastq ) + qual1[strlen(seq2)] = '\0'; + if (!clipped) clipped = 1; + } + + if (clipped == 1 || clipped == 2){ + fprintf(stdout, "[PE mode warning] Sequence lengths are different, read #%d is clipped to match.\n", clipped); + clipped = 3; + } + + + + } + + if (!pairedEnd && !err1) + { + int _mtmp = strlen(seq1); + list[seqCnt].hits = getMem (1+3*_mtmp+3+strlen(name1)+1); + list[seqCnt].seq = list[seqCnt].hits + 1; + list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; + list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; + list[seqCnt].name = list[seqCnt].qual + _mtmp+1; + + list[seqCnt].hashValue = getMem(sizeof(short)*_mtmp); + list[seqCnt].rhashValue = getMem(sizeof(short)*_mtmp); + + list[seqCnt].readNumber = seqCnt; + + reverseComplete(seq1, rseq1, _mtmp); + rseq1[_mtmp] = '\0'; + int i; + + list[seqCnt].hits[0] = 0; + + for (i=0; i<=_mtmp; i++) + { + list[seqCnt].seq[i] = seq1[i]; + list[seqCnt].rseq[i] = rseq1[i] ; + list[seqCnt].qual[i] = qual1[i]; + } + + + //MAKE HASH VALUE + short code = 0; + + for(i=0; i < 4; i++) + code = code * 5 + CHARCODE(list[seqCnt].seq[i]); + list[seqCnt].hashValue[0] = code; + + + for(i = 1; i < _mtmp-3; i++) + { + list[seqCnt].hashValue[i] = (list[seqCnt].hashValue[i-1] - 125 * CHARCODE(seq1[i-1])) * 5 + CHARCODE(seq1[i+3]); + } + + + code = 0; + for(i=0; i < 4; i++) + code = code * 5 + CHARCODE(list[seqCnt].rseq[i]); + list[seqCnt].rhashValue[0] = code; + + + for(i = 1; i < _mtmp-3; i++) + { + list[seqCnt].rhashValue[i] = (list[seqCnt].rhashValue[i-1] - 125 * CHARCODE(rseq1[i-1])) * 5 + CHARCODE(rseq1[i+3]); + } + + int j = 0; + int tmpSize = _mtmp / (errThreshold+1); + + list[seqCnt].hashValSampleSize = getMem(sizeof(int)*(errThreshold+1)); + for(i=0; i < errThreshold+1; i++) + { + code = 0; + for(j = 0; j < tmpSize; j++) + { + code = code * 5 + CHARCODE(list[seqCnt].seq[i*tmpSize+j]); + } + list[seqCnt].hashValSampleSize[i] = code; + } + + sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0'); + + seqCnt++; + + } + else if (pairedEnd && !err1 && !err2) + { + + // Naming Conventions X/1, X/2 OR X + int tmplen = strlen(name1); + if (strcmp(name1, name2) != 0) + { + tmplen = strlen(name1)-2; + } + + //first seq + int _mtmp = strlen(seq1); + list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1); + list[seqCnt].seq = list[seqCnt].hits + 1; + list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; + list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; + list[seqCnt].name = list[seqCnt].qual + _mtmp+1; + + list[seqCnt].hashValue = getMem(sizeof(short)*_mtmp); + list[seqCnt].rhashValue = getMem(sizeof(short)*_mtmp); + list[seqCnt].readNumber = seqCnt; + + reverseComplete(seq1, rseq1, _mtmp); + rseq1[_mtmp] = '\0'; + int i; + + list[seqCnt].hits[0] = 0; + + for (i=0; i<=_mtmp; i++) + { + list[seqCnt].seq[i] = seq1[i]; + list[seqCnt].rseq[i] = rseq1[i] ; + list[seqCnt].qual[i] = qual1[i]; + } + + + name1[tmplen]='\0'; + + //MAKE HASH VALUE + short code = 0; + + for(i=0; i < 4; i++) + code = code * 5 + CHARCODE(list[seqCnt].seq[i]); + list[seqCnt].hashValue[0] = code; + + + for(i = 1; i < _mtmp-3; i++) + { + list[seqCnt].hashValue[i] = (list[seqCnt].hashValue[i-1] - 125 * CHARCODE(seq1[i-1])) * 5 + CHARCODE(seq1[i+3]); + } + + + code = 0; + for(i=0; i < 4; i++) + code = code * 5 + CHARCODE(list[seqCnt].rseq[i]); + list[seqCnt].rhashValue[0] = code; + + + for(i = 1; i < _mtmp-3; i++) + { + list[seqCnt].rhashValue[i] = (list[seqCnt].rhashValue[i-1] - 125 * CHARCODE(rseq1[i-1])) * 5 + CHARCODE(rseq1[i+3]); + } + + int j = 0; + int tmpSize = _mtmp / (errThreshold+1); + + list[seqCnt].hashValSampleSize = getMem(sizeof(int)*(errThreshold+1)); + for(i=0; i < errThreshold+1; i++) + { + code = 0; + for(j = 0; j < tmpSize; j++) + { + code = code * 5 + CHARCODE(list[seqCnt].seq[i*tmpSize+j]); + } + list[seqCnt].hashValSampleSize[i] = code; + } + + sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0'); + + + seqCnt++; + + //second seq + list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1); + list[seqCnt].seq = list[seqCnt].hits + 1; + list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; + list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; + list[seqCnt].name = list[seqCnt].qual + _mtmp+1; + + list[seqCnt].hashValue = getMem(sizeof(short)*_mtmp); + list[seqCnt].rhashValue = getMem(sizeof(short)*_mtmp); + list[seqCnt].readNumber = seqCnt; + + reverseComplete(seq2, rseq2, _mtmp); + rseq2[_mtmp] = '\0'; + + list[seqCnt].hits[0] = 0; + + for (i=0; i<=_mtmp; i++) + { + list[seqCnt].seq[i] = seq2[i]; + list[seqCnt].rseq[i] = rseq2[i] ; + list[seqCnt].qual[i] = qual2[i]; + } + + + name2[tmplen]='\0'; + + //MAKE HASH VALUE + code = 0; + + for(i=0; i < 4; i++) + code = code * 5 + CHARCODE(list[seqCnt].seq[i]); + list[seqCnt].hashValue[0] = code; + + + for(i = 1; i < _mtmp-3; i++) + { + list[seqCnt].hashValue[i] = (list[seqCnt].hashValue[i-1] - 125 * CHARCODE(seq1[i-1])) * 5 + CHARCODE(seq1[i+3]); + } + + + code = 0; + for(i=0; i < 4; i++) + code = code * 5 + CHARCODE(list[seqCnt].rseq[i]); + list[seqCnt].rhashValue[0] = code; + + + for(i = 1; i < _mtmp-3; i++) + { + list[seqCnt].rhashValue[i] = (list[seqCnt].rhashValue[i-1] - 125 * CHARCODE(rseq1[i-1])) * 5 + CHARCODE(rseq1[i+3]); + } + + j = 0; + tmpSize = _mtmp / (errThreshold+1); + + list[seqCnt].hashValSampleSize = getMem(sizeof(int)*(errThreshold+1)); + for(i=0; i < errThreshold+1; i++) + { + code = 0; + for(j = 0; j < tmpSize; j++) + { + code = code * 5 + CHARCODE(list[seqCnt].seq[i*tmpSize+j]); + } + list[seqCnt].hashValSampleSize[i] = code; + } + + sprintf(list[seqCnt].name,"%s%c", ((char*)name2)+1,'\0'); + + seqCnt++; + + } + else + { + discarded++; + } + } + + if (seqCnt > 0) + { + SEQ_LENGTH = strlen(list[0].seq); + } + else + { + fprintf(stdout, "ERROR: No reads can be found for mapping\n"); + return 0; + } + + + // Closing Files + if (!compressed) + { + fclose(_r_fp1); + if ( pairedEnd && fileName2 != NULL ) + { + fclose(_r_fp2); + } + } + else + { + gzclose(_r_gzfp1); + if ( pairedEnd && fileName2 != NULL) + { + gzclose(_r_fp2); + } + } + + //qsort(list, seqCnt, sizeof(Read), toCompareRead); + + adjustQual(list, seqCnt); + + *seqList = list; + *seqListSize = seqCnt; + + + _r_seq = list; + _r_seqCnt = seqCnt; + + if ( pairedEnd ) discarded *= 2; + + if (seqCnt>1) + fprintf(stdout, "%d sequences are read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime()-startTime), discarded, getMemUsage()); + else + fprintf(stdout, "%d sequence is read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime()-startTime), discarded, getMemUsage()); + + + + return 1; +} +/**********************************************/ +void loadSamplingLocations(int **samplingLocs, int * samplingLocsSize) +{ + int i; + int samLocsSize = errThreshold + 1; + int *samLocs = getMem(sizeof(int)*samLocsSize); + + for (i=0; i SEQ_LENGTH) + samLocs[i] = SEQ_LENGTH - WINDOW_SIZE; + } + + // Outputing the sampling locations + + /* + + int j; + for (i=0; i%s/1\n%s\n>%s/2\n%shits=%d\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq, _r_seq[2*i+1].hits[0]); + } + else if (!pairedEndMode && _r_seq[i].hits[0] == 0 && strcmp(_r_seq[i].qual, "*")!=0) + { + fprintf(fp1,"@%s\n%s\n+\n%s\n", _r_seq[i].name, _r_seq[i].seq, _r_seq[i].qual); + } + else if (!pairedEndMode && _r_seq[i].hits[0] == 0) + { + fprintf(fp1,">%s\n%s\n", _r_seq[i].name, _r_seq[i].seq); + } + } + + fclose(fp1); + if (pairedEndMode) + _r_seqCnt *= 2; + + for (i = 0; i < _r_seqCnt; i++) + { + freeMem(_r_seq[i].hits,0); + } + + + freeMem(_r_seq,0); + freeMem(_r_samplingLocs,0); +} + +void adjustQual(Read *list, int seqCnt){ + /* This function will automatically determine the phred_offset and readjust quality values if needed */ +} + + + +/*void finalizeOEAReads(char *fileName) +{ + FILE *fp1=NULL; + FILE *fp2=NULL; + + //printf("OEA%s\n", fileName); + char fileNameOEA1[200]; + char fileNameOEA2[200]; + sprintf(fileNameOEA1, "%s_OEA1", fileName); + sprintf(fileNameOEA2, "%s_OEA2", fileName); + + + if (fileName != NULL) + { + fp1 = fileOpen(fileNameOEA1, "w"); + fp2 = fileOpen(fileNameOEA2, "w"); + } + if (pairedEndMode) + _r_seqCnt /=2; + + int i=0; + printf("%d\n", _r_seqCnt); + for (i = 0; i < _r_seqCnt; i++) + { + if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] != 0 && strcmp(_r_seq[2*i].qual, "*")==0) + { + fprintf(fp1,">%s/1\n%s\n>%s/2\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq); + } + else if (pairedEndMode && _r_seq[2*i].hits[0] != 0 && _r_seq[2*i+1].hits[0] == 0 && strcmp(_r_seq[2*i].qual, "*")==0) + { + fprintf(fp2, ">%s/1\n%s\n>%s/2\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq); + } + else if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] != 0 && strcmp(_r_seq[2*i].qual, "*")!=0) + { + fprintf(fp1,"@%s\n%s\n+\n%s\n@%s\n%s\n+\n%s\n", _r_seq[2*i].name, + _r_seq[2*i].seq, + _r_seq[2*i].qual, + _r_seq[2*i+1].name, + _r_seq[2*i+1].seq, + _r_seq[2*i+1].qual); + } + else if ( pairedEndMode && _r_seq[2*i].hits[0] != 0 && _r_seq[2*i+1].hits[0] == 0 && strcmp(_r_seq[2*i].qual, "*")!=0 ) + { + fprintf(fp2,"@%s\n%s\n+\n%s\n@%s\n%s\n+\n%s\n", _r_seq[2*i].name, + _r_seq[2*i].seq, + _r_seq[2*i].qual, + _r_seq[2*i+1].name, + _r_seq[2*i+1].seq, + _r_seq[2*i+1].qual); + } + } + + fclose(fp1); + fclose(fp2); + +}*/ diff -r 000000000000 -r 7b3dc85dc7fd mrfast-2.1.0.4/Reads.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.4/Reads.h Tue Feb 21 10:29:47 2012 -0500 @@ -0,0 +1,65 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu + */ + + +#ifndef __READ__ +#define __READ__ + +typedef struct +{ + char *name; + char *seq; + char *rseq; + char *qual; + char *hits; + int readNumber; + short *hashValue; + short *rhashValue; + int *hashValSampleSize; +} Read; + +int readAllReads(char *fileName1, char *fileName2, int compressed, unsigned char *fastq, unsigned char pe, Read **seqList, unsigned int *seqListSize); +void loadSamplingLocations(int **samplingLocs, int *samplingLocsSize); +void finalizeReads(char *fileName); +void adjustQual(Read *list, int seqCnt); + +#endif diff -r 000000000000 -r 7b3dc85dc7fd mrfast-2.1.0.4/RefGenome.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.4/RefGenome.c Tue Feb 21 10:29:47 2012 -0500 @@ -0,0 +1,157 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu + */ + + +#include +#include +#include +#include +#include "Common.h" +#include "RefGenome.h" + +FILE *_rg_fp; +char *_rg_gen; +char *_rg_name; +int _rg_offset; +int _rg_contGen; + +/**********************************************/ +int initLoadingRefGenome(char *fileName) +{ + char ch; + _rg_fp = fileOpen (fileName, "r"); + if (fscanf(_rg_fp, "%c", &ch)) + { + if (ch == '>') + { + _rg_contGen = 0; + _rg_offset = 0; + _rg_gen = getMem(CONTIG_MAX_SIZE); + _rg_name = getMem(CONTIG_NAME_SIZE); + return 1; + } + } + return 0; +} +/**********************************************/ +void finalizeLoadingRefGenome() +{ + freeMem(_rg_gen, CONTIG_MAX_SIZE); + freeMem(_rg_name, CONTIG_NAME_SIZE); + fclose(_rg_fp); +} +/**********************************************/ +int loadRefGenome(char **refGen, char **refGenName, int *refGenOff) +{ + char ch; + int i; + int returnVal = 0; + int actualSize=0; + int size; + char *tmp; + + // New Conting + if (!_rg_contGen) + { + size = 0; + tmp = fgets(_rg_name, SEQ_MAX_LENGTH, _rg_fp); + } + else + { + size=strlen(_rg_gen); + for( i = 0 ; i < CONTIG_OVERLAP ; i++ ) + { + + _rg_gen[i] = _rg_gen[size-CONTIG_OVERLAP+i]; + if (_rg_gen[i] != 'N') + actualSize++; + } + size = CONTIG_OVERLAP; + } + while( fscanf(_rg_fp, "%c", &ch) > 0 ) + { + if (ch == '>') + { + _rg_contGen = 0; + returnVal = 1; + break; + } + else if (!isspace(ch)) + { + ch = toupper(ch); + _rg_gen[size++] = ch; + if (ch != 'N') + { + actualSize++; + } + if (actualSize == CONTIG_SIZE || size == CONTIG_MAX_SIZE) + { + _rg_contGen = 1; + returnVal=1; + break; + } + } + + } + + _rg_gen[size] = '\0'; + for (i=strlen(_rg_name)-1; i >= 0; i--) + if (!isspace(_rg_name[i])) + break; + _rg_name[i+1] = '\0'; + + *refGenOff = _rg_offset; + *refGenName = _rg_name; + *refGen = _rg_gen; + + if (_rg_contGen == 1) + { + _rg_offset += size-CONTIG_OVERLAP; + } + else + { + _rg_offset = 0; + } + + + return returnVal; +} +/**********************************************/ diff -r 000000000000 -r 7b3dc85dc7fd mrfast-2.1.0.4/RefGenome.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.4/RefGenome.h Tue Feb 21 10:29:47 2012 -0500 @@ -0,0 +1,51 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu + */ + + + +#ifndef _REF_GENOME_ +#define _REF_GENOME_ + +int initLoadingRefGenome(char *fileName); +void finalizeLoadingRefGenome(); +int loadRefGenome(char **refGen, char **refGenName, int *refGenOff); + +#endif diff -r 000000000000 -r 7b3dc85dc7fd mrfast-2.1.0.4/baseFAST.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.4/baseFAST.c Tue Feb 21 10:29:47 2012 -0500 @@ -0,0 +1,376 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu +*/ + + + + + +#include +#include +#include +#include +#include "Common.h" +#include "CommandLineParser.h" +#include "Reads.h" +#include "Output.h" +#include "HashTable.h" +#include "MrFAST.h" + +char *versionNumber = "2.1"; // Current Version +unsigned char seqFastq; + +int main(int argc, char *argv[]) +{ + if (!parseCommandLine(argc, argv)) + return 1; + + configHashTable(); + /**************************************************** + * INDEXING + ***************************************************/ + if (indexingMode) + { + int i; + /******************************** + * Regular Mode + ********************************/ + configHashTable(); + for (i = 0; i < fileCnt; i++) + { + generateHashTable(fileName[i][0], fileName[i][1]); + } + } + /**************************************************** + * SEARCHING + ***************************************************/ + else + { + Read *seqList; + unsigned int seqListSize; + int fc; + int samplingLocsSize; + int *samplingLocs; + double totalLoadingTime = 0; + double totalMappingTime = 0; + double startTime; + double loadingTime; + double mappingTime; + double lstartTime; + double ppTime = 0.0; + double tmpTime;; + char *prevGen = getMem(CONTIG_NAME_SIZE); + prevGen[0]='\0'; + char *curGen; + int flag; + double maxMem=0; + char fname1[FILE_NAME_LENGTH]; + char fname2[FILE_NAME_LENGTH]; + char fname3[FILE_NAME_LENGTH]; + char fname4[FILE_NAME_LENGTH]; + char fname5[FILE_NAME_LENGTH]; + + char outputFileName[FILE_NAME_LENGTH]; + // Loading Sequences & Sampling Locations + startTime = getTime(); + if (!readAllReads(seqFile1, seqFile2, seqCompressed, &seqFastq, pairedEndMode, &seqList, &seqListSize)) + { + return 1; + } + + loadSamplingLocations(&samplingLocs, &samplingLocsSize); + totalLoadingTime += getTime()-startTime; + + if (pairedEndMode) + { + //Switching to Inferred Size + minPairEndedDistance = minPairEndedDistance - SEQ_LENGTH + 2; + maxPairEndedDistance = maxPairEndedDistance - SEQ_LENGTH + 2; + if (pairedEndDiscordantMode) + { + maxPairEndedDiscordantDistance = maxPairEndedDiscordantDistance - SEQ_LENGTH + 2; + minPairEndedDiscordantDistance = minPairEndedDiscordantDistance - SEQ_LENGTH + 2; + } + + sprintf(fname1, "__%s__1", mappingOutput); + sprintf(fname2, "__%s__2", mappingOutput); + sprintf(fname3, "__%s__disc", mappingOutput); + sprintf(fname4, "__%s__oea1", mappingOutput); + sprintf(fname5, "__%s__oea2", mappingOutput); + unlink(fname1); + unlink(fname2); + unlink(fname3); + unlink(fname4); + unlink(fname5); + } + + sprintf(outputFileName, "%s%s",mappingOutputPath , mappingOutput); + // Preparing output + initOutput(outputFileName, outCompressed); + + fprintf(stdout, "-----------------------------------------------------------------------------------------------------------\n"); + fprintf(stdout, "| %15s | %15s | %15s | %15s | %15s %15s |\n","Genome Name","Loading Time", "Mapping Time", "Memory Usage(M)","Total Mappings","Mapped reads"); + fprintf(stdout, "-----------------------------------------------------------------------------------------------------------\n"); + + /******************************** + * Regular Mode + ********************************/ + if (!pairedEndMode) + { + initLookUpTable(); + if(bestMode) + initBestMapping(seqListSize); + + for (fc = 0; fc , University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the name of the nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu + calkan AT gmail DOT com + */ + +