Mercurial > repos > calkan > mrfast
changeset 1:d4054b05b015 default tip
Version update to 2.1.0.5
author | calkan |
---|---|
date | Fri, 09 Mar 2012 07:35:51 -0500 |
parents | 7b3dc85dc7fd |
children | |
files | mrfast-2.1.0.4/CommandLineParser.c mrfast-2.1.0.4/CommandLineParser.h mrfast-2.1.0.4/Common.c mrfast-2.1.0.4/Common.h mrfast-2.1.0.4/HashTable.c mrfast-2.1.0.4/HashTable.h mrfast-2.1.0.4/Makefile mrfast-2.1.0.4/MrFAST.c mrfast-2.1.0.4/MrFAST.h mrfast-2.1.0.4/Output.c mrfast-2.1.0.4/Output.h mrfast-2.1.0.4/Reads.c mrfast-2.1.0.4/Reads.h mrfast-2.1.0.4/RefGenome.c mrfast-2.1.0.4/RefGenome.h mrfast-2.1.0.4/baseFAST.c mrfast-2.1.0.4/header mrfast-2.1.0.5/CommandLineParser.c mrfast-2.1.0.5/CommandLineParser.h mrfast-2.1.0.5/Common.c mrfast-2.1.0.5/Common.h mrfast-2.1.0.5/HashTable.c mrfast-2.1.0.5/HashTable.h mrfast-2.1.0.5/Makefile mrfast-2.1.0.5/MrFAST.c mrfast-2.1.0.5/MrFAST.h mrfast-2.1.0.5/Output.c mrfast-2.1.0.5/Output.h mrfast-2.1.0.5/Reads.c mrfast-2.1.0.5/Reads.h mrfast-2.1.0.5/RefGenome.c mrfast-2.1.0.5/RefGenome.h mrfast-2.1.0.5/baseFAST.c |
diffstat | 33 files changed, 11233 insertions(+), 11429 deletions(-) [+] |
line wrap: on
line diff
--- a/mrfast-2.1.0.4/CommandLineParser.c Tue Feb 21 10:29:47 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,388 +0,0 @@ -/* - * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * Redistributions of source code must retain the above copyright notice, this list - * of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or other - * materials provided with the distribution. - * - Neither the names of the University of Washington, Simon Fraser University, - * nor the names of its contributors may be - * used to endorse or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - Authors: - Farhad Hormozdiari - Faraz Hach - Can Alkan - Emails: - farhadh AT uw DOT edu - fhach AT cs DOT sfu DOT ca - calkan AT uw DOT edu -*/ - - -#include <stdio.h> -#include <stdlib.h> -#include <getopt.h> -#include <string.h> -#include <ctype.h> -#include "Common.h" -#include "CommandLineParser.h" - -int uniqueMode=1; -int indexingMode; -int searchingMode; -int pairedEndMode; -int pairedEndDiscordantMode; -int transChromosal=0; -int pairedEndProfilingMode; -int seqCompressed; -int outCompressed; -int cropSize = 0; -int progressRep = 0; -int minPairEndedDistance=-1; -int maxPairEndedDistance=-1; -int minPairEndedDiscordantDistance=-1; -int maxPairEndedDiscordantDistance=-1; -int bestMode; -int nosamMode; -char *seqFile1; -char *seqFile2; -char *mappingOutput = "output"; -char *mappingOutputPath = ""; -char *unmappedOutput = "unmapped"; -char fileName[1000][2][FILE_NAME_LENGTH]; -int fileCnt; -int maxOEAOutput=1000; -int maxDiscordantOutput=10000; -unsigned char errThreshold=2; -unsigned char maxHits=0; -unsigned char WINDOW_SIZE = 12; -unsigned int CONTIG_SIZE; -unsigned int CONTIG_MAX_SIZE; - -void printHelp(); - -int parseCommandLine (int argc, char *argv[]) -{ - - int o; - int index; - char *fastaFile = NULL; - char *batchFile = NULL ; - int batchMode = 0; - - static struct option longOptions[] = - { - {"pe", no_argument, &pairedEndMode, 1}, - {"discordant-vh", no_argument, &pairedEndDiscordantMode, 1}, - {"trans", no_argument, &transChromosal , 1}, - {"profile", no_argument, &pairedEndProfilingMode, 1}, - {"seqcomp", no_argument, &seqCompressed, 1}, - {"outcomp", no_argument, &outCompressed, 1}, - {"progress", no_argument, &progressRep, 1}, - {"best", no_argument, &bestMode, 1}, - {"index", required_argument, 0, 'i'}, - {"search", required_argument, 0, 's'}, - {"help", no_argument, 0, 'h'}, - {"version", no_argument, 0, 'v'}, - {"seq", required_argument, 0, 'x'}, - {"seq1", required_argument, 0, 'x'}, - {"seq2", required_argument, 0, 'y'}, - {"ws", required_argument, 0, 'w'}, - {"min", required_argument, 0, 'l'}, - {"max", required_argument, 0, 'm'}, - {"crop", required_argument, 0, 'c'}, - {"maxoea", required_argument, 0, 'a'}, - {"maxdis", required_argument, 0, 'd'}, - {"nosam", no_argument, &nosamMode, 1}, - {0, 0, 0, 0}, - }; - - while ( (o = getopt_long ( argc, argv, "bhvn:e:o:u:i:s:x:y:w:l:m:c:a:d:", longOptions, &index)) != -1 ) - { - switch (o) - { - case 'a': - maxOEAOutput = atoi(optarg); - break; - case 'd': - maxDiscordantOutput = atoi(optarg); - break; - case 'i': - indexingMode = 1; - fastaFile = optarg; - break; - case 's': - searchingMode = 1; - fastaFile = optarg; - break; - case 'b': - batchMode = 1; - break; - case 'c': - cropSize = atoi(optarg); - break; - case 'w': - WINDOW_SIZE = atoi(optarg); - break; - case 'x': - seqFile1 = optarg; - break; - case 'y': - seqFile2 = optarg; - break; - case 'u': - unmappedOutput = optarg; - break; - case 'o': - mappingOutput = getMem(FILE_NAME_LENGTH); - mappingOutputPath = getMem(FILE_NAME_LENGTH); - stripPath (optarg, &mappingOutputPath, &mappingOutput); - break; - case 'n': - maxHits = atoi(optarg); - break; - case 'e': - errThreshold = atoi(optarg); - break; - case 'l': - minPairEndedDistance = atoi(optarg); - break; - case 'm': - maxPairEndedDistance = atoi(optarg); - break; - case 'h': - printHelp(); - return 0; - break; - case 'v': - fprintf(stdout, "%s.%s\n", versionNumber, versionNumberF); - return 0; - break; - /* case '?': - fprintf(stderr, "Unknown parameter: %s\n", longOptions[index].name); - abort(); - break;*/ - } - - } - if (indexingMode + searchingMode != 1) - { - fprintf(stdout, "ERROR: Indexing / Searching mode should be selected\n"); - return 0; - } - - if (WINDOW_SIZE > 15 || WINDOW_SIZE < 11) - { - fprintf(stdout, "ERROR: Window size should be in [12..15]\n"); - return 0; - } - - - if ( indexingMode ) - { - CONTIG_SIZE = 15000000; - CONTIG_MAX_SIZE = 40000000; - - if (batchMode) - { - batchFile = fastaFile; - fastaFile = NULL; - } - - if (batchFile == NULL && fastaFile == NULL) - { - fprintf(stdout, "ERROR: Reference(s) should be indicated for indexing\n"); - return 0; - } - - if (pairedEndDiscordantMode) - { - fprintf(stdout, "ERROR: --discordant cannot be used in indexing mode. \n"); - return 0; - } - - } - - - if ( searchingMode ) - { - CONTIG_SIZE = 300000000; - CONTIG_MAX_SIZE = 300000000; - - - if (batchMode) - { - batchFile = fastaFile; - fastaFile = NULL; - } - - if (batchFile == NULL && fastaFile == NULL) - { - fprintf(stdout, "ERROR: Index File(s) should be indiciated for searching\n"); - return 0; - } - - if (seqFile1 == NULL && seqFile2 == NULL) - { - fprintf(stdout, "ERROR: Please indicate a sequence file for searching.\n"); - return 0; - } - - - if (!pairedEndMode && seqFile2 != NULL) - { - fprintf(stdout, "ERROR: Second File can be indicated in pairedend mode\n"); - return 0; - } - - if (pairedEndMode && (minPairEndedDistance <0 || maxPairEndedDistance < 0 || minPairEndedDistance > maxPairEndedDistance)) - { - fprintf(stdout, "ERROR: Please enter a valid range for pairedend sequences.\n"); - return 0; - } - - if (pairedEndMode && seqFile1 == NULL) - { - fprintf(stdout, "ERROR: Please indicate the first file for pairedend search.\n"); - return 0; - } - - if (!pairedEndMode && pairedEndDiscordantMode) - { - fprintf(stdout, "ERROR: --discordant should be used with --pe"); - return 0; - } - - if (!pairedEndMode && pairedEndProfilingMode) - { - fprintf(stdout, "ERROR: --profile should be used with --pe"); - return 0; - } - } - - int i = 0; - - - if (batchMode) - { - FILE *fp = fileOpen(batchFile, "r"); - - if (fp == NULL) - return 0; - - fileCnt = 0; - - while ( fgets(fileName[fileCnt][0], FILE_NAME_LENGTH, fp)) - { - for (i = strlen(fileName[fileCnt][0])-1; i>=0; i--) - if ( !isspace(fileName[fileCnt][0][i])) - break; - fileName[fileCnt][0][i+1] = '\0'; - - if (strcmp(fileName[fileCnt][0], "") != 0) - { - sprintf(fileName[fileCnt][1], "%s.index", fileName[fileCnt][0]); - fileCnt++; - } - } - } - else - { - sprintf(fileName[fileCnt][0], "%s", fastaFile); - sprintf(fileName[fileCnt][1], "%s.index", fileName[fileCnt][0]); - fileCnt++; - } - - - if (pairedEndProfilingMode) - { - - minPairEndedDistance = 0; - maxPairEndedDistance = 300000000; - - } - - if (pairedEndDiscordantMode) - { - minPairEndedDiscordantDistance = minPairEndedDistance; - maxPairEndedDiscordantDistance = maxPairEndedDistance; - - minPairEndedDistance = 0; - maxPairEndedDistance = 300000000; - } - - return 1; -} - - -void printHelp() -{ - char *errorType; - if (mrFAST) - { - fprintf(stdout,"mrFAST : Micro-Read Fast Alignment Search Tool.\n\n"); - fprintf(stdout,"Usage: mrfast [options]\n\n"); - errorType="edit distance"; - } - else - { - fprintf(stdout,"mrsFAST : Micro-Read Substitutions (only) Fast Alignment Search Tool.\n\n"); - fprintf(stdout,"mrsFAST is a cache oblivious read mapping tool. mrsFAST capable of mapping\n"); - fprintf(stdout,"single and paired end reads to the reference genome. Bisulfite treated \n"); - fprintf(stdout,"sequences are not supported in this version. By default mrsFAST reports \n"); - fprintf(stdout,"the output in SAM format.\n\n"); - fprintf(stdout,"Usage: mrsFAST [options]\n\n"); - errorType="hamming distance"; - } - - fprintf(stdout,"General Options:\n"); - fprintf(stdout," -v|--version\t\tCurrent Version.\n"); - fprintf(stdout," -h\t\t\tShows the help file.\n"); - fprintf(stdout,"\n\n"); - - fprintf(stdout,"Indexing Options:\n"); - fprintf(stdout," --index [file]\t\tGenerate an index from the specified fasta file. \n"); - fprintf(stdout," -b\t\t\tIndicates the indexing will be done in batch mode.\n\t\t\tThe file specified in --index should contain the \n\t\t\tlist of fasta files.\n"); - fprintf(stdout," --ws [int]\t\tSet window size for indexing (default:12 max:14).\n"); - fprintf(stdout,"\n\n"); - - fprintf(stdout,"Searching Options:\n"); - fprintf(stdout," --search [file]\tSearch in the specified genome. Provide the path to the fasta file. \n\t\t\tIndex file should be in the same directory.\n"); - fprintf(stdout," -b\t\t\tIndicates the mapping will be done in batch mode. \n\t\t\tThe file specified in --search should contain the \n\t\t\tlist of fasta files.\n"); - fprintf(stdout," --pe \t\t\tSearch will be done in Paired-End mode.\n"); - fprintf(stdout," --seq [file]\t\tInput sequences in fasta/fastq format [file]. If \n\t\t\tpaired end reads are interleaved, use this option.\n"); - fprintf(stdout," --seq1 [file]\t\tInput sequences in fasta/fastq format [file] (First \n\t\t\tfile). Use this option to indicate the first file of \n\t\t\tpaired end reads. \n"); - fprintf(stdout," --seq2 [file]\t\tInput sequences in fasta/fastq format [file] (Second \n\t\t\tfile). Use this option to indicate the second file of \n\t\t\tpaired end reads. \n"); - fprintf(stdout," -o [file]\t\tOutput of the mapped sequences. The default is \"output\".\n"); - fprintf(stdout," -u [file]\t\tSave unmapped sequences in fasta/fastq format.\n"); - fprintf(stdout," --best \t\tOnly the best mapping from all the possible mapping is returned.\n"); - fprintf(stdout," --seqcomp \t\tIndicates that the input sequences are compressed (gz).\n"); - fprintf(stdout," --outcomp \t\tIndicates that output file should be compressed (gz).\n"); - fprintf(stdout," -e [int]\t\tMaximum allowed %s (default 2).\n", errorType); - fprintf(stdout," --min [int]\t\tMin distance allowed between a pair of end sequences.\n"); - fprintf(stdout," --max [int]\t\tMax distance allowed between a pair of end sequences.\n"); - - fprintf(stdout," --maxoea [int]\t\tMax number of One End Anchored (OEA) returned for each read pair. We recommend 100 or above for NovelSeq use.\n"); - fprintf(stdout," --maxdis [int]\t\tMax number of discordant map locations returned for each read pair. We recommend 300 or above for VariationHunter use.\n"); -}
--- a/mrfast-2.1.0.4/CommandLineParser.h Tue Feb 21 10:29:47 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ -/* - * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * Redistributions of source code must retain the above copyright notice, this list - * of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or other - * materials provided with the distribution. - * - Neither the names of the University of Washington, Simon Fraser University, - * nor the names of its contributors may be - * used to endorse or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - Authors: - Farhad Hormozdiari - Faraz Hach - Can Alkan - Emails: - farhadh AT uw DOT edu - fhach AT cs DOT sfu DOT ca - calkan AT uw DOT edu -*/ - - - -#ifndef __COMMAND_LINE_PARSER__ -#define __COMMAND_LINE_PARSER__ - -int parseCommandLine (int argc, char *argv[]); - -#endif
--- a/mrfast-2.1.0.4/Common.c Tue Feb 21 10:29:47 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,184 +0,0 @@ -/* - * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * Redistributions of source code must retain the above copyright notice, this list - * of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or other - * materials provided with the distribution. - * - Neither the names of the University of Washington, Simon Fraser University, - * nor the names of its contributors may be - * used to endorse or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - Authors: - Farhad Hormozdiari - Faraz Hach - Can Alkan - Emails: - farhadh AT uw DOT edu - fhach AT cs DOT sfu DOT ca - calkan AT uw DOT edu - */ - - - -#include <stdio.h> -#include <stdlib.h> -#include <sys/time.h> -#include <zlib.h> -#include <string.h> -#include "Common.h" - - -unsigned short SEQ_LENGTH = 0; -long long memUsage = 0; -/**********************************************/ -FILE *fileOpen(char *fileName, char *mode) -{ - FILE *fp; - fp = fopen (fileName, mode); - if (fp == NULL) - { - fprintf(stdout, "Error: Cannot Open the file %s\n", fileName); - fflush(stdout); - exit(0); - } - return fp; -} -/**********************************************/ -gzFile fileOpenGZ(char *fileName, char *mode) -{ - gzFile gzfp; - gzfp = gzopen (fileName, mode); - if (gzfp == Z_NULL) - { - fprintf(stdout, "Error: Cannot Open the file %s\n", fileName); - fflush(stdout); - exit(0); - } - return gzfp; -} -/**********************************************/ -double getTime(void) -{ - struct timeval t; - gettimeofday(&t, NULL); - return t.tv_sec+t.tv_usec/1000000.0; -} - -/**********************************************/ -char reverseCompleteChar(char c) -{ - char ret; - switch (c) - { - case 'A': - ret = 'T'; - break; - case 'T': - ret = 'A'; - break; - case 'C': - ret = 'G'; - break; - case 'G': - ret = 'C'; - break; - default: - ret = 'N'; - break; - } - return ret; -} -/**********************************************/ -void reverseComplete (char *seq, char *rcSeq , int length) -{ - int i; - for (i=0; i<length; i++) - { - rcSeq[i]=reverseCompleteChar (seq[length-1-i]) ; - } -} -/**********************************************/ -void * getMem(size_t size) -{ - void *ret; - ret = malloc(size); - if (ret == NULL){ - fprintf(stderr, "Cannot allocate memory. Currently addressed memory = %0.2f MB, requested memory = %0.2f MB.\nCheck the available main memory, and if you have user limits (ulimit -v).\n", getMemUsage(), (double)size); - exit(0); - } - memUsage+=size; - return ret; -} -/**********************************************/ -void freeMem(void *ptr, size_t size) -{ - memUsage-=size; - free(ptr); -} -/**********************************************/ -double getMemUsage() -{ - return memUsage/1048576.0; -} -/**********************************************/ -void reverse (char *seq, char *rcSeq , int length) -{ - int i; - int l = length; - if(l != strlen(seq)) - l = strlen(seq); - for (i=0; i<l; i++) - { - rcSeq[i]=seq[l-1-i] ; - } - rcSeq[l] = '\0'; -} -/**********************************************/ -void stripPath(char *full, char **path, char **fileName) -{ - int i; - int pos = -1; - - for (i=strlen(full)-1; i>=0; i--) - { - if (full[i]=='/') - { - pos = i; - break; - } - - } - - if (pos != -1) - { - sprintf(*fileName, "%s%c", (full+pos+1), '\0'); - full[pos+1]='\0'; - sprintf(*path,"%s%c", full, '\0'); - } - else - { - sprintf(*fileName, "%s%c", full, '\0'); - sprintf(*path,"%c", '\0'); - } -}
--- a/mrfast-2.1.0.4/Common.h Tue Feb 21 10:29:47 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,111 +0,0 @@ -/* - * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * Redistributions of source code must retain the above copyright notice, this list - * of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or other - * materials provided with the distribution. - * - Neither the names of the University of Washington, Simon Fraser University, - * nor the names of its contributors may be - * used to endorse or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - Authors: - Farhad Hormozdiari - Faraz Hach - Can Alkan - Emails: - farhadh AT uw DOT edu - fhach AT cs DOT sfu DOT ca - calkan AT uw DOT edu - */ - - - -#ifndef __COMMON__ -#define __COMMON__ - -#include <zlib.h> - -#define SEQ_MAX_LENGTH 300 // Seq Max Length -#define CONTIG_OVERLAP 200 // No. of characters overlapped between contings -#define CONTIG_NAME_SIZE 200 // Contig name max size -#define FILE_NAME_LENGTH 400 // Filename Max Length -#define DISCORDANT_CUT_OFF 800 -#define MAX_OPEN_FILE 600 -#define MAX_TRANS_CHROMOSAL_OUTPUT 50 -#define MAX_OEA_OUT 500 - -extern unsigned int CONTIG_SIZE; -extern unsigned int CONTIG_MAX_SIZE; - - -extern unsigned char WINDOW_SIZE ; // WINDOW SIZE for indexing/searching -extern unsigned short SEQ_LENGTH; // Sequence(read) length - -extern char *versionNumber; -extern char *versionNumberF; -extern unsigned char mrFAST; - - -extern int maxOEAOutput; -extern int maxDiscordantOutput; -extern int uniqueMode; -extern int indexingMode; -extern int searchingMode; -extern int pairedEndMode; -extern int pairedEndDiscordantMode; -extern int transChromosal; -extern int pairedEndProfilingMode; -extern int bestMode; -extern int nosamMode; -extern int seqCompressed; -extern int outCompressed; -extern int cropSize; -extern int progressRep; -extern char *seqFile1; -extern char *seqFile2; -extern char *seqUnmapped; -extern char *mappingOutput; -extern char *mappingOutputPath; -extern char *unmappedOutput; -extern unsigned char seqFastq; -extern unsigned char errThreshold; -extern unsigned char maxHits; -extern int minPairEndedDiscordantDistance; -extern int maxPairEndedDiscordantDistance; -extern int minPairEndedDistance; -extern int maxPairEndedDistance; -extern char fileName[1000][2][FILE_NAME_LENGTH]; -extern int fileCnt; -extern long long memUsage; - -FILE * fileOpen(char *fileName, char *mode); -gzFile fileOpenGZ(char *fileName, char *mode); -double getTime(void); -void reverseComplete (char *seq, char *rcSeq , int length); -void * getMem(size_t size); -void freeMem(void * ptr, size_t size); -double getMemUsage(); -void reverse (char *seq, char *rcSeq , int length); -void stripPath(char *full, char **path, char **fileName); -#endif
--- a/mrfast-2.1.0.4/HashTable.c Tue Feb 21 10:29:47 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,472 +0,0 @@ -/* - * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * Redistributions of source code must retain the above copyright notice, this list - * of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or other - * materials provided with the distribution. - * - Neither the names of the University of Washington, Simon Fraser University, - * nor the names of its contributors may be - * used to endorse or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - Authors: - Farhad Hormozdiari - Faraz Hach - Can Alkan - Emails: - farhadh AT uw DOT edu - fhach AT cs DOT sfu DOT ca - calkan AT uw DOT edu -*/ - - - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <math.h> -#include "Common.h" -#include "RefGenome.h" -#include "HashTable.h" -/**********************************************/ -FILE *_ih_fp = NULL; -IHashTable *_ih_hashTable = NULL; -int _ih_maxHashTableSize = 0; -char *_ih_refGen = NULL; -char *_ih_refGenName = NULL; -long long _ih_memUsage = 0; -int _ih_refGenOff = 0; -/**********************************************/ - -int hashVal(char *seq) -{ - int i=0; - int val=0, numericVal=0; - - while(i<WINDOW_SIZE) - { - switch (seq[i]) - { - case 'A': - numericVal = 0; - break; - case 'C': - numericVal = 1; - break; - case 'G' : - numericVal = 2; - break; - case 'T': - numericVal = 3; - break; - default: - return -1; - break; - } - val = (val << 2)|numericVal; - i++; - } - return val; -} -/**********************************************/ -void freeIHashTableContent(IHashTable *hashTable, unsigned int maxSize) -{ - int i=0; - for (i=0; i<maxSize; i++) - { - if (hashTable[i].locs != NULL) - { - freeMem(hashTable[i].locs, (hashTable[i].locs[0]+1)*(sizeof(unsigned int))); - hashTable[i].locs = NULL; - } - } -} -/**********************************************/ -void initSavingIHashTable(char *fileName) -{ - int tmp; - _ih_fp = fileOpen(fileName, "w"); - unsigned char bIndex = 0; // Bisulfite Index - - // First Two bytes are indicating the type of the index & window size - tmp = fwrite(&bIndex, sizeof(bIndex), 1, _ih_fp); - tmp = fwrite(&WINDOW_SIZE, sizeof(WINDOW_SIZE), 1, _ih_fp); - - if (tmp == 0) - fprintf(stderr, "Write error while initializing hash table.\n"); - -} -/**********************************************/ -void finalizeSavingIHashTable() -{ - fclose(_ih_fp); -} -/**********************************************/ -void saveIHashTable(IHashTable *hashTable, unsigned int size, unsigned int maxSize, char *refGen, char *refGenName, int refGenOffset) -{ - int tmp; - - // Every Chunk starts with a byte indicating whether it has extra info; - unsigned char extraInfo = 0; - tmp = fwrite (&extraInfo, sizeof(extraInfo), 1, _ih_fp); - - short len = strlen(refGenName); - tmp = fwrite(&len, sizeof(len), 1, _ih_fp); - tmp = fwrite(refGenName, sizeof(char), len, _ih_fp); - - tmp = fwrite(&refGenOffset, sizeof(refGenOffset), 1, _ih_fp); - - unsigned int refGenLength = strlen(refGen); - tmp = fwrite(&refGenLength, sizeof(refGenLength), 1, _ih_fp); - - tmp = fwrite(refGen, sizeof(char), refGenLength, _ih_fp); - tmp = fwrite(&size, sizeof(size), 1, _ih_fp); - - int i=0,j=0; - unsigned char cnt=0; - for (i=0; i<maxSize; i++) - { - if (hashTable[i].locs != NULL) - { - tmp = fwrite(&i, sizeof(i), 1, _ih_fp); - - if (hashTable[i].locs[0] < 250) - { - cnt = hashTable[i].locs[0]; - tmp = fwrite(&cnt, sizeof(cnt), 1, _ih_fp); - } - else - { - cnt =0; - tmp = fwrite (&cnt, sizeof(cnt), 1, _ih_fp); - tmp = fwrite (&(hashTable[i].locs[0]), sizeof(hashTable[i].locs[0]), 1, _ih_fp); - } - - for (j=1; j<=hashTable[i].locs[0]; j++) - tmp = fwrite(&(hashTable[i].locs[j]), sizeof(hashTable[i].locs[j]), 1, _ih_fp); - } - } - - if (tmp == 0) - fprintf(stderr, "Write error while saving hash table.\n"); -} -/**********************************************/ -unsigned int addIHashTableLocation(IHashTable *hashTable, int hv, int location) -{ - unsigned int sizeInc = 0; - - if (hashTable[hv].locs == NULL) - { - sizeInc = 1; - hashTable[hv].locs = getMem (sizeof(unsigned int)*2); - hashTable[hv].locs[0]=1; - hashTable[hv].locs[1]=location; - } - else - { - int size = hashTable[hv].locs[0]; - int i; - unsigned int *tmp = getMem( (size + 2) * sizeof(unsigned int) ); - - for (i = 0; i <= size; i++) - { - tmp[i] = hashTable[hv].locs[i]; - } - size++; - tmp[0] = size; - tmp[size] = location; - - freeMem(hashTable[hv].locs, (hashTable[hv].locs[0]*(sizeof(unsigned int)))); - hashTable[hv].locs = tmp; - } - return sizeInc; -} -/**********************************************/ -void generateIHashTable(char *fileName, char *indexName) -{ - double startTime = getTime(); - unsigned int hashTableSize = 0; - unsigned int hashTableMaxSize = pow(4, WINDOW_SIZE); - IHashTable *hashTable = getMem(sizeof(IHashTable)*hashTableMaxSize); - char *refGenName; - char *refGen; - int refGenOff = 0; - int i, hv, l, flag; - - - for ( i = 0; i < hashTableMaxSize; i++) - { - hashTable[i].locs = NULL; - } - - //Loading Fasta File - if (!initLoadingRefGenome(fileName)) - return; - initSavingIHashTable(indexName); - - fprintf(stdout, "Generating Index from %s", fileName); - fflush(stdout); - - char *prev = getMem (CONTIG_NAME_SIZE); - prev[0]='\0'; - - do - { - flag = loadRefGenome (&refGen, &refGenName, &refGenOff); - - if ( strcmp(prev, refGenName) != 0) - { - fprintf(stdout, "\n - %s ", refGenName); - fflush(stdout); - sprintf(prev, "%s", refGenName); - } - else - { - fprintf(stdout, "."); - fflush(stdout); - } - - l = strlen(refGen) - WINDOW_SIZE; - - for (i=0; i < l; i++) - { - hv = hashVal(refGen + i); - if (hv != -1) - { - hashTableSize += addIHashTableLocation (hashTable, hv, i+1); - } - } - - saveIHashTable(hashTable, hashTableSize, hashTableMaxSize, refGen, refGenName, refGenOff); - freeIHashTableContent(hashTable, hashTableMaxSize); - hashTableSize = 0; - } while (flag); - - freeMem(prev, CONTIG_NAME_SIZE); - freeMem(hashTable, sizeof(IHashTable)*hashTableMaxSize); - - finalizeLoadingRefGenome(); - finalizeSavingIHashTable(); - - fprintf(stdout, "\nDONE in %0.2fs!\n", (getTime()-startTime)); -} - -/**********************************************/ -void finalizeLoadingIHashTable() -{ - freeIHashTableContent(_ih_hashTable, _ih_maxHashTableSize); - freeMem(_ih_hashTable, sizeof(IHashTable)* _ih_maxHashTableSize); - freeMem(_ih_refGen, strlen(_ih_refGen)+1) ; - freeMem(_ih_refGenName, strlen(_ih_refGenName)+1); - fclose(_ih_fp); -} - -/**********************************************/ -int loadIHashTable(double *loadTime, int errThreshould) -{ - double startTime = getTime(); - unsigned char extraInfo = 0; - short len; - unsigned int refGenLength; - unsigned int hashTableSize; - unsigned int tmpSize; - int tmp; - int i=0,j=0; - - if ( fread(&extraInfo, sizeof(extraInfo), 1, _ih_fp) != sizeof(extraInfo) ) - return 0; - - freeIHashTableContent(_ih_hashTable, _ih_maxHashTableSize); - freeMem(_ih_refGen, strlen(_ih_refGen)+1) ; - freeMem(_ih_refGenName, strlen(_ih_refGenName)+1); - - // Reading Chr Name - tmp = fread(&len, sizeof(len), 1, _ih_fp); - if (tmp == 0){ - fprintf(stderr, "Read error while loading hash table.\n"); - exit(0); - } - - _ih_refGenName = getMem(sizeof(char)* (len+1)); - tmp = fread(_ih_refGenName, sizeof(char), len, _ih_fp); - _ih_refGenName [len] ='\0'; - - tmp = fread(&_ih_refGenOff, sizeof (_ih_refGenOff), 1, _ih_fp); - - // Reading Size and Content of Ref Genome - tmp = fread(&refGenLength, sizeof(refGenLength), 1, _ih_fp); - _ih_refGen = getMem(sizeof(char)*(refGenLength+1)); - tmp = fread(_ih_refGen, sizeof(char), refGenLength, _ih_fp); - _ih_refGen[refGenLength]='\0'; - - //Reading Hashtable Size and Content - tmp = fread(&hashTableSize, sizeof(hashTableSize), 1, _ih_fp); - - unsigned int hv; - unsigned char cnt=0; - unsigned int index = 1; - unsigned int tmpValue = 0; - for (i=0; i<hashTableSize; i++) - { - index = 1; - tmp = fread(&hv, sizeof(hv), 1, _ih_fp); - tmp = fread(&cnt, sizeof(cnt), 1, _ih_fp); - - if (cnt>0) - { - tmpSize = cnt; - } - else - { - tmp = fread(&tmpSize, sizeof(tmpSize), 1, _ih_fp); - } - - _ih_hashTable[hv].locs = getMem( sizeof(unsigned int)* (tmpSize+1) ); - _ih_hashTable[hv].locs[0] = tmpSize; - -/* for (j=1; j<=tmpSize; j++) - tmp = fread(&(_ih_hashTable[hv].locs[j]), sizeof(_ih_hashTable[hv].locs[j]), 1, _ih_fp); -*/ - - if(tmpSize != 0) - { - tmp = fread(&(_ih_hashTable[hv].locs[index]), sizeof(_ih_hashTable[hv].locs[index]), 1, _ih_fp); - index++; - } - - for (j=2; j<=tmpSize; j++) - { - tmp = fread(&tmpValue, sizeof(_ih_hashTable[hv].locs[index-1]), 1, _ih_fp); - if(abs(tmpValue-_ih_hashTable[hv].locs[index-1]) > errThreshould) - { - _ih_hashTable[hv].locs[index] = tmpValue; - index++; - } - } - _ih_hashTable[hv].locs[0] = index-1; - } - *loadTime = getTime()-startTime; - return 1; -} -/**********************************************/ -unsigned int *getIHashTableCandidates(int hv) -{ - if ( hv != -1 ) - return _ih_hashTable[hv].locs; - else - return NULL; -} -/**********************************************/ -/**********************************************/ -/**********************************************/ -void configHashTable() -{ - if (WINDOW_SIZE <= 15) - { - generateHashTable = &generateIHashTable; - loadHashTable = &loadIHashTable; - finalizeLoadingHashTable = &finalizeLoadingIHashTable; - getCandidates = &getIHashTableCandidates; - } - else - { - - - } -} -/**********************************************/ -int initLoadingHashTable(char *fileName) -{ - int i; - unsigned char bsIndex; - int tmp; - - _ih_fp = fileOpen(fileName, "r"); - - if (_ih_fp == NULL) - return 0; - - tmp = fread(&bsIndex, sizeof(bsIndex), 1, _ih_fp); - - if (tmp == 0){ - fprintf(stderr, "Read error while inilializing hash table loading.\n"); - exit(0); - } - - if (bsIndex) - { - fprintf(stdout, "Error: Wrong Type of Index indicated"); - return 0; - } - - tmp = fread(&WINDOW_SIZE, sizeof(WINDOW_SIZE), 1, _ih_fp); - - configHashTable(); - - if (_ih_maxHashTableSize != pow(4, WINDOW_SIZE)) - { - - if (_ih_hashTable != NULL) - { - freeIHashTableContent(_ih_hashTable, _ih_maxHashTableSize); - freeMem(_ih_hashTable, sizeof(IHashTable)* _ih_maxHashTableSize); - freeMem(_ih_refGen, strlen(_ih_refGen)+1) ; - freeMem(_ih_refGenName, strlen(_ih_refGenName)+1); - } - - _ih_maxHashTableSize = pow(4, WINDOW_SIZE); - - _ih_hashTable = getMem (sizeof(IHashTable) * _ih_maxHashTableSize); - for (i=0; i<_ih_maxHashTableSize; i++) - _ih_hashTable[i].locs = NULL; - _ih_refGen = getMem(1); - _ih_refGen[0]='\0'; - _ih_refGenName = getMem(1); - _ih_refGenName[0] = '\0'; - } - - return 1; -} -/**********************************************/ -char *getRefGenome() -{ - return _ih_refGen; - -} -/**********************************************/ -char *getRefGenomeName() -{ - return _ih_refGenName; - -} -/**********************************************/ -int getRefGenomeOffset() -{ - return _ih_refGenOff; - -} -/**********************************************/ -HashTable *getHashTable() -{ - return NULL; -}
--- a/mrfast-2.1.0.4/HashTable.h Tue Feb 21 10:29:47 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ -/* - * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * Redistributions of source code must retain the above copyright notice, this list - * of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or other - * materials provided with the distribution. - * - Neither the names of the University of Washington, Simon Fraser University, - * nor the names of its contributors may be - * used to endorse or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - Authors: - Farhad Hormozdiari - Faraz Hach - Can Alkan - Emails: - farhadh AT uw DOT edu - fhach AT cs DOT sfu DOT ca - calkan AT uw DOT edu - */ - - - -#ifndef __HASH_TABLE__ -#define __HASH_TABLE__ - -typedef struct HashTable -{ - long long hv; - int *locs; -} HashTable; - -typedef struct -{ - unsigned int *locs; -} IHashTable; - -int hashVal(char *seq); -void configHashTable(); -char *getRefGenome(); -char *getRefGenomeName(); -int getRefGenomeOffset(); -int initLoadingHashTable(char *fileName); -HashTable *getHashTable(); - -void (*generateHashTable)(char *fileName, char *indexName); -int (*loadHashTable)(double *loadTime, int errThreshould); -void (*finalizeLoadingHashTable)(); -unsigned int *(*getCandidates)(int hv); - -#endif
--- a/mrfast-2.1.0.4/Makefile Tue Feb 21 10:29:47 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,18 +0,0 @@ -CC=gcc -CFLAGS = -c -O3 -Wall -LDFLAGS = -lz -lm -SOURCES = baseFAST.c CommandLineParser.c Common.c HashTable.c MrFAST.c Output.c Reads.c RefGenome.c -OBJECTS = $(SOURCES:.c=.o) -EXECUTABLE = mrfast - - -all: $(SOURCES) $(EXECUTABLE) - rm -rf *.o - -$(EXECUTABLE): $(OBJECTS) - $(CC) $(OBJECTS) -o $@ $(LDFLAGS) - -.c.o: - $(CC) $(CFLAGS) $< -o $@ -clean: - rm -f *.o *~ #*
--- a/mrfast-2.1.0.4/MrFAST.c Tue Feb 21 10:29:47 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,8148 +0,0 @@ -/* - * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * Redistributions of source code must retain the above copyright notice, this list - * of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or other - * materials provided with the distribution. - * - Neither the names of the University of Washington, Simon Fraser University, - * nor the names of its contributors may be - * used to endorse or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* -Authors: - Farhad Hormozdiari - Faraz Hach - Can Alkan -Emails: - farhadh AT uw DOT edu - fhach AT cs DOT sfu DOT ca - calkan AT uw DOT edu -*/ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <math.h> -#include <dirent.h> -#include <xmmintrin.h> -#include <emmintrin.h> -#include <mmintrin.h> - - -#include "Common.h" -#include "Reads.h" -#include "HashTable.h" -#include "Output.h" -#include "MrFAST.h" -#include "RefGenome.h" - - -#define min(a,b) ((a)>(b)?(b):(a)) -#define min3(a,b,c) ((a)>(b)?(b>c?c:b):(a>c?c:a)) -#define CHARCODE(a) (a=='A' ? 0 : (a=='C' ? 1 : (a=='G' ? 2 : (a=='T' ? 3 : 4)))) - -#define MAX_REF_SIZE 18 - - -float calculateScore(int index, char *seq, char *qual, char *md); -unsigned char mrFAST = 1; -char *versionNumberF="0.4"; - -long long verificationCnt = 0; -long long mappingCnt = 0; -long long mappedSeqCnt = 0; -long long completedSeqCnt = 0; -char *mappingOutput; -/**********************************************/ -char *_msf_refGen = NULL; -int _msf_refGenLength = 0; -int _msf_refGenOffset = 0; -char *_msf_refGenName = NULL; - -int _msf_refGenBeg; -int _msf_refGenEnd; - -IHashTable *_msf_hashTable = NULL; - -int *_msf_samplingLocs; -int *_msf_samplingLocsEnds; -int _msf_samplingLocsSize; - -Read *_msf_seqList; -int _msf_seqListSize; - -Pair *_msf_sort_seqList = NULL; -int *_msf_map_sort_seqList; - -ReadIndexTable *_msf_rIndex = NULL; -int _msf_rIndexSize; -int _msf_rIndexMax; - -SAM _msf_output; - -OPT_FIELDS *_msf_optionalFields; - -char *_msf_op; - -int *_msf_verifiedLocs = NULL; - -char _msf_numbers[200][3]; -char _msf_cigar[5]; - -MappingInfo *_msf_mappingInfo; - -int *_msf_seqHits; -int _msf_openFiles = 0; -int _msf_maxLSize=0; -int _msf_maxRSize=0; - -BestFullMappingInfo *bestHitMappingInfo; - -/*************************/ -int _msf_maxFile=0; -char _msf_fileName[4000][200][2][FILE_NAME_LENGTH]; -int _msf_fileCount[4000]; - -char *_msf_readHasConcordantMapping; //boolean if a read has concordant mapping :D - -int *_msf_oeaMapping; -int *_msf_discordantMapping; - -FILE *bestConcordantFILE; -FILE *bestDiscordantFILE; - -int counter = 0; - -int scoreF[200][200]; -int scoreB[200][200]; - -int score[200][200]; -int direction1[200][200]; -int direction2[200][200]; - -__m128i MASK; - -int lookUpTable[15625][15625]; - -/**************************************************Methods***************************************************/ -int smallEditDistanceF(char *a, int lena, char *b, int lenb) -{ - int matrix[20][20]; - int i = 0; - int j = 0; - - for(i = 0; i <= lena; i++) - { - matrix[0][i] = i; - } - - for(i = 0; i <= lenb; i++) - { - matrix[i][0] = i; - } - - - for(i = 1; i <= lenb; i++) - { - for(j = 1; j <= lena; j++) - { - matrix[i][j] = min3(matrix[i-1][j-1]+ (a[j-1] != b[i-1]),matrix[i][j-1]+1 ,matrix[i-1][j]+1); - } - } - return (matrix[lenb][lena]>errThreshold?-1:matrix[lenb][lena]); -} - -int smallEditDistanceB(char *a, int lena, char *b, int lenb) -{ - int matrix[20][20]; - int i = 0; - int j = 0; - - for(i = 0; i <= lena; i++) - { - matrix[0][i] = i; - } - - for(i = 0; i <= lenb; i++) - { - matrix[i][0] = i; - } - - - for(i = 1; i <= lenb; i++) - { - for(j = 1; j <= lena; j++) - { - matrix[i][j] = min3(matrix[i-1][j-1]+ (*(a-j+1) != *(b-i+1)),matrix[i][j-1]+1 ,matrix[i-1][j]+1); - } - } - - return (matrix[lenb][lena]>errThreshold?-1:matrix[lenb][lena]); -} - -char fastEditDistance(int per1, int per2) -{ - - int i = 0; - int j = 0; - - char str1[7]; - char str2[7]; - - int val1 = per1; - int val2 = per2; - - int index = 0; - int mod = 0; - - int matrix[7][7]; - - int min = 20; - - while(index < 6) - { - mod = val1%5; - str1[5-index] = (mod==0 ? 'A':(mod==1?'C':mod==2?'G':(mod==3)?'T':'N')); - val1 = val1 /5; - index++; - } - - str1[6] = '\0'; - - index = 0; - while(index < 6) - { - mod=val2%5; - str2[5-index] = (mod==0 ? 'A':(mod==1?'C':mod==2?'G':(mod==3)?'T':'N')); - val2 = val2 / 5; - index++; - } - str2[6] = '\0'; - - for(i = 0; i < 7; i++) - { - matrix[0][i] = i; - matrix[i][0] = i; - } - - for(i = 1; i < 7; i++) - { - for(j = 1; j < 7; j++) - { - matrix[i][j] = min3(matrix[i-1][j-1]+ (str1[i-1] != str2[j-1]),matrix[i][j-1]+1 ,matrix[i-1][j]+1); - } - } - - for(i = 0; i < 7; i++) - { - if(matrix[i][6] < min) - min = matrix[i][6]; - } - - for(i = 0; i < 7; i++) - { - if(matrix[6][i] < min) - min = matrix[6][i]; - } - return min; -} - -void initLookUpTable() -{ - int i = 0; - - MASK = _mm_insert_epi16(MASK,1,0); - MASK = _mm_insert_epi16(MASK,1,1); - MASK = _mm_insert_epi16(MASK,1,2); - MASK = _mm_insert_epi16(MASK,1,3); - MASK = _mm_insert_epi16(MASK,1,4); - MASK = _mm_insert_epi16(MASK,0,5); - MASK = _mm_insert_epi16(MASK,0,6); - MASK = _mm_insert_epi16(MASK,0,7); - - for(i = 0 ; i < errThreshold + 1; i++) - { - scoreF[0][i] = i; - scoreF[i][0] = i; - } - - for(i = 0 ; i < errThreshold + 1; i++) - { - scoreB[0][i] = i; - scoreB[i][0] = i; - } - - -} - -int backwardEditDistanceSSE2Odd(char *a, int lena, char *b,int lenb) -{ - if(lenb == 0 || lena == 0) - return 0; - - int i = 0; - int j = 0; - int k = 0; - - - int e = errThreshold; - - char flag = 0; - - int minError = 2*e; - - __m128i R0, R1; - __m128i Diag; - __m128i Side1, Side2; - __m128i Down1, Down2; - __m128i Error; - __m128i tmp; - - /* initialize */ - R0 = _mm_setzero_si128 (); - R1 = _mm_setzero_si128 (); - Diag = _mm_setzero_si128 (); - Side1 = _mm_setzero_si128 (); - Side2 = _mm_setzero_si128 (); - Down1 = _mm_setzero_si128 (); - Down2 = _mm_setzero_si128 (); - Error = _mm_setzero_si128 (); - tmp = _mm_setzero_si128 (); - /* end initialize */ - - if(lenb <= e) - { - return smallEditDistanceB(a,lena,b,lenb); - } - - - R1 = _mm_xor_si128(R1, R1); - R0 = _mm_xor_si128(R0, R0); - - Diag = _mm_xor_si128(Diag, Diag); - Side1 = _mm_xor_si128(Side1, Side1); - Down1 = _mm_xor_si128(Down1, Down1); - - Diag = _mm_insert_epi16(Diag,2*e,0); - - Side1 = _mm_insert_epi16(Side1,1,0); - Side1 = _mm_insert_epi16(Side1,2*e,1); - - Down1 = _mm_insert_epi16(Down1,2*e,0); - Down1 = _mm_insert_epi16(Down1,1,1); - Down1 = _mm_insert_epi16(Down1,2*e,2); - - R0 = _mm_insert_epi16(R0,0,0); - - R1 = _mm_insert_epi16(R1,1,0); - R1 = _mm_insert_epi16(R1,1,1); - - for(i=2; i <= e; i++) - { - //set side - Side1 = _mm_slli_si128(Side1,2); - Side1 = _mm_insert_epi16(Side1,1,0); - - Down1 = _mm_insert_epi16(Down1,1,0); - Down1 = _mm_slli_si128(Down1,2); - Down1 = _mm_insert_epi16(Down1,2*e,0); - - Diag = _mm_xor_si128(Diag, Diag); - if( i%2 == 0) - { - Diag = _mm_insert_epi16(Diag,2*e,0); - - for(j=1;j<=i-1;j++) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, *(b-(i/2-1+(i/2-j))) != *(a-(i/2-1-(i/2-j))),0); - } - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, 2*e,0); - - R0 = _mm_min_epi16(R1+Side1, _mm_slli_si128(R0,2)+Diag); - R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down1); - } - - else - { - Diag = _mm_insert_epi16(Diag,2*e,0); - for(j=i/2-1;j>=-i/2;j--) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, *(b-((i+1)/2+j-1)) != *(a-((i-1)/2-j-1)),0); - } - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, 2*e,0); - - R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); - R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); - } - } - Error = _mm_xor_si128(Error, Error); - Side2 = _mm_xor_si128(Side2, Side2); - Down2 = _mm_xor_si128(Down2, Down2); - Down1 = _mm_xor_si128(Down1, Down1); - - Error = _mm_insert_epi16(Error,e,0); - Side1 = _mm_insert_epi16(Side2,2*e,0); - Side2 = _mm_insert_epi16(Side2,2*e,0); - Down1 = _mm_insert_epi16(Down1,2*e,0); - - - for(j=0; j < e; j++) - { - Side2 = _mm_slli_si128(Side2, 2); - Side2 = _mm_insert_epi16(Side2,1,0); - - Side1 = _mm_slli_si128(Side1, 2); - Side1 = _mm_insert_epi16(Side1,1,0); - - Down1 = _mm_slli_si128(Down1, 2); - Down1 = _mm_insert_epi16(Down1,1,0); - - Down2 = _mm_slli_si128(Down2, 2); - Down2 = _mm_insert_epi16(Down2,1,0); - - Error = _mm_slli_si128(Error, 2); - Error = _mm_insert_epi16(Error, e, 0); - } - - Down2= _mm_slli_si128(Down2, 2); - Down2 = _mm_insert_epi16(Down2,2*e,0); - - for(; i <= 2*lenb-(e-1);i++) - { - flag = 0; - Diag = _mm_xor_si128(Diag, Diag); - if( i%2 == 0) - { - for(j=e/2;j>=-e/2;j--) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, *(b-(i/2-1+j)) != *(a-(i/2-1-j)),0); - } - - R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag); - R0 = _mm_min_epi16(R0, R1+Down1); - - - if(_mm_extract_epi16(R0,0) <= e) - flag = 1; - tmp = _mm_srli_si128(R0,2); - for(j=0; j <= e;j++) - { - if(_mm_extract_epi16(tmp,0) <= e) - flag = 1; - tmp = _mm_srli_si128(tmp,2); - } - - if(flag == 0) - return -1; - - if(i == 2*lenb-e) - { - tmp = _mm_srli_si128(R0,2); - for(k=0; k < e-2;k++) - tmp = _mm_srli_si128(tmp,2); - minError = _mm_extract_epi16(tmp,0); - } - - } - - else - { - for(j=e/2;j>=-e/2-1;j--) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, *(b-((i+1)/2+j-1)) != *(a-((i)/2-j-1)),0); - } - - // printf("@%d %d %d %d\n", _mm_extract_epi16(Diag,0), _mm_extract_epi16(Diag,1), _mm_extract_epi16(Diag,2), - // _mm_extract_epi16(Diag,3)); - - R1 = _mm_min_epi16(R0+Side2, R1+Diag); - - // printf("#~%d %d %d %d\n", _mm_extract_epi16(R1,0), _mm_extract_epi16(R1,1), _mm_extract_epi16(R1,2), - // _mm_extract_epi16(R1,3)); - - R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); - - // printf("$%d %d %d %d\n", _mm_extract_epi16(Side2,0), _mm_extract_epi16(Side2,1), _mm_extract_epi16(Side2,2), - // _mm_extract_epi16(Side2,3)); - - // printf("#%d %d %d %d\n", _mm_extract_epi16(R1,0), _mm_extract_epi16(R1,1), _mm_extract_epi16(R1,2), - // _mm_extract_epi16(R1,3)); - - - - if(i >= 2*lenb-e) - { - tmp = _mm_srli_si128(R1,2); - for(k=0; k < e-1;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - } - } - - //first cell - Diag = _mm_xor_si128(Diag,Diag); - Diag = _mm_insert_epi16(Diag, *(b-(lenb-3)) != *(a-lena), 0); - Diag = _mm_insert_epi16(Diag, *(b-(lenb-2)) != *(a-(lena-1)), 1); - Diag = _mm_insert_epi16(Diag, *(b-(lenb-1)) != *(a-(lena-2)), 2); - Diag = _mm_insert_epi16(Diag, 2*e, 3); - R1 = _mm_min_epi16(R0+Side2, R1+Diag); - R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); - - minError = min(minError, _mm_extract_epi16(R1,2)); - - //second cell - Diag = _mm_xor_si128(Diag,Diag); - Diag = _mm_insert_epi16(Diag, *(b-(lenb-2)) != *(a-(lena)), 0); - Diag = _mm_insert_epi16(Diag, *(b-(lenb-1)) != *(a-(lena-1)), 1); - Diag = _mm_insert_epi16(Diag, 2*e, 2); - - R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag); - R0 = _mm_min_epi16(R0, R1+Down1); - - minError = min(minError, _mm_extract_epi16(R0,1)); - - //third cell - Diag = _mm_xor_si128(Diag,Diag); - Diag = _mm_insert_epi16(Diag, *(b-(lenb-2)) != *(a-(lena+1)), 0); - Diag = _mm_insert_epi16(Diag, *(b-(lenb-1)) != *(a-(lena)), 1); - Diag = _mm_insert_epi16(Diag, 2*e, 2); - - R1 = _mm_min_epi16(R0+Side2, R1+Diag); - R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); - - minError = min(minError, _mm_extract_epi16(R1,1)); - - //forth - Diag = _mm_xor_si128(Diag,Diag); - Diag = _mm_insert_epi16(Diag, *(b-(lenb-1)) != *(a-(lena+1)), 0); - Diag = _mm_insert_epi16(Diag, 2*e, 1); - - R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag); - R0 = _mm_min_epi16(R0, R1+Down1); - - minError = min(minError, _mm_extract_epi16(R0,0)); - - //fifth - Diag = _mm_xor_si128(Diag,Diag); - Diag = _mm_insert_epi16(Diag, *(b-(lenb-1)) != *(a-(lena+2)), 0); - Diag = _mm_insert_epi16(Diag, 2*e, 1); - - R1 = _mm_min_epi16(R0+Side2, R1+Diag); - R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); - - minError = min(minError, _mm_extract_epi16(R0,0)); - - if(minError > e) - return -1; - return minError; -} - -int backwardEditDistanceSSE2G(char *a, int lena, char *b,int lenb) -{ - if(lenb == 0 || lena == 0) - return 0; - - int i = 0; - int j = 0; - int k = 0; - - - int e = errThreshold; - - char flag = 0; - - int minError = 2*e; - - __m128i R0, R1; - __m128i Diag; - __m128i Side1, Side2; - __m128i Down1, Down2; - __m128i Error; - __m128i tmp; - - /* initialize */ - R0 = _mm_setzero_si128 (); - R1 = _mm_setzero_si128 (); - Diag = _mm_setzero_si128 (); - Side1 = _mm_setzero_si128 (); - Side2 = _mm_setzero_si128 (); - Down1 = _mm_setzero_si128 (); - Down2 = _mm_setzero_si128 (); - Error = _mm_setzero_si128 (); - tmp = _mm_setzero_si128 (); - /* end initialize */ - - if(lenb <= e) - { - return smallEditDistanceB(a,lena,b,lenb); - } - - - R1 = _mm_xor_si128(R1, R1); - R0 = _mm_xor_si128(R0, R0); - - Diag = _mm_xor_si128(Diag, Diag); - Side1 = _mm_xor_si128(Side1, Side1); - Down1 = _mm_xor_si128(Down1, Down1); - - Diag = _mm_insert_epi16(Diag,2*e,0); - - Side1 = _mm_insert_epi16(Side1,1,0); - Side1 = _mm_insert_epi16(Side1,2*e,1); - - Down1 = _mm_insert_epi16(Down1,2*e,0); - Down1 = _mm_insert_epi16(Down1,1,1); - Down1 = _mm_insert_epi16(Down1,2*e,2); - - R0 = _mm_insert_epi16(R0,0,0); - - R1 = _mm_insert_epi16(R1,1,0); - R1 = _mm_insert_epi16(R1,1,1); - - for(i=2; i <= e; i++) - { - //set side - Side1 = _mm_slli_si128(Side1,2); - Side1 = _mm_insert_epi16(Side1,1,0); - - Down1 = _mm_insert_epi16(Down1,1,0); - Down1 = _mm_slli_si128(Down1,2); - Down1 = _mm_insert_epi16(Down1,2*e,0); - - Diag = _mm_xor_si128(Diag, Diag); - if( i%2 == 0) - { - Diag = _mm_insert_epi16(Diag,2*e,0); - - for(j=1;j<=i-1;j++) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, *(b-(i/2-1+(i/2-j))) != *(a-(i/2-1-(i/2-j))),0); - } - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, 2*e,0); - - R0 = _mm_min_epi16(R1+Side1, _mm_slli_si128(R0,2)+Diag); - R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down1); - } - - else - { - Diag = _mm_insert_epi16(Diag,2*e,0); - for(j=i/2-1;j>=-i/2;j--) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, *(b-((i+1)/2+j-1)) != *(a-((i-1)/2-j-1)),0); - } - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, 2*e,0); - - R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); - R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); - } - } - Error = _mm_xor_si128(Error, Error); - Side2 = _mm_xor_si128(Side2, Side2); - Down2 = _mm_xor_si128(Down2, Down2); - Down1 = _mm_xor_si128(Down1, Down1); - - Error = _mm_insert_epi16(Error,e,0); - Side2 = _mm_insert_epi16(Side2,2*e,0); - Down1 = _mm_insert_epi16(Down1,2*e,0); - - - for(j=0; j < e; j++) - { - Side2 = _mm_slli_si128(Side2, 2); - Side2 = _mm_insert_epi16(Side2,1,0); - - Down1 = _mm_slli_si128(Down1, 2); - Down1 = _mm_insert_epi16(Down1,1,0); - - Down2 = _mm_slli_si128(Down2, 2); - Down2 = _mm_insert_epi16(Down2,1,0); - - Error = _mm_slli_si128(Error, 2); - Error = _mm_insert_epi16(Error, e, 0); - } - - Down2= _mm_slli_si128(Down2, 2); - Down2 = _mm_insert_epi16(Down2,2*e,0); - - for(; i <= 2*lenb-(e-1);i++) - { - flag = 0; - Diag = _mm_xor_si128(Diag, Diag); - if( i%2 == 0) - { - for(j=e/2;j>=-e/2;j--) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, *(b-(i/2-1+j)) != *(a-(i/2-1-j)),0); - } - - R0 = _mm_min_epi16(R1+Side2, R0+Diag); - R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); - - if(_mm_extract_epi16(R0,0) <= e) - flag = 1; - tmp = _mm_srli_si128(R0,2); - for(j=0; j <= e;j++) - { - if(_mm_extract_epi16(tmp,0) <= e) - flag = 1; - tmp = _mm_srli_si128(tmp,2); - } - - if(flag == 0) - return -1; - - if(i == 2*lenb-e) - { - tmp = _mm_srli_si128(R0,2); - for(k=0; k < e-1;k++) - tmp = _mm_srli_si128(tmp,2); - minError = _mm_extract_epi16(tmp,0); - } - - } - - else - { - for(j=-e/2+1;j<=e/2;j++) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, *(b-((i+1)/2-j-1)) != *(a-((i-1)/2+j-1)),0); - } - - R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); - R1 = _mm_min_epi16(R1, R0+Down1); - - - if(i >= 2*lenb-e) - { - tmp = _mm_srli_si128(R1,2); - for(k=0; k < e-2;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - } - } - - j=0; - int tmpE = e; - for(;j<2*(e-2)+1;j++) - { - - Diag = _mm_xor_si128(Diag, Diag); - //set the first element - if(j==0) - { - for( k=0;k<=e-1;k++ ) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0); - } - - R0 = _mm_min_epi16(R1+Side2, R0+Diag); - R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); - - - tmpE--; - tmp = _mm_srli_si128(R0,2); - for(k=0; k < e-2;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - else if(j%2 == 0) - { - for(k=0;k<tmpE;k++) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0); - } - - R0 = _mm_min_epi16(R1+Side2, R0+Diag); - R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); - - tmpE--; - - tmp = _mm_srli_si128(R0,2); - for(k=0; k < tmpE-1;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - - - else - { - for(k=0;k<tmpE;k++) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0); - } - - R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); - R1 = _mm_min_epi16(R1, R0+Down1); - - tmp = _mm_srli_si128(R1,2); - for(k=0; k < tmpE-2;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - i++; - } - //Diag - - Diag = _mm_xor_si128(Diag,Diag); - Diag = _mm_insert_epi16(Diag, 2*e, 0); - Diag = _mm_insert_epi16(Diag, *(a-(lenb+e-2)) != *(b-(lenb-1)), 1); - - Side1 = _mm_insert_epi16(Side1,1,0); - Side1 = _mm_insert_epi16(Side1,1,1); - - Down1 = _mm_insert_epi16(Down1, 2*e, 0); - Down1 = _mm_insert_epi16(Down1, 1, 1); - - R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); - R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); - - minError = min(minError, _mm_extract_epi16(R1,1)); - - Diag = _mm_insert_epi16(Diag, *(a-(lenb+e-1)) != *(b-(lenb-1)), 0); - Down1 = _mm_insert_epi16(Down1, 1, 0); - - R0 = _mm_min_epi16(R1+Down1,R0+Diag); - R0 = _mm_min_epi16(R0,_mm_srli_si128(R1,2)+Side1); - - minError = min(minError, _mm_extract_epi16(R0,0)); - - if(minError > e) - return -1; - return minError; -} - -inline int backwardEditDistanceSSE2Extention(char *a, int lena, char *b,int lenb) -{ - if(lenb == 0 || lena == 0) - return 0; - - int i = 0; - int j = 0; - int k = 0; - - int i0; - int i1; - int i2; - int i4; - int i5; - - int e = 4; - int mismatch = errThreshold; - - int minError = 2*errThreshold; - int index = 0; - int tmpValue = 0; - - if(lenb <= e) - { - return smallEditDistanceB(a,lena,b,lenb); - } - - - __m128i R0, R1; - __m128i Diag; - __m128i Side1, Side2; - __m128i Down1, Down2; - __m128i tmp; - __m128i SeqA, SeqB; - __m128i Result; - - /* initialize */ - R0 = _mm_setzero_si128 (); - R1 = _mm_setzero_si128 (); - Diag = _mm_setzero_si128 (); - Side1 = _mm_setzero_si128 (); - Side2 = _mm_setzero_si128 (); - Down1 = _mm_setzero_si128 (); - Down2 = _mm_setzero_si128 (); - SeqA = _mm_setzero_si128 (); - SeqB = _mm_setzero_si128 (); - Result = _mm_setzero_si128 (); - /* end initialize */ - - R1 = _mm_xor_si128(R1, R1); - R0 = _mm_xor_si128(R0, R0); - - Diag = _mm_xor_si128(Diag, Diag); - Diag = _mm_insert_epi16(Diag,minError,0); - - i0 = (a[0] != b[0]); - i1 = min(i0, ( *(a-1)!=*b) )+1; - i2 = min(i0,( a[0] != *(b-1) ) )+1; - - i0 = min3( i0+ ( *(a-1)!=*(b-1) ),i1+1,i2+1); - i4 = min(i1, ( *(a-2)!=b[0] )+1)+1; - i5 = min(i2, (a[0] != *(b-2))+1)+1; - - R1 = _mm_insert_epi16(R1, 3, 0); - R1 = _mm_insert_epi16(R1, i1, 1); - R1 = _mm_insert_epi16(R1, i2, 2); - R1 = _mm_insert_epi16(R1, 3, 3); - - - R0 = _mm_insert_epi16(R0, 4, 0); - R0 = _mm_insert_epi16(R0, i4, 1); - R0 = _mm_insert_epi16(R0, i0, 2); - R0 = _mm_insert_epi16(R0, i5, 3); - R0 = _mm_insert_epi16(R0, 4, 4); - - - Side2 = _mm_xor_si128(Side2, Side2); - Down2 = _mm_xor_si128(Down2, Down2); - Down1 = _mm_xor_si128(Down1, Down1); - Side1 = _mm_xor_si128(Side1, Side1); - - Side2 = _mm_insert_epi16(Side2,minError,0); - Down1 = _mm_insert_epi16(Down1,minError,0); - - Side1 = _mm_insert_epi16(Side1,1,0); - - index = 0; - for(j=0; j < e; j++) - { - Side2 = _mm_slli_si128(Side2, 2); - Side2 = _mm_insert_epi16(Side2,1,0); - - Down1 = _mm_slli_si128(Down1, 2); - Down1 = _mm_insert_epi16(Down1,1,0); - - Down2 = _mm_slli_si128(Down2, 2); - Down2 = _mm_insert_epi16(Down2,1,0); - - Side1 = _mm_slli_si128(Side1, 2); - Side1 = _mm_insert_epi16(Side1,1,0); - - SeqA = _mm_slli_si128(SeqA, 2); - SeqB = _mm_slli_si128(SeqB, 2); - SeqA = _mm_insert_epi16(SeqA,*(a-index),0); - SeqB = _mm_insert_epi16(SeqB,*(b-index),0); - index++; - } - - Down2= _mm_slli_si128(Down2, 2); - Down2 = _mm_insert_epi16(Down2,minError,0); - - index = 4; - i = 5; - - int loopEnd = 2*lenb-(e-1); - for(; i <= loopEnd ;i++) - { - - Diag = _mm_xor_si128(Diag, Diag); - if( i%2 == 0) - { - SeqA = _mm_slli_si128(SeqA, 2); - SeqB = _mm_slli_si128(SeqB, 2); - SeqA = _mm_insert_epi16(SeqA,*(a-(index)),0); - SeqB = _mm_insert_epi16(SeqB,*(b-(index)),0); - - index++; - - tmp = _mm_shufflelo_epi16(SeqB,27); - tmp = _mm_slli_si128(tmp, 2); - tmpValue = _mm_extract_epi16(tmp, 5); - tmp = _mm_insert_epi16(tmp, tmpValue, 0); - - Result = _mm_cmpeq_epi16(SeqA, tmp); - Diag = _mm_andnot_si128(Result, MASK); - - R0 = _mm_min_epi16(R1+Side2, R0+Diag); - R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); - - if(_mm_extract_epi16(R0, 0) > errThreshold && _mm_extract_epi16(R0, 1) > errThreshold && _mm_extract_epi16(R0, 2) > errThreshold - && _mm_extract_epi16(R0, 3) > errThreshold && _mm_extract_epi16(R0, 4) > errThreshold && _mm_extract_epi16(R1, 0) > errThreshold - && _mm_extract_epi16(R1, 1) > errThreshold && _mm_extract_epi16(R1, 2) > errThreshold && _mm_extract_epi16(R1, 3) > errThreshold) - return -1; - - if(i == 2*lenb-e) - { - tmp = _mm_srli_si128(R0,2); - for(k=0; k < e-1;k++) - tmp = _mm_srli_si128(tmp,2); - minError = _mm_extract_epi16(tmp,0); - } - - } - - else - { - Result = _mm_cmpeq_epi16(SeqA, _mm_shufflelo_epi16(SeqB,27)); - Diag = _mm_andnot_si128(Result, MASK); - - R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); - R1 = _mm_min_epi16(R1, R0+Down1); - - - if(i >= 2*lenb-e) - { - tmp = _mm_srli_si128(R1,2); - for(k=0; k < e-2;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - } - - - } - - j=0; - int tmpE = e; - for(;j<2*(e-2)+1;j++) - { - - Diag = _mm_xor_si128(Diag, Diag); - //set the first element - if(j==0) - { - for( k=0;k<=e-1;k++ ) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0); - } - - R0 = _mm_min_epi16(R1+Side2, R0+Diag); - R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); - - tmpE--; - - tmp = _mm_srli_si128(R0,2); - for(k=0; k < e-2;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - else if(j%2 == 0) - { - for(k=0;k<tmpE;k++) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0); - } - - R0 = _mm_min_epi16(R1+Side2, R0+Diag); - R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); - - tmpE--; - - tmp = _mm_srli_si128(R0,2); - for(k=0; k < tmpE-1;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - - - else - { - for(k=0;k<tmpE;k++) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0); - } - - R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); - R1 = _mm_min_epi16(R1, R0+Down1); - - tmp = _mm_srli_si128(R1,2); - for(k=0; k < tmpE-2;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - i++; - } - //Diag - - Diag = _mm_xor_si128(Diag,Diag); - Diag = _mm_insert_epi16(Diag, 2*errThreshold, 0); - Diag = _mm_insert_epi16(Diag, *(a-(lenb+e-2)) != *(b-(lenb-1)), 1); - - Side1 = _mm_insert_epi16(Side1,1,0); - Side1 = _mm_insert_epi16(Side1,1,1); - - Down1 = _mm_insert_epi16(Down1, 2*errThreshold, 0); - Down1 = _mm_insert_epi16(Down1, 1, 1); - - R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); - R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); - - minError = min(minError, _mm_extract_epi16(R1,1)); - - Diag = _mm_insert_epi16(Diag, *(a-(lenb+e-1)) != *(b-(lenb-1)), 0); - Down1 = _mm_insert_epi16(Down1, 1, 0); - - R0 = _mm_min_epi16(R1+Down1,R0+Diag); - R0 = _mm_min_epi16(R0,_mm_srli_si128(R1,2)+Side1); - - minError = min(minError, _mm_extract_epi16(R0,0)); - - if(minError > mismatch) - return -1; - return minError; -} - -int backwardEditDistance4SSE2(char *a, int lena, char *b,int lenb) -{ - if(lenb == 0 || lena == 0) - return 0; - - int i = 0; - int j = 0; - int k = 0; - - int i0; - int i1; - int i2; - int i4; - int i5; - - int e = errThreshold; - - int minError = 2*e; - int index = 0; - int tmpValue = 0; - - if(lenb <= e) - { - return smallEditDistanceB(a,lena,b,lenb); - } - - __m128i R0, R1; - __m128i Diag; - __m128i Side1, Side2; - __m128i Down1, Down2; - __m128i tmp; - __m128i SeqA, SeqB; - __m128i Result; - - /* initialize */ - R0 = _mm_setzero_si128 (); - R1 = _mm_setzero_si128 (); - Diag = _mm_setzero_si128 (); - Side1 = _mm_setzero_si128 (); - Side2 = _mm_setzero_si128 (); - Down1 = _mm_setzero_si128 (); - Down2 = _mm_setzero_si128 (); - SeqA = _mm_setzero_si128 (); - SeqB = _mm_setzero_si128 (); - Result = _mm_setzero_si128 (); - /* end initialize */ - - R1 = _mm_xor_si128(R1, R1); - R0 = _mm_xor_si128(R0, R0); - - Diag = _mm_xor_si128(Diag, Diag); - Diag = _mm_insert_epi16(Diag,2*e,0); - - i0 = (a[0] != b[0]); - i1 = min(i0, ( *(a-1)!=*b) )+1; - i2 = min(i0,( a[0] != *(b-1) ) )+1; - - i0 = min3( i0+ ( *(a-1)!=*(b-1) ),i1+1,i2+1); - i4 = min(i1, ( *(a-2)!=b[0] )+1)+1; - i5 = min(i2, (a[0] != *(b-2))+1)+1; - - R1 = _mm_insert_epi16(R1, 3, 0); - R1 = _mm_insert_epi16(R1, i1, 1); - R1 = _mm_insert_epi16(R1, i2, 2); - R1 = _mm_insert_epi16(R1, 3, 3); - - - R0 = _mm_insert_epi16(R0, 4, 0); - R0 = _mm_insert_epi16(R0, i4, 1); - R0 = _mm_insert_epi16(R0, i0, 2); - R0 = _mm_insert_epi16(R0, i5, 3); - R0 = _mm_insert_epi16(R0, 4, 4); - - Side2 = _mm_xor_si128(Side2, Side2); - Down2 = _mm_xor_si128(Down2, Down2); - Down1 = _mm_xor_si128(Down1, Down1); - Side1 = _mm_xor_si128(Side1, Side1); - - Side2 = _mm_insert_epi16(Side2,2*e,0); - Down1 = _mm_insert_epi16(Down1,2*e,0); - - Side1 = _mm_insert_epi16(Side1,1,0); - - index = 0; - for(j=0; j < e; j++) - { - Side2 = _mm_slli_si128(Side2, 2); - Side2 = _mm_insert_epi16(Side2,1,0); - - Down1 = _mm_slli_si128(Down1, 2); - Down1 = _mm_insert_epi16(Down1,1,0); - - Down2 = _mm_slli_si128(Down2, 2); - Down2 = _mm_insert_epi16(Down2,1,0); - - Side1 = _mm_slli_si128(Side1, 2); - Side1 = _mm_insert_epi16(Side1,1,0); - - SeqA = _mm_slli_si128(SeqA, 2); - SeqB = _mm_slli_si128(SeqB, 2); - SeqA = _mm_insert_epi16(SeqA,*(a-index),0); - SeqB = _mm_insert_epi16(SeqB,*(b-index),0); - index++; - } - - Down2= _mm_slli_si128(Down2, 2); - Down2 = _mm_insert_epi16(Down2,2*e,0); - - index = 4; - i = 5; - int loopEnd = 2*lenb-(e-1); - for(; i <= loopEnd ;i++) - { - - Diag = _mm_xor_si128(Diag, Diag); - if( i%2 == 0) - { - SeqA = _mm_slli_si128(SeqA, 2); - SeqB = _mm_slli_si128(SeqB, 2); - SeqA = _mm_insert_epi16(SeqA,*(a-(index)),0); - SeqB = _mm_insert_epi16(SeqB,*(b-(index)),0); - - index++; - - tmp = _mm_shufflelo_epi16(SeqB,27); - tmp = _mm_slli_si128(tmp, 2); - tmpValue = _mm_extract_epi16(tmp, 5); - tmp = _mm_insert_epi16(tmp, tmpValue, 0); - - Result = _mm_cmpeq_epi16(SeqA, tmp); - Diag = _mm_andnot_si128(Result, MASK); - - R0 = _mm_min_epi16(R1+Side2, R0+Diag); - R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); - - //tmp = _mm_sub_epi16(Error, R0); - //i0 = _mm_movemask_epi8(tmp); - - if( _mm_extract_epi16(R0, 0) > e && _mm_extract_epi16(R0, 1) > e && _mm_extract_epi16(R0, 2) > e - && _mm_extract_epi16(R0, 3) > e && _mm_extract_epi16(R0, 4) > e && _mm_extract_epi16(R1, 0) > e && - _mm_extract_epi16(R1, 1) > e && _mm_extract_epi16(R1, 2) > e && _mm_extract_epi16(R1, 3) > e ) - return -1; - - if(i == 2*lenb-e) - { - tmp = _mm_srli_si128(R0,2); - for(k=0; k < e-1;k++) - tmp = _mm_srli_si128(tmp,2); - minError = _mm_extract_epi16(tmp,0); - } - - } - - else - { - Result = _mm_cmpeq_epi16(SeqA, _mm_shufflelo_epi16(SeqB,27)); - Diag = _mm_andnot_si128(Result, MASK); - - R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); - R1 = _mm_min_epi16(R1, R0+Down1); - - if(i >= 2*lenb-e) - { - tmp = _mm_srli_si128(R1,2); - for(k=0; k < e-2;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - } - - - } - - j=0; - - int tmpE = e; - - for(;j<2*(e-2)+1;j++) - { - - Diag = _mm_xor_si128(Diag, Diag); - //set the first element - if(j==0) - { - for( k=0;k<=e-1;k++ ) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0); - } - - R0 = _mm_min_epi16(R1+Side2, R0+Diag); - R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); - - tmpE--; - - tmp = _mm_srli_si128(R0,2); - for(k=0; k < e-2;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - else if(j%2 == 0) - { - for(k=0;k<tmpE;k++) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0); - } - - R0 = _mm_min_epi16(R1+Side2, R0+Diag); - R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); - - tmpE--; - - tmp = _mm_srli_si128(R0,2); - for(k=0; k < tmpE-1;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - - - else - { - for(k=0;k<tmpE;k++) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0); - } - - R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); - R1 = _mm_min_epi16(R1, R0+Down1); - - tmp = _mm_srli_si128(R1,2); - for(k=0; k < tmpE-2;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - i++; - } - //Diag - - Diag = _mm_xor_si128(Diag,Diag); - Diag = _mm_insert_epi16(Diag, 2*e, 0); - Diag = _mm_insert_epi16(Diag, *(a-(lenb+e-2)) != *(b-(lenb-1)), 1); - - Side1 = _mm_insert_epi16(Side1,1,0); - Side1 = _mm_insert_epi16(Side1,1,1); - - Down1 = _mm_insert_epi16(Down1, 2*e, 0); - Down1 = _mm_insert_epi16(Down1, 1, 1); - - R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); - R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); - - minError = min(minError, _mm_extract_epi16(R1,1)); - - Diag = _mm_insert_epi16(Diag, *(a-(lenb+e-1)) != *(b-(lenb-1)), 0); - Down1 = _mm_insert_epi16(Down1, 1, 0); - - R0 = _mm_min_epi16(R1+Down1,R0+Diag); - R0 = _mm_min_epi16(R0,_mm_srli_si128(R1,2)+Side1); - - minError = min(minError, _mm_extract_epi16(R0,0)); - - if(minError > e) - return -1; - return minError; -} - -inline int forwardEditDistanceSSE2Extention(char *a, int lena, char *b,int lenb) -{ - if(lenb == 0 || lena == 0) - return 0; - - int i = 0; - int j = 0; - int k = 0; - - int i0=0; - int i1=0; - int i2=0; - int i4=0; - int i5=0; - - int mismatch = errThreshold; - int e = 4; - - int minError = 4*mismatch+1; - int index = 0; - int tmpValue = 0; - - if(lenb <= e) - { - return smallEditDistanceF(a,lena,b,lenb); - } - - - register __m128i R0, R1; - __m128i Diag; - __m128i Side1, Side2; - __m128i Down1, Down2; - __m128i tmp; - register __m128i SeqA, SeqB; - __m128i Result; - - __m128i tmpSeqA; - __m128i tmpSeqB; - - /* initialize */ - R0 = _mm_setzero_si128 (); - R1 = _mm_setzero_si128 (); - Diag = _mm_setzero_si128 (); - Side1 = _mm_setzero_si128 (); - Side2 = _mm_setzero_si128 (); - Down1 = _mm_setzero_si128 (); - Down2 = _mm_setzero_si128 (); - SeqA = _mm_setzero_si128 (); - SeqB = _mm_setzero_si128 (); - Result = _mm_setzero_si128 (); - /* end initialize */ - - - R1 = _mm_xor_si128(R1, R1); - R0 = _mm_xor_si128(R0, R0); - - Diag = _mm_xor_si128(Diag, Diag); - Diag = _mm_insert_epi16(Diag,minError,0); - - i0 = (a[0] != b[0]); - i1 = min(i0, (a[1]!=b[0]))+1; - i2 = min(i0,(a[0]!=b[1]))+1; - - i0 = min3(i0+(a[1]!=b[1]),i1+1,i2+1); - i4 = min(i1, (a[2]!=b[0])+1)+1; - i5 = min(i2, (a[0]!=b[2])+1)+1; - - R1 = _mm_insert_epi16(R1, 3, 0); - R1 = _mm_insert_epi16(R1, i1, 1); - R1 = _mm_insert_epi16(R1, i2, 2); - R1 = _mm_insert_epi16(R1, 3, 3); - - R0 = _mm_insert_epi16(R0, 4, 0); - R0 = _mm_insert_epi16(R0, i4, 1); - R0 = _mm_insert_epi16(R0, i0, 2); - R0 = _mm_insert_epi16(R0, i5, 3); - R0 = _mm_insert_epi16(R0, 4, 4); - - Side2 = _mm_xor_si128(Side2, Side2); - Down2 = _mm_xor_si128(Down2, Down2); - Down1 = _mm_xor_si128(Down1, Down1); - Side1 = _mm_xor_si128(Side1, Side1); - - Side2 = _mm_insert_epi16(Side2,minError,0); - Down1 = _mm_insert_epi16(Down1,minError,0); - - Side1 = _mm_insert_epi16(Side1,1,0); - - index = 0; - for(j=0; j < e; j++) - { - Side2 = _mm_slli_si128(Side2, 2); - Side2 = _mm_insert_epi16(Side2,1,0); - - Down1 = _mm_slli_si128(Down1, 2); - Down1 = _mm_insert_epi16(Down1,1,0); - - Down2 = _mm_slli_si128(Down2, 2); - Down2 = _mm_insert_epi16(Down2,1,0); - - Side1 = _mm_slli_si128(Side1, 2); - Side1 = _mm_insert_epi16(Side1,1,0); - - SeqA = _mm_slli_si128(SeqA, 2); - SeqB = _mm_slli_si128(SeqB, 2); - SeqA = _mm_insert_epi16(SeqA,a[index],0); - SeqB = _mm_insert_epi16(SeqB,b[index],0); - index++; - } - - Down2= _mm_slli_si128(Down2, 2); - Down2 = _mm_insert_epi16(Down2,minError,0); - - index = 4; - i = 5; - - int loopEnd = 2*lenb-(e-1); - for(; i <= loopEnd ;i++) - { - if( i%2 == 0) - { - tmpSeqA = _mm_slli_si128(SeqA, 2); - tmpSeqB = _mm_slli_si128(SeqB, 2); - SeqA = _mm_insert_epi16(tmpSeqA,a[index],0); - SeqB = _mm_insert_epi16(tmpSeqB,b[index],0); - - index++; - - tmp = _mm_shufflelo_epi16(SeqB,27); - tmp = _mm_slli_si128(tmp, 2); - tmpValue = _mm_extract_epi16(tmp, 5); - tmp = _mm_insert_epi16(tmp, tmpValue, 0); - - Result = _mm_cmpeq_epi16(SeqA, tmp); - Diag = _mm_andnot_si128(Result, MASK); - - R0 = _mm_min_epi16(R1+Side2, R0+Diag); - R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); - - if(_mm_extract_epi16(R0, 0) > errThreshold && _mm_extract_epi16(R0, 1) > errThreshold && _mm_extract_epi16(R0, 2) > errThreshold - && _mm_extract_epi16(R0, 3) > errThreshold && _mm_extract_epi16(R0, 4) > errThreshold && - _mm_extract_epi16(R1, 0) > errThreshold && _mm_extract_epi16(R1, 1) > errThreshold && - _mm_extract_epi16(R1, 2) > errThreshold && _mm_extract_epi16(R1, 3) > errThreshold) - return -1; - - if(i == 2*lenb-e) - { - tmp = _mm_srli_si128(R0,2); - for(k=0; k < e-1;k++) - tmp = _mm_srli_si128(tmp,2); - minError = _mm_extract_epi16(tmp,0); - } - - } - - else - { - Result = _mm_cmpeq_epi16(SeqA, _mm_shufflelo_epi16(SeqB,27)); - Diag = _mm_andnot_si128(Result, MASK); - - R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); - R1 = _mm_min_epi16(R1, R0+Down1); - - if(i >= 2*lenb-e) - { - tmp = _mm_srli_si128(R1,2); - for(k=0; k < e-2;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - } - } - - j=0; - int tmpE = e; - for(;j<2*(e-2)+1;j++) - { - - Diag = _mm_xor_si128(Diag, Diag); - //set the first element - if(j==0) - { - for( k=0;k<=e-1;k++ ) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0); - } - - R0 = _mm_min_epi16(R1+Side2, R0+Diag); - R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); - - tmpE--; - - tmp = _mm_srli_si128(R0,2); - for(k=0; k < e-2;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - else if(j%2 == 0) - { - for(k=0;k<tmpE;k++) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0); - } - - R0 = _mm_min_epi16(R1+Side2, R0+Diag); - R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); - - tmpE--; - - tmp = _mm_srli_si128(R0,2); - for(k=0; k < tmpE-1;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - - - else - { - for(k=0;k<tmpE;k++) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0); - } - - R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); - R1 = _mm_min_epi16(R1, R0+Down1); - - tmp = _mm_srli_si128(R1,2); - for(k=0; k < tmpE-2;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - i++; - } - //Diag - - Diag = _mm_xor_si128(Diag,Diag); - Diag = _mm_insert_epi16(Diag, minError, 0); - Diag = _mm_insert_epi16(Diag, a[lenb+e-2] != b[lenb-1], 1); - - Side1 = _mm_insert_epi16(Side1,1,0); - Side1 = _mm_insert_epi16(Side1,1,1); - - Down1 = _mm_insert_epi16(Down1, minError, 0); - Down1 = _mm_insert_epi16(Down1, 1, 1); - - R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); - R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); - - minError = min(minError, _mm_extract_epi16(R1,1)); - - Diag = _mm_insert_epi16(Diag, a[lenb+e-1] != b[lenb-1], 0); - Down1 = _mm_insert_epi16(Down1, 1, 0); - - R0 = _mm_min_epi16(R1+Down1,R0+Diag); - R0 = _mm_min_epi16(R0,_mm_srli_si128(R1,2)+Side1); - - - minError = min(minError, _mm_extract_epi16(R0,0)); - - - if(minError > mismatch) - return -1; - return minError; -} - - - -int forwardEditDistance4SSE2(char *a, int lena, char *b,int lenb) -{ - if(lenb == 0 || lena == 0) - return 0; - - int i = 0; - int j = 0; - int k = 0; - - int i0=0; - int i1=0; - int i2=0; - int i4=0; - int i5=0; - - int e = errThreshold; - - int minError = 2*e; - int index = 0; - int tmpValue = 0; - - if(lenb <= e) - { - return smallEditDistanceF(a,lena,b,lenb); - } - - - register __m128i R0, R1; - __m128i Diag; - __m128i Side1, Side2; - __m128i Down1, Down2; - __m128i tmp; - register __m128i SeqA, SeqB; - __m128i Result; - - __m128i tmpSeqA; - __m128i tmpSeqB; - - /* initialize */ - R0 = _mm_setzero_si128 (); - R1 = _mm_setzero_si128 (); - Diag = _mm_setzero_si128 (); - Side1 = _mm_setzero_si128 (); - Side2 = _mm_setzero_si128 (); - Down1 = _mm_setzero_si128 (); - Down2 = _mm_setzero_si128 (); - SeqA = _mm_setzero_si128 (); - SeqB = _mm_setzero_si128 (); - Result = _mm_setzero_si128 (); - /* end initialize */ - - R1 = _mm_xor_si128(R1, R1); - R0 = _mm_xor_si128(R0, R0); - - Diag = _mm_xor_si128(Diag, Diag); - Diag = _mm_insert_epi16(Diag,2*e,0); - - i0 = (a[0] != b[0]); - i1 = min(i0, (a[1]!=b[0]))+1; - i2 = min(i0,(a[0]!=b[1]))+1; - - i0 = min3(i0+(a[1]!=b[1]),i1+1,i2+1); - i4 = min(i1, (a[2]!=b[0])+1)+1; - i5 = min(i2, (a[0]!=b[2])+1)+1; - - R1 = _mm_insert_epi16(R1, 3, 0); - R1 = _mm_insert_epi16(R1, i1, 1); - R1 = _mm_insert_epi16(R1, i2, 2); - R1 = _mm_insert_epi16(R1, 3, 3); - - R0 = _mm_insert_epi16(R0, 4, 0); - R0 = _mm_insert_epi16(R0, i4, 1); - R0 = _mm_insert_epi16(R0, i0, 2); - R0 = _mm_insert_epi16(R0, i5, 3); - R0 = _mm_insert_epi16(R0, 4, 4); - - Side2 = _mm_xor_si128(Side2, Side2); - Down2 = _mm_xor_si128(Down2, Down2); - Down1 = _mm_xor_si128(Down1, Down1); - Side1 = _mm_xor_si128(Side1, Side1); - - Side2 = _mm_insert_epi16(Side2,2*e,0); - Down1 = _mm_insert_epi16(Down1,2*e,0); - - Side1 = _mm_insert_epi16(Side1,1,0); - - index = 0; - for(j=0; j < e; j++) - { - Side2 = _mm_slli_si128(Side2, 2); - Side2 = _mm_insert_epi16(Side2,1,0); - - Down1 = _mm_slli_si128(Down1, 2); - Down1 = _mm_insert_epi16(Down1,1,0); - - Down2 = _mm_slli_si128(Down2, 2); - Down2 = _mm_insert_epi16(Down2,1,0); - - Side1 = _mm_slli_si128(Side1, 2); - Side1 = _mm_insert_epi16(Side1,1,0); - - SeqA = _mm_slli_si128(SeqA, 2); - SeqB = _mm_slli_si128(SeqB, 2); - SeqA = _mm_insert_epi16(SeqA,a[index],0); - SeqB = _mm_insert_epi16(SeqB,b[index],0); - index++; - } - - Down2= _mm_slli_si128(Down2, 2); - Down2 = _mm_insert_epi16(Down2,2*e,0); - - index = 4; - i = 5; - - int loopEnd = 2*lenb-(e-1); - for(; i <= loopEnd ;i++) - { - //Diag = _mm_xor_si128(Diag, Diag); - if( i%2 == 0) - { - tmpSeqA = _mm_slli_si128(SeqA, 2); - tmpSeqB = _mm_slli_si128(SeqB, 2); - SeqA = _mm_insert_epi16(tmpSeqA,a[index],0); - SeqB = _mm_insert_epi16(tmpSeqB,b[index],0); - - index++; - - tmp = _mm_shufflelo_epi16(SeqB,27); - tmp = _mm_slli_si128(tmp, 2); - tmpValue = _mm_extract_epi16(tmp, 5); - tmp = _mm_insert_epi16(tmp, tmpValue, 0); - - Result = _mm_cmpeq_epi16(SeqA, tmp); - Diag = _mm_andnot_si128(Result, MASK); - - R0 = _mm_min_epi16(R1+Side2, R0+Diag); - R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); - - if(_mm_extract_epi16(R0, 0) > e && _mm_extract_epi16(R0, 1) > e && _mm_extract_epi16(R0, 2) > e - && _mm_extract_epi16(R0, 3) > e && _mm_extract_epi16(R0, 4) > e && _mm_extract_epi16(R1, 0) > e && - _mm_extract_epi16(R1, 1) > e && _mm_extract_epi16(R1, 2) > e && _mm_extract_epi16(R1, 3) > e) - return -1; - - if(i == 2*lenb-e) - { - tmp = _mm_srli_si128(R0,2); - for(k=0; k < e-1;k++) - tmp = _mm_srli_si128(tmp,2); - minError = _mm_extract_epi16(tmp,0); - } - - } - - else - { - Result = _mm_cmpeq_epi16(SeqA, _mm_shufflelo_epi16(SeqB,27)); - Diag = _mm_andnot_si128(Result, MASK); - - R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); - R1 = _mm_min_epi16(R1, R0+Down1); - - if(i >= 2*lenb-e) - { - tmp = _mm_srli_si128(R1,2); - for(k=0; k < e-2;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - } - - - } - j=0; - int tmpE = e; - for(;j<2*(e-2)+1;j++) - { - - Diag = _mm_xor_si128(Diag, Diag); - //set the first element - if(j==0) - { - for( k=0;k<=e-1;k++ ) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0); - } - - R0 = _mm_min_epi16(R1+Side2, R0+Diag); - R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); - - tmpE--; - - tmp = _mm_srli_si128(R0,2); - for(k=0; k < e-2;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - else if(j%2 == 0) - { - for(k=0;k<tmpE;k++) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0); - } - - R0 = _mm_min_epi16(R1+Side2, R0+Diag); - R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); - - tmpE--; - - tmp = _mm_srli_si128(R0,2); - for(k=0; k < tmpE-1;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - - - else - { - for(k=0;k<tmpE;k++) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0); - } - - R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); - R1 = _mm_min_epi16(R1, R0+Down1); - - tmp = _mm_srli_si128(R1,2); - for(k=0; k < tmpE-2;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - i++; - } - //Diag - - Diag = _mm_xor_si128(Diag,Diag); - Diag = _mm_insert_epi16(Diag, 2*e, 0); - Diag = _mm_insert_epi16(Diag, a[lenb+e-2] != b[lenb-1], 1); - - Side1 = _mm_insert_epi16(Side1,1,0); - Side1 = _mm_insert_epi16(Side1,1,1); - - Down1 = _mm_insert_epi16(Down1, 2*e, 0); - Down1 = _mm_insert_epi16(Down1, 1, 1); - - R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); - R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); - - minError = min(minError, _mm_extract_epi16(R1,1)); - - Diag = _mm_insert_epi16(Diag, a[lenb+e-1] != b[lenb-1], 0); - Down1 = _mm_insert_epi16(Down1, 1, 0); - - R0 = _mm_min_epi16(R1+Down1,R0+Diag); - R0 = _mm_min_epi16(R0,_mm_srli_si128(R1,2)+Side1); - - minError = min(minError, _mm_extract_epi16(R0,0)); - - if(minError > e) - return -1; - return minError; -} - -int forwardEditDistanceSSE2Odd(char *a, int lena, char *b,int lenb) -{ - if(lenb == 0 || lena == 0) - return 0; - - int i = 0; - int j = 0; - int k = 0; - - int e = errThreshold; - - int minError = 2*e; - - char flag = 0; - - if(lenb <= e) - { - return smallEditDistanceF(a,lena,b,lenb); - } - - - __m128i R0, R1; - __m128i Diag; - __m128i Side1, Side2; - __m128i Down1, Down2; - __m128i Error; - __m128i tmp; - - /* initialize */ - R0 = _mm_setzero_si128 (); - R1 = _mm_setzero_si128 (); - Diag = _mm_setzero_si128 (); - Side1 = _mm_setzero_si128 (); - Side2 = _mm_setzero_si128 (); - Down1 = _mm_setzero_si128 (); - Down2 = _mm_setzero_si128 (); - Error = _mm_setzero_si128 (); - tmp = _mm_setzero_si128 (); - /* end initialize */ - - R1 = _mm_xor_si128(R1, R1); - R0 = _mm_xor_si128(R0, R0); - - Diag = _mm_xor_si128(Diag, Diag); - Side1 = _mm_xor_si128(Side1, Side1); - Down1 = _mm_xor_si128(Down1, Down1); - - Diag = _mm_insert_epi16(Diag,2*e,0); - - Side1 = _mm_insert_epi16(Side1,1,0); - Side1 = _mm_insert_epi16(Side1,2*e,1); - - Down1 = _mm_insert_epi16(Down1,2*e,0); - Down1 = _mm_insert_epi16(Down1,1,1); - Down1 = _mm_insert_epi16(Down1,2*e,2); - - R0 = _mm_insert_epi16(R0,0,0); - - R1 = _mm_insert_epi16(R1,1,0); - R1 = _mm_insert_epi16(R1,1,1); - - for(i=2; i <= e; i++) - { - //set side - Side1 = _mm_slli_si128(Side1,2); - Side1 = _mm_insert_epi16(Side1,1,0); - - Down1 = _mm_insert_epi16(Down1,1,0); - Down1 = _mm_slli_si128(Down1,2); - Down1 = _mm_insert_epi16(Down1,2*e,0); - - Diag = _mm_xor_si128(Diag, Diag); - if( i%2 == 0) - { - Diag = _mm_insert_epi16(Diag,2*e,0); - - for(j=1;j<=i-1;j++) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, b[i/2-1+(i/2-j)] != a[i/2-1-(i/2-j)],0); - } - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, 2*e,0); - - R0 = _mm_min_epi16(R1+Side1, _mm_slli_si128(R0,2)+Diag); - R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down1); - - } - - else - { - Diag = _mm_insert_epi16(Diag,2*e,0); - for(j=i/2-1;j>=-i/2;j--) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, b[(i+1)/2+j-1] != a[(i-1)/2-j-1],0); - } - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, 2*e,0); - - R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); - R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); - - } - } - Error = _mm_xor_si128(Error, Error); - Side2 = _mm_xor_si128(Side2, Side2); - Side1 = _mm_xor_si128(Side1, Side1); - Down2 = _mm_xor_si128(Down2, Down2); - Down1 = _mm_xor_si128(Down1, Down1); - - - Error = _mm_insert_epi16(Error,e,0); - Side2 = _mm_insert_epi16(Side2,2*e,0); - Side1 = _mm_insert_epi16(Side2,2*e,0); - Down1 = _mm_insert_epi16(Down1,2*e,0); - - - for(j=0; j < e; j++) - { - Side2 = _mm_slli_si128(Side2, 2); - Side2 = _mm_insert_epi16(Side2,1,0); - - Side1 = _mm_slli_si128(Side1, 2); - Side1 = _mm_insert_epi16(Side1,1,0); - - Down1 = _mm_slli_si128(Down1, 2); - Down1 = _mm_insert_epi16(Down1,1,0); - - Down2 = _mm_slli_si128(Down2, 2); - Down2 = _mm_insert_epi16(Down2,1,0); - - Error = _mm_slli_si128(Error, 2); - Error = _mm_insert_epi16(Error, e, 0); - } - - Down2= _mm_slli_si128(Down2, 2); - Down2 = _mm_insert_epi16(Down2,2*e,0); - - for(; i <= 2*lenb-(e-1);i++) - { - flag = 0; - Diag = _mm_xor_si128(Diag, Diag); - if( i%2 == 0) - { - for(j=e/2;j>=-e/2;j--) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, b[i/2-1+j] != a[i/2-1-j],0); - } - - - R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag); - R0 = _mm_min_epi16(R0, R1+Down1); - - if(_mm_extract_epi16(R0,0) <= e) - flag = 1; - - tmp = _mm_srli_si128(R0,2); - for(j=0; j < e-1;j++) - { - if(_mm_extract_epi16(tmp,0) <= e) - flag = 1; - tmp = _mm_srli_si128(tmp,2); - } - // printf("#%d %d %d\n", _mm_extract_epi16(R0,0), _mm_extract_epi16(R0,1), _mm_extract_epi16(R0,2)); - if(flag == 0) - return -1; - - if(i == 2*lenb-(e-1)) - { - tmp = _mm_srli_si128(R0,2); - for(k=0; k < e-2;k++) - tmp = _mm_srli_si128(tmp,2); - minError = _mm_extract_epi16(tmp,0); - } - - } - - else - { - for(j=e/2;j>=-e/2-1;j--) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, b[(i+1)/2+j-1] != a[(i)/2-j-1],0); - } - - R1 = _mm_min_epi16(R0+Side2, R1+Diag); - R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); - - //printf("#%d %d %d %d\n", _mm_extract_epi16(R1,0), _mm_extract_epi16(R1,1), _mm_extract_epi16(R1,2), - // _mm_extract_epi16(R1,3)); - - if(i >= 2*lenb-e) - { - tmp = _mm_srli_si128(R1,2); - for(k=0; k < e-1;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - } - } - - //first cell - Diag = _mm_xor_si128(Diag,Diag); - Diag = _mm_insert_epi16(Diag, b[lenb-3] != a[lena], 0); - Diag = _mm_insert_epi16(Diag, b[lenb-2] != a[lena-1], 1); - Diag = _mm_insert_epi16(Diag, b[lenb-1] != a[lena-2], 2); - Diag = _mm_insert_epi16(Diag, 2*e, 3); - R1 = _mm_min_epi16(R0+Side2, R1+Diag); - R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); - - - minError = min(minError, _mm_extract_epi16(R1,2)); - - //second cell - Diag = _mm_xor_si128(Diag,Diag); - Diag = _mm_insert_epi16(Diag, b[lenb-2] != a[lena], 0); - Diag = _mm_insert_epi16(Diag, b[lenb-1] != a[lena-1], 1); - Diag = _mm_insert_epi16(Diag, 2*e, 2); - - R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag); - R0 = _mm_min_epi16(R0, R1+Down1); - - - minError = min(minError, _mm_extract_epi16(R0,1)); - - //third cell - Diag = _mm_xor_si128(Diag,Diag); - Diag = _mm_insert_epi16(Diag, b[lenb-2] != a[lena+1], 0); - Diag = _mm_insert_epi16(Diag, b[lenb-1] != a[lena], 1); - Diag = _mm_insert_epi16(Diag, 2*e, 2); - - R1 = _mm_min_epi16(R0+Side2, R1+Diag); - R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); - - - minError = min(minError, _mm_extract_epi16(R1,1)); - - //forth - Diag = _mm_xor_si128(Diag,Diag); - Diag = _mm_insert_epi16(Diag, b[lenb-1] != a[lena+1], 0); - Diag = _mm_insert_epi16(Diag, 2*e, 1); - - R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag); - R0 = _mm_min_epi16(R0, R1+Down1); - - minError = min(minError, _mm_extract_epi16(R0,0)); - - //fifth - Diag = _mm_xor_si128(Diag,Diag); - Diag = _mm_insert_epi16(Diag, b[lenb-1] != a[lena+2], 0); - Diag = _mm_insert_epi16(Diag, 2*e, 1); - - R1 = _mm_min_epi16(R0+Side2, R1+Diag); - R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); - - - minError = min(minError, _mm_extract_epi16(R1,0)); - - if(minError > e) - return -1; - return minError; - -} - -int forwardEditDistanceSSE2G(char *a, int lena, char *b,int lenb) -{ - if(lenb == 0 || lena == 0) - return 0; - - int i = 0; - int j = 0; - int k = 0; - - int e = errThreshold; - - int minError = 2*e; - - char flag = 0; - - if(lenb <= e) - { - return smallEditDistanceF(a,lena,b,lenb); - } - - - __m128i R0, R1; - __m128i Diag; - __m128i Side1, Side2; - __m128i Down1, Down2; - __m128i Error; - __m128i tmp; - - /* initialize */ - R0 = _mm_setzero_si128 (); - R1 = _mm_setzero_si128 (); - Diag = _mm_setzero_si128 (); - Side1 = _mm_setzero_si128 (); - Side2 = _mm_setzero_si128 (); - Down1 = _mm_setzero_si128 (); - Down2 = _mm_setzero_si128 (); - Error = _mm_setzero_si128 (); - tmp = _mm_setzero_si128 (); - /* end initialize */ - - R1 = _mm_xor_si128(R1, R1); - R0 = _mm_xor_si128(R0, R0); - - Diag = _mm_xor_si128(Diag, Diag); - Side1 = _mm_xor_si128(Side1, Side1); - Down1 = _mm_xor_si128(Down1, Down1); - - Diag = _mm_insert_epi16(Diag,2*e,0); - - Side1 = _mm_insert_epi16(Side1,1,0); - Side1 = _mm_insert_epi16(Side1,2*e,1); - - Down1 = _mm_insert_epi16(Down1,2*e,0); - Down1 = _mm_insert_epi16(Down1,1,1); - Down1 = _mm_insert_epi16(Down1,2*e,2); - - R0 = _mm_insert_epi16(R0,0,0); - - R1 = _mm_insert_epi16(R1,1,0); - R1 = _mm_insert_epi16(R1,1,1); - - for(i=2; i <= e; i++) - { - //set side - Side1 = _mm_slli_si128(Side1,2); - Side1 = _mm_insert_epi16(Side1,1,0); - - Down1 = _mm_insert_epi16(Down1,1,0); - Down1 = _mm_slli_si128(Down1,2); - Down1 = _mm_insert_epi16(Down1,2*e,0); - - Diag = _mm_xor_si128(Diag, Diag); - if( i%2 == 0) - { - Diag = _mm_insert_epi16(Diag,2*e,0); - - for(j=1;j<=i-1;j++) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, b[i/2-1+(i/2-j)] != a[i/2-1-(i/2-j)],0); - } - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, 2*e,0); - - R0 = _mm_min_epi16(R1+Side1, _mm_slli_si128(R0,2)+Diag); - R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down1); - } - - else - { - Diag = _mm_insert_epi16(Diag,2*e,0); - for(j=i/2-1;j>=-i/2;j--) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, b[(i+1)/2+j-1] != a[(i-1)/2-j-1],0); - } - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, 2*e,0); - - R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); - R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); - } - } - Error = _mm_xor_si128(Error, Error); - Side2 = _mm_xor_si128(Side2, Side2); - Down2 = _mm_xor_si128(Down2, Down2); - Down1 = _mm_xor_si128(Down1, Down1); - - Error = _mm_insert_epi16(Error,e,0); - Side2 = _mm_insert_epi16(Side2,2*e,0); - Down1 = _mm_insert_epi16(Down1,2*e,0); - - - for(j=0; j < e; j++) - { - Side2 = _mm_slli_si128(Side2, 2); - Side2 = _mm_insert_epi16(Side2,1,0); - - Down1 = _mm_slli_si128(Down1, 2); - Down1 = _mm_insert_epi16(Down1,1,0); - - Down2 = _mm_slli_si128(Down2, 2); - Down2 = _mm_insert_epi16(Down2,1,0); - - Error = _mm_slli_si128(Error, 2); - Error = _mm_insert_epi16(Error, e, 0); - } - - Down2= _mm_slli_si128(Down2, 2); - Down2 = _mm_insert_epi16(Down2,2*e,0); - - for(; i <= 2*lenb-(e-1);i++) - { - flag = 0; - Diag = _mm_xor_si128(Diag, Diag); - if( i%2 == 0) - { - for(j=e/2;j>=-e/2;j--) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, b[i/2-1+j] != a[i/2-1-j],0); - } - - R0 = _mm_min_epi16(R1+Side2, R0+Diag); - R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); - - - if(_mm_extract_epi16(R0,0) <= e) - flag = 1; - - tmp = _mm_srli_si128(R0,2); - for(j=0; j < e-1;j++) - { - if(_mm_extract_epi16(tmp,0) <= e) - flag = 1; - tmp = _mm_srli_si128(tmp,2); - } - - - if(flag == 0) - return -1; - - if(i == 2*lenb-e) - { - tmp = _mm_srli_si128(R0,2); - for(k=0; k < e-1;k++) - tmp = _mm_srli_si128(tmp,2); - minError = _mm_extract_epi16(tmp,0); - } - - } - - else - { - for(j=-e/2+1;j<=e/2;j++) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, b[(i+1)/2-j-1] != a[(i-1)/2+j-1],0); - } - - R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); - R1 = _mm_min_epi16(R1, R0+Down1); - - if(i >= 2*lenb-e) - { - tmp = _mm_srli_si128(R1,2); - for(k=0; k < e-2;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - } - } - - j=0; - int tmpE = e; - for(;j<2*(e-2)+1;j++) - { - - Diag = _mm_xor_si128(Diag, Diag); - //set the first element - if(j==0) - { - for( k=0;k<=e-1;k++ ) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0); - } - - R0 = _mm_min_epi16(R1+Side2, R0+Diag); - R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); - - tmpE--; - - tmp = _mm_srli_si128(R0,2); - for(k=0; k < e-2;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - else if(j%2 == 0) - { - for(k=0;k<tmpE;k++) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0); - } - - R0 = _mm_min_epi16(R1+Side2, R0+Diag); - R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); - - tmpE--; - - tmp = _mm_srli_si128(R0,2); - for(k=0; k < tmpE-1;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - - - else - { - for(k=0;k<tmpE;k++) - { - Diag = _mm_slli_si128(Diag, 2); - Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0); - } - - R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); - R1 = _mm_min_epi16(R1, R0+Down1); - - tmp = _mm_srli_si128(R1,2); - for(k=0; k < tmpE-1;k++) - tmp = _mm_srli_si128(tmp,2); - minError = min(minError, _mm_extract_epi16(tmp,0)); - } - i++; - } - //Diag - - Diag = _mm_xor_si128(Diag,Diag); - Diag = _mm_insert_epi16(Diag, 2*e, 0); - Diag = _mm_insert_epi16(Diag, a[lenb+e-2] != b[lenb-1], 1); - - Side1 = _mm_insert_epi16(Side1,1,0); - Side1 = _mm_insert_epi16(Side1,1,1); - - Down1 = _mm_insert_epi16(Down1, 2*e, 0); - Down1 = _mm_insert_epi16(Down1, 1, 1); - - R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); - R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); - - minError = min(minError, _mm_extract_epi16(R1,1)); - - Diag = _mm_insert_epi16(Diag, a[lenb+e-1] != b[lenb-1], 1); - Down1 = _mm_insert_epi16(Down1, 1, 0); - - R0 = _mm_min_epi16(R1+Down1,R0+Diag); - R0 = _mm_min_epi16(R0,_mm_srli_si128(R1,2)+Side1); - - minError = min(minError, _mm_extract_epi16(R0,0)); - - if(minError > e) - return -1; - return minError; -} - - -int forwardEditDistance2SSE2(char *a, int lena, char *b,int lenb) -{ - if(lenb == 0 || lena == 0) - return 0; - - - - int i0 = 0; - int i1 = 0; - - - int error; //0: if the two character are equal 1: if not - - int i = 0; //loop index - - int e = 2; //error bound - - int totalError = 0; - - __m128i R0; - __m128i R1; - - __m128i Side1, Side2,Side; //side matrix - __m128i Down1, Down2,Down; //down matrix - __m128i Diag; - - __m128i tmp; - __m128i ERROR_REACH; - - /* initialize */ - R0 = _mm_setzero_si128 (); - R1 = _mm_setzero_si128 (); - Diag = _mm_setzero_si128 (); - Side1 = _mm_setzero_si128 (); - Side2 = _mm_setzero_si128 (); - Down1 = _mm_setzero_si128 (); - Down2 = _mm_setzero_si128 (); - Side = _mm_setzero_si128 (); - Down = _mm_setzero_si128 (); - tmp = _mm_setzero_si128 (); - ERROR_REACH = _mm_setzero_si128 (); - /* end initialize */ - - - if(lenb <= e) - { - return smallEditDistanceF(a,lena,b,lenb); - } - - ERROR_REACH = _mm_set_epi16(0,0,0,0,0,e,e,e); - - R0 = _mm_insert_epi16(R0,0,0); - - R1 = _mm_insert_epi16(R1,1,0); - R1 = _mm_insert_epi16(R1,1,1); - -// error = ((a[0]) != (b[0])); - - Diag = _mm_set_epi16(0,0,0,0,0,2*e,((a[0]) != (b[0])),2*e); - Side1 = _mm_set_epi16(0,0,0,0,0,2*e,1,1); - Side2 = _mm_set_epi16(0,0,0,0,0,1,1,2*e); - Down1 = _mm_set_epi16(0,0,0,0,0,2*e,1,1); - Down2 = _mm_set_epi16(0,0,0,0,0,1,1,2*e); - - tmp = _mm_slli_si128(R1,2); - - R0 = _mm_min_epi16(R1+Side1, R0+Diag); - R0 = _mm_min_epi16(R0,tmp+Down2); - - for (i = 3; i < 2*lena; i++) - { - if(i % 2 ==1) - { - - Diag = _mm_xor_si128(Diag, Diag); - error = ((a[(i+1)/2-1]) != (b[(i-1)/2-1])); - Diag = _mm_insert_epi16(Diag,error,0); - error = ((a[(i-1)/2-1]) != (b[(i+1)/2-1])); - Diag = _mm_insert_epi16(Diag,error,1); -// Diag = _mm_set_epi16(0, 0, 0, 0, 0, 0, ((a[(i-1)/2-1]) != (b[(i+1)/2-1])) ,((a[(i+1)/2-1]) != (b[(i-1)/2-1]))); - - - tmp = _mm_srli_si128(R0,2); - - R1 = _mm_min_epi16(tmp+Side1, R1+Diag); - R1 = _mm_min_epi16(R1,R0+Down1); - - if(i > 2 * lenb - 2) - { - i1 = _mm_extract_epi16(R1, 1); - totalError = min(totalError, i1); - } - } - - else if(i % 2 == 0) - { - error = ((a[i/2]) != (b[i/2-2])); - Diag = _mm_insert_epi16(Diag,error,0); - error = ((a[i/2-1]) != (b[i/2-1])); - Diag = _mm_insert_epi16(Diag,error,1); - error = ((a[i/2-2]) != (b[i/2])); - Diag = _mm_insert_epi16(Diag,error,2); - - // Diag = _mm_set_epi16(0, 0, 0, 0, 0, ((a[i/2-2]) != (b[i/2])) , ((a[i/2-1]) != (b[i/2-1])) , ((a[i/2]) != (b[i/2-2])) ); - - tmp = _mm_slli_si128(R1,2); - - R0 = _mm_min_epi16(R1+Side1, R0+Diag); - R0 = _mm_min_epi16(R0,tmp+Down2); - - tmp = _mm_sub_epi16(ERROR_REACH, R0); - i0 = _mm_movemask_epi8(tmp); - - if(i0 == 63 && _mm_extract_epi16(R1,0) > errThreshold && _mm_extract_epi16(R1,1) > errThreshold && i < 2 * lenb - 2) - return -1; - if(i == 2 * lenb - 2) { - totalError = _mm_extract_epi16(R0, 2); - } - } - } - - Down1 = _mm_insert_epi16(Down1,2*e,0); - - //fill the first part of the error - error = ((a[i/2]) != (b[i/2-2])); - Diag = _mm_insert_epi16(Diag,error,0); - error = ((a[i/2-1]) != (b[i/2-1])); - Diag = _mm_insert_epi16(Diag,error,1); - Diag = _mm_insert_epi16(Diag,2*e,2); -// Diag = _mm_set_epi16(0, 0, 0, 0, 0, 2*e , ((a[i/2-1]) != (b[i/2-1])) , ((a[i/2]) != (b[i/2-2])) ); - - R0 = _mm_min_epi16(R1+Side1, R0+Diag); - R0 = _mm_min_epi16(R0,_mm_slli_si128(R1,2)+Down1); - -// i0 = _mm_extract_epi16(R0, 0); - i1 = _mm_extract_epi16(R0, 1); - - totalError = min(totalError, i1); - - //fill the second part of the error - i++; - - Diag = _mm_xor_si128(Diag, Diag); - Diag = _mm_insert_epi16(Diag,2*e,0); - error = ((a[i/2]) != (b[lenb-1])); - Diag = _mm_insert_epi16(Diag,error,1); - Diag = _mm_insert_epi16(Diag,2*e,2); -// Diag = _mm_set_epi16(0, 0, 0, 0, 0, 2*e , ((a[i/2]) != (b[lenb-1])) , 2*e ); - - - R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); - R1 = _mm_min_epi16(R1,_mm_slli_si128(R0,2)+Down1); - -// i0 = _mm_extract_epi16(R1, 0); - i1 = _mm_extract_epi16(R1, 1); - - totalError = min(totalError, i1); - //fill the last the last element of the matrix - i++; - - Diag = _mm_xor_si128(Diag, Diag); - error = ((a[i/2]) != (b[lenb-1])); - Diag = _mm_insert_epi16(Diag,error,0); - -// Diag = _mm_set_epi16(0, 0, 0, 0, 0, 0 , 0 , ((a[i/2]) != (b[lenb-1])) ); - - - Down = _mm_insert_epi16(Down,1,0); - - Side = _mm_insert_epi16(Side,1,0); - - tmp = _mm_srli_si128(R1,2); - - R0 = _mm_min_epi16(R1+Down, _mm_srli_si128(R0,2)+Diag); - R0 = _mm_min_epi16(R0,tmp+Side); - - i0 = _mm_extract_epi16(R0, 0); - - totalError = min(totalError, i0); - - if(totalError > e) - return -1; - - return totalError; - -} - -int backwardEditDistance2SSE2(char *a, int lena, char *b,int lenb) -{ - if(lenb == 0 || lena == 0) - return 0; - - int i0 = 0; - int i1 = 0; - - int error; //0: if the two character are equal 1: if not - - int i = 0; //loop index - - int e = 2; //error bound - - int totalError = 0; - - __m128i R0; - __m128i R1; - - __m128i Side1, Side2,Side; //side matrix - __m128i Down1, Down2,Down; //down matrix - __m128i Diag; //diag matrix - - __m128i tmp; - __m128i ERROR_REACH; - - /* initialize */ - R0 = _mm_setzero_si128 (); - R1 = _mm_setzero_si128 (); - Diag = _mm_setzero_si128 (); - Side1 = _mm_setzero_si128 (); - Side2 = _mm_setzero_si128 (); - Side = _mm_setzero_si128 (); - Down1 = _mm_setzero_si128 (); - Down2 = _mm_setzero_si128 (); - Down = _mm_setzero_si128 (); - ERROR_REACH = _mm_setzero_si128 (); - tmp = _mm_setzero_si128 (); - /* end initialize */ - - if(lenb <= e) - { - return smallEditDistanceB(a,lena,b,lenb); - } - - - ERROR_REACH = _mm_set_epi16(0,0,0,0,0,e,e,e); - - R0 = _mm_insert_epi16(R0,0,0); - - R1 = _mm_insert_epi16(R1,1,0); - R1 = _mm_insert_epi16(R1,1,1); - - error = ((a[0]) != (b[0])); - - Diag = _mm_insert_epi16(Diag,2*e,0); - Diag = _mm_insert_epi16(Diag,error,1); - Diag = _mm_insert_epi16(Diag,2*e,2); - - Side1 = _mm_insert_epi16(Side1,1,0); - Side1 = _mm_insert_epi16(Side1,1,1); - Side1 = _mm_insert_epi16(Side1,2*e,2); - - Side2 = _mm_insert_epi16(Side2,2*e,0); - Side2 = _mm_insert_epi16(Side2,1,1); - Side2 = _mm_insert_epi16(Side2,1,2); - - Down1 = _mm_insert_epi16(Down1,1,0); - Down1 = _mm_insert_epi16(Down1,1,1); - Down1 = _mm_insert_epi16(Down1,2*e,2); - - Down2 = _mm_insert_epi16(Down2,2*e,0); - Down2 = _mm_insert_epi16(Down2,1,1); - Down2 = _mm_insert_epi16(Down2,1,2); - - tmp = _mm_slli_si128(R1,2); - - R0 = _mm_min_epi16(R1+Side1, R0+Diag); - R0 = _mm_min_epi16(R0,tmp+Down2); - -// printf("%d %d %d\n", _mm_extract_epi16(R0,0), _mm_extract_epi16(R0,1), _mm_extract_epi16(R0,2)); - for (i = 3; i < 2*lena; i++) - { - if(i % 2 ==1) - { - Diag = _mm_sub_epi8(Diag, Diag); - error = ( *(a-((i+1)/2-1)) != *(b-((i-1)/2-1)) ); - Diag = _mm_insert_epi16(Diag,error,0); - error = ( *(a-((i-1)/2-1)) != *(b-((i+1)/2-1)) ); - Diag = _mm_insert_epi16(Diag,error,1); - //printf("#%d #%d\n", _mm_extract_epi16(Diag,0), _mm_extract_epi16(Diag,1)); - tmp = _mm_srli_si128(R0,2); - - R1 = _mm_min_epi16(tmp+Side1, R1+Diag); - R1 = _mm_min_epi16(R1,R0+Down1); - - if(i > 2 * lenb - 2) { - i1 = _mm_extract_epi16(R1, 1); - totalError = min(totalError, i1); - } -// printf("%d %d\n", _mm_extract_epi16(R1,0), _mm_extract_epi16(R1,1)); - } - - else if(i % 2 == 0) - { - error = ( *(a-(i/2)) != *(b-(i/2-2)) ); - Diag = _mm_insert_epi16(Diag,error,0); - error = ( *(a-(i/2-1)) != *(b-(i/2-1)) ); - Diag = _mm_insert_epi16(Diag,error,1); - error = ( *(a-(i/2-2)) != *(b-(i/2))); - Diag = _mm_insert_epi16(Diag,error,2); - - tmp = _mm_slli_si128(R1,2); - - R0 = _mm_min_epi16(R1+Side1, R0+Diag); - R0 = _mm_min_epi16(R0,tmp+Down2); - - tmp = _mm_sub_epi16(ERROR_REACH, R0); - i0 = _mm_movemask_epi8(tmp); - - if(i0 == 63 && _mm_extract_epi16(R1,0) > errThreshold && _mm_extract_epi16(R1,1) > errThreshold && i < 2 * lenb - 2) - return -1; - - if(i == 2 * lenb - 2) { - totalError = _mm_extract_epi16(R0, 2); - } - } - } - Down1 = _mm_insert_epi16(Down1,2*e,0); - - //fill the first part of the error - error = ( *(a-(i/2)) != *(b-(i/2-2)) ); - Diag = _mm_insert_epi16(Diag,error,0); - error = ( *(a-(i/2-1)) != *(b-(i/2-1)) ); - Diag = _mm_insert_epi16(Diag,error,1); - Diag = _mm_insert_epi16(Diag,2*e,2); - - R0 = _mm_min_epi16(R1+Side1, R0+Diag); - R0 = _mm_min_epi16(R0,_mm_slli_si128(R1,2)+Down1); - - i0 = _mm_extract_epi16(R0, 0); - i1 = _mm_extract_epi16(R0, 1); - - totalError = min(totalError, i1); - - //fill the second part of the error - i++; - Diag = _mm_sub_epi8(Diag, Diag); - Diag = _mm_insert_epi16(Diag,2*e,0); - error = ( *(a-(i/2)) != *(b-(lenb-1)) ); - Diag = _mm_insert_epi16(Diag,error,1); - Diag = _mm_insert_epi16(Diag,2*e,2); - - R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); - R1 = _mm_min_epi16(R1,_mm_slli_si128(R0,2)+Down1); - - i0 = _mm_extract_epi16(R1, 0); - i1 = _mm_extract_epi16(R1, 1); - - totalError = min(totalError, i1); - - //fill the last the last element of the matrix - i++; - Diag = _mm_sub_epi8(Diag, Diag); - error = ( *(a-(i/2)) != *(b-(lenb-1)) ); - Diag = _mm_insert_epi16(Diag,error,0); - - Down = _mm_insert_epi16(Down,1,0); - - Side = _mm_insert_epi16(Side,1,0); - - tmp = _mm_srli_si128(R1,2); - - R0 = _mm_min_epi16(R1+Down, _mm_srli_si128(R0,2)+Diag); - R0 = _mm_min_epi16(R0,tmp+Side); - - i0 = _mm_extract_epi16(R0, 0); - - totalError = min(totalError, i0); - - if(totalError > e || totalError == 0) - return -1; - return totalError; -} - -void initBestMapping(int totalReadNumber) -{ - int i = 0; - bestHitMappingInfo = getMem(totalReadNumber * sizeof(BestFullMappingInfo)); - for(i = 0; i < totalReadNumber; i++) { - bestHitMappingInfo[i].loc = -1; - } -} - - -void finalizeBestSingleMapping() -{ - int i = 0; - char *_tmpQual, *_tmpSeq; - char rqual[SEQ_LENGTH + 1]; - rqual[SEQ_LENGTH]='\0'; - - for(i = 0; i < _msf_seqListSize; i++) - { - if(_msf_seqList[i].hits[0] != 0) - { - if (bestHitMappingInfo[i].dir) - { - reverse(_msf_seqList[i].qual, rqual, SEQ_LENGTH); - _tmpQual = rqual; - _tmpSeq = _msf_seqList[i].rseq; - } - else - { - _tmpQual = _msf_seqList[i].qual; - _tmpSeq = _msf_seqList[i].seq; - } - - - _msf_output.QNAME = _msf_seqList[i].name; - _msf_output.FLAG = 16 * bestHitMappingInfo[i].dir; - _msf_output.RNAME = bestHitMappingInfo[i].chr; - - _msf_output.POS = bestHitMappingInfo[i].loc; - _msf_output.MAPQ = 255; - _msf_output.CIGAR = bestHitMappingInfo[i].cigar ; - _msf_output.MRNAME = "*"; - _msf_output.MPOS = 0; - _msf_output.ISIZE = 0; - - - _msf_output.SEQ = _tmpSeq; - _msf_output.QUAL = _tmpQual; - - _msf_output.optSize = 2; - _msf_output.optFields = _msf_optionalFields; - - _msf_optionalFields[0].tag = "NM"; - _msf_optionalFields[0].type = 'i'; - _msf_optionalFields[0].iVal = bestHitMappingInfo[i].err; - - _msf_optionalFields[1].tag = "MD"; - _msf_optionalFields[1].type = 'Z'; - _msf_optionalFields[1].sVal = bestHitMappingInfo[i].md; - - output(_msf_output); - } - } - freeMem(bestHitMappingInfo, _msf_seqListSize * sizeof(FullMappingInfo)); -} -/**********************************************/ -int compare (const void *a, const void *b) -{ - return ((Pair *)a)->hv - ((Pair *)b)->hv; - /*char *s1 = ((Pair *)a)->hv; - char *s2 = ((Pair *)b)->hv; - int i = 0; - - int diff = 0; - int sign = 0; - - for(i = 0; i < SEQ_LENGTH; i++) - { - diff += (s1[i] != s2[i]); - if(s1[i] > s2[i]) - sign++; - else if(s1[i] < s2[i]) - sign--; - } - - return diff*sign;*/ -// return strncmp(s1, s2,SEQ_LENGTH); - -} -/**********************************************/ -void preProcessReads() -{ - int i = 0; - - _msf_sort_seqList = getMem(_msf_seqListSize * sizeof(Pair)); - for(i = 0; i < _msf_seqListSize; i++) - { - _msf_sort_seqList[i].hv = hashVal(_msf_seqList[i].seq); - - _msf_sort_seqList[i].readNumber = i; - } - - qsort(_msf_sort_seqList, _msf_seqListSize, sizeof(Pair), compare); - - /* - for(i = 0; i < _msf_seqListSize; i++) - { - //printf("%s\n", _msf_sort_seqList[i].hv); - } - */ - - _msf_map_sort_seqList = getMem(_msf_seqListSize * sizeof(int)); - - for(i = 0; i < _msf_seqListSize; i++) - _msf_map_sort_seqList[_msf_seqList[i].readNumber] = i; - -} -/**********************************************/ - -int verifySingleEnd(int index, char* seq, int offset) -{ - int curOff = 0; - int i; - - char *ref; - - int err; - int errCnt =0; - int errCntOff = 0; - int NCntOff = 0; - - ref = _msf_refGen + index - 1; - - verificationCnt++; - - for (i = 0; i < SEQ_LENGTH; i++) - { - err = *ref != *seq; - errCnt += err; - if (errCnt > errThreshold) - { - - return -1; - } - - if (i >= _msf_samplingLocs[curOff] && i <= _msf_samplingLocsEnds[curOff]) - { - errCntOff += err; - NCntOff += (*seq == 'N'); - } - else if (curOff < _msf_samplingLocsSize && i>=_msf_samplingLocs[curOff+1]) - { - - if (errCntOff == 0 && NCntOff == 0 && offset > curOff) - { - return -1; - } - - errCntOff = 0; - NCntOff = 0; - curOff++; - - if ( i >= _msf_samplingLocs[curOff]) - { - errCntOff += err; - NCntOff += (*seq == 'N'); - } - } - - ref++; - seq++; - } - return errCnt; -} - -/*********************************************/ -void initFAST(Read *seqList, int seqListSize, int *samplingLocs, int samplingLocsSize, char *genFileName) -{ - int i; - - if (_msf_optionalFields == NULL) - { - _msf_op = getMem(SEQ_LENGTH); - if (pairedEndMode) - { - _msf_optionalFields = getMem(8*sizeof(OPT_FIELDS)); - } - else - { - _msf_optionalFields = getMem(2*sizeof(OPT_FIELDS)); - } - - for (i=0; i<200;i++) - { - sprintf(_msf_numbers[i],"%d%c",i, '\0'); - } - sprintf(_msf_cigar, "%dM", SEQ_LENGTH); - } - - if (_msf_samplingLocsEnds == NULL) - { - _msf_samplingLocs = samplingLocs; - _msf_samplingLocsSize = samplingLocsSize; - - _msf_samplingLocsEnds = getMem(sizeof(int)*_msf_samplingLocsSize); - for (i=0; i<_msf_samplingLocsSize; i++) - { - _msf_samplingLocsEnds[i]=_msf_samplingLocs[i]+WINDOW_SIZE-1; - } - - _msf_seqList = seqList; - _msf_seqListSize = seqListSize; - - preProcessReads(); - - _msf_oeaMapping = getMem(_msf_seqListSize * sizeof(int)); - for(i = 0; i < _msf_seqListSize; i++) - { - _msf_oeaMapping[i] = 0; - } - - _msf_discordantMapping = getMem(_msf_seqListSize * sizeof(int)); - for(i = 0; i < _msf_seqListSize; i++) - { - _msf_discordantMapping[i] = 0; - } - - } - - if (_msf_refGenName == NULL) - { - _msf_refGenName = getMem(4*SEQ_LENGTH); - } - _msf_refGen = getRefGenome(); - _msf_refGenLength = strlen(_msf_refGen); - - _msf_refGenOffset = getRefGenomeOffset(); - snprintf(_msf_refGenName, 4*SEQ_LENGTH,"%s%c", getRefGenomeName(), '\0'); - _msf_refGenName[strlen(getRefGenomeName())] = '\0'; - - - if (_msf_verifiedLocs != NULL){ - freeMem(_msf_verifiedLocs, sizeof(int) * (_msf_refGenLength+1)); - } - - _msf_verifiedLocs = (int *) getMem(sizeof(int)*(_msf_refGenLength+1)); - - for (i=0; i<=_msf_refGenLength; i++) - _msf_verifiedLocs[i] = _msf_seqListSize*10+1; - - - - if (pairedEndMode && _msf_seqHits == NULL) - { - - _msf_mappingInfo = getMem(seqListSize * sizeof (MappingInfo)); - - for (i=0; i<seqListSize; i++) - { - //_msf_mappingInfo[i].next = getMem(sizeof(MappingLocations)); - _msf_mappingInfo[i].next = NULL; - _msf_mappingInfo[i].size = 0; - } - - _msf_seqHits = getMem((_msf_seqListSize) * sizeof(int)); - - - for (i=0; i<_msf_seqListSize; i++) - { - _msf_seqHits[i] = 0; - } - - _msf_readHasConcordantMapping = getMem(_msf_seqListSize / 2 * sizeof(char)); - for(i = 0; i < _msf_seqListSize/2; i++) - { - _msf_readHasConcordantMapping[i] = 0; - } - - initLoadingRefGenome(genFileName); - } - - if (_msf_refGenOffset == 0) - { - _msf_refGenBeg = 1; - } - else - { - _msf_refGenBeg = CONTIG_OVERLAP - SEQ_LENGTH + 2; - } - _msf_refGenEnd = _msf_refGenLength - SEQ_LENGTH + 1; - - -} -/**********************************************/ -void finalizeFAST() -{ - freeMem(_msf_seqHits, (_msf_seqListSize) * sizeof(int)); - freeMem(_msf_refGenName, 4*SEQ_LENGTH); - - - /* - int i; - for (i=0; i<_msf_rIndexSize; i++) - { - freeMem(_msf_rIndex[i].seqInfo, _msf_rIndex[i].seqInfo[0]+1); - } - freeMem(_msf_rIndex, _msf_rIndexSize);*/ - - - freeMem(_msf_map_sort_seqList, sizeof(Pair) * _msf_seqListSize); - freeMem(_msf_sort_seqList, sizeof(int) * _msf_seqListSize); - -} - -/* - Will apply the Levenshtein Dynamic programming. - Different from verifySingleEndEditDistance fucntion - as in this fucntion only one dynamic table is made while - in verifySingleEndEditDistance two dynamic table is made - for each right and left string -*/ -int editDistance(int refIndex, char *seq, int seqLength, char *matrix) -{ - int i = 0; - int size = 0; - int error = 0; - int rIndex = 0; - int directionIndex = 0; - - int min = 0; - int minIndex =0; - - int tempUp = 0; - int tempDown = 0; - - char *ref; - - int errorString = 0; - /* - 1: Up - 2: Side - 3: Diagnoal Match - 4: Diagnoal Mismatch - */ - - int upValue; - int diagValue; - int sideValue; - - ref = _msf_refGen + refIndex - 1; - - rIndex = 1; - - for(i=0; i <= errThreshold; i++) - { - score[0][i] = i; - score[i][0] = i; - } - - while(rIndex <= seqLength +errThreshold) - { - tempUp = ((rIndex - errThreshold) > 0 ? ((rIndex > seqLength) ? seqLength - errThreshold :rIndex - errThreshold) : 1 ); - tempDown = ((rIndex >= seqLength-errThreshold ) ? seqLength+1 :rIndex + errThreshold + 1); - for(i = tempUp ; i < tempDown ; i++) - { - errorString = (*(ref+rIndex-1) == *(seq+i-1)); - - upValue = score[i-1][rIndex]+1; - diagValue = score[i-1][rIndex-1]+ !errorString; - sideValue = score[i][rIndex-1]+1; - - if(i != tempUp && i != tempDown-1) - score[i][rIndex] = min3(sideValue, diagValue , upValue); - - else if( (i == ((rIndex - errThreshold) > 0 ? rIndex - errThreshold : 1)) && rIndex <= seqLength ) - score[i][rIndex] = min(sideValue, diagValue); - else if(rIndex > seqLength && (i == seqLength - errThreshold) ) - score[i][rIndex] = sideValue; - else - score[i][rIndex] = min(diagValue , upValue); - - if(i == tempUp) - error = score[i][rIndex]; - else if(error > score[i][rIndex]) - error = score[i][rIndex]; - } - rIndex++; - } - - min = score[seqLength][seqLength+errThreshold]; - minIndex = seqLength + errThreshold; - - // Find the Best error for all the possible ways. - for(i = 1; i <= 2*errThreshold; i++) - { - if(min >= score[seqLength][seqLength+errThreshold-i] && seqLength+errThreshold-i > 0) - { - min = score[seqLength][seqLength+errThreshold-i]; - minIndex = seqLength+errThreshold-i; - } - } - - error = score[seqLength][minIndex]; - - directionIndex = seqLength; - rIndex = minIndex; - while(directionIndex != 0 || rIndex != 0) - { - - if(rIndex == 0) - { - if(score[directionIndex][rIndex] - score[directionIndex-1][rIndex] == 1) - { - matrix[size] = *(seq+directionIndex-1); - size++; - matrix[size] = 'I'; - directionIndex--; - } - } - else if(directionIndex == 0) - { - if(score[directionIndex][rIndex] - score[directionIndex][rIndex-1] == 1) - { - matrix[size] = *(ref+rIndex-1); - size++; - matrix[size] = 'D'; - rIndex--; - } - } - else if(directionIndex-rIndex == errThreshold) - { - if(score[directionIndex][rIndex] - score[directionIndex-1][rIndex] == 1) - { - matrix[size] = *(seq+directionIndex-1); - size++; - matrix[size] = 'I'; - directionIndex--; - } - else if( score[directionIndex][rIndex] - score[directionIndex-1][rIndex-1] == 1 ) - { - matrix[size] = *(ref+rIndex-1); - rIndex--; - directionIndex--; - } - else - { - matrix[size] = 'M'; - rIndex--; - directionIndex--; - } - - } - else if(rIndex - directionIndex == errThreshold) - { - if(score[directionIndex][rIndex] - score[directionIndex][rIndex-1] == 1) - { - matrix[size] = *(ref+rIndex-1); - size++; - matrix[size] = 'D'; - rIndex--; - } - else if( score[directionIndex][rIndex] - score[directionIndex-1][rIndex-1] == 1 ) - { - matrix[size] = *(ref+rIndex-1); - rIndex--; - directionIndex--; - } - else - { - matrix[size] = 'M'; - rIndex--; - directionIndex--; - } - } - else - { - if(score[directionIndex][rIndex] - score[directionIndex-1][rIndex] == 1 && directionIndex != 0) - { - matrix[size] = *(seq+directionIndex-1); - size++; - matrix[size] = 'I'; - directionIndex--; - } - else if(score[directionIndex][rIndex] - score[directionIndex][rIndex-1] == 1 && rIndex != 0) - { - matrix[size] = *(ref+rIndex-1); - size++; - matrix[size] = 'D'; - rIndex--; - } - else if( score[directionIndex][rIndex] - score[directionIndex-1][rIndex-1] == 1 ) - { - matrix[size] = *(ref+rIndex-1); - rIndex--; - directionIndex--; - } - else - { - matrix[size] = 'M'; - rIndex--; - directionIndex--; - } - } - size++; - } - - matrix[size] = '\0'; - - char returnString[200]; - - returnString[0] = '\0'; - reverse(matrix, returnString, size); - sprintf(matrix, "%s", returnString); - - return error; -} - -/* - Will apply the Levenshtein Dynamic programming. - in both right and left direction as long as the - threshould error is reached or end of string length - -*/ -int msfHashVal(char *seq) -{ - int i=0; - int val=0, numericVal=0; - - while(i<6) - { - switch (seq[i]) - { - case 'A': - numericVal = 0; - break; - case 'C': - numericVal = 1; - break; - case 'G' : - numericVal = 2; - break; - case 'T': - numericVal = 3; - break; - default: - return -1; - break; - } - val = (val << 2)|numericVal; - i++; - } - return val; -} - - - -int verifySingleEndEditDistance2(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, char *matrix, int *map_location, short *seqHashValue) -{ - int i = 0; - - char * ref; - char * tempref; - - int rIndex = 0; //reference Index - - int e = errThreshold; - int error = 0; - int error1 = 0; - int totalError = 0; - - - /* - 1: Up - 2: Side - 3: Diagnoal Match - 4: Diagnoal Mismatch - */ - - - int minIndex1 = 0; - int minIndex2 = 0; - - - int directionIndex = 0; - - int size = 0; - - int startIndex1 = 0; - - rIndex = 1; - - - char matrixR[200]; - char matrixL[200]; - - ref = _msf_refGen + refIndex - 1; - tempref = _msf_refGen + refIndex - 1; - - int jumpIndex = 0; - - if(rSeqLength != 0) - { - error1 = forwardEditDistance2SSE2(ref+segLength+jumpIndex, rSeqLength-jumpIndex, rSeq+jumpIndex, rSeqLength-jumpIndex); - if(error1 == -1) - return -1; - } - - - if(lSeqLength != 0) - { - error = backwardEditDistance2SSE2(ref-1, lSeqLength, lSeq+lSeqLength-1, lSeqLength); - if(error == -1) - { - return -1; - } - } - - matrixL[0] = '\0'; - matrixR[0] = '\0'; - - - ref = _msf_refGen + refIndex - 1; - - if(error1+error > errThreshold) - return -1; - - ref = _msf_refGen + refIndex - 1; - - rIndex = startIndex1+1; - - int i0 = 0; - int i1 = 0; - int i2 = 0; - - __m128i R0; - __m128i R1; - - __m128i Side1, Side2,Side; //side matrix - __m128i Down1, Down2,Down; //down matrix - __m128i Diag; // - - __m128i tmp; - - /* initialize */ - R0 = _mm_setzero_si128 (); - R1 = _mm_setzero_si128 (); - Diag = _mm_setzero_si128 (); - Side1 = _mm_setzero_si128 (); - Side2 = _mm_setzero_si128 (); - Down1 = _mm_setzero_si128 (); - Down2 = _mm_setzero_si128 (); - Down = _mm_setzero_si128 (); - Side = _mm_setzero_si128 (); - tmp = _mm_setzero_si128 (); - /* end initialize */ - - int mismatch[3] = {0,0,0}; - - if(lSeqLength != 0) - { - char *a; - char *b; - - a = ref-1; - b = lSeq+lSeqLength-1; - - R0 = _mm_insert_epi16(R0,0,0); - - score[0][0] = 0; - - R1 = _mm_insert_epi16(R1,1,0); - R1 = _mm_insert_epi16(R1,1,1); - - score[1][0] = 1; - direction1[1][0] = 1; - score[0][1] = 1; - direction1[0][1] = 2; - - mismatch[0] = ((a[0]) != (b[0])); - - Diag = _mm_insert_epi16(Diag,2*e,0); - Diag = _mm_insert_epi16(Diag,mismatch[0],1); - Diag = _mm_insert_epi16(Diag,2*e,2); - - Side1 = _mm_insert_epi16(Side1,1,0); - Side1 = _mm_insert_epi16(Side1,1,1); - Side1 = _mm_insert_epi16(Side1,2*e,2); - - Side2 = _mm_insert_epi16(Side2,2*e,0); - Side2 = _mm_insert_epi16(Side2,1,1); - Side2 = _mm_insert_epi16(Side2,1,2); - - Down1 = _mm_insert_epi16(Down1,1,0); - Down1 = _mm_insert_epi16(Down1,1,1); - Down1 = _mm_insert_epi16(Down1,2*e,2); - - Down2 = _mm_insert_epi16(Down2,2*e,0); - Down2 = _mm_insert_epi16(Down2,1,1); - Down2 = _mm_insert_epi16(Down2,1,2); - - tmp = _mm_slli_si128(R1,2); - - R0 = _mm_min_epi16(R1+Side1, R0+Diag); - R0 = _mm_min_epi16(R0,tmp+Down2); - - i0 = _mm_extract_epi16(R0, 0); - i1 = _mm_extract_epi16(R0, 1); - i2 = _mm_extract_epi16(R0, 2); - - score[0][2] = i0; - score[1][1] = i1; - score[2][0] = i2; - - direction1[0][2] = 2; - direction1[1][1] = ((mismatch[0] == 0)? 3 : 4); - direction1[2][0] = 1; - - for (i = 3; i < 2*lSeqLength; i++) - { - if(i % 2 ==1) - { - Diag = _mm_sub_epi8(Diag, Diag); - mismatch[0] = ( *(a-((i+1)/2-1)) != *(b-((i-1)/2-1)) ); - Diag = _mm_insert_epi16(Diag,mismatch[0],0); - mismatch[1] = ( *(a-((i-1)/2-1)) != *(b-((i+1)/2-1)) ); - Diag = _mm_insert_epi16(Diag,mismatch[1],1); - - tmp = _mm_srli_si128(R0,2); - - R1 = _mm_min_epi16(tmp+Side1, R1+Diag); - R1 = _mm_min_epi16(R1,R0+Down1); - - i0 = _mm_extract_epi16(R1, 0); - i1 = _mm_extract_epi16(R1, 1); - - score[i/2][i/2+1] = i0; - score[i/2+1][i/2] = i1; - - direction1[i/2][i/2+1] = (score[i/2][i/2+1]==score[i/2-1][i/2] && mismatch[0] == 0) ? 3 : - (score[i/2][i/2+1]-score[i/2-1][i/2+1]==1)? 1 : - (score[i/2][i/2+1]-score[i/2][i/2]==1) ? 2 : 4; - - direction1[i/2+1][i/2] = (score[i/2+1][i/2]==score[i/2][i/2-1] && mismatch[1] == 0) ? 3 : - (score[i/2+1][i/2]-score[i/2][i/2]==1) ? 1 : - (score[i/2+1][i/2]-score[i/2+1][i/2-1]==1)? 2 : 4; - - if(i > 2 * lSeqLength - 2) - { - error = min(error, i1); - if(error == i1) - minIndex1 = i-lSeqLength; - } - } - - else if(i % 2 == 0) - { - mismatch[0] = ( *(a-(i/2)) != *(b-(i/2-2)) ); - Diag = _mm_insert_epi16(Diag,mismatch[0],0); - mismatch[1] = ( *(a-(i/2-1)) != *(b-(i/2-1)) ); - Diag = _mm_insert_epi16(Diag,mismatch[1],1); - mismatch[2] = ( *(a-(i/2-2)) != *(b-(i/2)) ); - Diag = _mm_insert_epi16(Diag,mismatch[2],2); - - tmp = _mm_slli_si128(R1,2); - - R0 = _mm_min_epi16(R1+Side1, R0+Diag); - R0 = _mm_min_epi16(R0,tmp+Down2); - - i0 = _mm_extract_epi16(R0, 0); - i1 = _mm_extract_epi16(R0, 1); - i2 = _mm_extract_epi16(R0, 2); - - score[i/2-1][i/2+1] = i0; - score[i/2][i/2] = i1; - score[i/2+1][i/2-1] = i2; - - direction1[i/2-1][i/2+1] = (score[i/2-1][i/2+1]==score[i/2-2][i/2] && mismatch[0] == 0) ? 3 : (score[i/2-1][i/2+1]-score[i/2-1][i/2]==1) ? 2 : 4; - - direction1[i/2][i/2] = (score[i/2][i/2]==score[i/2-1][i/2-1] && mismatch[1] == 0) ? 3 : - (score[i/2][i/2]-score[i/2-1][i/2]==1) ? 1 : - (score[i/2][i/2]-score[i/2][i/2-1]==1) ? 2 : 4; - - direction1[i/2+1][i/2-1] = (score[i/2+1][i/2-1]==score[i/2][i/2-2] && mismatch[2] == 0) ? 3 : - (score[i/2+1][i/2-1]-score[i/2][i/2-1]==1) ? 1 : 4; - - if( (i/2) % segLength == 0 && i1 == 0) // the segment has been processed no need to process it again - { - return -1; - } - - if(i == 2 * lSeqLength - 2) - { - error = i2; - minIndex1 = i-lSeqLength; - } - } - } - - Down1 = _mm_insert_epi16(Down1,2*e,0); - - //fill the first part of the error - mismatch[0] = ( *(a-(i/2)) != *(b-(i/2-2)) ); - Diag = _mm_insert_epi16(Diag,mismatch[0],0); - mismatch[1] = ( *(a-(i/2-1)) !=*(b-(i/2-1)) ); - Diag = _mm_insert_epi16(Diag,mismatch[1],1); - Diag = _mm_insert_epi16(Diag,2*e,2); - - R0 = _mm_min_epi16(R1+Side1, R0+Diag); - R0 = _mm_min_epi16(R0,_mm_slli_si128(R1,2)+Down1); - - i0 = _mm_extract_epi16(R0, 0); - i1 = _mm_extract_epi16(R0, 1); - - error = min(error, i1); - if(error == i1) - minIndex1 = i-lSeqLength; - - score[i/2-1][i/2+1] = i0; - score[i/2][i/2] = i1; - - direction1[i/2-1][i/2+1] = (score[i/2-1][i/2+1]==score[i/2-2][i/2] && mismatch[0] == 0) ? 3 : - (score[i/2-1][i/2+1]-score[i/2-1][i/2]) ? 2 : 4; - - direction1[i/2][i/2] = (score[i/2][i/2]==score[i/2-1][i/2-1] && mismatch[1] == 0) ? 3 : - (score[i/2][i/2]-score[i/2-1][i/2]==1) ? 1 : - (score[i/2][i/2]-score[i/2][i/2-1]==1)? 2 : 4; - - //fill the second part of the error - i++; - Diag = _mm_sub_epi8(Diag, Diag); - Diag = _mm_insert_epi16(Diag,2*e,0); - mismatch[0] = ( *(a-(i/2)) != *(b-(lSeqLength-1)) ); - Diag = _mm_insert_epi16(Diag,mismatch[0],1); - Diag = _mm_insert_epi16(Diag,2*e,2); - - R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); - R1 = _mm_min_epi16(R1,_mm_slli_si128(R0,2)+Down1); - - i0 = _mm_extract_epi16(R1, 0); - i1 = _mm_extract_epi16(R1, 1); - - error = min(error, i1); - if(error == i1) - minIndex1 = i-lSeqLength; - - score[i/2-1][i/2+2] = i0; - score[i/2][i/2+1] = i1; - - direction1[i/2-1][i/2+2] = (score[i/2-1][i/2+2]==score[i/2-2][i/2+1] && mismatch[0] == 0) ? 3 : - (score[i/2-1][i/2+2]-score[i/2-1][i/2+1]==1) ? 2 : 4; - - direction1[i/2][i/2+1] = (score[i/2][i/2+1]==score[i/2-1][i/2]) ? 3 : - (score[i/2][i/2+1]-score[i/2-1][i/2+1]==1)? 1 : - (score[i/2][i/2+1]-score[i/2][i/2]==1)? 2 : 4; - - //fill the last the last element of the matrix - i++; - Diag = _mm_sub_epi8(Diag, Diag); - mismatch[0] = ( *(a-(i/2)) != *(b-(lSeqLength-1)) ); - Diag = _mm_insert_epi16(Diag,mismatch[0],0); - - Down = _mm_insert_epi16(Down,1,0); - - Side = _mm_insert_epi16(Side,1,0); - - tmp = _mm_srli_si128(R1,2); - - R0 = _mm_min_epi16(R1+Down, R0+Diag); - R0 = _mm_min_epi16(R0,tmp+Side); - - i0 = _mm_extract_epi16(R0, 0); - - error = min(error, i0); - if(error == 0) - return -1; - if(error == i0) - minIndex1 = i-lSeqLength; - if(mismatch[0] == 0) - direction1[lSeqLength][lSeqLength+errThreshold] = 3; - else - { - if(score[lSeqLength][lSeqLength+errThreshold] - score[lSeqLength][lSeqLength+errThreshold-1] == 1) - direction1[lSeqLength][lSeqLength+errThreshold] = 2; - else if(score[lSeqLength][lSeqLength+errThreshold] - score[lSeqLength-1][lSeqLength+errThreshold] == 1) - direction1[lSeqLength][lSeqLength+errThreshold] = 1; - else - direction1[lSeqLength][lSeqLength+errThreshold] = 4; - } - } - error1 = error; - error = 0; - - directionIndex = lSeqLength; - rIndex = minIndex1; - - - *map_location = ((lSeqLength == 0) ? refIndex : refIndex - rIndex) ; - - ref = ref + segLength; - - if(rSeqLength <= e) - { - char *a; - char *b; - - int tmp_index = 0; - - a = ref; - b = rSeq; - - for(tmp_index = 0; tmp_index < rSeqLength; tmp_index++) - { - matrixR[tmp_index] = (a[tmp_index]==b[tmp_index]) ? 'M' : a[tmp_index] ; - } - matrixR[tmp_index] = '\0'; - } - else if(rSeqLength != 0 && rSeqLength >= e) - { - char *a; - char *b; - - a = ref; - b = rSeq; - - R0 = _mm_sub_epi8(R0, R0); - R1 = _mm_sub_epi8(R1, R1); - - R0 = _mm_insert_epi16(R0,0,0); - - score[0][0] = 0; - - R1 = _mm_insert_epi16(R1,1,0); - R1 = _mm_insert_epi16(R1,1,1); - - score[1][0] = 1; - direction2[1][0] = 1; - score[0][1] = 1; - direction2[0][1] = 2; - - mismatch[0] = ((a[0]) != (b[0])); - - Diag = _mm_insert_epi16(Diag,2*e,0); - Diag = _mm_insert_epi16(Diag,mismatch[0],1); - Diag = _mm_insert_epi16(Diag,2*e,2); - - Side1 = _mm_insert_epi16(Side1,1,0); - Side1 = _mm_insert_epi16(Side1,1,1); - Side1 = _mm_insert_epi16(Side1,2*e,2); - - Side2 = _mm_insert_epi16(Side2,2*e,0); - Side2 = _mm_insert_epi16(Side2,1,1); - Side2 = _mm_insert_epi16(Side2,1,2); - - Down1 = _mm_insert_epi16(Down1,1,0); - Down1 = _mm_insert_epi16(Down1,1,1); - Down1 = _mm_insert_epi16(Down1,2*e,2); - - Down2 = _mm_insert_epi16(Down2,2*e,0); - Down2 = _mm_insert_epi16(Down2,1,1); - Down2 = _mm_insert_epi16(Down2,1,2); - - tmp = _mm_slli_si128(R1,2); - - R0 = _mm_min_epi16(R1+Side1, R0+Diag); - R0 = _mm_min_epi16(R0,tmp+Down2); - - i0 = _mm_extract_epi16(R0, 0); - i1 = _mm_extract_epi16(R0, 1); - i2 = _mm_extract_epi16(R0, 2); - - score[0][2] = i0; - score[1][1] = i1; - score[2][0] = i2; - - direction2[0][2] = 2; - direction2[1][1] = ((mismatch[0] == 0)? 3 : 4); - direction2[2][0] = 1; - - - for (i = 3; i < 2*rSeqLength; i++) - { - if(i % 2 ==1) - { - Diag = _mm_sub_epi8(Diag, Diag); - mismatch[0] = ((a[(i+1)/2-1]) != (b[(i-1)/2-1])); - Diag = _mm_insert_epi16(Diag,mismatch[0],0); - mismatch[1] = ((a[(i-1)/2-1]) != (b[(i+1)/2-1])); - Diag = _mm_insert_epi16(Diag,mismatch[1],1); - - tmp = _mm_srli_si128(R0,2); - - R1 = _mm_min_epi16(tmp+Side1, R1+Diag); - R1 = _mm_min_epi16(R1,R0+Down1); - - i0 = _mm_extract_epi16(R1, 0); - i1 = _mm_extract_epi16(R1, 1); - - score[i/2][i/2+1] = i0; - score[i/2+1][i/2] = i1; - - direction2[i/2][i/2+1] = (score[i/2][i/2+1]==score[i/2-1][i/2] && mismatch[0] == 0) ? 3 : - (score[i/2][i/2+1]-score[i/2-1][i/2+1]==1)? 1 : - (score[i/2][i/2+1]-score[i/2][i/2]==1) ? 2 : 4; - - direction2[i/2+1][i/2] = (score[i/2+1][i/2]==score[i/2][i/2-1] && mismatch[1] == 0) ? 3 : - (score[i/2+1][i/2]-score[i/2][i/2]==1) ? 1 : - (score[i/2+1][i/2]-score[i/2+1][i/2-1]==1)? 2 : 4; - - - if(i > 2 * rSeqLength - 2) - { - error = min(error, i1); - if(error == i1) - minIndex2 = i-rSeqLength; - } - } - - else if(i % 2 == 0) - { - mismatch[0] = ((a[i/2]) != (b[i/2-2])); - Diag = _mm_insert_epi16(Diag,mismatch[0],0); - mismatch[1] = ((a[i/2-1]) != (b[i/2-1])); - Diag = _mm_insert_epi16(Diag,mismatch[1],1); - mismatch[2] = ((a[i/2-2]) != (b[i/2])); - Diag = _mm_insert_epi16(Diag,mismatch[2],2); - - tmp = _mm_slli_si128(R1,2); - - R0 = _mm_min_epi16(R1+Side1, R0+Diag); - R0 = _mm_min_epi16(R0,tmp+Down2); - - i0 = _mm_extract_epi16(R0, 0); - i1 = _mm_extract_epi16(R0, 1); - i2 = _mm_extract_epi16(R0, 2); - - score[i/2-1][i/2+1] = i0; - score[i/2][i/2] = i1; - score[i/2+1][i/2-1] = i2; - - direction2[i/2-1][i/2+1] = (score[i/2-1][i/2+1]==score[i/2-2][i/2] && mismatch[0] == 0) ? 3 : - (score[i/2-1][i/2+1]-score[i/2-1][i/2]==1) ? 2 : 4; - - direction2[i/2][i/2] = (score[i/2][i/2]==score[i/2-1][i/2-1] && mismatch[1] == 0) ? 3 : - (score[i/2][i/2]-score[i/2-1][i/2]==1) ? 1 : - (score[i/2][i/2]-score[i/2][i/2-1]==1) ? 2 : 4; - - direction2[i/2+1][i/2-1] = (score[i/2+1][i/2-1]==score[i/2][i/2-2] && mismatch[2]==0) ? 3 : - (score[i/2+1][i/2-1]-score[i/2][i/2-1]==1) ? 1 : 4; - - - if(i == 2 * rSeqLength - 2) - { - error = i2; - minIndex2 = i-rSeqLength; - } - } - } - - Down1 = _mm_insert_epi16(Down1,2*e,0); - - //fill the first part of the error - mismatch[0] = ((a[i/2]) != (b[i/2-2])); - Diag = _mm_insert_epi16(Diag,mismatch[0],0); - mismatch[1] = ((a[i/2-1]) != (b[i/2-1])); - Diag = _mm_insert_epi16(Diag,mismatch[1],1); - Diag = _mm_insert_epi16(Diag,2*e,2); - - R0 = _mm_min_epi16(R1+Side1, R0+Diag); - R0 = _mm_min_epi16(R0,_mm_slli_si128(R1,2)+Down1); - - i0 = _mm_extract_epi16(R0, 0); - i1 = _mm_extract_epi16(R0, 1); - - error = min(error, i1); - if(error == i1) - minIndex2 = i-rSeqLength; - - score[i/2-1][i/2+1] = i0; - score[i/2][i/2] = i1; - - direction2[i/2-1][i/2+1] = (score[i/2-1][i/2+1]==score[i/2-2][i/2] && mismatch[0] == 0) ? 3 : - (score[i/2-1][i/2+1]-score[i/2-1][i/2]==1) ? 2 : 4; - - direction2[i/2][i/2] = (score[i/2][i/2]==score[i/2-1][i/2-1] && mismatch[1] == 0) ? 3 : - (score[i/2][i/2]-score[i/2-1][i/2]==1) ? 1 : - (score[i/2][i/2]-score[i/2][i/2-1]==1)? 2 : 4; - - - //fill the second part of the error - i++; - Diag = _mm_sub_epi8(Diag, Diag); - Diag = _mm_insert_epi16(Diag,2*e,0); - mismatch[0] = ((a[i/2]) != (b[rSeqLength-1])); - Diag = _mm_insert_epi16(Diag,mismatch[0],1); - Diag = _mm_insert_epi16(Diag,2*e,2); - - R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); - R1 = _mm_min_epi16(R1,_mm_slli_si128(R0,2)+Down1); - - i0 = _mm_extract_epi16(R1, 0); - i1 = _mm_extract_epi16(R1, 1); - - error = min(error, i1); - if(error == i1) - minIndex2 = i-rSeqLength; - - score[i/2-1][i/2+2] = i0; - score[i/2][i/2+1] = i1; - - direction2[i/2-1][i/2+2] = (score[i/2-1][i/2+2]==score[i/2-2][i/2+1] && mismatch[0] == 0) ? 3 : - (score[i/2-1][i/2+2]-score[i/2-1][i/2+1]==1) ? 2 : 3; - - direction2[i/2][i/2+1] = (score[i/2][i/2+1]==score[i/2-1][i/2] && mismatch[0] == 0) ? 3 : - (score[i/2][i/2+1]-score[i/2-1][i/2+1]==1)? 1 : - (score[i/2][i/2+1]-score[i/2][i/2]==1)? 2 : 4; - - - //fill the last the last element of the matrix - i++; - Diag = _mm_sub_epi8(Diag, Diag); - mismatch[0] = ((a[i/2]) != (b[rSeqLength-1])); - Diag = _mm_insert_epi16(Diag,mismatch[0],0); - - Down = _mm_sub_epi8(Down, Down); - Down = _mm_insert_epi16(Down,1,0); - - Side = _mm_sub_epi8(Side, Side); - Side = _mm_insert_epi16(Side,1,0); - - tmp = _mm_srli_si128(R1,2); - - R0 = _mm_min_epi16(R1+Down, R0+Diag); - R0 = _mm_min_epi16(R0,tmp+Side); - - i0 = _mm_extract_epi16(R0, 0); - - error = min(error, i0); - if(error == i0) - minIndex2 = i-rSeqLength; - - if(mismatch[0] == 0) - direction2[rSeqLength][rSeqLength+errThreshold] = 3; - else - { - if(score[rSeqLength][rSeqLength+errThreshold] - score[rSeqLength][rSeqLength+errThreshold-1] == 1) - direction2[lSeqLength][lSeqLength+errThreshold] = 2; - else if(score[rSeqLength][rSeqLength+errThreshold] - score[rSeqLength-1][rSeqLength+errThreshold] == 1) - direction2[rSeqLength][rSeqLength+errThreshold] = 1; - else - direction2[rSeqLength][rSeqLength+errThreshold] = 4; - } - - } - - totalError = error1 + error; - - size = 0; - directionIndex = rSeqLength; - rIndex = minIndex2; - - - if(rSeqLength > e) - { - while(directionIndex != 0 || rIndex != 0) - { - - if(direction2[directionIndex][rIndex] == 3) - { - matrixR[size] = 'M'; - rIndex--; - directionIndex--; - } - else if(direction2[directionIndex][rIndex] == 4) - { - matrixR[size] = *(ref+rIndex-1); - rIndex--; - directionIndex--; - } - else if(direction2[directionIndex][rIndex] == 2) - { - matrixR[size] = *(ref+rIndex-1); - size++; - matrixR[size] = 'D'; - rIndex--; - } - else - { - matrixR[size] = *(rSeq+directionIndex-1); - size++; - matrixR[size] = 'I'; - directionIndex--; - } - size++; - } - matrixR[size] = '\0'; - } - size = 0; - directionIndex = lSeqLength; - rIndex = minIndex1; - - while(directionIndex != 0 || rIndex != 0) - { - - if(direction1[directionIndex][rIndex] == 3) - { - matrixL[size] = 'M'; - rIndex--; - directionIndex--; - } - else if(direction1[directionIndex][rIndex] == 4) - { - matrixL[size] = *(tempref-rIndex); - rIndex--; - directionIndex--; - } - else if(direction1[directionIndex][rIndex] == 2) - { - matrixL[size] = 'D'; - size++; - matrixL[size] = *(tempref-rIndex); - rIndex--; - } - else - { - matrixL[size] = 'I'; - size++; - matrixL[size] = *(lSeq+lSeqLength-directionIndex); - directionIndex--; - } - - size++; - } - - matrixL[size] = '\0'; - - char middle[200]; - middle[0] = '\0'; - - for(i = 0; i < segLength; i++) - middle[i] = 'M'; - middle[segLength] = '\0'; - - char rmatrixR[200]; - - reverse(matrixR, rmatrixR, strlen(matrixR)); - - sprintf(matrix, "%s%s%s", matrixL, middle, rmatrixR); - - return totalError; -} - -int verifySingleEndEditDistance4(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, char *matrix, int *map_location, short *seqHashValue) -{ - - int i = 0; - - char * ref; - char * tempref; - - int rIndex = 0; //reference Index - - int error = 0; - int error1 = 0; - - int error2 = 0; - int error3 = 0; - int totalError = 0; - int errorSegment = 0; - - int ERROR_BOUND = errThreshold; - - - /* - 1: Up - 2: Side - 3: Diagnoal Match - 4: Diagnoal Mismatch - */ - - int min = 0; - int minIndex1 = 0; - int minIndex2 = 0; - - int directionIndex = 0; - - - int size = 0; - - ref = _msf_refGen + refIndex - 1; - tempref = _msf_refGen + refIndex - 1; - - - if(lSeqLength != 0) - { - error3 = backwardEditDistance4SSE2(ref-1, lSeqLength, lSeq+lSeqLength-1, lSeqLength); - if(error3 == -1 || error3 == 0){ - return -1; - } - } - - if(rSeqLength != 0) - { - error2 = forwardEditDistance4SSE2(ref+segLength, rSeqLength, rSeq, rSeqLength); - if(error2 == -1) - return -1; - } - - if(error2 + error3 > errThreshold) - return -1; - - rIndex = 1; - - int prevError = 0; - - int tempUp = 0; - int tempDown = 0; - - int errorString = 0; - - int upValue; - int diagValue; - int sideValue; - - while(rIndex <= lSeqLength+errThreshold && lSeqLength != 0) - { - tempUp = ((rIndex - ERROR_BOUND) > 0 ? ((rIndex > lSeqLength) ? lSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1 ); - tempDown = ((rIndex >= lSeqLength-ERROR_BOUND ) ? lSeqLength+1 :rIndex + ERROR_BOUND + 1); - for(i = tempUp ; i < tempDown ; i++) - { - errorString = (*(ref-rIndex) == *(lSeq+lSeqLength-i)); - - upValue = scoreB[i-1][rIndex]+1; - diagValue = scoreB[i-1][rIndex-1]+ !errorString; - sideValue = scoreB[i][rIndex-1]+1; - - if(i != tempUp && i != tempDown-1) - scoreB[i][rIndex] = min3(sideValue, diagValue , upValue); - - else if( (i == ((rIndex - ERROR_BOUND) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= lSeqLength ) - scoreB[i][rIndex] = min(sideValue, diagValue); - else if(rIndex > lSeqLength && (i == lSeqLength - ERROR_BOUND) ) - scoreB[i][rIndex] = sideValue; - else - scoreB[i][rIndex] = min(diagValue , upValue); - - if(i == tempUp) - error = scoreB[i][rIndex]; - else if(error > scoreB[i][rIndex]) - error = scoreB[i][rIndex]; - } - if(rIndex <= lSeqLength) - { - errorSegment = error-prevError; - } - rIndex++; - } - - if(lSeqLength != 0) - { - min = scoreB[lSeqLength][lSeqLength+errThreshold]; - minIndex1 = lSeqLength + errThreshold; - - // Find the Best error for all the possible ways. - for(i = 1; i <= 2*errThreshold; i++) - { - if(min >= scoreB[lSeqLength][lSeqLength+errThreshold-i] && lSeqLength+errThreshold-i > 0) - { - min = scoreB[lSeqLength][lSeqLength+errThreshold-i]; - minIndex1 = lSeqLength+errThreshold-i; - } - } - error = scoreB[lSeqLength][minIndex1]; - } - - error1 = error; - - error = 0; - errorSegment = 0; - - directionIndex = lSeqLength; - rIndex = minIndex1; - - - *map_location = ((lSeqLength == 0) ? refIndex : refIndex - rIndex) ; - - ref = ref + segLength; - - if(rSeqLength != 0) - { - rIndex = 1; - while(rIndex <= rSeqLength+errThreshold-error1) - { - tempUp = (rIndex - ERROR_BOUND) > 0 ? ((rIndex > rSeqLength) ? rSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1; - tempDown = ((rIndex >= rSeqLength- ERROR_BOUND ) ? rSeqLength+1 :rIndex + ERROR_BOUND + 1); - for(i = tempUp; i < tempDown ; i++) - { - errorString = (*(ref+rIndex-1) == *(rSeq+i-1)); - - upValue = scoreF[i-1][rIndex]+1; - diagValue = scoreF[i-1][rIndex-1]+ !errorString; - sideValue = scoreF[i][rIndex-1]+1; - - if(i != tempUp && i != tempDown-1) - scoreF[i][rIndex] = min3(sideValue, diagValue , upValue); - else if( (i == ((rIndex - ERROR_BOUND ) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= rSeqLength ) - scoreF[i][rIndex] = min(sideValue, diagValue); - else if(rIndex > rSeqLength && (i == rSeqLength - ERROR_BOUND) ) - scoreF[i][rIndex] = sideValue; - else - scoreF[i][rIndex] = min(diagValue , upValue); - - if(i == tempUp) - error = scoreF[i][rIndex]; - if(error > scoreF[i][rIndex]) - error = scoreF[i][rIndex]; - } - if(rIndex <= rSeqLength) - { - errorSegment = error; - } - - rIndex++; - } - - min = scoreF[rSeqLength][rSeqLength+errThreshold-error1]; - minIndex2 = rSeqLength + errThreshold-error1; - - // Find the Best error for all the possible ways. - for(i = 1; i <= 2*(errThreshold-error1); i++) - { - if(min > scoreF[rSeqLength][rSeqLength+errThreshold-error1-i] && rSeqLength+errThreshold-error1-i > 0) - { - min = scoreF[rSeqLength][rSeqLength+errThreshold-error1-i]; - minIndex2 = rSeqLength+errThreshold-error1-i; - } - } - error = scoreF[rSeqLength][minIndex2]; - } - - totalError = error + error1; - - if(errThreshold > 4) - printf("ERROR in errorThreshold.\n"); - - - if(totalError != error2 + error3 && totalError > errThreshold) - { - printf("ErrorF=%d, ErrorB=%d Error=%d Error=%d\n", error2,error3,error1,error); - - scanf("%d", &i); - } - - char matrixR[200]; - char matrixL[200]; - - matrixR[0] = '\0'; - matrixL[0] = '\0'; - - size = 0; - directionIndex = rSeqLength; - rIndex = minIndex2; - - while(directionIndex != 0 || rIndex != 0) - { - if(directionIndex-rIndex == errThreshold) - { - if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1) - { - matrixR[size] = *(rSeq+directionIndex-1); - size++; - matrixR[size] = 'I'; - directionIndex--; - } - else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) - { - matrixR[size] = *(ref+rIndex-1); - rIndex--; - directionIndex--; - } - else - { - matrixR[size] = 'M'; - rIndex--; - directionIndex--; - } - - } - else if(rIndex - directionIndex == errThreshold) - { - if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1) - { - matrixR[size] = *(ref+rIndex-1); - size++; - matrixR[size] = 'D'; - rIndex--; - } - else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) - { - matrixR[size] = *(ref+rIndex-1); - rIndex--; - directionIndex--; - } - else - { - matrixR[size] = 'M'; - rIndex--; - directionIndex--; - } - } - else - { - if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1 && directionIndex != 0) - { - matrixR[size] = *(rSeq+directionIndex-1); - size++; - matrixR[size] = 'I'; - directionIndex--; - } - else if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1 && rIndex != 0) - { - matrixR[size] = *(ref+rIndex-1); - size++; - matrixR[size] = 'D'; - rIndex--; - } - else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) - { - matrixR[size] = *(ref+rIndex-1); - rIndex--; - directionIndex--; - } - else - { - matrixR[size] = 'M'; - rIndex--; - directionIndex--; - } - } - size++; - } - matrixR[size] = '\0'; - - size = 0; - directionIndex = lSeqLength; - rIndex = minIndex1; - - - while(directionIndex != 0 || rIndex != 0) - { - if(directionIndex-rIndex == errThreshold) - { - if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1) - { - matrixL[size] = 'I'; - size++; - matrixL[size] = *(lSeq+lSeqLength-directionIndex); - directionIndex--; - } - else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) - { - matrixL[size] = *(tempref-rIndex); - rIndex--; - directionIndex--; - } - else - { - matrixL[size] = 'M'; - rIndex--; - directionIndex--; - } - - } - else if(rIndex - directionIndex == errThreshold) - { - if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1) - { - matrixL[size] = 'D'; - size++; - matrixL[size] = *(tempref-rIndex); - rIndex--; - } - else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) - { - matrixL[size] = *(tempref-rIndex); - rIndex--; - directionIndex--; - } - else - { - matrixL[size] = 'M'; - rIndex--; - directionIndex--; - } - } - else - { - if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1 && directionIndex != 0) - { - matrixL[size] = 'I'; - size++; - matrixL[size] = *(lSeq+lSeqLength-directionIndex); - directionIndex--; - } - else if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1 && rIndex != 0) - { - matrixL[size] = 'D'; - size++; - matrixL[size] = *(tempref-rIndex); - rIndex--; - } - else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) - { - matrixL[size] = *(tempref-rIndex); - rIndex--; - directionIndex--; - } - else - { - matrixL[size] = 'M'; - rIndex--; - directionIndex--; - } - } - - size++; - } - - matrixL[size] = '\0'; - char middle[200]; - middle[0] = '\0'; - - for(i = 0; i < segLength; i++) - middle[i] = 'M'; - middle[segLength] = '\0'; - - char rmatrixR[200]; - - reverse(matrixR, rmatrixR, strlen(matrixR)); - - sprintf(matrix, "%s%s%s", matrixL, middle, rmatrixR); - - return totalError; - -} - -int verifySingleEndEditDistanceExtention(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, - char *matrix, int *map_location, short *seqHashValue) -{ - int i = 0; - - char * ref; - char * tempref; - - int rIndex = 0; //reference Index - - int error = 0; - int error1 = 0; - - int error2 = 0; - int error3 = 0; - int totalError = 0; - int errorSegment = 0; - - int ERROR_BOUND = min(4, errThreshold); - - - /* - 1: Up - 2: Side - 3: Diagnoal Match - 4: Diagnoal Mismatch - */ - - int min = 0; - int minIndex1 = 0; - int minIndex2 = 0; - - int directionIndex = 0; - - - int size = 0; - - ref = _msf_refGen + refIndex - 1; - tempref = _msf_refGen + refIndex - 1; - - - if(lSeqLength != 0) - { - error3 = backwardEditDistanceSSE2Extention(ref-1, lSeqLength, lSeq+lSeqLength-1, lSeqLength); - if(error3 == -1){ - return -1; - } - } - - if(rSeqLength != 0) - { - error2 = forwardEditDistanceSSE2Extention(ref+segLength, rSeqLength, rSeq, rSeqLength); - if(error2 == -1) - return -1; - } - - if(error2 + error3 > errThreshold) - return -1; - - rIndex = 1; - - int prevError = 0; - - int tempUp = 0; - int tempDown = 0; - - int errorString = 0; - - int upValue; - int diagValue; - int sideValue; - if(lSeqLength > ERROR_BOUND) - { - while(rIndex <= lSeqLength+ERROR_BOUND && lSeqLength != 0) - { - tempUp = ((rIndex - ERROR_BOUND) > 0 ? ((rIndex > lSeqLength) ? lSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1 ); - tempDown = ((rIndex >= lSeqLength-ERROR_BOUND ) ? lSeqLength+1 :rIndex + ERROR_BOUND + 1); - for(i = tempUp ; i < tempDown ; i++) - { - errorString = (*(ref-rIndex) == *(lSeq+lSeqLength-i)); - - upValue = scoreB[i-1][rIndex]+1; - diagValue = scoreB[i-1][rIndex-1]+ !errorString; - sideValue = scoreB[i][rIndex-1]+1; - - if(i != tempUp && i != tempDown-1) - scoreB[i][rIndex] = min3(sideValue, diagValue , upValue); - - else if( (i == ((rIndex - ERROR_BOUND) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= lSeqLength ) - scoreB[i][rIndex] = min(sideValue, diagValue); - else if(rIndex > lSeqLength && (i == lSeqLength - ERROR_BOUND) ) - scoreB[i][rIndex] = sideValue; - else - scoreB[i][rIndex] = min(diagValue , upValue); - - if(i == tempUp) - error = scoreB[i][rIndex]; - else if(error > scoreB[i][rIndex]) - error = scoreB[i][rIndex]; - } - if(rIndex <= lSeqLength) - { - errorSegment = error-prevError; - } - rIndex++; - } - - if(lSeqLength != 0) - { - min = scoreB[lSeqLength][lSeqLength+ERROR_BOUND]; - minIndex1 = lSeqLength + ERROR_BOUND; - - // Find the Best error for all the possible ways. - for(i = 1; i <= 2*ERROR_BOUND; i++) - { - if(min >= scoreB[lSeqLength][lSeqLength+ERROR_BOUND-i] && lSeqLength+ERROR_BOUND-i > 0) - { - min = scoreB[lSeqLength][lSeqLength+ERROR_BOUND-i]; - minIndex1 = lSeqLength+ERROR_BOUND-i; - } - } - error = scoreB[lSeqLength][minIndex1]; - } - } - else - { - int j = 0; - for(i = 1; i <= lSeqLength; i++) - { - for(j = 1; j <= lSeqLength; j++) - { - scoreB[i][j] = min3(scoreB[i-1][j-1]+ (*(ref-j) != *(lSeq+lSeqLength-i) ),scoreB[i][j-1]+1 ,scoreB[i-1][j]+1); - } - } - error = scoreB[lSeqLength][lSeqLength]; - minIndex1 = lSeqLength; - - } - error1 = error; - - error = 0; - errorSegment = 0; - - directionIndex = lSeqLength; - rIndex = minIndex1; - - *map_location = ((lSeqLength == 0) ? refIndex : refIndex - rIndex) ; - - ref = ref + segLength; - - if(rSeqLength != 0 && rSeqLength > ERROR_BOUND) - { - ERROR_BOUND = min(ERROR_BOUND, rSeqLength); - - if(rSeqLength == ERROR_BOUND) - { - for(i=0; i < 2*ERROR_BOUND; i++) - scoreF[0][i] = i; - } - - rIndex = 1; - while(rIndex <= rSeqLength+ERROR_BOUND) - { - tempUp = (rIndex - ERROR_BOUND) > 0 ? ((rIndex > rSeqLength) ? rSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1; - tempDown = ((rIndex >= rSeqLength- ERROR_BOUND ) ? rSeqLength+1 :rIndex + ERROR_BOUND + 1); - for(i = tempUp; i < tempDown ; i++) - { - errorString = (*(ref+rIndex-1) == *(rSeq+i-1)); - upValue = scoreF[i-1][rIndex]+1; - diagValue = scoreF[i-1][rIndex-1]+ !errorString; - sideValue = scoreF[i][rIndex-1]+1; - - if(i != tempUp && i != tempDown-1) - scoreF[i][rIndex] = min3(sideValue, diagValue , upValue); - else if( (i == ((rIndex - ERROR_BOUND ) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= rSeqLength ) - scoreF[i][rIndex] = min(sideValue, diagValue); - else if(rIndex > rSeqLength && (i == rSeqLength - ERROR_BOUND) ) - scoreF[i][rIndex] = sideValue; - else - scoreF[i][rIndex] = min(diagValue , upValue); - - if(i == tempUp) - error = scoreF[i][rIndex]; - if(error > scoreF[i][rIndex]) - error = scoreF[i][rIndex]; - } - if(rIndex <= rSeqLength) - { - errorSegment = error; - } - rIndex++; - } - min = scoreF[rSeqLength][rSeqLength+ERROR_BOUND]; - minIndex2 = rSeqLength + ERROR_BOUND; - - // Find the Best error for all the possible ways. - for(i = 1; i <= 2*ERROR_BOUND; i++) - { - if(min > scoreF[rSeqLength][rSeqLength+ERROR_BOUND-i] && rSeqLength+ERROR_BOUND-i > 0) - { - min = scoreF[rSeqLength][rSeqLength+ERROR_BOUND-i]; - minIndex2 = rSeqLength+ERROR_BOUND-i; - } - } - error = scoreF[rSeqLength][minIndex2]; - } - else - { - int j = 0; - for(i = 1; i <= rSeqLength; i++) - { - for(j = 1; j <= rSeqLength; j++) - { - scoreF[i][j] = min3(scoreF[i-1][j-1]+ (*(ref+j-1) != *(rSeq+i-1) ),scoreF[i][j-1]+1 ,scoreF[i-1][j]+1); - } - } - error = scoreF[rSeqLength][rSeqLength]; - minIndex2 = rSeqLength; - } - - totalError = error + error1; - - if(totalError != error2+error3) - { - for(i = 0; i < lSeqLength; i++) - printf("%c", *(tempref-1-i)); - printf("\n"); - for(i = 0; i < lSeqLength; i++) - printf("%c", *(lSeq+i)); - printf("\n"); - - for(i = 0; i < rSeqLength; i++) - printf("%c", *(tempref+segLength+i)); - printf("\n"); - - for(i = 0; i < rSeqLength; i++) - printf("%c", *(rSeq+i)); - printf("\n"); - - printf("ERROR=%d\n", totalError); - printf("ERROR_SSE=%d\n", error3+error2); - - printf("ERROR_SSE_back=%d E_SSE_forw=%d\n", error3, error2); - printf("ERROR_back=%d E_forw=%d\n", error1, error); - - } - - char matrixR[200]; - char matrixL[200]; - - matrixR[0] = '\0'; - matrixL[0] = '\0'; - - size = 0; - directionIndex = rSeqLength; - rIndex = minIndex2; - - - while(directionIndex != 0 || rIndex != 0) - { - if(directionIndex-rIndex == errThreshold) - { - if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1) - { - matrixR[size] = *(rSeq+directionIndex-1); - size++; - matrixR[size] = 'I'; - directionIndex--; - } - else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) - { - matrixR[size] = *(ref+rIndex-1); - rIndex--; - directionIndex--; - } - else - { - matrixR[size] = 'M'; - rIndex--; - directionIndex--; - } - - } - else if(rIndex - directionIndex == errThreshold) - { - if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1) - { - matrixR[size] = *(ref+rIndex-1); - size++; - matrixR[size] = 'D'; - rIndex--; - } - else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) - { - matrixR[size] = *(ref+rIndex-1); - rIndex--; - directionIndex--; - } - else - { - matrixR[size] = 'M'; - rIndex--; - directionIndex--; - } - } - else - { - if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1 && directionIndex != 0) - { - matrixR[size] = *(rSeq+directionIndex-1); - size++; - matrixR[size] = 'I'; - directionIndex--; - } - else if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1 && rIndex != 0) - { - matrixR[size] = *(ref+rIndex-1); - size++; - matrixR[size] = 'D'; - rIndex--; - } - else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) - { - matrixR[size] = *(ref+rIndex-1); - rIndex--; - directionIndex--; - } - else - { - matrixR[size] = 'M'; - rIndex--; - directionIndex--; - } - } - size++; - } - matrixR[size] = '\0'; - - size = 0; - directionIndex = lSeqLength; - rIndex = minIndex1; - - - while(directionIndex != 0 || rIndex != 0) - { - if(directionIndex-rIndex == errThreshold) - { - if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1) - { - matrixL[size] = 'I'; - size++; - matrixL[size] = *(lSeq+lSeqLength-directionIndex); - directionIndex--; - } - else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) - { - matrixL[size] = *(tempref-rIndex); - rIndex--; - directionIndex--; - } - else - { - matrixL[size] = 'M'; - rIndex--; - directionIndex--; - } - - } - else if(rIndex - directionIndex == errThreshold) - { - if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1) - { - matrixL[size] = 'D'; - size++; - matrixL[size] = *(tempref-rIndex); - rIndex--; - } - else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) - { - matrixL[size] = *(tempref-rIndex); - rIndex--; - directionIndex--; - } - else - { - matrixL[size] = 'M'; - rIndex--; - directionIndex--; - } - } - else - { - if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1 && directionIndex != 0) - { - matrixL[size] = 'I'; - size++; - matrixL[size] = *(lSeq+lSeqLength-directionIndex); - directionIndex--; - } - else if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1 && rIndex != 0) - { - matrixL[size] = 'D'; - size++; - matrixL[size] = *(tempref-rIndex); - rIndex--; - } - else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) - { - matrixL[size] = *(tempref-rIndex); - rIndex--; - directionIndex--; - } - else - { - matrixL[size] = 'M'; - rIndex--; - directionIndex--; - } - } - size++; - } - matrixL[size] = '\0'; - - char middle[200]; - middle[0] = '\0'; - for(i = 0; i < segLength; i++) - middle[i] = 'M'; - middle[segLength] = '\0'; - - char rmatrixR[200]; - - reverse(matrixR, rmatrixR, strlen(matrixR)); - - sprintf(matrix, "%s%s%s", matrixL, middle, rmatrixR); - - - return totalError; - -} - - -int verifySingleEndEditDistance(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, char *matrix, int *map_location, short *seqHashValue) -{ - - int i = 0; - - char * ref; - char * tempref; - - int rIndex = 0; //reference Index - - int error = 0; - int error1 = 0; - - int error2 = 0; - int error3 = 0; - - int totalError = 0; - int errorSegment = 0; - - int ERROR_BOUND = errThreshold; - - /* - 1: Up - 2: Side - 3: Diagnoal Match - 4: Diagnoal Mismatch - */ - - int min = 0; - int minIndex1 = 0; - int minIndex2 = 0; - - int directionIndex = 0; - - - int size = 0; - - ref = _msf_refGen + refIndex - 1; - tempref = _msf_refGen + refIndex - 1; - - - if(rSeqLength != 0) - { - if(errThreshold %2 == 1) - error2 = forwardEditDistanceSSE2Odd(ref+segLength, rSeqLength, rSeq, rSeqLength); - else - error2 = forwardEditDistanceSSE2G(ref+segLength, rSeqLength, rSeq, rSeqLength); - if(error2 == -1) - return -1; - } - - if(lSeqLength != 0) - { - if(errThreshold % 2 == 1) - error3 = backwardEditDistanceSSE2Odd(ref-1, lSeqLength, lSeq+lSeqLength-1, lSeqLength); - else - error3 = backwardEditDistanceSSE2G(ref-1, lSeqLength, lSeq+lSeqLength-1, lSeqLength); - if(error3 == -1 || error3 == 0){ - return -1; - } - } - - if(error3 + error2 > errThreshold) - return -1; - - for(i = 0 ; i < errThreshold + 1; i++) - { - scoreB[0][i] = i; - scoreB[i][0] = i; - } - - rIndex = 1; - int prevError = 0; - - int tempUp = 0; - int tempDown = 0; - - int errorString = 0; - - int upValue; - int diagValue; - int sideValue; - - while(rIndex <= lSeqLength+errThreshold && lSeqLength != 0) - { - tempUp = ((rIndex - ERROR_BOUND) > 0 ? ((rIndex > lSeqLength) ? lSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1 ); - tempDown = ((rIndex >= lSeqLength-ERROR_BOUND ) ? lSeqLength+1 :rIndex + ERROR_BOUND + 1); - for(i = tempUp ; i < tempDown ; i++) - { - errorString = (*(ref-rIndex) == *(lSeq+lSeqLength-i)); - - upValue = scoreB[i-1][rIndex]+1; - diagValue = scoreB[i-1][rIndex-1]+ !errorString; - sideValue = scoreB[i][rIndex-1]+1; - - if(i != tempUp && i != tempDown-1) - scoreB[i][rIndex] = min3(sideValue, diagValue , upValue); - - else if( (i == ((rIndex - ERROR_BOUND) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= lSeqLength ) - scoreB[i][rIndex] = min(sideValue, diagValue); - else if(rIndex > lSeqLength && (i == lSeqLength - ERROR_BOUND) ) - scoreB[i][rIndex] = sideValue; - else - scoreB[i][rIndex] = min(diagValue , upValue); - - if(i == tempUp) - error = scoreB[i][rIndex]; - else if(error > scoreB[i][rIndex]) - error = scoreB[i][rIndex]; - } - if(rIndex <= lSeqLength) - { - errorSegment = error-prevError; - } - rIndex++; - } - if(lSeqLength != 0) - { - min = scoreB[lSeqLength][lSeqLength+errThreshold]; - minIndex1 = lSeqLength + errThreshold; - - // Find the Best error for all the possible ways. - for(i = 1; i <= 2*errThreshold; i++) - { - if(min >= scoreB[lSeqLength][lSeqLength+errThreshold-i] && lSeqLength+errThreshold-i > 0) - { - min = scoreB[lSeqLength][lSeqLength+errThreshold-i]; - minIndex1 = lSeqLength+errThreshold-i; - } - } - error = scoreB[lSeqLength][minIndex1]; - } - - error1 = error; - - error = 0; - errorSegment = 0; - - directionIndex = lSeqLength; - rIndex = minIndex1; - - *map_location = ((lSeqLength == 0) ? refIndex : refIndex - rIndex) ; - - ref = ref + segLength; - - if(rSeqLength != 0) - { - for(i = 0 ; i < errThreshold + 1; i++) - { - scoreF[0][i] = i; - scoreF[i][0] = i; - } - - - rIndex = 1; - while(rIndex <= rSeqLength+errThreshold-error1) - { - tempUp = (rIndex - ERROR_BOUND) > 0 ? ((rIndex > rSeqLength) ? rSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1; - tempDown = ((rIndex >= rSeqLength- ERROR_BOUND ) ? rSeqLength+1 :rIndex + ERROR_BOUND + 1); - for(i = tempUp; i < tempDown ; i++) - { - errorString = (*(ref+rIndex-1) == *(rSeq+i-1)); - - upValue = scoreF[i-1][rIndex]+1; - diagValue = scoreF[i-1][rIndex-1]+ !errorString; - sideValue = scoreF[i][rIndex-1]+1; - - if(i != tempUp && i != tempDown-1) - scoreF[i][rIndex] = min3(sideValue, diagValue , upValue); - else if( (i == ((rIndex - ERROR_BOUND ) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= rSeqLength ) - scoreF[i][rIndex] = min(sideValue, diagValue); - else if(rIndex > rSeqLength && (i == rSeqLength - ERROR_BOUND) ) - scoreF[i][rIndex] = sideValue; - else - scoreF[i][rIndex] = min(diagValue , upValue); - - if(i == tempUp) - error = scoreF[i][rIndex]; - if(error > scoreF[i][rIndex]) - error = scoreF[i][rIndex]; - } - if(rIndex <= rSeqLength) - { - errorSegment = error; - } - rIndex++; - } - - min = scoreF[rSeqLength][rSeqLength+errThreshold-error1]; - minIndex2 = rSeqLength + errThreshold-error1; - - // Find the Best error for all the possible ways. - for(i = 1; i <= 2*(errThreshold-error1); i++) - { - if(min > scoreF[rSeqLength][rSeqLength+errThreshold-error1-i] && rSeqLength+errThreshold-error1-i > 0) - { - min = scoreF[rSeqLength][rSeqLength+errThreshold-error1-i]; - minIndex2 = rSeqLength+errThreshold-error1-i; - } - } - error = scoreF[rSeqLength][minIndex2]; - } - - totalError = error + error1; - - - if(totalError != error2 + error3 && totalError > errThreshold) - { - for(i = 0; i < lSeqLength; i++) - printf("%c", *(tempref-1-i)); - printf("\n"); - for(i = 0; i < lSeqLength; i++) - printf("%c", *(lSeq+i)); - printf("\n"); - - for(i = 0; i < rSeqLength; i++) - printf("%c", *(tempref+segLength+i)); - printf("\n"); - - for(i = 0; i < rSeqLength; i++) - printf("%c", *(rSeq+i)); - printf("\n"); - - - printf("SSEF=%d SSEB%d\n", error2, error3); - printf("F=%d B=%d\n", error, error1); - scanf("%d", &i); - } - - char matrixR[200]; - char matrixL[200]; - - matrixR[0] = '\0'; - matrixL[0] = '\0'; - - size = 0; - directionIndex = rSeqLength; - rIndex = minIndex2; - - while(directionIndex != 0 || rIndex != 0) - { - if(directionIndex-rIndex == errThreshold) - { - if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1) - { - matrixR[size] = *(rSeq+directionIndex-1); - size++; - matrixR[size] = 'I'; - directionIndex--; - } - else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) - { - matrixR[size] = *(ref+rIndex-1); - rIndex--; - directionIndex--; - } - else - { - matrixR[size] = 'M'; - rIndex--; - directionIndex--; - } - - } - else if(rIndex - directionIndex == errThreshold) - { - if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1) - { - matrixR[size] = *(ref+rIndex-1); - size++; - matrixR[size] = 'D'; - rIndex--; - } - else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) - { - matrixR[size] = *(ref+rIndex-1); - rIndex--; - directionIndex--; - } - else - { - matrixR[size] = 'M'; - rIndex--; - directionIndex--; - } - } - else - { - if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1 && directionIndex != 0) - { - matrixR[size] = *(rSeq+directionIndex-1); - size++; - matrixR[size] = 'I'; - directionIndex--; - } - else if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1 && rIndex != 0) - { - matrixR[size] = *(ref+rIndex-1); - size++; - matrixR[size] = 'D'; - rIndex--; - } - else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) - { - matrixR[size] = *(ref+rIndex-1); - rIndex--; - directionIndex--; - } - else - { - matrixR[size] = 'M'; - rIndex--; - directionIndex--; - } - } - size++; - } - matrixR[size] = '\0'; - - size = 0; - directionIndex = lSeqLength; - rIndex = minIndex1; - - - while(directionIndex != 0 || rIndex != 0) - { - if(directionIndex-rIndex == errThreshold) - { - if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1) - { - matrixL[size] = 'I'; - size++; - matrixL[size] = *(lSeq+lSeqLength-directionIndex); - directionIndex--; - } - else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) - { - matrixL[size] = *(tempref-rIndex); - rIndex--; - directionIndex--; - } - else - { - matrixL[size] = 'M'; - rIndex--; - directionIndex--; - } - - } - else if(rIndex - directionIndex == errThreshold) - { - if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1) - { - matrixL[size] = 'D'; - size++; - matrixL[size] = *(tempref-rIndex); - rIndex--; - } - else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) - { - matrixL[size] = *(tempref-rIndex); - rIndex--; - directionIndex--; - } - else - { - matrixL[size] = 'M'; - rIndex--; - directionIndex--; - } - } - else - { - if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1 && directionIndex != 0) - { - matrixL[size] = 'I'; - size++; - matrixL[size] = *(lSeq+lSeqLength-directionIndex); - directionIndex--; - } - else if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1 && rIndex != 0) - { - matrixL[size] = 'D'; - size++; - matrixL[size] = *(tempref-rIndex); - rIndex--; - } - else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) - { - matrixL[size] = *(tempref-rIndex); - rIndex--; - directionIndex--; - } - else - { - matrixL[size] = 'M'; - rIndex--; - directionIndex--; - } - } - size++; - } - matrixL[size] = '\0'; - char middle[200]; - middle[0] = '\0'; - for(i = 0; i < segLength; i++) - middle[i] = 'M'; - middle[segLength] = '\0'; - - char rmatrixR[200]; - - reverse(matrixR, rmatrixR, strlen(matrixR)); - - sprintf(matrix, "%s%s%s", matrixL, middle, rmatrixR); - - return totalError; -} - - -int addCigarSize(int cnt){ - if (cnt<10) return 1; - else if (cnt < 100) return 2; - return 3; -} - -/* - Generate Cigar from the back tracking matrix -*/ -void generateCigar(char *matrix, int matrixLength, char *cigar) -{ - int i = 0; - - int counterM=0; - int counterI=0; - int counterD=0; - - int cigarSize = 0; - - cigar[0] = '\0'; - - while(i < matrixLength) - { - if(matrix[i]=='M') - { - counterM++; - if(counterI != 0) - { - sprintf(cigar, "%s%dI", cigar, counterI); - cigarSize += addCigarSize(counterI) + 1; - cigar[cigarSize] = '\0'; - counterI=0; - } - else if(counterD != 0) - { - sprintf(cigar, "%s%dD", cigar, counterD); - cigarSize += addCigarSize(counterD) + 1; - cigar[cigarSize] = '\0'; - counterD=0; - } - } - else if(matrix[i] == 'I') - { - if(counterM != 0) - { - sprintf(cigar, "%s%dM", cigar, counterM); - cigarSize += addCigarSize(counterM) + 1; - cigar[cigarSize] = '\0'; - counterM = 0; - } - else if(counterD != 0) - { - sprintf(cigar, "%s%dD", cigar, counterD); - cigarSize += addCigarSize(counterD) + 1; - cigar[cigarSize] = '\0'; - counterD=0; - } - counterI++; - i++; - - } - else if (matrix[i] == 'D') - { - if(counterM != 0) - { - sprintf(cigar, "%s%dM", cigar, counterM); - cigarSize += addCigarSize(counterM) + 1; - cigar[cigarSize] = '\0'; - counterM = 0; - } - else if(counterI != 0) - { - sprintf(cigar, "%s%dI", cigar, counterI); - cigarSize += addCigarSize(counterI) + 1; - cigar[cigarSize] = '\0'; - counterI=0; - } - - counterD++; - i++; - - } - else - { - counterM++; - if(counterI != 0) - { - sprintf(cigar, "%s%dI", cigar, counterI); - cigarSize += addCigarSize(counterI) + 1; - cigar[cigarSize] = '\0'; - counterI=0; - } - else if(counterD != 0) - { - sprintf(cigar, "%s%dD", cigar, counterD); - cigarSize += addCigarSize(counterD) + 1; - cigar[cigarSize] = '\0'; - counterD=0; - } - } - i++; - } - - if(counterM != 0) - { - sprintf(cigar, "%s%dM", cigar, counterM); - cigarSize += addCigarSize(counterM) + 1; - cigar[cigarSize] = '\0'; - counterM = 0; - } - else if(counterI != 0) - { - sprintf(cigar, "%s%dI", cigar, counterI); - cigarSize += addCigarSize(counterI) + 1; - cigar[cigarSize] = '\0'; - counterI = 0; - } - else if(counterD != 0) - { - sprintf(cigar, "%s%dD", cigar, counterD); - cigarSize += addCigarSize(counterD) + 1; - cigar[cigarSize] = '\0'; - counterD = 0; - } - - cigar[cigarSize] = '\0'; -} - -/* - Creates the Cigar output from the mismatching positions format [0-9]+(([ACTGN]|\^[ACTGN]+)[0-9]+)* -*/ -void generateCigarFromMD(char *mismatch, int mismatchLength, char *cigar) -{ - int i = 0; - int j = 0; - - int start = 0; - int cigarSize = 0; - - cigar[0] = '\0'; - - while(i < mismatchLength) - { - if(mismatch[i] >= '0' && mismatch[i] <= '9') - { - start = i; - - while(mismatch[i] >= '0' && mismatch[i] <= '9' && i < mismatchLength) - i++; - - int value = atoi(mismatch+start); - for(j = 0; j < value-1; j++) - { - cigar[cigarSize] = 'M'; - cigarSize++; - } - cigar[cigarSize] = 'M'; - } - else if(mismatch[i] == '^') - { - cigar[cigarSize] = 'I'; - i++; - } - else if(mismatch[i] == '\'') - { - cigar[cigarSize] = 'D'; - i++; - } - else - { - cigar[cigarSize] = 'M'; - cigarSize++; - } - cigarSize++; - i++; - } - cigar[cigarSize] = '\0'; -} - -void generateSNPSAM(char *matrix, int matrixLength, char *outputSNP) -{ - - int i = 0; - - int counterM = 0; - int counterD = 0; - - char delete[100]; - - int snpSize = 0; - - outputSNP[0] = '\0'; - delete[0] = '\0'; - - - while(i < matrixLength) - { - if(matrix[i]=='M') - { - counterM++; - if(counterD != 0) - { - delete[counterD] = '\0'; - counterD=0; - sprintf(outputSNP, "%s^%s", outputSNP,delete); - snpSize += strlen(delete) + 1; - outputSNP[snpSize] = '\0'; - delete[0] = '\0'; - } - } - else if(matrix[i] == 'D') - { - if(counterM != 0) - { - sprintf(outputSNP, "%s%d", outputSNP, counterM); - snpSize += addCigarSize(counterM); - outputSNP[snpSize] = '\0'; - counterM=0; - delete[counterD] = matrix[i+1]; - i++; - counterD++; - } - else if(counterD != 0) - { - delete[counterD] = matrix[i+1]; - counterD++; - i++; - } - else - { - delete[counterD] = matrix[i+1]; - counterD++; - i++; - } - } - else if(matrix[i] == 'I') - { - if(counterM != 0) - { - // sprintf(outputSNP, "%s%d\0", outputSNP, counterM); - //counterM++; - } - else if(counterD != 0) - { - delete[counterD] = '\0'; - sprintf(outputSNP, "%s^%s", outputSNP, delete); - snpSize += strlen(delete) + 1; - outputSNP[snpSize] = '\0'; - counterD = 0; - delete[0] = '\0'; - } - i++; - - } - else - { - if(counterM != 0) - { - sprintf(outputSNP, "%s%d", outputSNP, counterM); - snpSize += addCigarSize(counterM); - outputSNP[snpSize] = '\0'; - counterM = 0; - } - if(counterD != 0) - { - delete[counterD] = '\0'; - counterD=0; - sprintf(outputSNP, "%s^%s", outputSNP, delete); - snpSize += strlen(delete) + 1; - outputSNP[snpSize] = '\0'; - delete[0] = '\0'; - } - sprintf(outputSNP,"%s%c",outputSNP,matrix[i]); - snpSize += 1; - outputSNP[snpSize] = '\0'; - } - i++; - } - - if(counterM != 0) - { - sprintf(outputSNP, "%s%d", outputSNP, counterM); - snpSize += addCigarSize(counterM); - outputSNP[snpSize] = '\0'; - counterM = 0; - } - else if(counterD != 0) - { - delete[counterD] = '\0'; - sprintf(outputSNP, "%s^%s", outputSNP, delete); - snpSize += strlen(delete) + 1; - outputSNP[snpSize] = '\0'; - counterD = 0; - } - - outputSNP[snpSize] = '\0'; -} -/**********************************************/ - -/* - direction = 0 forward - 1 backward - -*/ - -void mapSingleEndSeq(unsigned int *l1, int s1, int readNumber, int readSegment, int direction) -{ - int j = 0; - int z = 0; - int *locs = (int *) l1; - char *_tmpSeq, *_tmpQual; - char rqual[SEQ_LENGTH+1]; - rqual[SEQ_LENGTH]='\0'; - - int genLoc = 0; - int leftSeqLength = 0; - int rightSeqLength = 0; - int middleSeqLength = 0; - - char matrix[200]; - char editString[200]; - char cigar[MAX_CIGAR_SIZE]; - - short *_tmpHashValue; - - if (direction) - { - reverse(_msf_seqList[readNumber].qual, rqual, SEQ_LENGTH); - _tmpQual = rqual; - _tmpSeq = _msf_seqList[readNumber].rseq; - _tmpHashValue = _msf_seqList[readNumber].rhashValue; - } - else - { - _tmpQual = _msf_seqList[readNumber].qual; - _tmpSeq = _msf_seqList[readNumber].seq; - _tmpHashValue = _msf_seqList[readNumber].hashValue; - } - - int readId = 2*readNumber+direction; - for (z=0; z<s1; z++) - { - - - int map_location = 0; - int a = 0; - int o = readSegment; - - genLoc = locs[z];//-_msf_samplingLocs[o]; - - - if ( genLoc-_msf_samplingLocs[o] < _msf_refGenBeg || - genLoc-_msf_samplingLocs[o] > _msf_refGenEnd || - _msf_verifiedLocs[genLoc-_msf_samplingLocs[o]] == readId || - _msf_verifiedLocs[genLoc-_msf_samplingLocs[o]] == -readId - ) - continue; - int err = -1; - - - map_location = 0; - - leftSeqLength = _msf_samplingLocs[o]; - middleSeqLength = WINDOW_SIZE; - a = leftSeqLength + middleSeqLength; - rightSeqLength = SEQ_LENGTH - a; - - if(errThreshold == 2) - err = verifySingleEndEditDistance2(genLoc, _tmpSeq, leftSeqLength, - _tmpSeq + a, rightSeqLength, - middleSeqLength, matrix, &map_location, _tmpHashValue); - else if(errThreshold == 4) - err = verifySingleEndEditDistance4(genLoc, _tmpSeq, leftSeqLength, - _tmpSeq + a, rightSeqLength, - middleSeqLength, matrix, &map_location, _tmpHashValue); - else if(errThreshold ==3) - err = verifySingleEndEditDistance(genLoc, _tmpSeq, leftSeqLength, - _tmpSeq + a, rightSeqLength, - middleSeqLength, matrix, &map_location, _tmpHashValue); - /*else if(errThreshold == 6) - err = verifySingleEndEditDistance(genLoc, _tmpSeq, leftSeqLength, - _tmpSeq + a, rightSeqLength, - middleSeqLength, matrix, &map_location, _tmpHashValue); - */ - else - err = verifySingleEndEditDistanceExtention(genLoc, _tmpSeq, leftSeqLength, - _tmpSeq + a, rightSeqLength, - middleSeqLength, matrix, &map_location, _tmpHashValue); - - if(err != -1) - { - generateSNPSAM(matrix, strlen(matrix), editString); - generateCigar(matrix, strlen(matrix), cigar); - } - - if(err != -1 && !bestMode) - { - - mappingCnt++; - - int j = 0; - int k = 0; - for(k = 0; k < readSegment+1; k++) - { - for(j = -errThreshold ; j <= errThreshold; j++) - { - if(genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j >= _msf_refGenBeg && - genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j <= _msf_refGenEnd) - _msf_verifiedLocs[genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j] = readId; - } - } - _msf_seqList[readNumber].hits[0]++; - - _msf_output.QNAME = _msf_seqList[readNumber].name; - _msf_output.FLAG = 16 * direction; - _msf_output.RNAME = _msf_refGenName; - _msf_output.POS = map_location + _msf_refGenOffset; - _msf_output.MAPQ = 255; - _msf_output.CIGAR = cigar; - _msf_output.MRNAME = "*"; - _msf_output.MPOS = 0; - _msf_output.ISIZE = 0; - _msf_output.SEQ = _tmpSeq; - _msf_output.QUAL = _tmpQual; - - _msf_output.optSize = 2; - _msf_output.optFields = _msf_optionalFields; - - _msf_optionalFields[0].tag = "NM"; - _msf_optionalFields[0].type = 'i'; - _msf_optionalFields[0].iVal = err; - - _msf_optionalFields[1].tag = "MD"; - _msf_optionalFields[1].type = 'Z'; - _msf_optionalFields[1].sVal = editString; - - output(_msf_output); - - - if (_msf_seqList[readNumber].hits[0] == 1) - { - mappedSeqCnt++; - } - - if ( maxHits == 0 ) - { - _msf_seqList[readNumber].hits[0] = 2; - } - - - if ( maxHits!=0 && _msf_seqList[readNumber].hits[0] == maxHits) - { - completedSeqCnt++; - break; - } - - } - else if(err != -1 && bestMode) - { - mappingCnt++; - _msf_seqList[readNumber].hits[0]++; - - if (_msf_seqList[readNumber].hits[0] == 1) - { - mappedSeqCnt++; - } - - if ( maxHits == 0 ) - { - _msf_seqList[readNumber].hits[0] = 2; - } - - if(err < bestHitMappingInfo[readNumber].err || bestHitMappingInfo[readNumber].loc == -1) - { - setFullMappingInfo(readNumber, map_location + _msf_refGenOffset, direction, err, 0, editString, _msf_refGenName, cigar ); - } - } - else - { - for(j = -errThreshold ; j <= errThreshold; j++) - { - if(genLoc+j > _msf_refGenBeg && - genLoc+j < _msf_refGenEnd) - _msf_verifiedLocs[genLoc+j] = -readId; - } - } - } -} - - -int mapAllSingleEndSeq() -{ - int i = 0; - int j = 0; - int k = 0; - - - unsigned int *locs = NULL; - - - int prev_hash = 0; - - for(i = 0; i < _msf_seqListSize; i++) - { - for(j = 0; j < _msf_samplingLocsSize; j++) - { - k = _msf_sort_seqList[i].readNumber; -// if(j != 0) -// if(strncmp(_msf_seqList[k].seq+_msf_samplingLocs[j], _msf_seqList[k].seq+_msf_samplingLocs[j-1], segSize) == 0) -// continue; -// if(prev_hash == hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j])) -// continue; - locs = getCandidates ( hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j])); - if ( locs != NULL) - { - mapSingleEndSeq(locs+1, locs[0],k ,j, 0); - } - prev_hash = hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j]); - } - } - i = 0; - - for(i = 0; i < _msf_seqListSize; i++) - { - for(j = 0; j < _msf_samplingLocsSize; j++) - { - k = _msf_sort_seqList[i].readNumber; - -// if(j != 0) -// if(strncmp(_msf_seqList[k].rseq+_msf_samplingLocs[j], _msf_seqList[k].rseq+_msf_samplingLocs[j-1], segSize) == 0) -// continue; -// if(prev_hash == hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j])) -// continue; - locs = getCandidates ( hashVal(_msf_seqList[k].rseq+_msf_samplingLocs[j])); - if ( locs != NULL) - { - mapSingleEndSeq(locs+1, locs[0],k ,j, 1); - } - prev_hash = hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j]); - } - } - return 1; -} - - -/**********************************************/ -/**********************************************/ -/**********************************************/ -/**********************************************/ -/**********************************************/ -int compareOut (const void *a, const void *b) -{ - FullMappingInfo *aInfo = (FullMappingInfo *)a; - FullMappingInfo *bInfo = (FullMappingInfo *)b; - return aInfo->loc - bInfo->loc; -} - - - -/**********************************************/ - -/* - direction 0: Forward - 1: Reverse -*/ - -void mapPairEndSeqList(unsigned int *l1, int s1, int readNumber, int readSegment, int direction) -{ - int z = 0; - int *locs = (int *) l1; - char *_tmpSeq; - - char rqual[SEQ_LENGTH+1]; - - char matrix[200]; - char editString[200]; - char cigar[MAX_CIGAR_SIZE]; - - short *_tmpHashValue; - - int leftSeqLength = 0; - int middleSeqLength = 0; - int rightSeqLength =0; - int a = 0; - - rqual[SEQ_LENGTH]='\0'; - - - int r = readNumber; - - char d = (direction==1)?-1:1; - - if (d==-1) - { - _tmpSeq = _msf_seqList[readNumber].rseq; - _tmpHashValue = _msf_seqList[r].rhashValue; - } - else - { - _tmpSeq = _msf_seqList[readNumber].seq; - _tmpHashValue = _msf_seqList[r].hashValue; - } - - int readId = 2*readNumber+direction; - for (z=0; z<s1; z++) - { - int genLoc = locs[z];//-_msf_samplingLocs[o]; - int err = -1; - int map_location = 0; - int o = readSegment; - - leftSeqLength = _msf_samplingLocs[o]; - middleSeqLength = WINDOW_SIZE; - a = leftSeqLength + middleSeqLength; - rightSeqLength = SEQ_LENGTH - a; - - if(genLoc - leftSeqLength < _msf_refGenBeg || genLoc + rightSeqLength + middleSeqLength > _msf_refGenEnd || - _msf_verifiedLocs[genLoc-_msf_samplingLocs[o]] == readId || _msf_verifiedLocs[genLoc-_msf_samplingLocs[o]] == -readId) - continue; - - if(errThreshold == 2) - err = verifySingleEndEditDistance2(genLoc, _tmpSeq, leftSeqLength, - _tmpSeq + a, rightSeqLength, - middleSeqLength, matrix, &map_location, _tmpHashValue); - else if(errThreshold == 4) - err = verifySingleEndEditDistance4(genLoc, _tmpSeq, leftSeqLength, - _tmpSeq + a, rightSeqLength, - middleSeqLength, matrix, &map_location, _tmpHashValue); - else if(errThreshold ==3) - err = verifySingleEndEditDistance(genLoc, _tmpSeq, leftSeqLength, - _tmpSeq + a, rightSeqLength, - middleSeqLength, matrix, &map_location, _tmpHashValue); - /*else if(errThreshold == 6) - err = verifySingleEndEditDistance(genLoc, _tmpSeq, leftSeqLength, - _tmpSeq + a, rightSeqLength, - middleSeqLength, matrix, &map_location, _tmpHashValue);*/ - else - err = verifySingleEndEditDistanceExtention(genLoc, _tmpSeq, leftSeqLength, - _tmpSeq + a, rightSeqLength, - middleSeqLength, matrix, &map_location, _tmpHashValue); - - - if (err != -1) - { - int j = 0; - int k = 0; - - for(k = 0; k < readSegment+1; k++) - { - for(j = -errThreshold ; j <= errThreshold; j++) - { - if(genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j >= _msf_refGenBeg && - genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j <= _msf_refGenEnd) - _msf_verifiedLocs[genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j] = readId; - } - } - - - generateSNPSAM(matrix, strlen(matrix), editString); - generateCigar(matrix, strlen(matrix), cigar); - - MappingLocations *parent = NULL; - MappingLocations *child = _msf_mappingInfo[r].next; - - genLoc = map_location + _msf_refGenOffset; - int i = 0; - for (i=0; i<(_msf_mappingInfo[r].size/MAP_CHUNKS); i++) - { - parent = child; - child = child->next; - } - - if (child==NULL) - { - MappingLocations *tmp = getMem(sizeof(MappingLocations)); - - tmp->next = NULL; - tmp->loc[0]=genLoc * d; - tmp->err[0]=err; - - tmp->cigarSize[0] = strlen(cigar); - sprintf(tmp->cigar[0],"%s", cigar); - - tmp->mdSize[0] = strlen(editString); - sprintf(tmp->md[0],"%s", editString); - - if (parent == NULL) - _msf_mappingInfo[r].next = tmp; - else - parent->next = tmp; - } - else - { - if(strlen(cigar) > SEQ_LENGTH || strlen(editString) > SEQ_LENGTH) - { - printf("ERROR in %d read size(After mapping) exceedes cigar=%d md =%d cigar=%s md =%s\n", r, (int)strlen(cigar), (int)strlen(editString), cigar, editString); - } - - child->loc[_msf_mappingInfo[r].size % MAP_CHUNKS] = genLoc * d; - child->err[_msf_mappingInfo[r].size % MAP_CHUNKS] = err; - - child->cigarSize[_msf_mappingInfo[r].size % MAP_CHUNKS] = strlen(cigar); - sprintf(child->cigar[_msf_mappingInfo[r].size % MAP_CHUNKS],"%s",cigar); - - child->mdSize[_msf_mappingInfo[r].size % MAP_CHUNKS] = strlen(editString); - sprintf(child->md[_msf_mappingInfo[r].size % MAP_CHUNKS],"%s",editString); - } - _msf_mappingInfo[r].size++; - - } - else - { - _msf_verifiedLocs[genLoc] = -readId; - } - - } -} - -/**********************************************/ -void mapPairedEndSeq() -{ - int i = 0; - int j = 0; - int k = 0; - - unsigned int *locs = NULL; - while ( i < _msf_seqListSize ) - { - for(j = 0; j < _msf_samplingLocsSize; j++) - { - k = _msf_sort_seqList[i].readNumber; - locs = getCandidates ( hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j])); - if ( locs != NULL) - { - mapPairEndSeqList(locs+1, locs[0],k ,j, 0); - } - } - i++; - } - i = 0; - - while ( i < _msf_seqListSize ) - { - for(j = 0; j < _msf_samplingLocsSize; j++) - { - k = _msf_sort_seqList[i].readNumber; - locs = getCandidates ( hashVal(_msf_seqList[k].rseq+_msf_samplingLocs[j])); - if ( locs != NULL) - { - mapPairEndSeqList(locs+1, locs[0],k ,j, 1); - } - } - - i++; - } - char fname1[FILE_NAME_LENGTH]; - char fname2[FILE_NAME_LENGTH]; - MappingLocations *cur; - int tmpOut; - int lmax=0, rmax=0; - - sprintf(fname1, "%s__%s__%s__%d__1.tmp",mappingOutputPath, _msf_refGenName, mappingOutput, _msf_openFiles); - sprintf(fname2, "%s__%s__%s__%d__2.tmp",mappingOutputPath, _msf_refGenName, mappingOutput, _msf_openFiles); - - FILE* out; - FILE* out1 = fileOpen(fname1, "w"); - FILE* out2 = fileOpen(fname2, "w"); - - _msf_openFiles++; - - for (i=0; i<_msf_seqListSize; i++) - { - - if (i%2==0) - { - out = out1; - - if (lmax < _msf_mappingInfo[i].size) - { - lmax = _msf_mappingInfo[i].size; - } - } - else - { - out = out2; - if (rmax < _msf_mappingInfo[i].size) - { - rmax = _msf_mappingInfo[i].size; - } - } - - tmpOut = fwrite(&(_msf_mappingInfo[i].size), sizeof(int), 1, out); - if (_msf_mappingInfo[i].size > 0) - { - cur = _msf_mappingInfo[i].next; - for (j=0; j < _msf_mappingInfo[i].size; j++) - { - if ( j>0 && j%MAP_CHUNKS==0) - { - cur = cur->next; - } - if(cur->cigarSize[j % MAP_CHUNKS] > SEQ_LENGTH || cur->mdSize[j % MAP_CHUNKS] > SEQ_LENGTH) - { - printf("ERROR in %d read size exceeds cigar=%d md =%d cigar=%s md =%s\n", i, cur->cigarSize[j % MAP_CHUNKS], cur->mdSize[j % MAP_CHUNKS], cur->cigar[j % MAP_CHUNKS], cur->md[j % MAP_CHUNKS]); - } - - tmpOut = fwrite(&(cur->loc[j % MAP_CHUNKS]), sizeof(int), 1, out); - - tmpOut = fwrite(&(cur->err[j % MAP_CHUNKS]), sizeof(int), 1, out); - - tmpOut = fwrite(&(cur->cigarSize[j % MAP_CHUNKS]), sizeof(int), 1, out); - tmpOut = fwrite((cur->cigar[j % MAP_CHUNKS]), sizeof(char), (cur->cigarSize[j % MAP_CHUNKS]), out); - - tmpOut = fwrite(&(cur->mdSize[j % MAP_CHUNKS]), sizeof(int), 1, out); - tmpOut = fwrite((cur->md[j % MAP_CHUNKS]), sizeof(char), (cur->mdSize[j % MAP_CHUNKS]), out); - - } - _msf_mappingInfo[i].size = 0; - //_msf_mappingInfo[i].next = NULL; - } - } - - _msf_maxLSize += lmax; - _msf_maxRSize += rmax; - - fclose(out1); - fclose(out2); - -} - -void outputPairFullMappingInfo(FILE *fp, int readNumber) -{ - - char *seq1, *seq2, *rseq1, *rseq2, *qual1, *qual2; - char rqual1[SEQ_LENGTH+1], rqual2[SEQ_LENGTH+1]; - - rqual1[SEQ_LENGTH] = rqual2[SEQ_LENGTH] = '\0'; - - seq1 = _msf_seqList[readNumber*2].seq; - rseq1 = _msf_seqList[readNumber*2].rseq; - qual1 = _msf_seqList[readNumber*2].qual; - - reverse(_msf_seqList[readNumber*2].qual, rqual1, SEQ_LENGTH); - - seq2 = _msf_seqList[readNumber*2+1].seq; - rseq2 = _msf_seqList[readNumber*2+1].rseq; - qual2 = _msf_seqList[readNumber*2+1].qual; - - reverse(_msf_seqList[readNumber*2+1].qual, rqual2, SEQ_LENGTH); - - - if(bestHitMappingInfo[readNumber*2].loc == -1 && bestHitMappingInfo[readNumber*2+1].loc == -1) - return; - else - { - - char *seq; - char *qual; - char d1; - char d2; - int isize; - int proper=0; - // ISIZE CALCULATION - // The distance between outer edges - isize = abs(bestHitMappingInfo[readNumber*2].loc - bestHitMappingInfo[readNumber*2+1].loc)+SEQ_LENGTH - 2; - - if (bestHitMappingInfo[readNumber*2].loc - bestHitMappingInfo[readNumber*2+1].loc > 0) - { - isize *= -1; - } - d1 = (bestHitMappingInfo[readNumber*2].dir == -1)?1:0; - d2 = (bestHitMappingInfo[readNumber*2+1].dir == -1)?1:0; - - if ( d1 ) - { - seq = rseq1; - qual = rqual1; - } - else - { - seq = seq1; - qual = qual1; - } - if ( (bestHitMappingInfo[readNumber*2].loc < bestHitMappingInfo[readNumber*2+1].loc && !d1 && d2) || - (bestHitMappingInfo[readNumber*2].loc > bestHitMappingInfo[readNumber*2+1].loc && d1 && !d2) ) - { - proper = 2; - } - else - { - proper = 0; - } - - _msf_output.POS = bestHitMappingInfo[readNumber*2].loc; - _msf_output.MPOS = bestHitMappingInfo[readNumber*2+1].loc; - _msf_output.FLAG = 1+proper+16*d1+32*d2+64; - _msf_output.ISIZE = isize; - _msf_output.SEQ = seq, - _msf_output.QUAL = qual; - _msf_output.QNAME = _msf_seqList[readNumber*2].name; - _msf_output.RNAME = bestHitMappingInfo[readNumber*2].chr; - _msf_output.MAPQ = 255; - _msf_output.CIGAR = bestHitMappingInfo[readNumber*2].cigar; - _msf_output.MRNAME = "="; - - _msf_output.optSize = 2; - _msf_output.optFields = _msf_optionalFields; - - _msf_optionalFields[0].tag = "NM"; - _msf_optionalFields[0].type = 'i'; - _msf_optionalFields[0].iVal = bestHitMappingInfo[readNumber*2].err; - - _msf_optionalFields[1].tag = "MD"; - _msf_optionalFields[1].type = 'Z'; - _msf_optionalFields[1].sVal = bestHitMappingInfo[readNumber*2].md; - - outputSAM(fp, _msf_output); - output(_msf_output); - - if ( d2 ) - { - seq = rseq2; - qual = rqual2; - } - else - { - seq = seq2; - qual = qual2; - } - - _msf_output.POS = bestHitMappingInfo[readNumber*2+1].loc; - _msf_output.MPOS = bestHitMappingInfo[readNumber*2].loc; - _msf_output.FLAG = 1+proper+16*d2+32*d1+128; - _msf_output.ISIZE = -isize; - _msf_output.SEQ = seq, - _msf_output.QUAL = qual; - _msf_output.QNAME = _msf_seqList[readNumber*2].name; - _msf_output.RNAME = bestHitMappingInfo[readNumber*2].chr; - _msf_output.MAPQ = 255; - _msf_output.CIGAR = bestHitMappingInfo[readNumber*2+1].cigar; - _msf_output.MRNAME = "="; - - _msf_output.optSize = 2; - _msf_output.optFields = _msf_optionalFields; - - _msf_optionalFields[0].tag = "NM"; - _msf_optionalFields[0].type = 'i'; - _msf_optionalFields[0].iVal = bestHitMappingInfo[readNumber*2+1].err; - - _msf_optionalFields[1].tag = "MD"; - _msf_optionalFields[1].type = 'Z'; - _msf_optionalFields[1].sVal = bestHitMappingInfo[readNumber*2+1].md; - - outputSAM(fp, _msf_output); - output(_msf_output); - } -} - - -/* -Find the closet one to the c - @return 0: if the x1 is closer to c - 1: if the x2 is closer to c - 2: if both distance are equal - -1: if error -*/ -int findNearest(int x1, int x2, int c) -{ - - if (abs(x1 - c) > abs(x2 - c) ) - return 0; - else if ( abs(x1 - c) < abs(x2 - c) ) - return 1; - else if ( abs(x1 - c) == abs(x2 - c) ) - return 2; - else - return -1; -} - -void initBestConcordantDiscordant(int readNumber) -{ - char bestConcordantFileName[FILE_NAME_LENGTH]; - char bestDiscordantFileName[FILE_NAME_LENGTH]; - - //OPEN THE BEST CONCORDANT FILE - //BEGIN{Farhad Hormozdiari} - sprintf(bestConcordantFileName, "%s%s__BEST.CONCORDANT", mappingOutputPath, mappingOutput); - bestConcordantFILE = fileOpen(bestConcordantFileName, "w"); - //END{Farhad Hormozdiari} - - - //OPEN THE BEST DISCORDANT FILE - //BEGIN{Farhad Hormozdiari} - sprintf(bestDiscordantFileName, "%s%s__BEST.DISCORDANT", mappingOutputPath, mappingOutput); - bestDiscordantFILE = fileOpen(bestDiscordantFileName, "w"); - //END{Farhad Hormozdiari} - - initBestMapping(readNumber); -} - -void finalizeBestConcordantDiscordant() -{ - int i = 0; - - for(i = 0; i<_msf_seqListSize/2; i++) - { - if(_msf_readHasConcordantMapping[i]==1) - outputPairFullMappingInfo(bestConcordantFILE, i); - else - outputPairFullMappingInfo(bestDiscordantFILE, i); - } - - fclose(bestConcordantFILE); - fclose(bestDiscordantFILE); - - freeMem(bestHitMappingInfo, _msf_seqListSize * sizeof(FullMappingInfo)); -} - -void setFullMappingInfo(int readNumber, int loc, int dir, int err, int score, char *md, char * refName, char *cigar) -{ - bestHitMappingInfo[readNumber].loc = loc; - bestHitMappingInfo[readNumber].dir = dir; - bestHitMappingInfo[readNumber].err = err; - bestHitMappingInfo[readNumber].score = score; - - strncpy(bestHitMappingInfo[readNumber].md, md, strlen(md)); - strncpy(bestHitMappingInfo[readNumber].chr, refName, strlen(refName)); - strncpy(bestHitMappingInfo[readNumber].cigar, cigar, strlen(cigar)); -} - - -void setPairFullMappingInfo(int readNumber, FullMappingInfo mi1, FullMappingInfo mi2) -{ - - bestHitMappingInfo[readNumber*2].loc = mi1.loc; - bestHitMappingInfo[readNumber*2].dir = mi1.dir; - bestHitMappingInfo[readNumber*2].err = mi1.err; - bestHitMappingInfo[readNumber*2].score = mi1.score; - snprintf(bestHitMappingInfo[readNumber*2].chr, MAX_REF_SIZE, "%s", _msf_refGenName); - - strncpy(bestHitMappingInfo[readNumber*2].md, mi1.md, strlen(mi1.md)); - strncpy(bestHitMappingInfo[readNumber*2].cigar, mi1.cigar, strlen(mi1.cigar)); - - /* - sprintf(bestHitMappingInfo[readNumber*2].md, "%s\0", mi1.md); - sprintf(bestHitMappingInfo[readNumber*2].cigar, "%s\0", mi1.cigar); - */ - - - bestHitMappingInfo[readNumber*2+1].loc = mi2.loc; - bestHitMappingInfo[readNumber*2+1].dir = mi2.dir; - bestHitMappingInfo[readNumber*2+1].err = mi2.err; - bestHitMappingInfo[readNumber*2+1].score = mi2.score; - - snprintf(bestHitMappingInfo[readNumber*2+1].chr, MAX_REF_SIZE, "%s", _msf_refGenName); - - /* - sprintf(bestHitMappingInfo[readNumber*2+1].md, "%s\0", mi2.md); - sprintf(bestHitMappingInfo[readNumber*2+1].cigar, "%s\0", mi2.cigar); - */ - - strncpy(bestHitMappingInfo[readNumber*2+1].md, mi2.md, strlen(mi2.md)); - strncpy(bestHitMappingInfo[readNumber*2+1].cigar, mi2.cigar, strlen(mi2.cigar)); - -} - -/**********************************************/ -void outputPairedEnd() -{ - int i = 0; - - char cigar[MAX_CIGAR_SIZE]; - - int tmpOut; - - loadRefGenome(&_msf_refGen, &_msf_refGenName, &tmpOut); - - FILE* in1[_msf_openFiles]; - FILE* in2[_msf_openFiles]; - - char fname1[_msf_openFiles][FILE_NAME_LENGTH]; - char fname2[_msf_openFiles][FILE_NAME_LENGTH]; - - // discordant - FILE *out=NULL, *out1=NULL; - - char fname3[FILE_NAME_LENGTH]; - char fname4[FILE_NAME_LENGTH]; - - int meanDistanceMapping = 0; - - char *rqual1; - char *rqual2; - - rqual1 = getMem((SEQ_LENGTH+1)*sizeof(char)); - rqual2 = getMem((SEQ_LENGTH+1)*sizeof(char)); - - if (pairedEndDiscordantMode) - { - sprintf(fname3, "%s__%s__disc", mappingOutputPath, mappingOutput); - sprintf(fname4, "%s__%s__oea", mappingOutputPath, mappingOutput); - out = fileOpen(fname3, "a"); - out1 = fileOpen(fname4, "a"); - } - - FullMappingInfo *mi1 = getMem(sizeof(FullMappingInfo) * _msf_maxLSize); - FullMappingInfo *mi2 = getMem(sizeof(FullMappingInfo) * _msf_maxRSize); - - _msf_fileCount[_msf_maxFile] = 0; - for (i=0; i<_msf_openFiles; i++) - { - sprintf(fname1[i], "%s__%s__%s__%d__1.tmp", mappingOutputPath, _msf_refGenName, mappingOutput, i); - sprintf(_msf_fileName[_msf_maxFile][_msf_fileCount[_msf_maxFile]][0], "%s", fname1[i]); - - sprintf(fname2[i], "%s__%s__%s__%d__2.tmp", mappingOutputPath, _msf_refGenName, mappingOutput, i); - sprintf(_msf_fileName[_msf_maxFile][_msf_fileCount[_msf_maxFile]][1], "%s", fname2[i]); - - in1[i] = fileOpen(fname1[i], "r"); - in2[i] = fileOpen(fname2[i], "r"); - _msf_fileCount[_msf_maxFile]++; - } - _msf_maxFile++; - - int size; - int j, k; - int size1, size2; - - meanDistanceMapping = (pairedEndDiscordantMode==1)? (minPairEndedDiscordantDistance+maxPairEndedDiscordantDistance)/2 + SEQ_LENGTH - : (minPairEndedDistance + maxPairEndedDistance) / 2 + SEQ_LENGTH; - - for (i=0; i<_msf_seqListSize/2; i++) - { - size1 = size2 = 0; - for (j=0; j<_msf_openFiles; j++) - { - tmpOut = fread(&size, sizeof(int), 1, in1[j]); - if ( size > 0 ) - { - for (k=0; k<size; k++) - { - mi1[size1+k].dir = 1; - tmpOut = fread (&(mi1[size1+k].loc), sizeof(int), 1, in1[j]); - tmpOut = fread (&(mi1[size1+k].err), sizeof(int), 1, in1[j]); - - tmpOut = fread (&(mi1[size1+k].cigarSize), sizeof(int), 1, in1[j]); - tmpOut = fread ((mi1[size1+k].cigar), sizeof(char), mi1[size1+k].cigarSize, in1[j]); - mi1[size1+k].cigar[mi1[size1+k].cigarSize] = '\0'; - - tmpOut = fread (&(mi1[size1+k].mdSize), sizeof(int), 1, in1[j]); - tmpOut = fread ((mi1[size1+k].md), sizeof(char), (mi1[size1+k].mdSize), in1[j]); - mi1[size1+k].md[mi1[size1+k].mdSize] = '\0'; - - if (mi1[size1+k].loc<1) - { - mi1[size1+k].loc *= -1; - mi1[size1+k].dir = -1; - } - } - qsort(mi1+size1, size, sizeof(FullMappingInfo), compareOut); - size1+=size; - } - } - - for (j=0; j<_msf_openFiles; j++) - { - tmpOut = fread(&size, sizeof(int), 1, in2[j]); - if ( size > 0 ) - { - for (k=0; k<size; k++) - { - mi2[size2+k].dir = 1; - tmpOut = fread (&(mi2[size2+k].loc), sizeof(int), 1, in2[j]); - tmpOut = fread (&(mi2[size2+k].err), sizeof(int), 1, in2[j]); - - tmpOut = fread (&(mi2[size2+k].cigarSize), sizeof(int), 1, in2[j]); - tmpOut = fread ((mi2[size2+k].cigar), sizeof(char), mi2[size2+k].cigarSize, in2[j]); - mi2[size2+k].cigar[mi2[size2+k].cigarSize] = '\0'; - - tmpOut = fread (&(mi2[size2+k].mdSize), sizeof(int), 1, in2[j]); - tmpOut = fread ((mi2[size2+k].md), sizeof(char), mi2[size2+k].mdSize, in2[j]); - mi2[size2+k].md[mi2[size2+k].mdSize] = '\0'; - - if (mi2[size2+k].loc<1) - { - mi2[size2+k].loc *= -1; - mi2[size2+k].dir = -1; - } - } - qsort(mi2+size2, size, sizeof(FullMappingInfo), compareOut); - size2+=size; - } - } - - int lm, ll, rl, rm; - int pos = 0; - - if (pairedEndDiscordantMode) - { - - for (j=0; j<size1; j++) - { - lm = mi1[j].loc - maxPairEndedDiscordantDistance + 1; - ll = mi1[j].loc - minPairEndedDiscordantDistance + 1; - rl = mi1[j].loc + minPairEndedDiscordantDistance - 1; - rm = mi1[j].loc + maxPairEndedDiscordantDistance - 1; - - while (pos<size2 && mi2[pos].loc < lm) - { - pos++; - } - - k = pos; - while (k<size2 && mi2[k].loc<=rm) - { - if ( mi2[k].loc <= ll || mi2[k].loc >= rl) - { - if ( (mi1[j].loc < mi2[k].loc && mi1[j].dir==1 && mi2[k].dir == -1) || - (mi1[j].loc > mi2[k].loc && mi1[j].dir==-1 && mi2[k].dir == 1) ) - { - _msf_seqList[i*2].hits[0]=1; - _msf_seqList[i*2+1].hits[0]=1; - - if(nosamMode != 0) - { - size1=0; - size2=0; - } - - break; - } - } - k++; - } - } - - _msf_seqHits[i*2] += size1; - _msf_seqHits[i*2+1] += size2; - - - if (_msf_seqHits[i*2+1] * _msf_seqHits[i*2] > DISCORDANT_CUT_OFF && nosamMode != 0) - { - _msf_seqList[i*2].hits[0]=1; - _msf_seqList[i*2+1].hits[0]=1; - size1=0; - size2=0; - } - - - - - int tmp = 0; - int rNo = 0; - int loc = 0; - int err = 0; - float sc = 0; - char l = 0; - - //write the OEA data - if(_msf_seqHits[i*2] == 0 ) - { - for(k = 0;k < size2 && _msf_oeaMapping[i*2+1] < maxOEAOutput ;k++) - { - rNo = i*2+1; - loc = mi2[k].loc*mi2[k].dir; - err = mi2[k].err; - sc = mi2[k].score; - - l = strlen(_msf_refGenName); - - tmp = fwrite(&rNo, sizeof(int), 1, out1); - - tmp = fwrite(&l, sizeof(char), 1, out1); - tmp = fwrite(_msf_refGenName, sizeof(char), l, out1); - - tmp = fwrite(&loc, sizeof(int), 1, out1); - tmp = fwrite(&err, sizeof(int), 1, out1); - tmp = fwrite(&sc, sizeof(float), 1, out1); - - if(mi2[k].cigarSize > SEQ_LENGTH || mi2[k].cigarSize <= 0) - printf("ERROR CIGAR size=%d %s\n", mi2[k].cigarSize, _msf_seqList[i*2+1].seq); - - tmp = fwrite (&(mi2[k].cigarSize), sizeof(int), 1, out1); - tmp = fwrite ((mi2[k].cigar), sizeof(char), mi2[k].cigarSize, out1); - - tmp = fwrite (&(mi2[k].mdSize), sizeof(int), 1, out1); - tmp = fwrite ((mi2[k].md), sizeof(char), mi2[k].mdSize, out1); - - _msf_oeaMapping[i*2+1]++; - } - } - if(_msf_seqHits[i*2+1] == 0) - { - for(j = 0;j < size1 && _msf_oeaMapping[i*2] < maxOEAOutput;j++) - { - rNo = i*2; - loc = mi1[j].loc*mi1[j].dir; - err = mi1[j].err; - sc = mi1[j].score; - - l = strlen(_msf_refGenName); - - tmp = fwrite(&rNo, sizeof(int), 1, out1); - - tmp = fwrite(&l, sizeof(char), 1, out1); - tmp = fwrite(_msf_refGenName, sizeof(char), l, out1); - - tmp = fwrite(&loc, sizeof(int), 1, out1); - tmp = fwrite(&err, sizeof(int), 1, out1); - tmp = fwrite(&sc, sizeof(float), 1, out1); - - if(mi1[j].cigarSize > SEQ_LENGTH || mi1[j].cigarSize <= 0 ) - printf("ERROR %d %s\n", mi1[j].cigarSize, _msf_seqList[i*2+1].seq); - - tmp = fwrite (&(mi1[j].cigarSize), sizeof(int), 1, out1); - tmp = fwrite ((mi1[j].cigar), sizeof(char), mi1[j].cigarSize, out1); - - tmp = fwrite (&(mi1[j].mdSize), sizeof(int), 1, out1); - tmp = fwrite ((mi1[j].md), sizeof(char), mi1[j].mdSize, out1); - - _msf_oeaMapping[i*2]++; - } - } - } - - char *seq1, *seq2, *rseq1, *rseq2, *qual1, *qual2; - - - - - rqual1[SEQ_LENGTH] = '\0'; - rqual2[SEQ_LENGTH] = '\0'; - rqual1[0] = '\0'; - rqual2[0] = '\0'; - - - - seq1 = _msf_seqList[i*2].seq; - rseq1 = _msf_seqList[i*2].rseq; - qual1 = _msf_seqList[i*2].qual; - - - - strncpy(rqual1, _msf_seqList[i*2].qual, SEQ_LENGTH); - - seq2 = _msf_seqList[i*2+1].seq; - rseq2 = _msf_seqList[i*2+1].rseq; - qual2 = _msf_seqList[i*2+1].qual; - - - strncpy(rqual2, _msf_seqList[i*2+1].qual, SEQ_LENGTH); - - if (pairedEndDiscordantMode) - { - for (k=0; k<size1; k++) - { - mi1[k].score = calculateScore(mi1[k].loc, (mi1[k].dir==-1)?rseq1:seq1, (mi1[k].dir==-1)?rqual1:qual1, mi1[k].cigar); - } - - for (k=0; k<size2; k++) - { - mi2[k].score = calculateScore(mi2[k].loc, (mi2[k].dir==-1)?rseq2:seq2, (mi2[k].dir==-1)?rqual2:qual2, mi2[k].cigar); - } - - } - - - if (pairedEndDiscordantMode) - { - for (j=0; j<size1; j++) - { - for(k = 0; k < size2; k++) - { - if( - (mi2[k].loc-mi1[j].loc >= minPairEndedDiscordantDistance && - mi2[k].loc-mi1[j].loc <= maxPairEndedDiscordantDistance && - mi1[j].dir > 0 && mi2[k].dir < 0 ) - - || - - (mi1[j].loc-mi2[k].loc >= minPairEndedDiscordantDistance && - mi1[j].loc-mi2[k].loc <= maxPairEndedDiscordantDistance && - mi1[j].dir < 0 && mi2[k].dir > 0) - ) - { - //POSSIBLE CONCORDANT - if(_msf_readHasConcordantMapping[i] == 0) - { - setPairFullMappingInfo(i, mi1[j], mi2[k]); - _msf_readHasConcordantMapping[i] = 1; - _msf_seqList[i*2].hits[0] = 1; - _msf_seqList[i*2+1].hits[0] = 1; - } - else - { - if(bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err >= mi1[j].err + mi2[k].err) - { - - if( bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err == - mi1[j].err + mi2[k].err && - findNearest(abs(bestHitMappingInfo[i*2+1].loc - bestHitMappingInfo[i*2].loc), - abs(mi2[k].loc - mi1[j].loc), - meanDistanceMapping - ) == 0 ) - { - continue; - } - setPairFullMappingInfo(i, mi1[j], mi2[k]); - } - } - } - //DISCORDANT TO TEMP FILE FOR POST PROCESSIING - else if(_msf_readHasConcordantMapping[i] == 0 && - _msf_seqHits[i*2] != 0 && - _msf_seqHits[i*2+1] != 0) - { - - int tmp; - int rNo = i; - int loc = mi1[j].loc*mi1[j].dir; - int err = mi1[j].err; - float sc = mi1[j].score; - - char l = strlen(_msf_refGenName); - - if(_msf_discordantMapping[i*2] < maxDiscordantOutput) - { - - tmp = fwrite(&rNo, sizeof(int), 1, out); - - tmp = fwrite(&l, sizeof(char), 1, out); - tmp = fwrite(_msf_refGenName, sizeof(char), l, out); - - tmp = fwrite(&loc, sizeof(int), 1, out); - tmp = fwrite(&err, sizeof(int), 1, out); - tmp = fwrite(&sc, sizeof(float), 1, out); - - tmp = fwrite (&(mi1[j].cigarSize), sizeof(int), 1, out); - tmp = fwrite ((mi1[j].cigar), sizeof(char), mi1[j].cigarSize, out); - - tmp = fwrite (&(mi1[j].mdSize), sizeof(int), 1, out); - tmp = fwrite ((mi1[j].md), sizeof(char), mi1[j].mdSize, out); - - - loc = mi2[k].loc*mi2[k].dir; - err = mi2[k].err; - sc = mi2[k].score; - - tmp = fwrite(&loc, sizeof(int), 1, out); - tmp = fwrite(&err, sizeof(int), 1, out); - tmp = fwrite(&sc, sizeof(float), 1, out); - - tmp = fwrite (&(mi2[k].cigarSize), sizeof(int), 1, out); - tmp = fwrite ((mi2[k].cigar), sizeof(char), mi2[k].cigarSize, out); - - tmp = fwrite (&(mi2[k].mdSize), sizeof(int), 1, out); - tmp = fwrite ((mi2[k].md), sizeof(char), mi2[k].mdSize, out); - - - _msf_discordantMapping[i*2]++; - } - //SET THE BEST DISCORDANT - //BEGIN {Farhad Hormozdiari} - if( bestHitMappingInfo[i*2].loc == -1 && - bestHitMappingInfo[i*2+1].loc == -1 && - _msf_readHasConcordantMapping[i] == 0) - { - setPairFullMappingInfo(i, mi1[j], mi2[k]); - _msf_seqList[i*2].hits[0] = 1; - _msf_seqList[i*2+1].hits[0] = 1; - } - else if( bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err >= mi1[j].err + mi2[k].err - && _msf_readHasConcordantMapping[i] == 0) - { - if(bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err == mi1[j].err + mi2[k].err && - findNearest( abs(bestHitMappingInfo[i*2+1].loc - bestHitMappingInfo[i*2].loc), - abs(mi1[j].loc - mi2[k].loc), - meanDistanceMapping - ) == 0 - ) - { - continue; - } - setPairFullMappingInfo(i, mi1[j], mi2[k]); - } - //END {Farhad Hormozdiari} - } - } - } - } - else - { - for (j=0; j<size1; j++) - { - for(k = 0; k < size2; k++) - { - if((mi2[k].loc-mi1[j].loc >= minPairEndedDistance && - mi2[k].loc-mi1[j].loc <= maxPairEndedDistance && - mi1[j].dir > 0 && mi2[k].dir < 0) - || - (mi1[j].loc-mi2[k].loc >= minPairEndedDistance && - mi1[j].loc-mi2[k].loc <= maxPairEndedDistance && - mi1[j].dir < 0 && mi2[k].dir > 0) - ) - { - char *seq; - char *qual; - char d1; - char d2; - int isize; - int proper=0; - // ISIZE CALCULATION - // The distance between outer edges - isize = abs(mi1[j].loc - mi2[k].loc)+SEQ_LENGTH-2; - if (mi1[j].loc - mi2[k].loc > 0) - { - isize *= -1; - } - - d1 = (mi1[j].dir == -1)?1:0; - d2 = (mi2[k].dir == -1)?1:0; - - //SET THE READ HAS CONCORDANT MAPPING - _msf_readHasConcordantMapping[i] = 1; - - if ( d1 ) - { - seq = rseq1; - qual = rqual1; - } - else - { - seq = seq1; - qual = qual1; - } - - if ((mi1[j].loc < mi2[k].loc && !d1 && d2) || - (mi1[j].loc > mi2[k].loc && d1 && !d2) ) - { - proper = 2; - } - else - { - proper = 0; - } - - - _msf_output.POS = mi1[j].loc; - _msf_output.MPOS = mi2[k].loc; - _msf_output.FLAG = 1+proper+16*d1+32*d2+64; - _msf_output.ISIZE = isize; - _msf_output.SEQ = seq, - _msf_output.QUAL = qual; - _msf_output.QNAME = _msf_seqList[i*2].name; - _msf_output.RNAME = _msf_refGenName; - _msf_output.MAPQ = 255; - _msf_output.CIGAR = cigar; - _msf_output.MRNAME = "="; - - _msf_output.optSize = 2; - _msf_output.optFields = _msf_optionalFields; - - _msf_optionalFields[0].tag = "NM"; - _msf_optionalFields[0].type = 'i'; - _msf_optionalFields[0].iVal = mi1[j].err; - - _msf_optionalFields[1].tag = "MD"; - _msf_optionalFields[1].type = 'Z'; - _msf_optionalFields[1].sVal = mi1[j].md; - - if(!bestMode) - output(_msf_output); - - if ( d2 ) - { - seq = rseq2; - qual = rqual2; - } - else - { - seq = seq2; - qual = qual2; - } - - _msf_output.POS = mi2[k].loc; - _msf_output.MPOS = mi1[j].loc; - _msf_output.FLAG = 1+proper+16*d2+32*d1+128; - _msf_output.ISIZE = -isize; - _msf_output.SEQ = seq, - _msf_output.QUAL = qual; - _msf_output.QNAME = _msf_seqList[i*2].name; - _msf_output.RNAME = _msf_refGenName; - _msf_output.MAPQ = 255; - _msf_output.CIGAR = cigar; - _msf_output.MRNAME = "="; - - _msf_output.optSize = 2; - _msf_output.optFields = _msf_optionalFields; - - _msf_optionalFields[0].tag = "NM"; - _msf_optionalFields[0].type = 'i'; - _msf_optionalFields[0].iVal = mi2[k].err;; - - _msf_optionalFields[1].tag = "MD"; - _msf_optionalFields[1].type = 'Z'; - _msf_optionalFields[1].sVal = mi2[k].md; - - if(!bestMode) - output(_msf_output); - //SET THE BEST CONCORDANT - //BEGIN {Farhad Hormozdiari} - if(bestHitMappingInfo[i*2].loc == -1 && bestHitMappingInfo[i*2+1].loc == -1) - { - setPairFullMappingInfo(i, mi1[j], mi2[k]); - } - else - { - if(bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err >= mi1[j].err + mi2[k].err) - { - - if( bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err == mi1[j].err + mi2[k].err && - findNearest(abs(bestHitMappingInfo[i*2+1].loc - bestHitMappingInfo[i*2].loc), - abs(mi2[k].loc - mi1[j].loc), - meanDistanceMapping - ) == 0 ) - { - continue; - } - setPairFullMappingInfo(i, mi1[j], mi2[k]); - } - } - //END {Farhad Hormozdiari} - } - } - } - - } - } - - freeMem(rqual1, 0); - freeMem(rqual2, 0); - - if (pairedEndDiscordantMode) - { - fclose(out); - fclose(out1); - } - - for (i=0; i<_msf_openFiles; i++) - { - fclose(in1[i]); - fclose(in2[i]); - - unlink(fname1[i]); - unlink(fname2[i]); - } - - freeMem(mi1, sizeof(FullMappingInfo)*_msf_maxLSize); - freeMem(mi2, sizeof(FullMappingInfo)*_msf_maxRSize); - - _msf_openFiles = 0; -} - -/**********************************************/ -/**********************************************/ -/**********************************************/ -/**********************************************/ -float str2int(char *str, int index1, int index2) -{ - char tmp[200]; - strncpy(tmp, &str[index1], index2-index1); - tmp[index2-index1] = '\0'; - return atol(tmp); -} - -float calculateScore(int index, char *seq, char *qual,char *md) -{ - int i; - int j; - char *ref; - char *ver; - - ref = _msf_refGen + index-1; - ver = seq; - float score = 1; - - char tmp[200]; - int value = 0; - int end = 0; - int index1 = 0; - int index2 = 0; - - i=0; - while(1) - { - - if(i>=strlen(md)) - break; - - index1 = i; - - while(md[i] >='0' && md[i]<='9') - { - i++; - } - - index2 = i; - - value = str2int(md, index1,index2); - - if(md[i]=='M') - { - for(j=0;j<value;j++) - { - tmp[end]='M'; - end++; - } - } - else if(md[i]=='I') - { - for(j=0;j<value;j++) - { - tmp[end]='I'; - end++; - } - - } - else if(md[i] == 'D') - { - for(j=0;j<value;j++) - { - tmp[end]='D'; - end++; - } - } - i++; - } - - tmp[end] = '\0'; - - j = 0; - - for (i = 0; i < end; i++) - { - if(tmp[i] == 'M') - { - if (*ref != *ver) - { - score *= 0.001 + 1/pow( 10, ((qual[j]-33)/10.0) ); - } - - ref++; - ver++; - j++; - } - else if(tmp[i] == 'I') - { - ver++; - j++; - } - else if(tmp[i] == 'D') - { - ref++; - } - } - - return score; -} - -int matoi(char *str, int start, int end) -{ - int i = 0; - char tmp[200]; - - for(i=0;i < end-start; i++) - tmp[i] = str[start+i]; - tmp[i]='\0'; - - return atoi(tmp); -} - -void convertCigarToMatrix(char *cigar, int cigar_size, char * matrix) -{ - int i = 0; - int j = 0; - - int start = 0; - int size = 0; - - matrix[0] = '\0'; - - while(i < cigar_size) - { - if(cigar[i] >= '0' && cigar[i] <= '9') - { - start = i; - - while(cigar[i] >= '0' && cigar[i] <= '9' && i < cigar_size) - i++; - - int value = matoi(cigar, start, i); - for(j = 0; j < value; j++) - { - if(cigar[i] == 'M') - matrix[size] = 'M'; - else if(cigar[i] == 'D') - matrix[size] ='D'; - else if(cigar[i] == 'I') - matrix[size] = 'I'; - size++; - } - } - i++; - } - matrix[size] = '\0'; -} - - - -void convertMDToMatrix(char *md, int md_size, char * matrix) -{ - int i = 0; - int j = 0; - - int start = 0; - int size = 0; - - matrix[0] = '\0'; - - while(i < md_size) - { - if(md[i] >= '0' && md[i] <= '9') - { - start = i; - - while(md[i] >= '0' && md[i] <= '9' && i < md_size) - i++; - - int value = matoi(md, start, i); - for(j = 0; j < value; j++) - { - matrix[size] = 'M'; - size++; - } - i--; - } - else if(md[i] == '^') - { - matrix[size] = 'D'; - size++; - } - else - { - matrix[size] = md[i]; - size++; - } - //size++; - i++; - } - matrix[size] = '\0'; -} - - -void convertMDCigarToMatrix(char *cigar, int cigar_size, char *md, int md_size, char *matrix) -{ - int i = 0; - int j = 0; - - int size = 0; - - char tmp1[200]; - char tmp2[200]; - convertMDToMatrix(md,md_size, tmp2); - - convertCigarToMatrix(cigar, cigar_size,tmp1); - - - - while(i < strlen(tmp1)) - { - if(tmp1[i]=='M') - { - if(j < strlen(tmp2)) - { - if(tmp2[j]=='M') - { - matrix[size]='M'; - size++; - } - if(tmp2[j]!='M') - { - matrix[size]=tmp2[j]; - size++; - } - } - else - { - matrix[size]='M'; - size++; - } - } - else if(tmp1[i] == 'D') - { - matrix[size]='D'; - size++; - j++; - matrix[size]=tmp2[j]; - size++; - - } - else if(tmp1[i] == 'I') - { - matrix[size]='I'; - size++; - } - - i++; - if(j < strlen(tmp2)) - j++; - } - - if(strlen(tmp1)) - - matrix[size] = '\0'; - -} - -void convertInsertion(char * in_matrix, char * seq, char *out_matrix) -{ - int i = 0; - int j = 0; - int size = 0; - - while( i < strlen(in_matrix)) - { - if(in_matrix[i] == 'M') - { - out_matrix[size] = 'M'; - size++; - j++; - } - else if(in_matrix[i] == 'D') - { - out_matrix[size] = 'D'; - size++; - - i++; - j++; - - out_matrix[size] = seq[j]; - j++; - size++; - } - else if(in_matrix[i] == 'I') - { - out_matrix[size] = 'I'; - size++; - out_matrix[size] = seq[j]; - size++; - j++; - } - else - { - out_matrix[size] = in_matrix[i]; - size++; - j++; - } - i++; - } - out_matrix[size] = '\0'; -} - -/**********************************************/ -void outputPairedEndDiscPP() -{ - char tmp_matrix1[200]; - char tmp_matrix2[200]; - - char matrix1[200]; - char matrix2[200]; - - char cigar1[200]; - char editString1[200]; - - char cigar2[200]; - char editString2[200]; - - char seq1[SEQ_LENGTH+1]; - char qual1[SEQ_LENGTH+1]; - - char seq2[SEQ_LENGTH+1]; - char qual2[SEQ_LENGTH+1]; - - char genName[SEQ_LENGTH]; - char fname1[FILE_NAME_LENGTH]; - char fname2[FILE_NAME_LENGTH]; - char l; - int l_size; - int loc1, loc2; - int err1, err2; - char dir1, dir2; - float sc1, sc2, lsc=0; - int flag = 0; - int rNo,lrNo = -1; - int tmp; - FILE *in, *out; - - sprintf(fname1, "%s__%s__disc", mappingOutputPath, mappingOutput); - sprintf(fname2, "%s%s_DIVET.vh", mappingOutputPath, mappingOutput); - - in = fileOpen(fname1, "r"); - out = fileOpen(fname2, "w"); - - if (in != NULL) - { - flag = fread(&rNo, sizeof(int), 1, in); - } - else - { - flag = 0; - } - - seq1[SEQ_LENGTH] = '\0'; - qual1[SEQ_LENGTH] = '\0'; - - seq2[SEQ_LENGTH] = '\0'; - qual2[SEQ_LENGTH] = '\0'; - - while (flag) - { - tmp = fread(&l, sizeof(char), 1, in); - tmp = fread(genName, sizeof(char), l, in); - genName[(int)l]='\0'; - tmp = fread(&loc1, sizeof(int), 1, in); - tmp = fread(&err1, sizeof(int), 1, in); - tmp = fread(&sc1, sizeof(float), 1, in); - - //tmp = fwrite (&(mi2[k].cigarSize), sizeof(int), 1, out); - - tmp = fread(&l_size, sizeof(int), 1, in); - tmp = fread(cigar1, sizeof(char), l_size, in); - cigar1[(int)l_size]='\0'; - //tmp = fwrite ((mi2[k].cigar), sizeof(char), mi2[k].cigarSize, out); - - //tmp = fwrite (&(mi2[k].mdSize), sizeof(int), 1, out); - tmp = fread(&l_size, sizeof(int), 1, in); - tmp = fread(editString1, sizeof(char), l_size, in); - editString1[(int)l_size]='\0'; - //tmp = fwrite ((mi2[k].md), sizeof(char), mi2[k].mdSize, out); - - tmp = fread(&loc2, sizeof(int), 1, in); - tmp = fread(&err2, sizeof(int), 1, in); - tmp = fread(&sc2, sizeof(float), 1, in); - - tmp = fread(&l_size, sizeof(int), 1, in); - tmp = fread(cigar2, sizeof(char), l_size, in); - cigar2[(int)l_size]='\0'; - - tmp = fread(&l_size, sizeof(int), 1, in); - tmp = fread(editString2, sizeof(char), l_size, in); - editString2[(int)l_size]='\0'; - - convertMDCigarToMatrix(cigar1, strlen(cigar1), editString1, strlen(editString1), tmp_matrix1); - convertMDCigarToMatrix(cigar2, strlen(cigar2), editString2, strlen(editString2), tmp_matrix2); - - - if(_msf_readHasConcordantMapping[rNo] == 0) - { - - dir1 = dir2 = 'F'; - - strncpy(seq1, _msf_seqList[rNo*2].seq, SEQ_LENGTH); - strncpy(seq2, _msf_seqList[rNo*2+1].seq, SEQ_LENGTH); - - if (loc1 < 0) - { - dir1 = 'R'; - loc1 = -loc1; - - strncpy(seq1, _msf_seqList[rNo*2].rseq, SEQ_LENGTH); - } - - if (loc2 < 0) - { - dir2 = 'R'; - loc2 = -loc2; - - strncpy(seq2, _msf_seqList[rNo*2+1].rseq, SEQ_LENGTH); - } - - convertInsertion(tmp_matrix1, seq1, matrix1); - convertInsertion(tmp_matrix2, seq2, matrix2); - - - if (rNo != lrNo) - { - int j; - for (j=0; j<SEQ_LENGTH; j++) - { - lsc += _msf_seqList[rNo*2].qual[j]+_msf_seqList[rNo*2+1].qual[j]; - } - lsc /= 2*SEQ_LENGTH; - lsc -= 33; - lrNo = rNo; - } - - char event = '\0'; - - - if ( dir1 == dir2 ) - { - event = 'V'; - } - else - { - if (loc1 < loc2) - { - - if (dir1 == 'R' && dir2 == 'F') - { - event = 'E'; - - } - else if ( loc2 - loc1 >= maxPairEndedDiscordantDistance ) - { - event = 'D'; - } - else - { - event = 'I'; - } - } - else if (loc2 < loc1) - { - if (dir2 == 'R' && dir1 == 'F') - { - event = 'E'; - } - else if ( loc1 - loc2 >= maxPairEndedDiscordantDistance ) - { - event = 'D'; - } - else - { - event = 'I'; - } - } - } - _msf_seqList[rNo*2].hits[0] = 2; - if(event != 'E') - fprintf(out, "%s\t%s\t%d\t%d\t%c\t%d\t%d\t%c\t%c\t%d\t%0.0f\t%e\n", - _msf_seqList[rNo*2].name, genName, loc1, (loc1+SEQ_LENGTH-1), dir1, - loc2, (loc2+SEQ_LENGTH-1), dir2, event, (err1+err2), lsc, sc1*sc2); - - } - flag = fread(&rNo, sizeof(int), 1, in); - } - - fclose(in); - fclose(out); - - unlink(fname1); -} - -void finalizeOEAReads(char *fileName) -{ - FILE *fp_out1; - FILE * in; - - char genName[SEQ_LENGTH]; - - char fname1[FILE_NAME_LENGTH]; - char fname2[FILE_NAME_LENGTH]; - - char l=0; - int loc1=0; - - int err1; - - char d; - - float sc1=0; - int flag = 0; - int rNo=-1; - int tmp=0; - - int cigarSize = 0; - int mdSize = 0; - - char cigar[SEQ_LENGTH+1]; - char md[SEQ_LENGTH+1]; - - char *seq1, *seq2, *qual1, *qual2; - char *rqual1, *rqual2; - - seq1=NULL; seq2=NULL; qual1=NULL; qual2=NULL; - - rqual1 = getMem(200*sizeof(char)); - rqual2 = getMem(200*sizeof(char)); - - rqual1[0] = '\0'; - rqual2[0] = '\0'; - - sprintf(fname1, "%s%s_OEA", mappingOutputPath, mappingOutput); - - fp_out1 = fileOpen(fname1, "w"); - - in = NULL; - if (pairedEndDiscordantMode){ - sprintf(fname2, "%s__%s__oea", mappingOutputPath, mappingOutput); - - in = fileOpen(fname2, "r"); - } - - - if (in != NULL) - { - flag = fread(&rNo, sizeof(int), 1, in); - } - else - { - flag = 0; - } - - while (flag) - { - cigar[0] = '\0'; - md[0] = '\0'; - - tmp = fread(&l, sizeof(char), 1, in); - tmp = fread(genName, sizeof(char), l, in); - - genName[(int)l]='\0'; - - - tmp = fread(&loc1, sizeof(int), 1, in); - tmp = fread(&err1, sizeof(int), 1, in); - tmp = fread(&sc1, sizeof(float), 1, in); - - tmp = fread (&cigarSize, sizeof(int), 1, in); - tmp = fread (cigar, sizeof(char), cigarSize, in); - - cigar[cigarSize] = '\0'; - - tmp = fread (&mdSize, sizeof(int), 1, in); - tmp = fread (md, sizeof(char), mdSize, in); - md[mdSize] = '\0'; - - d = 1; - - if(loc1 < 0) - { - d = -1; - loc1 *= -1; - - seq1 = _msf_seqList[rNo].rseq; - reverse(_msf_seqList[rNo].qual, rqual1, SEQ_LENGTH); - rqual1[SEQ_LENGTH] = '\0'; - } - else - { - seq1 = _msf_seqList[rNo].seq; - qual1 = _msf_seqList[rNo].qual; - } - - if(rNo % 2 == 0) - { - seq2 = _msf_seqList[rNo+1].seq; - qual2 = _msf_seqList[rNo+1].qual; - } - else - { - seq2 = _msf_seqList[rNo-1].seq; - qual2 = _msf_seqList[rNo-1].qual; - } - - if(_msf_seqHits[rNo] != 0 && _msf_seqHits[(rNo%2==0)?rNo+1:rNo-1] == 0) - { - _msf_output.POS = loc1; - _msf_output.MPOS = 0; - _msf_output.FLAG = (rNo % 2 ==0)? 1+4+32*d+128 : 1+8+16*d+64 ; - _msf_output.ISIZE = 0; - _msf_output.SEQ = seq1; - _msf_output.QUAL = qual1; - _msf_output.QNAME = _msf_seqList[rNo].name; - _msf_output.RNAME = genName; - _msf_output.MAPQ = 255; - _msf_output.CIGAR = cigar; - _msf_output.MRNAME = "="; - - - _msf_output.optSize = 4; - _msf_output.optFields = _msf_optionalFields; - - _msf_optionalFields[0].tag = "NM"; - _msf_optionalFields[0].type = 'i'; - _msf_optionalFields[0].iVal = err1; - - _msf_optionalFields[1].tag = "MD"; - _msf_optionalFields[1].type = 'Z'; - _msf_optionalFields[1].sVal = md; - - - - //for the OEA reads - _msf_optionalFields[2].tag = "NS"; - _msf_optionalFields[2].type = 'Z'; - _msf_optionalFields[2].sVal = seq2; - - - _msf_optionalFields[3].tag = "NQ"; - _msf_optionalFields[3].type = 'Z'; - _msf_optionalFields[3].sVal = qual2; - - outputSAM(fp_out1, _msf_output); - - _msf_seqList[rNo].hits[0] = -1; - _msf_seqList[(rNo%2==0)?rNo+1:rNo-1].hits[0] = -1; - } - flag = fread(&rNo, sizeof(int), 1, in); - } - - freeMem(rqual1, 0); - freeMem(rqual2, 0); - - unlink(fname2); - - fclose(fp_out1); -} - -/* - -void outputOEA(char *fileName1, FILE * fp_out, int readSegment) -{ - int i = 0; - int j = 0; - - char *index; - - int size1 = 0; - - FILE *fp1; - - char geneFileName1[FILE_NAME_LENGTH]; - - char matrix[200]; - char cigar[MAX_CIGAR_SIZE]; - char editString[200]; - - FullMappingInfoLink *miL = getMem(_msf_seqListSize * sizeof(FullMappingInfoLink)); - - if(fileName1 != NULL) - { - - fp1 = fileOpen(fileName1, "r"); - - index = strstr(fileName1, "__"); - strncpy(geneFileName1, index + 2 * sizeof(char), strstr(index + 2, "__") - index - 2); - geneFileName1[strstr(index + 2, "__") - index - 2] = '\0'; - - for(i = 0; i < _msf_seqListSize / 2; i++) - { - fread(&size1, sizeof(int), 1, fp1); - - miL[i].mi = getMem(size1 * sizeof(FullMappingInfo) ); - - miL[i].size = size1; - - for(j = 0; j < size1; j++) - { - fread(&(miL[i].mi[j].loc), sizeof(int), 1, fp1); - - fread (&(miL[i].mi[j].err), sizeof(int), 1, fp1); - - fread (&(miL[i].mi[j].cigarSize), sizeof(int), 1, fp1); - fread ((miL[i].mi[j].cigar), sizeof(char), miL[i].mi[j].cigarSize+1, fp1); - - fread (&(miL[i].mi[j].mdSize), sizeof(int), 1, fp1); - fread ((miL[i].mi[j].md), sizeof(char), miL[i].mi[j].mdSize+1, fp1); - - miL[i].mi[j].dir = 1; - if(miL[i].mi[j].loc < 1) - { - miL[i].mi[j].loc *= -1; - miL[i].mi[j].dir = -1; - } - } - - int tmpSize = (readSegment==0) ? _msf_seqHits[i*2+1] : _msf_seqHits[i*2]; - - if(_msf_seqHits[i*2+readSegment] == 0 && size1 != 0 && _msf_oeaMapping[i*2+(readSegment == 0 ? 1: 0)] <= maxOEAOutput) - { - int d1 = 0; - - char *seq, *qual; - char *seq1, *seq2, *rseq1, *rseq2, *qual1, *qual2; - char rqual1[SEQ_LENGTH+1], rqual2[SEQ_LENGTH+1]; - - rqual1[SEQ_LENGTH] = rqual2[SEQ_LENGTH] = '\0'; - seq1 = _msf_seqList[i*2].seq; - rseq1 = _msf_seqList[i*2].rseq; - qual1 = _msf_seqList[i*2].qual; - reverse(_msf_seqList[i*2].qual, rqual1, SEQ_LENGTH); - - seq2 = _msf_seqList[i*2+1].seq; - rseq2 = _msf_seqList[i*2+1].rseq; - qual2 = _msf_seqList[i*2+1].qual; - reverse(_msf_seqList[i*2+1].qual, rqual2, SEQ_LENGTH); - - for(j = 0; j < size1 && _msf_oeaMapping[i*2+(readSegment == 0 ? 1: 0)] <= maxOEAOutput; j++) - { - d1 = (miL[i].mi[j].dir == -1)?1:0; - - if(readSegment == 0) - { - if ( d1 ) - { - seq = rseq2; - qual = rqual2; - } - else - { - seq = seq2; - qual = qual2; - } - } - else - { - if ( d1 ) - { - seq = rseq1; - qual = rqual1; - } - else - { - seq = seq1; - qual = qual1; - } - } - - _msf_oeaMapping[i*2+(readSegment == 0 ? 1: 0)]++; - - _msf_output.POS = (readSegment==1)?miL[i].mi[j].loc:0; - _msf_output.MPOS = (readSegment==0)?miL[i].mi[j].loc:0; - _msf_output.FLAG = (readSegment==0)? 1+4+32*d1+128 : 1+8+16*d1+64 ; - _msf_output.ISIZE = 0; - _msf_output.SEQ = seq, - _msf_output.QUAL = qual; - _msf_output.QNAME = _msf_seqList[i*2+(readSegment==0?1:0)].name; - _msf_output.RNAME = geneFileName1; - _msf_output.MAPQ = 255; - _msf_output.CIGAR = miL[i].mi[j].cigar; - _msf_output.MRNAME = "="; - //_msf_output.NSEQ = (readSegment == 0)?seq1:seq2; - //_msf_output.NQUAL = (readSegment == 0)?qual1:qual2; - - - _msf_output.optSize = 4; - _msf_output.optFields = _msf_optionalFields; - - _msf_optionalFields[0].tag = "NM"; - _msf_optionalFields[0].type = 'i'; - _msf_optionalFields[0].iVal = miL[i].mi[j].err; - - _msf_optionalFields[1].tag = "MD"; - _msf_optionalFields[1].type = 'Z'; - _msf_optionalFields[1].sVal = miL[i].mi[j].md; - - - - //for the OEA reads - _msf_optionalFields[2].tag = "NS"; - _msf_optionalFields[2].type = 'Z'; - _msf_optionalFields[2].sVal = (readSegment == 0)?seq1:seq2; - - - _msf_optionalFields[3].tag = "NQ"; - _msf_optionalFields[3].type = 'Z'; - _msf_optionalFields[3].sVal = (readSegment == 0)?qual1:qual2; - - outputSAM(fp_out, _msf_output); - - } - } - } - - } - - for(i = 0; i < _msf_seqListSize / 2; i++) - { - freeMem(miL[i].mi, miL[i].size * sizeof(FullMappingInfo)); - } - - freeMem(miL, _msf_seqListSize * sizeof(FullMappingInfoLink)); - - fclose(fp1); -} - -void finalizeOEAReads(char *fileName) -{ - - int i = 0; - int k = 0; - - FILE *fp_out1; - char fname1[200]; - - _msf_oeaMapping = getMem(_msf_seqListSize * sizeof(int)); - for(i = 0; i < _msf_seqListSize; i++) - { - _msf_oeaMapping[i] = 0; - } - - sprintf(fname1, "%s%s_OEA", mappingOutputPath, mappingOutput); - - fp_out1 = fileOpen(fname1, "w"); - for(i = 0; i < _msf_maxFile; i++) - { - for(k = 0; k < _msf_fileCount[i]; k++) - { - outputOEA(_msf_fileName[i][k][1], fp_out1, 0); - }// for k - } //for i - - - for(i = 0; i < _msf_maxFile; i++) - { - for(k = 0; k < _msf_fileCount[i]; k++) - { - outputOEA(_msf_fileName[i][k][0], fp_out1, 1); - }// for k - } //for i - - fclose(fp_out1); - -} -*/ - -void outputTransChromosal(char *fileName1, char *fileName2, FILE * fp_out) -{ - int i = 0; - int j = 0; - int k = 0; - - char *index; - - int size1 = 0; - int size2 = 0; - - FILE *fp1 = NULL; - FILE *fp2 = NULL; - - char geneFileName1[FILE_NAME_LENGTH]; - char geneFileName2[FILE_NAME_LENGTH]; - - FullMappingInfoLink *miL = getMem(_msf_seqListSize * sizeof(FullMappingInfoLink)); - FullMappingInfoLink *miR = getMem(_msf_seqListSize * sizeof(FullMappingInfoLink)); - - - if(fileName1 != NULL && fileName2 != NULL) - { - - fp1 = fileOpen(fileName1, "r"); - fp2 = fileOpen(fileName2, "r"); - - index = strstr(fileName1, "__"); - strncpy(geneFileName1, index + 2 * sizeof(char), strstr(index + 2, "__") - index - 2); - geneFileName1[strstr(index + 2, "__") - index - 2] = '\0'; - - index = strstr(fileName2, "__"); - strncpy(geneFileName2, index + 2 * sizeof(char), strstr(index + 2, "__") - index - 2); - geneFileName2[strstr(index + 2, "__") - index - 2] = '\0'; - - - for(i = 0; i < _msf_seqListSize / 2; i++) - { - fread(&size1, sizeof(int), 1, fp1); - fread(&size2, sizeof(int), 1, fp2); - - miL[i].mi = getMem(size1 * sizeof(FullMappingInfo) ); - miR[i].mi = getMem(size2 * sizeof(FullMappingInfo) ); - - miL[i].size = size1; - miR[i].size = size2; - - for(j = 0; j < size1; j++) - { - fread(&(miL[i].mi[j].loc), sizeof(int), 1, fp1); - - fread (&(miL[i].mi[j].err), sizeof(int), 1, fp1); - - fread (&(miL[i].mi[j].cigarSize), sizeof(int), 1, fp1); - fread ((miL[i].mi[j].cigar), sizeof(char), miL[i].mi[j].cigarSize+1, fp1); - - fread (&(miL[i].mi[j].mdSize), sizeof(int), 1, fp1); - fread ((miL[i].mi[j].md), sizeof(char), miL[i].mi[j].mdSize+1, fp1); - - miL[i].mi[j].dir = 1; - if(miL[i].mi[j].loc < 1) - { - miL[i].mi[j].loc *= -1; - miL[i].mi[j].dir = -1; - } - } - for(k = 0; k < size2; k++) - { - fread(&(miR[i].mi[k].loc), sizeof(int), 1, fp2); - - fread (&(miR[i].mi[k].err), sizeof(int), 1, fp2); - - fread (&(miR[i].mi[k].cigarSize), sizeof(int), 1, fp2); - fread ((miR[i].mi[k].cigar), sizeof(char), miR[i].mi[k].cigarSize+1, fp2); - - fread (&(miR[i].mi[k].mdSize), sizeof(int), 1, fp2); - fread ((miR[i].mi[k].md), sizeof(char), miR[i].mi[k].mdSize+1, fp2); - - miR[i].mi[k].dir = 1; - if(miR[i].mi[k].loc < 1) - { - miR[i].mi[k].loc *= -1; - miR[i].mi[k].dir = -1; - } - } - if(_msf_readHasConcordantMapping[i] == 0 && size1 != 0 && size2 != 0 && (size1 * size2 < MAX_TRANS_CHROMOSAL_OUTPUT)) - { - int d1 = 0; - int d2 = 0; - char *seq, *qual; - char *seq1, *seq2, *rseq1, *rseq2, *qual1, *qual2; - char rqual1[SEQ_LENGTH+1], rqual2[SEQ_LENGTH+1]; - rqual1[SEQ_LENGTH] = rqual2[SEQ_LENGTH] = '\0'; - seq1 = _msf_seqList[i*2].seq; - rseq1 = _msf_seqList[i*2].rseq; - qual1 = _msf_seqList[i*2].qual; - reverse(_msf_seqList[i*2].qual, rqual1, SEQ_LENGTH); - - seq2 = _msf_seqList[i*2+1].seq; - rseq2 = _msf_seqList[i*2+1].rseq; - qual2 = _msf_seqList[i*2+1].qual; - reverse(_msf_seqList[i*2+1].qual, rqual2, SEQ_LENGTH); - - for(j = 0; j < size1; j++) - { - d1 = (miL[i].mi[j].dir == -1)?1:0; - - if ( d1 ) - { - seq = rseq1; - qual = rqual1; - } - else - { - seq = seq1; - qual = qual1; - } - - for(k = 0; k < size2; k++) - { - - d2 = (miR[i].mi[k].dir == -1)?1:0; - - _msf_output.POS = miL[i].mi[j].loc; - _msf_output.MPOS = miR[i].mi[k].loc; - _msf_output.FLAG = 0; - _msf_output.ISIZE = 0; - _msf_output.SEQ = seq, - _msf_output.QUAL = qual; - _msf_output.QNAME = _msf_seqList[i*2].name; - _msf_output.RNAME = geneFileName1; - _msf_output.MAPQ = 255; - _msf_output.CIGAR = miL[i].mi[j].cigar; - _msf_output.MRNAME = "="; - - _msf_output.optSize = 2; - _msf_output.optFields = _msf_optionalFields; - - _msf_optionalFields[0].tag = "NM"; - _msf_optionalFields[0].type = 'i'; - _msf_optionalFields[0].iVal = miL[i].mi[j].err; - - _msf_optionalFields[1].tag = "MD"; - _msf_optionalFields[1].type = 'Z'; - _msf_optionalFields[1].sVal = miL[i].mi[j].md; - - - if ( d2 ) - { - seq = rseq2; - qual = rqual2; - } - else - { - seq = seq2; - qual = qual2; - } - - outputSAM(fp_out, _msf_output); - - - _msf_output.POS = miR[i].mi[k].loc; - _msf_output.MPOS = miL[i].mi[j].loc; - _msf_output.FLAG = 0; - _msf_output.ISIZE = 0; - _msf_output.SEQ = seq, - _msf_output.QUAL = qual; - _msf_output.QNAME = _msf_seqList[i*2+1].name; - _msf_output.RNAME = geneFileName2; - _msf_output.MAPQ = 255; - _msf_output.CIGAR = miR[i].mi[k].cigar; - _msf_output.MRNAME = "="; - - _msf_output.optSize = 2; - _msf_output.optFields = _msf_optionalFields; - - _msf_optionalFields[0].tag = "NM"; - _msf_optionalFields[0].type = 'i'; - _msf_optionalFields[0].iVal = miR[i].mi[k].err; - - _msf_optionalFields[1].tag = "MD"; - _msf_optionalFields[1].type = 'Z'; - _msf_optionalFields[1].sVal = miR[i].mi[k].md; - - outputSAM(fp_out, _msf_output); - - } - } - } - } - - } - - for(i = 0; i < _msf_seqListSize / 2; i++) - { - freeMem(miL[i].mi, miL[i].size * sizeof(FullMappingInfo)); - freeMem(miR[i].mi, miR[i].size * sizeof(FullMappingInfo)); - } - - freeMem(miL, _msf_seqListSize * sizeof(FullMappingInfoLink)); - freeMem(miR, _msf_seqListSize * sizeof(FullMappingInfoLink)); - - fclose(fp1); - fclose(fp2); -} - -/* - if flag is 1 it will output all the possible trans chromsal mapping - otherwise only tmp file will be delete - -*/ - -void outputAllTransChromosal(int flag) -{ - - int i = 0; - int j = 0; - int k = 0; - int l = 0; - - FILE *fp_out = NULL; - char fname1[200]; - - if(flag) - { - fp_out = fileOpen(fname1, "w"); - - sprintf(fname1, "%s%s_TRANSCHROMOSOMAL", mappingOutputPath, mappingOutput); - - // for(i = 0; i < _msf_maxFile; i++) - // { - i = 0; - for(j = i+1; j < _msf_maxFile; j++) - { - if(i != j) - { - for(k = 0; k < _msf_fileCount[i]; k++) - { - for(l = 0; l < _msf_fileCount[j]; l++) - { - outputTransChromosal(_msf_fileName[i][k][0], _msf_fileName[j][l][1], fp_out); - }// for l - }// for k - }// if - }// for j - // } //for i - } - - for(i = 0; i < _msf_maxFile; i++) - { - for(j = 0; j < _msf_fileCount[i]; j++) - { - unlink(_msf_fileName[i][j][0]); - unlink(_msf_fileName[i][j][1]); - } - } - if(flag) - fclose(fp_out); -}
--- a/mrfast-2.1.0.4/MrFAST.h Tue Feb 21 10:29:47 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,209 +0,0 @@ -/* - * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * Redistributions of source code must retain the above copyright notice, this list - * of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or other - * materials provided with the distribution. - * - Neither the names of the University of Washington, Simon Fraser University, - * nor the names of its contributors may be - * used to endorse or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - - -/* - Authors: - Farhad Hormozdiari - Faraz Hach - Can Alkan - Emails: - farhadh AT uw DOT edu - fhach AT cs DOT sfu DOT ca - calkan AT uw DOT edu - */ - - - -#ifndef __MR_FAST__ -#define __MR_FAST__ - -#include "Reads.h" - -#define MAP_CHUNKS 15 -#define MAX_CIGAR_SIZE 100 - - -// Pair is used to pre-processing and making the read index table -typedef struct -{ - int hv; - //char hv[50]; - int readNumber; -} Pair; - -typedef struct -{ - int hv; - unsigned int *seqInfo; -} ReadIndexTable; - - -typedef struct -{ - int loc; - char dir; - int err; - float score; - char md[MAX_CIGAR_SIZE]; - char chr[20]; - char cigar[MAX_CIGAR_SIZE]; - int cigarSize; - int mdSize; -} FullMappingInfo; - -typedef struct -{ - int loc; - char dir; - int err; - float score; - char md[MAX_CIGAR_SIZE]; - char chr[20]; - char cigar[MAX_CIGAR_SIZE]; - int cigarSize; - int mdSize; -} BestFullMappingInfo; - -typedef struct lc -{ - char md[MAP_CHUNKS][MAX_CIGAR_SIZE]; - int mdSize[MAP_CHUNKS]; - - char cigar[MAP_CHUNKS][MAX_CIGAR_SIZE]; - int cigarSize[MAP_CHUNKS]; - - int err[MAP_CHUNKS]; - int loc[MAP_CHUNKS]; - struct lc *next; -} MappingLocations; - -typedef struct inf -{ - int size; - MappingLocations *next; -} MappingInfo; - -typedef struct -{ - FILE * fp; - char name[400]; -} FILE_STRUCT; - -typedef struct -{ - FullMappingInfo *mi; - int size; -} FullMappingInfoLink; - - -typedef struct -{ - char readString[200]; - char ref[200]; - int err; - char matrix[200]; -} extraCaching; - -extern long long verificationCnt; -extern long long mappingCnt; -extern long long mappedSeqCnt; -extern long long completedSeqCnt; - -void initFAST( Read *seqList, - int seqListSize, - int *samplingLocs, - int samplingLocsSize, - char *fileName); - -void initVerifiedLocs(); -void initLookUpTable(); -void initBestMapping(); -void initBestConcordantDiscordant(int readNumber); - -void finalizeFAST(); -void finalizeBestSingleMapping(); -void finalizeBestConcordantDiscordant(); -void finalizeOEAReads(char *); - - -int mapAllSingleEndSeq(); -void mapSingleEndSeq(unsigned int *l1, int s1, int readNumber, int readSegment, int direction); -void mapPairedEndSeqList(unsigned int *l1, int s1, int readNumber, int readSegment, int direction); - -void mapPairedEndSeq(); - -void outputPairedEnd(); -void outputPairedEndDiscPP(); - - -void outputPairFullMappingInfo(FILE *fp, int readNumber); -void setPairFullMappingInfo(int readNumber, FullMappingInfo mi1, FullMappingInfo mi2); -void setFullMappingInfo(int readNumber, int loc, int dir, int err, int score, char *md, char * refName, char *cigar); - -void outputAllTransChromosal(); -void outputTransChromosal(char *fileName1, char *fileName2, FILE * fp_out); - -void generateSNPSAM(char *matrix, int matrixLength, char *outputSNP); -void generateCigar(char *matrix, int matrixLength, char *cigar); -void generateCigarFromMD(char *mistmatch, int mismatchLength, char *cigar); - -int msfHashVal(char *seq); - -int backwardEditDistance2SSE2(char *a, int lena, char *b,int lenb); -int forwardEditDistance2SSE2(char *a, int lena, char *b,int lenb); - -int forwardEditDistanceSSE2G(char *a, int lena, char *b,int lenb); -int backwardEditDistanceSSE2G(char *a, int lena, char *b,int lenb); - -int forwardEditDistance4SSE2(char *a, int lena, char *b,int lenb); -int backwardEditDistance4SSE2(char *a, int lena, char *b,int lenb); - -int forwardEditDistanceSSE2Extention(char *a, int lena, char *b,int lenb); -int backwardEditDistanceSSE2Extention(char *a, int lena, char *b,int lenb); - - -/***********************************/ - -int editDistance(int refIndex, char *seq, int seqLength, char *matrix); - -int verifySingleEndEditDistance(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, - char *matrix, int *map_location, short *seqHashValue); - -int verifySingleEndEditDistance2(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, - char *matrix, int *map_location, short *seqHashValue); - -int verifySingleEndEditDistance4(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, - char *matrix, int *map_location, short *seqHashValue); - -int verifySingleEndEditDistanceExtention(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, - char *matrix, int *map_location, short *seqHashValue); - -#endif
--- a/mrfast-2.1.0.4/Output.c Tue Feb 21 10:29:47 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,231 +0,0 @@ -/* - * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * Redistributions of source code must retain the above copyright notice, this list - * of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or other - * materials provided with the distribution. - * - Neither the names of the University of Washington, Simon Fraser University, - * nor the names of its contributors may be - * used to endorse or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - Authors: - Farhad Hormozdiari - Faraz Hach - Can Alkan - Emails: - farhadh AT uw DOT edu - fhach AT cs DOT sfu DOT ca - calkan AT uw DOT edu -*/ - - -#include <stdio.h> -#include <stdlib.h> -#include <zlib.h> -#include <string.h> -#include "Common.h" -#include "Output.h" - -FILE *_out_fp; -gzFile _out_gzfp; - -char buffer[300000]; -int bufferSize = 0; - - -void finalizeGZOutput() -{ - gzclose(_out_gzfp); -} - -void finalizeTXOutput() -{ - fclose(_out_fp); -} - - -void gzOutputQ(SAM map) -{ - gzprintf(_out_gzfp, "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", - map.QNAME, - map.FLAG, - map.RNAME, - map.POS, - map.MAPQ, - map.CIGAR, - map.MRNAME, - map.MPOS, - map.ISIZE, - map.SEQ, - map.QUAL); - - int i; - - for ( i = 0; i < map.optSize; i++) - { - switch (map.optFields[i].type) - { - case 'A': - gzprintf(_out_gzfp, "\t%s:%c:%c", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].cVal); - break; - case 'i': - gzprintf(_out_gzfp, "\t%s:%c:%d", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].iVal); - break; - case 'f': - gzprintf(_out_gzfp, "\t%s:%c:%f", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].fVal); - break; - case 'Z': - case 'H': - gzprintf(_out_gzfp, "\t%s:%c:%s", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].sVal); - break; - } - } - gzprintf(_out_gzfp, "\n"); -} - -void outputSAM(FILE *fp, SAM map) -{ - fprintf(fp, "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", - map.QNAME, - map.FLAG, - map.RNAME, - map.POS, - map.MAPQ, - map.CIGAR, - map.MRNAME, - map.MPOS, - map.ISIZE, - map.SEQ, - map.QUAL); - - - int i; - - for ( i = 0; i < map.optSize; i++) - { - switch (map.optFields[i].type) - { - case 'A': - fprintf(fp, "\t%s:%c:%c", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].cVal); - break; - case 'i': - fprintf(fp, "\t%s:%c:%d", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].iVal); - break; - case 'f': - fprintf(fp, "\t%s:%c:%f", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].fVal); - break; - case 'Z': - case 'H': - fprintf(fp, "\t%s:%c:%s", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].sVal); - break; - } - } - - fprintf(fp, "\n"); - -} - -void outputQ(SAM map) -{ - - fprintf(_out_fp, "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", - map.QNAME, - map.FLAG, - map.RNAME, - map.POS, - map.MAPQ, - map.CIGAR, - map.MRNAME, - map.MPOS, - map.ISIZE, - map.SEQ, - map.QUAL); - - - int i; - - for ( i = 0; i < map.optSize; i++) - { - switch (map.optFields[i].type) - { - case 'A': - fprintf(_out_fp, "\t%s:%c:%c", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].cVal); - break; - case 'i': - fprintf(_out_fp, "\t%s:%c:%d", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].iVal); - break; - case 'f': - fprintf(_out_fp, "\t%s:%c:%f", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].fVal); - break; - case 'Z': - case 'H': - fprintf(_out_fp, "\t%s:%c:%s", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].sVal); - break; - } - } - - fprintf(_out_fp, "\n"); -} - -int initOutput ( char *fileName, int compressed) -{ - if (compressed) - { - char newFileName[strlen(fileName)+4]; - sprintf(newFileName, "%s.gz", fileName); - - _out_gzfp = fileOpenGZ(newFileName, "w1f"); - if (_out_gzfp == Z_NULL) - { - return 0; - } - - finalizeOutput = &finalizeGZOutput; - - output = &gzOutputQ; - } - else - { - _out_fp = fileOpen(fileName, "w"); - if (_out_fp == NULL) - { - return 0; - } - - finalizeOutput = &finalizeTXOutput; - output = &outputQ; - } - buffer[0] = '\0'; - return 1; -} - -FILE* getOutputFILE() -{ - if(_out_fp != NULL) - return _out_fp; - else if(_out_gzfp != NULL) - return _out_gzfp; - else - return NULL; -}
--- a/mrfast-2.1.0.4/Output.h Tue Feb 21 10:29:47 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,85 +0,0 @@ -/* - * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * Redistributions of source code must retain the above copyright notice, this list - * of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or other - * materials provided with the distribution. - * - Neither the names of the University of Washington, Simon Fraser University, - * nor the names of its contributors may be - * used to endorse or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - Authors: - Farhad Hormozdiari - Faraz Hach - Can Alkan - Emails: - farhadh AT uw DOT edu - fhach AT cs DOT sfu DOT ca - calkan AT uw DOT edu - */ - - - -#ifndef __OUTPUT__ -#define __OUTPUT__ - -#define FORWARD 0 -#define REVERSE 1 - -typedef struct -{ - char *tag; - char type; - char cVal; - int iVal; - float fVal; - char *sVal; -} OPT_FIELDS; - -typedef struct -{ - char *QNAME; - short FLAG; - char *RNAME; - int POS; - unsigned char MAPQ; - char *CIGAR; - char *MRNAME; - int MPOS; - int ISIZE; - char *SEQ; - char *QUAL; - - int optSize; - OPT_FIELDS *optFields; -} SAM; - -int initOutput(char *fileName, int compressed); -void (*finalizeOutput)(); -void (*output)(SAM map); -void outputSAM(FILE *fp, SAM map); - -FILE * getOutputFILE(); - -#endif
--- a/mrfast-2.1.0.4/Reads.c Tue Feb 21 10:29:47 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,773 +0,0 @@ -/* - * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * Redistributions of source code must retain the above copyright notice, this list - * of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or other - * materials provided with the distribution. - * - Neither the names of the University of Washington, Simon Fraser University, - * nor the names of its contributors may be - * used to endorse or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - Authors: - Farhad Hormozdiari - Faraz Hach - Can Alkan - Emails: - farhadh AT uw DOT edu - fhach AT cs DOT sfu DOT ca - calkan AT uw DOT edu - */ - - - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <ctype.h> -#include <zlib.h> -#include "Common.h" -#include "Reads.h" - -#define CHARCODE(a) (a=='A' ? 0 : (a=='C' ? 1 : (a=='G' ? 2 : (a=='T' ? 3 : 4)))) - -FILE *_r_fp1; -FILE *_r_fp2; -gzFile _r_gzfp1; -gzFile _r_gzfp2; -Read *_r_seq; -int _r_seqCnt; -int *_r_samplingLocs; - -/**********************************************/ -char *(*readFirstSeq)(char *); -char *(*readSecondSeq)(char *); -/**********************************************/ -char *readFirstSeqTXT( char *seq ) -{ - return fgets(seq, SEQ_MAX_LENGTH, _r_fp1); -} - -/**********************************************/ -char *readSecondSeqTXT( char *seq ) -{ - return fgets(seq, SEQ_MAX_LENGTH, _r_fp2); -} -/**********************************************/ -char *readFirstSeqGZ( char *seq ) -{ - return gzgets(_r_gzfp1, seq, SEQ_MAX_LENGTH); -} - -/**********************************************/ -char *readSecondSeqGZ( char *seq ) -{ - return gzgets(_r_gzfp2, seq, SEQ_MAX_LENGTH); -} -/**********************************************/ -int toCompareRead(const void * elem1, const void * elem2) -{ - return strcmp(((Read *)elem1)->seq, ((Read *)elem2)->seq); -} -/**********************************************/ -int readAllReads(char *fileName1, - char *fileName2, - int compressed, - unsigned char *fastq, - unsigned char pairedEnd, - Read **seqList, - unsigned int *seqListSize) -{ - double startTime=getTime(); - - char seq1[SEQ_MAX_LENGTH]; - char rseq1[SEQ_MAX_LENGTH]; - char name1[SEQ_MAX_LENGTH]; - char qual1[SEQ_MAX_LENGTH]; - char seq2[SEQ_MAX_LENGTH]; - char rseq2[SEQ_MAX_LENGTH]; - char name2[SEQ_MAX_LENGTH]; - char qual2[SEQ_MAX_LENGTH]; - - char dummy[SEQ_MAX_LENGTH]; - char ch; - int err1, err2; - int nCnt; - int discarded = 0; - int seqCnt = 0; - int maxCnt = 0; - int i; - Read *list = NULL; - - int clipped = 0; - - if (!compressed) - { - _r_fp1 = fileOpen( fileName1, "r"); - - if (_r_fp1 == NULL) - { - return 0; - } - - ch = fgetc(_r_fp1); - - if ( pairedEnd && fileName2 != NULL ) - { - _r_fp2 = fileOpen ( fileName2, "r" ); - if (_r_fp2 == NULL) - { - return 0; - } - } - else - { - _r_fp2 = _r_fp1; - } - - readFirstSeq = &readFirstSeqTXT; - readSecondSeq = &readSecondSeqTXT; - } - else - { - - _r_gzfp1 = fileOpenGZ (fileName1, "r"); - - if (_r_gzfp1 == NULL) - { - return 0; - } - - ch = gzgetc(_r_gzfp1); - - if ( pairedEnd && fileName2 != NULL ) - { - _r_gzfp2 = fileOpenGZ ( fileName2, "r" ); - if (_r_gzfp2 == NULL) - { - return 0; - } - } - else - { - _r_gzfp2 = _r_gzfp1; - } - - readFirstSeq = &readFirstSeqGZ; - readSecondSeq = &readSecondSeqGZ; - } - - if (ch == '>') - *fastq = 0; - else - *fastq = 1; - - // Counting the number of lines in the file - while (readFirstSeq(dummy)) maxCnt++; - - if (!compressed) - { - rewind(_r_fp1); - } - else - { - gzrewind(_r_gzfp1); - } - - // Calculating the Maximum # of sequences - if (*fastq) - { - maxCnt /= 4; - } - else - { - maxCnt /= 2; - } - - if (pairedEnd && fileName2 != NULL ) - maxCnt *= 2; - - list = getMem(sizeof(Read)*maxCnt); - - while( readFirstSeq(name1) ) - { - err1 = 0; - err2 = 0; - readFirstSeq(seq1); - name1[strlen(name1)-1] = '\0'; - - if ( *fastq ) - { - readFirstSeq(dummy); - readFirstSeq(qual1); - qual1[strlen(qual1)-1] = '\0'; - } - else - { - sprintf(qual1, "*"); - } - - // Cropping - if (cropSize > 0) - { - seq1[cropSize] = '\0'; - if ( *fastq ) - qual1[cropSize] = '\0'; - } - - - nCnt = 0; - for (i=0; i<strlen(seq1); i++) - { - seq1[i] = toupper (seq1[i]); - if (seq1[i] == 'N') - { - nCnt++; - } - else if (isspace(seq1[i])) - { - - seq1[i] = '\0'; - break; - } - } - - if (nCnt > errThreshold) - { - err1 = 1; - } - - // Reading the second seq of pair-ends - if (pairedEnd) - { - readSecondSeq(name2); - readSecondSeq(seq2); - name2[strlen(name2)-1] = '\0'; - - - if ( *fastq ) - { - readSecondSeq(dummy); - readSecondSeq(qual2); - - qual2[strlen(qual2)-1] = '\0'; - } - else - { - sprintf(qual2, "*"); - } - - - // Cropping - if (cropSize > 0) - { - seq2[cropSize] = '\0'; - if ( *fastq ) - qual2[cropSize] = '\0'; - } - - - nCnt = 0; - for (i=0; i<strlen(seq2); i++) - { - seq2[i] = toupper (seq2[i]); - if (seq2[i] == 'N') - { - nCnt++; - - } - else if (isspace(seq2[i])) - { - seq2[i] = '\0'; - } - } - if (nCnt > errThreshold) - { - err2 = 1; - } - - - if (strlen(seq1) < strlen(seq2)) { - seq2[strlen(seq1)] = '\0'; - if ( *fastq ) - qual2[strlen(seq1)] = '\0'; - if (!clipped) clipped = 2; - } - else if (strlen(seq1) > strlen(seq2)){ - seq1[strlen(seq2)] = '\0'; - if ( *fastq ) - qual1[strlen(seq2)] = '\0'; - if (!clipped) clipped = 1; - } - - if (clipped == 1 || clipped == 2){ - fprintf(stdout, "[PE mode warning] Sequence lengths are different, read #%d is clipped to match.\n", clipped); - clipped = 3; - } - - - - } - - if (!pairedEnd && !err1) - { - int _mtmp = strlen(seq1); - list[seqCnt].hits = getMem (1+3*_mtmp+3+strlen(name1)+1); - list[seqCnt].seq = list[seqCnt].hits + 1; - list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; - list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; - list[seqCnt].name = list[seqCnt].qual + _mtmp+1; - - list[seqCnt].hashValue = getMem(sizeof(short)*_mtmp); - list[seqCnt].rhashValue = getMem(sizeof(short)*_mtmp); - - list[seqCnt].readNumber = seqCnt; - - reverseComplete(seq1, rseq1, _mtmp); - rseq1[_mtmp] = '\0'; - int i; - - list[seqCnt].hits[0] = 0; - - for (i=0; i<=_mtmp; i++) - { - list[seqCnt].seq[i] = seq1[i]; - list[seqCnt].rseq[i] = rseq1[i] ; - list[seqCnt].qual[i] = qual1[i]; - } - - - //MAKE HASH VALUE - short code = 0; - - for(i=0; i < 4; i++) - code = code * 5 + CHARCODE(list[seqCnt].seq[i]); - list[seqCnt].hashValue[0] = code; - - - for(i = 1; i < _mtmp-3; i++) - { - list[seqCnt].hashValue[i] = (list[seqCnt].hashValue[i-1] - 125 * CHARCODE(seq1[i-1])) * 5 + CHARCODE(seq1[i+3]); - } - - - code = 0; - for(i=0; i < 4; i++) - code = code * 5 + CHARCODE(list[seqCnt].rseq[i]); - list[seqCnt].rhashValue[0] = code; - - - for(i = 1; i < _mtmp-3; i++) - { - list[seqCnt].rhashValue[i] = (list[seqCnt].rhashValue[i-1] - 125 * CHARCODE(rseq1[i-1])) * 5 + CHARCODE(rseq1[i+3]); - } - - int j = 0; - int tmpSize = _mtmp / (errThreshold+1); - - list[seqCnt].hashValSampleSize = getMem(sizeof(int)*(errThreshold+1)); - for(i=0; i < errThreshold+1; i++) - { - code = 0; - for(j = 0; j < tmpSize; j++) - { - code = code * 5 + CHARCODE(list[seqCnt].seq[i*tmpSize+j]); - } - list[seqCnt].hashValSampleSize[i] = code; - } - - sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0'); - - seqCnt++; - - } - else if (pairedEnd && !err1 && !err2) - { - - // Naming Conventions X/1, X/2 OR X - int tmplen = strlen(name1); - if (strcmp(name1, name2) != 0) - { - tmplen = strlen(name1)-2; - } - - //first seq - int _mtmp = strlen(seq1); - list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1); - list[seqCnt].seq = list[seqCnt].hits + 1; - list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; - list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; - list[seqCnt].name = list[seqCnt].qual + _mtmp+1; - - list[seqCnt].hashValue = getMem(sizeof(short)*_mtmp); - list[seqCnt].rhashValue = getMem(sizeof(short)*_mtmp); - list[seqCnt].readNumber = seqCnt; - - reverseComplete(seq1, rseq1, _mtmp); - rseq1[_mtmp] = '\0'; - int i; - - list[seqCnt].hits[0] = 0; - - for (i=0; i<=_mtmp; i++) - { - list[seqCnt].seq[i] = seq1[i]; - list[seqCnt].rseq[i] = rseq1[i] ; - list[seqCnt].qual[i] = qual1[i]; - } - - - name1[tmplen]='\0'; - - //MAKE HASH VALUE - short code = 0; - - for(i=0; i < 4; i++) - code = code * 5 + CHARCODE(list[seqCnt].seq[i]); - list[seqCnt].hashValue[0] = code; - - - for(i = 1; i < _mtmp-3; i++) - { - list[seqCnt].hashValue[i] = (list[seqCnt].hashValue[i-1] - 125 * CHARCODE(seq1[i-1])) * 5 + CHARCODE(seq1[i+3]); - } - - - code = 0; - for(i=0; i < 4; i++) - code = code * 5 + CHARCODE(list[seqCnt].rseq[i]); - list[seqCnt].rhashValue[0] = code; - - - for(i = 1; i < _mtmp-3; i++) - { - list[seqCnt].rhashValue[i] = (list[seqCnt].rhashValue[i-1] - 125 * CHARCODE(rseq1[i-1])) * 5 + CHARCODE(rseq1[i+3]); - } - - int j = 0; - int tmpSize = _mtmp / (errThreshold+1); - - list[seqCnt].hashValSampleSize = getMem(sizeof(int)*(errThreshold+1)); - for(i=0; i < errThreshold+1; i++) - { - code = 0; - for(j = 0; j < tmpSize; j++) - { - code = code * 5 + CHARCODE(list[seqCnt].seq[i*tmpSize+j]); - } - list[seqCnt].hashValSampleSize[i] = code; - } - - sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0'); - - - seqCnt++; - - //second seq - list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1); - list[seqCnt].seq = list[seqCnt].hits + 1; - list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; - list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; - list[seqCnt].name = list[seqCnt].qual + _mtmp+1; - - list[seqCnt].hashValue = getMem(sizeof(short)*_mtmp); - list[seqCnt].rhashValue = getMem(sizeof(short)*_mtmp); - list[seqCnt].readNumber = seqCnt; - - reverseComplete(seq2, rseq2, _mtmp); - rseq2[_mtmp] = '\0'; - - list[seqCnt].hits[0] = 0; - - for (i=0; i<=_mtmp; i++) - { - list[seqCnt].seq[i] = seq2[i]; - list[seqCnt].rseq[i] = rseq2[i] ; - list[seqCnt].qual[i] = qual2[i]; - } - - - name2[tmplen]='\0'; - - //MAKE HASH VALUE - code = 0; - - for(i=0; i < 4; i++) - code = code * 5 + CHARCODE(list[seqCnt].seq[i]); - list[seqCnt].hashValue[0] = code; - - - for(i = 1; i < _mtmp-3; i++) - { - list[seqCnt].hashValue[i] = (list[seqCnt].hashValue[i-1] - 125 * CHARCODE(seq1[i-1])) * 5 + CHARCODE(seq1[i+3]); - } - - - code = 0; - for(i=0; i < 4; i++) - code = code * 5 + CHARCODE(list[seqCnt].rseq[i]); - list[seqCnt].rhashValue[0] = code; - - - for(i = 1; i < _mtmp-3; i++) - { - list[seqCnt].rhashValue[i] = (list[seqCnt].rhashValue[i-1] - 125 * CHARCODE(rseq1[i-1])) * 5 + CHARCODE(rseq1[i+3]); - } - - j = 0; - tmpSize = _mtmp / (errThreshold+1); - - list[seqCnt].hashValSampleSize = getMem(sizeof(int)*(errThreshold+1)); - for(i=0; i < errThreshold+1; i++) - { - code = 0; - for(j = 0; j < tmpSize; j++) - { - code = code * 5 + CHARCODE(list[seqCnt].seq[i*tmpSize+j]); - } - list[seqCnt].hashValSampleSize[i] = code; - } - - sprintf(list[seqCnt].name,"%s%c", ((char*)name2)+1,'\0'); - - seqCnt++; - - } - else - { - discarded++; - } - } - - if (seqCnt > 0) - { - SEQ_LENGTH = strlen(list[0].seq); - } - else - { - fprintf(stdout, "ERROR: No reads can be found for mapping\n"); - return 0; - } - - - // Closing Files - if (!compressed) - { - fclose(_r_fp1); - if ( pairedEnd && fileName2 != NULL ) - { - fclose(_r_fp2); - } - } - else - { - gzclose(_r_gzfp1); - if ( pairedEnd && fileName2 != NULL) - { - gzclose(_r_fp2); - } - } - - //qsort(list, seqCnt, sizeof(Read), toCompareRead); - - adjustQual(list, seqCnt); - - *seqList = list; - *seqListSize = seqCnt; - - - _r_seq = list; - _r_seqCnt = seqCnt; - - if ( pairedEnd ) discarded *= 2; - - if (seqCnt>1) - fprintf(stdout, "%d sequences are read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime()-startTime), discarded, getMemUsage()); - else - fprintf(stdout, "%d sequence is read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime()-startTime), discarded, getMemUsage()); - - - - return 1; -} -/**********************************************/ -void loadSamplingLocations(int **samplingLocs, int * samplingLocsSize) -{ - int i; - int samLocsSize = errThreshold + 1; - int *samLocs = getMem(sizeof(int)*samLocsSize); - - for (i=0; i<samLocsSize; i++) - { - samLocs[i] = (SEQ_LENGTH / samLocsSize) *i; - if ( samLocs[i] + WINDOW_SIZE > SEQ_LENGTH) - samLocs[i] = SEQ_LENGTH - WINDOW_SIZE; - } - - // Outputing the sampling locations - - /* - - int j; - for (i=0; i<SEQ_LENGTH; i++) - { - fprintf(stdout, "-"); - } - fprintf(stdout, "\n"); - - for ( i=0; i<samLocsSize; i++ ) - { - for ( j=0; j<samLocs[i]; j++ ) - { - fprintf(stdout," "); - } - for (j=0; j<WINDOW_SIZE; j++) - { - fprintf(stdout,"+"); - } - fprintf(stdout, "\n"); - fflush(stdout); - } - - - for ( i=0; i<SEQ_LENGTH; i++ ) - { - fprintf(stdout, "-"); - } - fprintf(stdout, "\n"); - - */ - - *samplingLocs = samLocs; - *samplingLocsSize = samLocsSize; - _r_samplingLocs = samLocs; -} - -void finalizeReads(char *fileName) -{ - FILE *fp1=NULL; - - if (fileName != NULL) - { - fp1 = fileOpen(fileName, "w"); - } - if (pairedEndMode) - _r_seqCnt /=2; - - int i=0; - for (i = 0; i < _r_seqCnt; i++) - { - if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] == 0 && strcmp(_r_seq[2*i].qual,"*")!=0) - { - fprintf(fp1,"@%s/1\n%s\n+\n%s\n@%s/2\n%s\n+\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].qual, _r_seq[i*2].name, _r_seq[i*2+1].seq, _r_seq[i*2+1].qual); - } - else if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] == 0) - { - fprintf(fp1, ">%s/1\n%s\n>%s/2\n%shits=%d\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq, _r_seq[2*i+1].hits[0]); - } - else if (!pairedEndMode && _r_seq[i].hits[0] == 0 && strcmp(_r_seq[i].qual, "*")!=0) - { - fprintf(fp1,"@%s\n%s\n+\n%s\n", _r_seq[i].name, _r_seq[i].seq, _r_seq[i].qual); - } - else if (!pairedEndMode && _r_seq[i].hits[0] == 0) - { - fprintf(fp1,">%s\n%s\n", _r_seq[i].name, _r_seq[i].seq); - } - } - - fclose(fp1); - if (pairedEndMode) - _r_seqCnt *= 2; - - for (i = 0; i < _r_seqCnt; i++) - { - freeMem(_r_seq[i].hits,0); - } - - - freeMem(_r_seq,0); - freeMem(_r_samplingLocs,0); -} - -void adjustQual(Read *list, int seqCnt){ - /* This function will automatically determine the phred_offset and readjust quality values if needed */ -} - - - -/*void finalizeOEAReads(char *fileName) -{ - FILE *fp1=NULL; - FILE *fp2=NULL; - - //printf("OEA%s\n", fileName); - char fileNameOEA1[200]; - char fileNameOEA2[200]; - sprintf(fileNameOEA1, "%s_OEA1", fileName); - sprintf(fileNameOEA2, "%s_OEA2", fileName); - - - if (fileName != NULL) - { - fp1 = fileOpen(fileNameOEA1, "w"); - fp2 = fileOpen(fileNameOEA2, "w"); - } - if (pairedEndMode) - _r_seqCnt /=2; - - int i=0; - printf("%d\n", _r_seqCnt); - for (i = 0; i < _r_seqCnt; i++) - { - if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] != 0 && strcmp(_r_seq[2*i].qual, "*")==0) - { - fprintf(fp1,">%s/1\n%s\n>%s/2\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq); - } - else if (pairedEndMode && _r_seq[2*i].hits[0] != 0 && _r_seq[2*i+1].hits[0] == 0 && strcmp(_r_seq[2*i].qual, "*")==0) - { - fprintf(fp2, ">%s/1\n%s\n>%s/2\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq); - } - else if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] != 0 && strcmp(_r_seq[2*i].qual, "*")!=0) - { - fprintf(fp1,"@%s\n%s\n+\n%s\n@%s\n%s\n+\n%s\n", _r_seq[2*i].name, - _r_seq[2*i].seq, - _r_seq[2*i].qual, - _r_seq[2*i+1].name, - _r_seq[2*i+1].seq, - _r_seq[2*i+1].qual); - } - else if ( pairedEndMode && _r_seq[2*i].hits[0] != 0 && _r_seq[2*i+1].hits[0] == 0 && strcmp(_r_seq[2*i].qual, "*")!=0 ) - { - fprintf(fp2,"@%s\n%s\n+\n%s\n@%s\n%s\n+\n%s\n", _r_seq[2*i].name, - _r_seq[2*i].seq, - _r_seq[2*i].qual, - _r_seq[2*i+1].name, - _r_seq[2*i+1].seq, - _r_seq[2*i+1].qual); - } - } - - fclose(fp1); - fclose(fp2); - -}*/
--- a/mrfast-2.1.0.4/Reads.h Tue Feb 21 10:29:47 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,65 +0,0 @@ -/* - * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * Redistributions of source code must retain the above copyright notice, this list - * of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or other - * materials provided with the distribution. - * - Neither the names of the University of Washington, Simon Fraser University, - * nor the names of its contributors may be - * used to endorse or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - - -/* - Authors: - Farhad Hormozdiari - Faraz Hach - Can Alkan - Emails: - farhadh AT uw DOT edu - fhach AT cs DOT sfu DOT ca - calkan AT uw DOT edu - */ - - -#ifndef __READ__ -#define __READ__ - -typedef struct -{ - char *name; - char *seq; - char *rseq; - char *qual; - char *hits; - int readNumber; - short *hashValue; - short *rhashValue; - int *hashValSampleSize; -} Read; - -int readAllReads(char *fileName1, char *fileName2, int compressed, unsigned char *fastq, unsigned char pe, Read **seqList, unsigned int *seqListSize); -void loadSamplingLocations(int **samplingLocs, int *samplingLocsSize); -void finalizeReads(char *fileName); -void adjustQual(Read *list, int seqCnt); - -#endif
--- a/mrfast-2.1.0.4/RefGenome.c Tue Feb 21 10:29:47 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,157 +0,0 @@ -/* - * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * Redistributions of source code must retain the above copyright notice, this list - * of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or other - * materials provided with the distribution. - * - Neither the names of the University of Washington, Simon Fraser University, - * nor the names of its contributors may be - * used to endorse or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - Authors: - Farhad Hormozdiari - Faraz Hach - Can Alkan - Emails: - farhadh AT uw DOT edu - fhach AT cs DOT sfu DOT ca - calkan AT uw DOT edu - */ - - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <ctype.h> -#include "Common.h" -#include "RefGenome.h" - -FILE *_rg_fp; -char *_rg_gen; -char *_rg_name; -int _rg_offset; -int _rg_contGen; - -/**********************************************/ -int initLoadingRefGenome(char *fileName) -{ - char ch; - _rg_fp = fileOpen (fileName, "r"); - if (fscanf(_rg_fp, "%c", &ch)) - { - if (ch == '>') - { - _rg_contGen = 0; - _rg_offset = 0; - _rg_gen = getMem(CONTIG_MAX_SIZE); - _rg_name = getMem(CONTIG_NAME_SIZE); - return 1; - } - } - return 0; -} -/**********************************************/ -void finalizeLoadingRefGenome() -{ - freeMem(_rg_gen, CONTIG_MAX_SIZE); - freeMem(_rg_name, CONTIG_NAME_SIZE); - fclose(_rg_fp); -} -/**********************************************/ -int loadRefGenome(char **refGen, char **refGenName, int *refGenOff) -{ - char ch; - int i; - int returnVal = 0; - int actualSize=0; - int size; - char *tmp; - - // New Conting - if (!_rg_contGen) - { - size = 0; - tmp = fgets(_rg_name, SEQ_MAX_LENGTH, _rg_fp); - } - else - { - size=strlen(_rg_gen); - for( i = 0 ; i < CONTIG_OVERLAP ; i++ ) - { - - _rg_gen[i] = _rg_gen[size-CONTIG_OVERLAP+i]; - if (_rg_gen[i] != 'N') - actualSize++; - } - size = CONTIG_OVERLAP; - } - while( fscanf(_rg_fp, "%c", &ch) > 0 ) - { - if (ch == '>') - { - _rg_contGen = 0; - returnVal = 1; - break; - } - else if (!isspace(ch)) - { - ch = toupper(ch); - _rg_gen[size++] = ch; - if (ch != 'N') - { - actualSize++; - } - if (actualSize == CONTIG_SIZE || size == CONTIG_MAX_SIZE) - { - _rg_contGen = 1; - returnVal=1; - break; - } - } - - } - - _rg_gen[size] = '\0'; - for (i=strlen(_rg_name)-1; i >= 0; i--) - if (!isspace(_rg_name[i])) - break; - _rg_name[i+1] = '\0'; - - *refGenOff = _rg_offset; - *refGenName = _rg_name; - *refGen = _rg_gen; - - if (_rg_contGen == 1) - { - _rg_offset += size-CONTIG_OVERLAP; - } - else - { - _rg_offset = 0; - } - - - return returnVal; -} -/**********************************************/
--- a/mrfast-2.1.0.4/RefGenome.h Tue Feb 21 10:29:47 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,51 +0,0 @@ -/* - * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * Redistributions of source code must retain the above copyright notice, this list - * of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or other - * materials provided with the distribution. - * - Neither the names of the University of Washington, Simon Fraser University, - * nor the names of its contributors may be - * used to endorse or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - Authors: - Farhad Hormozdiari - Faraz Hach - Can Alkan - Emails: - farhadh AT uw DOT edu - fhach AT cs DOT sfu DOT ca - calkan AT uw DOT edu - */ - - - -#ifndef _REF_GENOME_ -#define _REF_GENOME_ - -int initLoadingRefGenome(char *fileName); -void finalizeLoadingRefGenome(); -int loadRefGenome(char **refGen, char **refGenName, int *refGenOff); - -#endif
--- a/mrfast-2.1.0.4/baseFAST.c Tue Feb 21 10:29:47 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,376 +0,0 @@ -/* - * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * Redistributions of source code must retain the above copyright notice, this list - * of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or other - * materials provided with the distribution. - * - Neither the names of the University of Washington, Simon Fraser University, - * nor the names of its contributors may be - * used to endorse or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - Authors: - Farhad Hormozdiari - Faraz Hach - Can Alkan - Emails: - farhadh AT uw DOT edu - fhach AT cs DOT sfu DOT ca - calkan AT uw DOT edu -*/ - - - - - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <math.h> -#include "Common.h" -#include "CommandLineParser.h" -#include "Reads.h" -#include "Output.h" -#include "HashTable.h" -#include "MrFAST.h" - -char *versionNumber = "2.1"; // Current Version -unsigned char seqFastq; - -int main(int argc, char *argv[]) -{ - if (!parseCommandLine(argc, argv)) - return 1; - - configHashTable(); - /**************************************************** - * INDEXING - ***************************************************/ - if (indexingMode) - { - int i; - /******************************** - * Regular Mode - ********************************/ - configHashTable(); - for (i = 0; i < fileCnt; i++) - { - generateHashTable(fileName[i][0], fileName[i][1]); - } - } - /**************************************************** - * SEARCHING - ***************************************************/ - else - { - Read *seqList; - unsigned int seqListSize; - int fc; - int samplingLocsSize; - int *samplingLocs; - double totalLoadingTime = 0; - double totalMappingTime = 0; - double startTime; - double loadingTime; - double mappingTime; - double lstartTime; - double ppTime = 0.0; - double tmpTime;; - char *prevGen = getMem(CONTIG_NAME_SIZE); - prevGen[0]='\0'; - char *curGen; - int flag; - double maxMem=0; - char fname1[FILE_NAME_LENGTH]; - char fname2[FILE_NAME_LENGTH]; - char fname3[FILE_NAME_LENGTH]; - char fname4[FILE_NAME_LENGTH]; - char fname5[FILE_NAME_LENGTH]; - - char outputFileName[FILE_NAME_LENGTH]; - // Loading Sequences & Sampling Locations - startTime = getTime(); - if (!readAllReads(seqFile1, seqFile2, seqCompressed, &seqFastq, pairedEndMode, &seqList, &seqListSize)) - { - return 1; - } - - loadSamplingLocations(&samplingLocs, &samplingLocsSize); - totalLoadingTime += getTime()-startTime; - - if (pairedEndMode) - { - //Switching to Inferred Size - minPairEndedDistance = minPairEndedDistance - SEQ_LENGTH + 2; - maxPairEndedDistance = maxPairEndedDistance - SEQ_LENGTH + 2; - if (pairedEndDiscordantMode) - { - maxPairEndedDiscordantDistance = maxPairEndedDiscordantDistance - SEQ_LENGTH + 2; - minPairEndedDiscordantDistance = minPairEndedDiscordantDistance - SEQ_LENGTH + 2; - } - - sprintf(fname1, "__%s__1", mappingOutput); - sprintf(fname2, "__%s__2", mappingOutput); - sprintf(fname3, "__%s__disc", mappingOutput); - sprintf(fname4, "__%s__oea1", mappingOutput); - sprintf(fname5, "__%s__oea2", mappingOutput); - unlink(fname1); - unlink(fname2); - unlink(fname3); - unlink(fname4); - unlink(fname5); - } - - sprintf(outputFileName, "%s%s",mappingOutputPath , mappingOutput); - // Preparing output - initOutput(outputFileName, outCompressed); - - fprintf(stdout, "-----------------------------------------------------------------------------------------------------------\n"); - fprintf(stdout, "| %15s | %15s | %15s | %15s | %15s %15s |\n","Genome Name","Loading Time", "Mapping Time", "Memory Usage(M)","Total Mappings","Mapped reads"); - fprintf(stdout, "-----------------------------------------------------------------------------------------------------------\n"); - - /******************************** - * Regular Mode - ********************************/ - if (!pairedEndMode) - { - initLookUpTable(); - if(bestMode) - initBestMapping(seqListSize); - - for (fc = 0; fc <fileCnt; fc++) - { - if (!initLoadingHashTable(fileName[fc][1])) - { - return 1; - } - - mappingTime = 0; - loadingTime = 0; - prevGen[0] = '\0'; - flag = 1; - - do - { - flag = loadHashTable ( &tmpTime, errThreshold); // Reading a fragment - curGen = getRefGenomeName(); - - // First Time - if (flag && prevGen[0]== '\0') - { - sprintf(prevGen, "%s", curGen); - } - - if ( !flag || strcmp(prevGen, curGen)!=0) - { - - fprintf(stdout, "| %15s | %15.2f | %15.2f | %15.2f | %15lld %15lld |\n", - prevGen,loadingTime, mappingTime, maxMem, mappingCnt , mappedSeqCnt); - fflush(stdout); - - totalMappingTime += mappingTime; - totalLoadingTime += loadingTime; - - loadingTime = 0; - mappingTime = 0; - maxMem = 0; - - if (!flag) - { - break; - } - } - else if (progressRep && mappingTime != 0) - { - fprintf(stdout, "| %15s | %15.2f | %15.2f | %15.2f | %15lld %15lld |\n", - prevGen,loadingTime, mappingTime, maxMem, mappingCnt , mappedSeqCnt); - fflush(stdout); - } - - sprintf(prevGen, "%s", curGen); - - loadingTime += tmpTime; -// lstartTime = getTime(); - - initFAST(seqList, seqListSize, samplingLocs, samplingLocsSize, fileName[fc][0]); - - lstartTime = getTime(); - - - mapAllSingleEndSeq(); - - - - mappingTime += getTime() - lstartTime; - if (maxMem < getMemUsage()) - { - maxMem = getMemUsage(); - } - } while (flag); - - } // end for; - if(bestMode) - finalizeBestSingleMapping(); - finalizeFAST(); - finalizeLoadingHashTable(); - - } - // Pairedend Mapping Mode - else - { - initLookUpTable(); - if(pairedEndMode) - initBestConcordantDiscordant(seqListSize); - for (fc = 0; fc < fileCnt; fc++) - { - if (!initLoadingHashTable(fileName[fc][1])) - { - return 1; - } - mappingTime = 0; - loadingTime = 0; - prevGen[0] = '\0'; - flag = 1; - - do - { - flag = loadHashTable ( &tmpTime , errThreshold); // Reading a fragment - curGen = getRefGenomeName(); - - // First Time - if (flag && prevGen[0]== '\0') - { - sprintf(prevGen, "%s", curGen); - } - - if ( !flag || strcmp(prevGen, curGen)!=0) - { - - // DISCORDANT - lstartTime = getTime(); - outputPairedEnd(); - mappingTime += getTime() - lstartTime; - //DISCORDANT - - fprintf(stdout, "| %15s | %15.2f | %15.2f | %15.2f | %15lld %15lld |\n", - prevGen,loadingTime, mappingTime, maxMem, mappingCnt , mappedSeqCnt); - fflush(stdout); - - totalMappingTime += mappingTime; - totalLoadingTime += loadingTime; - - loadingTime = 0; - mappingTime = 0; - maxMem = 0; - - if (!flag) - { - break; - } - } - else if (progressRep && mappingTime != 0) - { - fprintf(stdout, "| %15s | %15.2f | %15.2f | %15.2f | %15lld %15lld |\n", - prevGen,loadingTime, mappingTime, maxMem, mappingCnt , mappedSeqCnt); - fflush(stdout); - } - - sprintf(prevGen, "%s", curGen); - - loadingTime += tmpTime; - lstartTime = getTime(); - - initFAST(seqList, seqListSize, samplingLocs, samplingLocsSize, fileName[fc][0]); - - mapPairedEndSeq(); - - mappingTime += getTime() - lstartTime; - if (maxMem < getMemUsage()) - { - maxMem = getMemUsage(); - } - - } while (flag); - - } // end for; - - if(pairedEndMode) - { - sprintf(outputFileName, "%s%s",mappingOutputPath , mappingOutput); - finalizeOEAReads(outputFileName); - outputAllTransChromosal(transChromosal); - finalizeBestConcordantDiscordant(); - } - - finalizeLoadingHashTable(); - - if (pairedEndDiscordantMode) - { - lstartTime = getTime(); - outputPairedEndDiscPP(); - ppTime = getTime() - lstartTime; - } - finalizeFAST(); - } //else - - finalizeOutput(); - - fprintf(stdout, "-----------------------------------------------------------------------------------------------------------\n"); - fprintf(stdout, "%19s%16.2f%18.2f\n\n", "Total:",totalLoadingTime, totalMappingTime); - if (pairedEndDiscordantMode) - fprintf(stdout, "Post Processing Time: %18.2f \n", ppTime); - fprintf(stdout, "%-30s%10.2f\n","Total Time:", totalMappingTime+totalLoadingTime); - fprintf(stdout, "%-30s%10d\n","Total No. of Reads:", seqListSize); - fprintf(stdout, "%-30s%10lld\n","Total No. of Mappings:", mappingCnt); - fprintf(stdout, "%-30s%10.0f\n\n","Avg No. of locations verified:", ceil((float)verificationCnt/seqListSize)); - - int cof = (pairedEndMode)?2:1; - - if (progressRep && maxHits != 0) - { - int frequency[maxHits+1]; - int i; - for ( i=0 ; i <= maxHits; i++) - { - frequency[i] = 0; - } - - - for (fc = 0; fc < seqListSize; fc++) - { - frequency[(int)(*(seqList[fc*cof].hits))]++; - } - frequency[maxHits] = completedSeqCnt; - for ( i=0 ; i <= maxHits; i++) - { - fprintf(stdout, "%-30s%10d%10d%10.2f%%\n","Reads Mapped to ", i, frequency[i], 100*(float)frequency[i]/(float)seqListSize); - } - } - - - finalizeReads(unmappedOutput); - freeMem(prevGen, CONTIG_NAME_SIZE); - } - - return 1; -}
--- a/mrfast-2.1.0.4/header Tue Feb 21 10:29:47 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ -/* - * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * Redistributions of source code must retain the above copyright notice, this list - * of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright notice, this - * list of conditions and the following disclaimer in the documentation and/or other - * materials provided with the distribution. - * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be - * used to endorse or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - Authors: - Farhad Hormozdiari - Faraz Hach - Can Alkan - Emails: - farhadh AT uw DOT edu - fhach AT cs DOT sfu - calkan AT gmail DOT com - */ - -
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.5/CommandLineParser.c Fri Mar 09 07:35:51 2012 -0500 @@ -0,0 +1,395 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu +*/ + + +#include <stdio.h> +#include <stdlib.h> +#include <getopt.h> +#include <string.h> +#include <ctype.h> +#include "Common.h" +#include "CommandLineParser.h" + +int uniqueMode=1; +int indexingMode; +int searchingMode; +int pairedEndMode; +int pairedEndDiscordantMode; +int transChromosal=0; +int pairedEndProfilingMode; +int seqCompressed; +int outCompressed; +int cropSize = 0; +int progressRep = 0; +int minPairEndedDistance=-1; +int maxPairEndedDistance=-1; +int minPairEndedDiscordantDistance=-1; +int maxPairEndedDiscordantDistance=-1; +int bestMode; +int nosamMode; +char *seqFile1; +char *seqFile2; +char *mappingOutput = "output"; +char *mappingOutputPath = ""; +char *unmappedOutput = "unmapped"; +char fileName[1000][2][FILE_NAME_LENGTH]; +int fileCnt; +int maxOEAOutput=100; +int maxDiscordantOutput=300; +unsigned char errThreshold=2; +unsigned char maxHits=0; +unsigned char WINDOW_SIZE = 12; +unsigned int CONTIG_SIZE; +unsigned int CONTIG_MAX_SIZE; + +void printHelp(); + +int parseCommandLine (int argc, char *argv[]) +{ + + int o; + int index; + char *fastaFile = NULL; + char *batchFile = NULL ; + int batchMode = 0; + + static struct option longOptions[] = + { + {"pe", no_argument, &pairedEndMode, 1}, + {"discordant-vh", no_argument, &pairedEndDiscordantMode, 1}, + {"trans", no_argument, &transChromosal , 1}, + {"profile", no_argument, &pairedEndProfilingMode, 1}, + {"seqcomp", no_argument, &seqCompressed, 1}, + {"outcomp", no_argument, &outCompressed, 1}, + {"progress", no_argument, &progressRep, 1}, + {"best", no_argument, &bestMode, 1}, + {"index", required_argument, 0, 'i'}, + {"search", required_argument, 0, 's'}, + {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'v'}, + {"seq", required_argument, 0, 'x'}, + {"seq1", required_argument, 0, 'x'}, + {"seq2", required_argument, 0, 'y'}, + {"ws", required_argument, 0, 'w'}, + {"min", required_argument, 0, 'l'}, + {"max", required_argument, 0, 'm'}, + {"crop", required_argument, 0, 'c'}, + {"maxoea", required_argument, 0, 'a'}, + {"maxdis", required_argument, 0, 'd'}, + {"nosam", no_argument, &nosamMode, 1}, + {0, 0, 0, 0}, + }; + + while ( (o = getopt_long ( argc, argv, "bhvn:e:o:u:i:s:x:y:w:l:m:c:a:d:", longOptions, &index)) != -1 ) + { + switch (o) + { + case 'a': + maxOEAOutput = atoi(optarg); + if (maxOEAOutput == 0) + maxOEAOutput = 100000; + break; + case 'd': + maxDiscordantOutput = atoi(optarg); + if (maxDiscordantOutput == 0) + maxDiscordantOutput = 100000; + break; + case 'i': + indexingMode = 1; + fastaFile = optarg; + break; + case 's': + searchingMode = 1; + fastaFile = optarg; + break; + case 'b': + batchMode = 1; + break; + case 'c': + cropSize = atoi(optarg); + break; + case 'w': + WINDOW_SIZE = atoi(optarg); + break; + case 'x': + seqFile1 = optarg; + break; + case 'y': + seqFile2 = optarg; + break; + case 'u': + unmappedOutput = optarg; + break; + case 'o': + mappingOutput = getMem(FILE_NAME_LENGTH); + mappingOutputPath = getMem(FILE_NAME_LENGTH); + stripPath (optarg, &mappingOutputPath, &mappingOutput); + break; + case 'n': + maxHits = atoi(optarg); + break; + case 'e': + errThreshold = atoi(optarg); + break; + case 'l': + minPairEndedDistance = atoi(optarg); + break; + case 'm': + maxPairEndedDistance = atoi(optarg); + break; + case 'h': + printHelp(); + return 0; + break; + case 'v': + fprintf(stdout, "%s.%s\n", versionNumber, versionNumberF); + return 0; + break; + /* case '?': + fprintf(stderr, "Unknown parameter: %s\n", longOptions[index].name); + abort(); + break;*/ + } + + } + if (indexingMode + searchingMode != 1) + { + fprintf(stdout, "ERROR: Indexing / Searching mode should be selected\n"); + return 0; + } + + if (WINDOW_SIZE > 15 || WINDOW_SIZE < 11) + { + fprintf(stdout, "ERROR: Window size should be in [12..15]\n"); + return 0; + } + + + if ( indexingMode ) + { + CONTIG_SIZE = 15000000; + CONTIG_MAX_SIZE = 40000000; + + if (batchMode) + { + batchFile = fastaFile; + fastaFile = NULL; + } + + if (batchFile == NULL && fastaFile == NULL) + { + fprintf(stdout, "ERROR: Reference(s) should be indicated for indexing\n"); + return 0; + } + + if (pairedEndDiscordantMode) + { + fprintf(stdout, "ERROR: --discordant-vh cannot be used in indexing mode. \n"); + return 0; + } + + } + + + if ( searchingMode ) + { + CONTIG_SIZE = 300000000; + CONTIG_MAX_SIZE = 300000000; + + + if (batchMode) + { + batchFile = fastaFile; + fastaFile = NULL; + } + + if (batchFile == NULL && fastaFile == NULL) + { + fprintf(stdout, "ERROR: Index File(s) should be indiciated for searching\n"); + return 0; + } + + if (seqFile1 == NULL && seqFile2 == NULL) + { + fprintf(stdout, "ERROR: Please indicate a sequence file for searching.\n"); + return 0; + } + + + if (!pairedEndMode && seqFile2 != NULL) + { + fprintf(stdout, "ERROR: Second File can be indicated in pairedend mode\n"); + return 0; + } + + if (pairedEndMode && (minPairEndedDistance <0 || maxPairEndedDistance < 0 || minPairEndedDistance > maxPairEndedDistance)) + { + fprintf(stdout, "ERROR: Please enter a valid range for pairedend sequences.\n"); + return 0; + } + + if (pairedEndMode && seqFile1 == NULL) + { + fprintf(stdout, "ERROR: Please indicate the first file for pairedend search.\n"); + return 0; + } + + if (!pairedEndMode && pairedEndDiscordantMode) + { + fprintf(stdout, "ERROR: --discordant should be used with --pe"); + return 0; + } + + if (!pairedEndMode && pairedEndProfilingMode) + { + fprintf(stdout, "ERROR: --profile should be used with --pe"); + return 0; + } + + if (pairedEndMode) + pairedEndDiscordantMode = 1; + } + + int i = 0; + + + if (batchMode) + { + FILE *fp = fileOpen(batchFile, "r"); + + if (fp == NULL) + return 0; + + fileCnt = 0; + + while ( fgets(fileName[fileCnt][0], FILE_NAME_LENGTH, fp)) + { + for (i = strlen(fileName[fileCnt][0])-1; i>=0; i--) + if ( !isspace(fileName[fileCnt][0][i])) + break; + fileName[fileCnt][0][i+1] = '\0'; + + if (strcmp(fileName[fileCnt][0], "") != 0) + { + sprintf(fileName[fileCnt][1], "%s.index", fileName[fileCnt][0]); + fileCnt++; + } + } + } + else + { + sprintf(fileName[fileCnt][0], "%s", fastaFile); + sprintf(fileName[fileCnt][1], "%s.index", fileName[fileCnt][0]); + fileCnt++; + } + + + if (pairedEndProfilingMode) + { + + minPairEndedDistance = 0; + maxPairEndedDistance = 300000000; + + } + + if (pairedEndDiscordantMode) + { + minPairEndedDiscordantDistance = minPairEndedDistance; + maxPairEndedDiscordantDistance = maxPairEndedDistance; + + minPairEndedDistance = 0; + maxPairEndedDistance = 300000000; + } + + return 1; +} + + +void printHelp() +{ + char *errorType; + if (mrFAST) + { + fprintf(stdout,"mrFAST : Micro-Read Fast Alignment Search Tool.\n\n"); + fprintf(stdout,"Usage: mrfast [options]\n\n"); + errorType="edit distance"; + } + else + { + fprintf(stdout,"mrsFAST : Micro-Read Substitutions (only) Fast Alignment Search Tool.\n\n"); + fprintf(stdout,"mrsFAST is a cache oblivious read mapping tool. mrsFAST capable of mapping\n"); + fprintf(stdout,"single and paired end reads to the reference genome. Bisulfite treated \n"); + fprintf(stdout,"sequences are not supported in this version. By default mrsFAST reports \n"); + fprintf(stdout,"the output in SAM format.\n\n"); + fprintf(stdout,"Usage: mrsFAST [options]\n\n"); + errorType="hamming distance"; + } + + fprintf(stdout,"General Options:\n"); + fprintf(stdout," -v|--version\t\tCurrent Version.\n"); + fprintf(stdout," -h\t\t\tShows the help file.\n"); + fprintf(stdout,"\n\n"); + + fprintf(stdout,"Indexing Options:\n"); + fprintf(stdout," --index [file]\t\tGenerate an index from the specified fasta file. \n"); + fprintf(stdout," -b\t\t\tIndicates the indexing will be done in batch mode.\n\t\t\tThe file specified in --index should contain the \n\t\t\tlist of fasta files.\n"); + fprintf(stdout," --ws [int]\t\tSet window size for indexing (default:12 max:14).\n"); + fprintf(stdout,"\n\n"); + + fprintf(stdout,"Searching Options:\n"); + fprintf(stdout," --search [file]\tSearch in the specified genome. Provide the path to the fasta file. \n\t\t\tIndex file should be in the same directory.\n"); + fprintf(stdout," -b\t\t\tIndicates the mapping will be done in batch mode. \n\t\t\tThe file specified in --search should contain the \n\t\t\tlist of fasta files.\n"); + fprintf(stdout," --pe \t\t\tSearch will be done in Paired-End mode.\n"); + fprintf(stdout," --seq [file]\t\tInput sequences in fasta/fastq format [file]. If \n\t\t\tpaired end reads are interleaved, use this option.\n"); + fprintf(stdout," --seq1 [file]\t\tInput sequences in fasta/fastq format [file] (First \n\t\t\tfile). Use this option to indicate the first file of \n\t\t\tpaired end reads. \n"); + fprintf(stdout," --seq2 [file]\t\tInput sequences in fasta/fastq format [file] (Second \n\t\t\tfile). Use this option to indicate the second file of \n\t\t\tpaired end reads. \n"); + fprintf(stdout," -o [file]\t\tOutput of the mapped sequences. The default is \"output\".\n"); + fprintf(stdout," -u [file]\t\tSave unmapped sequences in fasta/fastq format.\n"); + fprintf(stdout," --best \t\tOnly the best mapping from all the possible mapping is returned.\n"); + fprintf(stdout," --seqcomp \t\tIndicates that the input sequences are compressed (gz).\n"); + fprintf(stdout," --outcomp \t\tIndicates that output file should be compressed (gz).\n"); + fprintf(stdout," -e [int]\t\tMaximum allowed %s (default 2).\n", errorType); + fprintf(stdout," --min [int]\t\tMin distance allowed between a pair of end sequences.\n"); + fprintf(stdout," --max [int]\t\tMax distance allowed between a pair of end sequences.\n"); + + fprintf(stdout," --maxoea [int]\t\tMax number of One End Anchored (OEA) returned for each read pair. We recommend 100 or above for NovelSeq use.\n"); + fprintf(stdout," --maxdis [int]\t\tMax number of discordant map locations returned for each read pair. We recommend 300 or above for VariationHunter use.\n"); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.5/CommandLineParser.h Fri Mar 09 07:35:51 2012 -0500 @@ -0,0 +1,49 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu +*/ + + + +#ifndef __COMMAND_LINE_PARSER__ +#define __COMMAND_LINE_PARSER__ + +int parseCommandLine (int argc, char *argv[]); + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.5/Common.c Fri Mar 09 07:35:51 2012 -0500 @@ -0,0 +1,184 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu + */ + + + +#include <stdio.h> +#include <stdlib.h> +#include <sys/time.h> +#include <zlib.h> +#include <string.h> +#include "Common.h" + + +unsigned short SEQ_LENGTH = 0; +long long memUsage = 0; +/**********************************************/ +FILE *fileOpen(char *fileName, char *mode) +{ + FILE *fp; + fp = fopen (fileName, mode); + if (fp == NULL) + { + fprintf(stdout, "Error: Cannot Open the file %s\n", fileName); + fflush(stdout); + exit(0); + } + return fp; +} +/**********************************************/ +gzFile fileOpenGZ(char *fileName, char *mode) +{ + gzFile gzfp; + gzfp = gzopen (fileName, mode); + if (gzfp == Z_NULL) + { + fprintf(stdout, "Error: Cannot Open the file %s\n", fileName); + fflush(stdout); + exit(0); + } + return gzfp; +} +/**********************************************/ +double getTime(void) +{ + struct timeval t; + gettimeofday(&t, NULL); + return t.tv_sec+t.tv_usec/1000000.0; +} + +/**********************************************/ +char reverseCompleteChar(char c) +{ + char ret; + switch (c) + { + case 'A': + ret = 'T'; + break; + case 'T': + ret = 'A'; + break; + case 'C': + ret = 'G'; + break; + case 'G': + ret = 'C'; + break; + default: + ret = 'N'; + break; + } + return ret; +} +/**********************************************/ +void reverseComplete (char *seq, char *rcSeq , int length) +{ + int i; + for (i=0; i<length; i++) + { + rcSeq[i]=reverseCompleteChar (seq[length-1-i]) ; + } +} +/**********************************************/ +void * getMem(size_t size) +{ + void *ret; + ret = malloc(size); + if (ret == NULL){ + fprintf(stderr, "Cannot allocate memory. Currently addressed memory = %0.2f MB, requested memory = %0.2f MB.\nCheck the available main memory, and if you have user limits (ulimit -v).\n", getMemUsage(), (double)size); + exit(0); + } + memUsage+=size; + return ret; +} +/**********************************************/ +void freeMem(void *ptr, size_t size) +{ + memUsage-=size; + free(ptr); +} +/**********************************************/ +double getMemUsage() +{ + return memUsage/1048576.0; +} +/**********************************************/ +void reverse (char *seq, char *rcSeq , int length) +{ + int i; + int l = length; + if(l != strlen(seq)) + l = strlen(seq); + for (i=0; i<l; i++) + { + rcSeq[i]=seq[l-1-i] ; + } + rcSeq[l] = '\0'; +} +/**********************************************/ +void stripPath(char *full, char **path, char **fileName) +{ + int i; + int pos = -1; + + for (i=strlen(full)-1; i>=0; i--) + { + if (full[i]=='/') + { + pos = i; + break; + } + + } + + if (pos != -1) + { + sprintf(*fileName, "%s%c", (full+pos+1), '\0'); + full[pos+1]='\0'; + sprintf(*path,"%s%c", full, '\0'); + } + else + { + sprintf(*fileName, "%s%c", full, '\0'); + sprintf(*path,"%c", '\0'); + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.5/Common.h Fri Mar 09 07:35:51 2012 -0500 @@ -0,0 +1,111 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu + */ + + + +#ifndef __COMMON__ +#define __COMMON__ + +#include <zlib.h> + +#define SEQ_MAX_LENGTH 300 // Seq Max Length +#define CONTIG_OVERLAP 200 // No. of characters overlapped between contings +#define CONTIG_NAME_SIZE 200 // Contig name max size +#define FILE_NAME_LENGTH 400 // Filename Max Length +#define DISCORDANT_CUT_OFF 800 +#define MAX_OPEN_FILE 600 +#define MAX_TRANS_CHROMOSAL_OUTPUT 50 +#define MAX_OEA_OUT 500 + +extern unsigned int CONTIG_SIZE; +extern unsigned int CONTIG_MAX_SIZE; + + +extern unsigned char WINDOW_SIZE ; // WINDOW SIZE for indexing/searching +extern unsigned short SEQ_LENGTH; // Sequence(read) length + +extern char *versionNumber; +extern char *versionNumberF; +extern unsigned char mrFAST; + + +extern int maxOEAOutput; +extern int maxDiscordantOutput; +extern int uniqueMode; +extern int indexingMode; +extern int searchingMode; +extern int pairedEndMode; +extern int pairedEndDiscordantMode; +extern int transChromosal; +extern int pairedEndProfilingMode; +extern int bestMode; +extern int nosamMode; +extern int seqCompressed; +extern int outCompressed; +extern int cropSize; +extern int progressRep; +extern char *seqFile1; +extern char *seqFile2; +extern char *seqUnmapped; +extern char *mappingOutput; +extern char *mappingOutputPath; +extern char *unmappedOutput; +extern unsigned char seqFastq; +extern unsigned char errThreshold; +extern unsigned char maxHits; +extern int minPairEndedDiscordantDistance; +extern int maxPairEndedDiscordantDistance; +extern int minPairEndedDistance; +extern int maxPairEndedDistance; +extern char fileName[1000][2][FILE_NAME_LENGTH]; +extern int fileCnt; +extern long long memUsage; + +FILE * fileOpen(char *fileName, char *mode); +gzFile fileOpenGZ(char *fileName, char *mode); +double getTime(void); +void reverseComplete (char *seq, char *rcSeq , int length); +void * getMem(size_t size); +void freeMem(void * ptr, size_t size); +double getMemUsage(); +void reverse (char *seq, char *rcSeq , int length); +void stripPath(char *full, char **path, char **fileName); +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.5/HashTable.c Fri Mar 09 07:35:51 2012 -0500 @@ -0,0 +1,472 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu +*/ + + + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include "Common.h" +#include "RefGenome.h" +#include "HashTable.h" +/**********************************************/ +FILE *_ih_fp = NULL; +IHashTable *_ih_hashTable = NULL; +int _ih_maxHashTableSize = 0; +char *_ih_refGen = NULL; +char *_ih_refGenName = NULL; +long long _ih_memUsage = 0; +int _ih_refGenOff = 0; +/**********************************************/ + +int hashVal(char *seq) +{ + int i=0; + int val=0, numericVal=0; + + while(i<WINDOW_SIZE) + { + switch (seq[i]) + { + case 'A': + numericVal = 0; + break; + case 'C': + numericVal = 1; + break; + case 'G' : + numericVal = 2; + break; + case 'T': + numericVal = 3; + break; + default: + return -1; + break; + } + val = (val << 2)|numericVal; + i++; + } + return val; +} +/**********************************************/ +void freeIHashTableContent(IHashTable *hashTable, unsigned int maxSize) +{ + int i=0; + for (i=0; i<maxSize; i++) + { + if (hashTable[i].locs != NULL) + { + freeMem(hashTable[i].locs, (hashTable[i].locs[0]+1)*(sizeof(unsigned int))); + hashTable[i].locs = NULL; + } + } +} +/**********************************************/ +void initSavingIHashTable(char *fileName) +{ + int tmp; + _ih_fp = fileOpen(fileName, "w"); + unsigned char bIndex = 0; // Bisulfite Index + + // First Two bytes are indicating the type of the index & window size + tmp = fwrite(&bIndex, sizeof(bIndex), 1, _ih_fp); + tmp = fwrite(&WINDOW_SIZE, sizeof(WINDOW_SIZE), 1, _ih_fp); + + if (tmp == 0) + fprintf(stderr, "Write error while initializing hash table.\n"); + +} +/**********************************************/ +void finalizeSavingIHashTable() +{ + fclose(_ih_fp); +} +/**********************************************/ +void saveIHashTable(IHashTable *hashTable, unsigned int size, unsigned int maxSize, char *refGen, char *refGenName, int refGenOffset) +{ + int tmp; + + // Every Chunk starts with a byte indicating whether it has extra info; + unsigned char extraInfo = 0; + tmp = fwrite (&extraInfo, sizeof(extraInfo), 1, _ih_fp); + + short len = strlen(refGenName); + tmp = fwrite(&len, sizeof(len), 1, _ih_fp); + tmp = fwrite(refGenName, sizeof(char), len, _ih_fp); + + tmp = fwrite(&refGenOffset, sizeof(refGenOffset), 1, _ih_fp); + + unsigned int refGenLength = strlen(refGen); + tmp = fwrite(&refGenLength, sizeof(refGenLength), 1, _ih_fp); + + tmp = fwrite(refGen, sizeof(char), refGenLength, _ih_fp); + tmp = fwrite(&size, sizeof(size), 1, _ih_fp); + + int i=0,j=0; + unsigned char cnt=0; + for (i=0; i<maxSize; i++) + { + if (hashTable[i].locs != NULL) + { + tmp = fwrite(&i, sizeof(i), 1, _ih_fp); + + if (hashTable[i].locs[0] < 250) + { + cnt = hashTable[i].locs[0]; + tmp = fwrite(&cnt, sizeof(cnt), 1, _ih_fp); + } + else + { + cnt =0; + tmp = fwrite (&cnt, sizeof(cnt), 1, _ih_fp); + tmp = fwrite (&(hashTable[i].locs[0]), sizeof(hashTable[i].locs[0]), 1, _ih_fp); + } + + for (j=1; j<=hashTable[i].locs[0]; j++) + tmp = fwrite(&(hashTable[i].locs[j]), sizeof(hashTable[i].locs[j]), 1, _ih_fp); + } + } + + if (tmp == 0) + fprintf(stderr, "Write error while saving hash table.\n"); +} +/**********************************************/ +unsigned int addIHashTableLocation(IHashTable *hashTable, int hv, int location) +{ + unsigned int sizeInc = 0; + + if (hashTable[hv].locs == NULL) + { + sizeInc = 1; + hashTable[hv].locs = getMem (sizeof(unsigned int)*2); + hashTable[hv].locs[0]=1; + hashTable[hv].locs[1]=location; + } + else + { + int size = hashTable[hv].locs[0]; + int i; + unsigned int *tmp = getMem( (size + 2) * sizeof(unsigned int) ); + + for (i = 0; i <= size; i++) + { + tmp[i] = hashTable[hv].locs[i]; + } + size++; + tmp[0] = size; + tmp[size] = location; + + freeMem(hashTable[hv].locs, (hashTable[hv].locs[0]*(sizeof(unsigned int)))); + hashTable[hv].locs = tmp; + } + return sizeInc; +} +/**********************************************/ +void generateIHashTable(char *fileName, char *indexName) +{ + double startTime = getTime(); + unsigned int hashTableSize = 0; + unsigned int hashTableMaxSize = pow(4, WINDOW_SIZE); + IHashTable *hashTable = getMem(sizeof(IHashTable)*hashTableMaxSize); + char *refGenName; + char *refGen; + int refGenOff = 0; + int i, hv, l, flag; + + + for ( i = 0; i < hashTableMaxSize; i++) + { + hashTable[i].locs = NULL; + } + + //Loading Fasta File + if (!initLoadingRefGenome(fileName)) + return; + initSavingIHashTable(indexName); + + fprintf(stdout, "Generating Index from %s", fileName); + fflush(stdout); + + char *prev = getMem (CONTIG_NAME_SIZE); + prev[0]='\0'; + + do + { + flag = loadRefGenome (&refGen, &refGenName, &refGenOff); + + if ( strcmp(prev, refGenName) != 0) + { + fprintf(stdout, "\n - %s ", refGenName); + fflush(stdout); + sprintf(prev, "%s", refGenName); + } + else + { + fprintf(stdout, "."); + fflush(stdout); + } + + l = strlen(refGen) - WINDOW_SIZE; + + for (i=0; i < l; i++) + { + hv = hashVal(refGen + i); + if (hv != -1) + { + hashTableSize += addIHashTableLocation (hashTable, hv, i+1); + } + } + + saveIHashTable(hashTable, hashTableSize, hashTableMaxSize, refGen, refGenName, refGenOff); + freeIHashTableContent(hashTable, hashTableMaxSize); + hashTableSize = 0; + } while (flag); + + freeMem(prev, CONTIG_NAME_SIZE); + freeMem(hashTable, sizeof(IHashTable)*hashTableMaxSize); + + finalizeLoadingRefGenome(); + finalizeSavingIHashTable(); + + fprintf(stdout, "\nDONE in %0.2fs!\n", (getTime()-startTime)); +} + +/**********************************************/ +void finalizeLoadingIHashTable() +{ + freeIHashTableContent(_ih_hashTable, _ih_maxHashTableSize); + freeMem(_ih_hashTable, sizeof(IHashTable)* _ih_maxHashTableSize); + freeMem(_ih_refGen, strlen(_ih_refGen)+1) ; + freeMem(_ih_refGenName, strlen(_ih_refGenName)+1); + fclose(_ih_fp); +} + +/**********************************************/ +int loadIHashTable(double *loadTime, int errThreshould) +{ + double startTime = getTime(); + unsigned char extraInfo = 0; + short len; + unsigned int refGenLength; + unsigned int hashTableSize; + unsigned int tmpSize; + int tmp; + int i=0,j=0; + + if ( fread(&extraInfo, sizeof(extraInfo), 1, _ih_fp) != sizeof(extraInfo) ) + return 0; + + freeIHashTableContent(_ih_hashTable, _ih_maxHashTableSize); + freeMem(_ih_refGen, strlen(_ih_refGen)+1) ; + freeMem(_ih_refGenName, strlen(_ih_refGenName)+1); + + // Reading Chr Name + tmp = fread(&len, sizeof(len), 1, _ih_fp); + if (tmp == 0){ + fprintf(stderr, "Read error while loading hash table.\n"); + exit(0); + } + + _ih_refGenName = getMem(sizeof(char)* (len+1)); + tmp = fread(_ih_refGenName, sizeof(char), len, _ih_fp); + _ih_refGenName [len] ='\0'; + + tmp = fread(&_ih_refGenOff, sizeof (_ih_refGenOff), 1, _ih_fp); + + // Reading Size and Content of Ref Genome + tmp = fread(&refGenLength, sizeof(refGenLength), 1, _ih_fp); + _ih_refGen = getMem(sizeof(char)*(refGenLength+1)); + tmp = fread(_ih_refGen, sizeof(char), refGenLength, _ih_fp); + _ih_refGen[refGenLength]='\0'; + + //Reading Hashtable Size and Content + tmp = fread(&hashTableSize, sizeof(hashTableSize), 1, _ih_fp); + + unsigned int hv; + unsigned char cnt=0; + unsigned int index = 1; + unsigned int tmpValue = 0; + for (i=0; i<hashTableSize; i++) + { + index = 1; + tmp = fread(&hv, sizeof(hv), 1, _ih_fp); + tmp = fread(&cnt, sizeof(cnt), 1, _ih_fp); + + if (cnt>0) + { + tmpSize = cnt; + } + else + { + tmp = fread(&tmpSize, sizeof(tmpSize), 1, _ih_fp); + } + + _ih_hashTable[hv].locs = getMem( sizeof(unsigned int)* (tmpSize+1) ); + _ih_hashTable[hv].locs[0] = tmpSize; + +/* for (j=1; j<=tmpSize; j++) + tmp = fread(&(_ih_hashTable[hv].locs[j]), sizeof(_ih_hashTable[hv].locs[j]), 1, _ih_fp); +*/ + + if(tmpSize != 0) + { + tmp = fread(&(_ih_hashTable[hv].locs[index]), sizeof(_ih_hashTable[hv].locs[index]), 1, _ih_fp); + index++; + } + + for (j=2; j<=tmpSize; j++) + { + tmp = fread(&tmpValue, sizeof(_ih_hashTable[hv].locs[index-1]), 1, _ih_fp); + if(abs(tmpValue-_ih_hashTable[hv].locs[index-1]) > errThreshould) + { + _ih_hashTable[hv].locs[index] = tmpValue; + index++; + } + } + _ih_hashTable[hv].locs[0] = index-1; + } + *loadTime = getTime()-startTime; + return 1; +} +/**********************************************/ +unsigned int *getIHashTableCandidates(int hv) +{ + if ( hv != -1 ) + return _ih_hashTable[hv].locs; + else + return NULL; +} +/**********************************************/ +/**********************************************/ +/**********************************************/ +void configHashTable() +{ + if (WINDOW_SIZE <= 15) + { + generateHashTable = &generateIHashTable; + loadHashTable = &loadIHashTable; + finalizeLoadingHashTable = &finalizeLoadingIHashTable; + getCandidates = &getIHashTableCandidates; + } + else + { + + + } +} +/**********************************************/ +int initLoadingHashTable(char *fileName) +{ + int i; + unsigned char bsIndex; + int tmp; + + _ih_fp = fileOpen(fileName, "r"); + + if (_ih_fp == NULL) + return 0; + + tmp = fread(&bsIndex, sizeof(bsIndex), 1, _ih_fp); + + if (tmp == 0){ + fprintf(stderr, "Read error while inilializing hash table loading.\n"); + exit(0); + } + + if (bsIndex) + { + fprintf(stdout, "Error: Wrong Type of Index indicated"); + return 0; + } + + tmp = fread(&WINDOW_SIZE, sizeof(WINDOW_SIZE), 1, _ih_fp); + + configHashTable(); + + if (_ih_maxHashTableSize != pow(4, WINDOW_SIZE)) + { + + if (_ih_hashTable != NULL) + { + freeIHashTableContent(_ih_hashTable, _ih_maxHashTableSize); + freeMem(_ih_hashTable, sizeof(IHashTable)* _ih_maxHashTableSize); + freeMem(_ih_refGen, strlen(_ih_refGen)+1) ; + freeMem(_ih_refGenName, strlen(_ih_refGenName)+1); + } + + _ih_maxHashTableSize = pow(4, WINDOW_SIZE); + + _ih_hashTable = getMem (sizeof(IHashTable) * _ih_maxHashTableSize); + for (i=0; i<_ih_maxHashTableSize; i++) + _ih_hashTable[i].locs = NULL; + _ih_refGen = getMem(1); + _ih_refGen[0]='\0'; + _ih_refGenName = getMem(1); + _ih_refGenName[0] = '\0'; + } + + return 1; +} +/**********************************************/ +char *getRefGenome() +{ + return _ih_refGen; + +} +/**********************************************/ +char *getRefGenomeName() +{ + return _ih_refGenName; + +} +/**********************************************/ +int getRefGenomeOffset() +{ + return _ih_refGenOff; + +} +/**********************************************/ +HashTable *getHashTable() +{ + return NULL; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.5/HashTable.h Fri Mar 09 07:35:51 2012 -0500 @@ -0,0 +1,71 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu + */ + + + +#ifndef __HASH_TABLE__ +#define __HASH_TABLE__ + +typedef struct HashTable +{ + long long hv; + int *locs; +} HashTable; + +typedef struct +{ + unsigned int *locs; +} IHashTable; + +int hashVal(char *seq); +void configHashTable(); +char *getRefGenome(); +char *getRefGenomeName(); +int getRefGenomeOffset(); +int initLoadingHashTable(char *fileName); +HashTable *getHashTable(); + +void (*generateHashTable)(char *fileName, char *indexName); +int (*loadHashTable)(double *loadTime, int errThreshould); +void (*finalizeLoadingHashTable)(); +unsigned int *(*getCandidates)(int hv); + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.5/Makefile Fri Mar 09 07:35:51 2012 -0500 @@ -0,0 +1,18 @@ +CC=gcc +CFLAGS = -c -O3 -Wall +LDFLAGS = -lz -lm +SOURCES = baseFAST.c CommandLineParser.c Common.c HashTable.c MrFAST.c Output.c Reads.c RefGenome.c +OBJECTS = $(SOURCES:.c=.o) +EXECUTABLE = mrfast + + +all: $(SOURCES) $(EXECUTABLE) + rm -rf *.o + +$(EXECUTABLE): $(OBJECTS) + $(CC) $(OBJECTS) -o $@ $(LDFLAGS) + +.c.o: + $(CC) $(CFLAGS) $< -o $@ +clean: + rm -f *.o *~ #*
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.5/MrFAST.c Fri Mar 09 07:35:51 2012 -0500 @@ -0,0 +1,7961 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* +Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan +Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu +*/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include <dirent.h> +#include <xmmintrin.h> +#include <emmintrin.h> +#include <mmintrin.h> + + +#include "Common.h" +#include "Reads.h" +#include "HashTable.h" +#include "Output.h" +#include "MrFAST.h" +#include "RefGenome.h" + + +#define min(a,b) ((a)>(b)?(b):(a)) +#define min3(a,b,c) ((a)>(b)?(b>c?c:b):(a>c?c:a)) +#define CHARCODE(a) (a=='A' ? 0 : (a=='C' ? 1 : (a=='G' ? 2 : (a=='T' ? 3 : 4)))) + +#define MAX_REF_SIZE 18 + + +float calculateScore(int index, char *seq, char *qual, char *md); +unsigned char mrFAST = 1; +char *versionNumberF="0.5"; + +long long verificationCnt = 0; +long long mappingCnt = 0; +long long mappedSeqCnt = 0; +long long completedSeqCnt = 0; +char *mappingOutput; +/**********************************************/ +char *_msf_refGen = NULL; +int _msf_refGenLength = 0; +int _msf_refGenOffset = 0; +char *_msf_refGenName = NULL; + +int _msf_refGenBeg; +int _msf_refGenEnd; + +IHashTable *_msf_hashTable = NULL; + +int *_msf_samplingLocs; +int *_msf_samplingLocsEnds; +int _msf_samplingLocsSize; + +Read *_msf_seqList; +int _msf_seqListSize; + +Pair *_msf_sort_seqList = NULL; +int *_msf_map_sort_seqList; + +ReadIndexTable *_msf_rIndex = NULL; +int _msf_rIndexSize; +int _msf_rIndexMax; + +SAM _msf_output; + +OPT_FIELDS *_msf_optionalFields; + +char *_msf_op; + +int *_msf_verifiedLocs = NULL; + +char _msf_numbers[200][3]; +char _msf_cigar[5]; + +MappingInfo *_msf_mappingInfo; + +int *_msf_seqHits; +int _msf_openFiles = 0; +int _msf_maxLSize=0; +int _msf_maxRSize=0; + +BestFullMappingInfo *bestHitMappingInfo; + +/*************************/ +int _msf_maxFile=0; +char _msf_fileName[4000][200][2][FILE_NAME_LENGTH]; +int _msf_fileCount[4000]; + +char *_msf_readHasConcordantMapping; //boolean if a read has concordant mapping :D + +int *_msf_oeaMapping; +int *_msf_discordantMapping; + +FILE *bestConcordantFILE; +FILE *bestDiscordantFILE; + +int counter = 0; + +int scoreF[200][200]; +int scoreB[200][200]; + +int score[200][200]; +int direction1[200][200]; +int direction2[200][200]; + +__m128i MASK; + +int lookUpTable[15625][15625]; + +/**************************************************Methods***************************************************/ +int smallEditDistanceF(char *a, int lena, char *b, int lenb) +{ + int matrix[20][20]; + int i = 0; + int j = 0; + + for(i = 0; i <= lena; i++) + { + matrix[0][i] = i; + } + + for(i = 0; i <= lenb; i++) + { + matrix[i][0] = i; + } + + + for(i = 1; i <= lenb; i++) + { + for(j = 1; j <= lena; j++) + { + matrix[i][j] = min3(matrix[i-1][j-1]+ (a[j-1] != b[i-1]),matrix[i][j-1]+1 ,matrix[i-1][j]+1); + } + } + return (matrix[lenb][lena]>errThreshold?-1:matrix[lenb][lena]); +} + +int smallEditDistanceB(char *a, int lena, char *b, int lenb) +{ + int matrix[20][20]; + int i = 0; + int j = 0; + + for(i = 0; i <= lena; i++) + { + matrix[0][i] = i; + } + + for(i = 0; i <= lenb; i++) + { + matrix[i][0] = i; + } + + + for(i = 1; i <= lenb; i++) + { + for(j = 1; j <= lena; j++) + { + matrix[i][j] = min3(matrix[i-1][j-1]+ (*(a-j+1) != *(b-i+1)),matrix[i][j-1]+1 ,matrix[i-1][j]+1); + } + } + + return (matrix[lenb][lena]>errThreshold?-1:matrix[lenb][lena]); +} + +char fastEditDistance(int per1, int per2) +{ + + int i = 0; + int j = 0; + + char str1[7]; + char str2[7]; + + int val1 = per1; + int val2 = per2; + + int index = 0; + int mod = 0; + + int matrix[7][7]; + + int min = 20; + + while(index < 6) + { + mod = val1%5; + str1[5-index] = (mod==0 ? 'A':(mod==1?'C':mod==2?'G':(mod==3)?'T':'N')); + val1 = val1 /5; + index++; + } + + str1[6] = '\0'; + + index = 0; + while(index < 6) + { + mod=val2%5; + str2[5-index] = (mod==0 ? 'A':(mod==1?'C':mod==2?'G':(mod==3)?'T':'N')); + val2 = val2 / 5; + index++; + } + str2[6] = '\0'; + + for(i = 0; i < 7; i++) + { + matrix[0][i] = i; + matrix[i][0] = i; + } + + for(i = 1; i < 7; i++) + { + for(j = 1; j < 7; j++) + { + matrix[i][j] = min3(matrix[i-1][j-1]+ (str1[i-1] != str2[j-1]),matrix[i][j-1]+1 ,matrix[i-1][j]+1); + } + } + + for(i = 0; i < 7; i++) + { + if(matrix[i][6] < min) + min = matrix[i][6]; + } + + for(i = 0; i < 7; i++) + { + if(matrix[6][i] < min) + min = matrix[6][i]; + } + return min; +} + +void initLookUpTable() +{ + int i = 0; + + MASK = _mm_insert_epi16(MASK,1,0); + MASK = _mm_insert_epi16(MASK,1,1); + MASK = _mm_insert_epi16(MASK,1,2); + MASK = _mm_insert_epi16(MASK,1,3); + MASK = _mm_insert_epi16(MASK,1,4); + MASK = _mm_insert_epi16(MASK,0,5); + MASK = _mm_insert_epi16(MASK,0,6); + MASK = _mm_insert_epi16(MASK,0,7); + + for(i = 0 ; i < errThreshold + 1; i++) + { + scoreF[0][i] = i; + scoreF[i][0] = i; + } + + for(i = 0 ; i < errThreshold + 1; i++) + { + scoreB[0][i] = i; + scoreB[i][0] = i; + } + + +} + +int backwardEditDistanceSSE2Odd(char *a, int lena, char *b,int lenb) +{ + if(lenb == 0 || lena == 0) + return 0; + + int i = 0; + int j = 0; + int k = 0; + + + int e = errThreshold; + + char flag = 0; + + int minError = 2*e; + + __m128i R0, R1; + __m128i Diag; + __m128i Side1, Side2; + __m128i Down1, Down2; + __m128i Error; + __m128i tmp; + + /* initialize */ + R0 = _mm_setzero_si128 (); + R1 = _mm_setzero_si128 (); + Diag = _mm_setzero_si128 (); + Side1 = _mm_setzero_si128 (); + Side2 = _mm_setzero_si128 (); + Down1 = _mm_setzero_si128 (); + Down2 = _mm_setzero_si128 (); + Error = _mm_setzero_si128 (); + tmp = _mm_setzero_si128 (); + /* end initialize */ + + if(lenb <= e) + { + return smallEditDistanceB(a,lena,b,lenb); + } + + + R1 = _mm_xor_si128(R1, R1); + R0 = _mm_xor_si128(R0, R0); + + Diag = _mm_xor_si128(Diag, Diag); + Side1 = _mm_xor_si128(Side1, Side1); + Down1 = _mm_xor_si128(Down1, Down1); + + Diag = _mm_insert_epi16(Diag,2*e,0); + + Side1 = _mm_insert_epi16(Side1,1,0); + Side1 = _mm_insert_epi16(Side1,2*e,1); + + Down1 = _mm_insert_epi16(Down1,2*e,0); + Down1 = _mm_insert_epi16(Down1,1,1); + Down1 = _mm_insert_epi16(Down1,2*e,2); + + R0 = _mm_insert_epi16(R0,0,0); + + R1 = _mm_insert_epi16(R1,1,0); + R1 = _mm_insert_epi16(R1,1,1); + + for(i=2; i <= e; i++) + { + //set side + Side1 = _mm_slli_si128(Side1,2); + Side1 = _mm_insert_epi16(Side1,1,0); + + Down1 = _mm_insert_epi16(Down1,1,0); + Down1 = _mm_slli_si128(Down1,2); + Down1 = _mm_insert_epi16(Down1,2*e,0); + + Diag = _mm_xor_si128(Diag, Diag); + if( i%2 == 0) + { + Diag = _mm_insert_epi16(Diag,2*e,0); + + for(j=1;j<=i-1;j++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-(i/2-1+(i/2-j))) != *(a-(i/2-1-(i/2-j))),0); + } + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, 2*e,0); + + R0 = _mm_min_epi16(R1+Side1, _mm_slli_si128(R0,2)+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down1); + } + + else + { + Diag = _mm_insert_epi16(Diag,2*e,0); + for(j=i/2-1;j>=-i/2;j--) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-((i+1)/2+j-1)) != *(a-((i-1)/2-j-1)),0); + } + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, 2*e,0); + + R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); + } + } + Error = _mm_xor_si128(Error, Error); + Side2 = _mm_xor_si128(Side2, Side2); + Down2 = _mm_xor_si128(Down2, Down2); + Down1 = _mm_xor_si128(Down1, Down1); + + Error = _mm_insert_epi16(Error,e,0); + Side1 = _mm_insert_epi16(Side2,2*e,0); + Side2 = _mm_insert_epi16(Side2,2*e,0); + Down1 = _mm_insert_epi16(Down1,2*e,0); + + + for(j=0; j < e; j++) + { + Side2 = _mm_slli_si128(Side2, 2); + Side2 = _mm_insert_epi16(Side2,1,0); + + Side1 = _mm_slli_si128(Side1, 2); + Side1 = _mm_insert_epi16(Side1,1,0); + + Down1 = _mm_slli_si128(Down1, 2); + Down1 = _mm_insert_epi16(Down1,1,0); + + Down2 = _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,1,0); + + Error = _mm_slli_si128(Error, 2); + Error = _mm_insert_epi16(Error, e, 0); + } + + Down2= _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,2*e,0); + + for(; i <= 2*lenb-(e-1);i++) + { + flag = 0; + Diag = _mm_xor_si128(Diag, Diag); + if( i%2 == 0) + { + for(j=e/2;j>=-e/2;j--) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-(i/2-1+j)) != *(a-(i/2-1-j)),0); + } + + R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag); + R0 = _mm_min_epi16(R0, R1+Down1); + + + if(_mm_extract_epi16(R0,0) <= e) + flag = 1; + tmp = _mm_srli_si128(R0,2); + for(j=0; j <= e;j++) + { + if(_mm_extract_epi16(tmp,0) <= e) + flag = 1; + tmp = _mm_srli_si128(tmp,2); + } + + if(flag == 0) + return -1; + + if(i == 2*lenb-e) + { + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = _mm_extract_epi16(tmp,0); + } + + } + + else + { + for(j=e/2;j>=-e/2-1;j--) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-((i+1)/2+j-1)) != *(a-((i)/2-j-1)),0); + } + + // printf("@%d %d %d %d\n", _mm_extract_epi16(Diag,0), _mm_extract_epi16(Diag,1), _mm_extract_epi16(Diag,2), + // _mm_extract_epi16(Diag,3)); + + R1 = _mm_min_epi16(R0+Side2, R1+Diag); + + // printf("#~%d %d %d %d\n", _mm_extract_epi16(R1,0), _mm_extract_epi16(R1,1), _mm_extract_epi16(R1,2), + // _mm_extract_epi16(R1,3)); + + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); + + // printf("$%d %d %d %d\n", _mm_extract_epi16(Side2,0), _mm_extract_epi16(Side2,1), _mm_extract_epi16(Side2,2), + // _mm_extract_epi16(Side2,3)); + + // printf("#%d %d %d %d\n", _mm_extract_epi16(R1,0), _mm_extract_epi16(R1,1), _mm_extract_epi16(R1,2), + // _mm_extract_epi16(R1,3)); + + + + if(i >= 2*lenb-e) + { + tmp = _mm_srli_si128(R1,2); + for(k=0; k < e-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + } + } + + //first cell + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-3)) != *(a-lena), 0); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-2)) != *(a-(lena-1)), 1); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-1)) != *(a-(lena-2)), 2); + Diag = _mm_insert_epi16(Diag, 2*e, 3); + R1 = _mm_min_epi16(R0+Side2, R1+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); + + minError = min(minError, _mm_extract_epi16(R1,2)); + + //second cell + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-2)) != *(a-(lena)), 0); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-1)) != *(a-(lena-1)), 1); + Diag = _mm_insert_epi16(Diag, 2*e, 2); + + R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag); + R0 = _mm_min_epi16(R0, R1+Down1); + + minError = min(minError, _mm_extract_epi16(R0,1)); + + //third cell + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-2)) != *(a-(lena+1)), 0); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-1)) != *(a-(lena)), 1); + Diag = _mm_insert_epi16(Diag, 2*e, 2); + + R1 = _mm_min_epi16(R0+Side2, R1+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); + + minError = min(minError, _mm_extract_epi16(R1,1)); + + //forth + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-1)) != *(a-(lena+1)), 0); + Diag = _mm_insert_epi16(Diag, 2*e, 1); + + R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag); + R0 = _mm_min_epi16(R0, R1+Down1); + + minError = min(minError, _mm_extract_epi16(R0,0)); + + //fifth + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-1)) != *(a-(lena+2)), 0); + Diag = _mm_insert_epi16(Diag, 2*e, 1); + + R1 = _mm_min_epi16(R0+Side2, R1+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); + + minError = min(minError, _mm_extract_epi16(R0,0)); + + if(minError > e) + return -1; + return minError; +} + +int backwardEditDistanceSSE2G(char *a, int lena, char *b,int lenb) +{ + if(lenb == 0 || lena == 0) + return 0; + + int i = 0; + int j = 0; + int k = 0; + + + int e = errThreshold; + + char flag = 0; + + int minError = 2*e; + + __m128i R0, R1; + __m128i Diag; + __m128i Side1, Side2; + __m128i Down1, Down2; + __m128i Error; + __m128i tmp; + + /* initialize */ + R0 = _mm_setzero_si128 (); + R1 = _mm_setzero_si128 (); + Diag = _mm_setzero_si128 (); + Side1 = _mm_setzero_si128 (); + Side2 = _mm_setzero_si128 (); + Down1 = _mm_setzero_si128 (); + Down2 = _mm_setzero_si128 (); + Error = _mm_setzero_si128 (); + tmp = _mm_setzero_si128 (); + /* end initialize */ + + if(lenb <= e) + { + return smallEditDistanceB(a,lena,b,lenb); + } + + + R1 = _mm_xor_si128(R1, R1); + R0 = _mm_xor_si128(R0, R0); + + Diag = _mm_xor_si128(Diag, Diag); + Side1 = _mm_xor_si128(Side1, Side1); + Down1 = _mm_xor_si128(Down1, Down1); + + Diag = _mm_insert_epi16(Diag,2*e,0); + + Side1 = _mm_insert_epi16(Side1,1,0); + Side1 = _mm_insert_epi16(Side1,2*e,1); + + Down1 = _mm_insert_epi16(Down1,2*e,0); + Down1 = _mm_insert_epi16(Down1,1,1); + Down1 = _mm_insert_epi16(Down1,2*e,2); + + R0 = _mm_insert_epi16(R0,0,0); + + R1 = _mm_insert_epi16(R1,1,0); + R1 = _mm_insert_epi16(R1,1,1); + + for(i=2; i <= e; i++) + { + //set side + Side1 = _mm_slli_si128(Side1,2); + Side1 = _mm_insert_epi16(Side1,1,0); + + Down1 = _mm_insert_epi16(Down1,1,0); + Down1 = _mm_slli_si128(Down1,2); + Down1 = _mm_insert_epi16(Down1,2*e,0); + + Diag = _mm_xor_si128(Diag, Diag); + if( i%2 == 0) + { + Diag = _mm_insert_epi16(Diag,2*e,0); + + for(j=1;j<=i-1;j++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-(i/2-1+(i/2-j))) != *(a-(i/2-1-(i/2-j))),0); + } + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, 2*e,0); + + R0 = _mm_min_epi16(R1+Side1, _mm_slli_si128(R0,2)+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down1); + } + + else + { + Diag = _mm_insert_epi16(Diag,2*e,0); + for(j=i/2-1;j>=-i/2;j--) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-((i+1)/2+j-1)) != *(a-((i-1)/2-j-1)),0); + } + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, 2*e,0); + + R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); + } + } + Error = _mm_xor_si128(Error, Error); + Side2 = _mm_xor_si128(Side2, Side2); + Down2 = _mm_xor_si128(Down2, Down2); + Down1 = _mm_xor_si128(Down1, Down1); + + Error = _mm_insert_epi16(Error,e,0); + Side2 = _mm_insert_epi16(Side2,2*e,0); + Down1 = _mm_insert_epi16(Down1,2*e,0); + + + for(j=0; j < e; j++) + { + Side2 = _mm_slli_si128(Side2, 2); + Side2 = _mm_insert_epi16(Side2,1,0); + + Down1 = _mm_slli_si128(Down1, 2); + Down1 = _mm_insert_epi16(Down1,1,0); + + Down2 = _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,1,0); + + Error = _mm_slli_si128(Error, 2); + Error = _mm_insert_epi16(Error, e, 0); + } + + Down2= _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,2*e,0); + + for(; i <= 2*lenb-(e-1);i++) + { + flag = 0; + Diag = _mm_xor_si128(Diag, Diag); + if( i%2 == 0) + { + for(j=e/2;j>=-e/2;j--) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-(i/2-1+j)) != *(a-(i/2-1-j)),0); + } + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + if(_mm_extract_epi16(R0,0) <= e) + flag = 1; + tmp = _mm_srli_si128(R0,2); + for(j=0; j <= e;j++) + { + if(_mm_extract_epi16(tmp,0) <= e) + flag = 1; + tmp = _mm_srli_si128(tmp,2); + } + + if(flag == 0) + return -1; + + if(i == 2*lenb-e) + { + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = _mm_extract_epi16(tmp,0); + } + + } + + else + { + for(j=-e/2+1;j<=e/2;j++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-((i+1)/2-j-1)) != *(a-((i-1)/2+j-1)),0); + } + + R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); + R1 = _mm_min_epi16(R1, R0+Down1); + + + if(i >= 2*lenb-e) + { + tmp = _mm_srli_si128(R1,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + } + } + + j=0; + int tmpE = e; + for(;j<2*(e-2)+1;j++) + { + + Diag = _mm_xor_si128(Diag, Diag); + //set the first element + if(j==0) + { + for( k=0;k<=e-1;k++ ) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0); + } + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + + tmpE--; + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + else if(j%2 == 0) + { + for(k=0;k<tmpE;k++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0); + } + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + tmpE--; + + tmp = _mm_srli_si128(R0,2); + for(k=0; k < tmpE-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + + + else + { + for(k=0;k<tmpE;k++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0); + } + + R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); + R1 = _mm_min_epi16(R1, R0+Down1); + + tmp = _mm_srli_si128(R1,2); + for(k=0; k < tmpE-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + i++; + } + //Diag + + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, 2*e, 0); + Diag = _mm_insert_epi16(Diag, *(a-(lenb+e-2)) != *(b-(lenb-1)), 1); + + Side1 = _mm_insert_epi16(Side1,1,0); + Side1 = _mm_insert_epi16(Side1,1,1); + + Down1 = _mm_insert_epi16(Down1, 2*e, 0); + Down1 = _mm_insert_epi16(Down1, 1, 1); + + R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); + + minError = min(minError, _mm_extract_epi16(R1,1)); + + Diag = _mm_insert_epi16(Diag, *(a-(lenb+e-1)) != *(b-(lenb-1)), 0); + Down1 = _mm_insert_epi16(Down1, 1, 0); + + R0 = _mm_min_epi16(R1+Down1,R0+Diag); + R0 = _mm_min_epi16(R0,_mm_srli_si128(R1,2)+Side1); + + minError = min(minError, _mm_extract_epi16(R0,0)); + + if(minError > e) + return -1; + return minError; +} + +inline int backwardEditDistanceSSE2Extention(char *a, int lena, char *b,int lenb) +{ + if(lenb == 0 || lena == 0) + return 0; + + int i = 0; + int j = 0; + int k = 0; + + int i0; + int i1; + int i2; + int i4; + int i5; + + int e = 4; + int mismatch = errThreshold; + + int minError = 2*errThreshold; + int index = 0; + int tmpValue = 0; + + if(lenb <= e) + { + return smallEditDistanceB(a,lena,b,lenb); + } + + + __m128i R0, R1; + __m128i Diag; + __m128i Side1, Side2; + __m128i Down1, Down2; + __m128i tmp; + __m128i SeqA, SeqB; + __m128i Result; + + /* initialize */ + R0 = _mm_setzero_si128 (); + R1 = _mm_setzero_si128 (); + Diag = _mm_setzero_si128 (); + Side1 = _mm_setzero_si128 (); + Side2 = _mm_setzero_si128 (); + Down1 = _mm_setzero_si128 (); + Down2 = _mm_setzero_si128 (); + SeqA = _mm_setzero_si128 (); + SeqB = _mm_setzero_si128 (); + Result = _mm_setzero_si128 (); + /* end initialize */ + + R1 = _mm_xor_si128(R1, R1); + R0 = _mm_xor_si128(R0, R0); + + Diag = _mm_xor_si128(Diag, Diag); + Diag = _mm_insert_epi16(Diag,minError,0); + + i0 = (a[0] != b[0]); + i1 = min(i0, ( *(a-1)!=*b) )+1; + i2 = min(i0,( a[0] != *(b-1) ) )+1; + + i0 = min3( i0+ ( *(a-1)!=*(b-1) ),i1+1,i2+1); + i4 = min(i1, ( *(a-2)!=b[0] )+1)+1; + i5 = min(i2, (a[0] != *(b-2))+1)+1; + + R1 = _mm_insert_epi16(R1, 3, 0); + R1 = _mm_insert_epi16(R1, i1, 1); + R1 = _mm_insert_epi16(R1, i2, 2); + R1 = _mm_insert_epi16(R1, 3, 3); + + + R0 = _mm_insert_epi16(R0, 4, 0); + R0 = _mm_insert_epi16(R0, i4, 1); + R0 = _mm_insert_epi16(R0, i0, 2); + R0 = _mm_insert_epi16(R0, i5, 3); + R0 = _mm_insert_epi16(R0, 4, 4); + + + Side2 = _mm_xor_si128(Side2, Side2); + Down2 = _mm_xor_si128(Down2, Down2); + Down1 = _mm_xor_si128(Down1, Down1); + Side1 = _mm_xor_si128(Side1, Side1); + + Side2 = _mm_insert_epi16(Side2,minError,0); + Down1 = _mm_insert_epi16(Down1,minError,0); + + Side1 = _mm_insert_epi16(Side1,1,0); + + index = 0; + for(j=0; j < e; j++) + { + Side2 = _mm_slli_si128(Side2, 2); + Side2 = _mm_insert_epi16(Side2,1,0); + + Down1 = _mm_slli_si128(Down1, 2); + Down1 = _mm_insert_epi16(Down1,1,0); + + Down2 = _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,1,0); + + Side1 = _mm_slli_si128(Side1, 2); + Side1 = _mm_insert_epi16(Side1,1,0); + + SeqA = _mm_slli_si128(SeqA, 2); + SeqB = _mm_slli_si128(SeqB, 2); + SeqA = _mm_insert_epi16(SeqA,*(a-index),0); + SeqB = _mm_insert_epi16(SeqB,*(b-index),0); + index++; + } + + Down2= _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,minError,0); + + index = 4; + i = 5; + + int loopEnd = 2*lenb-(e-1); + for(; i <= loopEnd ;i++) + { + + Diag = _mm_xor_si128(Diag, Diag); + if( i%2 == 0) + { + SeqA = _mm_slli_si128(SeqA, 2); + SeqB = _mm_slli_si128(SeqB, 2); + SeqA = _mm_insert_epi16(SeqA,*(a-(index)),0); + SeqB = _mm_insert_epi16(SeqB,*(b-(index)),0); + + index++; + + tmp = _mm_shufflelo_epi16(SeqB,27); + tmp = _mm_slli_si128(tmp, 2); + tmpValue = _mm_extract_epi16(tmp, 5); + tmp = _mm_insert_epi16(tmp, tmpValue, 0); + + Result = _mm_cmpeq_epi16(SeqA, tmp); + Diag = _mm_andnot_si128(Result, MASK); + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + if(_mm_extract_epi16(R0, 0) > errThreshold && _mm_extract_epi16(R0, 1) > errThreshold && _mm_extract_epi16(R0, 2) > errThreshold + && _mm_extract_epi16(R0, 3) > errThreshold && _mm_extract_epi16(R0, 4) > errThreshold && _mm_extract_epi16(R1, 0) > errThreshold + && _mm_extract_epi16(R1, 1) > errThreshold && _mm_extract_epi16(R1, 2) > errThreshold && _mm_extract_epi16(R1, 3) > errThreshold) + return -1; + + if(i == 2*lenb-e) + { + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = _mm_extract_epi16(tmp,0); + } + + } + + else + { + Result = _mm_cmpeq_epi16(SeqA, _mm_shufflelo_epi16(SeqB,27)); + Diag = _mm_andnot_si128(Result, MASK); + + R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); + R1 = _mm_min_epi16(R1, R0+Down1); + + + if(i >= 2*lenb-e) + { + tmp = _mm_srli_si128(R1,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + } + + + } + + j=0; + int tmpE = e; + for(;j<2*(e-2)+1;j++) + { + + Diag = _mm_xor_si128(Diag, Diag); + //set the first element + if(j==0) + { + for( k=0;k<=e-1;k++ ) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0); + } + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + tmpE--; + + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + else if(j%2 == 0) + { + for(k=0;k<tmpE;k++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0); + } + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + tmpE--; + + tmp = _mm_srli_si128(R0,2); + for(k=0; k < tmpE-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + + + else + { + for(k=0;k<tmpE;k++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0); + } + + R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); + R1 = _mm_min_epi16(R1, R0+Down1); + + tmp = _mm_srli_si128(R1,2); + for(k=0; k < tmpE-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + i++; + } + //Diag + + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, 2*errThreshold, 0); + Diag = _mm_insert_epi16(Diag, *(a-(lenb+e-2)) != *(b-(lenb-1)), 1); + + Side1 = _mm_insert_epi16(Side1,1,0); + Side1 = _mm_insert_epi16(Side1,1,1); + + Down1 = _mm_insert_epi16(Down1, 2*errThreshold, 0); + Down1 = _mm_insert_epi16(Down1, 1, 1); + + R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); + + minError = min(minError, _mm_extract_epi16(R1,1)); + + Diag = _mm_insert_epi16(Diag, *(a-(lenb+e-1)) != *(b-(lenb-1)), 0); + Down1 = _mm_insert_epi16(Down1, 1, 0); + + R0 = _mm_min_epi16(R1+Down1,R0+Diag); + R0 = _mm_min_epi16(R0,_mm_srli_si128(R1,2)+Side1); + + minError = min(minError, _mm_extract_epi16(R0,0)); + + if(minError > mismatch) + return -1; + return minError; +} + +int backwardEditDistance4SSE2(char *a, int lena, char *b,int lenb) +{ + if(lenb == 0 || lena == 0) + return 0; + + int i = 0; + int j = 0; + int k = 0; + + int i0; + int i1; + int i2; + int i4; + int i5; + + int e = errThreshold; + + int minError = 2*e; + int index = 0; + int tmpValue = 0; + + if(lenb <= e) + { + return smallEditDistanceB(a,lena,b,lenb); + } + + __m128i R0, R1; + __m128i Diag; + __m128i Side1, Side2; + __m128i Down1, Down2; + __m128i tmp; + __m128i SeqA, SeqB; + __m128i Result; + + /* initialize */ + R0 = _mm_setzero_si128 (); + R1 = _mm_setzero_si128 (); + Diag = _mm_setzero_si128 (); + Side1 = _mm_setzero_si128 (); + Side2 = _mm_setzero_si128 (); + Down1 = _mm_setzero_si128 (); + Down2 = _mm_setzero_si128 (); + SeqA = _mm_setzero_si128 (); + SeqB = _mm_setzero_si128 (); + Result = _mm_setzero_si128 (); + /* end initialize */ + + R1 = _mm_xor_si128(R1, R1); + R0 = _mm_xor_si128(R0, R0); + + Diag = _mm_xor_si128(Diag, Diag); + Diag = _mm_insert_epi16(Diag,2*e,0); + + i0 = (a[0] != b[0]); + i1 = min(i0, ( *(a-1)!=*b) )+1; + i2 = min(i0,( a[0] != *(b-1) ) )+1; + + i0 = min3( i0+ ( *(a-1)!=*(b-1) ),i1+1,i2+1); + i4 = min(i1, ( *(a-2)!=b[0] )+1)+1; + i5 = min(i2, (a[0] != *(b-2))+1)+1; + + R1 = _mm_insert_epi16(R1, 3, 0); + R1 = _mm_insert_epi16(R1, i1, 1); + R1 = _mm_insert_epi16(R1, i2, 2); + R1 = _mm_insert_epi16(R1, 3, 3); + + + R0 = _mm_insert_epi16(R0, 4, 0); + R0 = _mm_insert_epi16(R0, i4, 1); + R0 = _mm_insert_epi16(R0, i0, 2); + R0 = _mm_insert_epi16(R0, i5, 3); + R0 = _mm_insert_epi16(R0, 4, 4); + + Side2 = _mm_xor_si128(Side2, Side2); + Down2 = _mm_xor_si128(Down2, Down2); + Down1 = _mm_xor_si128(Down1, Down1); + Side1 = _mm_xor_si128(Side1, Side1); + + Side2 = _mm_insert_epi16(Side2,2*e,0); + Down1 = _mm_insert_epi16(Down1,2*e,0); + + Side1 = _mm_insert_epi16(Side1,1,0); + + index = 0; + for(j=0; j < e; j++) + { + Side2 = _mm_slli_si128(Side2, 2); + Side2 = _mm_insert_epi16(Side2,1,0); + + Down1 = _mm_slli_si128(Down1, 2); + Down1 = _mm_insert_epi16(Down1,1,0); + + Down2 = _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,1,0); + + Side1 = _mm_slli_si128(Side1, 2); + Side1 = _mm_insert_epi16(Side1,1,0); + + SeqA = _mm_slli_si128(SeqA, 2); + SeqB = _mm_slli_si128(SeqB, 2); + SeqA = _mm_insert_epi16(SeqA,*(a-index),0); + SeqB = _mm_insert_epi16(SeqB,*(b-index),0); + index++; + } + + Down2= _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,2*e,0); + + index = 4; + i = 5; + int loopEnd = 2*lenb-(e-1); + for(; i <= loopEnd ;i++) + { + + Diag = _mm_xor_si128(Diag, Diag); + if( i%2 == 0) + { + SeqA = _mm_slli_si128(SeqA, 2); + SeqB = _mm_slli_si128(SeqB, 2); + SeqA = _mm_insert_epi16(SeqA,*(a-(index)),0); + SeqB = _mm_insert_epi16(SeqB,*(b-(index)),0); + + index++; + + tmp = _mm_shufflelo_epi16(SeqB,27); + tmp = _mm_slli_si128(tmp, 2); + tmpValue = _mm_extract_epi16(tmp, 5); + tmp = _mm_insert_epi16(tmp, tmpValue, 0); + + Result = _mm_cmpeq_epi16(SeqA, tmp); + Diag = _mm_andnot_si128(Result, MASK); + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + //tmp = _mm_sub_epi16(Error, R0); + //i0 = _mm_movemask_epi8(tmp); + + if( _mm_extract_epi16(R0, 0) > e && _mm_extract_epi16(R0, 1) > e && _mm_extract_epi16(R0, 2) > e + && _mm_extract_epi16(R0, 3) > e && _mm_extract_epi16(R0, 4) > e && _mm_extract_epi16(R1, 0) > e && + _mm_extract_epi16(R1, 1) > e && _mm_extract_epi16(R1, 2) > e && _mm_extract_epi16(R1, 3) > e ) + return -1; + + if(i == 2*lenb-e) + { + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = _mm_extract_epi16(tmp,0); + } + + } + + else + { + Result = _mm_cmpeq_epi16(SeqA, _mm_shufflelo_epi16(SeqB,27)); + Diag = _mm_andnot_si128(Result, MASK); + + R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); + R1 = _mm_min_epi16(R1, R0+Down1); + + if(i >= 2*lenb-e) + { + tmp = _mm_srli_si128(R1,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + } + + + } + + j=0; + + int tmpE = e; + + for(;j<2*(e-2)+1;j++) + { + + Diag = _mm_xor_si128(Diag, Diag); + //set the first element + if(j==0) + { + for( k=0;k<=e-1;k++ ) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0); + } + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + tmpE--; + + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + else if(j%2 == 0) + { + for(k=0;k<tmpE;k++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0); + } + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + tmpE--; + + tmp = _mm_srli_si128(R0,2); + for(k=0; k < tmpE-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + + + else + { + for(k=0;k<tmpE;k++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, *(b-(lenb-1-k)) != *(a-((i-lenb)-1+k)),0); + } + + R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); + R1 = _mm_min_epi16(R1, R0+Down1); + + tmp = _mm_srli_si128(R1,2); + for(k=0; k < tmpE-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + i++; + } + //Diag + + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, 2*e, 0); + Diag = _mm_insert_epi16(Diag, *(a-(lenb+e-2)) != *(b-(lenb-1)), 1); + + Side1 = _mm_insert_epi16(Side1,1,0); + Side1 = _mm_insert_epi16(Side1,1,1); + + Down1 = _mm_insert_epi16(Down1, 2*e, 0); + Down1 = _mm_insert_epi16(Down1, 1, 1); + + R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); + + minError = min(minError, _mm_extract_epi16(R1,1)); + + Diag = _mm_insert_epi16(Diag, *(a-(lenb+e-1)) != *(b-(lenb-1)), 0); + Down1 = _mm_insert_epi16(Down1, 1, 0); + + R0 = _mm_min_epi16(R1+Down1,R0+Diag); + R0 = _mm_min_epi16(R0,_mm_srli_si128(R1,2)+Side1); + + minError = min(minError, _mm_extract_epi16(R0,0)); + + if(minError > e) + return -1; + return minError; +} + +inline int forwardEditDistanceSSE2Extention(char *a, int lena, char *b,int lenb) +{ + if(lenb == 0 || lena == 0) + return 0; + + int i = 0; + int j = 0; + int k = 0; + + int i0=0; + int i1=0; + int i2=0; + int i4=0; + int i5=0; + + int mismatch = errThreshold; + int e = 4; + + int minError = 4*mismatch+1; + int index = 0; + int tmpValue = 0; + + if(lenb <= e) + { + return smallEditDistanceF(a,lena,b,lenb); + } + + + register __m128i R0, R1; + __m128i Diag; + __m128i Side1, Side2; + __m128i Down1, Down2; + __m128i tmp; + register __m128i SeqA, SeqB; + __m128i Result; + + __m128i tmpSeqA; + __m128i tmpSeqB; + + /* initialize */ + R0 = _mm_setzero_si128 (); + R1 = _mm_setzero_si128 (); + Diag = _mm_setzero_si128 (); + Side1 = _mm_setzero_si128 (); + Side2 = _mm_setzero_si128 (); + Down1 = _mm_setzero_si128 (); + Down2 = _mm_setzero_si128 (); + SeqA = _mm_setzero_si128 (); + SeqB = _mm_setzero_si128 (); + Result = _mm_setzero_si128 (); + /* end initialize */ + + + R1 = _mm_xor_si128(R1, R1); + R0 = _mm_xor_si128(R0, R0); + + Diag = _mm_xor_si128(Diag, Diag); + Diag = _mm_insert_epi16(Diag,minError,0); + + i0 = (a[0] != b[0]); + i1 = min(i0, (a[1]!=b[0]))+1; + i2 = min(i0,(a[0]!=b[1]))+1; + + i0 = min3(i0+(a[1]!=b[1]),i1+1,i2+1); + i4 = min(i1, (a[2]!=b[0])+1)+1; + i5 = min(i2, (a[0]!=b[2])+1)+1; + + R1 = _mm_insert_epi16(R1, 3, 0); + R1 = _mm_insert_epi16(R1, i1, 1); + R1 = _mm_insert_epi16(R1, i2, 2); + R1 = _mm_insert_epi16(R1, 3, 3); + + R0 = _mm_insert_epi16(R0, 4, 0); + R0 = _mm_insert_epi16(R0, i4, 1); + R0 = _mm_insert_epi16(R0, i0, 2); + R0 = _mm_insert_epi16(R0, i5, 3); + R0 = _mm_insert_epi16(R0, 4, 4); + + Side2 = _mm_xor_si128(Side2, Side2); + Down2 = _mm_xor_si128(Down2, Down2); + Down1 = _mm_xor_si128(Down1, Down1); + Side1 = _mm_xor_si128(Side1, Side1); + + Side2 = _mm_insert_epi16(Side2,minError,0); + Down1 = _mm_insert_epi16(Down1,minError,0); + + Side1 = _mm_insert_epi16(Side1,1,0); + + index = 0; + for(j=0; j < e; j++) + { + Side2 = _mm_slli_si128(Side2, 2); + Side2 = _mm_insert_epi16(Side2,1,0); + + Down1 = _mm_slli_si128(Down1, 2); + Down1 = _mm_insert_epi16(Down1,1,0); + + Down2 = _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,1,0); + + Side1 = _mm_slli_si128(Side1, 2); + Side1 = _mm_insert_epi16(Side1,1,0); + + SeqA = _mm_slli_si128(SeqA, 2); + SeqB = _mm_slli_si128(SeqB, 2); + SeqA = _mm_insert_epi16(SeqA,a[index],0); + SeqB = _mm_insert_epi16(SeqB,b[index],0); + index++; + } + + Down2= _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,minError,0); + + index = 4; + i = 5; + + int loopEnd = 2*lenb-(e-1); + for(; i <= loopEnd ;i++) + { + if( i%2 == 0) + { + tmpSeqA = _mm_slli_si128(SeqA, 2); + tmpSeqB = _mm_slli_si128(SeqB, 2); + SeqA = _mm_insert_epi16(tmpSeqA,a[index],0); + SeqB = _mm_insert_epi16(tmpSeqB,b[index],0); + + index++; + + tmp = _mm_shufflelo_epi16(SeqB,27); + tmp = _mm_slli_si128(tmp, 2); + tmpValue = _mm_extract_epi16(tmp, 5); + tmp = _mm_insert_epi16(tmp, tmpValue, 0); + + Result = _mm_cmpeq_epi16(SeqA, tmp); + Diag = _mm_andnot_si128(Result, MASK); + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + if(_mm_extract_epi16(R0, 0) > errThreshold && _mm_extract_epi16(R0, 1) > errThreshold && _mm_extract_epi16(R0, 2) > errThreshold + && _mm_extract_epi16(R0, 3) > errThreshold && _mm_extract_epi16(R0, 4) > errThreshold && + _mm_extract_epi16(R1, 0) > errThreshold && _mm_extract_epi16(R1, 1) > errThreshold && + _mm_extract_epi16(R1, 2) > errThreshold && _mm_extract_epi16(R1, 3) > errThreshold) + return -1; + + if(i == 2*lenb-e) + { + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = _mm_extract_epi16(tmp,0); + } + + } + + else + { + Result = _mm_cmpeq_epi16(SeqA, _mm_shufflelo_epi16(SeqB,27)); + Diag = _mm_andnot_si128(Result, MASK); + + R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); + R1 = _mm_min_epi16(R1, R0+Down1); + + if(i >= 2*lenb-e) + { + tmp = _mm_srli_si128(R1,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + } + } + + j=0; + int tmpE = e; + for(;j<2*(e-2)+1;j++) + { + + Diag = _mm_xor_si128(Diag, Diag); + //set the first element + if(j==0) + { + for( k=0;k<=e-1;k++ ) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0); + } + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + tmpE--; + + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + else if(j%2 == 0) + { + for(k=0;k<tmpE;k++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0); + } + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + tmpE--; + + tmp = _mm_srli_si128(R0,2); + for(k=0; k < tmpE-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + + + else + { + for(k=0;k<tmpE;k++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0); + } + + R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); + R1 = _mm_min_epi16(R1, R0+Down1); + + tmp = _mm_srli_si128(R1,2); + for(k=0; k < tmpE-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + i++; + } + //Diag + + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, minError, 0); + Diag = _mm_insert_epi16(Diag, a[lenb+e-2] != b[lenb-1], 1); + + Side1 = _mm_insert_epi16(Side1,1,0); + Side1 = _mm_insert_epi16(Side1,1,1); + + Down1 = _mm_insert_epi16(Down1, minError, 0); + Down1 = _mm_insert_epi16(Down1, 1, 1); + + R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); + + minError = min(minError, _mm_extract_epi16(R1,1)); + + Diag = _mm_insert_epi16(Diag, a[lenb+e-1] != b[lenb-1], 0); + Down1 = _mm_insert_epi16(Down1, 1, 0); + + R0 = _mm_min_epi16(R1+Down1,R0+Diag); + R0 = _mm_min_epi16(R0,_mm_srli_si128(R1,2)+Side1); + + + minError = min(minError, _mm_extract_epi16(R0,0)); + + + if(minError > mismatch) + return -1; + return minError; +} + + + +int forwardEditDistance4SSE2(char *a, int lena, char *b,int lenb) +{ + if(lenb == 0 || lena == 0) + return 0; + + int i = 0; + int j = 0; + int k = 0; + + int i0=0; + int i1=0; + int i2=0; + int i4=0; + int i5=0; + + int e = errThreshold; + + int minError = 2*e; + int index = 0; + int tmpValue = 0; + + if(lenb <= e) + { + return smallEditDistanceF(a,lena,b,lenb); + } + + + register __m128i R0, R1; + __m128i Diag; + __m128i Side1, Side2; + __m128i Down1, Down2; + __m128i tmp; + register __m128i SeqA, SeqB; + __m128i Result; + + __m128i tmpSeqA; + __m128i tmpSeqB; + + /* initialize */ + R0 = _mm_setzero_si128 (); + R1 = _mm_setzero_si128 (); + Diag = _mm_setzero_si128 (); + Side1 = _mm_setzero_si128 (); + Side2 = _mm_setzero_si128 (); + Down1 = _mm_setzero_si128 (); + Down2 = _mm_setzero_si128 (); + SeqA = _mm_setzero_si128 (); + SeqB = _mm_setzero_si128 (); + Result = _mm_setzero_si128 (); + /* end initialize */ + + R1 = _mm_xor_si128(R1, R1); + R0 = _mm_xor_si128(R0, R0); + + Diag = _mm_xor_si128(Diag, Diag); + Diag = _mm_insert_epi16(Diag,2*e,0); + + i0 = (a[0] != b[0]); + i1 = min(i0, (a[1]!=b[0]))+1; + i2 = min(i0,(a[0]!=b[1]))+1; + + i0 = min3(i0+(a[1]!=b[1]),i1+1,i2+1); + i4 = min(i1, (a[2]!=b[0])+1)+1; + i5 = min(i2, (a[0]!=b[2])+1)+1; + + R1 = _mm_insert_epi16(R1, 3, 0); + R1 = _mm_insert_epi16(R1, i1, 1); + R1 = _mm_insert_epi16(R1, i2, 2); + R1 = _mm_insert_epi16(R1, 3, 3); + + R0 = _mm_insert_epi16(R0, 4, 0); + R0 = _mm_insert_epi16(R0, i4, 1); + R0 = _mm_insert_epi16(R0, i0, 2); + R0 = _mm_insert_epi16(R0, i5, 3); + R0 = _mm_insert_epi16(R0, 4, 4); + + Side2 = _mm_xor_si128(Side2, Side2); + Down2 = _mm_xor_si128(Down2, Down2); + Down1 = _mm_xor_si128(Down1, Down1); + Side1 = _mm_xor_si128(Side1, Side1); + + Side2 = _mm_insert_epi16(Side2,2*e,0); + Down1 = _mm_insert_epi16(Down1,2*e,0); + + Side1 = _mm_insert_epi16(Side1,1,0); + + index = 0; + for(j=0; j < e; j++) + { + Side2 = _mm_slli_si128(Side2, 2); + Side2 = _mm_insert_epi16(Side2,1,0); + + Down1 = _mm_slli_si128(Down1, 2); + Down1 = _mm_insert_epi16(Down1,1,0); + + Down2 = _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,1,0); + + Side1 = _mm_slli_si128(Side1, 2); + Side1 = _mm_insert_epi16(Side1,1,0); + + SeqA = _mm_slli_si128(SeqA, 2); + SeqB = _mm_slli_si128(SeqB, 2); + SeqA = _mm_insert_epi16(SeqA,a[index],0); + SeqB = _mm_insert_epi16(SeqB,b[index],0); + index++; + } + + Down2= _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,2*e,0); + + index = 4; + i = 5; + + int loopEnd = 2*lenb-(e-1); + for(; i <= loopEnd ;i++) + { + //Diag = _mm_xor_si128(Diag, Diag); + if( i%2 == 0) + { + tmpSeqA = _mm_slli_si128(SeqA, 2); + tmpSeqB = _mm_slli_si128(SeqB, 2); + SeqA = _mm_insert_epi16(tmpSeqA,a[index],0); + SeqB = _mm_insert_epi16(tmpSeqB,b[index],0); + + index++; + + tmp = _mm_shufflelo_epi16(SeqB,27); + tmp = _mm_slli_si128(tmp, 2); + tmpValue = _mm_extract_epi16(tmp, 5); + tmp = _mm_insert_epi16(tmp, tmpValue, 0); + + Result = _mm_cmpeq_epi16(SeqA, tmp); + Diag = _mm_andnot_si128(Result, MASK); + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + if(_mm_extract_epi16(R0, 0) > e && _mm_extract_epi16(R0, 1) > e && _mm_extract_epi16(R0, 2) > e + && _mm_extract_epi16(R0, 3) > e && _mm_extract_epi16(R0, 4) > e && _mm_extract_epi16(R1, 0) > e && + _mm_extract_epi16(R1, 1) > e && _mm_extract_epi16(R1, 2) > e && _mm_extract_epi16(R1, 3) > e) + return -1; + + if(i == 2*lenb-e) + { + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = _mm_extract_epi16(tmp,0); + } + + } + + else + { + Result = _mm_cmpeq_epi16(SeqA, _mm_shufflelo_epi16(SeqB,27)); + Diag = _mm_andnot_si128(Result, MASK); + + R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); + R1 = _mm_min_epi16(R1, R0+Down1); + + if(i >= 2*lenb-e) + { + tmp = _mm_srli_si128(R1,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + } + + + } + j=0; + int tmpE = e; + for(;j<2*(e-2)+1;j++) + { + + Diag = _mm_xor_si128(Diag, Diag); + //set the first element + if(j==0) + { + for( k=0;k<=e-1;k++ ) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0); + } + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + tmpE--; + + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + else if(j%2 == 0) + { + for(k=0;k<tmpE;k++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0); + } + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + tmpE--; + + tmp = _mm_srli_si128(R0,2); + for(k=0; k < tmpE-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + + + else + { + for(k=0;k<tmpE;k++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0); + } + + R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); + R1 = _mm_min_epi16(R1, R0+Down1); + + tmp = _mm_srli_si128(R1,2); + for(k=0; k < tmpE-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + i++; + } + //Diag + + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, 2*e, 0); + Diag = _mm_insert_epi16(Diag, a[lenb+e-2] != b[lenb-1], 1); + + Side1 = _mm_insert_epi16(Side1,1,0); + Side1 = _mm_insert_epi16(Side1,1,1); + + Down1 = _mm_insert_epi16(Down1, 2*e, 0); + Down1 = _mm_insert_epi16(Down1, 1, 1); + + R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); + + minError = min(minError, _mm_extract_epi16(R1,1)); + + Diag = _mm_insert_epi16(Diag, a[lenb+e-1] != b[lenb-1], 0); + Down1 = _mm_insert_epi16(Down1, 1, 0); + + R0 = _mm_min_epi16(R1+Down1,R0+Diag); + R0 = _mm_min_epi16(R0,_mm_srli_si128(R1,2)+Side1); + + minError = min(minError, _mm_extract_epi16(R0,0)); + + if(minError > e) + return -1; + return minError; +} + +int forwardEditDistanceSSE2Odd(char *a, int lena, char *b,int lenb) +{ + if(lenb == 0 || lena == 0) + return 0; + + int i = 0; + int j = 0; + int k = 0; + + int e = errThreshold; + + int minError = 2*e; + + char flag = 0; + + if(lenb <= e) + { + return smallEditDistanceF(a,lena,b,lenb); + } + + + __m128i R0, R1; + __m128i Diag; + __m128i Side1, Side2; + __m128i Down1, Down2; + __m128i Error; + __m128i tmp; + + /* initialize */ + R0 = _mm_setzero_si128 (); + R1 = _mm_setzero_si128 (); + Diag = _mm_setzero_si128 (); + Side1 = _mm_setzero_si128 (); + Side2 = _mm_setzero_si128 (); + Down1 = _mm_setzero_si128 (); + Down2 = _mm_setzero_si128 (); + Error = _mm_setzero_si128 (); + tmp = _mm_setzero_si128 (); + /* end initialize */ + + R1 = _mm_xor_si128(R1, R1); + R0 = _mm_xor_si128(R0, R0); + + Diag = _mm_xor_si128(Diag, Diag); + Side1 = _mm_xor_si128(Side1, Side1); + Down1 = _mm_xor_si128(Down1, Down1); + + Diag = _mm_insert_epi16(Diag,2*e,0); + + Side1 = _mm_insert_epi16(Side1,1,0); + Side1 = _mm_insert_epi16(Side1,2*e,1); + + Down1 = _mm_insert_epi16(Down1,2*e,0); + Down1 = _mm_insert_epi16(Down1,1,1); + Down1 = _mm_insert_epi16(Down1,2*e,2); + + R0 = _mm_insert_epi16(R0,0,0); + + R1 = _mm_insert_epi16(R1,1,0); + R1 = _mm_insert_epi16(R1,1,1); + + for(i=2; i <= e; i++) + { + //set side + Side1 = _mm_slli_si128(Side1,2); + Side1 = _mm_insert_epi16(Side1,1,0); + + Down1 = _mm_insert_epi16(Down1,1,0); + Down1 = _mm_slli_si128(Down1,2); + Down1 = _mm_insert_epi16(Down1,2*e,0); + + Diag = _mm_xor_si128(Diag, Diag); + if( i%2 == 0) + { + Diag = _mm_insert_epi16(Diag,2*e,0); + + for(j=1;j<=i-1;j++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[i/2-1+(i/2-j)] != a[i/2-1-(i/2-j)],0); + } + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, 2*e,0); + + R0 = _mm_min_epi16(R1+Side1, _mm_slli_si128(R0,2)+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down1); + + } + + else + { + Diag = _mm_insert_epi16(Diag,2*e,0); + for(j=i/2-1;j>=-i/2;j--) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[(i+1)/2+j-1] != a[(i-1)/2-j-1],0); + } + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, 2*e,0); + + R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); + + } + } + Error = _mm_xor_si128(Error, Error); + Side2 = _mm_xor_si128(Side2, Side2); + Side1 = _mm_xor_si128(Side1, Side1); + Down2 = _mm_xor_si128(Down2, Down2); + Down1 = _mm_xor_si128(Down1, Down1); + + + Error = _mm_insert_epi16(Error,e,0); + Side2 = _mm_insert_epi16(Side2,2*e,0); + Side1 = _mm_insert_epi16(Side2,2*e,0); + Down1 = _mm_insert_epi16(Down1,2*e,0); + + + for(j=0; j < e; j++) + { + Side2 = _mm_slli_si128(Side2, 2); + Side2 = _mm_insert_epi16(Side2,1,0); + + Side1 = _mm_slli_si128(Side1, 2); + Side1 = _mm_insert_epi16(Side1,1,0); + + Down1 = _mm_slli_si128(Down1, 2); + Down1 = _mm_insert_epi16(Down1,1,0); + + Down2 = _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,1,0); + + Error = _mm_slli_si128(Error, 2); + Error = _mm_insert_epi16(Error, e, 0); + } + + Down2= _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,2*e,0); + + for(; i <= 2*lenb-(e-1);i++) + { + flag = 0; + Diag = _mm_xor_si128(Diag, Diag); + if( i%2 == 0) + { + for(j=e/2;j>=-e/2;j--) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[i/2-1+j] != a[i/2-1-j],0); + } + + + R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag); + R0 = _mm_min_epi16(R0, R1+Down1); + + if(_mm_extract_epi16(R0,0) <= e) + flag = 1; + + tmp = _mm_srli_si128(R0,2); + for(j=0; j < e-1;j++) + { + if(_mm_extract_epi16(tmp,0) <= e) + flag = 1; + tmp = _mm_srli_si128(tmp,2); + } + // printf("#%d %d %d\n", _mm_extract_epi16(R0,0), _mm_extract_epi16(R0,1), _mm_extract_epi16(R0,2)); + if(flag == 0) + return -1; + + if(i == 2*lenb-(e-1)) + { + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = _mm_extract_epi16(tmp,0); + } + + } + + else + { + for(j=e/2;j>=-e/2-1;j--) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[(i+1)/2+j-1] != a[(i)/2-j-1],0); + } + + R1 = _mm_min_epi16(R0+Side2, R1+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); + + //printf("#%d %d %d %d\n", _mm_extract_epi16(R1,0), _mm_extract_epi16(R1,1), _mm_extract_epi16(R1,2), + // _mm_extract_epi16(R1,3)); + + if(i >= 2*lenb-e) + { + tmp = _mm_srli_si128(R1,2); + for(k=0; k < e-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + } + } + + //first cell + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, b[lenb-3] != a[lena], 0); + Diag = _mm_insert_epi16(Diag, b[lenb-2] != a[lena-1], 1); + Diag = _mm_insert_epi16(Diag, b[lenb-1] != a[lena-2], 2); + Diag = _mm_insert_epi16(Diag, 2*e, 3); + R1 = _mm_min_epi16(R0+Side2, R1+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); + + + minError = min(minError, _mm_extract_epi16(R1,2)); + + //second cell + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, b[lenb-2] != a[lena], 0); + Diag = _mm_insert_epi16(Diag, b[lenb-1] != a[lena-1], 1); + Diag = _mm_insert_epi16(Diag, 2*e, 2); + + R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag); + R0 = _mm_min_epi16(R0, R1+Down1); + + + minError = min(minError, _mm_extract_epi16(R0,1)); + + //third cell + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, b[lenb-2] != a[lena+1], 0); + Diag = _mm_insert_epi16(Diag, b[lenb-1] != a[lena], 1); + Diag = _mm_insert_epi16(Diag, 2*e, 2); + + R1 = _mm_min_epi16(R0+Side2, R1+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); + + + minError = min(minError, _mm_extract_epi16(R1,1)); + + //forth + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, b[lenb-1] != a[lena+1], 0); + Diag = _mm_insert_epi16(Diag, 2*e, 1); + + R0 = _mm_min_epi16(_mm_srli_si128(R1,2)+Side1, R0+Diag); + R0 = _mm_min_epi16(R0, R1+Down1); + + minError = min(minError, _mm_extract_epi16(R0,0)); + + //fifth + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, b[lenb-1] != a[lena+2], 0); + Diag = _mm_insert_epi16(Diag, 2*e, 1); + + R1 = _mm_min_epi16(R0+Side2, R1+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down2); + + + minError = min(minError, _mm_extract_epi16(R1,0)); + + if(minError > e) + return -1; + return minError; + +} + +int forwardEditDistanceSSE2G(char *a, int lena, char *b,int lenb) +{ + if(lenb == 0 || lena == 0) + return 0; + + int i = 0; + int j = 0; + int k = 0; + + int e = errThreshold; + + int minError = 2*e; + + char flag = 0; + + if(lenb <= e) + { + return smallEditDistanceF(a,lena,b,lenb); + } + + + __m128i R0, R1; + __m128i Diag; + __m128i Side1, Side2; + __m128i Down1, Down2; + __m128i Error; + __m128i tmp; + + /* initialize */ + R0 = _mm_setzero_si128 (); + R1 = _mm_setzero_si128 (); + Diag = _mm_setzero_si128 (); + Side1 = _mm_setzero_si128 (); + Side2 = _mm_setzero_si128 (); + Down1 = _mm_setzero_si128 (); + Down2 = _mm_setzero_si128 (); + Error = _mm_setzero_si128 (); + tmp = _mm_setzero_si128 (); + /* end initialize */ + + R1 = _mm_xor_si128(R1, R1); + R0 = _mm_xor_si128(R0, R0); + + Diag = _mm_xor_si128(Diag, Diag); + Side1 = _mm_xor_si128(Side1, Side1); + Down1 = _mm_xor_si128(Down1, Down1); + + Diag = _mm_insert_epi16(Diag,2*e,0); + + Side1 = _mm_insert_epi16(Side1,1,0); + Side1 = _mm_insert_epi16(Side1,2*e,1); + + Down1 = _mm_insert_epi16(Down1,2*e,0); + Down1 = _mm_insert_epi16(Down1,1,1); + Down1 = _mm_insert_epi16(Down1,2*e,2); + + R0 = _mm_insert_epi16(R0,0,0); + + R1 = _mm_insert_epi16(R1,1,0); + R1 = _mm_insert_epi16(R1,1,1); + + for(i=2; i <= e; i++) + { + //set side + Side1 = _mm_slli_si128(Side1,2); + Side1 = _mm_insert_epi16(Side1,1,0); + + Down1 = _mm_insert_epi16(Down1,1,0); + Down1 = _mm_slli_si128(Down1,2); + Down1 = _mm_insert_epi16(Down1,2*e,0); + + Diag = _mm_xor_si128(Diag, Diag); + if( i%2 == 0) + { + Diag = _mm_insert_epi16(Diag,2*e,0); + + for(j=1;j<=i-1;j++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[i/2-1+(i/2-j)] != a[i/2-1-(i/2-j)],0); + } + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, 2*e,0); + + R0 = _mm_min_epi16(R1+Side1, _mm_slli_si128(R0,2)+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down1); + } + + else + { + Diag = _mm_insert_epi16(Diag,2*e,0); + for(j=i/2-1;j>=-i/2;j--) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[(i+1)/2+j-1] != a[(i-1)/2-j-1],0); + } + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, 2*e,0); + + R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); + } + } + Error = _mm_xor_si128(Error, Error); + Side2 = _mm_xor_si128(Side2, Side2); + Down2 = _mm_xor_si128(Down2, Down2); + Down1 = _mm_xor_si128(Down1, Down1); + + Error = _mm_insert_epi16(Error,e,0); + Side2 = _mm_insert_epi16(Side2,2*e,0); + Down1 = _mm_insert_epi16(Down1,2*e,0); + + + for(j=0; j < e; j++) + { + Side2 = _mm_slli_si128(Side2, 2); + Side2 = _mm_insert_epi16(Side2,1,0); + + Down1 = _mm_slli_si128(Down1, 2); + Down1 = _mm_insert_epi16(Down1,1,0); + + Down2 = _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,1,0); + + Error = _mm_slli_si128(Error, 2); + Error = _mm_insert_epi16(Error, e, 0); + } + + Down2= _mm_slli_si128(Down2, 2); + Down2 = _mm_insert_epi16(Down2,2*e,0); + + for(; i <= 2*lenb-(e-1);i++) + { + flag = 0; + Diag = _mm_xor_si128(Diag, Diag); + if( i%2 == 0) + { + for(j=e/2;j>=-e/2;j--) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[i/2-1+j] != a[i/2-1-j],0); + } + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + + if(_mm_extract_epi16(R0,0) <= e) + flag = 1; + + tmp = _mm_srli_si128(R0,2); + for(j=0; j < e-1;j++) + { + if(_mm_extract_epi16(tmp,0) <= e) + flag = 1; + tmp = _mm_srli_si128(tmp,2); + } + + + if(flag == 0) + return -1; + + if(i == 2*lenb-e) + { + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = _mm_extract_epi16(tmp,0); + } + + } + + else + { + for(j=-e/2+1;j<=e/2;j++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[(i+1)/2-j-1] != a[(i-1)/2+j-1],0); + } + + R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); + R1 = _mm_min_epi16(R1, R0+Down1); + + if(i >= 2*lenb-e) + { + tmp = _mm_srli_si128(R1,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + } + } + + j=0; + int tmpE = e; + for(;j<2*(e-2)+1;j++) + { + + Diag = _mm_xor_si128(Diag, Diag); + //set the first element + if(j==0) + { + for( k=0;k<=e-1;k++ ) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0); + } + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + tmpE--; + + tmp = _mm_srli_si128(R0,2); + for(k=0; k < e-2;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + else if(j%2 == 0) + { + for(k=0;k<tmpE;k++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0); + } + + R0 = _mm_min_epi16(R1+Side2, R0+Diag); + R0 = _mm_min_epi16(R0, _mm_slli_si128(R1,2)+Down2); + + tmpE--; + + tmp = _mm_srli_si128(R0,2); + for(k=0; k < tmpE-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + + + else + { + for(k=0;k<tmpE;k++) + { + Diag = _mm_slli_si128(Diag, 2); + Diag = _mm_insert_epi16(Diag, b[lenb-1-k] != a[(i-lenb)-1+k],0); + } + + R1 = _mm_min_epi16(_mm_srli_si128(R0,2)+Side1, R1+Diag); + R1 = _mm_min_epi16(R1, R0+Down1); + + tmp = _mm_srli_si128(R1,2); + for(k=0; k < tmpE-1;k++) + tmp = _mm_srli_si128(tmp,2); + minError = min(minError, _mm_extract_epi16(tmp,0)); + } + i++; + } + //Diag + + Diag = _mm_xor_si128(Diag,Diag); + Diag = _mm_insert_epi16(Diag, 2*e, 0); + Diag = _mm_insert_epi16(Diag, a[lenb+e-2] != b[lenb-1], 1); + + Side1 = _mm_insert_epi16(Side1,1,0); + Side1 = _mm_insert_epi16(Side1,1,1); + + Down1 = _mm_insert_epi16(Down1, 2*e, 0); + Down1 = _mm_insert_epi16(Down1, 1, 1); + + R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); + R1 = _mm_min_epi16(R1, _mm_slli_si128(R0,2)+Down1); + + minError = min(minError, _mm_extract_epi16(R1,1)); + + Diag = _mm_insert_epi16(Diag, a[lenb+e-1] != b[lenb-1], 1); + Down1 = _mm_insert_epi16(Down1, 1, 0); + + R0 = _mm_min_epi16(R1+Down1,R0+Diag); + R0 = _mm_min_epi16(R0,_mm_srli_si128(R1,2)+Side1); + + minError = min(minError, _mm_extract_epi16(R0,0)); + + if(minError > e) + return -1; + return minError; +} + + +int forwardEditDistance2SSE2(char *a, int lena, char *b,int lenb) +{ + if(lenb == 0 || lena == 0) + return 0; + + + + int i0 = 0; + int i1 = 0; + + + int error; //0: if the two character are equal 1: if not + + int i = 0; //loop index + + int e = 2; //error bound + + int totalError = 0; + + __m128i R0; + __m128i R1; + + __m128i Side1, Side2,Side; //side matrix + __m128i Down1, Down2,Down; //down matrix + __m128i Diag; + + __m128i tmp; + __m128i ERROR_REACH; + + /* initialize */ + R0 = _mm_setzero_si128 (); + R1 = _mm_setzero_si128 (); + Diag = _mm_setzero_si128 (); + Side1 = _mm_setzero_si128 (); + Side2 = _mm_setzero_si128 (); + Down1 = _mm_setzero_si128 (); + Down2 = _mm_setzero_si128 (); + Side = _mm_setzero_si128 (); + Down = _mm_setzero_si128 (); + tmp = _mm_setzero_si128 (); + ERROR_REACH = _mm_setzero_si128 (); + /* end initialize */ + + + if(lenb <= e) + { + return smallEditDistanceF(a,lena,b,lenb); + } + + ERROR_REACH = _mm_set_epi16(0,0,0,0,0,e,e,e); + + R0 = _mm_insert_epi16(R0,0,0); + + R1 = _mm_insert_epi16(R1,1,0); + R1 = _mm_insert_epi16(R1,1,1); + +// error = ((a[0]) != (b[0])); + + Diag = _mm_set_epi16(0,0,0,0,0,2*e,((a[0]) != (b[0])),2*e); + Side1 = _mm_set_epi16(0,0,0,0,0,2*e,1,1); + Side2 = _mm_set_epi16(0,0,0,0,0,1,1,2*e); + Down1 = _mm_set_epi16(0,0,0,0,0,2*e,1,1); + Down2 = _mm_set_epi16(0,0,0,0,0,1,1,2*e); + + tmp = _mm_slli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,tmp+Down2); + + for (i = 3; i < 2*lena; i++) + { + if(i % 2 ==1) + { + + Diag = _mm_xor_si128(Diag, Diag); + error = ((a[(i+1)/2-1]) != (b[(i-1)/2-1])); + Diag = _mm_insert_epi16(Diag,error,0); + error = ((a[(i-1)/2-1]) != (b[(i+1)/2-1])); + Diag = _mm_insert_epi16(Diag,error,1); +// Diag = _mm_set_epi16(0, 0, 0, 0, 0, 0, ((a[(i-1)/2-1]) != (b[(i+1)/2-1])) ,((a[(i+1)/2-1]) != (b[(i-1)/2-1]))); + + + tmp = _mm_srli_si128(R0,2); + + R1 = _mm_min_epi16(tmp+Side1, R1+Diag); + R1 = _mm_min_epi16(R1,R0+Down1); + + if(i > 2 * lenb - 2) + { + i1 = _mm_extract_epi16(R1, 1); + totalError = min(totalError, i1); + } + } + + else if(i % 2 == 0) + { + error = ((a[i/2]) != (b[i/2-2])); + Diag = _mm_insert_epi16(Diag,error,0); + error = ((a[i/2-1]) != (b[i/2-1])); + Diag = _mm_insert_epi16(Diag,error,1); + error = ((a[i/2-2]) != (b[i/2])); + Diag = _mm_insert_epi16(Diag,error,2); + + // Diag = _mm_set_epi16(0, 0, 0, 0, 0, ((a[i/2-2]) != (b[i/2])) , ((a[i/2-1]) != (b[i/2-1])) , ((a[i/2]) != (b[i/2-2])) ); + + tmp = _mm_slli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,tmp+Down2); + + tmp = _mm_sub_epi16(ERROR_REACH, R0); + i0 = _mm_movemask_epi8(tmp); + + if(i0 == 63 && _mm_extract_epi16(R1,0) > errThreshold && _mm_extract_epi16(R1,1) > errThreshold && i < 2 * lenb - 2) + return -1; + if(i == 2 * lenb - 2) { + totalError = _mm_extract_epi16(R0, 2); + } + } + } + + Down1 = _mm_insert_epi16(Down1,2*e,0); + + //fill the first part of the error + error = ((a[i/2]) != (b[i/2-2])); + Diag = _mm_insert_epi16(Diag,error,0); + error = ((a[i/2-1]) != (b[i/2-1])); + Diag = _mm_insert_epi16(Diag,error,1); + Diag = _mm_insert_epi16(Diag,2*e,2); +// Diag = _mm_set_epi16(0, 0, 0, 0, 0, 2*e , ((a[i/2-1]) != (b[i/2-1])) , ((a[i/2]) != (b[i/2-2])) ); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,_mm_slli_si128(R1,2)+Down1); + +// i0 = _mm_extract_epi16(R0, 0); + i1 = _mm_extract_epi16(R0, 1); + + totalError = min(totalError, i1); + + //fill the second part of the error + i++; + + Diag = _mm_xor_si128(Diag, Diag); + Diag = _mm_insert_epi16(Diag,2*e,0); + error = ((a[i/2]) != (b[lenb-1])); + Diag = _mm_insert_epi16(Diag,error,1); + Diag = _mm_insert_epi16(Diag,2*e,2); +// Diag = _mm_set_epi16(0, 0, 0, 0, 0, 2*e , ((a[i/2]) != (b[lenb-1])) , 2*e ); + + + R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); + R1 = _mm_min_epi16(R1,_mm_slli_si128(R0,2)+Down1); + +// i0 = _mm_extract_epi16(R1, 0); + i1 = _mm_extract_epi16(R1, 1); + + totalError = min(totalError, i1); + //fill the last the last element of the matrix + i++; + + Diag = _mm_xor_si128(Diag, Diag); + error = ((a[i/2]) != (b[lenb-1])); + Diag = _mm_insert_epi16(Diag,error,0); + +// Diag = _mm_set_epi16(0, 0, 0, 0, 0, 0 , 0 , ((a[i/2]) != (b[lenb-1])) ); + + + Down = _mm_insert_epi16(Down,1,0); + + Side = _mm_insert_epi16(Side,1,0); + + tmp = _mm_srli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Down, _mm_srli_si128(R0,2)+Diag); + R0 = _mm_min_epi16(R0,tmp+Side); + + i0 = _mm_extract_epi16(R0, 0); + + totalError = min(totalError, i0); + + if(totalError > e) + return -1; + + return totalError; + +} + +int backwardEditDistance2SSE2(char *a, int lena, char *b,int lenb) +{ + if(lenb == 0 || lena == 0) + return 0; + + int i0 = 0; + int i1 = 0; + + int error; //0: if the two character are equal 1: if not + + int i = 0; //loop index + + int e = 2; //error bound + + int totalError = 0; + + __m128i R0; + __m128i R1; + + __m128i Side1, Side2,Side; //side matrix + __m128i Down1, Down2,Down; //down matrix + __m128i Diag; //diag matrix + + __m128i tmp; + __m128i ERROR_REACH; + + /* initialize */ + R0 = _mm_setzero_si128 (); + R1 = _mm_setzero_si128 (); + Diag = _mm_setzero_si128 (); + Side1 = _mm_setzero_si128 (); + Side2 = _mm_setzero_si128 (); + Side = _mm_setzero_si128 (); + Down1 = _mm_setzero_si128 (); + Down2 = _mm_setzero_si128 (); + Down = _mm_setzero_si128 (); + ERROR_REACH = _mm_setzero_si128 (); + tmp = _mm_setzero_si128 (); + /* end initialize */ + + if(lenb <= e) + { + return smallEditDistanceB(a,lena,b,lenb); + } + + + ERROR_REACH = _mm_set_epi16(0,0,0,0,0,e,e,e); + + R0 = _mm_insert_epi16(R0,0,0); + + R1 = _mm_insert_epi16(R1,1,0); + R1 = _mm_insert_epi16(R1,1,1); + + error = ((a[0]) != (b[0])); + + Diag = _mm_insert_epi16(Diag,2*e,0); + Diag = _mm_insert_epi16(Diag,error,1); + Diag = _mm_insert_epi16(Diag,2*e,2); + + Side1 = _mm_insert_epi16(Side1,1,0); + Side1 = _mm_insert_epi16(Side1,1,1); + Side1 = _mm_insert_epi16(Side1,2*e,2); + + Side2 = _mm_insert_epi16(Side2,2*e,0); + Side2 = _mm_insert_epi16(Side2,1,1); + Side2 = _mm_insert_epi16(Side2,1,2); + + Down1 = _mm_insert_epi16(Down1,1,0); + Down1 = _mm_insert_epi16(Down1,1,1); + Down1 = _mm_insert_epi16(Down1,2*e,2); + + Down2 = _mm_insert_epi16(Down2,2*e,0); + Down2 = _mm_insert_epi16(Down2,1,1); + Down2 = _mm_insert_epi16(Down2,1,2); + + tmp = _mm_slli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,tmp+Down2); + +// printf("%d %d %d\n", _mm_extract_epi16(R0,0), _mm_extract_epi16(R0,1), _mm_extract_epi16(R0,2)); + for (i = 3; i < 2*lena; i++) + { + if(i % 2 ==1) + { + Diag = _mm_sub_epi8(Diag, Diag); + error = ( *(a-((i+1)/2-1)) != *(b-((i-1)/2-1)) ); + Diag = _mm_insert_epi16(Diag,error,0); + error = ( *(a-((i-1)/2-1)) != *(b-((i+1)/2-1)) ); + Diag = _mm_insert_epi16(Diag,error,1); + //printf("#%d #%d\n", _mm_extract_epi16(Diag,0), _mm_extract_epi16(Diag,1)); + tmp = _mm_srli_si128(R0,2); + + R1 = _mm_min_epi16(tmp+Side1, R1+Diag); + R1 = _mm_min_epi16(R1,R0+Down1); + + if(i > 2 * lenb - 2) { + i1 = _mm_extract_epi16(R1, 1); + totalError = min(totalError, i1); + } +// printf("%d %d\n", _mm_extract_epi16(R1,0), _mm_extract_epi16(R1,1)); + } + + else if(i % 2 == 0) + { + error = ( *(a-(i/2)) != *(b-(i/2-2)) ); + Diag = _mm_insert_epi16(Diag,error,0); + error = ( *(a-(i/2-1)) != *(b-(i/2-1)) ); + Diag = _mm_insert_epi16(Diag,error,1); + error = ( *(a-(i/2-2)) != *(b-(i/2))); + Diag = _mm_insert_epi16(Diag,error,2); + + tmp = _mm_slli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,tmp+Down2); + + tmp = _mm_sub_epi16(ERROR_REACH, R0); + i0 = _mm_movemask_epi8(tmp); + + if(i0 == 63 && _mm_extract_epi16(R1,0) > errThreshold && _mm_extract_epi16(R1,1) > errThreshold && i < 2 * lenb - 2) + return -1; + + if(i == 2 * lenb - 2) { + totalError = _mm_extract_epi16(R0, 2); + } + } + } + Down1 = _mm_insert_epi16(Down1,2*e,0); + + //fill the first part of the error + error = ( *(a-(i/2)) != *(b-(i/2-2)) ); + Diag = _mm_insert_epi16(Diag,error,0); + error = ( *(a-(i/2-1)) != *(b-(i/2-1)) ); + Diag = _mm_insert_epi16(Diag,error,1); + Diag = _mm_insert_epi16(Diag,2*e,2); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,_mm_slli_si128(R1,2)+Down1); + + i0 = _mm_extract_epi16(R0, 0); + i1 = _mm_extract_epi16(R0, 1); + + totalError = min(totalError, i1); + + //fill the second part of the error + i++; + Diag = _mm_sub_epi8(Diag, Diag); + Diag = _mm_insert_epi16(Diag,2*e,0); + error = ( *(a-(i/2)) != *(b-(lenb-1)) ); + Diag = _mm_insert_epi16(Diag,error,1); + Diag = _mm_insert_epi16(Diag,2*e,2); + + R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); + R1 = _mm_min_epi16(R1,_mm_slli_si128(R0,2)+Down1); + + i0 = _mm_extract_epi16(R1, 0); + i1 = _mm_extract_epi16(R1, 1); + + totalError = min(totalError, i1); + + //fill the last the last element of the matrix + i++; + Diag = _mm_sub_epi8(Diag, Diag); + error = ( *(a-(i/2)) != *(b-(lenb-1)) ); + Diag = _mm_insert_epi16(Diag,error,0); + + Down = _mm_insert_epi16(Down,1,0); + + Side = _mm_insert_epi16(Side,1,0); + + tmp = _mm_srli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Down, _mm_srli_si128(R0,2)+Diag); + R0 = _mm_min_epi16(R0,tmp+Side); + + i0 = _mm_extract_epi16(R0, 0); + + totalError = min(totalError, i0); + + if(totalError > e || totalError == 0) + return -1; + return totalError; +} + +void initBestMapping(int totalReadNumber) +{ + int i = 0; + bestHitMappingInfo = getMem(totalReadNumber * sizeof(BestFullMappingInfo)); + for(i = 0; i < totalReadNumber; i++) { + bestHitMappingInfo[i].loc = -1; + } +} + + +void finalizeBestSingleMapping() +{ + int i = 0; + char *_tmpQual, *_tmpSeq; + char rqual[SEQ_LENGTH + 1]; + rqual[SEQ_LENGTH]='\0'; + + for(i = 0; i < _msf_seqListSize; i++) + { + if(_msf_seqList[i].hits[0] != 0) + { + if (bestHitMappingInfo[i].dir) + { + reverse(_msf_seqList[i].qual, rqual, SEQ_LENGTH); + _tmpQual = rqual; + _tmpSeq = _msf_seqList[i].rseq; + } + else + { + _tmpQual = _msf_seqList[i].qual; + _tmpSeq = _msf_seqList[i].seq; + } + + + _msf_output.QNAME = _msf_seqList[i].name; + _msf_output.FLAG = 16 * bestHitMappingInfo[i].dir; + _msf_output.RNAME = bestHitMappingInfo[i].chr; + + _msf_output.POS = bestHitMappingInfo[i].loc; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = bestHitMappingInfo[i].cigar ; + _msf_output.MRNAME = "*"; + _msf_output.MPOS = 0; + _msf_output.ISIZE = 0; + + + _msf_output.SEQ = _tmpSeq; + _msf_output.QUAL = _tmpQual; + + _msf_output.optSize = 2; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = bestHitMappingInfo[i].err; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = bestHitMappingInfo[i].md; + + output(_msf_output); + } + } + freeMem(bestHitMappingInfo, _msf_seqListSize * sizeof(FullMappingInfo)); +} +/**********************************************/ +int compare (const void *a, const void *b) +{ + return ((Pair *)a)->hv - ((Pair *)b)->hv; + /*char *s1 = ((Pair *)a)->hv; + char *s2 = ((Pair *)b)->hv; + int i = 0; + + int diff = 0; + int sign = 0; + + for(i = 0; i < SEQ_LENGTH; i++) + { + diff += (s1[i] != s2[i]); + if(s1[i] > s2[i]) + sign++; + else if(s1[i] < s2[i]) + sign--; + } + + return diff*sign;*/ +// return strncmp(s1, s2,SEQ_LENGTH); + +} +/**********************************************/ +void preProcessReads() +{ + int i = 0; + + _msf_sort_seqList = getMem(_msf_seqListSize * sizeof(Pair)); + for(i = 0; i < _msf_seqListSize; i++) + { + _msf_sort_seqList[i].hv = hashVal(_msf_seqList[i].seq); + + _msf_sort_seqList[i].readNumber = i; + } + + qsort(_msf_sort_seqList, _msf_seqListSize, sizeof(Pair), compare); + + /* + for(i = 0; i < _msf_seqListSize; i++) + { + //printf("%s\n", _msf_sort_seqList[i].hv); + } + */ + + _msf_map_sort_seqList = getMem(_msf_seqListSize * sizeof(int)); + + for(i = 0; i < _msf_seqListSize; i++) + _msf_map_sort_seqList[_msf_seqList[i].readNumber] = i; + +} +/**********************************************/ + +int verifySingleEnd(int index, char* seq, int offset) +{ + int curOff = 0; + int i; + + char *ref; + + int err; + int errCnt =0; + int errCntOff = 0; + int NCntOff = 0; + + ref = _msf_refGen + index - 1; + + verificationCnt++; + + for (i = 0; i < SEQ_LENGTH; i++) + { + err = *ref != *seq; + errCnt += err; + if (errCnt > errThreshold) + { + + return -1; + } + + if (i >= _msf_samplingLocs[curOff] && i <= _msf_samplingLocsEnds[curOff]) + { + errCntOff += err; + NCntOff += (*seq == 'N'); + } + else if (curOff < _msf_samplingLocsSize && i>=_msf_samplingLocs[curOff+1]) + { + + if (errCntOff == 0 && NCntOff == 0 && offset > curOff) + { + return -1; + } + + errCntOff = 0; + NCntOff = 0; + curOff++; + + if ( i >= _msf_samplingLocs[curOff]) + { + errCntOff += err; + NCntOff += (*seq == 'N'); + } + } + + ref++; + seq++; + } + return errCnt; +} + +/*********************************************/ +void initFAST(Read *seqList, int seqListSize, int *samplingLocs, int samplingLocsSize, char *genFileName) +{ + int i; + + if (_msf_optionalFields == NULL) + { + _msf_op = getMem(SEQ_LENGTH); + if (pairedEndMode) + { + _msf_optionalFields = getMem(8*sizeof(OPT_FIELDS)); + } + else + { + _msf_optionalFields = getMem(2*sizeof(OPT_FIELDS)); + } + + for (i=0; i<200;i++) + { + sprintf(_msf_numbers[i],"%d%c",i, '\0'); + } + sprintf(_msf_cigar, "%dM", SEQ_LENGTH); + } + + if (_msf_samplingLocsEnds == NULL) + { + _msf_samplingLocs = samplingLocs; + _msf_samplingLocsSize = samplingLocsSize; + + _msf_samplingLocsEnds = getMem(sizeof(int)*_msf_samplingLocsSize); + for (i=0; i<_msf_samplingLocsSize; i++) + { + _msf_samplingLocsEnds[i]=_msf_samplingLocs[i]+WINDOW_SIZE-1; + } + + _msf_seqList = seqList; + _msf_seqListSize = seqListSize; + + preProcessReads(); + + _msf_oeaMapping = getMem(_msf_seqListSize * sizeof(int)); + for(i = 0; i < _msf_seqListSize; i++) + { + _msf_oeaMapping[i] = 0; + } + + _msf_discordantMapping = getMem(_msf_seqListSize * sizeof(int)); + for(i = 0; i < _msf_seqListSize; i++) + { + _msf_discordantMapping[i] = 0; + } + + } + + if (_msf_refGenName == NULL) + { + _msf_refGenName = getMem(4*SEQ_LENGTH); + } + _msf_refGen = getRefGenome(); + _msf_refGenLength = strlen(_msf_refGen); + + _msf_refGenOffset = getRefGenomeOffset(); + snprintf(_msf_refGenName, 4*SEQ_LENGTH,"%s%c", getRefGenomeName(), '\0'); + _msf_refGenName[strlen(getRefGenomeName())] = '\0'; + + + if (_msf_verifiedLocs != NULL){ + freeMem(_msf_verifiedLocs, sizeof(int) * (_msf_refGenLength+1)); + } + + _msf_verifiedLocs = (int *) getMem(sizeof(int)*(_msf_refGenLength+1)); + + for (i=0; i<=_msf_refGenLength; i++) + _msf_verifiedLocs[i] = _msf_seqListSize*10+1; + + + + if (pairedEndMode && _msf_seqHits == NULL) + { + + _msf_mappingInfo = getMem(seqListSize * sizeof (MappingInfo)); + + for (i=0; i<seqListSize; i++) + { + //_msf_mappingInfo[i].next = getMem(sizeof(MappingLocations)); + _msf_mappingInfo[i].next = NULL; + _msf_mappingInfo[i].size = 0; + } + + _msf_seqHits = getMem((_msf_seqListSize) * sizeof(int)); + + + for (i=0; i<_msf_seqListSize; i++) + { + _msf_seqHits[i] = 0; + } + + _msf_readHasConcordantMapping = getMem(_msf_seqListSize / 2 * sizeof(char)); + for(i = 0; i < _msf_seqListSize/2; i++) + { + _msf_readHasConcordantMapping[i] = 0; + } + + initLoadingRefGenome(genFileName); + } + + if (_msf_refGenOffset == 0) + { + _msf_refGenBeg = 1; + } + else + { + _msf_refGenBeg = CONTIG_OVERLAP - SEQ_LENGTH + 2; + } + _msf_refGenEnd = _msf_refGenLength - SEQ_LENGTH + 1; + + +} +/**********************************************/ +void finalizeFAST() +{ + freeMem(_msf_seqHits, (_msf_seqListSize) * sizeof(int)); + freeMem(_msf_refGenName, 4*SEQ_LENGTH); + + + /* + int i; + for (i=0; i<_msf_rIndexSize; i++) + { + freeMem(_msf_rIndex[i].seqInfo, _msf_rIndex[i].seqInfo[0]+1); + } + freeMem(_msf_rIndex, _msf_rIndexSize);*/ + + + freeMem(_msf_map_sort_seqList, sizeof(Pair) * _msf_seqListSize); + freeMem(_msf_sort_seqList, sizeof(int) * _msf_seqListSize); + +} + +/* + Will apply the Levenshtein Dynamic programming. + Different from verifySingleEndEditDistance fucntion + as in this fucntion only one dynamic table is made while + in verifySingleEndEditDistance two dynamic table is made + for each right and left string +*/ +int editDistance(int refIndex, char *seq, int seqLength, char *matrix) +{ + int i = 0; + int size = 0; + int error = 0; + int rIndex = 0; + int directionIndex = 0; + + int min = 0; + int minIndex =0; + + int tempUp = 0; + int tempDown = 0; + + char *ref; + + int errorString = 0; + /* + 1: Up + 2: Side + 3: Diagnoal Match + 4: Diagnoal Mismatch + */ + + int upValue; + int diagValue; + int sideValue; + + ref = _msf_refGen + refIndex - 1; + + rIndex = 1; + + for(i=0; i <= errThreshold; i++) + { + score[0][i] = i; + score[i][0] = i; + } + + while(rIndex <= seqLength +errThreshold) + { + tempUp = ((rIndex - errThreshold) > 0 ? ((rIndex > seqLength) ? seqLength - errThreshold :rIndex - errThreshold) : 1 ); + tempDown = ((rIndex >= seqLength-errThreshold ) ? seqLength+1 :rIndex + errThreshold + 1); + for(i = tempUp ; i < tempDown ; i++) + { + errorString = (*(ref+rIndex-1) == *(seq+i-1)); + + upValue = score[i-1][rIndex]+1; + diagValue = score[i-1][rIndex-1]+ !errorString; + sideValue = score[i][rIndex-1]+1; + + if(i != tempUp && i != tempDown-1) + score[i][rIndex] = min3(sideValue, diagValue , upValue); + + else if( (i == ((rIndex - errThreshold) > 0 ? rIndex - errThreshold : 1)) && rIndex <= seqLength ) + score[i][rIndex] = min(sideValue, diagValue); + else if(rIndex > seqLength && (i == seqLength - errThreshold) ) + score[i][rIndex] = sideValue; + else + score[i][rIndex] = min(diagValue , upValue); + + if(i == tempUp) + error = score[i][rIndex]; + else if(error > score[i][rIndex]) + error = score[i][rIndex]; + } + rIndex++; + } + + min = score[seqLength][seqLength+errThreshold]; + minIndex = seqLength + errThreshold; + + // Find the Best error for all the possible ways. + for(i = 1; i <= 2*errThreshold; i++) + { + if(min >= score[seqLength][seqLength+errThreshold-i] && seqLength+errThreshold-i > 0) + { + min = score[seqLength][seqLength+errThreshold-i]; + minIndex = seqLength+errThreshold-i; + } + } + + error = score[seqLength][minIndex]; + + directionIndex = seqLength; + rIndex = minIndex; + while(directionIndex != 0 || rIndex != 0) + { + + if(rIndex == 0) + { + if(score[directionIndex][rIndex] - score[directionIndex-1][rIndex] == 1) + { + matrix[size] = *(seq+directionIndex-1); + size++; + matrix[size] = 'I'; + directionIndex--; + } + } + else if(directionIndex == 0) + { + if(score[directionIndex][rIndex] - score[directionIndex][rIndex-1] == 1) + { + matrix[size] = *(ref+rIndex-1); + size++; + matrix[size] = 'D'; + rIndex--; + } + } + else if(directionIndex-rIndex == errThreshold) + { + if(score[directionIndex][rIndex] - score[directionIndex-1][rIndex] == 1) + { + matrix[size] = *(seq+directionIndex-1); + size++; + matrix[size] = 'I'; + directionIndex--; + } + else if( score[directionIndex][rIndex] - score[directionIndex-1][rIndex-1] == 1 ) + { + matrix[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrix[size] = 'M'; + rIndex--; + directionIndex--; + } + + } + else if(rIndex - directionIndex == errThreshold) + { + if(score[directionIndex][rIndex] - score[directionIndex][rIndex-1] == 1) + { + matrix[size] = *(ref+rIndex-1); + size++; + matrix[size] = 'D'; + rIndex--; + } + else if( score[directionIndex][rIndex] - score[directionIndex-1][rIndex-1] == 1 ) + { + matrix[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrix[size] = 'M'; + rIndex--; + directionIndex--; + } + } + else + { + if(score[directionIndex][rIndex] - score[directionIndex-1][rIndex] == 1 && directionIndex != 0) + { + matrix[size] = *(seq+directionIndex-1); + size++; + matrix[size] = 'I'; + directionIndex--; + } + else if(score[directionIndex][rIndex] - score[directionIndex][rIndex-1] == 1 && rIndex != 0) + { + matrix[size] = *(ref+rIndex-1); + size++; + matrix[size] = 'D'; + rIndex--; + } + else if( score[directionIndex][rIndex] - score[directionIndex-1][rIndex-1] == 1 ) + { + matrix[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrix[size] = 'M'; + rIndex--; + directionIndex--; + } + } + size++; + } + + matrix[size] = '\0'; + + char returnString[200]; + + returnString[0] = '\0'; + reverse(matrix, returnString, size); + sprintf(matrix, "%s", returnString); + + return error; +} + +/* + Will apply the Levenshtein Dynamic programming. + in both right and left direction as long as the + threshould error is reached or end of string length + +*/ +int msfHashVal(char *seq) +{ + int i=0; + int val=0, numericVal=0; + + while(i<6) + { + switch (seq[i]) + { + case 'A': + numericVal = 0; + break; + case 'C': + numericVal = 1; + break; + case 'G' : + numericVal = 2; + break; + case 'T': + numericVal = 3; + break; + default: + return -1; + break; + } + val = (val << 2)|numericVal; + i++; + } + return val; +} + + + +int verifySingleEndEditDistance2(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, char *matrix, int *map_location, short *seqHashValue) +{ + int i = 0; + + char * ref; + char * tempref; + + int rIndex = 0; //reference Index + + int e = errThreshold; + int error = 0; + int error1 = 0; + int totalError = 0; + + + /* + 1: Up + 2: Side + 3: Diagnoal Match + 4: Diagnoal Mismatch + */ + + + int minIndex1 = 0; + int minIndex2 = 0; + + + int directionIndex = 0; + + int size = 0; + + int startIndex1 = 0; + + rIndex = 1; + + + char matrixR[200]; + char matrixL[200]; + + ref = _msf_refGen + refIndex - 1; + tempref = _msf_refGen + refIndex - 1; + + int jumpIndex = 0; + + if(rSeqLength != 0) + { + error1 = forwardEditDistance2SSE2(ref+segLength+jumpIndex, rSeqLength-jumpIndex, rSeq+jumpIndex, rSeqLength-jumpIndex); + if(error1 == -1) + return -1; + } + + + if(lSeqLength != 0) + { + error = backwardEditDistance2SSE2(ref-1, lSeqLength, lSeq+lSeqLength-1, lSeqLength); + if(error == -1) + { + return -1; + } + } + + matrixL[0] = '\0'; + matrixR[0] = '\0'; + + + ref = _msf_refGen + refIndex - 1; + + if(error1+error > errThreshold) + return -1; + + ref = _msf_refGen + refIndex - 1; + + rIndex = startIndex1+1; + + int i0 = 0; + int i1 = 0; + int i2 = 0; + + __m128i R0; + __m128i R1; + + __m128i Side1, Side2,Side; //side matrix + __m128i Down1, Down2,Down; //down matrix + __m128i Diag; // + + __m128i tmp; + + /* initialize */ + R0 = _mm_setzero_si128 (); + R1 = _mm_setzero_si128 (); + Diag = _mm_setzero_si128 (); + Side1 = _mm_setzero_si128 (); + Side2 = _mm_setzero_si128 (); + Down1 = _mm_setzero_si128 (); + Down2 = _mm_setzero_si128 (); + Down = _mm_setzero_si128 (); + Side = _mm_setzero_si128 (); + tmp = _mm_setzero_si128 (); + /* end initialize */ + + int mismatch[3] = {0,0,0}; + + if(lSeqLength != 0) + { + char *a; + char *b; + + a = ref-1; + b = lSeq+lSeqLength-1; + + R0 = _mm_insert_epi16(R0,0,0); + + score[0][0] = 0; + + R1 = _mm_insert_epi16(R1,1,0); + R1 = _mm_insert_epi16(R1,1,1); + + score[1][0] = 1; + direction1[1][0] = 1; + score[0][1] = 1; + direction1[0][1] = 2; + + mismatch[0] = ((a[0]) != (b[0])); + + Diag = _mm_insert_epi16(Diag,2*e,0); + Diag = _mm_insert_epi16(Diag,mismatch[0],1); + Diag = _mm_insert_epi16(Diag,2*e,2); + + Side1 = _mm_insert_epi16(Side1,1,0); + Side1 = _mm_insert_epi16(Side1,1,1); + Side1 = _mm_insert_epi16(Side1,2*e,2); + + Side2 = _mm_insert_epi16(Side2,2*e,0); + Side2 = _mm_insert_epi16(Side2,1,1); + Side2 = _mm_insert_epi16(Side2,1,2); + + Down1 = _mm_insert_epi16(Down1,1,0); + Down1 = _mm_insert_epi16(Down1,1,1); + Down1 = _mm_insert_epi16(Down1,2*e,2); + + Down2 = _mm_insert_epi16(Down2,2*e,0); + Down2 = _mm_insert_epi16(Down2,1,1); + Down2 = _mm_insert_epi16(Down2,1,2); + + tmp = _mm_slli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,tmp+Down2); + + i0 = _mm_extract_epi16(R0, 0); + i1 = _mm_extract_epi16(R0, 1); + i2 = _mm_extract_epi16(R0, 2); + + score[0][2] = i0; + score[1][1] = i1; + score[2][0] = i2; + + direction1[0][2] = 2; + direction1[1][1] = ((mismatch[0] == 0)? 3 : 4); + direction1[2][0] = 1; + + for (i = 3; i < 2*lSeqLength; i++) + { + if(i % 2 ==1) + { + Diag = _mm_sub_epi8(Diag, Diag); + mismatch[0] = ( *(a-((i+1)/2-1)) != *(b-((i-1)/2-1)) ); + Diag = _mm_insert_epi16(Diag,mismatch[0],0); + mismatch[1] = ( *(a-((i-1)/2-1)) != *(b-((i+1)/2-1)) ); + Diag = _mm_insert_epi16(Diag,mismatch[1],1); + + tmp = _mm_srli_si128(R0,2); + + R1 = _mm_min_epi16(tmp+Side1, R1+Diag); + R1 = _mm_min_epi16(R1,R0+Down1); + + i0 = _mm_extract_epi16(R1, 0); + i1 = _mm_extract_epi16(R1, 1); + + score[i/2][i/2+1] = i0; + score[i/2+1][i/2] = i1; + + direction1[i/2][i/2+1] = (score[i/2][i/2+1]==score[i/2-1][i/2] && mismatch[0] == 0) ? 3 : + (score[i/2][i/2+1]-score[i/2-1][i/2+1]==1)? 1 : + (score[i/2][i/2+1]-score[i/2][i/2]==1) ? 2 : 4; + + direction1[i/2+1][i/2] = (score[i/2+1][i/2]==score[i/2][i/2-1] && mismatch[1] == 0) ? 3 : + (score[i/2+1][i/2]-score[i/2][i/2]==1) ? 1 : + (score[i/2+1][i/2]-score[i/2+1][i/2-1]==1)? 2 : 4; + + if(i > 2 * lSeqLength - 2) + { + error = min(error, i1); + if(error == i1) + minIndex1 = i-lSeqLength; + } + } + + else if(i % 2 == 0) + { + mismatch[0] = ( *(a-(i/2)) != *(b-(i/2-2)) ); + Diag = _mm_insert_epi16(Diag,mismatch[0],0); + mismatch[1] = ( *(a-(i/2-1)) != *(b-(i/2-1)) ); + Diag = _mm_insert_epi16(Diag,mismatch[1],1); + mismatch[2] = ( *(a-(i/2-2)) != *(b-(i/2)) ); + Diag = _mm_insert_epi16(Diag,mismatch[2],2); + + tmp = _mm_slli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,tmp+Down2); + + i0 = _mm_extract_epi16(R0, 0); + i1 = _mm_extract_epi16(R0, 1); + i2 = _mm_extract_epi16(R0, 2); + + score[i/2-1][i/2+1] = i0; + score[i/2][i/2] = i1; + score[i/2+1][i/2-1] = i2; + + direction1[i/2-1][i/2+1] = (score[i/2-1][i/2+1]==score[i/2-2][i/2] && mismatch[0] == 0) ? 3 : (score[i/2-1][i/2+1]-score[i/2-1][i/2]==1) ? 2 : 4; + + direction1[i/2][i/2] = (score[i/2][i/2]==score[i/2-1][i/2-1] && mismatch[1] == 0) ? 3 : + (score[i/2][i/2]-score[i/2-1][i/2]==1) ? 1 : + (score[i/2][i/2]-score[i/2][i/2-1]==1) ? 2 : 4; + + direction1[i/2+1][i/2-1] = (score[i/2+1][i/2-1]==score[i/2][i/2-2] && mismatch[2] == 0) ? 3 : + (score[i/2+1][i/2-1]-score[i/2][i/2-1]==1) ? 1 : 4; + + if( (i/2) % segLength == 0 && i1 == 0) // the segment has been processed no need to process it again + { + return -1; + } + + if(i == 2 * lSeqLength - 2) + { + error = i2; + minIndex1 = i-lSeqLength; + } + } + } + + Down1 = _mm_insert_epi16(Down1,2*e,0); + + //fill the first part of the error + mismatch[0] = ( *(a-(i/2)) != *(b-(i/2-2)) ); + Diag = _mm_insert_epi16(Diag,mismatch[0],0); + mismatch[1] = ( *(a-(i/2-1)) !=*(b-(i/2-1)) ); + Diag = _mm_insert_epi16(Diag,mismatch[1],1); + Diag = _mm_insert_epi16(Diag,2*e,2); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,_mm_slli_si128(R1,2)+Down1); + + i0 = _mm_extract_epi16(R0, 0); + i1 = _mm_extract_epi16(R0, 1); + + error = min(error, i1); + if(error == i1) + minIndex1 = i-lSeqLength; + + score[i/2-1][i/2+1] = i0; + score[i/2][i/2] = i1; + + direction1[i/2-1][i/2+1] = (score[i/2-1][i/2+1]==score[i/2-2][i/2] && mismatch[0] == 0) ? 3 : + (score[i/2-1][i/2+1]-score[i/2-1][i/2]) ? 2 : 4; + + direction1[i/2][i/2] = (score[i/2][i/2]==score[i/2-1][i/2-1] && mismatch[1] == 0) ? 3 : + (score[i/2][i/2]-score[i/2-1][i/2]==1) ? 1 : + (score[i/2][i/2]-score[i/2][i/2-1]==1)? 2 : 4; + + //fill the second part of the error + i++; + Diag = _mm_sub_epi8(Diag, Diag); + Diag = _mm_insert_epi16(Diag,2*e,0); + mismatch[0] = ( *(a-(i/2)) != *(b-(lSeqLength-1)) ); + Diag = _mm_insert_epi16(Diag,mismatch[0],1); + Diag = _mm_insert_epi16(Diag,2*e,2); + + R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); + R1 = _mm_min_epi16(R1,_mm_slli_si128(R0,2)+Down1); + + i0 = _mm_extract_epi16(R1, 0); + i1 = _mm_extract_epi16(R1, 1); + + error = min(error, i1); + if(error == i1) + minIndex1 = i-lSeqLength; + + score[i/2-1][i/2+2] = i0; + score[i/2][i/2+1] = i1; + + direction1[i/2-1][i/2+2] = (score[i/2-1][i/2+2]==score[i/2-2][i/2+1] && mismatch[0] == 0) ? 3 : + (score[i/2-1][i/2+2]-score[i/2-1][i/2+1]==1) ? 2 : 4; + + direction1[i/2][i/2+1] = (score[i/2][i/2+1]==score[i/2-1][i/2]) ? 3 : + (score[i/2][i/2+1]-score[i/2-1][i/2+1]==1)? 1 : + (score[i/2][i/2+1]-score[i/2][i/2]==1)? 2 : 4; + + //fill the last the last element of the matrix + i++; + Diag = _mm_sub_epi8(Diag, Diag); + mismatch[0] = ( *(a-(i/2)) != *(b-(lSeqLength-1)) ); + Diag = _mm_insert_epi16(Diag,mismatch[0],0); + + Down = _mm_insert_epi16(Down,1,0); + + Side = _mm_insert_epi16(Side,1,0); + + tmp = _mm_srli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Down, R0+Diag); + R0 = _mm_min_epi16(R0,tmp+Side); + + i0 = _mm_extract_epi16(R0, 0); + + error = min(error, i0); + if(error == 0) + return -1; + if(error == i0) + minIndex1 = i-lSeqLength; + if(mismatch[0] == 0) + direction1[lSeqLength][lSeqLength+errThreshold] = 3; + else + { + if(score[lSeqLength][lSeqLength+errThreshold] - score[lSeqLength][lSeqLength+errThreshold-1] == 1) + direction1[lSeqLength][lSeqLength+errThreshold] = 2; + else if(score[lSeqLength][lSeqLength+errThreshold] - score[lSeqLength-1][lSeqLength+errThreshold] == 1) + direction1[lSeqLength][lSeqLength+errThreshold] = 1; + else + direction1[lSeqLength][lSeqLength+errThreshold] = 4; + } + } + error1 = error; + error = 0; + + directionIndex = lSeqLength; + rIndex = minIndex1; + + + *map_location = ((lSeqLength == 0) ? refIndex : refIndex - rIndex) ; + + ref = ref + segLength; + + if(rSeqLength <= e) + { + char *a; + char *b; + + int tmp_index = 0; + + a = ref; + b = rSeq; + + for(tmp_index = 0; tmp_index < rSeqLength; tmp_index++) + { + matrixR[tmp_index] = (a[tmp_index]==b[tmp_index]) ? 'M' : a[tmp_index] ; + } + matrixR[tmp_index] = '\0'; + } + else if(rSeqLength != 0 && rSeqLength >= e) + { + char *a; + char *b; + + a = ref; + b = rSeq; + + R0 = _mm_sub_epi8(R0, R0); + R1 = _mm_sub_epi8(R1, R1); + + R0 = _mm_insert_epi16(R0,0,0); + + score[0][0] = 0; + + R1 = _mm_insert_epi16(R1,1,0); + R1 = _mm_insert_epi16(R1,1,1); + + score[1][0] = 1; + direction2[1][0] = 1; + score[0][1] = 1; + direction2[0][1] = 2; + + mismatch[0] = ((a[0]) != (b[0])); + + Diag = _mm_insert_epi16(Diag,2*e,0); + Diag = _mm_insert_epi16(Diag,mismatch[0],1); + Diag = _mm_insert_epi16(Diag,2*e,2); + + Side1 = _mm_insert_epi16(Side1,1,0); + Side1 = _mm_insert_epi16(Side1,1,1); + Side1 = _mm_insert_epi16(Side1,2*e,2); + + Side2 = _mm_insert_epi16(Side2,2*e,0); + Side2 = _mm_insert_epi16(Side2,1,1); + Side2 = _mm_insert_epi16(Side2,1,2); + + Down1 = _mm_insert_epi16(Down1,1,0); + Down1 = _mm_insert_epi16(Down1,1,1); + Down1 = _mm_insert_epi16(Down1,2*e,2); + + Down2 = _mm_insert_epi16(Down2,2*e,0); + Down2 = _mm_insert_epi16(Down2,1,1); + Down2 = _mm_insert_epi16(Down2,1,2); + + tmp = _mm_slli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,tmp+Down2); + + i0 = _mm_extract_epi16(R0, 0); + i1 = _mm_extract_epi16(R0, 1); + i2 = _mm_extract_epi16(R0, 2); + + score[0][2] = i0; + score[1][1] = i1; + score[2][0] = i2; + + direction2[0][2] = 2; + direction2[1][1] = ((mismatch[0] == 0)? 3 : 4); + direction2[2][0] = 1; + + + for (i = 3; i < 2*rSeqLength; i++) + { + if(i % 2 ==1) + { + Diag = _mm_sub_epi8(Diag, Diag); + mismatch[0] = ((a[(i+1)/2-1]) != (b[(i-1)/2-1])); + Diag = _mm_insert_epi16(Diag,mismatch[0],0); + mismatch[1] = ((a[(i-1)/2-1]) != (b[(i+1)/2-1])); + Diag = _mm_insert_epi16(Diag,mismatch[1],1); + + tmp = _mm_srli_si128(R0,2); + + R1 = _mm_min_epi16(tmp+Side1, R1+Diag); + R1 = _mm_min_epi16(R1,R0+Down1); + + i0 = _mm_extract_epi16(R1, 0); + i1 = _mm_extract_epi16(R1, 1); + + score[i/2][i/2+1] = i0; + score[i/2+1][i/2] = i1; + + direction2[i/2][i/2+1] = (score[i/2][i/2+1]==score[i/2-1][i/2] && mismatch[0] == 0) ? 3 : + (score[i/2][i/2+1]-score[i/2-1][i/2+1]==1)? 1 : + (score[i/2][i/2+1]-score[i/2][i/2]==1) ? 2 : 4; + + direction2[i/2+1][i/2] = (score[i/2+1][i/2]==score[i/2][i/2-1] && mismatch[1] == 0) ? 3 : + (score[i/2+1][i/2]-score[i/2][i/2]==1) ? 1 : + (score[i/2+1][i/2]-score[i/2+1][i/2-1]==1)? 2 : 4; + + + if(i > 2 * rSeqLength - 2) + { + error = min(error, i1); + if(error == i1) + minIndex2 = i-rSeqLength; + } + } + + else if(i % 2 == 0) + { + mismatch[0] = ((a[i/2]) != (b[i/2-2])); + Diag = _mm_insert_epi16(Diag,mismatch[0],0); + mismatch[1] = ((a[i/2-1]) != (b[i/2-1])); + Diag = _mm_insert_epi16(Diag,mismatch[1],1); + mismatch[2] = ((a[i/2-2]) != (b[i/2])); + Diag = _mm_insert_epi16(Diag,mismatch[2],2); + + tmp = _mm_slli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,tmp+Down2); + + i0 = _mm_extract_epi16(R0, 0); + i1 = _mm_extract_epi16(R0, 1); + i2 = _mm_extract_epi16(R0, 2); + + score[i/2-1][i/2+1] = i0; + score[i/2][i/2] = i1; + score[i/2+1][i/2-1] = i2; + + direction2[i/2-1][i/2+1] = (score[i/2-1][i/2+1]==score[i/2-2][i/2] && mismatch[0] == 0) ? 3 : + (score[i/2-1][i/2+1]-score[i/2-1][i/2]==1) ? 2 : 4; + + direction2[i/2][i/2] = (score[i/2][i/2]==score[i/2-1][i/2-1] && mismatch[1] == 0) ? 3 : + (score[i/2][i/2]-score[i/2-1][i/2]==1) ? 1 : + (score[i/2][i/2]-score[i/2][i/2-1]==1) ? 2 : 4; + + direction2[i/2+1][i/2-1] = (score[i/2+1][i/2-1]==score[i/2][i/2-2] && mismatch[2]==0) ? 3 : + (score[i/2+1][i/2-1]-score[i/2][i/2-1]==1) ? 1 : 4; + + + if(i == 2 * rSeqLength - 2) + { + error = i2; + minIndex2 = i-rSeqLength; + } + } + } + + Down1 = _mm_insert_epi16(Down1,2*e,0); + + //fill the first part of the error + mismatch[0] = ((a[i/2]) != (b[i/2-2])); + Diag = _mm_insert_epi16(Diag,mismatch[0],0); + mismatch[1] = ((a[i/2-1]) != (b[i/2-1])); + Diag = _mm_insert_epi16(Diag,mismatch[1],1); + Diag = _mm_insert_epi16(Diag,2*e,2); + + R0 = _mm_min_epi16(R1+Side1, R0+Diag); + R0 = _mm_min_epi16(R0,_mm_slli_si128(R1,2)+Down1); + + i0 = _mm_extract_epi16(R0, 0); + i1 = _mm_extract_epi16(R0, 1); + + error = min(error, i1); + if(error == i1) + minIndex2 = i-rSeqLength; + + score[i/2-1][i/2+1] = i0; + score[i/2][i/2] = i1; + + direction2[i/2-1][i/2+1] = (score[i/2-1][i/2+1]==score[i/2-2][i/2] && mismatch[0] == 0) ? 3 : + (score[i/2-1][i/2+1]-score[i/2-1][i/2]==1) ? 2 : 4; + + direction2[i/2][i/2] = (score[i/2][i/2]==score[i/2-1][i/2-1] && mismatch[1] == 0) ? 3 : + (score[i/2][i/2]-score[i/2-1][i/2]==1) ? 1 : + (score[i/2][i/2]-score[i/2][i/2-1]==1)? 2 : 4; + + + //fill the second part of the error + i++; + Diag = _mm_sub_epi8(Diag, Diag); + Diag = _mm_insert_epi16(Diag,2*e,0); + mismatch[0] = ((a[i/2]) != (b[rSeqLength-1])); + Diag = _mm_insert_epi16(Diag,mismatch[0],1); + Diag = _mm_insert_epi16(Diag,2*e,2); + + R1 = _mm_min_epi16(R0+Side1, _mm_slli_si128(R1,2)+Diag); + R1 = _mm_min_epi16(R1,_mm_slli_si128(R0,2)+Down1); + + i0 = _mm_extract_epi16(R1, 0); + i1 = _mm_extract_epi16(R1, 1); + + error = min(error, i1); + if(error == i1) + minIndex2 = i-rSeqLength; + + score[i/2-1][i/2+2] = i0; + score[i/2][i/2+1] = i1; + + direction2[i/2-1][i/2+2] = (score[i/2-1][i/2+2]==score[i/2-2][i/2+1] && mismatch[0] == 0) ? 3 : + (score[i/2-1][i/2+2]-score[i/2-1][i/2+1]==1) ? 2 : 3; + + direction2[i/2][i/2+1] = (score[i/2][i/2+1]==score[i/2-1][i/2] && mismatch[0] == 0) ? 3 : + (score[i/2][i/2+1]-score[i/2-1][i/2+1]==1)? 1 : + (score[i/2][i/2+1]-score[i/2][i/2]==1)? 2 : 4; + + + //fill the last the last element of the matrix + i++; + Diag = _mm_sub_epi8(Diag, Diag); + mismatch[0] = ((a[i/2]) != (b[rSeqLength-1])); + Diag = _mm_insert_epi16(Diag,mismatch[0],0); + + Down = _mm_sub_epi8(Down, Down); + Down = _mm_insert_epi16(Down,1,0); + + Side = _mm_sub_epi8(Side, Side); + Side = _mm_insert_epi16(Side,1,0); + + tmp = _mm_srli_si128(R1,2); + + R0 = _mm_min_epi16(R1+Down, R0+Diag); + R0 = _mm_min_epi16(R0,tmp+Side); + + i0 = _mm_extract_epi16(R0, 0); + + error = min(error, i0); + if(error == i0) + minIndex2 = i-rSeqLength; + + if(mismatch[0] == 0) + direction2[rSeqLength][rSeqLength+errThreshold] = 3; + else + { + if(score[rSeqLength][rSeqLength+errThreshold] - score[rSeqLength][rSeqLength+errThreshold-1] == 1) + direction2[lSeqLength][lSeqLength+errThreshold] = 2; + else if(score[rSeqLength][rSeqLength+errThreshold] - score[rSeqLength-1][rSeqLength+errThreshold] == 1) + direction2[rSeqLength][rSeqLength+errThreshold] = 1; + else + direction2[rSeqLength][rSeqLength+errThreshold] = 4; + } + + } + + totalError = error1 + error; + + size = 0; + directionIndex = rSeqLength; + rIndex = minIndex2; + + + if(rSeqLength > e) + { + while(directionIndex != 0 || rIndex != 0) + { + + if(direction2[directionIndex][rIndex] == 3) + { + matrixR[size] = 'M'; + rIndex--; + directionIndex--; + } + else if(direction2[directionIndex][rIndex] == 4) + { + matrixR[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else if(direction2[directionIndex][rIndex] == 2) + { + matrixR[size] = *(ref+rIndex-1); + size++; + matrixR[size] = 'D'; + rIndex--; + } + else + { + matrixR[size] = *(rSeq+directionIndex-1); + size++; + matrixR[size] = 'I'; + directionIndex--; + } + size++; + } + matrixR[size] = '\0'; + } + size = 0; + directionIndex = lSeqLength; + rIndex = minIndex1; + + while(directionIndex != 0 || rIndex != 0) + { + + if(direction1[directionIndex][rIndex] == 3) + { + matrixL[size] = 'M'; + rIndex--; + directionIndex--; + } + else if(direction1[directionIndex][rIndex] == 4) + { + matrixL[size] = *(tempref-rIndex); + rIndex--; + directionIndex--; + } + else if(direction1[directionIndex][rIndex] == 2) + { + matrixL[size] = 'D'; + size++; + matrixL[size] = *(tempref-rIndex); + rIndex--; + } + else + { + matrixL[size] = 'I'; + size++; + matrixL[size] = *(lSeq+lSeqLength-directionIndex); + directionIndex--; + } + + size++; + } + + matrixL[size] = '\0'; + + char middle[200]; + middle[0] = '\0'; + + for(i = 0; i < segLength; i++) + middle[i] = 'M'; + middle[segLength] = '\0'; + + char rmatrixR[200]; + + reverse(matrixR, rmatrixR, strlen(matrixR)); + + sprintf(matrix, "%s%s%s", matrixL, middle, rmatrixR); + + return totalError; +} + +int verifySingleEndEditDistance4(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, char *matrix, int *map_location, short *seqHashValue) +{ + + int i = 0; + + char * ref; + char * tempref; + + int rIndex = 0; //reference Index + + int error = 0; + int error1 = 0; + + int error2 = 0; + int error3 = 0; + int totalError = 0; + int errorSegment = 0; + + int ERROR_BOUND = errThreshold; + + + /* + 1: Up + 2: Side + 3: Diagnoal Match + 4: Diagnoal Mismatch + */ + + int min = 0; + int minIndex1 = 0; + int minIndex2 = 0; + + int directionIndex = 0; + + + int size = 0; + + ref = _msf_refGen + refIndex - 1; + tempref = _msf_refGen + refIndex - 1; + + + if(lSeqLength != 0) + { + error3 = backwardEditDistance4SSE2(ref-1, lSeqLength, lSeq+lSeqLength-1, lSeqLength); + if(error3 == -1 || error3 == 0){ + return -1; + } + } + + if(rSeqLength != 0) + { + error2 = forwardEditDistance4SSE2(ref+segLength, rSeqLength, rSeq, rSeqLength); + if(error2 == -1) + return -1; + } + + if(error2 + error3 > errThreshold) + return -1; + + rIndex = 1; + + int prevError = 0; + + int tempUp = 0; + int tempDown = 0; + + int errorString = 0; + + int upValue; + int diagValue; + int sideValue; + + while(rIndex <= lSeqLength+errThreshold && lSeqLength != 0) + { + tempUp = ((rIndex - ERROR_BOUND) > 0 ? ((rIndex > lSeqLength) ? lSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1 ); + tempDown = ((rIndex >= lSeqLength-ERROR_BOUND ) ? lSeqLength+1 :rIndex + ERROR_BOUND + 1); + for(i = tempUp ; i < tempDown ; i++) + { + errorString = (*(ref-rIndex) == *(lSeq+lSeqLength-i)); + + upValue = scoreB[i-1][rIndex]+1; + diagValue = scoreB[i-1][rIndex-1]+ !errorString; + sideValue = scoreB[i][rIndex-1]+1; + + if(i != tempUp && i != tempDown-1) + scoreB[i][rIndex] = min3(sideValue, diagValue , upValue); + + else if( (i == ((rIndex - ERROR_BOUND) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= lSeqLength ) + scoreB[i][rIndex] = min(sideValue, diagValue); + else if(rIndex > lSeqLength && (i == lSeqLength - ERROR_BOUND) ) + scoreB[i][rIndex] = sideValue; + else + scoreB[i][rIndex] = min(diagValue , upValue); + + if(i == tempUp) + error = scoreB[i][rIndex]; + else if(error > scoreB[i][rIndex]) + error = scoreB[i][rIndex]; + } + if(rIndex <= lSeqLength) + { + errorSegment = error-prevError; + } + rIndex++; + } + + if(lSeqLength != 0) + { + min = scoreB[lSeqLength][lSeqLength+errThreshold]; + minIndex1 = lSeqLength + errThreshold; + + // Find the Best error for all the possible ways. + for(i = 1; i <= 2*errThreshold; i++) + { + if(min >= scoreB[lSeqLength][lSeqLength+errThreshold-i] && lSeqLength+errThreshold-i > 0) + { + min = scoreB[lSeqLength][lSeqLength+errThreshold-i]; + minIndex1 = lSeqLength+errThreshold-i; + } + } + error = scoreB[lSeqLength][minIndex1]; + } + + error1 = error; + + error = 0; + errorSegment = 0; + + directionIndex = lSeqLength; + rIndex = minIndex1; + + + *map_location = ((lSeqLength == 0) ? refIndex : refIndex - rIndex) ; + + ref = ref + segLength; + + if(rSeqLength != 0) + { + rIndex = 1; + while(rIndex <= rSeqLength+errThreshold-error1) + { + tempUp = (rIndex - ERROR_BOUND) > 0 ? ((rIndex > rSeqLength) ? rSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1; + tempDown = ((rIndex >= rSeqLength- ERROR_BOUND ) ? rSeqLength+1 :rIndex + ERROR_BOUND + 1); + for(i = tempUp; i < tempDown ; i++) + { + errorString = (*(ref+rIndex-1) == *(rSeq+i-1)); + + upValue = scoreF[i-1][rIndex]+1; + diagValue = scoreF[i-1][rIndex-1]+ !errorString; + sideValue = scoreF[i][rIndex-1]+1; + + if(i != tempUp && i != tempDown-1) + scoreF[i][rIndex] = min3(sideValue, diagValue , upValue); + else if( (i == ((rIndex - ERROR_BOUND ) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= rSeqLength ) + scoreF[i][rIndex] = min(sideValue, diagValue); + else if(rIndex > rSeqLength && (i == rSeqLength - ERROR_BOUND) ) + scoreF[i][rIndex] = sideValue; + else + scoreF[i][rIndex] = min(diagValue , upValue); + + if(i == tempUp) + error = scoreF[i][rIndex]; + if(error > scoreF[i][rIndex]) + error = scoreF[i][rIndex]; + } + if(rIndex <= rSeqLength) + { + errorSegment = error; + } + + rIndex++; + } + + min = scoreF[rSeqLength][rSeqLength+errThreshold-error1]; + minIndex2 = rSeqLength + errThreshold-error1; + + // Find the Best error for all the possible ways. + for(i = 1; i <= 2*(errThreshold-error1); i++) + { + if(min > scoreF[rSeqLength][rSeqLength+errThreshold-error1-i] && rSeqLength+errThreshold-error1-i > 0) + { + min = scoreF[rSeqLength][rSeqLength+errThreshold-error1-i]; + minIndex2 = rSeqLength+errThreshold-error1-i; + } + } + error = scoreF[rSeqLength][minIndex2]; + } + + totalError = error + error1; + + if(errThreshold > 4) + printf("ERROR in errorThreshold.\n"); + + + if(totalError != error2 + error3 && totalError > errThreshold) + { + printf("ErrorF=%d, ErrorB=%d Error=%d Error=%d\n", error2,error3,error1,error); + + scanf("%d", &i); + } + + char matrixR[200]; + char matrixL[200]; + + matrixR[0] = '\0'; + matrixL[0] = '\0'; + + size = 0; + directionIndex = rSeqLength; + rIndex = minIndex2; + + while(directionIndex != 0 || rIndex != 0) + { + if(directionIndex-rIndex == errThreshold) + { + if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1) + { + matrixR[size] = *(rSeq+directionIndex-1); + size++; + matrixR[size] = 'I'; + directionIndex--; + } + else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) + { + matrixR[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrixR[size] = 'M'; + rIndex--; + directionIndex--; + } + + } + else if(rIndex - directionIndex == errThreshold) + { + if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1) + { + matrixR[size] = *(ref+rIndex-1); + size++; + matrixR[size] = 'D'; + rIndex--; + } + else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) + { + matrixR[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrixR[size] = 'M'; + rIndex--; + directionIndex--; + } + } + else + { + if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1 && directionIndex != 0) + { + matrixR[size] = *(rSeq+directionIndex-1); + size++; + matrixR[size] = 'I'; + directionIndex--; + } + else if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1 && rIndex != 0) + { + matrixR[size] = *(ref+rIndex-1); + size++; + matrixR[size] = 'D'; + rIndex--; + } + else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) + { + matrixR[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrixR[size] = 'M'; + rIndex--; + directionIndex--; + } + } + size++; + } + matrixR[size] = '\0'; + + size = 0; + directionIndex = lSeqLength; + rIndex = minIndex1; + + + while(directionIndex != 0 || rIndex != 0) + { + if(directionIndex-rIndex == errThreshold) + { + if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1) + { + matrixL[size] = 'I'; + size++; + matrixL[size] = *(lSeq+lSeqLength-directionIndex); + directionIndex--; + } + else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) + { + matrixL[size] = *(tempref-rIndex); + rIndex--; + directionIndex--; + } + else + { + matrixL[size] = 'M'; + rIndex--; + directionIndex--; + } + + } + else if(rIndex - directionIndex == errThreshold) + { + if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1) + { + matrixL[size] = 'D'; + size++; + matrixL[size] = *(tempref-rIndex); + rIndex--; + } + else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) + { + matrixL[size] = *(tempref-rIndex); + rIndex--; + directionIndex--; + } + else + { + matrixL[size] = 'M'; + rIndex--; + directionIndex--; + } + } + else + { + if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1 && directionIndex != 0) + { + matrixL[size] = 'I'; + size++; + matrixL[size] = *(lSeq+lSeqLength-directionIndex); + directionIndex--; + } + else if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1 && rIndex != 0) + { + matrixL[size] = 'D'; + size++; + matrixL[size] = *(tempref-rIndex); + rIndex--; + } + else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) + { + matrixL[size] = *(tempref-rIndex); + rIndex--; + directionIndex--; + } + else + { + matrixL[size] = 'M'; + rIndex--; + directionIndex--; + } + } + + size++; + } + + matrixL[size] = '\0'; + char middle[200]; + middle[0] = '\0'; + + for(i = 0; i < segLength; i++) + middle[i] = 'M'; + middle[segLength] = '\0'; + + char rmatrixR[200]; + + reverse(matrixR, rmatrixR, strlen(matrixR)); + + sprintf(matrix, "%s%s%s", matrixL, middle, rmatrixR); + + return totalError; + +} + +int verifySingleEndEditDistanceExtention(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, + char *matrix, int *map_location, short *seqHashValue) +{ + int i = 0; + + char * ref; + char * tempref; + + int rIndex = 0; //reference Index + + int error = 0; + int error1 = 0; + + int error2 = 0; + int error3 = 0; + int totalError = 0; + int errorSegment = 0; + + int ERROR_BOUND = min(4, errThreshold); + + + /* + 1: Up + 2: Side + 3: Diagnoal Match + 4: Diagnoal Mismatch + */ + + int min = 0; + int minIndex1 = 0; + int minIndex2 = 0; + + int directionIndex = 0; + + + int size = 0; + + ref = _msf_refGen + refIndex - 1; + tempref = _msf_refGen + refIndex - 1; + + + if(lSeqLength != 0) + { + error3 = backwardEditDistanceSSE2Extention(ref-1, lSeqLength, lSeq+lSeqLength-1, lSeqLength); + if(error3 == -1){ + return -1; + } + } + + if(rSeqLength != 0) + { + error2 = forwardEditDistanceSSE2Extention(ref+segLength, rSeqLength, rSeq, rSeqLength); + if(error2 == -1) + return -1; + } + + if(error2 + error3 > errThreshold) + return -1; + + rIndex = 1; + + int prevError = 0; + + int tempUp = 0; + int tempDown = 0; + + int errorString = 0; + + int upValue; + int diagValue; + int sideValue; + if(lSeqLength > ERROR_BOUND) + { + while(rIndex <= lSeqLength+ERROR_BOUND && lSeqLength != 0) + { + tempUp = ((rIndex - ERROR_BOUND) > 0 ? ((rIndex > lSeqLength) ? lSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1 ); + tempDown = ((rIndex >= lSeqLength-ERROR_BOUND ) ? lSeqLength+1 :rIndex + ERROR_BOUND + 1); + for(i = tempUp ; i < tempDown ; i++) + { + errorString = (*(ref-rIndex) == *(lSeq+lSeqLength-i)); + + upValue = scoreB[i-1][rIndex]+1; + diagValue = scoreB[i-1][rIndex-1]+ !errorString; + sideValue = scoreB[i][rIndex-1]+1; + + if(i != tempUp && i != tempDown-1) + scoreB[i][rIndex] = min3(sideValue, diagValue , upValue); + + else if( (i == ((rIndex - ERROR_BOUND) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= lSeqLength ) + scoreB[i][rIndex] = min(sideValue, diagValue); + else if(rIndex > lSeqLength && (i == lSeqLength - ERROR_BOUND) ) + scoreB[i][rIndex] = sideValue; + else + scoreB[i][rIndex] = min(diagValue , upValue); + + if(i == tempUp) + error = scoreB[i][rIndex]; + else if(error > scoreB[i][rIndex]) + error = scoreB[i][rIndex]; + } + if(rIndex <= lSeqLength) + { + errorSegment = error-prevError; + } + rIndex++; + } + + if(lSeqLength != 0) + { + min = scoreB[lSeqLength][lSeqLength+ERROR_BOUND]; + minIndex1 = lSeqLength + ERROR_BOUND; + + // Find the Best error for all the possible ways. + for(i = 1; i <= 2*ERROR_BOUND; i++) + { + if(min >= scoreB[lSeqLength][lSeqLength+ERROR_BOUND-i] && lSeqLength+ERROR_BOUND-i > 0) + { + min = scoreB[lSeqLength][lSeqLength+ERROR_BOUND-i]; + minIndex1 = lSeqLength+ERROR_BOUND-i; + } + } + error = scoreB[lSeqLength][minIndex1]; + } + } + else + { + int j = 0; + for(i = 1; i <= lSeqLength; i++) + { + for(j = 1; j <= lSeqLength; j++) + { + scoreB[i][j] = min3(scoreB[i-1][j-1]+ (*(ref-j) != *(lSeq+lSeqLength-i) ),scoreB[i][j-1]+1 ,scoreB[i-1][j]+1); + } + } + error = scoreB[lSeqLength][lSeqLength]; + minIndex1 = lSeqLength; + + } + error1 = error; + + error = 0; + errorSegment = 0; + + directionIndex = lSeqLength; + rIndex = minIndex1; + + *map_location = ((lSeqLength == 0) ? refIndex : refIndex - rIndex) ; + + ref = ref + segLength; + + if(rSeqLength != 0 && rSeqLength > ERROR_BOUND) + { + ERROR_BOUND = min(ERROR_BOUND, rSeqLength); + + if(rSeqLength == ERROR_BOUND) + { + for(i=0; i < 2*ERROR_BOUND; i++) + scoreF[0][i] = i; + } + + rIndex = 1; + while(rIndex <= rSeqLength+ERROR_BOUND) + { + tempUp = (rIndex - ERROR_BOUND) > 0 ? ((rIndex > rSeqLength) ? rSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1; + tempDown = ((rIndex >= rSeqLength- ERROR_BOUND ) ? rSeqLength+1 :rIndex + ERROR_BOUND + 1); + for(i = tempUp; i < tempDown ; i++) + { + errorString = (*(ref+rIndex-1) == *(rSeq+i-1)); + upValue = scoreF[i-1][rIndex]+1; + diagValue = scoreF[i-1][rIndex-1]+ !errorString; + sideValue = scoreF[i][rIndex-1]+1; + + if(i != tempUp && i != tempDown-1) + scoreF[i][rIndex] = min3(sideValue, diagValue , upValue); + else if( (i == ((rIndex - ERROR_BOUND ) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= rSeqLength ) + scoreF[i][rIndex] = min(sideValue, diagValue); + else if(rIndex > rSeqLength && (i == rSeqLength - ERROR_BOUND) ) + scoreF[i][rIndex] = sideValue; + else + scoreF[i][rIndex] = min(diagValue , upValue); + + if(i == tempUp) + error = scoreF[i][rIndex]; + if(error > scoreF[i][rIndex]) + error = scoreF[i][rIndex]; + } + if(rIndex <= rSeqLength) + { + errorSegment = error; + } + rIndex++; + } + min = scoreF[rSeqLength][rSeqLength+ERROR_BOUND]; + minIndex2 = rSeqLength + ERROR_BOUND; + + // Find the Best error for all the possible ways. + for(i = 1; i <= 2*ERROR_BOUND; i++) + { + if(min > scoreF[rSeqLength][rSeqLength+ERROR_BOUND-i] && rSeqLength+ERROR_BOUND-i > 0) + { + min = scoreF[rSeqLength][rSeqLength+ERROR_BOUND-i]; + minIndex2 = rSeqLength+ERROR_BOUND-i; + } + } + error = scoreF[rSeqLength][minIndex2]; + } + else + { + int j = 0; + for(i = 1; i <= rSeqLength; i++) + { + for(j = 1; j <= rSeqLength; j++) + { + scoreF[i][j] = min3(scoreF[i-1][j-1]+ (*(ref+j-1) != *(rSeq+i-1) ),scoreF[i][j-1]+1 ,scoreF[i-1][j]+1); + } + } + error = scoreF[rSeqLength][rSeqLength]; + minIndex2 = rSeqLength; + } + + totalError = error + error1; + + if(totalError != error2+error3) + { + for(i = 0; i < lSeqLength; i++) + printf("%c", *(tempref-1-i)); + printf("\n"); + for(i = 0; i < lSeqLength; i++) + printf("%c", *(lSeq+i)); + printf("\n"); + + for(i = 0; i < rSeqLength; i++) + printf("%c", *(tempref+segLength+i)); + printf("\n"); + + for(i = 0; i < rSeqLength; i++) + printf("%c", *(rSeq+i)); + printf("\n"); + + printf("ERROR=%d\n", totalError); + printf("ERROR_SSE=%d\n", error3+error2); + + printf("ERROR_SSE_back=%d E_SSE_forw=%d\n", error3, error2); + printf("ERROR_back=%d E_forw=%d\n", error1, error); + + } + + char matrixR[200]; + char matrixL[200]; + + matrixR[0] = '\0'; + matrixL[0] = '\0'; + + size = 0; + directionIndex = rSeqLength; + rIndex = minIndex2; + + + while(directionIndex != 0 || rIndex != 0) + { + if(directionIndex-rIndex == errThreshold) + { + if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1) + { + matrixR[size] = *(rSeq+directionIndex-1); + size++; + matrixR[size] = 'I'; + directionIndex--; + } + else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) + { + matrixR[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrixR[size] = 'M'; + rIndex--; + directionIndex--; + } + + } + else if(rIndex - directionIndex == errThreshold) + { + if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1) + { + matrixR[size] = *(ref+rIndex-1); + size++; + matrixR[size] = 'D'; + rIndex--; + } + else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) + { + matrixR[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrixR[size] = 'M'; + rIndex--; + directionIndex--; + } + } + else + { + if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1 && directionIndex != 0) + { + matrixR[size] = *(rSeq+directionIndex-1); + size++; + matrixR[size] = 'I'; + directionIndex--; + } + else if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1 && rIndex != 0) + { + matrixR[size] = *(ref+rIndex-1); + size++; + matrixR[size] = 'D'; + rIndex--; + } + else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) + { + matrixR[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrixR[size] = 'M'; + rIndex--; + directionIndex--; + } + } + size++; + } + matrixR[size] = '\0'; + + size = 0; + directionIndex = lSeqLength; + rIndex = minIndex1; + + + while(directionIndex != 0 || rIndex != 0) + { + if(directionIndex-rIndex == errThreshold) + { + if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1) + { + matrixL[size] = 'I'; + size++; + matrixL[size] = *(lSeq+lSeqLength-directionIndex); + directionIndex--; + } + else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) + { + matrixL[size] = *(tempref-rIndex); + rIndex--; + directionIndex--; + } + else + { + matrixL[size] = 'M'; + rIndex--; + directionIndex--; + } + + } + else if(rIndex - directionIndex == errThreshold) + { + if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1) + { + matrixL[size] = 'D'; + size++; + matrixL[size] = *(tempref-rIndex); + rIndex--; + } + else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) + { + matrixL[size] = *(tempref-rIndex); + rIndex--; + directionIndex--; + } + else + { + matrixL[size] = 'M'; + rIndex--; + directionIndex--; + } + } + else + { + if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1 && directionIndex != 0) + { + matrixL[size] = 'I'; + size++; + matrixL[size] = *(lSeq+lSeqLength-directionIndex); + directionIndex--; + } + else if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1 && rIndex != 0) + { + matrixL[size] = 'D'; + size++; + matrixL[size] = *(tempref-rIndex); + rIndex--; + } + else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) + { + matrixL[size] = *(tempref-rIndex); + rIndex--; + directionIndex--; + } + else + { + matrixL[size] = 'M'; + rIndex--; + directionIndex--; + } + } + size++; + } + matrixL[size] = '\0'; + + char middle[200]; + middle[0] = '\0'; + for(i = 0; i < segLength; i++) + middle[i] = 'M'; + middle[segLength] = '\0'; + + char rmatrixR[200]; + + reverse(matrixR, rmatrixR, strlen(matrixR)); + + sprintf(matrix, "%s%s%s", matrixL, middle, rmatrixR); + + + return totalError; + +} + + +int verifySingleEndEditDistance(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, char *matrix, int *map_location, short *seqHashValue) +{ + + int i = 0; + + char * ref; + char * tempref; + + int rIndex = 0; //reference Index + + int error = 0; + int error1 = 0; + + int error2 = 0; + int error3 = 0; + + int totalError = 0; + int errorSegment = 0; + + int ERROR_BOUND = errThreshold; + + /* + 1: Up + 2: Side + 3: Diagnoal Match + 4: Diagnoal Mismatch + */ + + int min = 0; + int minIndex1 = 0; + int minIndex2 = 0; + + int directionIndex = 0; + + + int size = 0; + + ref = _msf_refGen + refIndex - 1; + tempref = _msf_refGen + refIndex - 1; + + + if(rSeqLength != 0) + { + if(errThreshold %2 == 1) + error2 = forwardEditDistanceSSE2Odd(ref+segLength, rSeqLength, rSeq, rSeqLength); + else + error2 = forwardEditDistanceSSE2G(ref+segLength, rSeqLength, rSeq, rSeqLength); + if(error2 == -1) + return -1; + } + + if(lSeqLength != 0) + { + if(errThreshold % 2 == 1) + error3 = backwardEditDistanceSSE2Odd(ref-1, lSeqLength, lSeq+lSeqLength-1, lSeqLength); + else + error3 = backwardEditDistanceSSE2G(ref-1, lSeqLength, lSeq+lSeqLength-1, lSeqLength); + if(error3 == -1 || error3 == 0){ + return -1; + } + } + + if(error3 + error2 > errThreshold) + return -1; + + for(i = 0 ; i < errThreshold + 1; i++) + { + scoreB[0][i] = i; + scoreB[i][0] = i; + } + + rIndex = 1; + int prevError = 0; + + int tempUp = 0; + int tempDown = 0; + + int errorString = 0; + + int upValue; + int diagValue; + int sideValue; + + while(rIndex <= lSeqLength+errThreshold && lSeqLength != 0) + { + tempUp = ((rIndex - ERROR_BOUND) > 0 ? ((rIndex > lSeqLength) ? lSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1 ); + tempDown = ((rIndex >= lSeqLength-ERROR_BOUND ) ? lSeqLength+1 :rIndex + ERROR_BOUND + 1); + for(i = tempUp ; i < tempDown ; i++) + { + errorString = (*(ref-rIndex) == *(lSeq+lSeqLength-i)); + + upValue = scoreB[i-1][rIndex]+1; + diagValue = scoreB[i-1][rIndex-1]+ !errorString; + sideValue = scoreB[i][rIndex-1]+1; + + if(i != tempUp && i != tempDown-1) + scoreB[i][rIndex] = min3(sideValue, diagValue , upValue); + + else if( (i == ((rIndex - ERROR_BOUND) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= lSeqLength ) + scoreB[i][rIndex] = min(sideValue, diagValue); + else if(rIndex > lSeqLength && (i == lSeqLength - ERROR_BOUND) ) + scoreB[i][rIndex] = sideValue; + else + scoreB[i][rIndex] = min(diagValue , upValue); + + if(i == tempUp) + error = scoreB[i][rIndex]; + else if(error > scoreB[i][rIndex]) + error = scoreB[i][rIndex]; + } + if(rIndex <= lSeqLength) + { + errorSegment = error-prevError; + } + rIndex++; + } + if(lSeqLength != 0) + { + min = scoreB[lSeqLength][lSeqLength+errThreshold]; + minIndex1 = lSeqLength + errThreshold; + + // Find the Best error for all the possible ways. + for(i = 1; i <= 2*errThreshold; i++) + { + if(min >= scoreB[lSeqLength][lSeqLength+errThreshold-i] && lSeqLength+errThreshold-i > 0) + { + min = scoreB[lSeqLength][lSeqLength+errThreshold-i]; + minIndex1 = lSeqLength+errThreshold-i; + } + } + error = scoreB[lSeqLength][minIndex1]; + } + + error1 = error; + + error = 0; + errorSegment = 0; + + directionIndex = lSeqLength; + rIndex = minIndex1; + + *map_location = ((lSeqLength == 0) ? refIndex : refIndex - rIndex) ; + + ref = ref + segLength; + + if(rSeqLength != 0) + { + for(i = 0 ; i < errThreshold + 1; i++) + { + scoreF[0][i] = i; + scoreF[i][0] = i; + } + + + rIndex = 1; + while(rIndex <= rSeqLength+errThreshold-error1) + { + tempUp = (rIndex - ERROR_BOUND) > 0 ? ((rIndex > rSeqLength) ? rSeqLength - ERROR_BOUND :rIndex - ERROR_BOUND) : 1; + tempDown = ((rIndex >= rSeqLength- ERROR_BOUND ) ? rSeqLength+1 :rIndex + ERROR_BOUND + 1); + for(i = tempUp; i < tempDown ; i++) + { + errorString = (*(ref+rIndex-1) == *(rSeq+i-1)); + + upValue = scoreF[i-1][rIndex]+1; + diagValue = scoreF[i-1][rIndex-1]+ !errorString; + sideValue = scoreF[i][rIndex-1]+1; + + if(i != tempUp && i != tempDown-1) + scoreF[i][rIndex] = min3(sideValue, diagValue , upValue); + else if( (i == ((rIndex - ERROR_BOUND ) > 0 ? rIndex - ERROR_BOUND : 1)) && rIndex <= rSeqLength ) + scoreF[i][rIndex] = min(sideValue, diagValue); + else if(rIndex > rSeqLength && (i == rSeqLength - ERROR_BOUND) ) + scoreF[i][rIndex] = sideValue; + else + scoreF[i][rIndex] = min(diagValue , upValue); + + if(i == tempUp) + error = scoreF[i][rIndex]; + if(error > scoreF[i][rIndex]) + error = scoreF[i][rIndex]; + } + if(rIndex <= rSeqLength) + { + errorSegment = error; + } + rIndex++; + } + + min = scoreF[rSeqLength][rSeqLength+errThreshold-error1]; + minIndex2 = rSeqLength + errThreshold-error1; + + // Find the Best error for all the possible ways. + for(i = 1; i <= 2*(errThreshold-error1); i++) + { + if(min > scoreF[rSeqLength][rSeqLength+errThreshold-error1-i] && rSeqLength+errThreshold-error1-i > 0) + { + min = scoreF[rSeqLength][rSeqLength+errThreshold-error1-i]; + minIndex2 = rSeqLength+errThreshold-error1-i; + } + } + error = scoreF[rSeqLength][minIndex2]; + } + + totalError = error + error1; + + + if(totalError != error2 + error3 && totalError > errThreshold) + { + for(i = 0; i < lSeqLength; i++) + printf("%c", *(tempref-1-i)); + printf("\n"); + for(i = 0; i < lSeqLength; i++) + printf("%c", *(lSeq+i)); + printf("\n"); + + for(i = 0; i < rSeqLength; i++) + printf("%c", *(tempref+segLength+i)); + printf("\n"); + + for(i = 0; i < rSeqLength; i++) + printf("%c", *(rSeq+i)); + printf("\n"); + + + printf("SSEF=%d SSEB%d\n", error2, error3); + printf("F=%d B=%d\n", error, error1); + scanf("%d", &i); + } + + char matrixR[200]; + char matrixL[200]; + + matrixR[0] = '\0'; + matrixL[0] = '\0'; + + size = 0; + directionIndex = rSeqLength; + rIndex = minIndex2; + + while(directionIndex != 0 || rIndex != 0) + { + if(directionIndex-rIndex == errThreshold) + { + if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1) + { + matrixR[size] = *(rSeq+directionIndex-1); + size++; + matrixR[size] = 'I'; + directionIndex--; + } + else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) + { + matrixR[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrixR[size] = 'M'; + rIndex--; + directionIndex--; + } + + } + else if(rIndex - directionIndex == errThreshold) + { + if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1) + { + matrixR[size] = *(ref+rIndex-1); + size++; + matrixR[size] = 'D'; + rIndex--; + } + else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) + { + matrixR[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrixR[size] = 'M'; + rIndex--; + directionIndex--; + } + } + else + { + if(scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex] == 1 && directionIndex != 0) + { + matrixR[size] = *(rSeq+directionIndex-1); + size++; + matrixR[size] = 'I'; + directionIndex--; + } + else if(scoreF[directionIndex][rIndex] - scoreF[directionIndex][rIndex-1] == 1 && rIndex != 0) + { + matrixR[size] = *(ref+rIndex-1); + size++; + matrixR[size] = 'D'; + rIndex--; + } + else if( scoreF[directionIndex][rIndex] - scoreF[directionIndex-1][rIndex-1] == 1 ) + { + matrixR[size] = *(ref+rIndex-1); + rIndex--; + directionIndex--; + } + else + { + matrixR[size] = 'M'; + rIndex--; + directionIndex--; + } + } + size++; + } + matrixR[size] = '\0'; + + size = 0; + directionIndex = lSeqLength; + rIndex = minIndex1; + + + while(directionIndex != 0 || rIndex != 0) + { + if(directionIndex-rIndex == errThreshold) + { + if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1) + { + matrixL[size] = 'I'; + size++; + matrixL[size] = *(lSeq+lSeqLength-directionIndex); + directionIndex--; + } + else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) + { + matrixL[size] = *(tempref-rIndex); + rIndex--; + directionIndex--; + } + else + { + matrixL[size] = 'M'; + rIndex--; + directionIndex--; + } + + } + else if(rIndex - directionIndex == errThreshold) + { + if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1) + { + matrixL[size] = 'D'; + size++; + matrixL[size] = *(tempref-rIndex); + rIndex--; + } + else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) + { + matrixL[size] = *(tempref-rIndex); + rIndex--; + directionIndex--; + } + else + { + matrixL[size] = 'M'; + rIndex--; + directionIndex--; + } + } + else + { + if(scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex] == 1 && directionIndex != 0) + { + matrixL[size] = 'I'; + size++; + matrixL[size] = *(lSeq+lSeqLength-directionIndex); + directionIndex--; + } + else if(scoreB[directionIndex][rIndex] - scoreB[directionIndex][rIndex-1] == 1 && rIndex != 0) + { + matrixL[size] = 'D'; + size++; + matrixL[size] = *(tempref-rIndex); + rIndex--; + } + else if( scoreB[directionIndex][rIndex] - scoreB[directionIndex-1][rIndex-1] == 1 ) + { + matrixL[size] = *(tempref-rIndex); + rIndex--; + directionIndex--; + } + else + { + matrixL[size] = 'M'; + rIndex--; + directionIndex--; + } + } + size++; + } + matrixL[size] = '\0'; + char middle[200]; + middle[0] = '\0'; + for(i = 0; i < segLength; i++) + middle[i] = 'M'; + middle[segLength] = '\0'; + + char rmatrixR[200]; + + reverse(matrixR, rmatrixR, strlen(matrixR)); + + sprintf(matrix, "%s%s%s", matrixL, middle, rmatrixR); + + return totalError; +} + + +int addCigarSize(int cnt){ + if (cnt<10) return 1; + else if (cnt < 100) return 2; + return 3; +} + +/* + Generate Cigar from the back tracking matrix +*/ +void generateCigar(char *matrix, int matrixLength, char *cigar) +{ + int i = 0; + + int counterM=0; + int counterI=0; + int counterD=0; + + int cigarSize = 0; + + cigar[0] = '\0'; + + while(i < matrixLength) + { + if(matrix[i]=='M') + { + counterM++; + if(counterI != 0) + { + sprintf(cigar, "%s%dI", cigar, counterI); + cigarSize += addCigarSize(counterI) + 1; + cigar[cigarSize] = '\0'; + counterI=0; + } + else if(counterD != 0) + { + sprintf(cigar, "%s%dD", cigar, counterD); + cigarSize += addCigarSize(counterD) + 1; + cigar[cigarSize] = '\0'; + counterD=0; + } + } + else if(matrix[i] == 'I') + { + if(counterM != 0) + { + sprintf(cigar, "%s%dM", cigar, counterM); + cigarSize += addCigarSize(counterM) + 1; + cigar[cigarSize] = '\0'; + counterM = 0; + } + else if(counterD != 0) + { + sprintf(cigar, "%s%dD", cigar, counterD); + cigarSize += addCigarSize(counterD) + 1; + cigar[cigarSize] = '\0'; + counterD=0; + } + counterI++; + i++; + + } + else if (matrix[i] == 'D') + { + if(counterM != 0) + { + sprintf(cigar, "%s%dM", cigar, counterM); + cigarSize += addCigarSize(counterM) + 1; + cigar[cigarSize] = '\0'; + counterM = 0; + } + else if(counterI != 0) + { + sprintf(cigar, "%s%dI", cigar, counterI); + cigarSize += addCigarSize(counterI) + 1; + cigar[cigarSize] = '\0'; + counterI=0; + } + + counterD++; + i++; + + } + else + { + counterM++; + if(counterI != 0) + { + sprintf(cigar, "%s%dI", cigar, counterI); + cigarSize += addCigarSize(counterI) + 1; + cigar[cigarSize] = '\0'; + counterI=0; + } + else if(counterD != 0) + { + sprintf(cigar, "%s%dD", cigar, counterD); + cigarSize += addCigarSize(counterD) + 1; + cigar[cigarSize] = '\0'; + counterD=0; + } + } + i++; + } + + if(counterM != 0) + { + sprintf(cigar, "%s%dM", cigar, counterM); + cigarSize += addCigarSize(counterM) + 1; + cigar[cigarSize] = '\0'; + counterM = 0; + } + else if(counterI != 0) + { + sprintf(cigar, "%s%dI", cigar, counterI); + cigarSize += addCigarSize(counterI) + 1; + cigar[cigarSize] = '\0'; + counterI = 0; + } + else if(counterD != 0) + { + sprintf(cigar, "%s%dD", cigar, counterD); + cigarSize += addCigarSize(counterD) + 1; + cigar[cigarSize] = '\0'; + counterD = 0; + } + + cigar[cigarSize] = '\0'; +} + +/* + Creates the Cigar output from the mismatching positions format [0-9]+(([ACTGN]|\^[ACTGN]+)[0-9]+)* +*/ +void generateCigarFromMD(char *mismatch, int mismatchLength, char *cigar) +{ + int i = 0; + int j = 0; + + int start = 0; + int cigarSize = 0; + + cigar[0] = '\0'; + + while(i < mismatchLength) + { + if(mismatch[i] >= '0' && mismatch[i] <= '9') + { + start = i; + + while(mismatch[i] >= '0' && mismatch[i] <= '9' && i < mismatchLength) + i++; + + int value = atoi(mismatch+start); + for(j = 0; j < value-1; j++) + { + cigar[cigarSize] = 'M'; + cigarSize++; + } + cigar[cigarSize] = 'M'; + } + else if(mismatch[i] == '^') + { + cigar[cigarSize] = 'I'; + i++; + } + else if(mismatch[i] == '\'') + { + cigar[cigarSize] = 'D'; + i++; + } + else + { + cigar[cigarSize] = 'M'; + cigarSize++; + } + cigarSize++; + i++; + } + cigar[cigarSize] = '\0'; +} + +void generateSNPSAM(char *matrix, int matrixLength, char *outputSNP) +{ + + int i = 0; + + int counterM = 0; + int counterD = 0; + + char delete[100]; + + int snpSize = 0; + + outputSNP[0] = '\0'; + delete[0] = '\0'; + + + while(i < matrixLength) + { + if(matrix[i]=='M') + { + counterM++; + if(counterD != 0) + { + delete[counterD] = '\0'; + counterD=0; + sprintf(outputSNP, "%s^%s", outputSNP,delete); + snpSize += strlen(delete) + 1; + outputSNP[snpSize] = '\0'; + delete[0] = '\0'; + } + } + else if(matrix[i] == 'D') + { + if(counterM != 0) + { + sprintf(outputSNP, "%s%d", outputSNP, counterM); + snpSize += addCigarSize(counterM); + outputSNP[snpSize] = '\0'; + counterM=0; + delete[counterD] = matrix[i+1]; + i++; + counterD++; + } + else if(counterD != 0) + { + delete[counterD] = matrix[i+1]; + counterD++; + i++; + } + else + { + delete[counterD] = matrix[i+1]; + counterD++; + i++; + } + } + else if(matrix[i] == 'I') + { + if(counterM != 0) + { + // sprintf(outputSNP, "%s%d\0", outputSNP, counterM); + //counterM++; + } + else if(counterD != 0) + { + delete[counterD] = '\0'; + sprintf(outputSNP, "%s^%s", outputSNP, delete); + snpSize += strlen(delete) + 1; + outputSNP[snpSize] = '\0'; + counterD = 0; + delete[0] = '\0'; + } + i++; + + } + else + { + if(counterM != 0) + { + sprintf(outputSNP, "%s%d", outputSNP, counterM); + snpSize += addCigarSize(counterM); + outputSNP[snpSize] = '\0'; + counterM = 0; + } + if(counterD != 0) + { + delete[counterD] = '\0'; + counterD=0; + sprintf(outputSNP, "%s^%s", outputSNP, delete); + snpSize += strlen(delete) + 1; + outputSNP[snpSize] = '\0'; + delete[0] = '\0'; + } + sprintf(outputSNP,"%s%c",outputSNP,matrix[i]); + snpSize += 1; + outputSNP[snpSize] = '\0'; + } + i++; + } + + if(counterM != 0) + { + sprintf(outputSNP, "%s%d", outputSNP, counterM); + snpSize += addCigarSize(counterM); + outputSNP[snpSize] = '\0'; + counterM = 0; + } + else if(counterD != 0) + { + delete[counterD] = '\0'; + sprintf(outputSNP, "%s^%s", outputSNP, delete); + snpSize += strlen(delete) + 1; + outputSNP[snpSize] = '\0'; + counterD = 0; + } + + outputSNP[snpSize] = '\0'; +} +/**********************************************/ + +/* + direction = 0 forward + 1 backward + +*/ + +void mapSingleEndSeq(unsigned int *l1, int s1, int readNumber, int readSegment, int direction) +{ + int j = 0; + int z = 0; + int *locs = (int *) l1; + char *_tmpSeq, *_tmpQual; + char rqual[SEQ_LENGTH+1]; + rqual[SEQ_LENGTH]='\0'; + + int genLoc = 0; + int leftSeqLength = 0; + int rightSeqLength = 0; + int middleSeqLength = 0; + + char matrix[200]; + char editString[200]; + char cigar[MAX_CIGAR_SIZE]; + + short *_tmpHashValue; + + if (direction) + { + reverse(_msf_seqList[readNumber].qual, rqual, SEQ_LENGTH); + _tmpQual = rqual; + _tmpSeq = _msf_seqList[readNumber].rseq; + _tmpHashValue = _msf_seqList[readNumber].rhashValue; + } + else + { + _tmpQual = _msf_seqList[readNumber].qual; + _tmpSeq = _msf_seqList[readNumber].seq; + _tmpHashValue = _msf_seqList[readNumber].hashValue; + } + + int readId = 2*readNumber+direction; + for (z=0; z<s1; z++) + { + + + int map_location = 0; + int a = 0; + int o = readSegment; + + genLoc = locs[z];//-_msf_samplingLocs[o]; + + + if ( genLoc-_msf_samplingLocs[o] < _msf_refGenBeg || + genLoc-_msf_samplingLocs[o] > _msf_refGenEnd || + _msf_verifiedLocs[genLoc-_msf_samplingLocs[o]] == readId || + _msf_verifiedLocs[genLoc-_msf_samplingLocs[o]] == -readId + ) + continue; + int err = -1; + + + map_location = 0; + + leftSeqLength = _msf_samplingLocs[o]; + middleSeqLength = WINDOW_SIZE; + a = leftSeqLength + middleSeqLength; + rightSeqLength = SEQ_LENGTH - a; + + if(errThreshold == 2) + err = verifySingleEndEditDistance2(genLoc, _tmpSeq, leftSeqLength, + _tmpSeq + a, rightSeqLength, + middleSeqLength, matrix, &map_location, _tmpHashValue); + else if(errThreshold == 4) + err = verifySingleEndEditDistance4(genLoc, _tmpSeq, leftSeqLength, + _tmpSeq + a, rightSeqLength, + middleSeqLength, matrix, &map_location, _tmpHashValue); + else if(errThreshold ==3) + err = verifySingleEndEditDistance(genLoc, _tmpSeq, leftSeqLength, + _tmpSeq + a, rightSeqLength, + middleSeqLength, matrix, &map_location, _tmpHashValue); + /*else if(errThreshold == 6) + err = verifySingleEndEditDistance(genLoc, _tmpSeq, leftSeqLength, + _tmpSeq + a, rightSeqLength, + middleSeqLength, matrix, &map_location, _tmpHashValue); + */ + else + err = verifySingleEndEditDistanceExtention(genLoc, _tmpSeq, leftSeqLength, + _tmpSeq + a, rightSeqLength, + middleSeqLength, matrix, &map_location, _tmpHashValue); + + if(err != -1) + { + generateSNPSAM(matrix, strlen(matrix), editString); + generateCigar(matrix, strlen(matrix), cigar); + } + + if(err != -1 && !bestMode) + { + + mappingCnt++; + + int j = 0; + int k = 0; + for(k = 0; k < readSegment+1; k++) + { + for(j = -errThreshold ; j <= errThreshold; j++) + { + if(genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j >= _msf_refGenBeg && + genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j <= _msf_refGenEnd) + _msf_verifiedLocs[genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j] = readId; + } + } + _msf_seqList[readNumber].hits[0]++; + + _msf_output.QNAME = _msf_seqList[readNumber].name; + _msf_output.FLAG = 16 * direction; + _msf_output.RNAME = _msf_refGenName; + _msf_output.POS = map_location + _msf_refGenOffset; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = cigar; + _msf_output.MRNAME = "*"; + _msf_output.MPOS = 0; + _msf_output.ISIZE = 0; + _msf_output.SEQ = _tmpSeq; + _msf_output.QUAL = _tmpQual; + + _msf_output.optSize = 2; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = err; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = editString; + + output(_msf_output); + + + if (_msf_seqList[readNumber].hits[0] == 1) + { + mappedSeqCnt++; + } + + if ( maxHits == 0 ) + { + _msf_seqList[readNumber].hits[0] = 2; + } + + + if ( maxHits!=0 && _msf_seqList[readNumber].hits[0] == maxHits) + { + completedSeqCnt++; + break; + } + + } + else if(err != -1 && bestMode) + { + mappingCnt++; + _msf_seqList[readNumber].hits[0]++; + + if (_msf_seqList[readNumber].hits[0] == 1) + { + mappedSeqCnt++; + } + + if ( maxHits == 0 ) + { + _msf_seqList[readNumber].hits[0] = 2; + } + + if(err < bestHitMappingInfo[readNumber].err || bestHitMappingInfo[readNumber].loc == -1) + { + setFullMappingInfo(readNumber, map_location + _msf_refGenOffset, direction, err, 0, editString, _msf_refGenName, cigar ); + } + } + else + { + for(j = -errThreshold ; j <= errThreshold; j++) + { + if(genLoc+j > _msf_refGenBeg && + genLoc+j < _msf_refGenEnd) + _msf_verifiedLocs[genLoc+j] = -readId; + } + } + } +} + + +int mapAllSingleEndSeq() +{ + int i = 0; + int j = 0; + int k = 0; + + + unsigned int *locs = NULL; + + + int prev_hash = 0; + + for(i = 0; i < _msf_seqListSize; i++) + { + for(j = 0; j < _msf_samplingLocsSize; j++) + { + k = _msf_sort_seqList[i].readNumber; +// if(j != 0) +// if(strncmp(_msf_seqList[k].seq+_msf_samplingLocs[j], _msf_seqList[k].seq+_msf_samplingLocs[j-1], segSize) == 0) +// continue; +// if(prev_hash == hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j])) +// continue; + locs = getCandidates ( hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j])); + if ( locs != NULL) + { + mapSingleEndSeq(locs+1, locs[0],k ,j, 0); + } + prev_hash = hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j]); + } + } + i = 0; + + for(i = 0; i < _msf_seqListSize; i++) + { + for(j = 0; j < _msf_samplingLocsSize; j++) + { + k = _msf_sort_seqList[i].readNumber; + +// if(j != 0) +// if(strncmp(_msf_seqList[k].rseq+_msf_samplingLocs[j], _msf_seqList[k].rseq+_msf_samplingLocs[j-1], segSize) == 0) +// continue; +// if(prev_hash == hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j])) +// continue; + locs = getCandidates ( hashVal(_msf_seqList[k].rseq+_msf_samplingLocs[j])); + if ( locs != NULL) + { + mapSingleEndSeq(locs+1, locs[0],k ,j, 1); + } + prev_hash = hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j]); + } + } + return 1; +} + + +/**********************************************/ +/**********************************************/ +/**********************************************/ +/**********************************************/ +/**********************************************/ +int compareOut (const void *a, const void *b) +{ + FullMappingInfo *aInfo = (FullMappingInfo *)a; + FullMappingInfo *bInfo = (FullMappingInfo *)b; + return aInfo->loc - bInfo->loc; +} + + + +/**********************************************/ + +/* + direction 0: Forward + 1: Reverse +*/ + +void mapPairEndSeqList(unsigned int *l1, int s1, int readNumber, int readSegment, int direction) +{ + int z = 0; + int *locs = (int *) l1; + char *_tmpSeq; + + char rqual[SEQ_LENGTH+1]; + + char matrix[200]; + char editString[200]; + char cigar[MAX_CIGAR_SIZE]; + + short *_tmpHashValue; + + int leftSeqLength = 0; + int middleSeqLength = 0; + int rightSeqLength =0; + int a = 0; + + rqual[SEQ_LENGTH]='\0'; + + + int r = readNumber; + + char d = (direction==1)?-1:1; + + if (d==-1) + { + _tmpSeq = _msf_seqList[readNumber].rseq; + _tmpHashValue = _msf_seqList[r].rhashValue; + } + else + { + _tmpSeq = _msf_seqList[readNumber].seq; + _tmpHashValue = _msf_seqList[r].hashValue; + } + + int readId = 2*readNumber+direction; + for (z=0; z<s1; z++) + { + int genLoc = locs[z];//-_msf_samplingLocs[o]; + int err = -1; + int map_location = 0; + int o = readSegment; + + leftSeqLength = _msf_samplingLocs[o]; + middleSeqLength = WINDOW_SIZE; + a = leftSeqLength + middleSeqLength; + rightSeqLength = SEQ_LENGTH - a; + + if(genLoc - leftSeqLength < _msf_refGenBeg || genLoc + rightSeqLength + middleSeqLength > _msf_refGenEnd || + _msf_verifiedLocs[genLoc-_msf_samplingLocs[o]] == readId || _msf_verifiedLocs[genLoc-_msf_samplingLocs[o]] == -readId) + continue; + + if(errThreshold == 2) + err = verifySingleEndEditDistance2(genLoc, _tmpSeq, leftSeqLength, + _tmpSeq + a, rightSeqLength, + middleSeqLength, matrix, &map_location, _tmpHashValue); + else if(errThreshold == 4) + err = verifySingleEndEditDistance4(genLoc, _tmpSeq, leftSeqLength, + _tmpSeq + a, rightSeqLength, + middleSeqLength, matrix, &map_location, _tmpHashValue); + else if(errThreshold ==3) + err = verifySingleEndEditDistance(genLoc, _tmpSeq, leftSeqLength, + _tmpSeq + a, rightSeqLength, + middleSeqLength, matrix, &map_location, _tmpHashValue); + /*else if(errThreshold == 6) + err = verifySingleEndEditDistance(genLoc, _tmpSeq, leftSeqLength, + _tmpSeq + a, rightSeqLength, + middleSeqLength, matrix, &map_location, _tmpHashValue);*/ + else + err = verifySingleEndEditDistanceExtention(genLoc, _tmpSeq, leftSeqLength, + _tmpSeq + a, rightSeqLength, + middleSeqLength, matrix, &map_location, _tmpHashValue); + + + if (err != -1) + { + int j = 0; + int k = 0; + + for(k = 0; k < readSegment+1; k++) + { + for(j = -errThreshold ; j <= errThreshold; j++) + { + if(genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j >= _msf_refGenBeg && + genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j <= _msf_refGenEnd) + _msf_verifiedLocs[genLoc-(k*(_msf_samplingLocs[1]-_msf_samplingLocs[0]))+j] = readId; + } + } + + + generateSNPSAM(matrix, strlen(matrix), editString); + generateCigar(matrix, strlen(matrix), cigar); + + MappingLocations *parent = NULL; + MappingLocations *child = _msf_mappingInfo[r].next; + + genLoc = map_location + _msf_refGenOffset; + int i = 0; + for (i=0; i<(_msf_mappingInfo[r].size/MAP_CHUNKS); i++) + { + parent = child; + child = child->next; + } + + if (child==NULL) + { + MappingLocations *tmp = getMem(sizeof(MappingLocations)); + + tmp->next = NULL; + tmp->loc[0]=genLoc * d; + tmp->err[0]=err; + + tmp->cigarSize[0] = strlen(cigar); + sprintf(tmp->cigar[0],"%s", cigar); + + tmp->mdSize[0] = strlen(editString); + sprintf(tmp->md[0],"%s", editString); + + if (parent == NULL) + _msf_mappingInfo[r].next = tmp; + else + parent->next = tmp; + } + else + { + if(strlen(cigar) > SEQ_LENGTH || strlen(editString) > SEQ_LENGTH) + { + printf("ERROR in %d read size(After mapping) exceedes cigar=%d md =%d cigar=%s md =%s\n", r, (int)strlen(cigar), (int)strlen(editString), cigar, editString); + } + + child->loc[_msf_mappingInfo[r].size % MAP_CHUNKS] = genLoc * d; + child->err[_msf_mappingInfo[r].size % MAP_CHUNKS] = err; + + child->cigarSize[_msf_mappingInfo[r].size % MAP_CHUNKS] = strlen(cigar); + sprintf(child->cigar[_msf_mappingInfo[r].size % MAP_CHUNKS],"%s",cigar); + + child->mdSize[_msf_mappingInfo[r].size % MAP_CHUNKS] = strlen(editString); + sprintf(child->md[_msf_mappingInfo[r].size % MAP_CHUNKS],"%s",editString); + } + _msf_mappingInfo[r].size++; + + } + else + { + _msf_verifiedLocs[genLoc] = -readId; + } + + } +} + +/**********************************************/ +void mapPairedEndSeq() +{ + int i = 0; + int j = 0; + int k = 0; + + unsigned int *locs = NULL; + while ( i < _msf_seqListSize ) + { + for(j = 0; j < _msf_samplingLocsSize; j++) + { + k = _msf_sort_seqList[i].readNumber; + locs = getCandidates ( hashVal(_msf_seqList[k].seq+_msf_samplingLocs[j])); + if ( locs != NULL) + { + mapPairEndSeqList(locs+1, locs[0],k ,j, 0); + } + } + i++; + } + i = 0; + + while ( i < _msf_seqListSize ) + { + for(j = 0; j < _msf_samplingLocsSize; j++) + { + k = _msf_sort_seqList[i].readNumber; + locs = getCandidates ( hashVal(_msf_seqList[k].rseq+_msf_samplingLocs[j])); + if ( locs != NULL) + { + mapPairEndSeqList(locs+1, locs[0],k ,j, 1); + } + } + + i++; + } + char fname1[FILE_NAME_LENGTH]; + char fname2[FILE_NAME_LENGTH]; + MappingLocations *cur; + int tmpOut; + int lmax=0, rmax=0; + + sprintf(fname1, "%s__%s__%s__%d__1.tmp",mappingOutputPath, _msf_refGenName, mappingOutput, _msf_openFiles); + sprintf(fname2, "%s__%s__%s__%d__2.tmp",mappingOutputPath, _msf_refGenName, mappingOutput, _msf_openFiles); + + FILE* out; + FILE* out1 = fileOpen(fname1, "w"); + FILE* out2 = fileOpen(fname2, "w"); + + _msf_openFiles++; + + for (i=0; i<_msf_seqListSize; i++) + { + + if (i%2==0) + { + out = out1; + + if (lmax < _msf_mappingInfo[i].size) + { + lmax = _msf_mappingInfo[i].size; + } + } + else + { + out = out2; + if (rmax < _msf_mappingInfo[i].size) + { + rmax = _msf_mappingInfo[i].size; + } + } + + tmpOut = fwrite(&(_msf_mappingInfo[i].size), sizeof(int), 1, out); + if (_msf_mappingInfo[i].size > 0) + { + cur = _msf_mappingInfo[i].next; + for (j=0; j < _msf_mappingInfo[i].size; j++) + { + if ( j>0 && j%MAP_CHUNKS==0) + { + cur = cur->next; + } + if(cur->cigarSize[j % MAP_CHUNKS] > SEQ_LENGTH || cur->mdSize[j % MAP_CHUNKS] > SEQ_LENGTH) + { + printf("ERROR in %d read size exceeds cigar=%d md =%d cigar=%s md =%s\n", i, cur->cigarSize[j % MAP_CHUNKS], cur->mdSize[j % MAP_CHUNKS], cur->cigar[j % MAP_CHUNKS], cur->md[j % MAP_CHUNKS]); + } + + tmpOut = fwrite(&(cur->loc[j % MAP_CHUNKS]), sizeof(int), 1, out); + + tmpOut = fwrite(&(cur->err[j % MAP_CHUNKS]), sizeof(int), 1, out); + + tmpOut = fwrite(&(cur->cigarSize[j % MAP_CHUNKS]), sizeof(int), 1, out); + tmpOut = fwrite((cur->cigar[j % MAP_CHUNKS]), sizeof(char), (cur->cigarSize[j % MAP_CHUNKS]), out); + + tmpOut = fwrite(&(cur->mdSize[j % MAP_CHUNKS]), sizeof(int), 1, out); + tmpOut = fwrite((cur->md[j % MAP_CHUNKS]), sizeof(char), (cur->mdSize[j % MAP_CHUNKS]), out); + + } + _msf_mappingInfo[i].size = 0; + //_msf_mappingInfo[i].next = NULL; + } + } + + _msf_maxLSize += lmax; + _msf_maxRSize += rmax; + + fclose(out1); + fclose(out2); + +} + +void outputPairFullMappingInfo(FILE *fp, int readNumber) +{ + + char *seq1, *seq2, *rseq1, *rseq2, *qual1, *qual2; + char rqual1[SEQ_LENGTH+1], rqual2[SEQ_LENGTH+1]; + + rqual1[SEQ_LENGTH] = rqual2[SEQ_LENGTH] = '\0'; + + seq1 = _msf_seqList[readNumber*2].seq; + rseq1 = _msf_seqList[readNumber*2].rseq; + qual1 = _msf_seqList[readNumber*2].qual; + + reverse(_msf_seqList[readNumber*2].qual, rqual1, SEQ_LENGTH); + + seq2 = _msf_seqList[readNumber*2+1].seq; + rseq2 = _msf_seqList[readNumber*2+1].rseq; + qual2 = _msf_seqList[readNumber*2+1].qual; + + reverse(_msf_seqList[readNumber*2+1].qual, rqual2, SEQ_LENGTH); + + + if(bestHitMappingInfo[readNumber*2].loc == -1 && bestHitMappingInfo[readNumber*2+1].loc == -1) + return; + else + { + + char *seq; + char *qual; + char d1; + char d2; + int isize; + int proper=0; + // ISIZE CALCULATION + // The distance between outer edges + isize = abs(bestHitMappingInfo[readNumber*2].loc - bestHitMappingInfo[readNumber*2+1].loc)+SEQ_LENGTH - 2; + + if (bestHitMappingInfo[readNumber*2].loc - bestHitMappingInfo[readNumber*2+1].loc > 0) + { + isize *= -1; + } + d1 = (bestHitMappingInfo[readNumber*2].dir == -1)?1:0; + d2 = (bestHitMappingInfo[readNumber*2+1].dir == -1)?1:0; + + if ( d1 ) + { + seq = rseq1; + qual = rqual1; + } + else + { + seq = seq1; + qual = qual1; + } + if ( (bestHitMappingInfo[readNumber*2].loc < bestHitMappingInfo[readNumber*2+1].loc && !d1 && d2) || + (bestHitMappingInfo[readNumber*2].loc > bestHitMappingInfo[readNumber*2+1].loc && d1 && !d2) ) + { + proper = 2; + } + else + { + proper = 0; + } + + _msf_output.POS = bestHitMappingInfo[readNumber*2].loc; + _msf_output.MPOS = bestHitMappingInfo[readNumber*2+1].loc; + _msf_output.FLAG = 1+proper+16*d1+32*d2+64; + _msf_output.ISIZE = isize; + _msf_output.SEQ = seq, + _msf_output.QUAL = qual; + _msf_output.QNAME = _msf_seqList[readNumber*2].name; + _msf_output.RNAME = bestHitMappingInfo[readNumber*2].chr; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = bestHitMappingInfo[readNumber*2].cigar; + _msf_output.MRNAME = "="; + + _msf_output.optSize = 2; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = bestHitMappingInfo[readNumber*2].err; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = bestHitMappingInfo[readNumber*2].md; + + outputSAM(fp, _msf_output); + output(_msf_output); + + if ( d2 ) + { + seq = rseq2; + qual = rqual2; + } + else + { + seq = seq2; + qual = qual2; + } + + _msf_output.POS = bestHitMappingInfo[readNumber*2+1].loc; + _msf_output.MPOS = bestHitMappingInfo[readNumber*2].loc; + _msf_output.FLAG = 1+proper+16*d2+32*d1+128; + _msf_output.ISIZE = -isize; + _msf_output.SEQ = seq, + _msf_output.QUAL = qual; + _msf_output.QNAME = _msf_seqList[readNumber*2].name; + _msf_output.RNAME = bestHitMappingInfo[readNumber*2].chr; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = bestHitMappingInfo[readNumber*2+1].cigar; + _msf_output.MRNAME = "="; + + _msf_output.optSize = 2; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = bestHitMappingInfo[readNumber*2+1].err; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = bestHitMappingInfo[readNumber*2+1].md; + + outputSAM(fp, _msf_output); + output(_msf_output); + } +} + + +/* +Find the closet one to the c + @return 0: if the x1 is closer to c + 1: if the x2 is closer to c + 2: if both distance are equal + -1: if error +*/ +int findNearest(int x1, int x2, int c) +{ + + if (abs(x1 - c) > abs(x2 - c) ) + return 0; + else if ( abs(x1 - c) < abs(x2 - c) ) + return 1; + else if ( abs(x1 - c) == abs(x2 - c) ) + return 2; + else + return -1; +} + +void initBestConcordantDiscordant(int readNumber) +{ + char bestConcordantFileName[FILE_NAME_LENGTH]; + //char bestDiscordantFileName[FILE_NAME_LENGTH]; + + //OPEN THE BEST CONCORDANT FILE + //BEGIN{Farhad Hormozdiari} + /* begin {calkan} */ + //sprintf(bestConcordantFileName, "%s%s__BEST.CONCORDANT", mappingOutputPath, mappingOutput); + sprintf(bestConcordantFileName, "%s%s_BEST.sam", mappingOutputPath, mappingOutput); + + bestConcordantFILE = fileOpen(bestConcordantFileName, "w"); + bestDiscordantFILE = bestConcordantFILE; + /* end {calkan} */ + //END{Farhad Hormozdiari} + + + //OPEN THE BEST DISCORDANT FILE + //BEGIN{Farhad Hormozdiari} + /* begin {calkan} + sprintf(bestDiscordantFileName, "%s%s__BEST.DISCORDANT", mappingOutputPath, mappingOutput); + bestDiscordantFILE = fileOpen(bestDiscordantFileName, "w"); + end {calkan} */ + + //END{Farhad Hormozdiari} + + initBestMapping(readNumber); +} + +void finalizeBestConcordantDiscordant() +{ + int i = 0; + + for(i = 0; i<_msf_seqListSize/2; i++) + { + if(_msf_readHasConcordantMapping[i]==1) + outputPairFullMappingInfo(bestConcordantFILE, i); + else + outputPairFullMappingInfo(bestDiscordantFILE, i); + } + + fclose(bestConcordantFILE); + // fclose(bestDiscordantFILE); + + freeMem(bestHitMappingInfo, _msf_seqListSize * sizeof(FullMappingInfo)); +} + +void setFullMappingInfo(int readNumber, int loc, int dir, int err, int score, char *md, char * refName, char *cigar) +{ + bestHitMappingInfo[readNumber].loc = loc; + bestHitMappingInfo[readNumber].dir = dir; + bestHitMappingInfo[readNumber].err = err; + bestHitMappingInfo[readNumber].score = score; + + strncpy(bestHitMappingInfo[readNumber].md, md, strlen(md)+1); + strncpy(bestHitMappingInfo[readNumber].chr, refName, strlen(refName)+1); + strncpy(bestHitMappingInfo[readNumber].cigar, cigar, strlen(cigar)+1); +} + + +void setPairFullMappingInfo(int readNumber, FullMappingInfo mi1, FullMappingInfo mi2) +{ + + bestHitMappingInfo[readNumber*2].loc = mi1.loc; + bestHitMappingInfo[readNumber*2].dir = mi1.dir; + bestHitMappingInfo[readNumber*2].err = mi1.err; + bestHitMappingInfo[readNumber*2].score = mi1.score; + snprintf(bestHitMappingInfo[readNumber*2].chr, MAX_REF_SIZE, "%s", _msf_refGenName); + + + strncpy(bestHitMappingInfo[readNumber*2].md, mi1.md, strlen(mi1.md)+1); + strncpy(bestHitMappingInfo[readNumber*2].cigar, mi1.cigar, strlen(mi1.cigar)+1); + + + /* + sprintf(bestHitMappingInfo[readNumber*2].md, "%s\0", mi1.md); + sprintf(bestHitMappingInfo[readNumber*2].cigar, "%s\0", mi1.cigar); + */ + + + bestHitMappingInfo[readNumber*2+1].loc = mi2.loc; + bestHitMappingInfo[readNumber*2+1].dir = mi2.dir; + bestHitMappingInfo[readNumber*2+1].err = mi2.err; + bestHitMappingInfo[readNumber*2+1].score = mi2.score; + + snprintf(bestHitMappingInfo[readNumber*2+1].chr, MAX_REF_SIZE, "%s", _msf_refGenName); + + /* + sprintf(bestHitMappingInfo[readNumber*2+1].md, "%s\0", mi2.md); + sprintf(bestHitMappingInfo[readNumber*2+1].cigar, "%s\0", mi2.cigar); + */ + + strncpy(bestHitMappingInfo[readNumber*2+1].md, mi2.md, strlen(mi2.md)+1); + strncpy(bestHitMappingInfo[readNumber*2+1].cigar, mi2.cigar, strlen(mi2.cigar)+1); + +} + +/**********************************************/ +void outputPairedEnd() +{ + int i = 0; + + char cigar[MAX_CIGAR_SIZE]; + + int tmpOut; + + loadRefGenome(&_msf_refGen, &_msf_refGenName, &tmpOut); + + FILE* in1[_msf_openFiles]; + FILE* in2[_msf_openFiles]; + + char fname1[_msf_openFiles][FILE_NAME_LENGTH]; + char fname2[_msf_openFiles][FILE_NAME_LENGTH]; + + // discordant + FILE *out=NULL, *out1=NULL; + + char fname3[FILE_NAME_LENGTH]; + char fname4[FILE_NAME_LENGTH]; + + int meanDistanceMapping = 0; + + char *rqual1; + char *rqual2; + + rqual1 = getMem((SEQ_LENGTH+1)*sizeof(char)); + rqual2 = getMem((SEQ_LENGTH+1)*sizeof(char)); + + if (pairedEndDiscordantMode) + { + sprintf(fname3, "%s__%s__disc", mappingOutputPath, mappingOutput); + sprintf(fname4, "%s__%s__oea", mappingOutputPath, mappingOutput); + out = fileOpen(fname3, "a"); + out1 = fileOpen(fname4, "a"); + } + + FullMappingInfo *mi1 = getMem(sizeof(FullMappingInfo) * _msf_maxLSize); + FullMappingInfo *mi2 = getMem(sizeof(FullMappingInfo) * _msf_maxRSize); + + _msf_fileCount[_msf_maxFile] = 0; + for (i=0; i<_msf_openFiles; i++) + { + sprintf(fname1[i], "%s__%s__%s__%d__1.tmp", mappingOutputPath, _msf_refGenName, mappingOutput, i); + sprintf(_msf_fileName[_msf_maxFile][_msf_fileCount[_msf_maxFile]][0], "%s", fname1[i]); + + sprintf(fname2[i], "%s__%s__%s__%d__2.tmp", mappingOutputPath, _msf_refGenName, mappingOutput, i); + sprintf(_msf_fileName[_msf_maxFile][_msf_fileCount[_msf_maxFile]][1], "%s", fname2[i]); + + in1[i] = fileOpen(fname1[i], "r"); + in2[i] = fileOpen(fname2[i], "r"); + _msf_fileCount[_msf_maxFile]++; + } + _msf_maxFile++; + + int size; + int j, k; + int size1, size2; + + meanDistanceMapping = (pairedEndDiscordantMode==1)? (minPairEndedDiscordantDistance+maxPairEndedDiscordantDistance)/2 + SEQ_LENGTH + : (minPairEndedDistance + maxPairEndedDistance) / 2 + SEQ_LENGTH; + + for (i=0; i<_msf_seqListSize/2; i++) + { + size1 = size2 = 0; + for (j=0; j<_msf_openFiles; j++) + { + tmpOut = fread(&size, sizeof(int), 1, in1[j]); + if ( size > 0 ) + { + for (k=0; k<size; k++) + { + mi1[size1+k].dir = 1; + tmpOut = fread (&(mi1[size1+k].loc), sizeof(int), 1, in1[j]); + tmpOut = fread (&(mi1[size1+k].err), sizeof(int), 1, in1[j]); + + tmpOut = fread (&(mi1[size1+k].cigarSize), sizeof(int), 1, in1[j]); + tmpOut = fread ((mi1[size1+k].cigar), sizeof(char), mi1[size1+k].cigarSize, in1[j]); + mi1[size1+k].cigar[mi1[size1+k].cigarSize] = '\0'; + + tmpOut = fread (&(mi1[size1+k].mdSize), sizeof(int), 1, in1[j]); + tmpOut = fread ((mi1[size1+k].md), sizeof(char), (mi1[size1+k].mdSize), in1[j]); + mi1[size1+k].md[mi1[size1+k].mdSize] = '\0'; + + if (mi1[size1+k].loc<1) + { + mi1[size1+k].loc *= -1; + mi1[size1+k].dir = -1; + } + } + qsort(mi1+size1, size, sizeof(FullMappingInfo), compareOut); + size1+=size; + } + } + + for (j=0; j<_msf_openFiles; j++) + { + tmpOut = fread(&size, sizeof(int), 1, in2[j]); + if ( size > 0 ) + { + for (k=0; k<size; k++) + { + mi2[size2+k].dir = 1; + tmpOut = fread (&(mi2[size2+k].loc), sizeof(int), 1, in2[j]); + tmpOut = fread (&(mi2[size2+k].err), sizeof(int), 1, in2[j]); + + tmpOut = fread (&(mi2[size2+k].cigarSize), sizeof(int), 1, in2[j]); + tmpOut = fread ((mi2[size2+k].cigar), sizeof(char), mi2[size2+k].cigarSize, in2[j]); + mi2[size2+k].cigar[mi2[size2+k].cigarSize] = '\0'; + + tmpOut = fread (&(mi2[size2+k].mdSize), sizeof(int), 1, in2[j]); + tmpOut = fread ((mi2[size2+k].md), sizeof(char), mi2[size2+k].mdSize, in2[j]); + mi2[size2+k].md[mi2[size2+k].mdSize] = '\0'; + + if (mi2[size2+k].loc<1) + { + mi2[size2+k].loc *= -1; + mi2[size2+k].dir = -1; + } + } + qsort(mi2+size2, size, sizeof(FullMappingInfo), compareOut); + size2+=size; + } + } + + int lm, ll, rl, rm; + int pos = 0; + + if (pairedEndDiscordantMode) + { + + for (j=0; j<size1; j++) + { + lm = mi1[j].loc - maxPairEndedDiscordantDistance + 1; + ll = mi1[j].loc - minPairEndedDiscordantDistance + 1; + rl = mi1[j].loc + minPairEndedDiscordantDistance - 1; + rm = mi1[j].loc + maxPairEndedDiscordantDistance - 1; + + while (pos<size2 && mi2[pos].loc < lm) + { + pos++; + } + + k = pos; + while (k<size2 && mi2[k].loc<=rm) + { + if ( mi2[k].loc <= ll || mi2[k].loc >= rl) + { + if ( (mi1[j].loc < mi2[k].loc && mi1[j].dir==1 && mi2[k].dir == -1) || + (mi1[j].loc > mi2[k].loc && mi1[j].dir==-1 && mi2[k].dir == 1) ) + { + _msf_seqList[i*2].hits[0]=1; + _msf_seqList[i*2+1].hits[0]=1; + + if(nosamMode != 0) + { + size1=0; + size2=0; + } + + break; + } + } + k++; + } + } + + _msf_seqHits[i*2] += size1; + _msf_seqHits[i*2+1] += size2; + + + if (_msf_seqHits[i*2+1] * _msf_seqHits[i*2] > DISCORDANT_CUT_OFF && nosamMode != 0) + { + _msf_seqList[i*2].hits[0]=1; + _msf_seqList[i*2+1].hits[0]=1; + size1=0; + size2=0; + } + + + + + int tmp = 0; + int rNo = 0; + int loc = 0; + int err = 0; + float sc = 0; + char l = 0; + + //write the OEA data + if(_msf_seqHits[i*2] == 0 ) + { + for(k = 0;k < size2 && _msf_oeaMapping[i*2+1] < maxOEAOutput ;k++) + { + rNo = i*2+1; + loc = mi2[k].loc*mi2[k].dir; + err = mi2[k].err; + sc = mi2[k].score; + + l = strlen(_msf_refGenName); + + tmp = fwrite(&rNo, sizeof(int), 1, out1); + + tmp = fwrite(&l, sizeof(char), 1, out1); + tmp = fwrite(_msf_refGenName, sizeof(char), l, out1); + + tmp = fwrite(&loc, sizeof(int), 1, out1); + tmp = fwrite(&err, sizeof(int), 1, out1); + tmp = fwrite(&sc, sizeof(float), 1, out1); + + if(mi2[k].cigarSize > SEQ_LENGTH || mi2[k].cigarSize <= 0) + printf("ERROR CIGAR size=%d %s\n", mi2[k].cigarSize, _msf_seqList[i*2+1].seq); + + tmp = fwrite (&(mi2[k].cigarSize), sizeof(int), 1, out1); + tmp = fwrite ((mi2[k].cigar), sizeof(char), mi2[k].cigarSize, out1); + + tmp = fwrite (&(mi2[k].mdSize), sizeof(int), 1, out1); + tmp = fwrite ((mi2[k].md), sizeof(char), mi2[k].mdSize, out1); + + _msf_oeaMapping[i*2+1]++; + } + } + if(_msf_seqHits[i*2+1] == 0) + { + for(j = 0;j < size1 && _msf_oeaMapping[i*2] < maxOEAOutput;j++) + { + rNo = i*2; + loc = mi1[j].loc*mi1[j].dir; + err = mi1[j].err; + sc = mi1[j].score; + + l = strlen(_msf_refGenName); + + tmp = fwrite(&rNo, sizeof(int), 1, out1); + + tmp = fwrite(&l, sizeof(char), 1, out1); + tmp = fwrite(_msf_refGenName, sizeof(char), l, out1); + + tmp = fwrite(&loc, sizeof(int), 1, out1); + tmp = fwrite(&err, sizeof(int), 1, out1); + tmp = fwrite(&sc, sizeof(float), 1, out1); + + if(mi1[j].cigarSize > SEQ_LENGTH || mi1[j].cigarSize <= 0 ) + printf("ERROR %d %s\n", mi1[j].cigarSize, _msf_seqList[i*2+1].seq); + + tmp = fwrite (&(mi1[j].cigarSize), sizeof(int), 1, out1); + tmp = fwrite ((mi1[j].cigar), sizeof(char), mi1[j].cigarSize, out1); + + tmp = fwrite (&(mi1[j].mdSize), sizeof(int), 1, out1); + tmp = fwrite ((mi1[j].md), sizeof(char), mi1[j].mdSize, out1); + + _msf_oeaMapping[i*2]++; + } + } + } + + char *seq1, *seq2, *rseq1, *rseq2, *qual1, *qual2; + + + + + rqual1[SEQ_LENGTH] = '\0'; + rqual2[SEQ_LENGTH] = '\0'; + rqual1[0] = '\0'; + rqual2[0] = '\0'; + + + + seq1 = _msf_seqList[i*2].seq; + rseq1 = _msf_seqList[i*2].rseq; + qual1 = _msf_seqList[i*2].qual; + + + + strncpy(rqual1, _msf_seqList[i*2].qual, SEQ_LENGTH); + + seq2 = _msf_seqList[i*2+1].seq; + rseq2 = _msf_seqList[i*2+1].rseq; + qual2 = _msf_seqList[i*2+1].qual; + + + strncpy(rqual2, _msf_seqList[i*2+1].qual, SEQ_LENGTH); + + if (pairedEndDiscordantMode) + { + for (k=0; k<size1; k++) + { + mi1[k].score = calculateScore(mi1[k].loc, (mi1[k].dir==-1)?rseq1:seq1, (mi1[k].dir==-1)?rqual1:qual1, mi1[k].cigar); + } + + for (k=0; k<size2; k++) + { + mi2[k].score = calculateScore(mi2[k].loc, (mi2[k].dir==-1)?rseq2:seq2, (mi2[k].dir==-1)?rqual2:qual2, mi2[k].cigar); + } + + } + + + if (pairedEndDiscordantMode) + { + for (j=0; j<size1; j++) + { + for(k = 0; k < size2; k++) + { + if( + (mi2[k].loc-mi1[j].loc >= minPairEndedDiscordantDistance && + mi2[k].loc-mi1[j].loc <= maxPairEndedDiscordantDistance && + mi1[j].dir > 0 && mi2[k].dir < 0 ) + + || + + (mi1[j].loc-mi2[k].loc >= minPairEndedDiscordantDistance && + mi1[j].loc-mi2[k].loc <= maxPairEndedDiscordantDistance && + mi1[j].dir < 0 && mi2[k].dir > 0) + ) + { + //POSSIBLE CONCORDANT + if(_msf_readHasConcordantMapping[i] == 0) + { + setPairFullMappingInfo(i, mi1[j], mi2[k]); + _msf_readHasConcordantMapping[i] = 1; + _msf_seqList[i*2].hits[0] = 1; + _msf_seqList[i*2+1].hits[0] = 1; + } + else + { + if(bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err >= mi1[j].err + mi2[k].err) + { + + if( bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err == + mi1[j].err + mi2[k].err && + findNearest(abs(bestHitMappingInfo[i*2+1].loc - bestHitMappingInfo[i*2].loc), + abs(mi2[k].loc - mi1[j].loc), + meanDistanceMapping + ) == 0 ) + { + continue; + } + setPairFullMappingInfo(i, mi1[j], mi2[k]); + } + } + } + //DISCORDANT TO TEMP FILE FOR POST PROCESSIING + else if(_msf_readHasConcordantMapping[i] == 0 && + _msf_seqHits[i*2] != 0 && + _msf_seqHits[i*2+1] != 0) + { + + int tmp; + int rNo = i; + int loc = mi1[j].loc*mi1[j].dir; + int err = mi1[j].err; + float sc = mi1[j].score; + + char l = strlen(_msf_refGenName); + + if(_msf_discordantMapping[i*2] < maxDiscordantOutput) + { + + tmp = fwrite(&rNo, sizeof(int), 1, out); + + tmp = fwrite(&l, sizeof(char), 1, out); + tmp = fwrite(_msf_refGenName, sizeof(char), l, out); + + tmp = fwrite(&loc, sizeof(int), 1, out); + tmp = fwrite(&err, sizeof(int), 1, out); + tmp = fwrite(&sc, sizeof(float), 1, out); + + tmp = fwrite (&(mi1[j].cigarSize), sizeof(int), 1, out); + tmp = fwrite ((mi1[j].cigar), sizeof(char), mi1[j].cigarSize, out); + + tmp = fwrite (&(mi1[j].mdSize), sizeof(int), 1, out); + tmp = fwrite ((mi1[j].md), sizeof(char), mi1[j].mdSize, out); + + + loc = mi2[k].loc*mi2[k].dir; + err = mi2[k].err; + sc = mi2[k].score; + + tmp = fwrite(&loc, sizeof(int), 1, out); + tmp = fwrite(&err, sizeof(int), 1, out); + tmp = fwrite(&sc, sizeof(float), 1, out); + + tmp = fwrite (&(mi2[k].cigarSize), sizeof(int), 1, out); + tmp = fwrite ((mi2[k].cigar), sizeof(char), mi2[k].cigarSize, out); + + tmp = fwrite (&(mi2[k].mdSize), sizeof(int), 1, out); + tmp = fwrite ((mi2[k].md), sizeof(char), mi2[k].mdSize, out); + + + _msf_discordantMapping[i*2]++; + } + //SET THE BEST DISCORDANT + //BEGIN {Farhad Hormozdiari} + if( bestHitMappingInfo[i*2].loc == -1 && + bestHitMappingInfo[i*2+1].loc == -1 && + _msf_readHasConcordantMapping[i] == 0) + { + setPairFullMappingInfo(i, mi1[j], mi2[k]); + _msf_seqList[i*2].hits[0] = 1; + _msf_seqList[i*2+1].hits[0] = 1; + } + else if( bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err >= mi1[j].err + mi2[k].err + && _msf_readHasConcordantMapping[i] == 0) + { + if(bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err == mi1[j].err + mi2[k].err && + findNearest( abs(bestHitMappingInfo[i*2+1].loc - bestHitMappingInfo[i*2].loc), + abs(mi1[j].loc - mi2[k].loc), + meanDistanceMapping + ) == 0 + ) + { + continue; + } + setPairFullMappingInfo(i, mi1[j], mi2[k]); + } + //END {Farhad Hormozdiari} + } + } + } + } + else + { + for (j=0; j<size1; j++) + { + for(k = 0; k < size2; k++) + { + if((mi2[k].loc-mi1[j].loc >= minPairEndedDistance && + mi2[k].loc-mi1[j].loc <= maxPairEndedDistance && + mi1[j].dir > 0 && mi2[k].dir < 0) + || + (mi1[j].loc-mi2[k].loc >= minPairEndedDistance && + mi1[j].loc-mi2[k].loc <= maxPairEndedDistance && + mi1[j].dir < 0 && mi2[k].dir > 0) + ) + { + char *seq; + char *qual; + char d1; + char d2; + int isize; + int proper=0; + // ISIZE CALCULATION + // The distance between outer edges + isize = abs(mi1[j].loc - mi2[k].loc)+SEQ_LENGTH-2; + if (mi1[j].loc - mi2[k].loc > 0) + { + isize *= -1; + } + + d1 = (mi1[j].dir == -1)?1:0; + d2 = (mi2[k].dir == -1)?1:0; + + //SET THE READ HAS CONCORDANT MAPPING + _msf_readHasConcordantMapping[i] = 1; + + if ( d1 ) + { + seq = rseq1; + qual = rqual1; + } + else + { + seq = seq1; + qual = qual1; + } + + if ((mi1[j].loc < mi2[k].loc && !d1 && d2) || + (mi1[j].loc > mi2[k].loc && d1 && !d2) ) + { + proper = 2; + } + else + { + proper = 0; + } + + + _msf_output.POS = mi1[j].loc; + _msf_output.MPOS = mi2[k].loc; + _msf_output.FLAG = 1+proper+16*d1+32*d2+64; + _msf_output.ISIZE = isize; + _msf_output.SEQ = seq, + _msf_output.QUAL = qual; + _msf_output.QNAME = _msf_seqList[i*2].name; + _msf_output.RNAME = _msf_refGenName; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = cigar; + _msf_output.MRNAME = "="; + + _msf_output.optSize = 2; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = mi1[j].err; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = mi1[j].md; + + if(!bestMode) + output(_msf_output); + + if ( d2 ) + { + seq = rseq2; + qual = rqual2; + } + else + { + seq = seq2; + qual = qual2; + } + + _msf_output.POS = mi2[k].loc; + _msf_output.MPOS = mi1[j].loc; + _msf_output.FLAG = 1+proper+16*d2+32*d1+128; + _msf_output.ISIZE = -isize; + _msf_output.SEQ = seq, + _msf_output.QUAL = qual; + _msf_output.QNAME = _msf_seqList[i*2].name; + _msf_output.RNAME = _msf_refGenName; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = cigar; + _msf_output.MRNAME = "="; + + _msf_output.optSize = 2; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = mi2[k].err;; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = mi2[k].md; + + if(!bestMode) + output(_msf_output); + //SET THE BEST CONCORDANT + //BEGIN {Farhad Hormozdiari} + if(bestHitMappingInfo[i*2].loc == -1 && bestHitMappingInfo[i*2+1].loc == -1) + { + setPairFullMappingInfo(i, mi1[j], mi2[k]); + } + else + { + if(bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err >= mi1[j].err + mi2[k].err) + { + + if( bestHitMappingInfo[i*2].err + bestHitMappingInfo[i*2+1].err == mi1[j].err + mi2[k].err && + findNearest(abs(bestHitMappingInfo[i*2+1].loc - bestHitMappingInfo[i*2].loc), + abs(mi2[k].loc - mi1[j].loc), + meanDistanceMapping + ) == 0 ) + { + continue; + } + setPairFullMappingInfo(i, mi1[j], mi2[k]); + } + } + //END {Farhad Hormozdiari} + } + } + } + + } + } + + freeMem(rqual1, 0); + freeMem(rqual2, 0); + + if (pairedEndDiscordantMode) + { + fclose(out); + fclose(out1); + } + + for (i=0; i<_msf_openFiles; i++) + { + fclose(in1[i]); + fclose(in2[i]); + + unlink(fname1[i]); + unlink(fname2[i]); + } + + freeMem(mi1, sizeof(FullMappingInfo)*_msf_maxLSize); + freeMem(mi2, sizeof(FullMappingInfo)*_msf_maxRSize); + + _msf_openFiles = 0; +} + +/**********************************************/ +/**********************************************/ +/**********************************************/ +/**********************************************/ +float str2int(char *str, int index1, int index2) +{ + char tmp[200]; + strncpy(tmp, &str[index1], index2-index1); + tmp[index2-index1] = '\0'; + return atol(tmp); +} + +float calculateScore(int index, char *seq, char *qual,char *md) +{ + int i; + int j; + char *ref; + char *ver; + + ref = _msf_refGen + index-1; + ver = seq; + float score = 1; + + char tmp[200]; + int value = 0; + int end = 0; + int index1 = 0; + int index2 = 0; + + i=0; + while(1) + { + + if(i>=strlen(md)) + break; + + index1 = i; + + while(md[i] >='0' && md[i]<='9') + { + i++; + } + + index2 = i; + + value = str2int(md, index1,index2); + + if(md[i]=='M') + { + for(j=0;j<value;j++) + { + tmp[end]='M'; + end++; + } + } + else if(md[i]=='I') + { + for(j=0;j<value;j++) + { + tmp[end]='I'; + end++; + } + + } + else if(md[i] == 'D') + { + for(j=0;j<value;j++) + { + tmp[end]='D'; + end++; + } + } + i++; + } + + tmp[end] = '\0'; + + j = 0; + + for (i = 0; i < end; i++) + { + if(tmp[i] == 'M') + { + if (*ref != *ver) + { + score *= 0.001 + 1/pow( 10, ((qual[j]-33)/10.0) ); + } + + ref++; + ver++; + j++; + } + else if(tmp[i] == 'I') + { + ver++; + j++; + } + else if(tmp[i] == 'D') + { + ref++; + } + } + + return score; +} + +int matoi(char *str, int start, int end) +{ + int i = 0; + char tmp[200]; + + for(i=0;i < end-start; i++) + tmp[i] = str[start+i]; + tmp[i]='\0'; + + return atoi(tmp); +} + +void convertCigarToMatrix(char *cigar, int cigar_size, char * matrix) +{ + int i = 0; + int j = 0; + + int start = 0; + int size = 0; + + matrix[0] = '\0'; + + while(i < cigar_size) + { + if(cigar[i] >= '0' && cigar[i] <= '9') + { + start = i; + + while(cigar[i] >= '0' && cigar[i] <= '9' && i < cigar_size) + i++; + + int value = matoi(cigar, start, i); + for(j = 0; j < value; j++) + { + if(cigar[i] == 'M') + matrix[size] = 'M'; + else if(cigar[i] == 'D') + matrix[size] ='D'; + else if(cigar[i] == 'I') + matrix[size] = 'I'; + size++; + } + } + i++; + } + matrix[size] = '\0'; +} + + + +void convertMDToMatrix(char *md, int md_size, char * matrix) +{ + int i = 0; + int j = 0; + + int start = 0; + int size = 0; + + matrix[0] = '\0'; + + while(i < md_size) + { + if(md[i] >= '0' && md[i] <= '9') + { + start = i; + + while(md[i] >= '0' && md[i] <= '9' && i < md_size) + i++; + + int value = matoi(md, start, i); + for(j = 0; j < value; j++) + { + matrix[size] = 'M'; + size++; + } + i--; + } + else if(md[i] == '^') + { + matrix[size] = 'D'; + size++; + } + else + { + matrix[size] = md[i]; + size++; + } + //size++; + i++; + } + matrix[size] = '\0'; +} + + +void convertMDCigarToMatrix(char *cigar, int cigar_size, char *md, int md_size, char *matrix) +{ + int i = 0; + int j = 0; + + int size = 0; + + char tmp1[200]; + char tmp2[200]; + convertMDToMatrix(md,md_size, tmp2); + + convertCigarToMatrix(cigar, cigar_size,tmp1); + + + + while(i < strlen(tmp1)) + { + if(tmp1[i]=='M') + { + if(j < strlen(tmp2)) + { + if(tmp2[j]=='M') + { + matrix[size]='M'; + size++; + } + if(tmp2[j]!='M') + { + matrix[size]=tmp2[j]; + size++; + } + } + else + { + matrix[size]='M'; + size++; + } + } + else if(tmp1[i] == 'D') + { + matrix[size]='D'; + size++; + j++; + matrix[size]=tmp2[j]; + size++; + + } + else if(tmp1[i] == 'I') + { + matrix[size]='I'; + size++; + } + + i++; + if(j < strlen(tmp2)) + j++; + } + + if(strlen(tmp1)) + + matrix[size] = '\0'; + +} + +void convertInsertion(char * in_matrix, char * seq, char *out_matrix) +{ + int i = 0; + int j = 0; + int size = 0; + + while( i < strlen(in_matrix)) + { + if(in_matrix[i] == 'M') + { + out_matrix[size] = 'M'; + size++; + j++; + } + else if(in_matrix[i] == 'D') + { + out_matrix[size] = 'D'; + size++; + + i++; + j++; + + out_matrix[size] = seq[j]; + j++; + size++; + } + else if(in_matrix[i] == 'I') + { + out_matrix[size] = 'I'; + size++; + out_matrix[size] = seq[j]; + size++; + j++; + } + else + { + out_matrix[size] = in_matrix[i]; + size++; + j++; + } + i++; + } + out_matrix[size] = '\0'; +} + +/**********************************************/ +void outputPairedEndDiscPP() +{ + char tmp_matrix1[200]; + char tmp_matrix2[200]; + + char matrix1[200]; + char matrix2[200]; + + char cigar1[200]; + char editString1[200]; + + char cigar2[200]; + char editString2[200]; + + char seq1[SEQ_LENGTH+1]; + char qual1[SEQ_LENGTH+1]; + + char seq2[SEQ_LENGTH+1]; + char qual2[SEQ_LENGTH+1]; + + char genName[SEQ_LENGTH]; + char fname1[FILE_NAME_LENGTH]; + char fname2[FILE_NAME_LENGTH]; + char l; + int l_size; + int loc1, loc2; + int err1, err2; + char dir1, dir2; + float sc1, sc2, lsc=0; + int flag = 0; + int rNo,lrNo = -1; + int tmp; + FILE *in, *out; + + sprintf(fname1, "%s__%s__disc", mappingOutputPath, mappingOutput); + sprintf(fname2, "%s%s_DIVET.vh", mappingOutputPath, mappingOutput); + + in = fileOpen(fname1, "r"); + out = fileOpen(fname2, "w"); + + if (in != NULL) + { + flag = fread(&rNo, sizeof(int), 1, in); + } + else + { + flag = 0; + } + + seq1[SEQ_LENGTH] = '\0'; + qual1[SEQ_LENGTH] = '\0'; + + seq2[SEQ_LENGTH] = '\0'; + qual2[SEQ_LENGTH] = '\0'; + + while (flag) + { + tmp = fread(&l, sizeof(char), 1, in); + tmp = fread(genName, sizeof(char), l, in); + genName[(int)l]='\0'; + tmp = fread(&loc1, sizeof(int), 1, in); + tmp = fread(&err1, sizeof(int), 1, in); + tmp = fread(&sc1, sizeof(float), 1, in); + + //tmp = fwrite (&(mi2[k].cigarSize), sizeof(int), 1, out); + + tmp = fread(&l_size, sizeof(int), 1, in); + tmp = fread(cigar1, sizeof(char), l_size, in); + cigar1[(int)l_size]='\0'; + //tmp = fwrite ((mi2[k].cigar), sizeof(char), mi2[k].cigarSize, out); + + //tmp = fwrite (&(mi2[k].mdSize), sizeof(int), 1, out); + tmp = fread(&l_size, sizeof(int), 1, in); + tmp = fread(editString1, sizeof(char), l_size, in); + editString1[(int)l_size]='\0'; + //tmp = fwrite ((mi2[k].md), sizeof(char), mi2[k].mdSize, out); + + tmp = fread(&loc2, sizeof(int), 1, in); + tmp = fread(&err2, sizeof(int), 1, in); + tmp = fread(&sc2, sizeof(float), 1, in); + + tmp = fread(&l_size, sizeof(int), 1, in); + tmp = fread(cigar2, sizeof(char), l_size, in); + cigar2[(int)l_size]='\0'; + + tmp = fread(&l_size, sizeof(int), 1, in); + tmp = fread(editString2, sizeof(char), l_size, in); + editString2[(int)l_size]='\0'; + + convertMDCigarToMatrix(cigar1, strlen(cigar1), editString1, strlen(editString1), tmp_matrix1); + convertMDCigarToMatrix(cigar2, strlen(cigar2), editString2, strlen(editString2), tmp_matrix2); + + + if(_msf_readHasConcordantMapping[rNo] == 0) + { + + dir1 = dir2 = 'F'; + + strncpy(seq1, _msf_seqList[rNo*2].seq, SEQ_LENGTH); + strncpy(seq2, _msf_seqList[rNo*2+1].seq, SEQ_LENGTH); + + if (loc1 < 0) + { + dir1 = 'R'; + loc1 = -loc1; + + strncpy(seq1, _msf_seqList[rNo*2].rseq, SEQ_LENGTH); + } + + if (loc2 < 0) + { + dir2 = 'R'; + loc2 = -loc2; + + strncpy(seq2, _msf_seqList[rNo*2+1].rseq, SEQ_LENGTH); + } + + convertInsertion(tmp_matrix1, seq1, matrix1); + convertInsertion(tmp_matrix2, seq2, matrix2); + + + if (rNo != lrNo) + { + int j; + for (j=0; j<SEQ_LENGTH; j++) + { + lsc += _msf_seqList[rNo*2].qual[j]+_msf_seqList[rNo*2+1].qual[j]; + } + lsc /= 2*SEQ_LENGTH; + lsc -= 33; + lrNo = rNo; + } + + char event = '\0'; + + + if ( dir1 == dir2 ) + { + event = 'V'; + } + else + { + if (loc1 < loc2) + { + + if (dir1 == 'R' && dir2 == 'F') + { + event = 'E'; + + } + else if ( loc2 - loc1 >= maxPairEndedDiscordantDistance ) + { + event = 'D'; + } + else + { + event = 'I'; + } + } + else if (loc2 < loc1) + { + if (dir2 == 'R' && dir1 == 'F') + { + event = 'E'; + } + else if ( loc1 - loc2 >= maxPairEndedDiscordantDistance ) + { + event = 'D'; + } + else + { + event = 'I'; + } + } + } + _msf_seqList[rNo*2].hits[0] = 2; + if(event != 'E') + fprintf(out, "%s\t%s\t%d\t%d\t%c\t%d\t%d\t%c\t%c\t%d\t%0.0f\t%e\n", + _msf_seqList[rNo*2].name, genName, loc1, (loc1+SEQ_LENGTH-1), dir1, + loc2, (loc2+SEQ_LENGTH-1), dir2, event, (err1+err2), lsc, sc1*sc2); + + } + flag = fread(&rNo, sizeof(int), 1, in); + } + + fclose(in); + fclose(out); + + unlink(fname1); +} + +void finalizeOEAReads(char *fileName) +{ + FILE *fp_out1; + FILE * in; + + char genName[SEQ_LENGTH]; + + char fname1[FILE_NAME_LENGTH]; + char fname2[FILE_NAME_LENGTH]; + + char l=0; + int loc1=0; + + int err1; + + char d; + + float sc1=0; + int flag = 0; + int rNo=-1; + int tmp=0; + + int cigarSize = 0; + int mdSize = 0; + + char cigar[SEQ_LENGTH+1]; + char md[SEQ_LENGTH+1]; + + char *seq1, *seq2, *qual1, *qual2; + char *rqual1, *rqual2; + + seq1=NULL; seq2=NULL; qual1=NULL; qual2=NULL; + + rqual1 = getMem(200*sizeof(char)); + rqual2 = getMem(200*sizeof(char)); + + rqual1[0] = '\0'; + rqual2[0] = '\0'; + + /* + char mappingOutput2[2 * SEQ_LENGTH]; + int mo_len; + mo_len = strlen(mappingOutput); + strcpy(mappingOutput2, mappingOutput); + + if (mappingOutput[mo_len-1]=='m' && mappingOutput[mo_len-2]=='a' && mappingOutput[mo_len-3]=='s' && mappingOutput[mo_len-4]=='.') + mappingOutput2[mo_len-4] = 0; + */ + + sprintf(fname1, "%s%s_OEA.sam", mappingOutputPath, mappingOutput); + + fp_out1 = fileOpen(fname1, "w"); + + in = NULL; + if (pairedEndDiscordantMode){ + sprintf(fname2, "%s__%s__oea", mappingOutputPath, mappingOutput); + + in = fileOpen(fname2, "r"); + } + + + if (in != NULL) + { + flag = fread(&rNo, sizeof(int), 1, in); + } + else + { + flag = 0; + } + + while (flag) + { + cigar[0] = '\0'; + md[0] = '\0'; + + tmp = fread(&l, sizeof(char), 1, in); + tmp = fread(genName, sizeof(char), l, in); + + genName[(int)l]='\0'; + + + tmp = fread(&loc1, sizeof(int), 1, in); + tmp = fread(&err1, sizeof(int), 1, in); + tmp = fread(&sc1, sizeof(float), 1, in); + + tmp = fread (&cigarSize, sizeof(int), 1, in); + tmp = fread (cigar, sizeof(char), cigarSize, in); + + cigar[cigarSize] = '\0'; + + tmp = fread (&mdSize, sizeof(int), 1, in); + tmp = fread (md, sizeof(char), mdSize, in); + md[mdSize] = '\0'; + + d = 1; + + if(loc1 < 0) + { + d = -1; + loc1 *= -1; + + seq1 = _msf_seqList[rNo].rseq; + reverse(_msf_seqList[rNo].qual, rqual1, SEQ_LENGTH); + rqual1[SEQ_LENGTH] = '\0'; + } + else + { + seq1 = _msf_seqList[rNo].seq; + qual1 = _msf_seqList[rNo].qual; + } + + if(rNo % 2 == 0) + { + seq2 = _msf_seqList[rNo+1].seq; + qual2 = _msf_seqList[rNo+1].qual; + } + else + { + seq2 = _msf_seqList[rNo-1].seq; + qual2 = _msf_seqList[rNo-1].qual; + } + + if(_msf_seqHits[rNo] != 0 && _msf_seqHits[(rNo%2==0)?rNo+1:rNo-1] == 0) + { + _msf_output.POS = loc1; + _msf_output.MPOS = 0; + _msf_output.FLAG = (rNo % 2 ==0)? 1+4+32*d+128 : 1+8+16*d+64 ; + _msf_output.ISIZE = 0; + _msf_output.SEQ = seq1; + _msf_output.QUAL = qual1; + _msf_output.QNAME = _msf_seqList[rNo].name; + _msf_output.RNAME = genName; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = cigar; + _msf_output.MRNAME = "="; + + + _msf_output.optSize = 4; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = err1; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = md; + + + + //for the OEA reads + _msf_optionalFields[2].tag = "NS"; + _msf_optionalFields[2].type = 'Z'; + _msf_optionalFields[2].sVal = seq2; + + + _msf_optionalFields[3].tag = "NQ"; + _msf_optionalFields[3].type = 'Z'; + _msf_optionalFields[3].sVal = qual2; + + outputSAM(fp_out1, _msf_output); + + _msf_seqList[rNo].hits[0] = -1; + _msf_seqList[(rNo%2==0)?rNo+1:rNo-1].hits[0] = -1; + } + flag = fread(&rNo, sizeof(int), 1, in); + } + + freeMem(rqual1, 0); + freeMem(rqual2, 0); + + unlink(fname2); + + fclose(fp_out1); +} + + +void outputTransChromosal(char *fileName1, char *fileName2, FILE * fp_out) +{ + int i = 0; + int j = 0; + int k = 0; + + char *index; + + int size1 = 0; + int size2 = 0; + + FILE *fp1 = NULL; + FILE *fp2 = NULL; + + char geneFileName1[FILE_NAME_LENGTH]; + char geneFileName2[FILE_NAME_LENGTH]; + + FullMappingInfoLink *miL = getMem(_msf_seqListSize * sizeof(FullMappingInfoLink)); + FullMappingInfoLink *miR = getMem(_msf_seqListSize * sizeof(FullMappingInfoLink)); + + + if(fileName1 != NULL && fileName2 != NULL) + { + + fp1 = fileOpen(fileName1, "r"); + fp2 = fileOpen(fileName2, "r"); + + index = strstr(fileName1, "__"); + strncpy(geneFileName1, index + 2 * sizeof(char), strstr(index + 2, "__") - index - 2); + geneFileName1[strstr(index + 2, "__") - index - 2] = '\0'; + + index = strstr(fileName2, "__"); + strncpy(geneFileName2, index + 2 * sizeof(char), strstr(index + 2, "__") - index - 2); + geneFileName2[strstr(index + 2, "__") - index - 2] = '\0'; + + + for(i = 0; i < _msf_seqListSize / 2; i++) + { + fread(&size1, sizeof(int), 1, fp1); + fread(&size2, sizeof(int), 1, fp2); + + miL[i].mi = getMem(size1 * sizeof(FullMappingInfo) ); + miR[i].mi = getMem(size2 * sizeof(FullMappingInfo) ); + + miL[i].size = size1; + miR[i].size = size2; + + for(j = 0; j < size1; j++) + { + fread(&(miL[i].mi[j].loc), sizeof(int), 1, fp1); + + fread (&(miL[i].mi[j].err), sizeof(int), 1, fp1); + + fread (&(miL[i].mi[j].cigarSize), sizeof(int), 1, fp1); + fread ((miL[i].mi[j].cigar), sizeof(char), miL[i].mi[j].cigarSize+1, fp1); + + fread (&(miL[i].mi[j].mdSize), sizeof(int), 1, fp1); + fread ((miL[i].mi[j].md), sizeof(char), miL[i].mi[j].mdSize+1, fp1); + + miL[i].mi[j].dir = 1; + if(miL[i].mi[j].loc < 1) + { + miL[i].mi[j].loc *= -1; + miL[i].mi[j].dir = -1; + } + } + for(k = 0; k < size2; k++) + { + fread(&(miR[i].mi[k].loc), sizeof(int), 1, fp2); + + fread (&(miR[i].mi[k].err), sizeof(int), 1, fp2); + + fread (&(miR[i].mi[k].cigarSize), sizeof(int), 1, fp2); + fread ((miR[i].mi[k].cigar), sizeof(char), miR[i].mi[k].cigarSize+1, fp2); + + fread (&(miR[i].mi[k].mdSize), sizeof(int), 1, fp2); + fread ((miR[i].mi[k].md), sizeof(char), miR[i].mi[k].mdSize+1, fp2); + + miR[i].mi[k].dir = 1; + if(miR[i].mi[k].loc < 1) + { + miR[i].mi[k].loc *= -1; + miR[i].mi[k].dir = -1; + } + } + if(_msf_readHasConcordantMapping[i] == 0 && size1 != 0 && size2 != 0 && (size1 * size2 < MAX_TRANS_CHROMOSAL_OUTPUT)) + { + int d1 = 0; + int d2 = 0; + char *seq, *qual; + char *seq1, *seq2, *rseq1, *rseq2, *qual1, *qual2; + char rqual1[SEQ_LENGTH+1], rqual2[SEQ_LENGTH+1]; + rqual1[SEQ_LENGTH] = rqual2[SEQ_LENGTH] = '\0'; + seq1 = _msf_seqList[i*2].seq; + rseq1 = _msf_seqList[i*2].rseq; + qual1 = _msf_seqList[i*2].qual; + reverse(_msf_seqList[i*2].qual, rqual1, SEQ_LENGTH); + + seq2 = _msf_seqList[i*2+1].seq; + rseq2 = _msf_seqList[i*2+1].rseq; + qual2 = _msf_seqList[i*2+1].qual; + reverse(_msf_seqList[i*2+1].qual, rqual2, SEQ_LENGTH); + + for(j = 0; j < size1; j++) + { + d1 = (miL[i].mi[j].dir == -1)?1:0; + + if ( d1 ) + { + seq = rseq1; + qual = rqual1; + } + else + { + seq = seq1; + qual = qual1; + } + + for(k = 0; k < size2; k++) + { + + d2 = (miR[i].mi[k].dir == -1)?1:0; + + _msf_output.POS = miL[i].mi[j].loc; + _msf_output.MPOS = miR[i].mi[k].loc; + _msf_output.FLAG = 0; + _msf_output.ISIZE = 0; + _msf_output.SEQ = seq, + _msf_output.QUAL = qual; + _msf_output.QNAME = _msf_seqList[i*2].name; + _msf_output.RNAME = geneFileName1; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = miL[i].mi[j].cigar; + _msf_output.MRNAME = "="; + + _msf_output.optSize = 2; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = miL[i].mi[j].err; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = miL[i].mi[j].md; + + + if ( d2 ) + { + seq = rseq2; + qual = rqual2; + } + else + { + seq = seq2; + qual = qual2; + } + + outputSAM(fp_out, _msf_output); + + + _msf_output.POS = miR[i].mi[k].loc; + _msf_output.MPOS = miL[i].mi[j].loc; + _msf_output.FLAG = 0; + _msf_output.ISIZE = 0; + _msf_output.SEQ = seq, + _msf_output.QUAL = qual; + _msf_output.QNAME = _msf_seqList[i*2+1].name; + _msf_output.RNAME = geneFileName2; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = miR[i].mi[k].cigar; + _msf_output.MRNAME = "="; + + _msf_output.optSize = 2; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = miR[i].mi[k].err; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = miR[i].mi[k].md; + + outputSAM(fp_out, _msf_output); + + } + } + } + } + + } + + for(i = 0; i < _msf_seqListSize / 2; i++) + { + freeMem(miL[i].mi, miL[i].size * sizeof(FullMappingInfo)); + freeMem(miR[i].mi, miR[i].size * sizeof(FullMappingInfo)); + } + + freeMem(miL, _msf_seqListSize * sizeof(FullMappingInfoLink)); + freeMem(miR, _msf_seqListSize * sizeof(FullMappingInfoLink)); + + fclose(fp1); + fclose(fp2); +} + +/* + if flag is 1 it will output all the possible trans chromsal mapping + otherwise only tmp file will be delete + +*/ + +void outputAllTransChromosal(int flag) +{ + + int i = 0; + int j = 0; + int k = 0; + int l = 0; + + FILE *fp_out = NULL; + char fname1[200]; + + if(flag) + { + fp_out = fileOpen(fname1, "w"); + + sprintf(fname1, "%s%s_TRANSCHROMOSOMAL", mappingOutputPath, mappingOutput); + + // for(i = 0; i < _msf_maxFile; i++) + // { + i = 0; + for(j = i+1; j < _msf_maxFile; j++) + { + if(i != j) + { + for(k = 0; k < _msf_fileCount[i]; k++) + { + for(l = 0; l < _msf_fileCount[j]; l++) + { + outputTransChromosal(_msf_fileName[i][k][0], _msf_fileName[j][l][1], fp_out); + }// for l + }// for k + }// if + }// for j + // } //for i + } + + for(i = 0; i < _msf_maxFile; i++) + { + for(j = 0; j < _msf_fileCount[i]; j++) + { + unlink(_msf_fileName[i][j][0]); + unlink(_msf_fileName[i][j][1]); + } + } + if(flag) + fclose(fp_out); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.5/MrFAST.h Fri Mar 09 07:35:51 2012 -0500 @@ -0,0 +1,209 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu + */ + + + +#ifndef __MR_FAST__ +#define __MR_FAST__ + +#include "Reads.h" + +#define MAP_CHUNKS 15 +#define MAX_CIGAR_SIZE 100 + + +// Pair is used to pre-processing and making the read index table +typedef struct +{ + int hv; + //char hv[50]; + int readNumber; +} Pair; + +typedef struct +{ + int hv; + unsigned int *seqInfo; +} ReadIndexTable; + + +typedef struct +{ + int loc; + char dir; + int err; + float score; + char md[MAX_CIGAR_SIZE]; + char chr[20]; + char cigar[MAX_CIGAR_SIZE]; + int cigarSize; + int mdSize; +} FullMappingInfo; + +typedef struct +{ + int loc; + char dir; + int err; + float score; + char md[MAX_CIGAR_SIZE]; + char chr[20]; + char cigar[MAX_CIGAR_SIZE]; + int cigarSize; + int mdSize; +} BestFullMappingInfo; + +typedef struct lc +{ + char md[MAP_CHUNKS][MAX_CIGAR_SIZE]; + int mdSize[MAP_CHUNKS]; + + char cigar[MAP_CHUNKS][MAX_CIGAR_SIZE]; + int cigarSize[MAP_CHUNKS]; + + int err[MAP_CHUNKS]; + int loc[MAP_CHUNKS]; + struct lc *next; +} MappingLocations; + +typedef struct inf +{ + int size; + MappingLocations *next; +} MappingInfo; + +typedef struct +{ + FILE * fp; + char name[400]; +} FILE_STRUCT; + +typedef struct +{ + FullMappingInfo *mi; + int size; +} FullMappingInfoLink; + + +typedef struct +{ + char readString[200]; + char ref[200]; + int err; + char matrix[200]; +} extraCaching; + +extern long long verificationCnt; +extern long long mappingCnt; +extern long long mappedSeqCnt; +extern long long completedSeqCnt; + +void initFAST( Read *seqList, + int seqListSize, + int *samplingLocs, + int samplingLocsSize, + char *fileName); + +void initVerifiedLocs(); +void initLookUpTable(); +void initBestMapping(); +void initBestConcordantDiscordant(int readNumber); + +void finalizeFAST(); +void finalizeBestSingleMapping(); +void finalizeBestConcordantDiscordant(); +void finalizeOEAReads(char *); + + +int mapAllSingleEndSeq(); +void mapSingleEndSeq(unsigned int *l1, int s1, int readNumber, int readSegment, int direction); +void mapPairedEndSeqList(unsigned int *l1, int s1, int readNumber, int readSegment, int direction); + +void mapPairedEndSeq(); + +void outputPairedEnd(); +void outputPairedEndDiscPP(); + + +void outputPairFullMappingInfo(FILE *fp, int readNumber); +void setPairFullMappingInfo(int readNumber, FullMappingInfo mi1, FullMappingInfo mi2); +void setFullMappingInfo(int readNumber, int loc, int dir, int err, int score, char *md, char * refName, char *cigar); + +void outputAllTransChromosal(); +void outputTransChromosal(char *fileName1, char *fileName2, FILE * fp_out); + +void generateSNPSAM(char *matrix, int matrixLength, char *outputSNP); +void generateCigar(char *matrix, int matrixLength, char *cigar); +void generateCigarFromMD(char *mistmatch, int mismatchLength, char *cigar); + +int msfHashVal(char *seq); + +int backwardEditDistance2SSE2(char *a, int lena, char *b,int lenb); +int forwardEditDistance2SSE2(char *a, int lena, char *b,int lenb); + +int forwardEditDistanceSSE2G(char *a, int lena, char *b,int lenb); +int backwardEditDistanceSSE2G(char *a, int lena, char *b,int lenb); + +int forwardEditDistance4SSE2(char *a, int lena, char *b,int lenb); +int backwardEditDistance4SSE2(char *a, int lena, char *b,int lenb); + +int forwardEditDistanceSSE2Extention(char *a, int lena, char *b,int lenb); +int backwardEditDistanceSSE2Extention(char *a, int lena, char *b,int lenb); + + +/***********************************/ + +int editDistance(int refIndex, char *seq, int seqLength, char *matrix); + +int verifySingleEndEditDistance(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, + char *matrix, int *map_location, short *seqHashValue); + +int verifySingleEndEditDistance2(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, + char *matrix, int *map_location, short *seqHashValue); + +int verifySingleEndEditDistance4(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, + char *matrix, int *map_location, short *seqHashValue); + +int verifySingleEndEditDistanceExtention(int refIndex, char *lSeq, int lSeqLength, char *rSeq, int rSeqLength, int segLength, + char *matrix, int *map_location, short *seqHashValue); + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.5/Output.c Fri Mar 09 07:35:51 2012 -0500 @@ -0,0 +1,231 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu +*/ + + +#include <stdio.h> +#include <stdlib.h> +#include <zlib.h> +#include <string.h> +#include "Common.h" +#include "Output.h" + +FILE *_out_fp; +gzFile _out_gzfp; + +char buffer[300000]; +int bufferSize = 0; + + +void finalizeGZOutput() +{ + gzclose(_out_gzfp); +} + +void finalizeTXOutput() +{ + fclose(_out_fp); +} + + +void gzOutputQ(SAM map) +{ + gzprintf(_out_gzfp, "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", + map.QNAME, + map.FLAG, + map.RNAME, + map.POS, + map.MAPQ, + map.CIGAR, + map.MRNAME, + map.MPOS, + map.ISIZE, + map.SEQ, + map.QUAL); + + int i; + + for ( i = 0; i < map.optSize; i++) + { + switch (map.optFields[i].type) + { + case 'A': + gzprintf(_out_gzfp, "\t%s:%c:%c", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].cVal); + break; + case 'i': + gzprintf(_out_gzfp, "\t%s:%c:%d", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].iVal); + break; + case 'f': + gzprintf(_out_gzfp, "\t%s:%c:%f", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].fVal); + break; + case 'Z': + case 'H': + gzprintf(_out_gzfp, "\t%s:%c:%s", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].sVal); + break; + } + } + gzprintf(_out_gzfp, "\n"); +} + +void outputSAM(FILE *fp, SAM map) +{ + fprintf(fp, "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", + map.QNAME, + map.FLAG, + map.RNAME, + map.POS, + map.MAPQ, + map.CIGAR, + map.MRNAME, + map.MPOS, + map.ISIZE, + map.SEQ, + map.QUAL); + + + int i; + + for ( i = 0; i < map.optSize; i++) + { + switch (map.optFields[i].type) + { + case 'A': + fprintf(fp, "\t%s:%c:%c", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].cVal); + break; + case 'i': + fprintf(fp, "\t%s:%c:%d", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].iVal); + break; + case 'f': + fprintf(fp, "\t%s:%c:%f", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].fVal); + break; + case 'Z': + case 'H': + fprintf(fp, "\t%s:%c:%s", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].sVal); + break; + } + } + + fprintf(fp, "\n"); + +} + +void outputQ(SAM map) +{ + + fprintf(_out_fp, "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", + map.QNAME, + map.FLAG, + map.RNAME, + map.POS, + map.MAPQ, + map.CIGAR, + map.MRNAME, + map.MPOS, + map.ISIZE, + map.SEQ, + map.QUAL); + + + int i; + + for ( i = 0; i < map.optSize; i++) + { + switch (map.optFields[i].type) + { + case 'A': + fprintf(_out_fp, "\t%s:%c:%c", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].cVal); + break; + case 'i': + fprintf(_out_fp, "\t%s:%c:%d", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].iVal); + break; + case 'f': + fprintf(_out_fp, "\t%s:%c:%f", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].fVal); + break; + case 'Z': + case 'H': + fprintf(_out_fp, "\t%s:%c:%s", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].sVal); + break; + } + } + + fprintf(_out_fp, "\n"); +} + +int initOutput ( char *fileName, int compressed) +{ + if (compressed) + { + char newFileName[strlen(fileName)+4]; + sprintf(newFileName, "%s.gz", fileName); + + _out_gzfp = fileOpenGZ(newFileName, "w1f"); + if (_out_gzfp == Z_NULL) + { + return 0; + } + + finalizeOutput = &finalizeGZOutput; + + output = &gzOutputQ; + } + else + { + _out_fp = fileOpen(fileName, "w"); + if (_out_fp == NULL) + { + return 0; + } + + finalizeOutput = &finalizeTXOutput; + output = &outputQ; + } + buffer[0] = '\0'; + return 1; +} + +FILE* getOutputFILE() +{ + if(_out_fp != NULL) + return _out_fp; + else if(_out_gzfp != NULL) + return _out_gzfp; + else + return NULL; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.5/Output.h Fri Mar 09 07:35:51 2012 -0500 @@ -0,0 +1,85 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu + */ + + + +#ifndef __OUTPUT__ +#define __OUTPUT__ + +#define FORWARD 0 +#define REVERSE 1 + +typedef struct +{ + char *tag; + char type; + char cVal; + int iVal; + float fVal; + char *sVal; +} OPT_FIELDS; + +typedef struct +{ + char *QNAME; + short FLAG; + char *RNAME; + int POS; + unsigned char MAPQ; + char *CIGAR; + char *MRNAME; + int MPOS; + int ISIZE; + char *SEQ; + char *QUAL; + + int optSize; + OPT_FIELDS *optFields; +} SAM; + +int initOutput(char *fileName, int compressed); +void (*finalizeOutput)(); +void (*output)(SAM map); +void outputSAM(FILE *fp, SAM map); + +FILE * getOutputFILE(); + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.5/Reads.c Fri Mar 09 07:35:51 2012 -0500 @@ -0,0 +1,798 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu + */ + + + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include <zlib.h> +#include "Common.h" +#include "Reads.h" + +#define CHARCODE(a) (a=='A' ? 0 : (a=='C' ? 1 : (a=='G' ? 2 : (a=='T' ? 3 : 4)))) + +FILE *_r_fp1; +FILE *_r_fp2; +gzFile _r_gzfp1; +gzFile _r_gzfp2; +Read *_r_seq; +int _r_seqCnt; +int *_r_samplingLocs; + +/**********************************************/ +char *(*readFirstSeq)(char *); +char *(*readSecondSeq)(char *); +/**********************************************/ +char *readFirstSeqTXT( char *seq ) +{ + return fgets(seq, SEQ_MAX_LENGTH, _r_fp1); +} + +/**********************************************/ +char *readSecondSeqTXT( char *seq ) +{ + return fgets(seq, SEQ_MAX_LENGTH, _r_fp2); +} +/**********************************************/ +char *readFirstSeqGZ( char *seq ) +{ + return gzgets(_r_gzfp1, seq, SEQ_MAX_LENGTH); +} + +/**********************************************/ +char *readSecondSeqGZ( char *seq ) +{ + return gzgets(_r_gzfp2, seq, SEQ_MAX_LENGTH); +} +/**********************************************/ +int toCompareRead(const void * elem1, const void * elem2) +{ + return strcmp(((Read *)elem1)->seq, ((Read *)elem2)->seq); +} +/**********************************************/ +int readAllReads(char *fileName1, + char *fileName2, + int compressed, + unsigned char *fastq, + unsigned char pairedEnd, + Read **seqList, + unsigned int *seqListSize) +{ + double startTime=getTime(); + + char seq1[SEQ_MAX_LENGTH]; + char rseq1[SEQ_MAX_LENGTH]; + char name1[SEQ_MAX_LENGTH]; + char qual1[SEQ_MAX_LENGTH]; + char seq2[SEQ_MAX_LENGTH]; + char rseq2[SEQ_MAX_LENGTH]; + char name2[SEQ_MAX_LENGTH]; + char qual2[SEQ_MAX_LENGTH]; + + char dummy[SEQ_MAX_LENGTH]; + char ch; + int err1, err2; + int nCnt; + int discarded = 0; + int seqCnt = 0; + int maxCnt = 0; + int i; + Read *list = NULL; + + int clipped = 0; + + + if (!compressed) + { + _r_fp1 = fileOpen( fileName1, "r"); + + if (_r_fp1 == NULL) + { + return 0; + } + + ch = fgetc(_r_fp1); + + if ( pairedEnd && fileName2 != NULL ) + { + _r_fp2 = fileOpen ( fileName2, "r" ); + if (_r_fp2 == NULL) + { + return 0; + } + } + else + { + _r_fp2 = _r_fp1; + } + + readFirstSeq = &readFirstSeqTXT; + readSecondSeq = &readSecondSeqTXT; + } + else + { + + _r_gzfp1 = fileOpenGZ (fileName1, "r"); + + if (_r_gzfp1 == NULL) + { + return 0; + } + + ch = gzgetc(_r_gzfp1); + + if ( pairedEnd && fileName2 != NULL ) + { + _r_gzfp2 = fileOpenGZ ( fileName2, "r" ); + if (_r_gzfp2 == NULL) + { + return 0; + } + } + else + { + _r_gzfp2 = _r_gzfp1; + } + + readFirstSeq = &readFirstSeqGZ; + readSecondSeq = &readSecondSeqGZ; + } + + if (ch == '>') + *fastq = 0; + else + *fastq = 1; + + // Counting the number of lines in the file + while (readFirstSeq(dummy)) maxCnt++; + + if (!compressed) + { + rewind(_r_fp1); + } + else + { + gzrewind(_r_gzfp1); + } + + // Calculating the Maximum # of sequences + if (*fastq) + { + maxCnt /= 4; + } + else + { + maxCnt /= 2; + } + + if (pairedEnd && fileName2 != NULL ) + maxCnt *= 2; + + list = getMem(sizeof(Read)*maxCnt); + + while( readFirstSeq(name1) ) + { + err1 = 0; + err2 = 0; + readFirstSeq(seq1); + name1[strlen(name1)-1] = '\0'; + + if ( *fastq ) + { + readFirstSeq(dummy); + readFirstSeq(qual1); + qual1[strlen(qual1)-1] = '\0'; + } + else + { + sprintf(qual1, "*"); + } + + // Cropping + if (cropSize > 0) + { + seq1[cropSize] = '\0'; + if ( *fastq ) + qual1[cropSize] = '\0'; + } + + + nCnt = 0; + for (i=0; i<strlen(seq1); i++) + { + seq1[i] = toupper (seq1[i]); + if (seq1[i] == 'N') + { + nCnt++; + } + else if (isspace(seq1[i])) + { + + seq1[i] = '\0'; + break; + } + } + + if (nCnt > errThreshold) + { + err1 = 1; + } + + // Reading the second seq of pair-ends + if (pairedEnd) + { + readSecondSeq(name2); + readSecondSeq(seq2); + name2[strlen(name2)-1] = '\0'; + + + if ( *fastq ) + { + readSecondSeq(dummy); + readSecondSeq(qual2); + + qual2[strlen(qual2)-1] = '\0'; + } + else + { + sprintf(qual2, "*"); + } + + + // Cropping + if (cropSize > 0) + { + seq2[cropSize] = '\0'; + if ( *fastq ) + qual2[cropSize] = '\0'; + } + + + nCnt = 0; + for (i=0; i<strlen(seq2); i++) + { + seq2[i] = toupper (seq2[i]); + if (seq2[i] == 'N') + { + nCnt++; + + } + else if (isspace(seq2[i])) + { + seq2[i] = '\0'; + } + } + if (nCnt > errThreshold) + { + err2 = 1; + } + + + if (strlen(seq1) < strlen(seq2)) { + seq2[strlen(seq1)] = '\0'; + if ( *fastq ) + qual2[strlen(seq1)] = '\0'; + if (!clipped) clipped = 2; + } + else if (strlen(seq1) > strlen(seq2)){ + seq1[strlen(seq2)] = '\0'; + if ( *fastq ) + qual1[strlen(seq2)] = '\0'; + if (!clipped) clipped = 1; + } + + if (clipped == 1 || clipped == 2){ + fprintf(stdout, "[PE mode Warning] Sequence lengths are different, read #%d is clipped to match.\n", clipped); + clipped = 3; + } + + + + } + + if (!pairedEnd && !err1) + { + int _mtmp = strlen(seq1); + list[seqCnt].hits = getMem (1+3*_mtmp+3+strlen(name1)+1); + list[seqCnt].seq = list[seqCnt].hits + 1; + list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; + list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; + list[seqCnt].name = list[seqCnt].qual + _mtmp+1; + + list[seqCnt].hashValue = getMem(sizeof(short)*_mtmp); + list[seqCnt].rhashValue = getMem(sizeof(short)*_mtmp); + + list[seqCnt].readNumber = seqCnt; + + reverseComplete(seq1, rseq1, _mtmp); + rseq1[_mtmp] = '\0'; + int i; + + list[seqCnt].hits[0] = 0; + + for (i=0; i<=_mtmp; i++) + { + list[seqCnt].seq[i] = seq1[i]; + list[seqCnt].rseq[i] = rseq1[i] ; + list[seqCnt].qual[i] = qual1[i]; + } + + + //MAKE HASH VALUE + short code = 0; + + for(i=0; i < 4; i++) + code = code * 5 + CHARCODE(list[seqCnt].seq[i]); + list[seqCnt].hashValue[0] = code; + + + for(i = 1; i < _mtmp-3; i++) + { + list[seqCnt].hashValue[i] = (list[seqCnt].hashValue[i-1] - 125 * CHARCODE(seq1[i-1])) * 5 + CHARCODE(seq1[i+3]); + } + + + code = 0; + for(i=0; i < 4; i++) + code = code * 5 + CHARCODE(list[seqCnt].rseq[i]); + list[seqCnt].rhashValue[0] = code; + + + for(i = 1; i < _mtmp-3; i++) + { + list[seqCnt].rhashValue[i] = (list[seqCnt].rhashValue[i-1] - 125 * CHARCODE(rseq1[i-1])) * 5 + CHARCODE(rseq1[i+3]); + } + + int j = 0; + int tmpSize = _mtmp / (errThreshold+1); + + list[seqCnt].hashValSampleSize = getMem(sizeof(int)*(errThreshold+1)); + for(i=0; i < errThreshold+1; i++) + { + code = 0; + for(j = 0; j < tmpSize; j++) + { + code = code * 5 + CHARCODE(list[seqCnt].seq[i*tmpSize+j]); + } + list[seqCnt].hashValSampleSize[i] = code; + } + + sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0'); + + seqCnt++; + + } + else if (pairedEnd && !err1 && !err2) + { + + // Naming Conventions X/1, X/2 OR X + int tmplen = strlen(name1); + if (strcmp(name1, name2) != 0) + { + tmplen = strlen(name1)-2; + } + + //first seq + int _mtmp = strlen(seq1); + list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1); + list[seqCnt].seq = list[seqCnt].hits + 1; + list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; + list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; + list[seqCnt].name = list[seqCnt].qual + _mtmp+1; + + list[seqCnt].hashValue = getMem(sizeof(short)*_mtmp); + list[seqCnt].rhashValue = getMem(sizeof(short)*_mtmp); + list[seqCnt].readNumber = seqCnt; + + reverseComplete(seq1, rseq1, _mtmp); + rseq1[_mtmp] = '\0'; + int i; + + list[seqCnt].hits[0] = 0; + + for (i=0; i<=_mtmp; i++) + { + list[seqCnt].seq[i] = seq1[i]; + list[seqCnt].rseq[i] = rseq1[i] ; + list[seqCnt].qual[i] = qual1[i]; + } + + + name1[tmplen]='\0'; + + //MAKE HASH VALUE + short code = 0; + + for(i=0; i < 4; i++) + code = code * 5 + CHARCODE(list[seqCnt].seq[i]); + list[seqCnt].hashValue[0] = code; + + + for(i = 1; i < _mtmp-3; i++) + { + list[seqCnt].hashValue[i] = (list[seqCnt].hashValue[i-1] - 125 * CHARCODE(seq1[i-1])) * 5 + CHARCODE(seq1[i+3]); + } + + + code = 0; + for(i=0; i < 4; i++) + code = code * 5 + CHARCODE(list[seqCnt].rseq[i]); + list[seqCnt].rhashValue[0] = code; + + + for(i = 1; i < _mtmp-3; i++) + { + list[seqCnt].rhashValue[i] = (list[seqCnt].rhashValue[i-1] - 125 * CHARCODE(rseq1[i-1])) * 5 + CHARCODE(rseq1[i+3]); + } + + int j = 0; + int tmpSize = _mtmp / (errThreshold+1); + + list[seqCnt].hashValSampleSize = getMem(sizeof(int)*(errThreshold+1)); + for(i=0; i < errThreshold+1; i++) + { + code = 0; + for(j = 0; j < tmpSize; j++) + { + code = code * 5 + CHARCODE(list[seqCnt].seq[i*tmpSize+j]); + } + list[seqCnt].hashValSampleSize[i] = code; + } + + sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0'); + + + seqCnt++; + + //second seq + list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1); + list[seqCnt].seq = list[seqCnt].hits + 1; + list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; + list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; + list[seqCnt].name = list[seqCnt].qual + _mtmp+1; + + list[seqCnt].hashValue = getMem(sizeof(short)*_mtmp); + list[seqCnt].rhashValue = getMem(sizeof(short)*_mtmp); + list[seqCnt].readNumber = seqCnt; + + reverseComplete(seq2, rseq2, _mtmp); + rseq2[_mtmp] = '\0'; + + list[seqCnt].hits[0] = 0; + + for (i=0; i<=_mtmp; i++) + { + list[seqCnt].seq[i] = seq2[i]; + list[seqCnt].rseq[i] = rseq2[i] ; + list[seqCnt].qual[i] = qual2[i]; + } + + + name2[tmplen]='\0'; + + //MAKE HASH VALUE + code = 0; + + for(i=0; i < 4; i++) + code = code * 5 + CHARCODE(list[seqCnt].seq[i]); + list[seqCnt].hashValue[0] = code; + + + for(i = 1; i < _mtmp-3; i++) + { + list[seqCnt].hashValue[i] = (list[seqCnt].hashValue[i-1] - 125 * CHARCODE(seq1[i-1])) * 5 + CHARCODE(seq1[i+3]); + } + + + code = 0; + for(i=0; i < 4; i++) + code = code * 5 + CHARCODE(list[seqCnt].rseq[i]); + list[seqCnt].rhashValue[0] = code; + + + for(i = 1; i < _mtmp-3; i++) + { + list[seqCnt].rhashValue[i] = (list[seqCnt].rhashValue[i-1] - 125 * CHARCODE(rseq1[i-1])) * 5 + CHARCODE(rseq1[i+3]); + } + + j = 0; + tmpSize = _mtmp / (errThreshold+1); + + list[seqCnt].hashValSampleSize = getMem(sizeof(int)*(errThreshold+1)); + for(i=0; i < errThreshold+1; i++) + { + code = 0; + for(j = 0; j < tmpSize; j++) + { + code = code * 5 + CHARCODE(list[seqCnt].seq[i*tmpSize+j]); + } + list[seqCnt].hashValSampleSize[i] = code; + } + + sprintf(list[seqCnt].name,"%s%c", ((char*)name2)+1,'\0'); + + seqCnt++; + + } + else + { + discarded++; + } + } + + if (seqCnt > 0) + { + SEQ_LENGTH = strlen(list[0].seq); + } + else + { + fprintf(stdout, "ERROR: No reads can be found for mapping\n"); + return 0; + } + + + // Closing Files + if (!compressed) + { + fclose(_r_fp1); + if ( pairedEnd && fileName2 != NULL ) + { + fclose(_r_fp2); + } + } + else + { + gzclose(_r_gzfp1); + if ( pairedEnd && fileName2 != NULL) + { + gzclose(_r_fp2); + } + } + + //qsort(list, seqCnt, sizeof(Read), toCompareRead); + + adjustQual(list, seqCnt); + + *seqList = list; + *seqListSize = seqCnt; + + + _r_seq = list; + _r_seqCnt = seqCnt; + + if ( pairedEnd ) discarded *= 2; + + if (seqCnt>1) + fprintf(stdout, "%d sequences are read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime()-startTime), discarded, getMemUsage()); + else + fprintf(stdout, "%d sequence is read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime()-startTime), discarded, getMemUsage()); + + + + return 1; +} +/**********************************************/ +void loadSamplingLocations(int **samplingLocs, int * samplingLocsSize) +{ + int i; + int samLocsSize = errThreshold + 1; + int *samLocs = getMem(sizeof(int)*samLocsSize); + + for (i=0; i<samLocsSize; i++) + { + samLocs[i] = (SEQ_LENGTH / samLocsSize) *i; + if ( samLocs[i] + WINDOW_SIZE > SEQ_LENGTH) + samLocs[i] = SEQ_LENGTH - WINDOW_SIZE; + } + + // Outputing the sampling locations + + /* + + int j; + for (i=0; i<SEQ_LENGTH; i++) + { + fprintf(stdout, "-"); + } + fprintf(stdout, "\n"); + + for ( i=0; i<samLocsSize; i++ ) + { + for ( j=0; j<samLocs[i]; j++ ) + { + fprintf(stdout," "); + } + for (j=0; j<WINDOW_SIZE; j++) + { + fprintf(stdout,"+"); + } + fprintf(stdout, "\n"); + fflush(stdout); + } + + + for ( i=0; i<SEQ_LENGTH; i++ ) + { + fprintf(stdout, "-"); + } + fprintf(stdout, "\n"); + + */ + + *samplingLocs = samLocs; + *samplingLocsSize = samLocsSize; + _r_samplingLocs = samLocs; +} + +void finalizeReads(char *fileName) +{ + FILE *fp1=NULL; + + if (fileName != NULL) + { + fp1 = fileOpen(fileName, "w"); + } + if (pairedEndMode) + _r_seqCnt /=2; + + int i=0; + for (i = 0; i < _r_seqCnt; i++) + { + if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] == 0 && strcmp(_r_seq[2*i].qual,"*")!=0) + { + fprintf(fp1,"@%s/1\n%s\n+\n%s\n@%s/2\n%s\n+\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].qual, _r_seq[i*2].name, _r_seq[i*2+1].seq, _r_seq[i*2+1].qual); + } + else if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] == 0) + { + fprintf(fp1, ">%s/1\n%s\n>%s/2\n%shits=%d\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq, _r_seq[2*i+1].hits[0]); + } + else if (!pairedEndMode && _r_seq[i].hits[0] == 0 && strcmp(_r_seq[i].qual, "*")!=0) + { + fprintf(fp1,"@%s\n%s\n+\n%s\n", _r_seq[i].name, _r_seq[i].seq, _r_seq[i].qual); + } + else if (!pairedEndMode && _r_seq[i].hits[0] == 0) + { + fprintf(fp1,">%s\n%s\n", _r_seq[i].name, _r_seq[i].seq); + } + } + + fclose(fp1); + if (pairedEndMode) + _r_seqCnt *= 2; + + for (i = 0; i < _r_seqCnt; i++) + { + freeMem(_r_seq[i].hits,0); + } + + + freeMem(_r_seq,0); + freeMem(_r_samplingLocs,0); +} + +void adjustQual(Read *list, int seqCnt){ + /* This function will automatically determine the phred_offset and readjust quality values if needed */ + int i,j,q, offset=64; + int len = strlen(list[0].qual); + + for (i=0; i<10000 && i<seqCnt; i++){ + for (j=0;j<len;j++){ + q = (int) list[i].qual[j] - offset; + if (q < 0){ + offset = 33; + break; + } + } + if (offset == 33) + break; + } + + if (offset == 64){ + fprintf(stdout, "[Quality Warning] Phred offset is 64. Readjusting to 33.\n"); + fflush(stdout); + for (i=0;i<seqCnt;i++){ + for (j=0;j<len;j++){ + list[i].qual[j] -= 31; + } + } + } +} + + + +/*void finalizeOEAReads(char *fileName) +{ + FILE *fp1=NULL; + FILE *fp2=NULL; + + //printf("OEA%s\n", fileName); + char fileNameOEA1[200]; + char fileNameOEA2[200]; + sprintf(fileNameOEA1, "%s_OEA1", fileName); + sprintf(fileNameOEA2, "%s_OEA2", fileName); + + + if (fileName != NULL) + { + fp1 = fileOpen(fileNameOEA1, "w"); + fp2 = fileOpen(fileNameOEA2, "w"); + } + if (pairedEndMode) + _r_seqCnt /=2; + + int i=0; + printf("%d\n", _r_seqCnt); + for (i = 0; i < _r_seqCnt; i++) + { + if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] != 0 && strcmp(_r_seq[2*i].qual, "*")==0) + { + fprintf(fp1,">%s/1\n%s\n>%s/2\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq); + } + else if (pairedEndMode && _r_seq[2*i].hits[0] != 0 && _r_seq[2*i+1].hits[0] == 0 && strcmp(_r_seq[2*i].qual, "*")==0) + { + fprintf(fp2, ">%s/1\n%s\n>%s/2\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq); + } + else if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] != 0 && strcmp(_r_seq[2*i].qual, "*")!=0) + { + fprintf(fp1,"@%s\n%s\n+\n%s\n@%s\n%s\n+\n%s\n", _r_seq[2*i].name, + _r_seq[2*i].seq, + _r_seq[2*i].qual, + _r_seq[2*i+1].name, + _r_seq[2*i+1].seq, + _r_seq[2*i+1].qual); + } + else if ( pairedEndMode && _r_seq[2*i].hits[0] != 0 && _r_seq[2*i+1].hits[0] == 0 && strcmp(_r_seq[2*i].qual, "*")!=0 ) + { + fprintf(fp2,"@%s\n%s\n+\n%s\n@%s\n%s\n+\n%s\n", _r_seq[2*i].name, + _r_seq[2*i].seq, + _r_seq[2*i].qual, + _r_seq[2*i+1].name, + _r_seq[2*i+1].seq, + _r_seq[2*i+1].qual); + } + } + + fclose(fp1); + fclose(fp2); + +}*/
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.5/Reads.h Fri Mar 09 07:35:51 2012 -0500 @@ -0,0 +1,65 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu + */ + + +#ifndef __READ__ +#define __READ__ + +typedef struct +{ + char *name; + char *seq; + char *rseq; + char *qual; + char *hits; + int readNumber; + short *hashValue; + short *rhashValue; + int *hashValSampleSize; +} Read; + +int readAllReads(char *fileName1, char *fileName2, int compressed, unsigned char *fastq, unsigned char pe, Read **seqList, unsigned int *seqListSize); +void loadSamplingLocations(int **samplingLocs, int *samplingLocsSize); +void finalizeReads(char *fileName); +void adjustQual(Read *list, int seqCnt); + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.5/RefGenome.c Fri Mar 09 07:35:51 2012 -0500 @@ -0,0 +1,157 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu + */ + + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include "Common.h" +#include "RefGenome.h" + +FILE *_rg_fp; +char *_rg_gen; +char *_rg_name; +int _rg_offset; +int _rg_contGen; + +/**********************************************/ +int initLoadingRefGenome(char *fileName) +{ + char ch; + _rg_fp = fileOpen (fileName, "r"); + if (fscanf(_rg_fp, "%c", &ch)) + { + if (ch == '>') + { + _rg_contGen = 0; + _rg_offset = 0; + _rg_gen = getMem(CONTIG_MAX_SIZE); + _rg_name = getMem(CONTIG_NAME_SIZE); + return 1; + } + } + return 0; +} +/**********************************************/ +void finalizeLoadingRefGenome() +{ + freeMem(_rg_gen, CONTIG_MAX_SIZE); + freeMem(_rg_name, CONTIG_NAME_SIZE); + fclose(_rg_fp); +} +/**********************************************/ +int loadRefGenome(char **refGen, char **refGenName, int *refGenOff) +{ + char ch; + int i; + int returnVal = 0; + int actualSize=0; + int size; + char *tmp; + + // New Conting + if (!_rg_contGen) + { + size = 0; + tmp = fgets(_rg_name, SEQ_MAX_LENGTH, _rg_fp); + } + else + { + size=strlen(_rg_gen); + for( i = 0 ; i < CONTIG_OVERLAP ; i++ ) + { + + _rg_gen[i] = _rg_gen[size-CONTIG_OVERLAP+i]; + if (_rg_gen[i] != 'N') + actualSize++; + } + size = CONTIG_OVERLAP; + } + while( fscanf(_rg_fp, "%c", &ch) > 0 ) + { + if (ch == '>') + { + _rg_contGen = 0; + returnVal = 1; + break; + } + else if (!isspace(ch)) + { + ch = toupper(ch); + _rg_gen[size++] = ch; + if (ch != 'N') + { + actualSize++; + } + if (actualSize == CONTIG_SIZE || size == CONTIG_MAX_SIZE) + { + _rg_contGen = 1; + returnVal=1; + break; + } + } + + } + + _rg_gen[size] = '\0'; + for (i=strlen(_rg_name)-1; i >= 0; i--) + if (!isspace(_rg_name[i])) + break; + _rg_name[i+1] = '\0'; + + *refGenOff = _rg_offset; + *refGenName = _rg_name; + *refGen = _rg_gen; + + if (_rg_contGen == 1) + { + _rg_offset += size-CONTIG_OVERLAP; + } + else + { + _rg_offset = 0; + } + + + return returnVal; +} +/**********************************************/
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.5/RefGenome.h Fri Mar 09 07:35:51 2012 -0500 @@ -0,0 +1,51 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu + */ + + + +#ifndef _REF_GENOME_ +#define _REF_GENOME_ + +int initLoadingRefGenome(char *fileName); +void finalizeLoadingRefGenome(); +int loadRefGenome(char **refGen, char **refGenName, int *refGenOff); + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.5/baseFAST.c Fri Mar 09 07:35:51 2012 -0500 @@ -0,0 +1,376 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu +*/ + + + + + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include "Common.h" +#include "CommandLineParser.h" +#include "Reads.h" +#include "Output.h" +#include "HashTable.h" +#include "MrFAST.h" + +char *versionNumber = "2.1"; // Current Version +unsigned char seqFastq; + +int main(int argc, char *argv[]) +{ + if (!parseCommandLine(argc, argv)) + return 1; + + configHashTable(); + /**************************************************** + * INDEXING + ***************************************************/ + if (indexingMode) + { + int i; + /******************************** + * Regular Mode + ********************************/ + configHashTable(); + for (i = 0; i < fileCnt; i++) + { + generateHashTable(fileName[i][0], fileName[i][1]); + } + } + /**************************************************** + * SEARCHING + ***************************************************/ + else + { + Read *seqList; + unsigned int seqListSize; + int fc; + int samplingLocsSize; + int *samplingLocs; + double totalLoadingTime = 0; + double totalMappingTime = 0; + double startTime; + double loadingTime; + double mappingTime; + double lstartTime; + double ppTime = 0.0; + double tmpTime;; + char *prevGen = getMem(CONTIG_NAME_SIZE); + prevGen[0]='\0'; + char *curGen; + int flag; + double maxMem=0; + char fname1[FILE_NAME_LENGTH]; + char fname2[FILE_NAME_LENGTH]; + char fname3[FILE_NAME_LENGTH]; + char fname4[FILE_NAME_LENGTH]; + char fname5[FILE_NAME_LENGTH]; + + char outputFileName[FILE_NAME_LENGTH]; + // Loading Sequences & Sampling Locations + startTime = getTime(); + if (!readAllReads(seqFile1, seqFile2, seqCompressed, &seqFastq, pairedEndMode, &seqList, &seqListSize)) + { + return 1; + } + + loadSamplingLocations(&samplingLocs, &samplingLocsSize); + totalLoadingTime += getTime()-startTime; + + if (pairedEndMode) + { + //Switching to Inferred Size + minPairEndedDistance = minPairEndedDistance - SEQ_LENGTH + 2; + maxPairEndedDistance = maxPairEndedDistance - SEQ_LENGTH + 2; + if (pairedEndDiscordantMode) + { + maxPairEndedDiscordantDistance = maxPairEndedDiscordantDistance - SEQ_LENGTH + 2; + minPairEndedDiscordantDistance = minPairEndedDiscordantDistance - SEQ_LENGTH + 2; + } + + sprintf(fname1, "__%s__1", mappingOutput); + sprintf(fname2, "__%s__2", mappingOutput); + sprintf(fname3, "__%s__disc", mappingOutput); + sprintf(fname4, "__%s__oea1", mappingOutput); + sprintf(fname5, "__%s__oea2", mappingOutput); + unlink(fname1); + unlink(fname2); + unlink(fname3); + unlink(fname4); + unlink(fname5); + } + + sprintf(outputFileName, "%s%s",mappingOutputPath , mappingOutput); + // Preparing output + initOutput(outputFileName, outCompressed); + + fprintf(stdout, "-----------------------------------------------------------------------------------------------------------\n"); + fprintf(stdout, "| %15s | %15s | %15s | %15s | %15s %15s |\n","Genome Name","Loading Time", "Mapping Time", "Memory Usage(M)","Total Mappings","Mapped reads"); + fprintf(stdout, "-----------------------------------------------------------------------------------------------------------\n"); + + /******************************** + * Regular Mode + ********************************/ + if (!pairedEndMode) + { + initLookUpTable(); + if(bestMode) + initBestMapping(seqListSize); + + for (fc = 0; fc <fileCnt; fc++) + { + if (!initLoadingHashTable(fileName[fc][1])) + { + return 1; + } + + mappingTime = 0; + loadingTime = 0; + prevGen[0] = '\0'; + flag = 1; + + do + { + flag = loadHashTable ( &tmpTime, errThreshold); // Reading a fragment + curGen = getRefGenomeName(); + + // First Time + if (flag && prevGen[0]== '\0') + { + sprintf(prevGen, "%s", curGen); + } + + if ( !flag || strcmp(prevGen, curGen)!=0) + { + + fprintf(stdout, "| %15s | %15.2f | %15.2f | %15.2f | %15lld %15lld |\n", + prevGen,loadingTime, mappingTime, maxMem, mappingCnt , mappedSeqCnt); + fflush(stdout); + + totalMappingTime += mappingTime; + totalLoadingTime += loadingTime; + + loadingTime = 0; + mappingTime = 0; + maxMem = 0; + + if (!flag) + { + break; + } + } + else if (progressRep && mappingTime != 0) + { + fprintf(stdout, "| %15s | %15.2f | %15.2f | %15.2f | %15lld %15lld |\n", + prevGen,loadingTime, mappingTime, maxMem, mappingCnt , mappedSeqCnt); + fflush(stdout); + } + + sprintf(prevGen, "%s", curGen); + + loadingTime += tmpTime; +// lstartTime = getTime(); + + initFAST(seqList, seqListSize, samplingLocs, samplingLocsSize, fileName[fc][0]); + + lstartTime = getTime(); + + + mapAllSingleEndSeq(); + + + + mappingTime += getTime() - lstartTime; + if (maxMem < getMemUsage()) + { + maxMem = getMemUsage(); + } + } while (flag); + + } // end for; + if(bestMode) + finalizeBestSingleMapping(); + finalizeFAST(); + finalizeLoadingHashTable(); + + } + // Pairedend Mapping Mode + else + { + initLookUpTable(); + if(pairedEndMode) + initBestConcordantDiscordant(seqListSize); + for (fc = 0; fc < fileCnt; fc++) + { + if (!initLoadingHashTable(fileName[fc][1])) + { + return 1; + } + mappingTime = 0; + loadingTime = 0; + prevGen[0] = '\0'; + flag = 1; + + do + { + flag = loadHashTable ( &tmpTime , errThreshold); // Reading a fragment + curGen = getRefGenomeName(); + + // First Time + if (flag && prevGen[0]== '\0') + { + sprintf(prevGen, "%s", curGen); + } + + if ( !flag || strcmp(prevGen, curGen)!=0) + { + + // DISCORDANT + lstartTime = getTime(); + outputPairedEnd(); + mappingTime += getTime() - lstartTime; + //DISCORDANT + + fprintf(stdout, "| %15s | %15.2f | %15.2f | %15.2f | %15lld %15lld |\n", + prevGen,loadingTime, mappingTime, maxMem, mappingCnt , mappedSeqCnt); + fflush(stdout); + + totalMappingTime += mappingTime; + totalLoadingTime += loadingTime; + + loadingTime = 0; + mappingTime = 0; + maxMem = 0; + + if (!flag) + { + break; + } + } + else if (progressRep && mappingTime != 0) + { + fprintf(stdout, "| %15s | %15.2f | %15.2f | %15.2f | %15lld %15lld |\n", + prevGen,loadingTime, mappingTime, maxMem, mappingCnt , mappedSeqCnt); + fflush(stdout); + } + + sprintf(prevGen, "%s", curGen); + + loadingTime += tmpTime; + lstartTime = getTime(); + + initFAST(seqList, seqListSize, samplingLocs, samplingLocsSize, fileName[fc][0]); + + mapPairedEndSeq(); + + mappingTime += getTime() - lstartTime; + if (maxMem < getMemUsage()) + { + maxMem = getMemUsage(); + } + + } while (flag); + + } // end for; + + if(pairedEndMode) + { + sprintf(outputFileName, "%s%s",mappingOutputPath , mappingOutput); + finalizeOEAReads(outputFileName); + outputAllTransChromosal(transChromosal); + finalizeBestConcordantDiscordant(); + } + + finalizeLoadingHashTable(); + + if (pairedEndDiscordantMode) + { + lstartTime = getTime(); + outputPairedEndDiscPP(); + ppTime = getTime() - lstartTime; + } + finalizeFAST(); + } //else + + finalizeOutput(); + + fprintf(stdout, "-----------------------------------------------------------------------------------------------------------\n"); + fprintf(stdout, "%19s%16.2f%18.2f\n\n", "Total:",totalLoadingTime, totalMappingTime); + if (pairedEndDiscordantMode) + fprintf(stdout, "Post Processing Time: %18.2f \n", ppTime); + fprintf(stdout, "%-30s%10.2f\n","Total Time:", totalMappingTime+totalLoadingTime); + fprintf(stdout, "%-30s%10d\n","Total No. of Reads:", seqListSize); + fprintf(stdout, "%-30s%10lld\n","Total No. of Mappings:", mappingCnt); + fprintf(stdout, "%-30s%10.0f\n\n","Avg No. of locations verified:", ceil((float)verificationCnt/seqListSize)); + + int cof = (pairedEndMode)?2:1; + + if (progressRep && maxHits != 0) + { + int frequency[maxHits+1]; + int i; + for ( i=0 ; i <= maxHits; i++) + { + frequency[i] = 0; + } + + + for (fc = 0; fc < seqListSize; fc++) + { + frequency[(int)(*(seqList[fc*cof].hits))]++; + } + frequency[maxHits] = completedSeqCnt; + for ( i=0 ; i <= maxHits; i++) + { + fprintf(stdout, "%-30s%10d%10d%10.2f%%\n","Reads Mapped to ", i, frequency[i], 100*(float)frequency[i]/(float)seqListSize); + } + } + + + finalizeReads(unmappedOutput); + freeMem(prevGen, CONTIG_NAME_SIZE); + } + + return 1; +}