Mercurial > repos > calkan > mrsfast
changeset 0:ec628ba33878 default tip
Uploaded source code for mrsFAST
author | calkan |
---|---|
date | Tue, 21 Feb 2012 10:39:28 -0500 |
parents | |
children | |
files | mrsfast-2.3.0.2/CommandLineParser.c mrsfast-2.3.0.2/CommandLineParser.h mrsfast-2.3.0.2/Common.c mrsfast-2.3.0.2/Common.h mrsfast-2.3.0.2/HashTable.c mrsfast-2.3.0.2/HashTable.h mrsfast-2.3.0.2/Makefile mrsfast-2.3.0.2/MrsFAST.c mrsfast-2.3.0.2/MrsFAST.h mrsfast-2.3.0.2/Output.c mrsfast-2.3.0.2/Output.h mrsfast-2.3.0.2/Reads.c mrsfast-2.3.0.2/Reads.h mrsfast-2.3.0.2/RefGenome.c mrsfast-2.3.0.2/RefGenome.h mrsfast-2.3.0.2/baseFAST.c |
diffstat | 16 files changed, 4508 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrsfast-2.3.0.2/CommandLineParser.c Tue Feb 21 10:39:28 2012 -0500 @@ -0,0 +1,373 @@ +/* + * Copyright (c) <2008 - 2009>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Author : Faraz Hach + * Email : fhach AT cs DOT sfu + * Last Update : 2009-01-29 + */ + +#include <stdio.h> +#include <stdlib.h> +#include <getopt.h> +#include <string.h> +#include <ctype.h> +#include "Common.h" +#include "CommandLineParser.h" + +int uniqueMode=1; +int indexingMode; +int searchingMode; +int bisulfiteMode; +int pairedEndMode; +int pairedEndDiscordantMode; +int pairedEndProfilingMode; +int seqCompressed; +int outCompressed; +int cropSize = 0; +int progressRep = 0; +int minPairEndedDistance=-1; +int maxPairEndedDistance=-1; +int minPairEndedDiscordantDistance=-1; +int maxPairEndedDiscordantDistance=-1; +char *seqFile1; +char *seqFile2; +char *mappingOutput = "output"; +char *mappingOutputPath = ""; +char *unmappedOutput = ""; +char fileName[1000][2][FILE_NAME_LENGTH]; +int fileCnt; +unsigned char errThreshold=2; +unsigned char maxHits=0; +unsigned char WINDOW_SIZE = 12; +unsigned int CONTIG_SIZE; +unsigned int CONTIG_MAX_SIZE; + +void printHelp(); + +int parseCommandLine (int argc, char *argv[]) +{ + + int o; + int index; + char *fastaFile = NULL; + char *fastaOutputFile = NULL; + char *indexFile = NULL; + char *batchFile = NULL ; + int batchMode = 0; + static struct option longOptions[] = + { + + // {"bs", no_argument, &bisulfiteMode, 1}, + {"pe", no_argument, &pairedEndMode, 1}, + {"discordant-vh", no_argument, &pairedEndDiscordantMode, 1}, + {"profile", no_argument, &pairedEndProfilingMode, 1}, + {"seqcomp", no_argument, &seqCompressed, 1}, + {"outcomp", no_argument, &outCompressed, 1}, + {"progress", no_argument, &progressRep, 1}, + {"index", required_argument, 0, 'i'}, + {"search", required_argument, 0, 's'}, + {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'v'}, + {"seq", required_argument, 0, 'x'}, + {"seq1", required_argument, 0, 'x'}, + {"seq2", required_argument, 0, 'y'}, + {"ws", required_argument, 0, 'w'}, + {"min", required_argument, 0, 'l'}, + {"max", required_argument, 0, 'm'}, + {"crop", required_argument, 0, 'c'} + + }; + + + + while ( (o = getopt_long ( argc, argv, "f:i:u:o:s:e:n:bhv", longOptions, &index))!= -1 ) + { + switch (o) + { + case 'i': + indexingMode = 1; + fastaFile = optarg; + break; + case 's': + searchingMode = 1; + fastaFile = optarg; + break; + case 'b': + batchMode = 1; + break; + case 'c': + cropSize = atoi(optarg); + break; + case 'w': + WINDOW_SIZE = atoi(optarg); + break; + case 'x': + seqFile1 = optarg; + break; + case 'y': + seqFile2 = optarg; + break; + case 'u': + unmappedOutput = optarg; + break; + case 'o': + mappingOutput = getMem(FILE_NAME_LENGTH); + mappingOutputPath = getMem(FILE_NAME_LENGTH); + stripPath (optarg, &mappingOutputPath, &mappingOutput); + break; + case 'n': + maxHits = atoi(optarg); + break; + case 'e': + errThreshold = atoi(optarg); + break; + case 'l': + minPairEndedDistance = atoi(optarg); + break; + case 'm': + maxPairEndedDistance = atoi(optarg); + break; + case 'h': + printHelp(); + return 0; + break; + case 'v': + fprintf(stdout, "%s.%s\n", versionNumber, versionNumberF); + return 0; + break; + } + + } + + if (indexingMode + searchingMode != 1) + { + fprintf(stdout, "ERROR: Indexing / Searching mode should be selected\n"); + return 0; + } + + if (WINDOW_SIZE > 14 || WINDOW_SIZE < 8) + { + fprintf(stdout, "ERROR: Window size should be in [8..14]\n"); + return 0; + } + + + if ( indexingMode ) + { + CONTIG_SIZE = 15000000; + CONTIG_MAX_SIZE = 40000000; + + if (batchMode) + { + batchFile = fastaFile; + fastaFile = NULL; + } + + if (batchFile == NULL && fastaFile == NULL) + { + fprintf(stdout, "ERROR: Reference(s) should be indicated for indexing\n"); + return 0; + } + + if (pairedEndDiscordantMode) + { + fprintf(stdout, "ERROR: --discordant cannot be used in indexing mode. \n"); + return 0; + } + + } + + + if ( searchingMode ) + { + CONTIG_SIZE = 300000000; + CONTIG_MAX_SIZE = 300000000; + + + if (batchMode) + { + batchFile = fastaFile; + fastaFile = NULL; + } + + if (batchFile == NULL && fastaFile == NULL) + { + fprintf(stdout, "ERROR: Index File(s) should be indiciated for searching\n"); + return 0; + } + + if (seqFile1 == NULL && seqFile2 == NULL) + { + fprintf(stdout, "ERROR: Please indicate a sequence file for searching.\n"); + return 0; + } + + + if (!pairedEndMode && !bisulfiteMode && seqFile2 != NULL) + { + fprintf(stdout, "ERROR: Second File can be indicated in pairedend/bisulfite mode\n"); + return 0; + } + + if (pairedEndMode && (minPairEndedDistance <0 || maxPairEndedDistance < 0 || minPairEndedDistance > maxPairEndedDistance)) + { + fprintf(stdout, "ERROR: Please enter a valid range for pairedend sequences.\n"); + return 0; + } + + if (pairedEndMode && seqFile1 == NULL) + { + fprintf(stdout, "ERROR: Please indicate the first file for pairedend search.\n"); + return 0; + } + + if (!pairedEndMode && pairedEndDiscordantMode) + { + fprintf(stdout, "ERROR: --discordant should be used with --pe"); + return 0; + } + + if (!pairedEndMode && pairedEndProfilingMode) + { + fprintf(stdout, "ERROR: --profile should be used with --pe"); + return 0; + } + } + + int i = 0; + + + if (batchMode) + { + FILE *fp = fileOpen(batchFile, "r"); + + if (fp == NULL) + return 0; + + fileCnt = 0; + + while ( fgets(fileName[fileCnt][0], FILE_NAME_LENGTH, fp)) + { + for (i = strlen(fileName[fileCnt][0])-1; i>=0; i--) + if ( !isspace(fileName[fileCnt][0][i])) + break; + fileName[fileCnt][0][i+1] = '\0'; + + if (strcmp(fileName[fileCnt][0], "") != 0) + { + if (bisulfiteMode) + sprintf(fileName[fileCnt][1], "%s.bsindex", fileName[fileCnt][0]); + else + sprintf(fileName[fileCnt][1], "%s.index", fileName[fileCnt][0]); + fileCnt++; + } + } + } + else + { + sprintf(fileName[fileCnt][0], "%s", fastaFile); + if (bisulfiteMode) + sprintf(fileName[fileCnt][1], "%s.bsindex", fileName[fileCnt][0]); + else + sprintf(fileName[fileCnt][1], "%s.index", fileName[fileCnt][0]); + fileCnt++; + } + + + if (pairedEndProfilingMode) + { + + minPairEndedDistance = 0; + maxPairEndedDistance = 300000000; + + } + + if (pairedEndDiscordantMode) + { + minPairEndedDiscordantDistance = minPairEndedDistance; + maxPairEndedDiscordantDistance = maxPairEndedDistance; + + minPairEndedDistance = 0; + maxPairEndedDistance = 300000000; + } + + return 1; +} + + +void printHelp() +{ + char *errorType; + if (mrFAST) + { + fprintf(stdout,"mrFAST : Micro-Read Fast Alignment Search Tool.\n\n"); + fprintf(stdout,"Usage: mrFAST [options]\n\n"); + errorType="edit distance"; + } + else + { + fprintf(stdout,"mrsFAST : Micro-Read Substitutions (only) Fast Alignment Search Tool.\n\n"); + fprintf(stdout,"mrsFAST is a cache oblivious read mapping tool. mrsFAST capable of mapping\n"); + fprintf(stdout,"single and paired end reads to the reference genome. Bisulfite treated \n"); + fprintf(stdout,"sequences are not supported in this version. By default mrsFAST reports \n"); + fprintf(stdout,"the output in SAM format.\n\n"); + fprintf(stdout,"Usage: mrsFAST [options]\n\n"); + errorType="hamming distance"; + } + + fprintf(stdout,"General Options:\n"); + fprintf(stdout," -v|--version\t\tCurrent Version.\n"); + fprintf(stdout," -h\t\t\tShows the help file.\n"); + fprintf(stdout,"\n\n"); + + fprintf(stdout,"Indexing Options:\n"); + fprintf(stdout," --index [file]\t\tGenerate an index from the specified fasta file. \n"); + fprintf(stdout," -b\t\t\tIndicates the indexing will be done in batch mode.\n\t\t\tThe file specified in --index should contain the \n\t\t\tlist of fasta files.\n"); + fprintf(stdout," -ws [int]\t\tSet window size for indexing (default:12-min:8 max:14).\n"); + // fprintf(stdout," -bs \t\t\tBisulfite mode."); + fprintf(stdout,"\n\n"); + + fprintf(stdout,"Searching Options:\n"); + fprintf(stdout," --search [file]\tSearch the specified genome. Index file should be \n\t\t\tin same directory as the fasta file.\n"); + fprintf(stdout," -b\t\t\tIndicates the mapping will be done in batch mode. \n\t\t\tThe file specified in --search should contain the \n\t\t\tlist of fasta files.\n"); + fprintf(stdout," --pe \t\t\tSearch will be done in Pairedend mode.\n"); + // fprintf(stdout," --bs \t\t\tSearch will be done in Bisulfite mode.\n"); + fprintf(stdout," --seq [file]\t\tInput sequences in fasta/fastq format [file]. If \n\t\t\tpairend reads are interleaved, use this option.\n"); + fprintf(stdout," --seq1 [file]\t\tInput sequences in fasta/fastq format [file] (First \n\t\t\tfile). Use this option to indicate the first file of \n\t\t\tpair-end reads. You can use this option alone in \n\t\t\tbisulfite mode. \n"); + fprintf(stdout," --seq2 [file]\t\tInput sequences in fasta/fastq format [file] (Second \n\t\t\tfile). Use this option to indicate the second file of \n\t\t\tpair-end reads. You can use this option alone in \n\t\t\tbisulfite mode. \n"); + fprintf(stdout," -o [file]\t\tOutput of the mapped sequences. The default is output.\n"); + fprintf(stdout," --seqcomp \t\tIndicates that the input sequences are compressed(gz).\n"); + fprintf(stdout," --outcomp \t\tIndicates that output file should be compressed(gz).\n"); + // fprintf(stdout," -u [file]\t\tSave unmapped sequences to the [file] in fasta format.\n"); + fprintf(stdout," -n [int]\t\tMaximum number of locations reported for a sequence \n\t\t\t(default 0, all mappings). \n"); + fprintf(stdout," -e [int]\t\t%s (default 2).\n", errorType); + fprintf(stdout," --min [int]\t\tMin inferred distance allowed between two pairend sequences.\n"); + fprintf(stdout," --max [int]\t\tMax inferred distance allowed between two pairend sequences.\n"); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrsfast-2.3.0.2/CommandLineParser.h Tue Feb 21 10:39:28 2012 -0500 @@ -0,0 +1,42 @@ +/* + * Copyright (c) <2008 - 2009>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Author : Faraz Hach + * Email : fhach AT cs DOT sfu + * Last Update : 2009-12-08 + */ + + +#ifndef __COMMAND_LINE_PARSER__ +#define __COMMAND_LINE_PARSER__ + +int parseCommandLine (int argc, char *argv[]); + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrsfast-2.3.0.2/Common.c Tue Feb 21 10:39:28 2012 -0500 @@ -0,0 +1,168 @@ +/* + * Copyright (c) <2008 - 2009>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Author : Faraz Hach + * Email : fhach AT cs DOT sfu + * Last Update : 2009-02-01 + */ + + +#include <stdio.h> +#include <stdlib.h> +#include <sys/time.h> +#include <zlib.h> +#include <string.h> +#include "Common.h" + + +unsigned short SEQ_LENGTH = 0; +unsigned short QUAL_LENGTH = 0; +long long memUsage = 0; +/**********************************************/ +FILE *fileOpen(char *fileName, char *mode) +{ + FILE *fp; + fp = fopen (fileName, mode); + if (fp == NULL) + { + fprintf(stdout, "Error: Cannot Open the file %s\n", fileName); + fflush(stdout); + exit(0); + } + return fp; +} +/**********************************************/ +gzFile fileOpenGZ(char *fileName, char *mode) +{ + gzFile gzfp; + gzfp = gzopen (fileName, mode); + if (gzfp == Z_NULL) + { + fprintf(stdout, "Error: Cannot Open the file %s\n", fileName); + fflush(stdout); + exit(0); + } + return gzfp; +} +/**********************************************/ +double getTime(void) +{ + struct timeval t; + gettimeofday(&t, NULL); + return t.tv_sec+t.tv_usec/1000000.0; +} + +/**********************************************/ +char reverseCompleteChar(char c) +{ + char ret; + switch (c) + { + case 'A': + ret = 'T'; + break; + case 'T': + ret = 'A'; + break; + case 'C': + ret = 'G'; + break; + case 'G': + ret = 'C'; + break; + default: + ret = 'N'; + break; + } + return ret; +} +/**********************************************/ +void reverseComplete (char *seq, char *rcSeq , int length) +{ + int i; + for (i=0; i<length; i++) + { + rcSeq[i]=reverseCompleteChar (seq[length-1-i]) ; + } +} +/**********************************************/ +void * getMem(size_t size) +{ + memUsage+=size; + return malloc(size); +} +/**********************************************/ +void freeMem(void *ptr, size_t size) +{ + memUsage-=size; + free(ptr); +} +/**********************************************/ +double getMemUsage() +{ + return memUsage/1048576.0; +} +/**********************************************/ +void reverse (char *seq, char *rcSeq , int length) +{ + int i; + for (i=0; i<length; i++) + { + rcSeq[i]=seq[length-1-i]; + } +} +/**********************************************/ +void stripPath(char *full, char **path, char **fileName) +{ + int i; + int pos = -1; + + for (i=strlen(full)-1; i>=0; i--) + { + if (full[i]=='/') + { + pos = i; + break; + } + + } + + if (pos != -1) + { + sprintf(*fileName, "%s%c", (full+pos+1), '\0'); + full[pos+1]='\0'; + sprintf(*path,"%s%c", full, '\0'); + } + else + { + sprintf(*fileName, "%s%c", full, '\0'); + sprintf(*path,"%c", '\0'); + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrsfast-2.3.0.2/Common.h Tue Feb 21 10:39:28 2012 -0500 @@ -0,0 +1,98 @@ +/* + * Copyright (c) <2008 - 2009>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Author : Faraz Hach + * Email : fhach AT cs DOT sfu + * Last Update : 2009-12-08 + */ + + +#ifndef __COMMON__ +#define __COMMON__ + +#include <zlib.h> + +#define SEQ_MAX_LENGTH 200 // Seq Max Length +#define CONTIG_OVERLAP 200 // No. of characters overlapped between contings +#define CONTIG_NAME_SIZE 200 // Contig name max size +#define FILE_NAME_LENGTH 400 // Filename Max Length +#define DISCORDANT_CUT_OFF 800 + +extern unsigned int CONTIG_SIZE; +extern unsigned int CONTIG_MAX_SIZE; + + +extern unsigned char WINDOW_SIZE ; // WINDOW SIZE for indexing/searching +extern unsigned short SEQ_LENGTH; // Sequence(read) length +extern unsigned short QUAL_LENGTH; + +extern char *versionNumber; +extern char *versionNumberF; +extern unsigned char mrFAST; + + +extern int uniqueMode; +extern int indexingMode; +extern int searchingMode; +extern int bisulfiteMode; +extern int pairedEndMode; +extern int pairedEndDiscordantMode; +extern int pairedEndProfilingMode; +extern int seqCompressed; +extern int outCompressed; +extern int cropSize; +extern int progressRep; +extern char *seqFile1; +extern char *seqFile2; +extern char *seqUnmapped; +extern char *mappingOutput; +extern char *mappingOutputPath; +extern char *unmappedOutput; +extern unsigned char seqFastq; +extern unsigned char errThreshold; +extern unsigned char maxHits; +extern int minPairEndedDiscordantDistance; +extern int maxPairEndedDiscordantDistance; +extern int minPairEndedDistance; +extern int maxPairEndedDistance; +extern char fileName[1000][2][FILE_NAME_LENGTH]; +extern int fileCnt; +extern long long memUsage; + +FILE * fileOpen(char *fileName, char *mode); +gzFile fileOpenGZ(char *fileName, char *mode); +double getTime(void); +void reverseComplete (char *seq, char *rcSeq , int length); +void * getMem(size_t size); +void freeMem(void * ptr, size_t size); +double getMemUsage(); +void reverse (char *seq, char *rcSeq , int length); +void stripPath(char *full, char **path, char **fileName); +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrsfast-2.3.0.2/HashTable.c Tue Feb 21 10:39:28 2012 -0500 @@ -0,0 +1,428 @@ +/* + * Copyright (c) <2008 - 2009>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Author : Faraz Hach + * Email : fhach AT cs DOT sfu + * Last Update : 2009-12-08 + */ + + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include "Common.h" +#include "RefGenome.h" +#include "HashTable.h" +/**********************************************/ +FILE *_ih_fp = NULL; +IHashTable *_ih_hashTable = NULL; +int _ih_maxHashTableSize = 0; +char *_ih_refGen = NULL; +char *_ih_refGenName = NULL; +long long _ih_memUsage = 0; +int _ih_refGenOff = 0; +/**********************************************/ + +int hashVal(char *seq) +{ + int i=0; + int val=0, numericVal=0; + + while(i<WINDOW_SIZE) + { + switch (seq[i]) + { + case 'A': + numericVal = 0; + break; + case 'C': + numericVal = 1; + break; + case 'G' : + numericVal = 2; + break; + case 'T': + numericVal = 3; + break; + default: + return -1; + break; + } + val = (val << 2)|numericVal; + i++; + } + return val; +} +/**********************************************/ +void freeIHashTableContent(IHashTable *hashTable, unsigned int maxSize) +{ + int i=0; + for (i=0; i<maxSize; i++) + { + if (hashTable[i].locs != NULL) + { + freeMem(hashTable[i].locs, (hashTable[i].locs[0]+1)*(sizeof(unsigned int))); + hashTable[i].locs = NULL; + } + } +} +/**********************************************/ +void initSavingIHashTable(char *fileName) +{ + int tmp; + _ih_fp = fileOpen(fileName, "w"); + unsigned char bIndex = 0; // Bisulfite Index + + // First Two bytes are indicating the type of the index & window size + tmp = fwrite(&bIndex, sizeof(bIndex), 1, _ih_fp); + tmp = fwrite(&WINDOW_SIZE, sizeof(WINDOW_SIZE), 1, _ih_fp); + +} +/**********************************************/ +void finalizeSavingIHashTable() +{ + fclose(_ih_fp); +} +/**********************************************/ +void saveIHashTable(IHashTable *hashTable, unsigned int size, unsigned int maxSize, char *refGen, char *refGenName, int refGenOffset) +{ + int tmp; + + // Every Chunk starts with a byte indicating whether it has extra info; + unsigned char extraInfo = 0; + tmp = fwrite (&extraInfo, sizeof(extraInfo), 1, _ih_fp); + + short len = strlen(refGenName); + tmp = fwrite(&len, sizeof(len), 1, _ih_fp); + tmp = fwrite(refGenName, sizeof(char), len, _ih_fp); + + tmp = fwrite(&refGenOffset, sizeof(refGenOffset), 1, _ih_fp); + + unsigned int refGenLength = strlen(refGen); + tmp = fwrite(&refGenLength, sizeof(refGenLength), 1, _ih_fp); + tmp = fwrite(refGen, sizeof(char), refGenLength, _ih_fp); + + tmp = fwrite(&size, sizeof(size), 1, _ih_fp); + + int i=0,j=0; + unsigned char cnt=0; + for (i=0; i<maxSize; i++) + { + if (hashTable[i].locs != NULL) + { + tmp = fwrite(&i, sizeof(i), 1, _ih_fp); + + if (hashTable[i].locs[0] < 250) + { + cnt = hashTable[i].locs[0]; + tmp = fwrite(&cnt, sizeof(cnt), 1, _ih_fp); + } + else + { + cnt =0; + tmp = fwrite (&cnt, sizeof(cnt), 1, _ih_fp); + tmp = fwrite (&(hashTable[i].locs[0]), sizeof(hashTable[i].locs[0]), 1, _ih_fp); + } + + for (j=1; j<=hashTable[i].locs[0]; j++) + tmp = fwrite(&(hashTable[i].locs[j]), sizeof(hashTable[i].locs[j]), 1, _ih_fp); + } + } +} +/**********************************************/ +unsigned int addIHashTableLocation(IHashTable *hashTable, int hv, int location) +{ + unsigned int sizeInc = 0; + + if (hashTable[hv].locs == NULL) + { + sizeInc = 1; + hashTable[hv].locs = getMem (sizeof(unsigned int)*2); + hashTable[hv].locs[0]=1; + hashTable[hv].locs[1]=location; + } + else + { + int size = hashTable[hv].locs[0]; + int i; + unsigned int *tmp = getMem( (size + 2) * sizeof(unsigned int) ); + + for (i = 0; i <= size; i++) + { + tmp[i] = hashTable[hv].locs[i]; + } + size++; + tmp[0] = size; + tmp[size] = location; + + freeMem(hashTable[hv].locs, (hashTable[hv].locs[0]*(sizeof(unsigned int)))); + hashTable[hv].locs = tmp; + } + return sizeInc; +} +/**********************************************/ +void generateIHashTable(char *fileName, char *indexName) +{ + double startTime = getTime(); + unsigned int hashTableSize = 0; + unsigned int hashTableMaxSize = pow(4, WINDOW_SIZE); + IHashTable *hashTable = getMem(sizeof(IHashTable)*hashTableMaxSize); + char *refGenName; + char *refGen; + int refGenOff = 0; + int i, hv, l, flag; + + + for ( i = 0; i < hashTableMaxSize; i++) + { + hashTable[i].locs = NULL; + } + + //Loading Fasta File + if (!initLoadingRefGenome(fileName)) + return; + initSavingIHashTable(indexName); + + fprintf(stdout, "Generating Index from %s", fileName); + fflush(stdout); + + char *prev = getMem (CONTIG_NAME_SIZE); + prev[0]='\0'; + + do + { + flag = loadRefGenome (&refGen, &refGenName, &refGenOff); + + if ( strcmp(prev, refGenName) != 0) + { + fprintf(stdout, "\n - %s ", refGenName); + fflush(stdout); + sprintf(prev, "%s", refGenName); + } + else + { + fprintf(stdout, "."); + fflush(stdout); + } + + l = strlen(refGen) - WINDOW_SIZE; + + for (i=0; i < l; i++) + { + hv = hashVal(refGen + i); + if (hv != -1) + { + hashTableSize += addIHashTableLocation (hashTable, hv, i+1); + } + } + + saveIHashTable(hashTable, hashTableSize, hashTableMaxSize, refGen, refGenName, refGenOff); + freeIHashTableContent(hashTable, hashTableMaxSize); + hashTableSize = 0; + } while (flag); + + freeMem(prev, CONTIG_NAME_SIZE); + freeMem(hashTable, sizeof(IHashTable)*hashTableMaxSize); + + finalizeLoadingRefGenome(); + finalizeSavingIHashTable(); + + fprintf(stdout, "\nDONE in %0.2fs!\n", (getTime()-startTime)); +} + +/**********************************************/ +void finalizeLoadingIHashTable() +{ + freeIHashTableContent(_ih_hashTable, _ih_maxHashTableSize); + freeMem(_ih_hashTable, sizeof(IHashTable)* _ih_maxHashTableSize); + freeMem(_ih_refGen, strlen(_ih_refGen)+1) ; + freeMem(_ih_refGenName, strlen(_ih_refGenName)+1); + fclose(_ih_fp); +} + +/**********************************************/ +int loadIHashTable(double *loadTime) +{ + double startTime = getTime(); + unsigned char extraInfo = 0; + short len; + unsigned int refGenLength; + unsigned int hashTableSize; + unsigned int tmpSize; + int tmp; + int i=0,j=0; + + if ( fread(&extraInfo, sizeof(extraInfo), 1, _ih_fp) != sizeof(extraInfo) ) + return 0; + + freeIHashTableContent(_ih_hashTable, _ih_maxHashTableSize); + freeMem(_ih_refGen, strlen(_ih_refGen)+1) ; + freeMem(_ih_refGenName, strlen(_ih_refGenName)+1); + + // Reading Chr Name + tmp = fread(&len, sizeof(len), 1, _ih_fp); + _ih_refGenName = getMem(sizeof(char)* (len+1)); + tmp = fread(_ih_refGenName, sizeof(char), len, _ih_fp); + _ih_refGenName [len] ='\0'; + + tmp = fread(&_ih_refGenOff, sizeof (_ih_refGenOff), 1, _ih_fp); + + // Reading Size and Content of Ref Genome + tmp = fread(&refGenLength, sizeof(refGenLength), 1, _ih_fp); + _ih_refGen = getMem(sizeof(char)*(refGenLength+1)); + tmp = fread(_ih_refGen, sizeof(char), refGenLength, _ih_fp); + _ih_refGen[refGenLength]='\0'; + + //Reading Hashtable Size and Content + tmp = fread(&hashTableSize, sizeof(hashTableSize), 1, _ih_fp); + + unsigned int hv; + unsigned char cnt=0; + for (i=0; i<hashTableSize; i++) + { + tmp = fread(&hv, sizeof(hv), 1, _ih_fp); + tmp = fread(&cnt, sizeof(cnt), 1, _ih_fp); + + if (cnt>0) + { + tmpSize = cnt; + } + else + { + tmp = fread(&tmpSize, sizeof(tmpSize), 1, _ih_fp); + } + + _ih_hashTable[hv].locs = getMem( sizeof(unsigned int)* (tmpSize+1) ); + _ih_hashTable[hv].locs[0] = tmpSize; + + for (j=1; j<=tmpSize; j++) + tmp = fread(&(_ih_hashTable[hv].locs[j]), sizeof(_ih_hashTable[hv].locs[j]), 1, _ih_fp); + } + *loadTime = getTime()-startTime; + + return 1; +} +/**********************************************/ +unsigned int *getIHashTableCandidates(int hv) +{ + if ( hv != -1 ) + return _ih_hashTable[hv].locs; + else + return NULL; +} +/**********************************************/ +/**********************************************/ +/**********************************************/ +void configHashTable() +{ + if (WINDOW_SIZE <= 14) + { + generateHashTable = &generateIHashTable; + loadHashTable = &loadIHashTable; + finalizeLoadingHashTable = &finalizeLoadingIHashTable; + getCandidates = &getIHashTableCandidates; + } + else + { + + + } +} +/**********************************************/ +int initLoadingHashTable(char *fileName) +{ + int i; + unsigned char bsIndex; + int tmp; + + _ih_fp = fileOpen(fileName, "r"); + + if (_ih_fp == NULL) + return 0; + + tmp = fread(&bsIndex, sizeof(bsIndex), 1, _ih_fp); + if (bsIndex) + { + fprintf(stdout, "Error: Wrong Type of Index indicated"); + return 0; + } + + tmp = fread(&WINDOW_SIZE, sizeof(WINDOW_SIZE), 1, _ih_fp); + + configHashTable(); + + if (_ih_maxHashTableSize != pow(4, WINDOW_SIZE)) + { + + if (_ih_hashTable != NULL) + { + freeIHashTableContent(_ih_hashTable, _ih_maxHashTableSize); + freeMem(_ih_hashTable, sizeof(IHashTable)* _ih_maxHashTableSize); + freeMem(_ih_refGen, strlen(_ih_refGen)+1) ; + freeMem(_ih_refGenName, strlen(_ih_refGenName)+1); + } + + _ih_maxHashTableSize = pow(4, WINDOW_SIZE); + + _ih_hashTable = getMem (sizeof(IHashTable) * _ih_maxHashTableSize); + for (i=0; i<_ih_maxHashTableSize; i++) + _ih_hashTable[i].locs = NULL; + _ih_refGen = getMem(1); + _ih_refGen[0]='\0'; + _ih_refGenName = getMem(1); + _ih_refGenName[0] = '\0'; + } + + return 1; +} +/**********************************************/ +char *getRefGenome() +{ + return _ih_refGen; + +} +/**********************************************/ +char *getRefGenomeName() +{ + return _ih_refGenName; + +} +/**********************************************/ +int getRefGenomeOffset() +{ + return _ih_refGenOff; + +} +/**********************************************/ +HashTable *getHashTable() +{ + return NULL; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrsfast-2.3.0.2/HashTable.h Tue Feb 21 10:39:28 2012 -0500 @@ -0,0 +1,64 @@ +/* + * Copyright (c) <2008 - 2009>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Author : Faraz Hach + * Email : fhach AT cs DOT sfu + * Last Update : 2009-12-08 + */ + + +#ifndef __HASH_TABLE__ +#define __HASH_TABLE__ + +typedef struct HashTable +{ + long long hv; + int *locs; +} HashTable; + +typedef struct +{ + unsigned int *locs; +} IHashTable; + +int hashVal(char *seq); +void configHashTable(); +char *getRefGenome(); +char *getRefGenomeName(); +int getRefGenomeOffset(); +int initLoadingHashTable(char *fileName); +HashTable *getHashTable(); + +void (*generateHashTable)(char *fileName, char *indexName); +int (*loadHashTable)(double *loadTime); +void (*finalizeLoadingHashTable)(); +unsigned int *(*getCandidates)(int hv); + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrsfast-2.3.0.2/Makefile Tue Feb 21 10:39:28 2012 -0500 @@ -0,0 +1,10 @@ +ALL: mrsfast + +LDFLAGS=-static +LIBS=-lz -lm +CFLAGS=-O3 +CC=gcc + +mrsfast: baseFAST.o MrsFAST.o Common.o CommandLineParser.o RefGenome.o HashTable.o Reads.o Output.o + $(CC) $^ -o $@ ${LDFLAGS} ${LIBS} + rm *.o
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrsfast-2.3.0.2/MrsFAST.c Tue Feb 21 10:39:28 2012 -0500 @@ -0,0 +1,1757 @@ +/* + * Copyright (c) <2008 - 2009>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Author : Faraz Hach + * Email : fhach AT cs DOT sfu + * Last Update : 2009-01-29 + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include "Common.h" +#include "Reads.h" +#include "HashTable.h" +#include "Output.h" +#include "MrsFAST.h" +#include "RefGenome.h" + +float calculateScore(int index, char *seq, char *qual, int *err); +unsigned char mrFAST = 0; +char *versionNumberF="0.2"; + +long long verificationCnt = 0; +long long mappingCnt = 0; +long long mappedSeqCnt = 0; +long long completedSeqCnt = 0; +char *mappingOutput; +/**********************************************/ +char *_msf_refGen = NULL; +int _msf_refGenLength = 0; +int _msf_refGenOffset = 0; +char *_msf_refGenName = NULL; + +int _msf_refGenBeg; +int _msf_refGenEnd; + +IHashTable *_msf_hashTable = NULL; + +int *_msf_samplingLocs; +int *_msf_samplingLocsEnds; +int _msf_samplingLocsSize; + +Read *_msf_seqList; +int _msf_seqListSize; + +ReadIndexTable *_msf_rIndex = NULL; +int _msf_rIndexSize; +int _msf_rIndexMax; + +SAM _msf_output; + +OPT_FIELDS *_msf_optionalFields; + +char *_msf_op; + +char _msf_numbers[200][3]; +char _msf_cigar[5]; + +MappingInfo *_msf_mappingInfo; + +int *_msf_seqHits; +int _msf_openFiles = 0; +int _msf_maxLSize=0; +int _msf_maxRSize=0; +/**********************************************/ +int compare (const void *a, const void *b) +{ + return ((Pair *)a)->hv - ((Pair *)b)->hv; +} +/**********************************************/ +void preProcessReads() +{ + int i=0; + int j=0; + int pos = 0; + + _msf_rIndexMax = -1; + + int tmpSize = _msf_seqListSize*_msf_samplingLocsSize*2; + Pair *tmp = getMem(sizeof(Pair)*tmpSize); + + for (i=0; i<_msf_seqListSize; i++) + { + for (j=0; j< _msf_samplingLocsSize; j++) + { + + tmp[pos].hv = hashVal(_msf_seqList[i].seq+_msf_samplingLocs[j]); + tmp[pos].seqInfo = pos; + pos++; + } + for (j=0; j<_msf_samplingLocsSize; j++) + { + tmp[pos].hv = hashVal(_msf_seqList[i].rseq+_msf_samplingLocs[j]); + tmp[pos].seqInfo = pos; + pos++; + } + } + + qsort(tmp, tmpSize, sizeof(Pair), compare); + + + int uniq = 0; + int prev = -2; + int beg = -1; + int end = -1; + + for (i=0; i<tmpSize; i++) + { + if (prev != tmp[i].hv) + { + uniq ++; + prev = tmp[i].hv; + } + } + + _msf_rIndexSize = uniq; + _msf_rIndex = getMem(sizeof(ReadIndexTable)*_msf_rIndexSize); + prev = -2; + + j=0; + beg =0; + while (beg < tmpSize) + { + end = beg; + while (end+1<tmpSize && tmp[end+1].hv==tmp[beg].hv) + end++; + + _msf_rIndex[j].hv = tmp[beg].hv; + _msf_rIndex[j].seqInfo = getMem(sizeof(int)*(end-beg+2)); + _msf_rIndex[j].seqInfo[0] = end-beg+1; + if ((end-beg+1) > _msf_rIndexMax) + _msf_rIndexMax = end-beg+1; + + for (i=1; i<=_msf_rIndex[j].seqInfo[0]; i++) + { + _msf_rIndex[j].seqInfo[i]=tmp[beg+i-1].seqInfo; + } + j++; + beg = end+1; + } + freeMem(tmp, sizeof(Pair)*tmpSize); +} +/**********************************************/ +void initFAST(Read *seqList, int seqListSize, int *samplingLocs, int samplingLocsSize, char *genFileName) +{ + if (_msf_optionalFields == NULL) + { + _msf_op = getMem(SEQ_LENGTH); + if (pairedEndMode) + { + _msf_optionalFields = getMem(4*sizeof(OPT_FIELDS)); + } + else + { + _msf_optionalFields = getMem(2*sizeof(OPT_FIELDS)); + } + + int i; + for (i=0; i<200;i++) + { + sprintf(_msf_numbers[i],"%d%c",i, '\0'); + } + sprintf(_msf_cigar, "%dM", SEQ_LENGTH); + } + + if (_msf_samplingLocsEnds == NULL) + { + int i; + _msf_samplingLocs = samplingLocs; + _msf_samplingLocsSize = samplingLocsSize; + + _msf_samplingLocsEnds = malloc(sizeof(int)*_msf_samplingLocsSize); + for (i=0; i<_msf_samplingLocsSize; i++) + { + _msf_samplingLocsEnds[i]=_msf_samplingLocs[i]+WINDOW_SIZE-1; + } + + _msf_seqList = seqList; + _msf_seqListSize = seqListSize; + + preProcessReads(); + } + if (_msf_refGenName == NULL) + { + _msf_refGenName = getMem(SEQ_LENGTH); + } + _msf_refGen = getRefGenome(); + _msf_refGenLength = strlen(_msf_refGen); + _msf_refGenOffset = getRefGenomeOffset(); + sprintf(_msf_refGenName,"%s%c", getRefGenomeName(), '\0'); + + if (pairedEndMode && _msf_seqHits == NULL) + { + _msf_mappingInfo = getMem(seqListSize * sizeof (MappingInfo)); + + int i=0; + for (i=0; i<seqListSize; i++) + { + //_msf_mappingInfo[i].next = getMem(sizeof(MappingLocations)); + _msf_mappingInfo[i].next = NULL; + _msf_mappingInfo[i].size = 0; + } + + _msf_seqHits = getMem((_msf_seqListSize/2) * sizeof(int)); + + + for (i=0; i<_msf_seqListSize/2; i++) + { + _msf_seqHits[i] = 0; + } + + initLoadingRefGenome(genFileName); + } + + if (_msf_refGenOffset == 0) + { + _msf_refGenBeg = 1; + } + else + { + _msf_refGenBeg = CONTIG_OVERLAP - SEQ_LENGTH + 2; + } + _msf_refGenEnd = _msf_refGenLength - SEQ_LENGTH + 1; + + +} +/**********************************************/ +void finalizeFAST() +{ + freeMem(_msf_seqHits, (_msf_seqListSize/2) * sizeof(int)); + freeMem(_msf_refGenName, SEQ_LENGTH); + int i; + for (i=0; i<_msf_rIndexSize; i++) + { + freeMem(_msf_rIndex[i].seqInfo, _msf_rIndex[i].seqInfo[0]+1); + } + freeMem(_msf_rIndex, _msf_rIndexSize); +} + +/**********************************************/ +int verifySingleEnd(int index, char* seq, int offset) +{ + int curOff = 0; + int i; + + char *ref; + + int err; + int errCnt =0; + int errCntOff = 0; + int NCntOff = 0; + + int matchCnt = 0; + int pp = 0; + + ref = _msf_refGen + index - 1; + + verificationCnt++; + + for (i = 0; i < SEQ_LENGTH; i++) + { + err = *ref != *seq; + + errCnt += err; + if (errCnt > errThreshold) + { + + return -1; + } + + if (i >= _msf_samplingLocs[curOff] && i <= _msf_samplingLocsEnds[curOff]) + { + errCntOff += err; + NCntOff += (*seq == 'N'); + } + else if (curOff < _msf_samplingLocsSize && i>=_msf_samplingLocs[curOff+1]) + { + + if (errCntOff == 0 && NCntOff == 0 && offset > curOff) + { + return -1; + } + + errCntOff = 0; + NCntOff = 0; + curOff++; + + if ( i >= _msf_samplingLocs[curOff]) + { + errCntOff += err; + NCntOff += (*seq == 'N'); + } + } + + ref++; + seq++; + } + return errCnt; +} + +/**********************************************/ +int calculateMD(int index, char *seq, int err, char **opSeq) +{ + int i; + char *ref; + char *ver; + short matchCnt = 0; + char *op = *opSeq; + int pp = 0; + + ref = _msf_refGen + index-1; + ver = seq; + + if (err>0 || err == -1 ) + { + + err = 0; + for (i=0; i < SEQ_LENGTH; i++) + { + if (* ref != *ver) + { + err++; + if (matchCnt) + { + if (matchCnt < 10) + { + op[pp++]=_msf_numbers[matchCnt][0]; + } + else if (matchCnt < 100) + { + op[pp++]=_msf_numbers[matchCnt][0]; + op[pp++]=_msf_numbers[matchCnt][1]; + } + else + { + op[pp++]=_msf_numbers[matchCnt][0]; + op[pp++]=_msf_numbers[matchCnt][1]; + op[pp++]=_msf_numbers[matchCnt][2]; + } + + matchCnt = 0; + } + op[pp++]=*ref; + } + else + { + matchCnt++; + } + ref++; + ver++; + } + + } + if (err == 0) + { + matchCnt = SEQ_LENGTH; + } + + if (matchCnt>0) + { + if (matchCnt < 10) + { + op[pp++]=_msf_numbers[matchCnt][0]; + } + else if (matchCnt < 100) + { + op[pp++]=_msf_numbers[matchCnt][0]; + op[pp++]=_msf_numbers[matchCnt][1]; + } + else + { + op[pp++]=_msf_numbers[matchCnt][0]; + op[pp++]=_msf_numbers[matchCnt][1]; + op[pp++]=_msf_numbers[matchCnt][2]; + } + } + op[pp]='\0'; + + return err; +} + +/**********************************************/ +void mapSingleEndSeqListBal(unsigned int *l1, int s1, unsigned int *l2, int s2, int dir) +{ + + if (s1 == 0 || s2 == 0) + { + return; + } + else if (s1 == s2 && s1 <= 50) + { + + int j = 0; + int z = 0; + int *locs; + int *seqInfo; + char *_tmpSeq, *_tmpQual; + char rqual[QUAL_LENGTH+1]; + rqual[QUAL_LENGTH]='\0'; + + if (dir > 0) + { + locs = (int *) l1; + seqInfo = (int *) l2; + } + else + { + locs = (int *) l2; + seqInfo = (int *) l1; + } + + + for (j=0; j<s2; j++) + { + int re = _msf_samplingLocsSize * 2; + int r = seqInfo[j]/re; + if (maxHits!=0 && _msf_seqList[r].hits[0] == maxHits) + { + continue; + } + + int x = seqInfo[j] % re; + int o = x % _msf_samplingLocsSize; + char d = (x/_msf_samplingLocsSize)?1:0; + + + if (d) + { + reverse(_msf_seqList[r].qual, rqual, QUAL_LENGTH); + _tmpQual = rqual; + _tmpSeq = _msf_seqList[r].rseq; + } + else + { + _tmpQual = _msf_seqList[r].qual; + _tmpSeq = _msf_seqList[r].seq; + } + + + for (z=0; z<s1; z++) + { + int genLoc = locs[z]-_msf_samplingLocs[o]; + + + if ( genLoc < _msf_refGenBeg || genLoc > _msf_refGenEnd ) + continue; + + int err = -1; + + + + err = verifySingleEnd(genLoc, _tmpSeq, o); + + + + if (err != -1) + { + calculateMD(genLoc, _tmpSeq, err, &_msf_op); + mappingCnt++; + _msf_seqList[r].hits[0]++; + + _msf_output.QNAME = _msf_seqList[r].name; + _msf_output.FLAG = 16 * d; + _msf_output.RNAME = _msf_refGenName; + _msf_output.POS = genLoc + _msf_refGenOffset; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = _msf_cigar; + _msf_output.MRNAME = "*"; + _msf_output.MPOS = 0; + _msf_output.ISIZE = 0; + _msf_output.SEQ = _tmpSeq; + _msf_output.QUAL = _tmpQual; + + _msf_output.optSize = 2; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = err; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = _msf_op; + + output(_msf_output); + + + if (_msf_seqList[r].hits[0] == 1) + { + mappedSeqCnt++; + } + + if ( maxHits == 0 ) + { + _msf_seqList[r].hits[0] = 2; + } + + + if ( maxHits!=0 && _msf_seqList[r].hits[0] == maxHits) + { + completedSeqCnt++; + break; + } + } + + } + } + } + else + { + int tmp1=s1/2, tmp2= s2/2; + if (tmp1 != 0) + mapSingleEndSeqListBal(l1, tmp1, l2+tmp2, s2-tmp2, dir); + mapSingleEndSeqListBal(l2+tmp2, s2-tmp2, l1+tmp1, s1-tmp1, -dir); + if (tmp2 !=0) + mapSingleEndSeqListBal(l1+tmp1, s1-tmp1, l2, tmp2, dir); + if (tmp1 + tmp2 != 0) + mapSingleEndSeqListBal(l2, tmp2, l1, tmp1, -dir); + } +} + + +/**********************************************/ +void mapSingleEndSeqListTOP(unsigned int *l1, int s1, unsigned int *l2, int s2) +{ + if (s1 < s2) + { + mapSingleEndSeqListBal(l1, s1, l2, s1,1); + mapSingleEndSeqListTOP(l1, s1, l2+s1, s2-s1); + } + else if (s1 > s2) + { + mapSingleEndSeqListBal(l1, s2, l2, s2,1); + mapSingleEndSeqListTOP(l1+s2, s1-s2, l2, s2); + } + else + { + mapSingleEndSeqListBal(l1, s1, l2, s2,1); + } +} + + +/**********************************************/ +void mapSingleEndSeqList(unsigned int *l1, int s1, unsigned int *l2, int s2) +{ + if ( s2/s1 <= 2) + { + int j = 0; + int z = 0; + int *locs = (int *) l1; + int *seqInfo = (int *) l2; + char *_tmpSeq, *_tmpQual; + char rqual[QUAL_LENGTH+1]; + rqual[QUAL_LENGTH]='\0'; + + for (j=0; j<s2; j++) + { + int re = _msf_samplingLocsSize * 2; + int r = seqInfo[j]/re; + if (maxHits!=0 && _msf_seqList[r].hits[0] == maxHits) + { + continue; + } + + int x = seqInfo[j] % re; + int o = x % _msf_samplingLocsSize; + char d = (x/_msf_samplingLocsSize)?1:0; + + + if (d) + { + reverse(_msf_seqList[r].qual, rqual, QUAL_LENGTH); + _tmpQual = rqual; + _tmpSeq = _msf_seqList[r].rseq; + } + else + { + _tmpQual = _msf_seqList[r].qual; + _tmpSeq = _msf_seqList[r].seq; + } + + + for (z=0; z<s1; z++) + { + int genLoc = locs[z]-_msf_samplingLocs[o]; + + + if ( genLoc < _msf_refGenBeg || genLoc > _msf_refGenEnd ) + continue; + + int err = -1; + + + + err = verifySingleEnd(genLoc, _tmpSeq, o); + + + + if (err != -1) + { + calculateMD(genLoc, _tmpSeq, err, &_msf_op); + mappingCnt++; + _msf_seqList[r].hits[0]++; + + _msf_output.QNAME = _msf_seqList[r].name; + _msf_output.FLAG = 16 * d; + _msf_output.RNAME = _msf_refGenName; + _msf_output.POS = genLoc + _msf_refGenOffset; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = _msf_cigar; + _msf_output.MRNAME = "*"; + _msf_output.MPOS = 0; + _msf_output.ISIZE = 0; + _msf_output.SEQ = _tmpSeq; + _msf_output.QUAL = _tmpQual; + + _msf_output.optSize = 2; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = err; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = _msf_op; + + output(_msf_output); + + + if (_msf_seqList[r].hits[0] == 1) + { + mappedSeqCnt++; + } + + if ( maxHits == 0 ) + { + _msf_seqList[r].hits[0] = 2; + } + + + if ( maxHits!=0 && _msf_seqList[r].hits[0] == maxHits) + { + completedSeqCnt++; + break; + } + } + + } + } + } + else if (s1 == 1) + { + // fprintf(stderr, "1"); + int tmp = s2/2; + mapSingleEndSeqList(l1, s1, l2, tmp); + mapSingleEndSeqList(l1, s1, l2+tmp, s2-tmp); + } + else if (s2 == 1) + { + // fprintf(stderr, "2"); + int tmp = s1/2; + mapSingleEndSeqList(l1, tmp, l2, s2); + mapSingleEndSeqList(l1+tmp, s1-tmp, l2, s2); + } + else + { + // fprintf(stderr, "3"); + int tmp1=s1/2, tmp2= s2/2; + mapSingleEndSeqList(l1, tmp1, l2, tmp2); + mapSingleEndSeqList(l1+tmp1, s1-tmp1, l2, tmp2); + mapSingleEndSeqList(l1+tmp1, s1-tmp1, l2+tmp2, s2-tmp2); + mapSingleEndSeqList(l1, tmp1, l2+tmp2, s2-tmp2); + } +} +/**********************************************/ +int mapSingleEndSeq() +{ + int i = 0; + unsigned int *locs = NULL; + unsigned int *seqInfo = NULL; + while ( i < _msf_rIndexSize ) + { + + locs = getCandidates (_msf_rIndex[i].hv); + if ( locs != NULL) + { + seqInfo = _msf_rIndex[i].seqInfo; + mapSingleEndSeqListTOP (locs+1, locs[0], seqInfo+1, seqInfo[0]); + } + i++; + } + return 1; +} + + +/**********************************************/ +/**********************************************/ +/**********************************************/ +/**********************************************/ +/**********************************************/ +int compareOut (const void *a, const void *b) +{ + FullMappingInfo *aInfo = (FullMappingInfo *)a; + FullMappingInfo *bInfo = (FullMappingInfo *)b; + return aInfo->loc - bInfo->loc; +} + +/**********************************************/ +void mapPairedEndSeqList(unsigned int *l1, int s1, unsigned int *l2, int s2) +{ + if ( s2/s1 <= 2) + { + int j = 0; + int z = 0; + int *locs = (int *) l1; + int *seqInfo = (int *) l2; + char *_tmpSeq, *_tmpQual; + char rqual[QUAL_LENGTH+1]; + rqual[QUAL_LENGTH]='\0'; + + for (j=0; j<s2; j++) + { + int re = _msf_samplingLocsSize * 2; + int r = seqInfo[j]/re; + + if (pairedEndDiscordantMode && (_msf_seqList[r].hits[0] == 1 || (_msf_seqHits[r/2] > DISCORDANT_CUT_OFF) )) + { + continue; + } + + int x = seqInfo[j] % re; + int o = x % _msf_samplingLocsSize; + char d = (x/_msf_samplingLocsSize)?-1:1; + + + if (d==-1) + { + _tmpSeq = _msf_seqList[r].rseq; + } + else + { + _tmpSeq = _msf_seqList[r].seq; + } + + + for (z=0; z<s1; z++) + { + int genLoc = locs[z]-_msf_samplingLocs[o]; + + + if ( genLoc < _msf_refGenBeg || genLoc > _msf_refGenEnd ) + continue; + + int err = -1; + + + + err = verifySingleEnd(genLoc, _tmpSeq, o); + + + if (err != -1) + { + MappingLocations *parent = NULL; + MappingLocations *child = _msf_mappingInfo[r].next; + + genLoc+= _msf_refGenOffset; + + int i = 0; + for (i=0; i<(_msf_mappingInfo[r].size/MAP_CHUNKS); i++) + { + parent = child; + child = child->next; + } + + if (child==NULL) + { + MappingLocations *tmp = getMem(sizeof(MappingLocations)); + tmp->next = NULL; + tmp->loc[0]=genLoc * d; + if (parent == NULL) + _msf_mappingInfo[r].next = tmp; + else + parent->next = tmp; + } + else + { + child->loc[_msf_mappingInfo[r].size % MAP_CHUNKS] = genLoc * d; + } + + + + _msf_mappingInfo[r].size++; + + } + + } + } + } + else if (s1 == 1) + { + int tmp = s2/2; + mapPairedEndSeqList(l1, s1, l2, tmp); + mapPairedEndSeqList(l1, s1, l2+tmp, s2-tmp); + } + else if (s2 == 1) + { + int tmp = s1/2; + mapPairedEndSeqList(l1, tmp, l2, s2); + mapPairedEndSeqList(l1+tmp, s1-tmp, l2, s2); + } + else + { + int tmp1=s1/2, tmp2= s2/2; + mapPairedEndSeqList(l1, tmp1, l2, tmp2); + mapPairedEndSeqList(l1+tmp1, s1-tmp1, l2, tmp2); + mapPairedEndSeqList(l1+tmp1, s1-tmp1, l2+tmp2, s2-tmp2); + mapPairedEndSeqList(l1, tmp1, l2+tmp2, s2-tmp2); + } +} + +/**********************************************/ +int mapPairedEndSeq() +{ + int i = 0; + unsigned int *locs = NULL; + unsigned int *seqInfo = NULL; + while ( i < _msf_rIndexSize ) + { + locs = getCandidates (_msf_rIndex[i].hv); + if ( locs != NULL) + { + seqInfo = _msf_rIndex[i].seqInfo; + mapPairedEndSeqList(locs+1, locs[0], seqInfo+1, seqInfo[0]); + + } + i++; + } + + + char fname1[FILE_NAME_LENGTH]; + char fname2[FILE_NAME_LENGTH]; + MappingLocations *cur, *tmp; + int tmpOut; + int j; + int lmax=0, rmax=0; + + sprintf(fname1, "%s__%s__%d__1",mappingOutputPath, mappingOutput, _msf_openFiles); + sprintf(fname2, "%s__%s__%d__2",mappingOutputPath, mappingOutput, _msf_openFiles); + + FILE* out; + FILE* out1 = fileOpen(fname1, "w"); + FILE* out2 = fileOpen(fname2, "w"); + + _msf_openFiles++; + + for (i=0; i<_msf_seqListSize; i++) + { + + if (i%2==0) + { + out = out1; + + if (lmax < _msf_mappingInfo[i].size) + { + lmax = _msf_mappingInfo[i].size; + } + } + else + { + out = out2; + if (rmax < _msf_mappingInfo[i].size) + { + rmax = _msf_mappingInfo[i].size; + } + } + + tmpOut = fwrite(&(_msf_mappingInfo[i].size), sizeof(int), 1, out); + if (_msf_mappingInfo[i].size > 0) + { + cur = _msf_mappingInfo[i].next; + for (j=0; j < _msf_mappingInfo[i].size; j++) + { + if ( j>0 && j%MAP_CHUNKS==0) + { + cur = cur->next; + } + tmpOut = fwrite(&(cur->loc[j % MAP_CHUNKS]), sizeof(int), 1, out); + } + _msf_mappingInfo[i].size = 0; + // _msf_mappingInfo[i].next = NULL; + } + } + + _msf_maxLSize += lmax; + _msf_maxRSize += rmax; + + fclose(out1); + fclose(out2); + + //fprintf(stdout, "%d %d\n", _msf_maxLSize, _msf_maxRSize); + +} + +/**********************************************/ +void outputPairedEnd() +{ + + char *curGen; + char *curGenName; + int tmpOut; + + loadRefGenome(&_msf_refGen, &_msf_refGenName, &tmpOut); + + FILE* in1[_msf_openFiles]; + FILE* in2[_msf_openFiles]; + + char fname1[_msf_openFiles][FILE_NAME_LENGTH]; + char fname2[_msf_openFiles][FILE_NAME_LENGTH]; + + // discordant + FILE *out, *out1, *out2; + + char fname3[FILE_NAME_LENGTH]; + char fname4[FILE_NAME_LENGTH]; + char fname5[FILE_NAME_LENGTH]; + + if (pairedEndDiscordantMode) + { + sprintf(fname3, "%s__%s__disc", mappingOutputPath, mappingOutput); + sprintf(fname4, "%s__%s__oea1", mappingOutputPath, mappingOutput); + sprintf(fname5, "%s__%s__oea2", mappingOutputPath, mappingOutput); + out = fileOpen(fname3, "a"); + out1 = fileOpen(fname4, "a"); + out2 = fileOpen(fname5, "a"); + } + + + + int i; + + FullMappingInfo *mi1 = getMem(sizeof(FullMappingInfo) * _msf_maxLSize); + FullMappingInfo *mi2 = getMem(sizeof(FullMappingInfo) * _msf_maxRSize); + + + for (i=0; i<_msf_openFiles; i++) + { + sprintf(fname1[i], "%s__%s__%d__1", mappingOutputPath, mappingOutput, i); + sprintf(fname2[i], "%s__%s__%d__2", mappingOutputPath, mappingOutput, i); + in1[i] = fileOpen(fname1[i], "r"); + in2[i] = fileOpen(fname2[i], "r"); + } + + + int size; + int j, k; + int size1, size2; + + for (i=0; i<_msf_seqListSize/2; i++) + { + size1 = size2 = 0; + for (j=0; j<_msf_openFiles; j++) + { + tmpOut = fread(&size, sizeof(int), 1, in1[j]); + if ( size > 0 ) + { + for (k=0; k<size; k++) + { + + mi1[size1+k].dir = 1; + tmpOut = fread (&(mi1[size1+k].loc), sizeof(int), 1, in1[j]); + if (mi1[size1+k].loc<1) + { + mi1[size1+k].loc *= -1; + mi1[size1+k].dir = -1; + } + } + qsort(mi1+size1, size, sizeof(FullMappingInfo), compareOut); + size1+=size; + } + } + + for (j=0; j<_msf_openFiles; j++) + { + tmpOut = fread(&size, sizeof(int), 1, in2[j]); + if ( size > 0 ) + { + for (k=0; k<size; k++) + { + + mi2[size2+k].dir = 1; + tmpOut = fread (&(mi2[size2+k].loc), sizeof(int), 1, in2[j]); + + if (mi2[size2+k].loc<1) + { + mi2[size2+k].loc *= -1; + mi2[size2+k].dir = -1; + } + } + qsort(mi2+size2, size, sizeof(FullMappingInfo), compareOut); + size2+=size; + } + } + + //if (i == 6615) + // fprintf(stdout, "%d: %s %d %d ",i, _msf_seqList[i*2].name, size1, size2); + + int lm, ll, rl, rm; + int pos = 0; + + if (pairedEndDiscordantMode) + { + + for (j=0; j<size1; j++) + { + lm = mi1[j].loc - maxPairEndedDiscordantDistance + 1; + ll = mi1[j].loc - minPairEndedDiscordantDistance + 1; + rl = mi1[j].loc + minPairEndedDiscordantDistance - 1; + rm = mi1[j].loc + maxPairEndedDiscordantDistance - 1; + + while (pos<size2 && mi2[pos].loc < lm) + { + pos++; + } + + k = pos; + while (k<size2 && mi2[k].loc<=rm) + { + if ( mi2[k].loc <= ll || mi2[k].loc >= rl) + { + if ( (mi1[j].loc < mi2[k].loc && mi1[j].dir==1 && mi2[k].dir == -1) || + (mi1[j].loc > mi2[k].loc && mi1[j].dir==-1 && mi2[k].dir == 1) ) + { + _msf_seqList[i*2].hits[0]=1; + _msf_seqList[i*2+1].hits[0]=1; + size1=0; + size2=0; + break; + } + } + k++; + } + } + + _msf_seqHits[i]+= size1*size2; + if (_msf_seqHits[i]> DISCORDANT_CUT_OFF) + { + _msf_seqList[i*2].hits[0]=1; + _msf_seqList[i*2+1].hits[0]=1; + size1=0; + size2=0; + } + } + //if (i == 6615) + // fprintf(stdout, "%d %d\n", size1, size2); + + char *seq1, *seq2, *rseq1, *rseq2, *qual1, *qual2; + char rqual1[QUAL_LENGTH+1], rqual2[QUAL_LENGTH+1]; + rqual1[QUAL_LENGTH] = rqual2[QUAL_LENGTH] = '\0'; + seq1 = _msf_seqList[i*2].seq; + rseq1 = _msf_seqList[i*2].rseq; + qual1 = _msf_seqList[i*2].qual; + reverse(_msf_seqList[i*2].qual, rqual1, QUAL_LENGTH); + + seq2 = _msf_seqList[i*2+1].seq; + rseq2 = _msf_seqList[i*2+1].rseq; + qual2 = _msf_seqList[i*2+1].qual; + reverse(_msf_seqList[i*2+1].qual, rqual2, QUAL_LENGTH); + + + if (pairedEndDiscordantMode) + { + for (k=0; k<size1; k++) + { + int tm = -1; + mi1[k].score = calculateScore(mi1[k].loc, (mi1[k].dir==-1)?rseq1:seq1, (mi1[k].dir==-1)?rqual1:qual1, &tm); + mi1[k].err = tm; + } + + for (k=0; k<size2; k++) + { + int tm = -1; + mi2[k].score = calculateScore(mi2[k].loc, (mi2[k].dir==-1)?rseq2:seq2, (mi2[k].dir==-1)?rqual2:qual2, &tm); + mi2[k].err = tm; + } + + } + else + { + for (k=0; k<size1; k++) + { + mi1[k].err = calculateMD(mi1[k].loc, (mi1[k].dir==-1)?rseq1:seq1, -1, &_msf_op); + sprintf(mi1[k].md, "%s", _msf_op); + } + + for (k=0; k<size2; k++) + { + mi2[k].err = calculateMD(mi2[k].loc, (mi2[k].dir==-1)?rseq2:seq2, -1, &_msf_op); + sprintf(mi2[k].md, "%s", _msf_op); + } + } + pos = 0; + + for (j=0; j<size1; j++) + { + lm = mi1[j].loc - maxPairEndedDistance + 1; + ll = mi1[j].loc - minPairEndedDistance + 1; + rl = mi1[j].loc + minPairEndedDistance - 1; + rm = mi1[j].loc + maxPairEndedDistance - 1; + + //fprintf(stdout, "%d %d %d %d %d\n",lm, ll,mi1[j].loc ,rl, rm); + + while (pos<size2 && mi2[pos].loc < lm) + { + pos++; + } + + //fprintf(stdout, "POS: %d %d \n", pos, mi2[pos].loc); + + k = pos; + while (k<size2 && mi2[k].loc <= rm) + { + if (mi2[k].loc <= ll || mi2[k].loc >= rl) + { + if (pairedEndDiscordantMode) + { + int tmp; + int rNo = i; + int loc = mi1[j].loc*mi1[j].dir; + int err = mi1[j].err; + float sc = mi1[j].score; + + char l = strlen(_msf_refGenName); + + tmp = fwrite(&rNo, sizeof(int), 1, out); + + tmp = fwrite(&l, sizeof(char), 1, out); + tmp = fwrite(_msf_refGenName, sizeof(char), l, out); + + tmp = fwrite(&loc, sizeof(int), 1, out); + tmp = fwrite(&err, sizeof(char), 1, out); + tmp = fwrite(&sc, sizeof(float), 1, out); + + + loc = mi2[k].loc*mi2[k].dir; + err = mi2[k].err; + sc = mi2[k].score; + + tmp = fwrite(&loc, sizeof(int), 1, out); + tmp = fwrite(&err, sizeof(char), 1, out); + tmp = fwrite(&sc, sizeof(float), 1, out); + } // end discordant + else + { //start sampe + char *seq; + char *qual; + char d1; + char d2; + int isize; + int proper=0; + // ISIZE CALCULATION + // The distance between outer edges + isize = abs(mi1[j].loc - mi2[k].loc)+SEQ_LENGTH-1; + if (mi1[j].loc - mi2[k].loc > 0) + { + isize *= -1; + } + + d1 = (mi1[j].dir == -1)?1:0; + d2 = (mi2[k].dir == -1)?1:0; + + if ( d1 ) + { + seq = rseq1; + qual = rqual1; + } + else + { + seq = seq1; + qual = qual1; + } + + if ( (mi1[j].loc < mi2[k].loc && !d1 && d2) || + (mi1[j].loc > mi2[k].loc && d1 && !d2) ) + { + proper = 2; + } + else + { + proper = 0; + } + + + _msf_output.POS = mi1[j].loc; + _msf_output.MPOS = mi2[k].loc; + _msf_output.FLAG = 1+proper+16*d1+32*d2+64; + _msf_output.ISIZE = isize; + _msf_output.SEQ = seq, + _msf_output.QUAL = qual; + _msf_output.QNAME = _msf_seqList[i*2].name; + _msf_output.RNAME = _msf_refGenName; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = _msf_cigar; + _msf_output.MRNAME = "="; + + _msf_output.optSize = 2; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = mi1[j].err; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = mi1[j].md; + + + output(_msf_output); + + if ( d2 ) + { + seq = rseq2; + qual = rqual2; + } + else + { + seq = seq2; + qual = qual2; + } + + _msf_output.POS = mi2[k].loc; + _msf_output.MPOS = mi1[j].loc; + _msf_output.FLAG = 1+proper+16*d2+32*d1+128; + _msf_output.ISIZE = -isize; + _msf_output.SEQ = seq, + _msf_output.QUAL = qual; + _msf_output.QNAME = _msf_seqList[i*2].name; + _msf_output.RNAME = _msf_refGenName; + _msf_output.MAPQ = 255; + _msf_output.CIGAR = _msf_cigar; + _msf_output.MRNAME = "="; + + _msf_output.optSize = 2; + _msf_output.optFields = _msf_optionalFields; + + _msf_optionalFields[0].tag = "NM"; + _msf_optionalFields[0].type = 'i'; + _msf_optionalFields[0].iVal = mi2[k].err;; + + _msf_optionalFields[1].tag = "MD"; + _msf_optionalFields[1].type = 'Z'; + _msf_optionalFields[1].sVal = mi2[k].md; + + output(_msf_output); + } //end sampe + } + k++; + } + } + } + + if (pairedEndDiscordantMode) + { + fclose(out); + } + + + freeMem(mi1, sizeof(FullMappingInfo)*_msf_maxLSize); + freeMem(mi2, sizeof(FullMappingInfo)*_msf_maxRSize); + + for (i=0; i<_msf_openFiles; i++) + { + fclose(in1[i]); + fclose(in2[i]); + //fprintf(stdout, "%s %s \n", fname1[i], fname2[i]); + unlink(fname1[i]); + unlink(fname2[i]); + } + _msf_openFiles = 0; + +} + +/**********************************************/ +/**********************************************/ +/**********************************************/ +/**********************************************/ +float calculateScore(int index, char *seq, char *qual, int *err) +{ + int i; + char *ref; + char *ver; + + ref = _msf_refGen + index-1; + ver = seq; + float score = 1; + + if (*err > 0 || *err == -1) + { + *err = 0; + + for (i=0; i < SEQ_LENGTH; i++) + { + if (*ref != *ver) + { + //fprintf(stdout, "%c %c %d", *ref, *ver, *err); + (*err)++; + score *= 0.001 + 1/pow( 10, ((qual[i]-33)/10.0) ); + } + ref++; + ver++; + } + + } + return score; +} + +/**********************************************/ +void outputPairedEndDiscPP() +{ + char genName[SEQ_LENGTH]; + char fname1[FILE_NAME_LENGTH]; + char fname2[FILE_NAME_LENGTH]; + char fname3[FILE_NAME_LENGTH]; + char fname4[FILE_NAME_LENGTH]; + char fname5[FILE_NAME_LENGTH]; + char fname6[FILE_NAME_LENGTH]; + char l; + int loc1, loc2; + char err1, err2; + char dir1, dir2; + float sc1, sc2, lsc=0; + int flag = 0; + int rNo,lrNo = -1; + int tmp; + FILE *in, *in1, *in2, *out, *out1, *out2; + + sprintf(fname1, "%s__%s__disc", mappingOutputPath, mappingOutput); + sprintf(fname2, "%s__%s__oea1", mappingOutputPath, mappingOutput); + sprintf(fname3, "%s__%s__oea2", mappingOutputPath, mappingOutput); + sprintf(fname4, "%s%s_DIVET.vh", mappingOutputPath, mappingOutput); + sprintf(fname5, "%s%s_OEA1.vh", mappingOutputPath, mappingOutput); + sprintf(fname6, "%s%s_OEA2.vh", mappingOutputPath, mappingOutput); + + in = fileOpen(fname1, "r"); + in1 = fileOpen(fname2, "r"); + in2 = fileOpen(fname3, "r"); + out = fileOpen(fname4, "w"); + out1 = fileOpen(fname5, "w"); + out2 = fileOpen(fname6, "w"); + if (in != NULL) + { + flag = fread(&rNo, sizeof(int), 1, in); + } + else + { + flag = 0; + } + + + while (flag) + { + + tmp = fread(&l, sizeof(char), 1, in); + tmp = fread(genName, sizeof(char), l, in); + genName[l]='\0'; + tmp = fread(&loc1, sizeof(int), 1, in); + tmp = fread(&err1, sizeof(char), 1, in); + tmp = fread(&sc1, sizeof(float), 1, in); + tmp = fread(&loc2, sizeof(int), 1, in); + tmp = fread(&err2, sizeof(char), 1, in); + tmp = fread(&sc2, sizeof(float), 1, in); + + //if (rNo ==6615) + // fprintf(stdout, "%s %d: %d %0.20f %d %d %0.20f\n", genName, loc1, err1, sc1, loc2, err2, sc2); + + if (_msf_seqList[rNo*2].hits[0] % 2 == 0 && _msf_seqHits[rNo] < DISCORDANT_CUT_OFF) + { + dir1 = dir2 = 'F'; + + if (loc1 < 0) + { + dir1 = 'R'; + loc1 = -loc1; + } + + if (loc2 < 0) + { + dir2 = 'R'; + loc2 = -loc2; + } + + if (rNo != lrNo) + { + int j; + for (j=0; j<SEQ_LENGTH; j++) + { + lsc += _msf_seqList[rNo*2].qual[j]+_msf_seqList[rNo*2+1].qual[j]; + } + lsc /= 2*SEQ_LENGTH; + lsc -= 33; + lrNo = rNo; + } + + int inv = 0; + int eve = 0; + int dist = 0; + char event; + + //fprintf(stdout, "%c %c ", dir1, dir2); + + if ( dir1 == dir2 ) + { + event = 'V'; + //fprintf(stdout, "Inverstion \n"); + } + else + { + if (loc1 < loc2) + { + + //fprintf(stdout, "< %d ", loc2-loc1-SEQ_LENGTH); + + if (dir1 == 'R' && dir2 == 'F') + { + event = 'E'; + + //fprintf(stdout, "Everted \n"); + } + else if ( loc2 - loc1 >= maxPairEndedDiscordantDistance ) + { + event = 'D'; + //fprintf(stdout, "Deletion \n"); + } + else + { + event = 'I'; + //fprintf(stdout, "Insertion \n"); + } + } + else if (loc2 < loc1) + { + //fprintf(stdout, "> %d ", loc1-loc2-SEQ_LENGTH); + if (dir2 == 'R' && dir1 == 'F') + { + event = 'E'; + //fprintf(stdout, "Everted \n"); + } + else if ( loc1 - loc2 >= maxPairEndedDiscordantDistance ) + { + event = 'D'; + //fprintf(stdout, "Deletion \n"); + } + else + { + event = 'I'; + //fprintf(stdout, "Insertion \n"); + } + } + } + _msf_seqList[rNo*2].hits[0] = 2; + fprintf(out, "%s\t%s\t%d\t%d\t%c\t%s\t%d\t%d\t%c\t%c\t%d\t%0.0f\t%0.20f\n", + _msf_seqList[rNo*2].name, genName, loc1, (loc1+SEQ_LENGTH-1), dir1, genName, loc2, (loc2+SEQ_LENGTH-1), dir2, event, (err1+err2), lsc, sc1*sc2); + } + flag = fread(&rNo, sizeof(int), 1, in); + + } + + /* + MappingInfoNode *lr[_msf_seqListSize/2]; + MappingInfoNode *rr[_msf_seqListSize/2]; + MappingInfoNode *cur, *tmpDel, *cur2; + + + int ls[_msf_seqListSize/2]; + int rs[_msf_seqListSize/2]; + + + int i=0; + + for (i = 0; i<_msf_seqListSize/2; i++) + { + lr[i] = rr[i] = NULL; + ls[i] = rs[i] = 0; + } + + + + if (in1 != NULL) + { + flag = fread(&rNo, sizeof(int), 1, in1); + } + else + { + flag = 0; + } + + + while (flag) + { + tmp = fread(&loc1, sizeof(int), 1, in1); + tmp = fread(&err1, sizeof(char), 1, in1); + tmp = fread(&sc1, sizeof(float), 1, in1); + tmp = fread(&l, sizeof(char), 1, in1); + tmp = fread(genName, sizeof(char), l, in1); + genName[l]='\0'; + + if (_msf_seqList[rNo*2].hits[0] == 0) + { + + if ( ls[rNo] < DISCORDANT_CUT_OFF ) + { + ls[rNo]++; + + cur = lr[rNo]; + + if (cur !=NULL) + { + if (err1 == cur->err) + { + MappingInfoNode *nr = getMem(sizeof(MappingInfoNode)); + + nr->loc = loc1; + nr->err = err1; + nr->score = sc1; + nr->next = lr[rNo]; + sprintf(nr->chr,"%s", genName); + lr[rNo] = nr; + } + else if (err1 < cur->err) + { + MappingInfoNode *nr = getMem(sizeof(MappingInfoNode)); + + nr->loc = loc1; + nr->err = err1; + nr->score = sc1; + sprintf(nr->chr,"%s", genName); + nr->next = NULL; + lr[rNo] = nr; + while (cur!=NULL) + { + tmpDel = cur; + cur = cur->next; + freeMem(tmpDel, sizeof(MappingInfoNode)); + } +} +} +else +{ + + MappingInfoNode *nr = getMem(sizeof(MappingInfoNode)); + + nr->loc = loc1; + nr->err = err1; + nr->score = sc1; + sprintf(nr->chr,"%s", genName); + nr->next = NULL; + lr[rNo] = nr; +} + +if (ls[rNo] > DISCORDANT_CUT_OFF) +{ + cur = lr[rNo]; + while (cur!=NULL) + { + tmpDel = cur; + cur = cur->next; + freeMem(tmpDel, sizeof(MappingInfoNode)); + } +} +} + +} +flag = fread(&rNo, sizeof(int), 1, in1); + +} + + +if (in2 != NULL) +{ + flag = fread(&rNo, sizeof(int), 1, in2); +} +else +{ + flag = 0; +} + + +while (flag) +{ + tmp = fread(&loc1, sizeof(int), 1, in2); + tmp = fread(&err1, sizeof(char), 1, in2); + tmp = fread(&sc1, sizeof(float), 1, in2); + tmp = fread(&l, sizeof(char), 1, in2); + tmp = fread(genName, sizeof(char), l, in2); + genName[l]='\0'; + + if (_msf_seqList[rNo*2].hits[0] == 0) + { + + if ( rs[rNo] < DISCORDANT_CUT_OFF ) + { + rs[rNo]++; + + cur = rr[rNo]; + + if (cur !=NULL) + { + if (err1 == cur->err) + { + MappingInfoNode *nr = getMem(sizeof(MappingInfoNode)); + + nr->loc = loc1; + nr->err = err1; + nr->score = sc1; + nr->next = rr[rNo]; + sprintf(nr->chr,"%s", genName); + rr[rNo] = nr; + } + else if (err1 < cur->err) + { + MappingInfoNode *nr = getMem(sizeof(MappingInfoNode)); + + nr->loc = loc1; + nr->err = err1; + nr->score = sc1; + sprintf(nr->chr,"%s", genName); + nr->next = NULL; + rr[rNo] = nr; + while (cur!=NULL) + { + tmpDel = cur; + cur = cur->next; + freeMem(tmpDel, sizeof(MappingInfoNode)); + } + } + } + else + { + + MappingInfoNode *nr = getMem(sizeof(MappingInfoNode)); + + nr->loc = loc1; + nr->err = err1; + nr->score = sc1; + sprintf(nr->chr,"%s", genName); + nr->next = NULL; + rr[rNo] = nr; + } + + if (rs[rNo] > DISCORDANT_CUT_OFF) + { + cur = rr[rNo]; + while (cur!=NULL) + { + tmpDel = cur; + cur = cur->next; + freeMem(tmpDel, sizeof(MappingInfoNode)); + } + } + } + } + flag = fread(&rNo, sizeof(int), 1, in2); + +} + + +for (i=0; i<_msf_seqListSize/2; i++) +{ + int j; + for (j=0; j<SEQ_LENGTH; j++) + { + lsc += _msf_seqList[i*2].qual[j]+_msf_seqList[i*2+1].qual[j]; + } + lsc /= 2*SEQ_LENGTH; + lsc -= 33; + if (ls[i] * rs[i] < DISCORDANT_CUT_OFF && ls[i] & rs[i] > 0) + { + cur = lr[i]; + while (cur != NULL) + { + cur2 = rr[i]; + if (cur->loc < 0) + { + dir1 = 'R'; + loc1 = -cur->loc; + } + else + { + dir1 = 'F'; + loc1 = cur->loc; + } + while (cur2 != NULL) + { + + if (cur2->loc < 0) + { + dir2 = 'R'; + loc2 = -cur2->loc; + } + else + { + dir2 = 'F'; + loc2 = cur2->loc; + } + + fprintf(out, "%s\t%s\t%d\t%d\t%c\t%s\t%d\t%d\t%c\t%c\t%d\t%0.0f\t%0.20f\n", + _msf_seqList[i*2].name, cur->chr, loc1, (loc1+SEQ_LENGTH-1), dir1, cur2->chr, loc2, (loc2+SEQ_LENGTH-1), dir2, 'T', (cur->err+cur2->err), lsc, cur->score*cur2->score); + cur2 = cur2->next; + } + cur = cur->next; + } + } + +}*/ + + +fclose(in); +fclose(in1); +fclose(in2); +fclose(out); +fclose(out1); +fclose(out2); + +unlink(fname1); +unlink(fname2); +unlink(fname3); +unlink(fname5); +unlink(fname6); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrsfast-2.3.0.2/MrsFAST.h Tue Feb 21 10:39:28 2012 -0500 @@ -0,0 +1,103 @@ +/* + * Copyright (c) <2008 - 2009>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Author : Faraz Hach + * Email : fhach AT cs DOT sfu + * Last Update : 2009-12-08 + */ + + +#ifndef __MRS_FAST__ +#define __MRS_FAST__ + +#include "Reads.h" + +#define MAP_CHUNKS 15 + +// Pair is used to pre-processing and making the read index table +typedef struct +{ + int hv; + int seqInfo; +} Pair; + +typedef struct +{ + int hv; + unsigned int *seqInfo; +} ReadIndexTable; + + +typedef struct mn +{ + int loc; + char dir; + char err; + float score; + char md[40]; + char chr[10]; +} FullMappingInfo; + +typedef struct lc +{ + int loc[MAP_CHUNKS]; + struct lc *next; +} MappingLocations; + +typedef struct inf +{ + int size; + MappingLocations *next; +} MappingInfo; + +typedef struct +{ + FILE * fp; + char name[400]; +} FILE_STRUCT; + +extern long long verificationCnt; +extern long long mappingCnt; +extern long long mappedSeqCnt; +extern long long completedSeqCnt; + +void initFAST( Read *seqList, + int seqListSize, + int *samplingLocs, + int samplingLocsSize, + char *fileName); + +void finalizeFAST(); + +int mapSingleEndSeq(); +int mapPaiedEndSeq(); +void outputPairedEnd(); +void outputPairedEndDiscPP(); +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrsfast-2.3.0.2/Output.c Tue Feb 21 10:39:28 2012 -0500 @@ -0,0 +1,175 @@ +/* + * Copyright (c) <2008 - 2009>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Author : Faraz Hach + * Email : fhach AT cs DOT sfu + * Last Update : 2009-12-08 + */ + + +#include <stdio.h> +#include <stdlib.h> +#include <zlib.h> +#include <string.h> +#include "Common.h" +#include "Output.h" + +FILE *_out_fp; +gzFile _out_gzfp; + + + +void finalizeGZOutput() +{ + gzclose(_out_gzfp); +} + +void finalizeTXOutput() +{ + fclose(_out_fp); +} + + +void gzOutputQ(SAM map) +{ + gzprintf(_out_gzfp, "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", + map.QNAME, + map.FLAG, + map.RNAME, + map.POS, + map.MAPQ, + map.CIGAR, + map.MRNAME, + map.MPOS, + map.ISIZE, + map.SEQ, + map.QUAL); + + int i; + + for ( i = 0; i < map.optSize; i++) + { + switch (map.optFields[i].type) + { + case 'A': + gzprintf(_out_gzfp, "\t%s:%c:%c", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].cVal); + break; + case 'i': + gzprintf(_out_gzfp, "\t%s:%c:%d", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].iVal); + break; + case 'f': + gzprintf(_out_gzfp, "\t%s:%c:%f", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].fVal); + break; + case 'Z': + case 'H': + gzprintf(_out_gzfp, "\t%s:%c:%s", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].sVal); + break; + } + } + gzprintf(_out_gzfp, "\n"); +} + +void outputQ(SAM map) +{ + + fprintf(_out_fp, "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", + map.QNAME, + map.FLAG, + map.RNAME, + map.POS, + map.MAPQ, + map.CIGAR, + map.MRNAME, + map.MPOS, + map.ISIZE, + map.SEQ, + map.QUAL); + + + int i; + + for ( i = 0; i < map.optSize; i++) + { + switch (map.optFields[i].type) + { + case 'A': + fprintf(_out_fp, "\t%s:%c:%c", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].cVal); + break; + case 'i': + fprintf(_out_fp, "\t%s:%c:%d", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].iVal); + break; + case 'f': + fprintf(_out_fp, "\t%s:%c:%f", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].fVal); + break; + case 'Z': + case 'H': + fprintf(_out_fp, "\t%s:%c:%s", map.optFields[i].tag, map.optFields[i].type, map.optFields[i].sVal); + break; + } + } + + fprintf(_out_fp, "\n"); +} + +int initOutput ( char *fileName, int compressed) +{ + if (compressed) + { + char newFileName[strlen(mappingOutputPath)+strlen(fileName)+4]; + sprintf(newFileName, "%s%s.gz", mappingOutputPath, fileName); + _out_gzfp = fileOpenGZ(newFileName, "w1f"); + if (_out_gzfp == Z_NULL) + { + return 0; + } + + finalizeOutput = &finalizeGZOutput; + + output = &gzOutputQ; + } + else + { + + char newFileName[strlen(mappingOutputPath)+strlen(fileName)]; + sprintf(newFileName, "%s%s", mappingOutputPath, fileName); + + _out_fp = fileOpen(newFileName, "w"); + if (_out_fp == NULL) + { + return 0; + } + + finalizeOutput = &finalizeTXOutput; + output = &outputQ; + } + return 1; +} + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrsfast-2.3.0.2/Output.h Tue Feb 21 10:39:28 2012 -0500 @@ -0,0 +1,76 @@ +/* + * Copyright (c) <2008 - 2009>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Author : Faraz Hach + * Email : fhach AT cs DOT sfu + * Last Update : 2009-12-08 + */ + + +#ifndef __OUTPUT__ +#define __OUTPUT__ + +#define FORWARD 0 +#define REVERSE 1 + +typedef struct +{ + char *tag; + char type; + char cVal; + int iVal; + float fVal; + char *sVal; +} OPT_FIELDS; + +typedef struct +{ + char *QNAME; + short FLAG; + char *RNAME; + int POS; + unsigned char MAPQ; + char *CIGAR; + char *MRNAME; + int MPOS; + int ISIZE; + char *SEQ; + char *QUAL; + + int optSize; + OPT_FIELDS *optFields; +} SAM; + +int initOutput(char *fileName, int compressed); +void (*finalizeOutput)(); +void (*output)(SAM map); + + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrsfast-2.3.0.2/Reads.c Tue Feb 21 10:39:28 2012 -0500 @@ -0,0 +1,559 @@ +/* + * Copyright (c) <2008 - 2009>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Author : Faraz Hach + * Email : fhach AT cs DOT sfu + * Last Update : 2009-12-08 + */ + + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include <zlib.h> +#include "Common.h" +#include "Reads.h" + + + +FILE *_r_fp1; +FILE *_r_fp2; +gzFile _r_gzfp1; +gzFile _r_gzfp2; +Read *_r_seq; +int _r_seqCnt; +int *_r_samplingLocs; + +/**********************************************/ +char *(*readFirstSeq)(char *); +char *(*readSecondSeq)(char *); +/**********************************************/ +char *readFirstSeqTXT( char *seq ) +{ + return fgets(seq, SEQ_MAX_LENGTH, _r_fp1); +} + +/**********************************************/ +char *readSecondSeqTXT( char *seq ) +{ + return fgets(seq, SEQ_MAX_LENGTH, _r_fp2); +} +/**********************************************/ +char *readFirstSeqGZ( char *seq ) +{ + return gzgets(_r_gzfp1, seq, SEQ_MAX_LENGTH); +} + +/**********************************************/ +char *readSecondSeqGZ( char *seq ) +{ + return gzgets(_r_gzfp2, seq, SEQ_MAX_LENGTH); +} +/**********************************************/ +int readAllReads(char *fileName1, + char *fileName2, + int compressed, + unsigned char *fastq, + unsigned char pairedEnd, + Read **seqList, + unsigned int *seqListSize) +{ + double startTime=getTime(); + + char seq1[SEQ_MAX_LENGTH]; + char rseq1[SEQ_MAX_LENGTH]; + char name1[SEQ_MAX_LENGTH]; + char qual1[SEQ_MAX_LENGTH]; + char seq2[SEQ_MAX_LENGTH]; + char rseq2[SEQ_MAX_LENGTH]; + char name2[SEQ_MAX_LENGTH]; + char qual2[SEQ_MAX_LENGTH]; + + char dummy[SEQ_MAX_LENGTH]; + char ch; + int err1, err2; + int nCnt; + int discarded = 0; + int seqCnt = 0; + int maxCnt = 0; + int i; + Read *list = NULL; + + + if (!compressed) + { + _r_fp1 = fileOpen( fileName1, "r"); + + if (_r_fp1 == NULL) + { + return 0; + } + + ch = fgetc(_r_fp1); + + if ( pairedEnd && fileName2 != NULL ) + { + _r_fp2 = fileOpen ( fileName2, "r" ); + if (_r_fp2 == NULL) + { + return 0; + } + } + else + { + _r_fp2 = _r_fp1; + } + + readFirstSeq = &readFirstSeqTXT; + readSecondSeq = &readSecondSeqTXT; + } + else + { + + _r_gzfp1 = fileOpenGZ (fileName1, "r"); + + if (_r_gzfp1 == NULL) + { + return 0; + } + + ch = gzgetc(_r_gzfp1); + + if ( pairedEnd && fileName2 != NULL ) + { + _r_fp2 = fileOpenGZ ( fileName2, "r" ); + if (_r_fp2 == NULL) + { + return 0; + } + } + else + { + _r_fp2 = _r_fp1; + } + + readFirstSeq = &readFirstSeqGZ; + readSecondSeq = &readSecondSeqGZ; + } + + if (ch == '>') + *fastq = 0; + else + *fastq = 1; + + // Counting the number of lines in the file + while (readFirstSeq(dummy)) maxCnt++; + + if (!compressed) + { + rewind(_r_fp1); + } + else + { + gzrewind(_r_gzfp1); + } + + // Calculating the Maximum # of sequences + if (*fastq) + { + maxCnt /= 4; + } + else + { + maxCnt /= 2; + } + + + + if (pairedEnd && fileName2 != NULL ) + maxCnt *= 2; + + list = getMem(sizeof(Read)*maxCnt); + + while( readFirstSeq(name1) ) + { + err1 = 0; + err2 = 0; + readFirstSeq(seq1); + name1[strlen(name1)-1] = '\0'; + for (i=0; i<strlen(name1);i++) + { + if (name1[i] == ' ') + { + name1[i] = '\0'; + break; + } + + } + + if ( *fastq ) + { + readFirstSeq(dummy); + readFirstSeq(qual1); + qual1[strlen(qual1)-1] = '\0'; + } + else + { + sprintf(qual1, "*"); + } + + + // Cropping + if (cropSize > 0) + { + seq1[cropSize] = '\0'; + if ( *fastq ) + qual1[cropSize] = '\0'; + } + + + nCnt = 0; + for (i=0; i<strlen(seq1); i++) + { + seq1[i] = toupper (seq1[i]); + if (seq1[i] == 'N') + { + nCnt++; + } + else if (isspace(seq1[i])) + { + + seq1[i] = '\0'; + break; + } + } + + if (nCnt > errThreshold) + { + err1 = 1; + } + + // Reading the second seq of pair-ends + if (pairedEnd) + { + readSecondSeq(name2); + readSecondSeq(seq2); + name2[strlen(name2)-1] = '\0'; + for (i=0; i<strlen(name2);i++) + { + if (name2[i] == ' ') + { + name2[i] = '\0'; + break; + } + + } + + if ( *fastq ) + { + readSecondSeq(dummy); + readSecondSeq(qual2); + + qual2[strlen(qual2)-1] = '\0'; + } + else + { + sprintf(qual2, "*"); + } + + + // Cropping + if (cropSize > 0) + { + seq2[cropSize] = '\0'; + if ( *fastq ) + qual2[cropSize] = '\0'; + } + + + nCnt = 0; + for (i=0; i<strlen(seq2); i++) + { + seq2[i] = toupper (seq2[i]); + if (seq2[i] == 'N') + { + nCnt++; + + } + else if (isspace(seq2[i])) + { + seq2[i] = '\0'; + } + } + if (nCnt > errThreshold) + { + err2 = 1; + } + } + + if (!pairedEnd && !err1) + { + + int _mtmp = strlen(seq1); + list[seqCnt].hits = getMem (1+3*_mtmp+3+strlen(name1)+1); + list[seqCnt].seq = list[seqCnt].hits + 1; + list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; + list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; + list[seqCnt].name = list[seqCnt].qual + _mtmp+1; + + + reverseComplete(seq1, rseq1, _mtmp); + rseq1[_mtmp] = '\0'; + int i; + + list[seqCnt].hits[0] = 0; + + for (i=0; i<=_mtmp; i++) + { + list[seqCnt].seq[i] = seq1[i]; + list[seqCnt].rseq[i] = rseq1[i] ; + list[seqCnt].qual[i] = qual1[i]; + } + sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0'); + + seqCnt++; + + } + else if (pairedEnd && !err1 && !err2) + { + // Naming Conventions X/1, X/2 OR X + int tmplen = strlen(name1); + if (strcmp(name1, name2) != 0) + { + tmplen = strlen(name1)-2; + } + + if (strcmp(name1, "@IL11_266:2:1:922:509/1") == 0) + { + fprintf(stdout, "%d\n", seqCnt); + } + //first seq + int _mtmp = strlen(seq1); + list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1); + list[seqCnt].seq = list[seqCnt].hits + 1; + list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; + list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; + list[seqCnt].name = list[seqCnt].qual + _mtmp+1; + + reverseComplete(seq1, rseq1, _mtmp); + rseq1[_mtmp] = '\0'; + int i; + + list[seqCnt].hits[0] = 0; + + for (i=0; i<=_mtmp; i++) + { + list[seqCnt].seq[i] = seq1[i]; + list[seqCnt].rseq[i] = rseq1[i] ; + list[seqCnt].qual[i] = qual1[i]; + } + + + name1[tmplen]='\0'; + sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0'); + + + seqCnt++; + + //second seq + list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1); + list[seqCnt].seq = list[seqCnt].hits + 1; + list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; + list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; + list[seqCnt].name = list[seqCnt].qual + _mtmp+1; + + reverseComplete(seq2, rseq2, _mtmp); + rseq2[_mtmp] = '\0'; + + list[seqCnt].hits[0] = 0; + + for (i=0; i<=_mtmp; i++) + { + list[seqCnt].seq[i] = seq2[i]; + list[seqCnt].rseq[i] = rseq2[i] ; + list[seqCnt].qual[i] = qual2[i]; + } + + + name2[tmplen]='\0'; + sprintf(list[seqCnt].name,"%s%c", ((char*)name2)+1,'\0'); + + + seqCnt++; + + } + else + { + discarded++; + } + } + + if (seqCnt > 0) + { + QUAL_LENGTH = SEQ_LENGTH = strlen(list[0].seq); + if (! *fastq) + { + QUAL_LENGTH = 1; + } + //fprintf(stderr, "%d %d\n", SEQ_LENGTH, QUAL_LENGTH); + } + else + { + fprintf(stdout, "ERR: No reads can be found for mapping\n"); + return 0; + } + + + if (pairedEnd) + { +// seqCnt /= 2; + } + + + // Closing Files + if (!compressed) + { + fclose(_r_fp1); + if ( pairedEnd && fileName2 != NULL ) + { + fclose(_r_fp2); + } + } + else + { + gzclose(_r_gzfp1); + if ( pairedEnd && fileName2 != NULL) + { + gzclose(_r_fp2); + } + } + + *seqList = list; + *seqListSize = seqCnt; + + _r_seq = list; + _r_seqCnt = seqCnt; + + fprintf(stdout, "%d sequences are read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime()-startTime), discarded, getMemUsage()); + //totalLoadingTime+=getTime()-startTime; + + return 1; +} +/**********************************************/ +void loadSamplingLocations(int **samplingLocs, int * samplingLocsSize) +{ + int i; + int samLocsSize = errThreshold + 1; + int *samLocs = getMem(sizeof(int)*samLocsSize); + + for (i=0; i<samLocsSize; i++) + { + samLocs[i] = (SEQ_LENGTH / samLocsSize) *i; + if ( samLocs[i] + WINDOW_SIZE > SEQ_LENGTH) + samLocs[i] = SEQ_LENGTH - WINDOW_SIZE; + } + + // Outputing the sampling locations + +/* int j; + for (i=0; i<SEQ_LENGTH; i++) + { + fprintf(stdout, "-"); + } + fprintf(stdout, "\n"); + + for ( i=0; i<samLocsSize; i++ ) + { + for ( j=0; j<samLocs[i]; j++ ) + { + fprintf(stdout," "); + } + for (j=0; j<WINDOW_SIZE; j++) + { + fprintf(stdout,"+"); + } + fprintf(stdout, "\n"); + fflush(stdout); + } + for ( i=0; i<SEQ_LENGTH; i++ ) + { + fprintf(stdout, "-"); + } + fprintf(stdout, "\n");*/ + *samplingLocs = samLocs; + *samplingLocsSize = samLocsSize; + _r_samplingLocs = samLocs; +} + +void finalizeReads(char *fileName) +{ + FILE *fp1=NULL; + + if (fileName != NULL) + { + fp1 = fileOpen(fileName, "w"); + } + if (pairedEndMode) + _r_seqCnt /=2; + + int i=0; + for (i = 0; i < _r_seqCnt; i++) + { + if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && strcmp(_r_seq[2*i].qual,"*")!=0) + { + fprintf(fp1,"@%s/1\n%s\n+\n%s\n@%s/2\n%s\n+\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].qual, _r_seq[i*2].name, _r_seq[i*2+1].seq, _r_seq[i*2+1].qual); + } + else if (pairedEndMode && _r_seq[2*i].hits[0] == 0) + { + fprintf(fp1, ">%s/1\n%s\n>%s/2\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq); + } + else if (_r_seq[i].hits[0] == 0 && strcmp(_r_seq[i].qual, "*")!=0) + { + fprintf(fp1,"@%s\n%s\n+\n%s\n", _r_seq[i].name, _r_seq[i].seq, _r_seq[i].qual); + } + else if (_r_seq[i].hits[0] == 0) + { + fprintf(fp1,">%s\n%s\n", _r_seq[i].name, _r_seq[i].seq); + } + } + + fclose(fp1); + if (pairedEndMode) + _r_seqCnt *= 2; + + for (i = 0; i < _r_seqCnt; i++) + { + freeMem(_r_seq[i].hits,0); + } + + + freeMem(_r_seq,0); + freeMem(_r_samplingLocs,0); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrsfast-2.3.0.2/Reads.h Tue Feb 21 10:39:28 2012 -0500 @@ -0,0 +1,52 @@ +/* + * Copyright (c) <2008 - 2009>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Author : Faraz Hach + * Email : fhach AT cs DOT sfu + * Last Update : 2009-12-08 + */ + + +#ifndef __READ__ +#define __READ__ + +typedef struct +{ + char *name; + char *seq; + char *rseq; + char *qual; + char *hits; +} Read; + +int readAllReads(char *fileName1, char *fileName2, int compressed, unsigned char *fastq, unsigned char pe, Read **seqList, unsigned int *seqListSize); +void loadSamplingLocations(int **samplingLocs, int *samplingLocsSize); +void finalizeReads(char *fileName); +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrsfast-2.3.0.2/RefGenome.c Tue Feb 21 10:39:28 2012 -0500 @@ -0,0 +1,161 @@ +/* + * Copyright (c) <2008 - 2009>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Author : Faraz Hach + * Email : fhach AT cs DOT sfu + * Last Update : 2009-12-08 + */ + + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include "Common.h" +#include "RefGenome.h" + +FILE *_rg_fp; +char *_rg_gen; +char *_rg_name; +int _rg_offset; +int _rg_contGen; + +/**********************************************/ +int initLoadingRefGenome(char *fileName) +{ + char ch; + _rg_fp = fileOpen (fileName, "r"); + if (fscanf(_rg_fp, "%c", &ch)) + { + if (ch == '>') + { + _rg_contGen = 0; + _rg_offset = 0; + _rg_gen = getMem(CONTIG_MAX_SIZE); + _rg_name = getMem(CONTIG_NAME_SIZE); + return 1; + } + } + return 0; +} +/**********************************************/ +void finalizeLoadingRefGenome() +{ + freeMem(_rg_gen, CONTIG_MAX_SIZE); + freeMem(_rg_name, CONTIG_NAME_SIZE); + fclose(_rg_fp); +} +/**********************************************/ +int loadRefGenome(char **refGen, char **refGenName, int *refGenOff) +{ + char ch; + int i; + int returnVal = 0; + int actualSize=0; + int size; + char *tmp; + + // New Conting + if (!_rg_contGen) + { + size = 0; + tmp = fgets(_rg_name, SEQ_MAX_LENGTH, _rg_fp); + int k; + for (k=0; k<strlen(_rg_name);k++) + { + if (_rg_name[k] == ' ') + { + _rg_name[k]='\0'; + break; + } + } + + } + else + { + size=strlen(_rg_gen); + for( i = 0 ; i < CONTIG_OVERLAP ; i++ ) + { + + _rg_gen[i] = _rg_gen[size-CONTIG_OVERLAP+i]; + if (_rg_gen[i] != 'N') + actualSize++; + } + size = CONTIG_OVERLAP; + } + while( fscanf(_rg_fp, "%c", &ch) > 0 ) + { + if (ch == '>') + { + _rg_contGen = 0; + returnVal = 1; + break; + } + else if (!isspace(ch)) + { + ch = toupper(ch); + _rg_gen[size++] = ch; + if (ch != 'N') + { + actualSize++; + } + if (actualSize == CONTIG_SIZE || size == CONTIG_MAX_SIZE) + { + _rg_contGen = 1; + returnVal=1; + break; + } + } + + } + + _rg_gen[size] = '\0'; + for (i=strlen(_rg_name)-1; i >= 0; i--) + if (!isspace(_rg_name[i])) + break; + _rg_name[i+1] = '\0'; + + *refGenOff = _rg_offset; + *refGenName = _rg_name; + *refGen = _rg_gen; + + if (_rg_contGen == 1) + { + _rg_offset += size-CONTIG_OVERLAP; + } + else + { + _rg_offset = 0; + } + + + return returnVal; +} +/**********************************************/
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrsfast-2.3.0.2/RefGenome.h Tue Feb 21 10:39:28 2012 -0500 @@ -0,0 +1,44 @@ +/* + * Copyright (c) <2008 - 2009>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Author : Faraz Hach + * Email : fhach AT cs DOT sfu + * Last Update : 2009-12-08 + */ + + +#ifndef _REF_GENOME_ +#define _REF_GENOME_ + +int initLoadingRefGenome(char *fileName); +void finalizeLoadingRefGenome(); +int loadRefGenome(char **refGen, char **refGenName, int *refGenOff); + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrsfast-2.3.0.2/baseFAST.c Tue Feb 21 10:39:28 2012 -0500 @@ -0,0 +1,398 @@ +/* + * Copyright (c) <2008 - 2009>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the name of the <ORGANIZATION> nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Author : Faraz Hach + * Email : fhach AT cs DOT sfu + * Last Update : 2009-02-01 + */ + + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include "Common.h" +#include "CommandLineParser.h" +#include "Reads.h" +#include "Output.h" +#include "HashTable.h" +#include "MrsFAST.h" + +char *versionNumber = "2.3"; // Current Version +unsigned char seqFastq; + +int main(int argc, char *argv[]) +{ + if (!parseCommandLine(argc, argv)) + return 1; + + configHashTable(); + /**************************************************** + * INDEXING + ***************************************************/ + if (indexingMode) + { + int i; + /******************************** + * Regulard Mode + ********************************/ + if (!bisulfiteMode) + { + configHashTable(); + for (i = 0; i < fileCnt; i++) + { + generateHashTable(fileName[i][0], fileName[i][1]); + } + } + else + /******************************** + * Bisulfite Mode + ********************************/ + { // TODO + } + } + /**************************************************** + * SEARCHING + ***************************************************/ + else + { + Read *seqList; + unsigned int seqListSize; + int fc; + int samplingLocsSize; + int *samplingLocs; + double totalLoadingTime = 0; + double totalMappingTime = 0; + double startTime; + double loadingTime; + double mappingTime; + double lstartTime; + double ppTime; + double tmpTime;; + char *prevGen = getMem(CONTIG_NAME_SIZE); + prevGen[0]='\0'; + char *curGen; + int flag; + double maxMem=0; + char fname1[FILE_NAME_LENGTH]; + char fname2[FILE_NAME_LENGTH]; + char fname3[FILE_NAME_LENGTH]; + char fname4[FILE_NAME_LENGTH]; + char fname5[FILE_NAME_LENGTH]; + // Loading Sequences & Sampling Locations + startTime = getTime(); + if (bisulfiteMode && !pairedEndMode && seqFile1 == NULL) + { + //TODO + } + else + { + if (!readAllReads(seqFile1, seqFile2, seqCompressed, &seqFastq, pairedEndMode, &seqList, &seqListSize)) + { + return 1; + } + } + + //loadSamplingLocations(&samplingLocs, &samplingLocsSize); + totalLoadingTime += getTime()-startTime; + + + + + if (pairedEndMode) + { + //Switching to Inferred Size + minPairEndedDistance = minPairEndedDistance - SEQ_LENGTH + 2; + maxPairEndedDistance = maxPairEndedDistance - SEQ_LENGTH + 2; + if (pairedEndDiscordantMode) + { + maxPairEndedDiscordantDistance = maxPairEndedDiscordantDistance - SEQ_LENGTH + 2; + minPairEndedDiscordantDistance = minPairEndedDiscordantDistance - SEQ_LENGTH + 2; + } + + /* The size between the ends; + minPairEndedDistance = minPairEndedDistance + SEQ_LENGTH + 1; + maxPairEndedDistance = maxPairEndedDistance + SEQ_LENGTH + 1; + if (pairedEndDiscordantMode) + { + maxPairEndedDiscordantDistance = maxPairEndedDiscordantDistance + SEQ_LENGTH + 1; + minPairEndedDiscordantDistance = minPairEndedDiscordantDistance + SEQ_LENGTH + 1; + }*/ + sprintf(fname1, "%s__%s__1", mappingOutputPath, mappingOutput); + sprintf(fname2, "%s__%s__2", mappingOutputPath, mappingOutput); + sprintf(fname3, "%s__%s__disc", mappingOutputPath, mappingOutput); + sprintf(fname4, "%s__%s__oea1", mappingOutputPath, mappingOutput); + sprintf(fname5, "%s__%s__oea2", mappingOutputPath, mappingOutput); + unlink(fname1); + unlink(fname2); + unlink(fname3); + unlink(fname4); + unlink(fname5); + } + + // Preparing output + initOutput(mappingOutput, outCompressed); + + fprintf(stdout, "-----------------------------------------------------------------------------------------------------------\n"); + fprintf(stdout, "| %15s | %15s | %15s | %15s | %15s %15s |\n","Genome Name","Loading Time", "Mapping Time", "Memory Usage(M)","Total Mappings","Mapped reads"); + fprintf(stdout, "-----------------------------------------------------------------------------------------------------------\n"); + + /******************************** + * Regular Mode + ********************************/ + if (!bisulfiteMode) + { + if (!pairedEndMode) + { + for (fc = 0; fc <fileCnt; fc++) + { + if (!initLoadingHashTable(fileName[fc][1])) + { + return 1; + } + + loadSamplingLocations(&samplingLocs, &samplingLocsSize); + + mappingTime = 0; + loadingTime = 0; + prevGen[0] = '\0'; + flag = 1; + + do + { + flag = loadHashTable ( &tmpTime ); // Reading a fragment + curGen = getRefGenomeName(); + + // First Time + if (flag && prevGen[0]== '\0') + { + sprintf(prevGen, "%s", curGen); + } + + if ( !flag || strcmp(prevGen, curGen)!=0) + { + + fprintf(stdout, "| %15s | %15.2f | %15.2f | %15.2f | %15lld %15lld |\n", + prevGen,loadingTime, mappingTime, maxMem, mappingCnt , mappedSeqCnt); + fflush(stdout); + + totalMappingTime += mappingTime; + totalLoadingTime += loadingTime; + + loadingTime = 0; + mappingTime = 0; + maxMem = 0; + + if (!flag) + { + break; + } + } + else if (progressRep && mappingTime != 0) + { + fprintf(stdout, "| %15s | %15.2f | %15.2f | %15.2f | %15lld %15lld |\n", + prevGen,loadingTime, mappingTime, maxMem, mappingCnt , mappedSeqCnt); + fflush(stdout); + } + + sprintf(prevGen, "%s", curGen); + + loadingTime += tmpTime; + lstartTime = getTime(); + + initFAST(seqList, seqListSize, samplingLocs, samplingLocsSize, fileName[fc][0]); + + mapSingleEndSeq(); + + mappingTime += getTime() - lstartTime; + if (maxMem < getMemUsage()) + { + maxMem = getMemUsage(); + } + } while (flag); + + } // end for; + finalizeFAST(); + finalizeLoadingHashTable(); + + } + // Pairedend Mapping Mode + else + { + + for (fc = 0; fc <fileCnt; fc++) + { + if (!initLoadingHashTable(fileName[fc][1])) + { + return 1; + } + + + loadSamplingLocations(&samplingLocs, &samplingLocsSize); + + mappingTime = 0; + loadingTime = 0; + prevGen[0] = '\0'; + flag = 1; + + do + { + flag = loadHashTable ( &tmpTime ); // Reading a fragment + curGen = getRefGenomeName(); + + // First Time + if (flag && prevGen[0]== '\0') + { + sprintf(prevGen, "%s", curGen); + } + + if ( !flag || strcmp(prevGen, curGen)!=0) + { + + // DISCORDANT + lstartTime = getTime(); + outputPairedEnd(); + mappingTime += getTime() - lstartTime; + //DISCORDANT + + fprintf(stdout, "| %15s | %15.2f | %15.2f | %15.2f | %15lld %15lld |\n", + prevGen,loadingTime, mappingTime, maxMem, mappingCnt , mappedSeqCnt); + fflush(stdout); + + totalMappingTime += mappingTime; + totalLoadingTime += loadingTime; + + loadingTime = 0; + mappingTime = 0; + maxMem = 0; + + if (!flag) + { + break; + } + } + else if (progressRep && mappingTime != 0) + { + fprintf(stdout, "| %15s | %15.2f | %15.2f | %15.2f | %15lld %15lld |\n", + prevGen,loadingTime, mappingTime, maxMem, mappingCnt , mappedSeqCnt); + fflush(stdout); + } + + sprintf(prevGen, "%s", curGen); + + loadingTime += tmpTime; + lstartTime = getTime(); + + initFAST(seqList, seqListSize, samplingLocs, samplingLocsSize, fileName[fc][0]); + + mapPairedEndSeq(); + + mappingTime += getTime() - lstartTime; + if (maxMem < getMemUsage()) + { + maxMem = getMemUsage(); + } + } while (flag); + + } // end for; + + finalizeLoadingHashTable(); + if (pairedEndDiscordantMode) + { + lstartTime = getTime(); + outputPairedEndDiscPP(); + ppTime = getTime() - lstartTime; + } + finalizeFAST(); + } + } + /******************************** + * Bisulfite Mode + ********************************/ + { + //TODO + } + + + finalizeOutput(); + + fprintf(stdout, "-----------------------------------------------------------------------------------------------------------\n"); + fprintf(stdout, "%19s%16.2f%18.2f\n\n", "Total:",totalLoadingTime, totalMappingTime); + if (pairedEndDiscordantMode) + fprintf(stdout, "Post Processing Time: %18.2f \n", ppTime); + fprintf(stdout, "%-30s%10.2f\n","Total Time:", totalMappingTime+totalLoadingTime); + fprintf(stdout, "%-30s%10d\n","Total No. of Reads:", seqListSize); + fprintf(stdout, "%-30s%10lld\n","Total No. of Mappings:", mappingCnt); + fprintf(stdout, "%-30s%10.0f\n\n","Avg No. of locations verified:", ceil((float)verificationCnt/seqListSize)); + + int cof = (pairedEndMode)?2:1; + + if (progressRep && maxHits != 0) + { + int frequency[maxHits+1]; + int i; + for ( i=0 ; i <= maxHits; i++) + { + frequency[i] = 0; + } + + + for (fc = 0; fc < seqListSize; fc++) + { + frequency[*(seqList[fc*cof].hits)]++; + } + frequency[maxHits] = completedSeqCnt; + for ( i=0 ; i <= maxHits; i++) + { + fprintf(stdout, "%-30s%10d%10d%10.2f%%\n","Reads Mapped to ", i, frequency[i], 100*(float)frequency[i]/(float)seqListSize); + } + } + + + if (strcmp(unmappedOutput, "") == 0) + { + char fn[strlen(mappingOutputPath) + strlen(mappingOutput) + 6 ]; + sprintf(fn, "%s%s.nohit", mappingOutputPath, mappingOutput ); + finalizeReads(fn); + } + else + { + finalizeReads(unmappedOutput); + } + + + + freeMem(prevGen, CONTIG_NAME_SIZE); + } + + + + return 1; +}