Mercurial > repos > calkan > mrfast
diff mrfast-2.1.0.5/Reads.c @ 1:d4054b05b015 default tip
Version update to 2.1.0.5
author | calkan |
---|---|
date | Fri, 09 Mar 2012 07:35:51 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mrfast-2.1.0.5/Reads.c Fri Mar 09 07:35:51 2012 -0500 @@ -0,0 +1,798 @@ +/* + * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list + * of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright notice, this + * list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. + * - Neither the names of the University of Washington, Simon Fraser University, + * nor the names of its contributors may be + * used to endorse or promote products derived from this software without specific + * prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Authors: + Farhad Hormozdiari + Faraz Hach + Can Alkan + Emails: + farhadh AT uw DOT edu + fhach AT cs DOT sfu DOT ca + calkan AT uw DOT edu + */ + + + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include <zlib.h> +#include "Common.h" +#include "Reads.h" + +#define CHARCODE(a) (a=='A' ? 0 : (a=='C' ? 1 : (a=='G' ? 2 : (a=='T' ? 3 : 4)))) + +FILE *_r_fp1; +FILE *_r_fp2; +gzFile _r_gzfp1; +gzFile _r_gzfp2; +Read *_r_seq; +int _r_seqCnt; +int *_r_samplingLocs; + +/**********************************************/ +char *(*readFirstSeq)(char *); +char *(*readSecondSeq)(char *); +/**********************************************/ +char *readFirstSeqTXT( char *seq ) +{ + return fgets(seq, SEQ_MAX_LENGTH, _r_fp1); +} + +/**********************************************/ +char *readSecondSeqTXT( char *seq ) +{ + return fgets(seq, SEQ_MAX_LENGTH, _r_fp2); +} +/**********************************************/ +char *readFirstSeqGZ( char *seq ) +{ + return gzgets(_r_gzfp1, seq, SEQ_MAX_LENGTH); +} + +/**********************************************/ +char *readSecondSeqGZ( char *seq ) +{ + return gzgets(_r_gzfp2, seq, SEQ_MAX_LENGTH); +} +/**********************************************/ +int toCompareRead(const void * elem1, const void * elem2) +{ + return strcmp(((Read *)elem1)->seq, ((Read *)elem2)->seq); +} +/**********************************************/ +int readAllReads(char *fileName1, + char *fileName2, + int compressed, + unsigned char *fastq, + unsigned char pairedEnd, + Read **seqList, + unsigned int *seqListSize) +{ + double startTime=getTime(); + + char seq1[SEQ_MAX_LENGTH]; + char rseq1[SEQ_MAX_LENGTH]; + char name1[SEQ_MAX_LENGTH]; + char qual1[SEQ_MAX_LENGTH]; + char seq2[SEQ_MAX_LENGTH]; + char rseq2[SEQ_MAX_LENGTH]; + char name2[SEQ_MAX_LENGTH]; + char qual2[SEQ_MAX_LENGTH]; + + char dummy[SEQ_MAX_LENGTH]; + char ch; + int err1, err2; + int nCnt; + int discarded = 0; + int seqCnt = 0; + int maxCnt = 0; + int i; + Read *list = NULL; + + int clipped = 0; + + + if (!compressed) + { + _r_fp1 = fileOpen( fileName1, "r"); + + if (_r_fp1 == NULL) + { + return 0; + } + + ch = fgetc(_r_fp1); + + if ( pairedEnd && fileName2 != NULL ) + { + _r_fp2 = fileOpen ( fileName2, "r" ); + if (_r_fp2 == NULL) + { + return 0; + } + } + else + { + _r_fp2 = _r_fp1; + } + + readFirstSeq = &readFirstSeqTXT; + readSecondSeq = &readSecondSeqTXT; + } + else + { + + _r_gzfp1 = fileOpenGZ (fileName1, "r"); + + if (_r_gzfp1 == NULL) + { + return 0; + } + + ch = gzgetc(_r_gzfp1); + + if ( pairedEnd && fileName2 != NULL ) + { + _r_gzfp2 = fileOpenGZ ( fileName2, "r" ); + if (_r_gzfp2 == NULL) + { + return 0; + } + } + else + { + _r_gzfp2 = _r_gzfp1; + } + + readFirstSeq = &readFirstSeqGZ; + readSecondSeq = &readSecondSeqGZ; + } + + if (ch == '>') + *fastq = 0; + else + *fastq = 1; + + // Counting the number of lines in the file + while (readFirstSeq(dummy)) maxCnt++; + + if (!compressed) + { + rewind(_r_fp1); + } + else + { + gzrewind(_r_gzfp1); + } + + // Calculating the Maximum # of sequences + if (*fastq) + { + maxCnt /= 4; + } + else + { + maxCnt /= 2; + } + + if (pairedEnd && fileName2 != NULL ) + maxCnt *= 2; + + list = getMem(sizeof(Read)*maxCnt); + + while( readFirstSeq(name1) ) + { + err1 = 0; + err2 = 0; + readFirstSeq(seq1); + name1[strlen(name1)-1] = '\0'; + + if ( *fastq ) + { + readFirstSeq(dummy); + readFirstSeq(qual1); + qual1[strlen(qual1)-1] = '\0'; + } + else + { + sprintf(qual1, "*"); + } + + // Cropping + if (cropSize > 0) + { + seq1[cropSize] = '\0'; + if ( *fastq ) + qual1[cropSize] = '\0'; + } + + + nCnt = 0; + for (i=0; i<strlen(seq1); i++) + { + seq1[i] = toupper (seq1[i]); + if (seq1[i] == 'N') + { + nCnt++; + } + else if (isspace(seq1[i])) + { + + seq1[i] = '\0'; + break; + } + } + + if (nCnt > errThreshold) + { + err1 = 1; + } + + // Reading the second seq of pair-ends + if (pairedEnd) + { + readSecondSeq(name2); + readSecondSeq(seq2); + name2[strlen(name2)-1] = '\0'; + + + if ( *fastq ) + { + readSecondSeq(dummy); + readSecondSeq(qual2); + + qual2[strlen(qual2)-1] = '\0'; + } + else + { + sprintf(qual2, "*"); + } + + + // Cropping + if (cropSize > 0) + { + seq2[cropSize] = '\0'; + if ( *fastq ) + qual2[cropSize] = '\0'; + } + + + nCnt = 0; + for (i=0; i<strlen(seq2); i++) + { + seq2[i] = toupper (seq2[i]); + if (seq2[i] == 'N') + { + nCnt++; + + } + else if (isspace(seq2[i])) + { + seq2[i] = '\0'; + } + } + if (nCnt > errThreshold) + { + err2 = 1; + } + + + if (strlen(seq1) < strlen(seq2)) { + seq2[strlen(seq1)] = '\0'; + if ( *fastq ) + qual2[strlen(seq1)] = '\0'; + if (!clipped) clipped = 2; + } + else if (strlen(seq1) > strlen(seq2)){ + seq1[strlen(seq2)] = '\0'; + if ( *fastq ) + qual1[strlen(seq2)] = '\0'; + if (!clipped) clipped = 1; + } + + if (clipped == 1 || clipped == 2){ + fprintf(stdout, "[PE mode Warning] Sequence lengths are different, read #%d is clipped to match.\n", clipped); + clipped = 3; + } + + + + } + + if (!pairedEnd && !err1) + { + int _mtmp = strlen(seq1); + list[seqCnt].hits = getMem (1+3*_mtmp+3+strlen(name1)+1); + list[seqCnt].seq = list[seqCnt].hits + 1; + list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; + list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; + list[seqCnt].name = list[seqCnt].qual + _mtmp+1; + + list[seqCnt].hashValue = getMem(sizeof(short)*_mtmp); + list[seqCnt].rhashValue = getMem(sizeof(short)*_mtmp); + + list[seqCnt].readNumber = seqCnt; + + reverseComplete(seq1, rseq1, _mtmp); + rseq1[_mtmp] = '\0'; + int i; + + list[seqCnt].hits[0] = 0; + + for (i=0; i<=_mtmp; i++) + { + list[seqCnt].seq[i] = seq1[i]; + list[seqCnt].rseq[i] = rseq1[i] ; + list[seqCnt].qual[i] = qual1[i]; + } + + + //MAKE HASH VALUE + short code = 0; + + for(i=0; i < 4; i++) + code = code * 5 + CHARCODE(list[seqCnt].seq[i]); + list[seqCnt].hashValue[0] = code; + + + for(i = 1; i < _mtmp-3; i++) + { + list[seqCnt].hashValue[i] = (list[seqCnt].hashValue[i-1] - 125 * CHARCODE(seq1[i-1])) * 5 + CHARCODE(seq1[i+3]); + } + + + code = 0; + for(i=0; i < 4; i++) + code = code * 5 + CHARCODE(list[seqCnt].rseq[i]); + list[seqCnt].rhashValue[0] = code; + + + for(i = 1; i < _mtmp-3; i++) + { + list[seqCnt].rhashValue[i] = (list[seqCnt].rhashValue[i-1] - 125 * CHARCODE(rseq1[i-1])) * 5 + CHARCODE(rseq1[i+3]); + } + + int j = 0; + int tmpSize = _mtmp / (errThreshold+1); + + list[seqCnt].hashValSampleSize = getMem(sizeof(int)*(errThreshold+1)); + for(i=0; i < errThreshold+1; i++) + { + code = 0; + for(j = 0; j < tmpSize; j++) + { + code = code * 5 + CHARCODE(list[seqCnt].seq[i*tmpSize+j]); + } + list[seqCnt].hashValSampleSize[i] = code; + } + + sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0'); + + seqCnt++; + + } + else if (pairedEnd && !err1 && !err2) + { + + // Naming Conventions X/1, X/2 OR X + int tmplen = strlen(name1); + if (strcmp(name1, name2) != 0) + { + tmplen = strlen(name1)-2; + } + + //first seq + int _mtmp = strlen(seq1); + list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1); + list[seqCnt].seq = list[seqCnt].hits + 1; + list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; + list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; + list[seqCnt].name = list[seqCnt].qual + _mtmp+1; + + list[seqCnt].hashValue = getMem(sizeof(short)*_mtmp); + list[seqCnt].rhashValue = getMem(sizeof(short)*_mtmp); + list[seqCnt].readNumber = seqCnt; + + reverseComplete(seq1, rseq1, _mtmp); + rseq1[_mtmp] = '\0'; + int i; + + list[seqCnt].hits[0] = 0; + + for (i=0; i<=_mtmp; i++) + { + list[seqCnt].seq[i] = seq1[i]; + list[seqCnt].rseq[i] = rseq1[i] ; + list[seqCnt].qual[i] = qual1[i]; + } + + + name1[tmplen]='\0'; + + //MAKE HASH VALUE + short code = 0; + + for(i=0; i < 4; i++) + code = code * 5 + CHARCODE(list[seqCnt].seq[i]); + list[seqCnt].hashValue[0] = code; + + + for(i = 1; i < _mtmp-3; i++) + { + list[seqCnt].hashValue[i] = (list[seqCnt].hashValue[i-1] - 125 * CHARCODE(seq1[i-1])) * 5 + CHARCODE(seq1[i+3]); + } + + + code = 0; + for(i=0; i < 4; i++) + code = code * 5 + CHARCODE(list[seqCnt].rseq[i]); + list[seqCnt].rhashValue[0] = code; + + + for(i = 1; i < _mtmp-3; i++) + { + list[seqCnt].rhashValue[i] = (list[seqCnt].rhashValue[i-1] - 125 * CHARCODE(rseq1[i-1])) * 5 + CHARCODE(rseq1[i+3]); + } + + int j = 0; + int tmpSize = _mtmp / (errThreshold+1); + + list[seqCnt].hashValSampleSize = getMem(sizeof(int)*(errThreshold+1)); + for(i=0; i < errThreshold+1; i++) + { + code = 0; + for(j = 0; j < tmpSize; j++) + { + code = code * 5 + CHARCODE(list[seqCnt].seq[i*tmpSize+j]); + } + list[seqCnt].hashValSampleSize[i] = code; + } + + sprintf(list[seqCnt].name,"%s%c", ((char*)name1)+1,'\0'); + + + seqCnt++; + + //second seq + list[seqCnt].hits = getMem (1+3*_mtmp+3+tmplen+1); + list[seqCnt].seq = list[seqCnt].hits + 1; + list[seqCnt].rseq = list[seqCnt].seq + _mtmp+1; + list[seqCnt].qual = list[seqCnt].rseq + _mtmp+1; + list[seqCnt].name = list[seqCnt].qual + _mtmp+1; + + list[seqCnt].hashValue = getMem(sizeof(short)*_mtmp); + list[seqCnt].rhashValue = getMem(sizeof(short)*_mtmp); + list[seqCnt].readNumber = seqCnt; + + reverseComplete(seq2, rseq2, _mtmp); + rseq2[_mtmp] = '\0'; + + list[seqCnt].hits[0] = 0; + + for (i=0; i<=_mtmp; i++) + { + list[seqCnt].seq[i] = seq2[i]; + list[seqCnt].rseq[i] = rseq2[i] ; + list[seqCnt].qual[i] = qual2[i]; + } + + + name2[tmplen]='\0'; + + //MAKE HASH VALUE + code = 0; + + for(i=0; i < 4; i++) + code = code * 5 + CHARCODE(list[seqCnt].seq[i]); + list[seqCnt].hashValue[0] = code; + + + for(i = 1; i < _mtmp-3; i++) + { + list[seqCnt].hashValue[i] = (list[seqCnt].hashValue[i-1] - 125 * CHARCODE(seq1[i-1])) * 5 + CHARCODE(seq1[i+3]); + } + + + code = 0; + for(i=0; i < 4; i++) + code = code * 5 + CHARCODE(list[seqCnt].rseq[i]); + list[seqCnt].rhashValue[0] = code; + + + for(i = 1; i < _mtmp-3; i++) + { + list[seqCnt].rhashValue[i] = (list[seqCnt].rhashValue[i-1] - 125 * CHARCODE(rseq1[i-1])) * 5 + CHARCODE(rseq1[i+3]); + } + + j = 0; + tmpSize = _mtmp / (errThreshold+1); + + list[seqCnt].hashValSampleSize = getMem(sizeof(int)*(errThreshold+1)); + for(i=0; i < errThreshold+1; i++) + { + code = 0; + for(j = 0; j < tmpSize; j++) + { + code = code * 5 + CHARCODE(list[seqCnt].seq[i*tmpSize+j]); + } + list[seqCnt].hashValSampleSize[i] = code; + } + + sprintf(list[seqCnt].name,"%s%c", ((char*)name2)+1,'\0'); + + seqCnt++; + + } + else + { + discarded++; + } + } + + if (seqCnt > 0) + { + SEQ_LENGTH = strlen(list[0].seq); + } + else + { + fprintf(stdout, "ERROR: No reads can be found for mapping\n"); + return 0; + } + + + // Closing Files + if (!compressed) + { + fclose(_r_fp1); + if ( pairedEnd && fileName2 != NULL ) + { + fclose(_r_fp2); + } + } + else + { + gzclose(_r_gzfp1); + if ( pairedEnd && fileName2 != NULL) + { + gzclose(_r_fp2); + } + } + + //qsort(list, seqCnt, sizeof(Read), toCompareRead); + + adjustQual(list, seqCnt); + + *seqList = list; + *seqListSize = seqCnt; + + + _r_seq = list; + _r_seqCnt = seqCnt; + + if ( pairedEnd ) discarded *= 2; + + if (seqCnt>1) + fprintf(stdout, "%d sequences are read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime()-startTime), discarded, getMemUsage()); + else + fprintf(stdout, "%d sequence is read in %0.2f. (%d discarded) [Mem:%0.2f M]\n", seqCnt, (getTime()-startTime), discarded, getMemUsage()); + + + + return 1; +} +/**********************************************/ +void loadSamplingLocations(int **samplingLocs, int * samplingLocsSize) +{ + int i; + int samLocsSize = errThreshold + 1; + int *samLocs = getMem(sizeof(int)*samLocsSize); + + for (i=0; i<samLocsSize; i++) + { + samLocs[i] = (SEQ_LENGTH / samLocsSize) *i; + if ( samLocs[i] + WINDOW_SIZE > SEQ_LENGTH) + samLocs[i] = SEQ_LENGTH - WINDOW_SIZE; + } + + // Outputing the sampling locations + + /* + + int j; + for (i=0; i<SEQ_LENGTH; i++) + { + fprintf(stdout, "-"); + } + fprintf(stdout, "\n"); + + for ( i=0; i<samLocsSize; i++ ) + { + for ( j=0; j<samLocs[i]; j++ ) + { + fprintf(stdout," "); + } + for (j=0; j<WINDOW_SIZE; j++) + { + fprintf(stdout,"+"); + } + fprintf(stdout, "\n"); + fflush(stdout); + } + + + for ( i=0; i<SEQ_LENGTH; i++ ) + { + fprintf(stdout, "-"); + } + fprintf(stdout, "\n"); + + */ + + *samplingLocs = samLocs; + *samplingLocsSize = samLocsSize; + _r_samplingLocs = samLocs; +} + +void finalizeReads(char *fileName) +{ + FILE *fp1=NULL; + + if (fileName != NULL) + { + fp1 = fileOpen(fileName, "w"); + } + if (pairedEndMode) + _r_seqCnt /=2; + + int i=0; + for (i = 0; i < _r_seqCnt; i++) + { + if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] == 0 && strcmp(_r_seq[2*i].qual,"*")!=0) + { + fprintf(fp1,"@%s/1\n%s\n+\n%s\n@%s/2\n%s\n+\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].qual, _r_seq[i*2].name, _r_seq[i*2+1].seq, _r_seq[i*2+1].qual); + } + else if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] == 0) + { + fprintf(fp1, ">%s/1\n%s\n>%s/2\n%shits=%d\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq, _r_seq[2*i+1].hits[0]); + } + else if (!pairedEndMode && _r_seq[i].hits[0] == 0 && strcmp(_r_seq[i].qual, "*")!=0) + { + fprintf(fp1,"@%s\n%s\n+\n%s\n", _r_seq[i].name, _r_seq[i].seq, _r_seq[i].qual); + } + else if (!pairedEndMode && _r_seq[i].hits[0] == 0) + { + fprintf(fp1,">%s\n%s\n", _r_seq[i].name, _r_seq[i].seq); + } + } + + fclose(fp1); + if (pairedEndMode) + _r_seqCnt *= 2; + + for (i = 0; i < _r_seqCnt; i++) + { + freeMem(_r_seq[i].hits,0); + } + + + freeMem(_r_seq,0); + freeMem(_r_samplingLocs,0); +} + +void adjustQual(Read *list, int seqCnt){ + /* This function will automatically determine the phred_offset and readjust quality values if needed */ + int i,j,q, offset=64; + int len = strlen(list[0].qual); + + for (i=0; i<10000 && i<seqCnt; i++){ + for (j=0;j<len;j++){ + q = (int) list[i].qual[j] - offset; + if (q < 0){ + offset = 33; + break; + } + } + if (offset == 33) + break; + } + + if (offset == 64){ + fprintf(stdout, "[Quality Warning] Phred offset is 64. Readjusting to 33.\n"); + fflush(stdout); + for (i=0;i<seqCnt;i++){ + for (j=0;j<len;j++){ + list[i].qual[j] -= 31; + } + } + } +} + + + +/*void finalizeOEAReads(char *fileName) +{ + FILE *fp1=NULL; + FILE *fp2=NULL; + + //printf("OEA%s\n", fileName); + char fileNameOEA1[200]; + char fileNameOEA2[200]; + sprintf(fileNameOEA1, "%s_OEA1", fileName); + sprintf(fileNameOEA2, "%s_OEA2", fileName); + + + if (fileName != NULL) + { + fp1 = fileOpen(fileNameOEA1, "w"); + fp2 = fileOpen(fileNameOEA2, "w"); + } + if (pairedEndMode) + _r_seqCnt /=2; + + int i=0; + printf("%d\n", _r_seqCnt); + for (i = 0; i < _r_seqCnt; i++) + { + if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] != 0 && strcmp(_r_seq[2*i].qual, "*")==0) + { + fprintf(fp1,">%s/1\n%s\n>%s/2\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq); + } + else if (pairedEndMode && _r_seq[2*i].hits[0] != 0 && _r_seq[2*i+1].hits[0] == 0 && strcmp(_r_seq[2*i].qual, "*")==0) + { + fprintf(fp2, ">%s/1\n%s\n>%s/2\n%s\n", _r_seq[i*2].name, _r_seq[i*2].seq, _r_seq[i*2].name, _r_seq[i*2+1].seq); + } + else if (pairedEndMode && _r_seq[2*i].hits[0] == 0 && _r_seq[2*i+1].hits[0] != 0 && strcmp(_r_seq[2*i].qual, "*")!=0) + { + fprintf(fp1,"@%s\n%s\n+\n%s\n@%s\n%s\n+\n%s\n", _r_seq[2*i].name, + _r_seq[2*i].seq, + _r_seq[2*i].qual, + _r_seq[2*i+1].name, + _r_seq[2*i+1].seq, + _r_seq[2*i+1].qual); + } + else if ( pairedEndMode && _r_seq[2*i].hits[0] != 0 && _r_seq[2*i+1].hits[0] == 0 && strcmp(_r_seq[2*i].qual, "*")!=0 ) + { + fprintf(fp2,"@%s\n%s\n+\n%s\n@%s\n%s\n+\n%s\n", _r_seq[2*i].name, + _r_seq[2*i].seq, + _r_seq[2*i].qual, + _r_seq[2*i+1].name, + _r_seq[2*i+1].seq, + _r_seq[2*i+1].qual); + } + } + + fclose(fp1); + fclose(fp2); + +}*/