view mrfast-2.1.0.5/baseFAST.c @ 1:d4054b05b015 default tip

Version update to 2.1.0.5
author calkan
date Fri, 09 Mar 2012 07:35:51 -0500
parents
children
line wrap: on
line source

/*
 * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list
 * of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright notice, this
 *   list of conditions and the following disclaimer in the documentation and/or other
 *   materials provided with the distribution.
 * - Neither the names of the University of Washington, Simon Fraser University, 
 *   nor the names of its contributors may be
 *   used to endorse or promote products derived from this software without specific
 *   prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
  Authors: 
      Farhad Hormozdiari
      Faraz Hach
      Can Alkan
  Emails:
      farhadh AT uw DOT edu
      fhach AT cs DOT sfu DOT ca
      calkan AT uw DOT edu
*/





#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "Common.h"
#include "CommandLineParser.h"
#include "Reads.h"
#include "Output.h"
#include "HashTable.h"
#include "MrFAST.h"

char 				*versionNumber = "2.1";			// Current Version
unsigned char		seqFastq;

int main(int argc, char *argv[])
{
	if (!parseCommandLine(argc, argv))
		return 1;

	configHashTable();
	/****************************************************
	 * INDEXING
	 ***************************************************/
	if (indexingMode)
	{
		int i;
		/********************************
		 * Regular Mode
		 ********************************/
		configHashTable();
		for (i = 0; i < fileCnt; i++)
		{
			generateHashTable(fileName[i][0], fileName[i][1]);
		}
	}
	/****************************************************
	 * SEARCHING
	 ***************************************************/
	else
	{
		Read *seqList;
		unsigned int seqListSize;
		int fc;
		int samplingLocsSize;
		int *samplingLocs;
		double totalLoadingTime = 0;
		double totalMappingTime = 0;
		double startTime;
		double loadingTime;
		double mappingTime;
		double lstartTime;
		double ppTime = 0.0;
		double tmpTime;;
		char *prevGen = getMem(CONTIG_NAME_SIZE);
		prevGen[0]='\0';
		char *curGen;
		int	flag;
		double maxMem=0;
		char fname1[FILE_NAME_LENGTH];
		char fname2[FILE_NAME_LENGTH];
		char fname3[FILE_NAME_LENGTH];
		char fname4[FILE_NAME_LENGTH];
		char fname5[FILE_NAME_LENGTH];
	
		char outputFileName[FILE_NAME_LENGTH];
		// Loading Sequences & Sampling Locations
		startTime = getTime();
		if (!readAllReads(seqFile1, seqFile2, seqCompressed, &seqFastq, pairedEndMode, &seqList, &seqListSize))
		{
			return 1;
		}

		loadSamplingLocations(&samplingLocs, &samplingLocsSize);
		totalLoadingTime += getTime()-startTime;

		if (pairedEndMode)
		{
			//Switching to Inferred Size 
			minPairEndedDistance = minPairEndedDistance - SEQ_LENGTH + 2;
			maxPairEndedDistance = maxPairEndedDistance - SEQ_LENGTH + 2;
			if (pairedEndDiscordantMode)
			{
				maxPairEndedDiscordantDistance = maxPairEndedDiscordantDistance - SEQ_LENGTH + 2;
				minPairEndedDiscordantDistance = minPairEndedDiscordantDistance - SEQ_LENGTH + 2;
			}
			
			sprintf(fname1, "__%s__1", mappingOutput);
			sprintf(fname2, "__%s__2", mappingOutput);
			sprintf(fname3, "__%s__disc", mappingOutput);
			sprintf(fname4, "__%s__oea1", mappingOutput);
			sprintf(fname5, "__%s__oea2", mappingOutput);
			unlink(fname1);
			unlink(fname2);
			unlink(fname3);
			unlink(fname4);
			unlink(fname5);
		}

		sprintf(outputFileName, "%s%s",mappingOutputPath , mappingOutput);
		// Preparing output
		initOutput(outputFileName, outCompressed);

		fprintf(stdout, "-----------------------------------------------------------------------------------------------------------\n");
		fprintf(stdout, "| %15s | %15s | %15s | %15s | %15s %15s |\n","Genome Name","Loading Time", "Mapping Time", "Memory Usage(M)","Total Mappings","Mapped reads");
		fprintf(stdout, "-----------------------------------------------------------------------------------------------------------\n");

		/********************************
		 * Regular Mode
		 ********************************/
			if (!pairedEndMode)
			{
				initLookUpTable();
				if(bestMode)
                                        initBestMapping(seqListSize);

				for (fc = 0; fc <fileCnt; fc++)
				{
					if (!initLoadingHashTable(fileName[fc][1]))
					{
						return 1;
					}
							
					mappingTime = 0;
					loadingTime = 0;
					prevGen[0] = '\0';
					flag = 1;

					do
					{
						flag = loadHashTable ( &tmpTime, errThreshold);  			// Reading a fragment
						curGen = getRefGenomeName();

						// First Time
						if (flag && prevGen[0]== '\0')
						{
							sprintf(prevGen, "%s", curGen);
						}

						if ( !flag || strcmp(prevGen, curGen)!=0)
						{

							fprintf(stdout, "| %15s | %15.2f | %15.2f | %15.2f | %15lld %15lld |\n",
									prevGen,loadingTime, mappingTime, maxMem, mappingCnt , mappedSeqCnt);
							fflush(stdout);

							totalMappingTime += mappingTime;
							totalLoadingTime += loadingTime;

							loadingTime = 0;
							mappingTime = 0;
							maxMem = 0;

							if (!flag)
							{
								break;
							}
						}
						else if (progressRep && mappingTime != 0)
						{
							fprintf(stdout, "| %15s | %15.2f | %15.2f | %15.2f | %15lld %15lld |\n",
									prevGen,loadingTime, mappingTime, maxMem, mappingCnt , mappedSeqCnt);
							fflush(stdout);
						}

						sprintf(prevGen, "%s", curGen);

						loadingTime += tmpTime;
//						lstartTime = getTime();

						initFAST(seqList, seqListSize, samplingLocs, samplingLocsSize, fileName[fc][0]);

						lstartTime = getTime();

								
						mapAllSingleEndSeq();

		
						
						mappingTime += getTime() - lstartTime;
						if (maxMem < getMemUsage())
						{
							maxMem = getMemUsage();					 
						}
					} while (flag);

				} // end for;
				if(bestMode)
					finalizeBestSingleMapping();
				finalizeFAST();
				finalizeLoadingHashTable();

			}
			// Pairedend Mapping Mode
			else
			{
				initLookUpTable();
				if(pairedEndMode)
					initBestConcordantDiscordant(seqListSize);
				for (fc = 0; fc < fileCnt; fc++)
				{
					if (!initLoadingHashTable(fileName[fc][1]))
					{
						return 1;
					}
					mappingTime = 0;
					loadingTime = 0;
					prevGen[0] = '\0';
					flag = 1;

					do
					{
						flag = loadHashTable ( &tmpTime , errThreshold);  			// Reading a fragment
						curGen = getRefGenomeName();

						// First Time
						if (flag && prevGen[0]== '\0')
						{
							sprintf(prevGen, "%s", curGen);
						}

						if ( !flag || strcmp(prevGen, curGen)!=0)
						{

							// DISCORDANT
							lstartTime = getTime();					
							outputPairedEnd();
							mappingTime += getTime() - lstartTime;
							//DISCORDANT			

							fprintf(stdout, "| %15s | %15.2f | %15.2f | %15.2f | %15lld %15lld |\n",
									prevGen,loadingTime, mappingTime, maxMem, mappingCnt , mappedSeqCnt);
							fflush(stdout);

							totalMappingTime += mappingTime;
							totalLoadingTime += loadingTime;

							loadingTime = 0;
							mappingTime = 0;
							maxMem = 0;

							if (!flag)
							{
								break;
							}
						}
						else if (progressRep && mappingTime != 0)
						{
							fprintf(stdout, "| %15s | %15.2f | %15.2f | %15.2f | %15lld %15lld |\n",
									prevGen,loadingTime, mappingTime, maxMem, mappingCnt , mappedSeqCnt);
							fflush(stdout);
						}

						sprintf(prevGen, "%s", curGen);

						loadingTime += tmpTime;
						lstartTime = getTime();
					
						initFAST(seqList, seqListSize, samplingLocs, samplingLocsSize, fileName[fc][0]);
						
						mapPairedEndSeq();
					
						mappingTime += getTime() - lstartTime;
						if (maxMem < getMemUsage())
						{
							maxMem = getMemUsage();
						}

					} while (flag);

				} // end for;
				
				if(pairedEndMode)
				{		
					sprintf(outputFileName, "%s%s",mappingOutputPath , mappingOutput);		
					finalizeOEAReads(outputFileName);			
					outputAllTransChromosal(transChromosal);
					finalizeBestConcordantDiscordant();
				}
			
				finalizeLoadingHashTable();
				
				if (pairedEndDiscordantMode)
				{
					lstartTime = getTime();							
					outputPairedEndDiscPP();
					ppTime = getTime() - lstartTime;
				}
				finalizeFAST();
			} //else

			finalizeOutput();

		fprintf(stdout, "-----------------------------------------------------------------------------------------------------------\n");
		fprintf(stdout, "%19s%16.2f%18.2f\n\n", "Total:",totalLoadingTime, totalMappingTime);
		if (pairedEndDiscordantMode)
			fprintf(stdout, "Post Processing Time: %18.2f \n", ppTime);
		fprintf(stdout, "%-30s%10.2f\n","Total Time:", totalMappingTime+totalLoadingTime);
		fprintf(stdout, "%-30s%10d\n","Total No. of Reads:", seqListSize);
		fprintf(stdout, "%-30s%10lld\n","Total No. of Mappings:", mappingCnt);
		fprintf(stdout, "%-30s%10.0f\n\n","Avg No. of locations verified:", ceil((float)verificationCnt/seqListSize));

		int cof = (pairedEndMode)?2:1;

		if (progressRep && maxHits != 0)
		{
			int frequency[maxHits+1];
			int i;
			for ( i=0 ; i <= maxHits; i++)
			{
				frequency[i] = 0;
			}


			for (fc = 0; fc < seqListSize; fc++)
			{
			  frequency[(int)(*(seqList[fc*cof].hits))]++;
			}
			frequency[maxHits] = completedSeqCnt;
			for ( i=0 ; i <= maxHits; i++)
			{
				fprintf(stdout, "%-30s%10d%10d%10.2f%%\n","Reads Mapped to ", i, frequency[i], 100*(float)frequency[i]/(float)seqListSize);
			}
		}


		finalizeReads(unmappedOutput);
		freeMem(prevGen, CONTIG_NAME_SIZE);
	}

	return 1;
}