diff mrfast-2.1.0.4/CommandLineParser.c @ 0:7b3dc85dc7fd

Uploaded mrfast source tarball
author calkan
date Tue, 21 Feb 2012 10:29:47 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mrfast-2.1.0.4/CommandLineParser.c	Tue Feb 21 10:29:47 2012 -0500
@@ -0,0 +1,388 @@
+/*
+ * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this list
+ * of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or other
+ *   materials provided with the distribution.
+ * - Neither the names of the University of Washington, Simon Fraser University, 
+ *   nor the names of its contributors may be
+ *   used to endorse or promote products derived from this software without specific
+ *   prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+  Authors: 
+        Farhad Hormozdiari
+        Faraz Hach
+	Can Alkan
+  Emails: 
+        farhadh AT uw DOT edu
+        fhach AT cs DOT sfu DOT ca
+        calkan AT uw DOT edu
+*/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <string.h>
+#include <ctype.h>
+#include "Common.h"
+#include "CommandLineParser.h"
+
+int						uniqueMode=1;	/* starts at 1; no option in parseCommandLine changes it */
+int						indexingMode;	/* set to 1 by -i/--index */
+int						searchingMode;	/* set to 1 by -s/--search */
+int						pairedEndMode;	/* set to 1 by --pe */
+int						pairedEndDiscordantMode;	/* set to 1 by --discordant-vh */
+int						transChromosal=0;	/* set to 1 by --trans */
+int						pairedEndProfilingMode;	/* set to 1 by --profile */
+int						seqCompressed;	/* set to 1 by --seqcomp */
+int						outCompressed;	/* set to 1 by --outcomp */
+int						cropSize = 0;	/* -c/--crop */
+int						progressRep = 0;	/* set to 1 by --progress */
+int						minPairEndedDistance=-1;	/* -l/--min; forced to 0 by --profile and --discordant-vh */
+int						maxPairEndedDistance=-1;	/* -m/--max; forced to 300000000 by --profile and --discordant-vh */
+int						minPairEndedDiscordantDistance=-1;	/* receives the --min value in discordant mode */
+int						maxPairEndedDiscordantDistance=-1;	/* receives the --max value in discordant mode */
+int 					bestMode;	/* set to 1 by --best */
+int 					nosamMode;	/* set to 1 by --nosam */
+char					*seqFile1;	/* -x/--seq/--seq1 */
+char					*seqFile2;	/* -y/--seq2 */
+char					*mappingOutput = "output";	/* replaced by a getMem buffer filled by stripPath on -o */
+char					*mappingOutputPath = "";	/* replaced by a getMem buffer filled by stripPath on -o */
+char					*unmappedOutput = "unmapped";	/* -u */
+char					fileName[1000][2][FILE_NAME_LENGTH];	/* [n][0]: reference path, [n][1]: "<path>.index" */
+int						fileCnt;	/* number of fileName entries in use */
+int 					maxOEAOutput=1000;	/* -a/--maxoea */
+int 					maxDiscordantOutput=10000;	/* -d/--maxdis */
+unsigned char			errThreshold=2;	/* -e */
+unsigned char			maxHits=0;	/* -n */
+unsigned char			WINDOW_SIZE = 12;	/* -w/--ws; parseCommandLine rejects values outside 11..15 */
+unsigned int			CONTIG_SIZE;	/* 15000000 when indexing, 300000000 when searching */
+unsigned int			CONTIG_MAX_SIZE;	/* 40000000 when indexing, 300000000 when searching */
+
+void printHelp();	/* prints the usage text to stdout */
+
+/* Appends path and "<path>.index" to the fileName table. Returns 1 on success;
+   prints an error and returns 0 if the table is full or a name does not fit. */
+static int addReferenceFile (const char *path)
+{
+	int len = -1;
+
+	if (fileCnt < (int) (sizeof fileName / sizeof fileName[0]))
+		len = snprintf(fileName[fileCnt][1], FILE_NAME_LENGTH, "%s.index", path);
+	if (len < 0 || len >= FILE_NAME_LENGTH)
+	{
+		fprintf(stdout, "ERROR: Too many reference files or file name too long: %s\n", path);
+		return 0;
+	}
+	/* path is shorter than its ".index" form, so it is known to fit. */
+	snprintf(fileName[fileCnt][0], FILE_NAME_LENGTH, "%s", path);
+	fileCnt++;
+	return 1;
+}
+
+/*
+ * Parses argv into the option globals defined in this file and fills the
+ * fileName table from --index/--search (or, with -b, from the list file).
+ * Returns 1 when the options are usable; returns 0 after printing help or
+ * the version, and after printing an error message to stdout.
+ */
+int parseCommandLine (int argc, char *argv[])
+{
+	int o, index, i;
+	int windowSize = WINDOW_SIZE;	/* int copy, narrowed only after the range check */
+	char *fastaFile = NULL;
+	char *batchFile = NULL ;
+	int  batchMode = 0;
+	static struct option longOptions[] = 
+	{
+		{"pe",				no_argument,		&pairedEndMode,		1},
+		{"discordant-vh",	        no_argument,		&pairedEndDiscordantMode,	1},
+		{"trans",                     no_argument,            &transChromosal        ,       1},
+		{"profile",			no_argument, 		&pairedEndProfilingMode,	1},
+		{"seqcomp",			no_argument,		&seqCompressed,		1},
+		{"outcomp",			no_argument,		&outCompressed,		1},
+		{"progress",		no_argument,		&progressRep,		1},
+		{"best",		no_argument,		&bestMode,		1},
+		{"index",			required_argument,	0, 					'i'},
+		{"search",			required_argument,	0,					's'},
+		{"help",			no_argument,		0,					'h'},
+		{"version",			no_argument,		0,					'v'},
+		{"seq",				required_argument,	0,					'x'},
+		{"seq1",			required_argument,	0,					'x'},
+		{"seq2",			required_argument,	0,					'y'},
+		{"ws",				required_argument,  0,					'w'},
+		{"min",				required_argument,  0,					'l'},
+		{"max",				required_argument,  0,					'm'},
+		{"crop",			required_argument,  0,					'c'},
+		{"maxoea",            required_argument,  0,                  'a'},
+		{"maxdis",            required_argument,  0,                  'd'},
+		{"nosam",        no_argument,        &nosamMode,      1},
+		{0,  0,  0, 0},
+	};
+
+	while ( (o = getopt_long ( argc, argv, "bhvn:e:o:u:i:s:x:y:w:l:m:c:a:d:", longOptions, &index)) != -1 )
+	{
+		switch (o)
+		{
+			case 'a':
+				maxOEAOutput = atoi(optarg);
+				break;
+			case 'd':
+				maxDiscordantOutput = atoi(optarg);
+				break;
+			case 'i':
+				indexingMode = 1;
+				fastaFile = optarg;
+				break;
+			case 's':
+				searchingMode = 1;
+				fastaFile = optarg;
+				break;
+			case 'b':
+				batchMode = 1;
+				break;
+			case 'c': 
+				cropSize = atoi(optarg);
+				break;
+			case 'w':
+				/* Kept as int: WINDOW_SIZE is an unsigned char, so 268 would wrap to 12. */
+				windowSize = atoi(optarg);
+				break;
+			case 'x':
+				seqFile1 = optarg;
+				break;
+			case 'y':
+				seqFile2 = optarg;
+				break;
+			case 'u':
+				unmappedOutput = optarg;
+				break;
+			case 'o':
+				mappingOutput = getMem(FILE_NAME_LENGTH);
+				mappingOutputPath = getMem(FILE_NAME_LENGTH);
+				stripPath (optarg, &mappingOutputPath, &mappingOutput);
+				break;
+			case 'n':
+				maxHits = atoi(optarg);
+				break;
+			case 'e':
+				errThreshold = atoi(optarg);
+				break;
+			case 'l':
+				minPairEndedDistance = atoi(optarg);
+				break;
+			case 'm':
+				maxPairEndedDistance = atoi(optarg);
+				break;
+			case 'h':
+				printHelp();
+				return 0;
+			case 'v':
+				fprintf(stdout, "%s.%s\n", versionNumber, versionNumberF);
+				return 0;
+			default:
+				/* 0: flag option already stored; '?': getopt_long already reported it. */
+				break;
+		}
+	}
+
+	if (indexingMode + searchingMode != 1)
+	{
+		fprintf(stdout, "ERROR: Indexing / Searching mode should be selected\n");
+		return 0;
+	}
+
+	/* NOTE(review): the test below admits 11 but the message says [12..15]; confirm the intended minimum. */
+	if (windowSize > 15 || windowSize < 11)
+	{
+		fprintf(stdout, "ERROR: Window size should be in [12..15]\n");
+		return 0;
+	}
+	WINDOW_SIZE = (unsigned char) windowSize;
+
+	/* Exactly one mode is active past the check above, so the -b swap is done once for both. */
+	if (batchMode)
+	{
+		batchFile = fastaFile;
+		fastaFile = NULL;
+	}
+
+	if ( indexingMode )
+	{
+		CONTIG_SIZE		= 15000000;
+		CONTIG_MAX_SIZE	= 40000000;
+		if (batchFile == NULL && fastaFile == NULL)
+		{
+			fprintf(stdout, "ERROR: Reference(s) should be indicated for indexing\n");
+			return 0;
+		}
+
+		if (pairedEndDiscordantMode)
+		{
+			fprintf(stdout, "ERROR: --discordant cannot be used in indexing mode. \n");
+			return 0;
+		}
+	}
+
+	if ( searchingMode )
+	{
+		CONTIG_SIZE		= 300000000;
+		CONTIG_MAX_SIZE	= 300000000;
+		if (batchFile == NULL && fastaFile == NULL)
+		{
+			fprintf(stdout, "ERROR: Index File(s) should be indiciated for searching\n");
+			return 0;
+		}
+
+		if (seqFile1 == NULL && seqFile2 == NULL)
+		{
+			fprintf(stdout, "ERROR: Please indicate a sequence file for searching.\n");
+			return 0;
+		}
+
+		if (!pairedEndMode && seqFile2 != NULL)
+		{
+			fprintf(stdout, "ERROR: Second File can be indicated in pairedend mode\n");
+			return 0;
+		}
+
+		if (pairedEndMode && (minPairEndedDistance <0 || maxPairEndedDistance < 0 || minPairEndedDistance > maxPairEndedDistance))
+		{
+			fprintf(stdout, "ERROR: Please enter a valid range for pairedend sequences.\n");
+			return 0;
+		}
+
+		if (pairedEndMode && seqFile1 == NULL)
+		{
+			fprintf(stdout, "ERROR: Please indicate the first file for pairedend search.\n");
+			return 0;
+		}
+
+		if (!pairedEndMode && pairedEndDiscordantMode)
+		{
+			fprintf(stdout, "ERROR: --discordant should be used with --pe");
+			return 0;
+		}
+
+		if (!pairedEndMode && pairedEndProfilingMode)
+		{
+			fprintf(stdout, "ERROR: --profile should be used with --pe");
+			return 0;
+		}
+	}
+
+	if (batchMode)
+	{
+		char entry[FILE_NAME_LENGTH];
+		FILE *fp = fileOpen(batchFile, "r");
+
+		if (fp == NULL)
+			return 0;
+		fileCnt  = 0;
+		while ( fgets(entry, FILE_NAME_LENGTH, fp))
+		{
+			/* Drop trailing whitespace, including the newline kept by fgets. */
+			for (i = (int) strlen(entry)-1; i>=0; i--)
+				if ( !isspace((unsigned char) entry[i]))
+					break;
+			entry[i+1] = '\0';
+			if (strcmp(entry, "") != 0 && !addReferenceFile(entry))
+			{
+				fclose(fp);
+				return 0;
+			}
+		}
+		fclose(fp);
+	}
+	else if (!addReferenceFile(fastaFile))
+		return 0;
+
+	if (pairedEndProfilingMode)
+	{
+		minPairEndedDistance = 0;
+		maxPairEndedDistance = 300000000;
+	}
+
+	if (pairedEndDiscordantMode)
+	{
+		minPairEndedDiscordantDistance = minPairEndedDistance;
+		maxPairEndedDiscordantDistance = maxPairEndedDistance;
+		minPairEndedDistance = 0;
+		maxPairEndedDistance = 300000000;
+	}
+
+	return 1;
+}
+
+
+/* Prints the usage text to stdout: the mrFAST banner when mrFAST is non-zero, else the mrsFAST one. */
+void printHelp()
+{
+	const char *distanceName = mrFAST ? "edit distance" : "hamming distance";
+
+	if (mrFAST)
+	{
+		fputs("mrFAST : Micro-Read Fast Alignment Search Tool.\n\n", stdout);
+		fputs("Usage: mrfast [options]\n\n", stdout);
+	}
+	else
+	{
+		fputs("mrsFAST : Micro-Read Substitutions (only) Fast Alignment Search Tool.\n\n", stdout);
+		fputs("mrsFAST is a cache oblivious read mapping tool. mrsFAST capable of mapping\n", stdout);
+		fputs("single and paired end reads to the reference genome. Bisulfite treated \n", stdout);
+		fputs("sequences are not supported in this version. By default mrsFAST reports  \n", stdout);
+		fputs("the output in SAM format.\n\n", stdout);
+		fputs("Usage: mrsFAST [options]\n\n", stdout);
+	}
+
+	fputs("General Options:\n", stdout);
+	fputs(" -v|--version\t\tCurrent Version.\n", stdout);
+	fputs(" -h\t\t\tShows the help file.\n", stdout);
+	fputs("\n\n", stdout);
+
+	fputs("Indexing Options:\n", stdout);
+	fputs(" --index [file]\t\tGenerate an index from the specified fasta file. \n", stdout);
+	fputs(" -b\t\t\tIndicates the indexing will be done in batch mode.\n\t\t\tThe file specified in --index should contain the \n\t\t\tlist of fasta files.\n", stdout);
+	fputs(" --ws [int]\t\tSet window size for indexing (default:12 max:14).\n", stdout);
+	fputs("\n\n", stdout);
+
+	fputs("Searching Options:\n", stdout);
+	fputs(" --search [file]\tSearch in the specified genome. Provide the path to the fasta file. \n\t\t\tIndex file should be in the same directory.\n", stdout);
+	fputs(" -b\t\t\tIndicates the mapping will be done in batch mode. \n\t\t\tThe file specified in --search should contain the \n\t\t\tlist of fasta files.\n", stdout);
+	fputs(" --pe \t\t\tSearch will be done in Paired-End mode.\n", stdout);
+	fputs(" --seq [file]\t\tInput sequences in fasta/fastq format [file]. If \n\t\t\tpaired end reads are interleaved, use this option.\n", stdout);
+	fputs(" --seq1 [file]\t\tInput sequences in fasta/fastq format [file] (First \n\t\t\tfile). Use this option to indicate the first file of \n\t\t\tpaired end reads. \n", stdout);
+	fputs(" --seq2 [file]\t\tInput sequences in fasta/fastq format [file] (Second \n\t\t\tfile). Use this option to indicate the second file of \n\t\t\tpaired end reads.  \n", stdout);
+	fputs(" -o [file]\t\tOutput of the mapped sequences. The default is \"output\".\n", stdout);
+	fputs(" -u [file]\t\tSave unmapped sequences in fasta/fastq format.\n", stdout);
+	fputs(" --best   \t\tOnly the best mapping from all the possible mapping is returned.\n", stdout);
+	fputs(" --seqcomp \t\tIndicates that the input sequences are compressed (gz).\n", stdout);
+	fputs(" --outcomp \t\tIndicates that output file should be compressed (gz).\n", stdout);
+	printf(" -e [int]\t\tMaximum allowed %s (default 2).\n", distanceName);
+	fputs(" --min [int]\t\tMin distance allowed between a pair of end sequences.\n", stdout);
+	fputs(" --max [int]\t\tMax distance allowed between a pair of end sequences.\n", stdout);
+
+	fputs(" --maxoea [int]\t\tMax number of One End Anchored (OEA) returned for each read pair. We recommend 100 or above for NovelSeq use.\n", stdout);
+	fputs(" --maxdis [int]\t\tMax number of discordant map locations returned for each read pair. We recommend 300 or above for VariationHunter use.\n", stdout);
+}