comparison mrfast-2.1.0.4/CommandLineParser.c @ 0:7b3dc85dc7fd

Uploaded mrfast source tarball
author calkan
date Tue, 21 Feb 2012 10:29:47 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:7b3dc85dc7fd
1 /*
2 * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without modification,
6 * are permitted provided that the following conditions are met:
7 *
8 * Redistributions of source code must retain the above copyright notice, this list
9 * of conditions and the following disclaimer.
10 * - Redistributions in binary form must reproduce the above copyright notice, this
11 * list of conditions and the following disclaimer in the documentation and/or other
12 * materials provided with the distribution.
13 * - Neither the names of the University of Washington, Simon Fraser University,
14 * nor the names of its contributors may be
15 * used to endorse or promote products derived from this software without specific
16 * prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
22 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*
32 Authors:
33 Farhad Hormozdiari
34 Faraz Hach
35 Can Alkan
36 Emails:
37 farhadh AT uw DOT edu
38 fhach AT cs DOT sfu DOT ca
39 calkan AT uw DOT edu
40 */
41
42
43 #include <stdio.h>
44 #include <stdlib.h>
45 #include <getopt.h>
46 #include <string.h>
47 #include <ctype.h>
48 #include "Common.h"
49 #include "CommandLineParser.h"
50
51 int uniqueMode=1;
52 int indexingMode;
53 int searchingMode;
54 int pairedEndMode;
55 int pairedEndDiscordantMode;
56 int transChromosal=0;
57 int pairedEndProfilingMode;
58 int seqCompressed;
59 int outCompressed;
60 int cropSize = 0;
61 int progressRep = 0;
62 int minPairEndedDistance=-1;
63 int maxPairEndedDistance=-1;
64 int minPairEndedDiscordantDistance=-1;
65 int maxPairEndedDiscordantDistance=-1;
66 int bestMode;
67 int nosamMode;
68 char *seqFile1;
69 char *seqFile2;
70 char *mappingOutput = "output";
71 char *mappingOutputPath = "";
72 char *unmappedOutput = "unmapped";
73 char fileName[1000][2][FILE_NAME_LENGTH];
74 int fileCnt;
75 int maxOEAOutput=1000;
76 int maxDiscordantOutput=10000;
77 unsigned char errThreshold=2;
78 unsigned char maxHits=0;
79 unsigned char WINDOW_SIZE = 12;
80 unsigned int CONTIG_SIZE;
81 unsigned int CONTIG_MAX_SIZE;
82
83 void printHelp();
84
85 int parseCommandLine (int argc, char *argv[])
86 {
87
88 int o;
89 int index;
90 char *fastaFile = NULL;
91 char *batchFile = NULL ;
92 int batchMode = 0;
93
94 static struct option longOptions[] =
95 {
96 {"pe", no_argument, &pairedEndMode, 1},
97 {"discordant-vh", no_argument, &pairedEndDiscordantMode, 1},
98 {"trans", no_argument, &transChromosal , 1},
99 {"profile", no_argument, &pairedEndProfilingMode, 1},
100 {"seqcomp", no_argument, &seqCompressed, 1},
101 {"outcomp", no_argument, &outCompressed, 1},
102 {"progress", no_argument, &progressRep, 1},
103 {"best", no_argument, &bestMode, 1},
104 {"index", required_argument, 0, 'i'},
105 {"search", required_argument, 0, 's'},
106 {"help", no_argument, 0, 'h'},
107 {"version", no_argument, 0, 'v'},
108 {"seq", required_argument, 0, 'x'},
109 {"seq1", required_argument, 0, 'x'},
110 {"seq2", required_argument, 0, 'y'},
111 {"ws", required_argument, 0, 'w'},
112 {"min", required_argument, 0, 'l'},
113 {"max", required_argument, 0, 'm'},
114 {"crop", required_argument, 0, 'c'},
115 {"maxoea", required_argument, 0, 'a'},
116 {"maxdis", required_argument, 0, 'd'},
117 {"nosam", no_argument, &nosamMode, 1},
118 {0, 0, 0, 0},
119 };
120
121 while ( (o = getopt_long ( argc, argv, "bhvn:e:o:u:i:s:x:y:w:l:m:c:a:d:", longOptions, &index)) != -1 )
122 {
123 switch (o)
124 {
125 case 'a':
126 maxOEAOutput = atoi(optarg);
127 break;
128 case 'd':
129 maxDiscordantOutput = atoi(optarg);
130 break;
131 case 'i':
132 indexingMode = 1;
133 fastaFile = optarg;
134 break;
135 case 's':
136 searchingMode = 1;
137 fastaFile = optarg;
138 break;
139 case 'b':
140 batchMode = 1;
141 break;
142 case 'c':
143 cropSize = atoi(optarg);
144 break;
145 case 'w':
146 WINDOW_SIZE = atoi(optarg);
147 break;
148 case 'x':
149 seqFile1 = optarg;
150 break;
151 case 'y':
152 seqFile2 = optarg;
153 break;
154 case 'u':
155 unmappedOutput = optarg;
156 break;
157 case 'o':
158 mappingOutput = getMem(FILE_NAME_LENGTH);
159 mappingOutputPath = getMem(FILE_NAME_LENGTH);
160 stripPath (optarg, &mappingOutputPath, &mappingOutput);
161 break;
162 case 'n':
163 maxHits = atoi(optarg);
164 break;
165 case 'e':
166 errThreshold = atoi(optarg);
167 break;
168 case 'l':
169 minPairEndedDistance = atoi(optarg);
170 break;
171 case 'm':
172 maxPairEndedDistance = atoi(optarg);
173 break;
174 case 'h':
175 printHelp();
176 return 0;
177 break;
178 case 'v':
179 fprintf(stdout, "%s.%s\n", versionNumber, versionNumberF);
180 return 0;
181 break;
182 /* case '?':
183 fprintf(stderr, "Unknown parameter: %s\n", longOptions[index].name);
184 abort();
185 break;*/
186 }
187
188 }
189 if (indexingMode + searchingMode != 1)
190 {
191 fprintf(stdout, "ERROR: Indexing / Searching mode should be selected\n");
192 return 0;
193 }
194
195 if (WINDOW_SIZE > 15 || WINDOW_SIZE < 11)
196 {
197 fprintf(stdout, "ERROR: Window size should be in [12..15]\n");
198 return 0;
199 }
200
201
202 if ( indexingMode )
203 {
204 CONTIG_SIZE = 15000000;
205 CONTIG_MAX_SIZE = 40000000;
206
207 if (batchMode)
208 {
209 batchFile = fastaFile;
210 fastaFile = NULL;
211 }
212
213 if (batchFile == NULL && fastaFile == NULL)
214 {
215 fprintf(stdout, "ERROR: Reference(s) should be indicated for indexing\n");
216 return 0;
217 }
218
219 if (pairedEndDiscordantMode)
220 {
221 fprintf(stdout, "ERROR: --discordant cannot be used in indexing mode. \n");
222 return 0;
223 }
224
225 }
226
227
228 if ( searchingMode )
229 {
230 CONTIG_SIZE = 300000000;
231 CONTIG_MAX_SIZE = 300000000;
232
233
234 if (batchMode)
235 {
236 batchFile = fastaFile;
237 fastaFile = NULL;
238 }
239
240 if (batchFile == NULL && fastaFile == NULL)
241 {
242 fprintf(stdout, "ERROR: Index File(s) should be indiciated for searching\n");
243 return 0;
244 }
245
246 if (seqFile1 == NULL && seqFile2 == NULL)
247 {
248 fprintf(stdout, "ERROR: Please indicate a sequence file for searching.\n");
249 return 0;
250 }
251
252
253 if (!pairedEndMode && seqFile2 != NULL)
254 {
255 fprintf(stdout, "ERROR: Second File can be indicated in pairedend mode\n");
256 return 0;
257 }
258
259 if (pairedEndMode && (minPairEndedDistance <0 || maxPairEndedDistance < 0 || minPairEndedDistance > maxPairEndedDistance))
260 {
261 fprintf(stdout, "ERROR: Please enter a valid range for pairedend sequences.\n");
262 return 0;
263 }
264
265 if (pairedEndMode && seqFile1 == NULL)
266 {
267 fprintf(stdout, "ERROR: Please indicate the first file for pairedend search.\n");
268 return 0;
269 }
270
271 if (!pairedEndMode && pairedEndDiscordantMode)
272 {
273 fprintf(stdout, "ERROR: --discordant should be used with --pe");
274 return 0;
275 }
276
277 if (!pairedEndMode && pairedEndProfilingMode)
278 {
279 fprintf(stdout, "ERROR: --profile should be used with --pe");
280 return 0;
281 }
282 }
283
284 int i = 0;
285
286
287 if (batchMode)
288 {
289 FILE *fp = fileOpen(batchFile, "r");
290
291 if (fp == NULL)
292 return 0;
293
294 fileCnt = 0;
295
296 while ( fgets(fileName[fileCnt][0], FILE_NAME_LENGTH, fp))
297 {
298 for (i = strlen(fileName[fileCnt][0])-1; i>=0; i--)
299 if ( !isspace(fileName[fileCnt][0][i]))
300 break;
301 fileName[fileCnt][0][i+1] = '\0';
302
303 if (strcmp(fileName[fileCnt][0], "") != 0)
304 {
305 sprintf(fileName[fileCnt][1], "%s.index", fileName[fileCnt][0]);
306 fileCnt++;
307 }
308 }
309 }
310 else
311 {
312 sprintf(fileName[fileCnt][0], "%s", fastaFile);
313 sprintf(fileName[fileCnt][1], "%s.index", fileName[fileCnt][0]);
314 fileCnt++;
315 }
316
317
318 if (pairedEndProfilingMode)
319 {
320
321 minPairEndedDistance = 0;
322 maxPairEndedDistance = 300000000;
323
324 }
325
326 if (pairedEndDiscordantMode)
327 {
328 minPairEndedDiscordantDistance = minPairEndedDistance;
329 maxPairEndedDiscordantDistance = maxPairEndedDistance;
330
331 minPairEndedDistance = 0;
332 maxPairEndedDistance = 300000000;
333 }
334
335 return 1;
336 }
337
338
339 void printHelp()
340 {
341 char *errorType;
342 if (mrFAST)
343 {
344 fprintf(stdout,"mrFAST : Micro-Read Fast Alignment Search Tool.\n\n");
345 fprintf(stdout,"Usage: mrfast [options]\n\n");
346 errorType="edit distance";
347 }
348 else
349 {
350 fprintf(stdout,"mrsFAST : Micro-Read Substitutions (only) Fast Alignment Search Tool.\n\n");
351 fprintf(stdout,"mrsFAST is a cache oblivious read mapping tool. mrsFAST capable of mapping\n");
352 fprintf(stdout,"single and paired end reads to the reference genome. Bisulfite treated \n");
353 fprintf(stdout,"sequences are not supported in this version. By default mrsFAST reports \n");
354 fprintf(stdout,"the output in SAM format.\n\n");
355 fprintf(stdout,"Usage: mrsFAST [options]\n\n");
356 errorType="hamming distance";
357 }
358
359 fprintf(stdout,"General Options:\n");
360 fprintf(stdout," -v|--version\t\tCurrent Version.\n");
361 fprintf(stdout," -h\t\t\tShows the help file.\n");
362 fprintf(stdout,"\n\n");
363
364 fprintf(stdout,"Indexing Options:\n");
365 fprintf(stdout," --index [file]\t\tGenerate an index from the specified fasta file. \n");
366 fprintf(stdout," -b\t\t\tIndicates the indexing will be done in batch mode.\n\t\t\tThe file specified in --index should contain the \n\t\t\tlist of fasta files.\n");
367 fprintf(stdout," --ws [int]\t\tSet window size for indexing (default:12 max:14).\n");
368 fprintf(stdout,"\n\n");
369
370 fprintf(stdout,"Searching Options:\n");
371 fprintf(stdout," --search [file]\tSearch in the specified genome. Provide the path to the fasta file. \n\t\t\tIndex file should be in the same directory.\n");
372 fprintf(stdout," -b\t\t\tIndicates the mapping will be done in batch mode. \n\t\t\tThe file specified in --search should contain the \n\t\t\tlist of fasta files.\n");
373 fprintf(stdout," --pe \t\t\tSearch will be done in Paired-End mode.\n");
374 fprintf(stdout," --seq [file]\t\tInput sequences in fasta/fastq format [file]. If \n\t\t\tpaired end reads are interleaved, use this option.\n");
375 fprintf(stdout," --seq1 [file]\t\tInput sequences in fasta/fastq format [file] (First \n\t\t\tfile). Use this option to indicate the first file of \n\t\t\tpaired end reads. \n");
376 fprintf(stdout," --seq2 [file]\t\tInput sequences in fasta/fastq format [file] (Second \n\t\t\tfile). Use this option to indicate the second file of \n\t\t\tpaired end reads. \n");
377 fprintf(stdout," -o [file]\t\tOutput of the mapped sequences. The default is \"output\".\n");
378 fprintf(stdout," -u [file]\t\tSave unmapped sequences in fasta/fastq format.\n");
379 fprintf(stdout," --best \t\tOnly the best mapping from all the possible mapping is returned.\n");
380 fprintf(stdout," --seqcomp \t\tIndicates that the input sequences are compressed (gz).\n");
381 fprintf(stdout," --outcomp \t\tIndicates that output file should be compressed (gz).\n");
382 fprintf(stdout," -e [int]\t\tMaximum allowed %s (default 2).\n", errorType);
383 fprintf(stdout," --min [int]\t\tMin distance allowed between a pair of end sequences.\n");
384 fprintf(stdout," --max [int]\t\tMax distance allowed between a pair of end sequences.\n");
385
386 fprintf(stdout," --maxoea [int]\t\tMax number of One End Anchored (OEA) returned for each read pair. We recommend 100 or above for NovelSeq use.\n");
387 fprintf(stdout," --maxdis [int]\t\tMax number of discordant map locations returned for each read pair. We recommend 300 or above for VariationHunter use.\n");
388 }