Mercurial > repos > calkan > mrfast
comparison mrfast-2.1.0.5/CommandLineParser.c @ 1:d4054b05b015 default tip
Version update to 2.1.0.5
| author | calkan |
|---|---|
| date | Fri, 09 Mar 2012 07:35:51 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:7b3dc85dc7fd | 1:d4054b05b015 |
|---|---|
| 1 /* | |
| 2 * Copyright (c) <2008 - 2012>, University of Washington, Simon Fraser University | |
| 3 * All rights reserved. | |
| 4 * | |
| 5 * Redistribution and use in source and binary forms, with or without modification, | |
| 6 * are permitted provided that the following conditions are met: | |
| 7 * | |
| 8 * Redistributions of source code must retain the above copyright notice, this list | |
| 9 * of conditions and the following disclaimer. | |
| 10 * - Redistributions in binary form must reproduce the above copyright notice, this | |
| 11 * list of conditions and the following disclaimer in the documentation and/or other | |
| 12 * materials provided with the distribution. | |
| 13 * - Neither the names of the University of Washington, Simon Fraser University, | |
| 14 * nor the names of its contributors may be | |
| 15 * used to endorse or promote products derived from this software without specific | |
| 16 * prior written permission. | |
| 17 * | |
| 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
| 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
| 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
| 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR | |
| 22 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
| 23 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
| 24 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
| 25 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | |
| 26 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | |
| 27 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
| 28 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 29 */ | |
| 30 | |
| 31 /* | |
| 32 Authors: | |
| 33 Farhad Hormozdiari | |
| 34 Faraz Hach | |
| 35 Can Alkan | |
| 36 Emails: | |
| 37 farhadh AT uw DOT edu | |
| 38 fhach AT cs DOT sfu DOT ca | |
| 39 calkan AT uw DOT edu | |
| 40 */ | |
| 41 | |
| 42 | |
| 43 #include <stdio.h> | |
| 44 #include <stdlib.h> | |
| 45 #include <getopt.h> | |
| 46 #include <string.h> | |
| 47 #include <ctype.h> | |
| 48 #include "Common.h" | |
| 49 #include "CommandLineParser.h" | |
| 50 | |
| 51 int uniqueMode=1; | |
| 52 int indexingMode; | |
| 53 int searchingMode; | |
| 54 int pairedEndMode; | |
| 55 int pairedEndDiscordantMode; | |
| 56 int transChromosal=0; | |
| 57 int pairedEndProfilingMode; | |
| 58 int seqCompressed; | |
| 59 int outCompressed; | |
| 60 int cropSize = 0; | |
| 61 int progressRep = 0; | |
| 62 int minPairEndedDistance=-1; | |
| 63 int maxPairEndedDistance=-1; | |
| 64 int minPairEndedDiscordantDistance=-1; | |
| 65 int maxPairEndedDiscordantDistance=-1; | |
| 66 int bestMode; | |
| 67 int nosamMode; | |
| 68 char *seqFile1; | |
| 69 char *seqFile2; | |
| 70 char *mappingOutput = "output"; | |
| 71 char *mappingOutputPath = ""; | |
| 72 char *unmappedOutput = "unmapped"; | |
| 73 char fileName[1000][2][FILE_NAME_LENGTH]; | |
| 74 int fileCnt; | |
| 75 int maxOEAOutput=100; | |
| 76 int maxDiscordantOutput=300; | |
| 77 unsigned char errThreshold=2; | |
| 78 unsigned char maxHits=0; | |
| 79 unsigned char WINDOW_SIZE = 12; | |
| 80 unsigned int CONTIG_SIZE; | |
| 81 unsigned int CONTIG_MAX_SIZE; | |
| 82 | |
| 83 void printHelp(); | |
| 84 | |
| 85 int parseCommandLine (int argc, char *argv[]) | |
| 86 { | |
| 87 | |
| 88 int o; | |
| 89 int index; | |
| 90 char *fastaFile = NULL; | |
| 91 char *batchFile = NULL ; | |
| 92 int batchMode = 0; | |
| 93 | |
| 94 static struct option longOptions[] = | |
| 95 { | |
| 96 {"pe", no_argument, &pairedEndMode, 1}, | |
| 97 {"discordant-vh", no_argument, &pairedEndDiscordantMode, 1}, | |
| 98 {"trans", no_argument, &transChromosal , 1}, | |
| 99 {"profile", no_argument, &pairedEndProfilingMode, 1}, | |
| 100 {"seqcomp", no_argument, &seqCompressed, 1}, | |
| 101 {"outcomp", no_argument, &outCompressed, 1}, | |
| 102 {"progress", no_argument, &progressRep, 1}, | |
| 103 {"best", no_argument, &bestMode, 1}, | |
| 104 {"index", required_argument, 0, 'i'}, | |
| 105 {"search", required_argument, 0, 's'}, | |
| 106 {"help", no_argument, 0, 'h'}, | |
| 107 {"version", no_argument, 0, 'v'}, | |
| 108 {"seq", required_argument, 0, 'x'}, | |
| 109 {"seq1", required_argument, 0, 'x'}, | |
| 110 {"seq2", required_argument, 0, 'y'}, | |
| 111 {"ws", required_argument, 0, 'w'}, | |
| 112 {"min", required_argument, 0, 'l'}, | |
| 113 {"max", required_argument, 0, 'm'}, | |
| 114 {"crop", required_argument, 0, 'c'}, | |
| 115 {"maxoea", required_argument, 0, 'a'}, | |
| 116 {"maxdis", required_argument, 0, 'd'}, | |
| 117 {"nosam", no_argument, &nosamMode, 1}, | |
| 118 {0, 0, 0, 0}, | |
| 119 }; | |
| 120 | |
| 121 while ( (o = getopt_long ( argc, argv, "bhvn:e:o:u:i:s:x:y:w:l:m:c:a:d:", longOptions, &index)) != -1 ) | |
| 122 { | |
| 123 switch (o) | |
| 124 { | |
| 125 case 'a': | |
| 126 maxOEAOutput = atoi(optarg); | |
| 127 if (maxOEAOutput == 0) | |
| 128 maxOEAOutput = 100000; | |
| 129 break; | |
| 130 case 'd': | |
| 131 maxDiscordantOutput = atoi(optarg); | |
| 132 if (maxDiscordantOutput == 0) | |
| 133 maxDiscordantOutput = 100000; | |
| 134 break; | |
| 135 case 'i': | |
| 136 indexingMode = 1; | |
| 137 fastaFile = optarg; | |
| 138 break; | |
| 139 case 's': | |
| 140 searchingMode = 1; | |
| 141 fastaFile = optarg; | |
| 142 break; | |
| 143 case 'b': | |
| 144 batchMode = 1; | |
| 145 break; | |
| 146 case 'c': | |
| 147 cropSize = atoi(optarg); | |
| 148 break; | |
| 149 case 'w': | |
| 150 WINDOW_SIZE = atoi(optarg); | |
| 151 break; | |
| 152 case 'x': | |
| 153 seqFile1 = optarg; | |
| 154 break; | |
| 155 case 'y': | |
| 156 seqFile2 = optarg; | |
| 157 break; | |
| 158 case 'u': | |
| 159 unmappedOutput = optarg; | |
| 160 break; | |
| 161 case 'o': | |
| 162 mappingOutput = getMem(FILE_NAME_LENGTH); | |
| 163 mappingOutputPath = getMem(FILE_NAME_LENGTH); | |
| 164 stripPath (optarg, &mappingOutputPath, &mappingOutput); | |
| 165 break; | |
| 166 case 'n': | |
| 167 maxHits = atoi(optarg); | |
| 168 break; | |
| 169 case 'e': | |
| 170 errThreshold = atoi(optarg); | |
| 171 break; | |
| 172 case 'l': | |
| 173 minPairEndedDistance = atoi(optarg); | |
| 174 break; | |
| 175 case 'm': | |
| 176 maxPairEndedDistance = atoi(optarg); | |
| 177 break; | |
| 178 case 'h': | |
| 179 printHelp(); | |
| 180 return 0; | |
| 181 break; | |
| 182 case 'v': | |
| 183 fprintf(stdout, "%s.%s\n", versionNumber, versionNumberF); | |
| 184 return 0; | |
| 185 break; | |
| 186 /* case '?': | |
| 187 fprintf(stderr, "Unknown parameter: %s\n", longOptions[index].name); | |
| 188 abort(); | |
| 189 break;*/ | |
| 190 } | |
| 191 | |
| 192 } | |
| 193 if (indexingMode + searchingMode != 1) | |
| 194 { | |
| 195 fprintf(stdout, "ERROR: Indexing / Searching mode should be selected\n"); | |
| 196 return 0; | |
| 197 } | |
| 198 | |
| 199 if (WINDOW_SIZE > 15 || WINDOW_SIZE < 11) | |
| 200 { | |
| 201 fprintf(stdout, "ERROR: Window size should be in [12..15]\n"); | |
| 202 return 0; | |
| 203 } | |
| 204 | |
| 205 | |
| 206 if ( indexingMode ) | |
| 207 { | |
| 208 CONTIG_SIZE = 15000000; | |
| 209 CONTIG_MAX_SIZE = 40000000; | |
| 210 | |
| 211 if (batchMode) | |
| 212 { | |
| 213 batchFile = fastaFile; | |
| 214 fastaFile = NULL; | |
| 215 } | |
| 216 | |
| 217 if (batchFile == NULL && fastaFile == NULL) | |
| 218 { | |
| 219 fprintf(stdout, "ERROR: Reference(s) should be indicated for indexing\n"); | |
| 220 return 0; | |
| 221 } | |
| 222 | |
| 223 if (pairedEndDiscordantMode) | |
| 224 { | |
| 225 fprintf(stdout, "ERROR: --discordant-vh cannot be used in indexing mode. \n"); | |
| 226 return 0; | |
| 227 } | |
| 228 | |
| 229 } | |
| 230 | |
| 231 | |
| 232 if ( searchingMode ) | |
| 233 { | |
| 234 CONTIG_SIZE = 300000000; | |
| 235 CONTIG_MAX_SIZE = 300000000; | |
| 236 | |
| 237 | |
| 238 if (batchMode) | |
| 239 { | |
| 240 batchFile = fastaFile; | |
| 241 fastaFile = NULL; | |
| 242 } | |
| 243 | |
| 244 if (batchFile == NULL && fastaFile == NULL) | |
| 245 { | |
| 246 fprintf(stdout, "ERROR: Index File(s) should be indiciated for searching\n"); | |
| 247 return 0; | |
| 248 } | |
| 249 | |
| 250 if (seqFile1 == NULL && seqFile2 == NULL) | |
| 251 { | |
| 252 fprintf(stdout, "ERROR: Please indicate a sequence file for searching.\n"); | |
| 253 return 0; | |
| 254 } | |
| 255 | |
| 256 | |
| 257 if (!pairedEndMode && seqFile2 != NULL) | |
| 258 { | |
| 259 fprintf(stdout, "ERROR: Second File can be indicated in pairedend mode\n"); | |
| 260 return 0; | |
| 261 } | |
| 262 | |
| 263 if (pairedEndMode && (minPairEndedDistance <0 || maxPairEndedDistance < 0 || minPairEndedDistance > maxPairEndedDistance)) | |
| 264 { | |
| 265 fprintf(stdout, "ERROR: Please enter a valid range for pairedend sequences.\n"); | |
| 266 return 0; | |
| 267 } | |
| 268 | |
| 269 if (pairedEndMode && seqFile1 == NULL) | |
| 270 { | |
| 271 fprintf(stdout, "ERROR: Please indicate the first file for pairedend search.\n"); | |
| 272 return 0; | |
| 273 } | |
| 274 | |
| 275 if (!pairedEndMode && pairedEndDiscordantMode) | |
| 276 { | |
| 277 fprintf(stdout, "ERROR: --discordant should be used with --pe"); | |
| 278 return 0; | |
| 279 } | |
| 280 | |
| 281 if (!pairedEndMode && pairedEndProfilingMode) | |
| 282 { | |
| 283 fprintf(stdout, "ERROR: --profile should be used with --pe"); | |
| 284 return 0; | |
| 285 } | |
| 286 | |
| 287 if (pairedEndMode) | |
| 288 pairedEndDiscordantMode = 1; | |
| 289 } | |
| 290 | |
| 291 int i = 0; | |
| 292 | |
| 293 | |
| 294 if (batchMode) | |
| 295 { | |
| 296 FILE *fp = fileOpen(batchFile, "r"); | |
| 297 | |
| 298 if (fp == NULL) | |
| 299 return 0; | |
| 300 | |
| 301 fileCnt = 0; | |
| 302 | |
| 303 while ( fgets(fileName[fileCnt][0], FILE_NAME_LENGTH, fp)) | |
| 304 { | |
| 305 for (i = strlen(fileName[fileCnt][0])-1; i>=0; i--) | |
| 306 if ( !isspace(fileName[fileCnt][0][i])) | |
| 307 break; | |
| 308 fileName[fileCnt][0][i+1] = '\0'; | |
| 309 | |
| 310 if (strcmp(fileName[fileCnt][0], "") != 0) | |
| 311 { | |
| 312 sprintf(fileName[fileCnt][1], "%s.index", fileName[fileCnt][0]); | |
| 313 fileCnt++; | |
| 314 } | |
| 315 } | |
| 316 } | |
| 317 else | |
| 318 { | |
| 319 sprintf(fileName[fileCnt][0], "%s", fastaFile); | |
| 320 sprintf(fileName[fileCnt][1], "%s.index", fileName[fileCnt][0]); | |
| 321 fileCnt++; | |
| 322 } | |
| 323 | |
| 324 | |
| 325 if (pairedEndProfilingMode) | |
| 326 { | |
| 327 | |
| 328 minPairEndedDistance = 0; | |
| 329 maxPairEndedDistance = 300000000; | |
| 330 | |
| 331 } | |
| 332 | |
| 333 if (pairedEndDiscordantMode) | |
| 334 { | |
| 335 minPairEndedDiscordantDistance = minPairEndedDistance; | |
| 336 maxPairEndedDiscordantDistance = maxPairEndedDistance; | |
| 337 | |
| 338 minPairEndedDistance = 0; | |
| 339 maxPairEndedDistance = 300000000; | |
| 340 } | |
| 341 | |
| 342 return 1; | |
| 343 } | |
| 344 | |
| 345 | |
| 346 void printHelp() | |
| 347 { | |
| 348 char *errorType; | |
| 349 if (mrFAST) | |
| 350 { | |
| 351 fprintf(stdout,"mrFAST : Micro-Read Fast Alignment Search Tool.\n\n"); | |
| 352 fprintf(stdout,"Usage: mrfast [options]\n\n"); | |
| 353 errorType="edit distance"; | |
| 354 } | |
| 355 else | |
| 356 { | |
| 357 fprintf(stdout,"mrsFAST : Micro-Read Substitutions (only) Fast Alignment Search Tool.\n\n"); | |
| 358 fprintf(stdout,"mrsFAST is a cache oblivious read mapping tool. mrsFAST capable of mapping\n"); | |
| 359 fprintf(stdout,"single and paired end reads to the reference genome. Bisulfite treated \n"); | |
| 360 fprintf(stdout,"sequences are not supported in this version. By default mrsFAST reports \n"); | |
| 361 fprintf(stdout,"the output in SAM format.\n\n"); | |
| 362 fprintf(stdout,"Usage: mrsFAST [options]\n\n"); | |
| 363 errorType="hamming distance"; | |
| 364 } | |
| 365 | |
| 366 fprintf(stdout,"General Options:\n"); | |
| 367 fprintf(stdout," -v|--version\t\tCurrent Version.\n"); | |
| 368 fprintf(stdout," -h\t\t\tShows the help file.\n"); | |
| 369 fprintf(stdout,"\n\n"); | |
| 370 | |
| 371 fprintf(stdout,"Indexing Options:\n"); | |
| 372 fprintf(stdout," --index [file]\t\tGenerate an index from the specified fasta file. \n"); | |
| 373 fprintf(stdout," -b\t\t\tIndicates the indexing will be done in batch mode.\n\t\t\tThe file specified in --index should contain the \n\t\t\tlist of fasta files.\n"); | |
| 374 fprintf(stdout," --ws [int]\t\tSet window size for indexing (default:12 max:14).\n"); | |
| 375 fprintf(stdout,"\n\n"); | |
| 376 | |
| 377 fprintf(stdout,"Searching Options:\n"); | |
| 378 fprintf(stdout," --search [file]\tSearch in the specified genome. Provide the path to the fasta file. \n\t\t\tIndex file should be in the same directory.\n"); | |
| 379 fprintf(stdout," -b\t\t\tIndicates the mapping will be done in batch mode. \n\t\t\tThe file specified in --search should contain the \n\t\t\tlist of fasta files.\n"); | |
| 380 fprintf(stdout," --pe \t\t\tSearch will be done in Paired-End mode.\n"); | |
| 381 fprintf(stdout," --seq [file]\t\tInput sequences in fasta/fastq format [file]. If \n\t\t\tpaired end reads are interleaved, use this option.\n"); | |
| 382 fprintf(stdout," --seq1 [file]\t\tInput sequences in fasta/fastq format [file] (First \n\t\t\tfile). Use this option to indicate the first file of \n\t\t\tpaired end reads. \n"); | |
| 383 fprintf(stdout," --seq2 [file]\t\tInput sequences in fasta/fastq format [file] (Second \n\t\t\tfile). Use this option to indicate the second file of \n\t\t\tpaired end reads. \n"); | |
| 384 fprintf(stdout," -o [file]\t\tOutput of the mapped sequences. The default is \"output\".\n"); | |
| 385 fprintf(stdout," -u [file]\t\tSave unmapped sequences in fasta/fastq format.\n"); | |
| 386 fprintf(stdout," --best \t\tOnly the best mapping from all the possible mapping is returned.\n"); | |
| 387 fprintf(stdout," --seqcomp \t\tIndicates that the input sequences are compressed (gz).\n"); | |
| 388 fprintf(stdout," --outcomp \t\tIndicates that output file should be compressed (gz).\n"); | |
| 389 fprintf(stdout," -e [int]\t\tMaximum allowed %s (default 2).\n", errorType); | |
| 390 fprintf(stdout," --min [int]\t\tMin distance allowed between a pair of end sequences.\n"); | |
| 391 fprintf(stdout," --max [int]\t\tMax distance allowed between a pair of end sequences.\n"); | |
| 392 | |
| 393 fprintf(stdout," --maxoea [int]\t\tMax number of One End Anchored (OEA) returned for each read pair. We recommend 100 or above for NovelSeq use.\n"); | |
| 394 fprintf(stdout," --maxdis [int]\t\tMax number of discordant map locations returned for each read pair. We recommend 300 or above for VariationHunter use.\n"); | |
| 395 } |
