Mercurial > repos > nikhil-joshi > sickle
comparison src/trim_single.c @ 9:7939dd56c4b4 draft
Uploaded
| author | nikhil-joshi |
|---|---|
| date | Sat, 14 Mar 2015 18:19:57 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 8:3ef3eb63a297 | 9:7939dd56c4b4 |
|---|---|
| 1 #include <assert.h> | |
| 2 #include <ctype.h> | |
| 3 #include <stdlib.h> | |
| 4 #include <zlib.h> | |
| 5 #include <stdio.h> | |
| 6 #include <getopt.h> | |
| 7 #include "sickle.h" | |
| 8 #include "kseq.h" | |
| 9 #include "print_record.h" | |
| 10 | |
| 11 __KS_GETC(gzread, BUFFER_SIZE) | |
| 12 __KS_GETUNTIL(gzread, BUFFER_SIZE) | |
| 13 __KSEQ_READ | |
| 14 | |
| 15 int single_qual_threshold = 20; | |
| 16 int single_length_threshold = 20; | |
| 17 | |
| 18 static struct option single_long_options[] = { | |
| 19 {"fastq-file", required_argument, 0, 'f'}, | |
| 20 {"output-file", required_argument, 0, 'o'}, | |
| 21 {"qual-type", required_argument, 0, 't'}, | |
| 22 {"qual-threshold", required_argument, 0, 'q'}, | |
| 23 {"length-threshold", required_argument, 0, 'l'}, | |
| 24 {"no-fiveprime", no_argument, 0, 'x'}, | |
| 25 {"discard-n", no_argument, 0, 'n'}, | |
| 26 {"gzip-output", no_argument, 0, 'g'}, | |
| 27 {"quiet", no_argument, 0, 'z'}, | |
| 28 {GETOPT_HELP_OPTION_DECL}, | |
| 29 {GETOPT_VERSION_OPTION_DECL}, | |
| 30 {NULL, 0, NULL, 0} | |
| 31 }; | |
| 32 | |
| 33 void single_usage(int status, char *msg) { | |
| 34 | |
| 35 fprintf(stderr, "\nUsage: %s se [options] -f <fastq sequence file> -t <quality type> -o <trimmed fastq file>\n\ | |
| 36 \n\ | |
| 37 Options:\n\ | |
| 38 -f, --fastq-file, Input fastq file (required)\n\ | |
| 39 -t, --qual-type, Type of quality values (solexa (CASAVA < 1.3), illumina (CASAVA 1.3 to 1.7), sanger (which is CASAVA >= 1.8)) (required)\n\ | |
| 40 -o, --output-file, Output trimmed fastq file (required)\n", PROGRAM_NAME); | |
| 41 | |
| 42 fprintf(stderr, "-q, --qual-threshold, Threshold for trimming based on average quality in a window. Default 20.\n\ | |
| 43 -l, --length-threshold, Threshold to keep a read based on length after trimming. Default 20.\n\ | |
| 44 -x, --no-fiveprime, Don't do five prime trimming.\n\ | |
| 45 -n, --trunc-n, Truncate sequences at position of first N.\n\ | |
| 46 -g, --gzip-output, Output gzipped files.\n\ | |
| 47 --quiet, Don't print out any trimming information\n\ | |
| 48 --help, display this help and exit\n\ | |
| 49 --version, output version information and exit\n\n"); | |
| 50 | |
| 51 if (msg) fprintf(stderr, "%s\n\n", msg); | |
| 52 exit(status); | |
| 53 } | |
| 54 | |
| 55 int single_main(int argc, char *argv[]) { | |
| 56 | |
| 57 gzFile se = NULL; | |
| 58 kseq_t *fqrec; | |
| 59 int l; | |
| 60 FILE *outfile = NULL; | |
| 61 gzFile outfile_gzip = NULL; | |
| 62 int debug = 0; | |
| 63 int optc; | |
| 64 extern char *optarg; | |
| 65 int qualtype = -1; | |
| 66 cutsites *p1cut; | |
| 67 char *outfn = NULL; | |
| 68 char *infn = NULL; | |
| 69 int kept = 0; | |
| 70 int discard = 0; | |
| 71 int quiet = 0; | |
| 72 int no_fiveprime = 0; | |
| 73 int trunc_n = 0; | |
| 74 int gzip_output = 0; | |
| 75 int total=0; | |
| 76 | |
| 77 while (1) { | |
| 78 int option_index = 0; | |
| 79 optc = getopt_long(argc, argv, "df:t:o:q:l:zxng", single_long_options, &option_index); | |
| 80 | |
| 81 if (optc == -1) | |
| 82 break; | |
| 83 | |
| 84 switch (optc) { | |
| 85 if (single_long_options[option_index].flag != 0) | |
| 86 break; | |
| 87 | |
| 88 case 'f': | |
| 89 infn = (char *) malloc(strlen(optarg) + 1); | |
| 90 strcpy(infn, optarg); | |
| 91 break; | |
| 92 | |
| 93 case 't': | |
| 94 if (!strcmp(optarg, "illumina")) | |
| 95 qualtype = ILLUMINA; | |
| 96 else if (!strcmp(optarg, "solexa")) | |
| 97 qualtype = SOLEXA; | |
| 98 else if (!strcmp(optarg, "sanger")) | |
| 99 qualtype = SANGER; | |
| 100 else { | |
| 101 fprintf(stderr, "Error: Quality type '%s' is not a valid type.\n", optarg); | |
| 102 return EXIT_FAILURE; | |
| 103 } | |
| 104 break; | |
| 105 | |
| 106 case 'o': | |
| 107 outfn = (char *) malloc(strlen(optarg) + 1); | |
| 108 strcpy(outfn, optarg); | |
| 109 break; | |
| 110 | |
| 111 case 'q': | |
| 112 single_qual_threshold = atoi(optarg); | |
| 113 if (single_qual_threshold < 0) { | |
| 114 fprintf(stderr, "Quality threshold must be >= 0\n"); | |
| 115 return EXIT_FAILURE; | |
| 116 } | |
| 117 break; | |
| 118 | |
| 119 case 'l': | |
| 120 single_length_threshold = atoi(optarg); | |
| 121 if (single_length_threshold < 0) { | |
| 122 fprintf(stderr, "Length threshold must be >= 0\n"); | |
| 123 return EXIT_FAILURE; | |
| 124 } | |
| 125 break; | |
| 126 | |
| 127 case 'x': | |
| 128 no_fiveprime = 1; | |
| 129 break; | |
| 130 | |
| 131 case 'n': | |
| 132 trunc_n = 1; | |
| 133 break; | |
| 134 | |
| 135 case 'g': | |
| 136 gzip_output = 1; | |
| 137 break; | |
| 138 | |
| 139 case 'z': | |
| 140 quiet = 1; | |
| 141 break; | |
| 142 | |
| 143 case 'd': | |
| 144 debug = 1; | |
| 145 break; | |
| 146 | |
| 147 case_GETOPT_HELP_CHAR(single_usage) | |
| 148 case_GETOPT_VERSION_CHAR(PROGRAM_NAME, VERSION, AUTHORS); | |
| 149 | |
| 150 case '?': | |
| 151 single_usage(EXIT_FAILURE, NULL); | |
| 152 break; | |
| 153 | |
| 154 default: | |
| 155 single_usage(EXIT_FAILURE, NULL); | |
| 156 break; | |
| 157 } | |
| 158 } | |
| 159 | |
| 160 | |
| 161 if (qualtype == -1 || !infn || !outfn) { | |
| 162 single_usage(EXIT_FAILURE, "****Error: Must have quality type, input file, and output file."); | |
| 163 } | |
| 164 | |
| 165 if (!strcmp(infn, outfn)) { | |
| 166 fprintf(stderr, "****Error: Input file is same as output file.\n\n"); | |
| 167 return EXIT_FAILURE; | |
| 168 } | |
| 169 | |
| 170 se = gzopen(infn, "r"); | |
| 171 if (!se) { | |
| 172 fprintf(stderr, "****Error: Could not open input file '%s'.\n\n", infn); | |
| 173 return EXIT_FAILURE; | |
| 174 } | |
| 175 | |
| 176 if (!gzip_output) { | |
| 177 outfile = fopen(outfn, "w"); | |
| 178 if (!outfile) { | |
| 179 fprintf(stderr, "****Error: Could not open output file '%s'.\n\n", outfn); | |
| 180 return EXIT_FAILURE; | |
| 181 } | |
| 182 } else { | |
| 183 outfile_gzip = gzopen(outfn, "w"); | |
| 184 if (!outfile_gzip) { | |
| 185 fprintf(stderr, "****Error: Could not open output file '%s'.\n\n", outfn); | |
| 186 return EXIT_FAILURE; | |
| 187 } | |
| 188 } | |
| 189 | |
| 190 | |
| 191 fqrec = kseq_init(se); | |
| 192 | |
| 193 while ((l = kseq_read(fqrec)) >= 0) { | |
| 194 | |
| 195 p1cut = sliding_window(fqrec, qualtype, single_length_threshold, single_qual_threshold, no_fiveprime, trunc_n, debug); | |
| 196 total++; | |
| 197 | |
| 198 if (debug) printf("P1cut: %d,%d\n", p1cut->five_prime_cut, p1cut->three_prime_cut); | |
| 199 | |
| 200 /* if sequence quality and length pass filter then output record, else discard */ | |
| 201 if (p1cut->three_prime_cut >= 0) { | |
| 202 if (!gzip_output) { | |
| 203 /* This print statement prints out the sequence string starting from the 5' cut */ | |
| 204 /* and then only prints out to the 3' cut, however, we need to adjust the 3' cut */ | |
| 205 /* by subtracting the 5' cut because the 3' cut was calculated on the original sequence */ | |
| 206 | |
| 207 print_record (outfile, fqrec, p1cut); | |
| 208 } else { | |
| 209 print_record_gzip (outfile_gzip, fqrec, p1cut); | |
| 210 } | |
| 211 | |
| 212 kept++; | |
| 213 } | |
| 214 | |
| 215 else discard++; | |
| 216 | |
| 217 free(p1cut); | |
| 218 } | |
| 219 | |
| 220 if (!quiet) fprintf(stdout, "\nSE input file: %s\n\nTotal FastQ records: %d\nFastQ records kept: %d\nFastQ records discarded: %d\n\n", infn, total, kept, discard); | |
| 221 | |
| 222 kseq_destroy(fqrec); | |
| 223 gzclose(se); | |
| 224 if (!gzip_output) fclose(outfile); | |
| 225 else gzclose(outfile_gzip); | |
| 226 | |
| 227 return EXIT_SUCCESS; | |
| 228 } |
