| 9 | 1 #include <assert.h> | 
|  | 2 #include <ctype.h> | 
|  | 3 #include <stdlib.h> | 
|  | 4 #include <zlib.h> | 
|  | 5 #include <stdio.h> | 
|  | 6 #include <getopt.h> | 
|  | 7 #include "sickle.h" | 
|  | 8 #include "kseq.h" | 
|  | 9 #include "print_record.h" | 
|  | 10 | 
|  | 11 __KS_GETC(gzread, BUFFER_SIZE) | 
|  | 12 __KS_GETUNTIL(gzread, BUFFER_SIZE) | 
|  | 13 __KSEQ_READ | 
|  | 14 | 
|  | 15 int single_qual_threshold = 20; | 
|  | 16 int single_length_threshold = 20; | 
|  | 17 | 
|  | 18 static struct option single_long_options[] = { | 
|  | 19     {"fastq-file", required_argument, 0, 'f'}, | 
|  | 20     {"output-file", required_argument, 0, 'o'}, | 
|  | 21     {"qual-type", required_argument, 0, 't'}, | 
|  | 22     {"qual-threshold", required_argument, 0, 'q'}, | 
|  | 23     {"length-threshold", required_argument, 0, 'l'}, | 
|  | 24     {"no-fiveprime", no_argument, 0, 'x'}, | 
|  | 25     {"discard-n", no_argument, 0, 'n'}, | 
|  | 26     {"gzip-output", no_argument, 0, 'g'}, | 
|  | 27     {"quiet", no_argument, 0, 'z'}, | 
|  | 28     {GETOPT_HELP_OPTION_DECL}, | 
|  | 29     {GETOPT_VERSION_OPTION_DECL}, | 
|  | 30     {NULL, 0, NULL, 0} | 
|  | 31 }; | 
|  | 32 | 
|  | 33 void single_usage(int status, char *msg) { | 
|  | 34 | 
|  | 35     fprintf(stderr, "\nUsage: %s se [options] -f <fastq sequence file> -t <quality type> -o <trimmed fastq file>\n\ | 
|  | 36 \n\ | 
|  | 37 Options:\n\ | 
|  | 38 -f, --fastq-file, Input fastq file (required)\n\ | 
|  | 39 -t, --qual-type, Type of quality values (solexa (CASAVA < 1.3), illumina (CASAVA 1.3 to 1.7), sanger (which is CASAVA >= 1.8)) (required)\n\ | 
|  | 40 -o, --output-file, Output trimmed fastq file (required)\n", PROGRAM_NAME); | 
|  | 41 | 
|  | 42     fprintf(stderr, "-q, --qual-threshold, Threshold for trimming based on average quality in a window. Default 20.\n\ | 
|  | 43 -l, --length-threshold, Threshold to keep a read based on length after trimming. Default 20.\n\ | 
|  | 44 -x, --no-fiveprime, Don't do five prime trimming.\n\ | 
|  | 45 -n, --trunc-n, Truncate sequences at position of first N.\n\ | 
|  | 46 -g, --gzip-output, Output gzipped files.\n\ | 
|  | 47 --quiet, Don't print out any trimming information\n\ | 
|  | 48 --help, display this help and exit\n\ | 
|  | 49 --version, output version information and exit\n\n"); | 
|  | 50 | 
|  | 51     if (msg) fprintf(stderr, "%s\n\n", msg); | 
|  | 52     exit(status); | 
|  | 53 } | 
|  | 54 | 
|  | 55 int single_main(int argc, char *argv[]) { | 
|  | 56 | 
|  | 57     gzFile se = NULL; | 
|  | 58     kseq_t *fqrec; | 
|  | 59     int l; | 
|  | 60     FILE *outfile = NULL; | 
|  | 61     gzFile outfile_gzip = NULL; | 
|  | 62     int debug = 0; | 
|  | 63     int optc; | 
|  | 64     extern char *optarg; | 
|  | 65     int qualtype = -1; | 
|  | 66     cutsites *p1cut; | 
|  | 67     char *outfn = NULL; | 
|  | 68     char *infn = NULL; | 
|  | 69     int kept = 0; | 
|  | 70     int discard = 0; | 
|  | 71     int quiet = 0; | 
|  | 72     int no_fiveprime = 0; | 
|  | 73     int trunc_n = 0; | 
|  | 74     int gzip_output = 0; | 
|  | 75     int total=0; | 
|  | 76 | 
|  | 77     while (1) { | 
|  | 78         int option_index = 0; | 
|  | 79         optc = getopt_long(argc, argv, "df:t:o:q:l:zxng", single_long_options, &option_index); | 
|  | 80 | 
|  | 81         if (optc == -1) | 
|  | 82             break; | 
|  | 83 | 
|  | 84         switch (optc) { | 
|  | 85             if (single_long_options[option_index].flag != 0) | 
|  | 86                 break; | 
|  | 87 | 
|  | 88         case 'f': | 
|  | 89             infn = (char *) malloc(strlen(optarg) + 1); | 
|  | 90             strcpy(infn, optarg); | 
|  | 91             break; | 
|  | 92 | 
|  | 93         case 't': | 
|  | 94             if (!strcmp(optarg, "illumina")) | 
|  | 95                 qualtype = ILLUMINA; | 
|  | 96             else if (!strcmp(optarg, "solexa")) | 
|  | 97                 qualtype = SOLEXA; | 
|  | 98             else if (!strcmp(optarg, "sanger")) | 
|  | 99                 qualtype = SANGER; | 
|  | 100             else { | 
|  | 101                 fprintf(stderr, "Error: Quality type '%s' is not a valid type.\n", optarg); | 
|  | 102                 return EXIT_FAILURE; | 
|  | 103             } | 
|  | 104             break; | 
|  | 105 | 
|  | 106         case 'o': | 
|  | 107             outfn = (char *) malloc(strlen(optarg) + 1); | 
|  | 108             strcpy(outfn, optarg); | 
|  | 109             break; | 
|  | 110 | 
|  | 111         case 'q': | 
|  | 112             single_qual_threshold = atoi(optarg); | 
|  | 113             if (single_qual_threshold < 0) { | 
|  | 114                 fprintf(stderr, "Quality threshold must be >= 0\n"); | 
|  | 115                 return EXIT_FAILURE; | 
|  | 116             } | 
|  | 117             break; | 
|  | 118 | 
|  | 119         case 'l': | 
|  | 120             single_length_threshold = atoi(optarg); | 
|  | 121             if (single_length_threshold < 0) { | 
|  | 122                 fprintf(stderr, "Length threshold must be >= 0\n"); | 
|  | 123                 return EXIT_FAILURE; | 
|  | 124             } | 
|  | 125             break; | 
|  | 126 | 
|  | 127         case 'x': | 
|  | 128             no_fiveprime = 1; | 
|  | 129             break; | 
|  | 130 | 
|  | 131         case 'n': | 
|  | 132             trunc_n = 1; | 
|  | 133             break; | 
|  | 134 | 
|  | 135         case 'g': | 
|  | 136             gzip_output = 1; | 
|  | 137             break; | 
|  | 138 | 
|  | 139         case 'z': | 
|  | 140             quiet = 1; | 
|  | 141             break; | 
|  | 142 | 
|  | 143         case 'd': | 
|  | 144             debug = 1; | 
|  | 145             break; | 
|  | 146 | 
|  | 147         case_GETOPT_HELP_CHAR(single_usage) | 
|  | 148         case_GETOPT_VERSION_CHAR(PROGRAM_NAME, VERSION, AUTHORS); | 
|  | 149 | 
|  | 150         case '?': | 
|  | 151             single_usage(EXIT_FAILURE, NULL); | 
|  | 152             break; | 
|  | 153 | 
|  | 154         default: | 
|  | 155             single_usage(EXIT_FAILURE, NULL); | 
|  | 156             break; | 
|  | 157         } | 
|  | 158     } | 
|  | 159 | 
|  | 160 | 
|  | 161     if (qualtype == -1 || !infn || !outfn) { | 
|  | 162         single_usage(EXIT_FAILURE, "****Error: Must have quality type, input file, and output file."); | 
|  | 163     } | 
|  | 164 | 
|  | 165     if (!strcmp(infn, outfn)) { | 
|  | 166         fprintf(stderr, "****Error: Input file is same as output file.\n\n"); | 
|  | 167         return EXIT_FAILURE; | 
|  | 168     } | 
|  | 169 | 
|  | 170     se = gzopen(infn, "r"); | 
|  | 171     if (!se) { | 
|  | 172         fprintf(stderr, "****Error: Could not open input file '%s'.\n\n", infn); | 
|  | 173         return EXIT_FAILURE; | 
|  | 174     } | 
|  | 175 | 
|  | 176     if (!gzip_output) { | 
|  | 177         outfile = fopen(outfn, "w"); | 
|  | 178         if (!outfile) { | 
|  | 179             fprintf(stderr, "****Error: Could not open output file '%s'.\n\n", outfn); | 
|  | 180             return EXIT_FAILURE; | 
|  | 181         } | 
|  | 182     } else { | 
|  | 183         outfile_gzip = gzopen(outfn, "w"); | 
|  | 184         if (!outfile_gzip) { | 
|  | 185             fprintf(stderr, "****Error: Could not open output file '%s'.\n\n", outfn); | 
|  | 186             return EXIT_FAILURE; | 
|  | 187         } | 
|  | 188     } | 
|  | 189 | 
|  | 190 | 
|  | 191     fqrec = kseq_init(se); | 
|  | 192 | 
|  | 193     while ((l = kseq_read(fqrec)) >= 0) { | 
|  | 194 | 
|  | 195         p1cut = sliding_window(fqrec, qualtype, single_length_threshold, single_qual_threshold, no_fiveprime, trunc_n, debug); | 
|  | 196         total++; | 
|  | 197 | 
|  | 198         if (debug) printf("P1cut: %d,%d\n", p1cut->five_prime_cut, p1cut->three_prime_cut); | 
|  | 199 | 
|  | 200         /* if sequence quality and length pass filter then output record, else discard */ | 
|  | 201         if (p1cut->three_prime_cut >= 0) { | 
|  | 202             if (!gzip_output) { | 
|  | 203                 /* This print statement prints out the sequence string starting from the 5' cut */ | 
|  | 204                 /* and then only prints out to the 3' cut, however, we need to adjust the 3' cut */ | 
|  | 205                 /* by subtracting the 5' cut because the 3' cut was calculated on the original sequence */ | 
|  | 206 | 
|  | 207                 print_record (outfile, fqrec, p1cut); | 
|  | 208             } else { | 
|  | 209                 print_record_gzip (outfile_gzip, fqrec, p1cut); | 
|  | 210             } | 
|  | 211 | 
|  | 212             kept++; | 
|  | 213         } | 
|  | 214 | 
|  | 215         else discard++; | 
|  | 216 | 
|  | 217         free(p1cut); | 
|  | 218     } | 
|  | 219 | 
|  | 220     if (!quiet) fprintf(stdout, "\nSE input file: %s\n\nTotal FastQ records: %d\nFastQ records kept: %d\nFastQ records discarded: %d\n\n", infn, total, kept, discard); | 
|  | 221 | 
|  | 222     kseq_destroy(fqrec); | 
|  | 223     gzclose(se); | 
|  | 224     if (!gzip_output) fclose(outfile); | 
|  | 225     else gzclose(outfile_gzip); | 
|  | 226 | 
|  | 227     return EXIT_SUCCESS; | 
|  | 228 } |