| 4 | 1 #include <assert.h> | 
|  | 2 #include <ctype.h> | 
|  | 3 #include <stdlib.h> | 
|  | 4 #include <zlib.h> | 
|  | 5 #include <stdio.h> | 
|  | 6 #include <getopt.h> | 
|  | 7 #include "sickle.h" | 
|  | 8 #include "kseq.h" | 
|  | 9 | 
|  | 10 int get_quality_num (char qualchar, int qualtype, kseq_t *fqrec, int pos) { | 
|  | 11   /* | 
|  | 12      Return the adjusted quality, depending on quality type. | 
|  | 13 | 
|  | 14      Note that this uses the array in sickle.h, which *approximates* | 
|  | 15      the SOLEXA (pre-1.3 pipeline) qualities as linear. This is | 
|  | 16      inaccurate with low-quality bases. | 
|  | 17   */ | 
|  | 18 | 
|  | 19   int qual_value = (int) qualchar; | 
|  | 20 | 
|  | 21   if (qual_value < quality_constants[qualtype][Q_MIN] || qual_value > quality_constants[qualtype][Q_MAX]) { | 
|  | 22 	fprintf (stderr, "ERROR: Quality value (%d) does not fall within correct range for %s encoding.\n", qual_value, typenames[qualtype]); | 
|  | 23 	fprintf (stderr, "Range for %s encoding: %d-%d\n", typenames[qualtype], quality_constants[qualtype][Q_MIN], quality_constants[qualtype][Q_MAX]); | 
|  | 24 	fprintf (stderr, "FastQ record: %s\n", fqrec->name.s); | 
|  | 25 	fprintf (stderr, "Quality string: %s\n", fqrec->qual.s); | 
|  | 26 	fprintf (stderr, "Quality char: '%c'\n", qualchar); | 
|  | 27 	fprintf (stderr, "Quality position: %d\n", pos+1); | 
|  | 28 	exit(1); | 
|  | 29   } | 
|  | 30 | 
|  | 31   return (qual_value - quality_constants[qualtype][Q_OFFSET]); | 
|  | 32 } | 
|  | 33 | 
|  | 34 | 
|  | 35 cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int debug) { | 
|  | 36 | 
|  | 37 	int window_size = (int) (0.1 * fqrec->seq.l); | 
|  | 38 	int i,j; | 
|  | 39 	int window_start=0; | 
|  | 40 	int window_total=0; | 
|  | 41 	int three_prime_cut = fqrec->seq.l; | 
|  | 42 	int five_prime_cut = 0; | 
|  | 43 	int found_five_prime = 0; | 
|  | 44 	double window_avg; | 
|  | 45 	cutsites* retvals; | 
|  | 46     char *npos; | 
|  | 47 | 
|  | 48 	/* discard if the length of the sequence is less than the length threshold */ | 
|  | 49     if (fqrec->seq.l < length_threshold) { | 
|  | 50 		retvals = (cutsites*) malloc (sizeof(cutsites)); | 
|  | 51 		retvals->three_prime_cut = -1; | 
|  | 52 		retvals->five_prime_cut = -1; | 
|  | 53 		return (retvals); | 
|  | 54 	} | 
|  | 55 | 
|  | 56 	/* if the seq length is less then 10bp, */ | 
|  | 57 	/* then make the window size the length of the seq */ | 
|  | 58 	if (window_size == 0) window_size = fqrec->seq.l; | 
|  | 59 | 
|  | 60 	for (i=0; i<window_size; i++) { | 
|  | 61 		window_total += get_quality_num (fqrec->qual.s[i], qualtype, fqrec, i); | 
|  | 62 	} | 
|  | 63 | 
|  | 64 	for (i=0; i <= fqrec->qual.l - window_size; i++) { | 
|  | 65 | 
|  | 66 		window_avg = (double)window_total / (double)window_size; | 
|  | 67 | 
|  | 68         if (debug) printf ("no_fiveprime: %d, found 5prime: %d, window_avg: %f\n", no_fiveprime, found_five_prime, window_avg); | 
|  | 69 | 
|  | 70 		/* Finding the 5' cutoff */ | 
|  | 71 		/* Find when the average quality in the window goes above the threshold starting from the 5' end */ | 
|  | 72 		if (no_fiveprime == 0 && found_five_prime == 0 && window_avg >= qual_threshold) { | 
|  | 73 | 
|  | 74         if (debug) printf ("inside 5-prime cut\n"); | 
|  | 75 | 
|  | 76 			/* at what point in the window does the quality go above the threshold? */ | 
|  | 77 			for (j=window_start; j<window_start+window_size; j++) { | 
|  | 78 				if (get_quality_num (fqrec->qual.s[j], qualtype, fqrec, j) >= qual_threshold) { | 
|  | 79 					five_prime_cut = j; | 
|  | 80 					break; | 
|  | 81 				} | 
|  | 82 			} | 
|  | 83 | 
|  | 84             if (debug) printf ("five_prime_cut: %d\n", five_prime_cut); | 
|  | 85 | 
|  | 86 			found_five_prime = 1; | 
|  | 87 		} | 
|  | 88 | 
|  | 89 		/* Finding the 3' cutoff */ | 
|  | 90 		/* if the average quality in the window is less than the threshold */ | 
|  | 91 		/* or if the window is the last window in the read */ | 
|  | 92 		if ((window_avg < qual_threshold || | 
|  | 93 			window_start+window_size > fqrec->qual.l) && (found_five_prime == 1 || no_fiveprime)) { | 
|  | 94 | 
|  | 95 			/* at what point in the window does the quality dip below the threshold? */ | 
|  | 96 			for (j=window_start; j<window_start+window_size; j++) { | 
|  | 97 				if (get_quality_num (fqrec->qual.s[j], qualtype, fqrec, j) < qual_threshold) { | 
|  | 98 					three_prime_cut = j; | 
|  | 99 					break; | 
|  | 100 				} | 
|  | 101 			} | 
|  | 102 | 
|  | 103 			break; | 
|  | 104 		} | 
|  | 105 | 
|  | 106 		/* instead of sliding the window, subtract the first qual and add the next qual */ | 
|  | 107 		window_total -= get_quality_num (fqrec->qual.s[window_start], qualtype, fqrec, window_start); | 
|  | 108 		if (window_start+window_size < fqrec->qual.l) { | 
|  | 109 			window_total += get_quality_num (fqrec->qual.s[window_start+window_size], qualtype, fqrec, window_start+window_size); | 
|  | 110 		} | 
|  | 111 		window_start++; | 
|  | 112 	} | 
|  | 113 | 
|  | 114 | 
|  | 115     /* If truncate N option is selected, and sequence has Ns, then */ | 
|  | 116     /* change 3' cut site to be the base before the first N */ | 
|  | 117     if (trunc_n && ((npos = strstr(fqrec->seq.s, "N")) || (npos = strstr(fqrec->seq.s, "n")))) { | 
|  | 118         three_prime_cut = npos - fqrec->seq.s; | 
|  | 119     } | 
|  | 120 | 
|  | 121     /* if cutting length is less than threshold then return -1 for both */ | 
|  | 122     /* to indicate that the read should be discarded */ | 
|  | 123     /* Also, if you never find a five prime cut site, then discard whole read */ | 
|  | 124     if ((found_five_prime == 0 && !no_fiveprime) || (three_prime_cut - five_prime_cut < length_threshold)) { | 
|  | 125         three_prime_cut = -1; | 
|  | 126         five_prime_cut = -1; | 
|  | 127 | 
|  | 128         if (debug) printf("%s\n", fqrec->name.s); | 
|  | 129     } | 
|  | 130 | 
|  | 131     if (debug) printf ("\n\n"); | 
|  | 132 | 
|  | 133 	retvals = (cutsites*) malloc (sizeof(cutsites)); | 
|  | 134 	retvals->three_prime_cut = three_prime_cut; | 
|  | 135 	retvals->five_prime_cut = five_prime_cut; | 
|  | 136 	return (retvals); | 
|  | 137 } |