Mercurial > repos > nikhil-joshi > sickle
comparison src/sliding.c @ 9:7939dd56c4b4 draft
Uploaded
| author | nikhil-joshi |
|---|---|
| date | Sat, 14 Mar 2015 18:19:57 -0400 |
| parents | c70137414dcd |
| children |
comparison
equal
deleted
inserted
replaced
| 8:3ef3eb63a297 | 9:7939dd56c4b4 |
|---|---|
| 1 #include <assert.h> | |
| 2 #include <ctype.h> | |
| 3 #include <stdlib.h> | |
| 4 #include <zlib.h> | |
| 5 #include <stdio.h> | |
| 6 #include <getopt.h> | |
| 7 #include "sickle.h" | |
| 8 #include "kseq.h" | |
| 9 | |
| 10 int get_quality_num (char qualchar, int qualtype, kseq_t *fqrec, int pos) { | |
| 11 /* | |
| 12 Return the adjusted quality, depending on quality type. | |
| 13 | |
| 14 Note that this uses the array in sickle.h, which *approximates* | |
| 15 the SOLEXA (pre-1.3 pipeline) qualities as linear. This is | |
| 16 inaccurate with low-quality bases. | |
| 17 */ | |
| 18 | |
| 19 int qual_value = (int) qualchar; | |
| 20 | |
| 21 if (qual_value < quality_constants[qualtype][Q_MIN] || qual_value > quality_constants[qualtype][Q_MAX]) { | |
| 22 fprintf (stderr, "ERROR: Quality value (%d) does not fall within correct range for %s encoding.\n", qual_value, typenames[qualtype]); | |
| 23 fprintf (stderr, "Range for %s encoding: %d-%d\n", typenames[qualtype], quality_constants[qualtype][Q_MIN], quality_constants[qualtype][Q_MAX]); | |
| 24 fprintf (stderr, "FastQ record: %s\n", fqrec->name.s); | |
| 25 fprintf (stderr, "Quality string: %s\n", fqrec->qual.s); | |
| 26 fprintf (stderr, "Quality char: '%c'\n", qualchar); | |
| 27 fprintf (stderr, "Quality position: %d\n", pos+1); | |
| 28 exit(1); | |
| 29 } | |
| 30 | |
| 31 return (qual_value - quality_constants[qualtype][Q_OFFSET]); | |
| 32 } | |
| 33 | |
| 34 | |
| 35 cutsites* sliding_window (kseq_t *fqrec, int qualtype, int length_threshold, int qual_threshold, int no_fiveprime, int trunc_n, int debug) { | |
| 36 | |
| 37 int window_size = (int) (0.1 * fqrec->seq.l); | |
| 38 int i,j; | |
| 39 int window_start=0; | |
| 40 int window_total=0; | |
| 41 int three_prime_cut = fqrec->seq.l; | |
| 42 int five_prime_cut = 0; | |
| 43 int found_five_prime = 0; | |
| 44 double window_avg; | |
| 45 cutsites* retvals; | |
| 46 char *npos; | |
| 47 | |
| 48 /* discard if the length of the sequence is less than the length threshold */ | |
| 49 if (fqrec->seq.l < length_threshold) { | |
| 50 retvals = (cutsites*) malloc (sizeof(cutsites)); | |
| 51 retvals->three_prime_cut = -1; | |
| 52 retvals->five_prime_cut = -1; | |
| 53 return (retvals); | |
| 54 } | |
| 55 | |
| 56 /* if the seq length is less then 10bp, */ | |
| 57 /* then make the window size the length of the seq */ | |
| 58 if (window_size == 0) window_size = fqrec->seq.l; | |
| 59 | |
| 60 for (i=0; i<window_size; i++) { | |
| 61 window_total += get_quality_num (fqrec->qual.s[i], qualtype, fqrec, i); | |
| 62 } | |
| 63 | |
| 64 for (i=0; i <= fqrec->qual.l - window_size; i++) { | |
| 65 | |
| 66 window_avg = (double)window_total / (double)window_size; | |
| 67 | |
| 68 if (debug) printf ("no_fiveprime: %d, found 5prime: %d, window_avg: %f\n", no_fiveprime, found_five_prime, window_avg); | |
| 69 | |
| 70 /* Finding the 5' cutoff */ | |
| 71 /* Find when the average quality in the window goes above the threshold starting from the 5' end */ | |
| 72 if (no_fiveprime == 0 && found_five_prime == 0 && window_avg >= qual_threshold) { | |
| 73 | |
| 74 if (debug) printf ("inside 5-prime cut\n"); | |
| 75 | |
| 76 /* at what point in the window does the quality go above the threshold? */ | |
| 77 for (j=window_start; j<window_start+window_size; j++) { | |
| 78 if (get_quality_num (fqrec->qual.s[j], qualtype, fqrec, j) >= qual_threshold) { | |
| 79 five_prime_cut = j; | |
| 80 break; | |
| 81 } | |
| 82 } | |
| 83 | |
| 84 if (debug) printf ("five_prime_cut: %d\n", five_prime_cut); | |
| 85 | |
| 86 found_five_prime = 1; | |
| 87 } | |
| 88 | |
| 89 /* Finding the 3' cutoff */ | |
| 90 /* if the average quality in the window is less than the threshold */ | |
| 91 /* or if the window is the last window in the read */ | |
| 92 if ((window_avg < qual_threshold || | |
| 93 window_start+window_size > fqrec->qual.l) && (found_five_prime == 1 || no_fiveprime)) { | |
| 94 | |
| 95 /* at what point in the window does the quality dip below the threshold? */ | |
| 96 for (j=window_start; j<window_start+window_size; j++) { | |
| 97 if (get_quality_num (fqrec->qual.s[j], qualtype, fqrec, j) < qual_threshold) { | |
| 98 three_prime_cut = j; | |
| 99 break; | |
| 100 } | |
| 101 } | |
| 102 | |
| 103 break; | |
| 104 } | |
| 105 | |
| 106 /* instead of sliding the window, subtract the first qual and add the next qual */ | |
| 107 window_total -= get_quality_num (fqrec->qual.s[window_start], qualtype, fqrec, window_start); | |
| 108 if (window_start+window_size < fqrec->qual.l) { | |
| 109 window_total += get_quality_num (fqrec->qual.s[window_start+window_size], qualtype, fqrec, window_start+window_size); | |
| 110 } | |
| 111 window_start++; | |
| 112 } | |
| 113 | |
| 114 | |
| 115 /* If truncate N option is selected, and sequence has Ns, then */ | |
| 116 /* change 3' cut site to be the base before the first N */ | |
| 117 if (trunc_n && ((npos = strstr(fqrec->seq.s, "N")) || (npos = strstr(fqrec->seq.s, "n")))) { | |
| 118 three_prime_cut = npos - fqrec->seq.s; | |
| 119 } | |
| 120 | |
| 121 /* if cutting length is less than threshold then return -1 for both */ | |
| 122 /* to indicate that the read should be discarded */ | |
| 123 /* Also, if you never find a five prime cut site, then discard whole read */ | |
| 124 if ((found_five_prime == 0 && !no_fiveprime) || (three_prime_cut - five_prime_cut < length_threshold)) { | |
| 125 three_prime_cut = -1; | |
| 126 five_prime_cut = -1; | |
| 127 | |
| 128 if (debug) printf("%s\n", fqrec->name.s); | |
| 129 } | |
| 130 | |
| 131 if (debug) printf ("\n\n"); | |
| 132 | |
| 133 retvals = (cutsites*) malloc (sizeof(cutsites)); | |
| 134 retvals->three_prime_cut = three_prime_cut; | |
| 135 retvals->five_prime_cut = five_prime_cut; | |
| 136 return (retvals); | |
| 137 } |
