6
|
1 /**********************************************************************
|
|
2 * Author: Jonathan Richards
|
|
3 * Date: 12/11/2014
|
|
4 *
|
|
5 * This tool removes tandem repeats from ends of unaligned sequencing
|
|
6 * reads (leaving one copy). This prevents reads that don't span the
|
|
7 * repeated region from overlapping and leading to innaccurate SNPs
|
|
8 * calls.
|
|
9 *
|
|
10 * The maximimum repeat length is adjustable (use 1 to trim only
|
|
11 * homopolymers).
|
|
12 *
|
|
13 * The "aggressive" option should not be touched in general. Setting to
|
|
14 * 0 will prevent the program from trimming to exactly 1 copy of the
|
|
15 * repeat, instead leaving between 1 and 2 copies. Why this would be
|
|
16 * useful, I don't know.
|
|
17 *
|
|
18 * This program could also be a useful first step before assembly. More
|
|
19 * testing needs to be done.
|
|
20 *
|
|
21 * Special thanks to my advisor, Professor Alison Gammie, for bringing
|
|
22 * this problem to my attention and to my reseach partner, Mitchell
|
|
23 * Vollger, for help testing and finalizing.
|
|
24 **********************************************************************/
|
|
25
|
|
26 #include <stdio.h>
|
|
27 #include <stdlib.h>
|
|
28 #include <stdbool.h>
|
|
29 #include <sys/types.h>
|
|
30 #include <assert.h>
|
|
31 #include <errno.h>
|
|
32
|
|
33 //use my getline for portability
|
|
34 //adapted from getline.c written by Jan Brittenson, bson@gnu.ai.mit.edu
|
|
35 //http://www.opensource.apple.com/source/cvs/cvs-19/cvs/lib/getline.c
|
|
36 ssize_t getline(char** lineptr, size_t* n, FILE* stream) {
|
|
37 size_t nchars_avail;
|
|
38 char* read_pos;
|
|
39 int save_errno;
|
|
40 ssize_t ret;
|
|
41 register int c;
|
|
42
|
|
43 if (!lineptr || !n || !stream) {
|
|
44 errno = EINVAL;
|
|
45 return -1;
|
|
46 }
|
|
47 if (!*lineptr) {
|
|
48 *n = 128;
|
|
49 *lineptr = malloc(*n);
|
|
50 if (!*lineptr) {
|
|
51 errno = ENOMEM;
|
|
52 return -1;
|
|
53 }
|
|
54 }
|
|
55 nchars_avail = *n;
|
|
56 read_pos = *lineptr;
|
|
57 for (;;) {
|
|
58 c = getc(stream);
|
|
59 save_errno = errno;
|
|
60 if (c != '\r') { //for portability...
|
|
61 assert((*lineptr+*n)==(read_pos+nchars_avail));
|
|
62 if (nchars_avail < 2) {
|
|
63 *n *= 2;
|
|
64 nchars_avail = *n + *lineptr - read_pos;
|
|
65 *lineptr = realloc(*lineptr, *n);
|
|
66 if (!*lineptr) {
|
|
67 errno = ENOMEM;
|
|
68 return -1;
|
|
69 }
|
|
70 read_pos = *n - nchars_avail + *lineptr;
|
|
71 assert((*lineptr+*n) == (read_pos+nchars_avail));
|
|
72 }
|
|
73 if (ferror(stream)) {
|
|
74 errno = save_errno;
|
|
75 return -1;
|
|
76 }
|
|
77 if (c == EOF) {
|
|
78 if (read_pos == *lineptr)
|
|
79 return -1;
|
|
80 else
|
|
81 break;
|
|
82 }
|
|
83 *read_pos++ = c;
|
|
84 nchars_avail--;
|
|
85 if (c == '\n')
|
|
86 break;
|
|
87 }
|
|
88 }
|
|
89 *read_pos = '\0';
|
|
90 ret = read_pos - *lineptr;
|
|
91 return ret;
|
|
92 }
|
|
93
|
|
94 int main(int argc, char *argv[]) {
|
|
95 char *line = NULL;
|
|
96 size_t len = 0;
|
|
97 ssize_t line_length;
|
|
98
|
|
99 int count = 0;
|
|
100 size_t leftTrim = 0;
|
|
101 size_t rightTrim = 0;
|
|
102 size_t i;
|
|
103 size_t i_max = 10;
|
|
104 size_t j;
|
|
105 size_t r;
|
|
106 size_t length;
|
|
107 size_t longest_region;
|
|
108 char *ptr;
|
|
109 bool matched = false;
|
|
110 bool aggressive_trim = true;
|
|
111
|
|
112 FILE *file = fopen(argv[1], "r");
|
|
113
|
|
114 if (argc >= 3) {
|
|
115 i_max = strtol(argv[2], &ptr, 10);
|
|
116 if (argc >= 4) {
|
|
117 aggressive_trim = strtol(argv[3], &ptr, 10);
|
|
118 }
|
|
119 }
|
|
120 if (file != NULL) {
|
|
121 while ((line_length = getline(&line, &len, file)) != -1) {
|
|
122 count++;
|
|
123 switch (count) {
|
|
124
|
|
125 //read name
|
|
126 case 1:
|
|
127 fputs(line, stdout);
|
|
128 break;
|
|
129
|
|
130 //read sequence
|
|
131 case 2:
|
|
132 //find leftTrim
|
|
133 longest_region = 0;
|
|
134 for (i=1; i<=i_max && i<=line_length/2; i++) { //size of repeat
|
|
135 if (line[0] == line[i]) {
|
|
136 matched = true;
|
|
137 j=1;
|
|
138 r=0;
|
|
139 while (matched == true) {
|
|
140 if (j == i) {
|
|
141 r++;
|
|
142 j=0;
|
|
143 } else if (line[j] != line[(r+1)*i+j]) {
|
|
144 //no length comparison needed because of \n at end
|
|
145 matched = false;
|
|
146 if (aggressive_trim) {
|
|
147 length = r*i+j;
|
|
148 } else {
|
|
149 length = r*i;
|
|
150 }
|
|
151 if (length > longest_region && r>0) {
|
|
152 longest_region = length;
|
|
153 }
|
|
154 } else {
|
|
155 j++;
|
|
156 }
|
|
157 }
|
|
158
|
|
159 }
|
|
160 }
|
|
161 leftTrim = longest_region;
|
|
162
|
|
163 //find rightTrim
|
|
164 longest_region = 0;
|
|
165 for (i=1; i<=i_max && i<=line_length/2; i++) { //size of repeat
|
|
166 if (line[line_length-2] == line[line_length-2-i]) {
|
|
167 matched = true;
|
|
168 j=1;
|
|
169 r=0;
|
|
170 while (matched == true) {
|
|
171 if (j == i) {
|
|
172 r++;
|
|
173 j=0;
|
|
174 } else if ((line[line_length-2-j] != line[line_length-2-(r+1)*i-j])
|
|
175 || line_length-2-(r+1)*i-j == leftTrim) {
|
|
176 matched = false;
|
|
177 if (aggressive_trim) {
|
|
178 length = r*i+j;
|
|
179 } else {
|
|
180 length = r*i;
|
|
181 }
|
|
182 if (length > longest_region && r>0) {
|
|
183 longest_region = length;
|
|
184 }
|
|
185 } else {
|
|
186 j++;
|
|
187 }
|
|
188 }
|
|
189 }
|
|
190 }
|
|
191 rightTrim = line_length-longest_region-1;
|
|
192
|
|
193 //print trimmed line
|
|
194 line[rightTrim] = '\n';
|
|
195 line[rightTrim+1] = '\0';
|
|
196 fputs(line+leftTrim, stdout);
|
|
197 break;
|
|
198
|
|
199 //+
|
|
200 case 3:
|
|
201 fputs(line, stdout);
|
|
202 break;
|
|
203
|
|
204 //read qualities
|
|
205 case 4:
|
|
206 count = 0; //reset to read title
|
|
207 line[rightTrim] = '\n';
|
|
208 line[rightTrim+1] = '\0';
|
|
209 fputs(line+leftTrim, stdout);
|
|
210 break;
|
|
211 default:
|
|
212 break;
|
|
213 }
|
|
214 }
|
|
215 free(line);
|
|
216 fclose(file);
|
|
217 } else {
|
|
218 perror(argv[1]);
|
|
219 }
|
|
220 return 0;
|
|
221 } |