Mercurial > repos > mrvollger > trtr
comparison myTools/trtr.c @ 6:f9f71cf4c3e3
Uploaded
author | mrvollger |
---|---|
date | Thu, 11 Dec 2014 15:42:44 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:1009e9e20d5f | 6:f9f71cf4c3e3 |
---|---|
1 /********************************************************************** | |
2 * Author: Jonathan Richards | |
3 * Date: 12/11/2014 | |
4 * | |
5 * This tool removes tandem repeats from ends of unaligned sequencing | |
6 * reads (leaving one copy). This prevents reads that don't span the | |
7 * repeated region from overlapping and leading to innaccurate SNPs | |
8 * calls. | |
9 * | |
10 * The maximimum repeat length is adjustable (use 1 to trim only | |
11 * homopolymers). | |
12 * | |
13 * The "aggressive" option should not be touched in general. Setting to | |
14 * 0 will prevent the program from trimming to exactly 1 copy of the | |
15 * repeat, instead leaving between 1 and 2 copies. Why this would be | |
16 * useful, I don't know. | |
17 * | |
18 * This program could also be a useful first step before assembly. More | |
19 * testing needs to be done. | |
20 * | |
21 * Special thanks to my advisor, Professor Alison Gammie, for bringing | |
22 * this problem to my attention and to my reseach partner, Mitchell | |
23 * Vollger, for help testing and finalizing. | |
24 **********************************************************************/ | |
25 | |
26 #include <stdio.h> | |
27 #include <stdlib.h> | |
28 #include <stdbool.h> | |
29 #include <sys/types.h> | |
30 #include <assert.h> | |
31 #include <errno.h> | |
32 | |
33 //use my getline for portability | |
34 //adapted from getline.c written by Jan Brittenson, bson@gnu.ai.mit.edu | |
35 //http://www.opensource.apple.com/source/cvs/cvs-19/cvs/lib/getline.c | |
36 ssize_t getline(char** lineptr, size_t* n, FILE* stream) { | |
37 size_t nchars_avail; | |
38 char* read_pos; | |
39 int save_errno; | |
40 ssize_t ret; | |
41 register int c; | |
42 | |
43 if (!lineptr || !n || !stream) { | |
44 errno = EINVAL; | |
45 return -1; | |
46 } | |
47 if (!*lineptr) { | |
48 *n = 128; | |
49 *lineptr = malloc(*n); | |
50 if (!*lineptr) { | |
51 errno = ENOMEM; | |
52 return -1; | |
53 } | |
54 } | |
55 nchars_avail = *n; | |
56 read_pos = *lineptr; | |
57 for (;;) { | |
58 c = getc(stream); | |
59 save_errno = errno; | |
60 if (c != '\r') { //for portability... | |
61 assert((*lineptr+*n)==(read_pos+nchars_avail)); | |
62 if (nchars_avail < 2) { | |
63 *n *= 2; | |
64 nchars_avail = *n + *lineptr - read_pos; | |
65 *lineptr = realloc(*lineptr, *n); | |
66 if (!*lineptr) { | |
67 errno = ENOMEM; | |
68 return -1; | |
69 } | |
70 read_pos = *n - nchars_avail + *lineptr; | |
71 assert((*lineptr+*n) == (read_pos+nchars_avail)); | |
72 } | |
73 if (ferror(stream)) { | |
74 errno = save_errno; | |
75 return -1; | |
76 } | |
77 if (c == EOF) { | |
78 if (read_pos == *lineptr) | |
79 return -1; | |
80 else | |
81 break; | |
82 } | |
83 *read_pos++ = c; | |
84 nchars_avail--; | |
85 if (c == '\n') | |
86 break; | |
87 } | |
88 } | |
89 *read_pos = '\0'; | |
90 ret = read_pos - *lineptr; | |
91 return ret; | |
92 } | |
93 | |
94 int main(int argc, char *argv[]) { | |
95 char *line = NULL; | |
96 size_t len = 0; | |
97 ssize_t line_length; | |
98 | |
99 int count = 0; | |
100 size_t leftTrim = 0; | |
101 size_t rightTrim = 0; | |
102 size_t i; | |
103 size_t i_max = 10; | |
104 size_t j; | |
105 size_t r; | |
106 size_t length; | |
107 size_t longest_region; | |
108 char *ptr; | |
109 bool matched = false; | |
110 bool aggressive_trim = true; | |
111 | |
112 FILE *file = fopen(argv[1], "r"); | |
113 | |
114 if (argc >= 3) { | |
115 i_max = strtol(argv[2], &ptr, 10); | |
116 if (argc >= 4) { | |
117 aggressive_trim = strtol(argv[3], &ptr, 10); | |
118 } | |
119 } | |
120 if (file != NULL) { | |
121 while ((line_length = getline(&line, &len, file)) != -1) { | |
122 count++; | |
123 switch (count) { | |
124 | |
125 //read name | |
126 case 1: | |
127 fputs(line, stdout); | |
128 break; | |
129 | |
130 //read sequence | |
131 case 2: | |
132 //find leftTrim | |
133 longest_region = 0; | |
134 for (i=1; i<=i_max && i<=line_length/2; i++) { //size of repeat | |
135 if (line[0] == line[i]) { | |
136 matched = true; | |
137 j=1; | |
138 r=0; | |
139 while (matched == true) { | |
140 if (j == i) { | |
141 r++; | |
142 j=0; | |
143 } else if (line[j] != line[(r+1)*i+j]) { | |
144 //no length comparison needed because of \n at end | |
145 matched = false; | |
146 if (aggressive_trim) { | |
147 length = r*i+j; | |
148 } else { | |
149 length = r*i; | |
150 } | |
151 if (length > longest_region && r>0) { | |
152 longest_region = length; | |
153 } | |
154 } else { | |
155 j++; | |
156 } | |
157 } | |
158 | |
159 } | |
160 } | |
161 leftTrim = longest_region; | |
162 | |
163 //find rightTrim | |
164 longest_region = 0; | |
165 for (i=1; i<=i_max && i<=line_length/2; i++) { //size of repeat | |
166 if (line[line_length-2] == line[line_length-2-i]) { | |
167 matched = true; | |
168 j=1; | |
169 r=0; | |
170 while (matched == true) { | |
171 if (j == i) { | |
172 r++; | |
173 j=0; | |
174 } else if ((line[line_length-2-j] != line[line_length-2-(r+1)*i-j]) | |
175 || line_length-2-(r+1)*i-j == leftTrim) { | |
176 matched = false; | |
177 if (aggressive_trim) { | |
178 length = r*i+j; | |
179 } else { | |
180 length = r*i; | |
181 } | |
182 if (length > longest_region && r>0) { | |
183 longest_region = length; | |
184 } | |
185 } else { | |
186 j++; | |
187 } | |
188 } | |
189 } | |
190 } | |
191 rightTrim = line_length-longest_region-1; | |
192 | |
193 //print trimmed line | |
194 line[rightTrim] = '\n'; | |
195 line[rightTrim+1] = '\0'; | |
196 fputs(line+leftTrim, stdout); | |
197 break; | |
198 | |
199 //+ | |
200 case 3: | |
201 fputs(line, stdout); | |
202 break; | |
203 | |
204 //read qualities | |
205 case 4: | |
206 count = 0; //reset to read title | |
207 line[rightTrim] = '\n'; | |
208 line[rightTrim+1] = '\0'; | |
209 fputs(line+leftTrim, stdout); | |
210 break; | |
211 default: | |
212 break; | |
213 } | |
214 } | |
215 free(line); | |
216 fclose(file); | |
217 } else { | |
218 perror(argv[1]); | |
219 } | |
220 return 0; | |
221 } |