annotate myTools/trtr.c @ 6:f9f71cf4c3e3

Uploaded
author mrvollger
date Thu, 11 Dec 2014 15:42:44 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
1 /**********************************************************************
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
2 * Author: Jonathan Richards
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
3 * Date: 12/11/2014
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
4 *
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
5 * This tool removes tandem repeats from ends of unaligned sequencing
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
6 * reads (leaving one copy). This prevents reads that don't span the
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
7 * repeated region from overlapping and leading to inaccurate SNP
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
8 * calls.
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
9 *
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
10 * The maximum repeat length is adjustable (use 1 to trim only
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
11 * homopolymers).
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
12 *
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
13 * The "aggressive" option should not be touched in general. Setting to
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
14 * 0 will prevent the program from trimming to exactly 1 copy of the
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
15 * repeat, instead leaving between 1 and 2 copies. Why this would be
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
16 * useful, I don't know.
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
17 *
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
18 * This program could also be a useful first step before assembly. More
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
19 * testing needs to be done.
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
20 *
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
21 * Special thanks to my advisor, Professor Alison Gammie, for bringing
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
22 * this problem to my attention and to my research partner, Mitchell
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
23 * Vollger, for help testing and finalizing.
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
24 **********************************************************************/
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
25
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
26 #include <stdio.h>
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
27 #include <stdlib.h>
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
28 #include <stdbool.h>
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
29 #include <sys/types.h>
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
30 #include <assert.h>
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
31 #include <errno.h>
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
32
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
33 //use my getline for portability
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
34 //adapted from getline.c written by Jan Brittenson, bson@gnu.ai.mit.edu
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
35 //http://www.opensource.apple.com/source/cvs/cvs-19/cvs/lib/getline.c
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
/*
 * Portable stand-in for POSIX getline().
 *
 * Reads one line from `stream` into the heap buffer `*lineptr`, growing it
 * as needed and keeping `*n` equal to its capacity. Carriage returns are
 * dropped so that CRLF input yields the same text as LF input. The stored
 * line keeps its terminating '\n' (when one was read) and is always
 * NUL-terminated.
 *
 * lineptr: address of the buffer pointer; if `*lineptr` is NULL a buffer is
 *          allocated here. The caller frees it.
 * n:       address of the buffer capacity in bytes.
 * stream:  stream to read from.
 *
 * Returns the number of characters stored (excluding the NUL), or -1 at end
 * of input with nothing read, on a stream error, or on failure:
 *   EINVAL - one of the arguments is NULL
 *   ENOMEM - the buffer could not be allocated or enlarged
 * When enlarging fails, `*lineptr` and `*n` still describe the old, valid
 * buffer, so the caller can free it.
 */
ssize_t getline(char** lineptr, size_t* n, FILE* stream) {
    enum { INITIAL_CAPACITY = 128 };
    size_t used = 0; /* characters stored so far */

    if (!lineptr || !n || !stream) {
        errno = EINVAL;
        return -1;
    }
    if (!*lineptr) {
        char *fresh = malloc(INITIAL_CAPACITY);
        if (!fresh) {
            errno = ENOMEM;
            return -1;
        }
        *lineptr = fresh;
        *n = INITIAL_CAPACITY;
    }

    for (;;) {
        int c = getc(stream);
        int save_errno = errno; /* ferror() below must not disturb it */

        if (c == '\r') {
            continue; /* for portability: treat CRLF like LF */
        }
        if (ferror(stream)) {
            errno = save_errno;
            return -1;
        }
        if (c == EOF) {
            if (used == 0) {
                return -1;
            }
            break; /* final line without a trailing newline */
        }

        /* Keep room for this character plus the terminating NUL. */
        if (*n - used < 2) {
            size_t bigger = *n ? *n * 2 : INITIAL_CAPACITY;
            char *grown;

            if (bigger <= *n) { /* doubling wrapped around */
                errno = ENOMEM;
                return -1;
            }
            /* Assign only on success so the old buffer is never lost. */
            grown = realloc(*lineptr, bigger);
            if (!grown) {
                errno = ENOMEM;
                return -1;
            }
            *lineptr = grown;
            *n = bigger;
        }

        (*lineptr)[used++] = (char)c;
        if (c == '\n') {
            break;
        }
    }

    (*lineptr)[used] = '\0';
    return (ssize_t)used;
}
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
93
f9f71cf4c3e3 Uploaded
mrvollger
parents:
diff changeset
/* Length of a line's content, not counting one trailing '\n' if present. */
static size_t content_length(const char *line, size_t len) {
    return (len > 0 && line[len - 1] == '\n') ? len - 1 : len;
}

/* Parse a whole string as a base-10 long; false if it is not a clean number. */
static bool parse_long(const char *text, long *value) {
    char *end;

    errno = 0;
    *value = strtol(text, &end, 10);
    return end != text && *end == '\0' && errno != ERANGE;
}

/*
 * Convert a periodic run into the number of characters to remove.
 * `run` is how many consecutive positions matched the character one period
 * away. A run shorter than the period means there is no complete extra
 * copy, so nothing is removed. Aggressive mode removes the whole run
 * (leaving exactly one copy); otherwise only whole copies are removed.
 */
static size_t repeat_trim_length(size_t run, size_t period, bool aggressive) {
    if (run < period) {
        return 0;
    }
    return aggressive ? run : run - run % period;
}

/* Longest removable tandem-repeat prefix of seq[0..len) over all periods up to max_period. */
static size_t leading_repeat_length(const char *seq, size_t len,
                                    size_t max_period, bool aggressive) {
    size_t best = 0;

    for (size_t period = 1; period <= max_period && period <= len / 2; period++) {
        size_t run = 0;
        size_t trim;

        while (run + period < len && seq[run] == seq[run + period]) {
            run++;
        }
        trim = repeat_trim_length(run, period, aggressive);
        if (trim > best) {
            best = trim;
        }
    }
    return best;
}

/*
 * Longest removable tandem-repeat suffix of seq[0..len) over all periods up
 * to max_period. The scan never compares against a position at or below
 * `left_trim`, so it cannot run off the front of the buffer or overlap the
 * prefix that is already being removed.
 */
static size_t trailing_repeat_length(const char *seq, size_t len, size_t left_trim,
                                     size_t max_period, bool aggressive) {
    size_t best = 0;

    for (size_t period = 1; period <= max_period && period <= len / 2; period++) {
        size_t run = 0;
        size_t trim;

        /* seq[len-1-run] is compared with seq[len-1-run-period]; the bound
           keeps that second index strictly greater than left_trim. */
        while (run + period + left_trim + 1 < len
               && seq[len - 1 - run] == seq[len - 1 - run - period]) {
            run++;
        }
        trim = repeat_trim_length(run, period, aggressive);
        if (trim > best) {
            best = trim;
        }
    }
    return best;
}

/* Write line[from..to) followed by a newline, clamping the range to len. */
static void put_slice(const char *line, size_t len, size_t from, size_t to) {
    if (to > len) {
        to = len;
    }
    if (from > to) {
        from = to;
    }
    fwrite(line + from, 1, to - from, stdout);
    putchar('\n');
}

/*
 * Trim tandem repeats from both ends of every read in a file of four-line
 * records (name, sequence, '+' line, qualities) and write the result to
 * stdout. Name and '+' lines are copied unchanged; the quality line is cut
 * at the same offsets as its sequence.
 *
 * argv[1]: input file (required)
 * argv[2]: maximum repeat unit length, a non-negative integer (default 10;
 *          0 disables trimming)
 * argv[3]: aggressive flag, an integer (default 1). Non-zero trims down to
 *          exactly one copy of the repeat; 0 removes whole copies only.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE on bad arguments, when the input
 * cannot be opened, or when reading it fails.
 */
int main(int argc, char *argv[]) {
    char *line = NULL;
    size_t capacity = 0;
    ssize_t line_length;
    size_t max_period = 10;
    bool aggressive_trim = true;
    size_t left_trim = 0;  /* first kept offset of the current read */
    size_t right_end = 0;  /* one past the last kept offset of the current read */
    int field = 0;         /* 1..4: position inside the current record */
    int status = EXIT_SUCCESS;
    FILE *file;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <reads> [max_repeat_length] [aggressive]\n",
                argc > 0 ? argv[0] : "trtr");
        return EXIT_FAILURE;
    }
    if (argc >= 3) {
        long value;

        if (!parse_long(argv[2], &value) || value < 0) {
            fprintf(stderr, "%s: invalid maximum repeat length '%s'\n", argv[0], argv[2]);
            return EXIT_FAILURE;
        }
        max_period = (size_t)value;
    }
    if (argc >= 4) {
        long value;

        if (!parse_long(argv[3], &value)) {
            fprintf(stderr, "%s: invalid aggressive flag '%s'\n", argv[0], argv[3]);
            return EXIT_FAILURE;
        }
        aggressive_trim = (value != 0);
    }

    file = fopen(argv[1], "r");
    if (file == NULL) {
        perror(argv[1]);
        return EXIT_FAILURE;
    }

    while ((line_length = getline(&line, &capacity, file)) != -1) {
        size_t len = content_length(line, (size_t)line_length);

        field++;
        switch (field) {

            //read name
            case 1:
                fputs(line, stdout);
                break;

            //read sequence
            case 2:
                left_trim = leading_repeat_length(line, len, max_period, aggressive_trim);
                right_end = len - trailing_repeat_length(line, len, left_trim,
                                                         max_period, aggressive_trim);
                put_slice(line, len, left_trim, right_end);
                break;

            //+
            case 3:
                fputs(line, stdout);
                break;

            //read qualities
            case 4:
                field = 0; //reset to read title
                put_slice(line, len, left_trim, right_end);
                break;

            default:
                break;
        }
    }

    /* getline() returns -1 for both end of input and a read error. */
    if (ferror(file)) {
        perror(argv[1]);
        status = EXIT_FAILURE;
    }

    free(line);
    fclose(file);
    return status;
}