comparison myTools/trtr.c @ 6:f9f71cf4c3e3

Uploaded
author mrvollger
date Thu, 11 Dec 2014 15:42:44 -0500
parents
children
comparison
equal deleted inserted replaced
5:1009e9e20d5f 6:f9f71cf4c3e3
1 /**********************************************************************
2 * Author: Jonathan Richards
3 * Date: 12/11/2014
4 *
5 * This tool removes tandem repeats from ends of unaligned sequencing
6 * reads (leaving one copy). This prevents reads that don't span the
7 * repeated region from overlapping and leading to innaccurate SNPs
8 * calls.
9 *
10 * The maximimum repeat length is adjustable (use 1 to trim only
11 * homopolymers).
12 *
13 * The "aggressive" option should not be touched in general. Setting to
14 * 0 will prevent the program from trimming to exactly 1 copy of the
15 * repeat, instead leaving between 1 and 2 copies. Why this would be
16 * useful, I don't know.
17 *
18 * This program could also be a useful first step before assembly. More
19 * testing needs to be done.
20 *
21 * Special thanks to my advisor, Professor Alison Gammie, for bringing
22 * this problem to my attention and to my reseach partner, Mitchell
23 * Vollger, for help testing and finalizing.
24 **********************************************************************/
25
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <stdbool.h>
29 #include <sys/types.h>
30 #include <assert.h>
31 #include <errno.h>
32
33 //use my getline for portability
34 //adapted from getline.c written by Jan Brittenson, bson@gnu.ai.mit.edu
35 //http://www.opensource.apple.com/source/cvs/cvs-19/cvs/lib/getline.c
36 ssize_t getline(char** lineptr, size_t* n, FILE* stream) {
37 size_t nchars_avail;
38 char* read_pos;
39 int save_errno;
40 ssize_t ret;
41 register int c;
42
43 if (!lineptr || !n || !stream) {
44 errno = EINVAL;
45 return -1;
46 }
47 if (!*lineptr) {
48 *n = 128;
49 *lineptr = malloc(*n);
50 if (!*lineptr) {
51 errno = ENOMEM;
52 return -1;
53 }
54 }
55 nchars_avail = *n;
56 read_pos = *lineptr;
57 for (;;) {
58 c = getc(stream);
59 save_errno = errno;
60 if (c != '\r') { //for portability...
61 assert((*lineptr+*n)==(read_pos+nchars_avail));
62 if (nchars_avail < 2) {
63 *n *= 2;
64 nchars_avail = *n + *lineptr - read_pos;
65 *lineptr = realloc(*lineptr, *n);
66 if (!*lineptr) {
67 errno = ENOMEM;
68 return -1;
69 }
70 read_pos = *n - nchars_avail + *lineptr;
71 assert((*lineptr+*n) == (read_pos+nchars_avail));
72 }
73 if (ferror(stream)) {
74 errno = save_errno;
75 return -1;
76 }
77 if (c == EOF) {
78 if (read_pos == *lineptr)
79 return -1;
80 else
81 break;
82 }
83 *read_pos++ = c;
84 nchars_avail--;
85 if (c == '\n')
86 break;
87 }
88 }
89 *read_pos = '\0';
90 ret = read_pos - *lineptr;
91 return ret;
92 }
93
94 int main(int argc, char *argv[]) {
95 char *line = NULL;
96 size_t len = 0;
97 ssize_t line_length;
98
99 int count = 0;
100 size_t leftTrim = 0;
101 size_t rightTrim = 0;
102 size_t i;
103 size_t i_max = 10;
104 size_t j;
105 size_t r;
106 size_t length;
107 size_t longest_region;
108 char *ptr;
109 bool matched = false;
110 bool aggressive_trim = true;
111
112 FILE *file = fopen(argv[1], "r");
113
114 if (argc >= 3) {
115 i_max = strtol(argv[2], &ptr, 10);
116 if (argc >= 4) {
117 aggressive_trim = strtol(argv[3], &ptr, 10);
118 }
119 }
120 if (file != NULL) {
121 while ((line_length = getline(&line, &len, file)) != -1) {
122 count++;
123 switch (count) {
124
125 //read name
126 case 1:
127 fputs(line, stdout);
128 break;
129
130 //read sequence
131 case 2:
132 //find leftTrim
133 longest_region = 0;
134 for (i=1; i<=i_max && i<=line_length/2; i++) { //size of repeat
135 if (line[0] == line[i]) {
136 matched = true;
137 j=1;
138 r=0;
139 while (matched == true) {
140 if (j == i) {
141 r++;
142 j=0;
143 } else if (line[j] != line[(r+1)*i+j]) {
144 //no length comparison needed because of \n at end
145 matched = false;
146 if (aggressive_trim) {
147 length = r*i+j;
148 } else {
149 length = r*i;
150 }
151 if (length > longest_region && r>0) {
152 longest_region = length;
153 }
154 } else {
155 j++;
156 }
157 }
158
159 }
160 }
161 leftTrim = longest_region;
162
163 //find rightTrim
164 longest_region = 0;
165 for (i=1; i<=i_max && i<=line_length/2; i++) { //size of repeat
166 if (line[line_length-2] == line[line_length-2-i]) {
167 matched = true;
168 j=1;
169 r=0;
170 while (matched == true) {
171 if (j == i) {
172 r++;
173 j=0;
174 } else if ((line[line_length-2-j] != line[line_length-2-(r+1)*i-j])
175 || line_length-2-(r+1)*i-j == leftTrim) {
176 matched = false;
177 if (aggressive_trim) {
178 length = r*i+j;
179 } else {
180 length = r*i;
181 }
182 if (length > longest_region && r>0) {
183 longest_region = length;
184 }
185 } else {
186 j++;
187 }
188 }
189 }
190 }
191 rightTrim = line_length-longest_region-1;
192
193 //print trimmed line
194 line[rightTrim] = '\n';
195 line[rightTrim+1] = '\0';
196 fputs(line+leftTrim, stdout);
197 break;
198
199 //+
200 case 3:
201 fputs(line, stdout);
202 break;
203
204 //read qualities
205 case 4:
206 count = 0; //reset to read title
207 line[rightTrim] = '\n';
208 line[rightTrim+1] = '\0';
209 fputs(line+leftTrim, stdout);
210 break;
211 default:
212 break;
213 }
214 }
215 free(line);
216 fclose(file);
217 } else {
218 perror(argv[1]);
219 }
220 return 0;
221 }