diff tool-dependencies/trtrR-1.0.0/trtr.c @ 30:c92ea5a8dec6

planemo upload for repository https://github.com/lparsons/galaxy_tools/tree/master/tools/trtr commit 6f76316d57fa4ec6b16aeed61e197b2c117f006d-dirty
author lparsons
date Mon, 14 Mar 2016 16:18:55 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-dependencies/trtrR-1.0.0/trtr.c	Mon Mar 14 16:18:55 2016 -0400
@@ -0,0 +1,221 @@
+/**********************************************************************
+* Author: Jonathan Richards
+* Date: 12/11/2014
+*
+* This tool removes tandem repeats from ends of unaligned sequencing
+* reads (leaving one copy). This prevents reads that don't span the
+* repeated region from overlapping and leading to innaccurate SNPs
+* calls.
+* 
+* The maximimum repeat length is adjustable (use 1 to trim only
+* homopolymers).
+*
+* The "aggressive" option should not be touched in general. Setting to
+* 0 will prevent the program from trimming to exactly 1 copy of the
+* repeat, instead leaving between 1 and 2 copies. Why this would be
+* useful, I don't know.
+* 
+* This program could also be a useful first step before assembly. More
+* testing needs to be done.
+*
+* Special thanks to my advisor, Professor Alison Gammie, for bringing
+* this problem to my attention and to my reseach partner, Mitchell
+* Vollger, for help testing and finalizing.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <assert.h>
+#include <errno.h>
+
+//use my getline for portability
+//adapted from getline.c written by Jan Brittenson, bson@gnu.ai.mit.edu
+//http://www.opensource.apple.com/source/cvs/cvs-19/cvs/lib/getline.c
+ssize_t getline(char** lineptr, size_t* n, FILE* stream) {
+	size_t nchars_avail;
+	char* read_pos;
+	int save_errno;
+	ssize_t ret;
+	register int c;
+
+	if (!lineptr || !n || !stream) {
+		errno = EINVAL;
+		return -1;
+	}
+	if (!*lineptr) {
+		*n = 128;
+		*lineptr = malloc(*n);
+		if (!*lineptr) {
+			errno = ENOMEM;
+			return -1;
+		}
+	}
+	nchars_avail = *n;
+	read_pos = *lineptr;
+	for (;;) {
+		c = getc(stream);
+		save_errno = errno;
+		if (c != '\r') { //for portability...
+			assert((*lineptr+*n)==(read_pos+nchars_avail));
+			if (nchars_avail < 2) {
+				*n *= 2;
+				nchars_avail = *n + *lineptr - read_pos;
+				*lineptr = realloc(*lineptr, *n);
+				if (!*lineptr) {
+					errno = ENOMEM;
+					return -1;
+				}
+				read_pos = *n - nchars_avail + *lineptr;
+				assert((*lineptr+*n) == (read_pos+nchars_avail));
+			}
+			if (ferror(stream)) {
+				errno = save_errno;
+				return -1;
+			}
+			if (c == EOF) {
+				if (read_pos == *lineptr)
+					return -1;
+				else
+					break;
+			}
+			*read_pos++ = c;
+			nchars_avail--;
+			if (c == '\n')
+				break;
+		}
+	}
+	*read_pos = '\0';
+	ret = read_pos - *lineptr;
+	return ret;
+}
+
+int main(int argc, char *argv[]) {
+	char *line = NULL;
+	size_t len = 0;
+	ssize_t line_length;
+
+	int count = 0;
+	size_t leftTrim = 0;
+	size_t rightTrim = 0;
+	size_t i;
+	size_t i_max = 10;
+	size_t j;
+	size_t r;
+	size_t length;
+	size_t longest_region;
+	char *ptr;
+	bool matched = false;
+	bool aggressive_trim = true;
+	
+	FILE *file = fopen(argv[1], "r");
+
+	if (argc >= 3) {
+		i_max = strtol(argv[2], &ptr, 10);
+		if (argc >= 4) {
+			aggressive_trim = strtol(argv[3], &ptr, 10);
+		}
+	}
+	if (file != NULL) {
+		while ((line_length = getline(&line, &len, file)) != -1) {
+			count++;
+			switch (count) {
+
+				//read name
+				case 1:
+					fputs(line, stdout);
+					break;
+				
+				//read sequence
+				case 2:
+					//find leftTrim
+					longest_region = 0;
+					for (i=1; i<=i_max && i<=line_length/2; i++) { //size of repeat
+						if (line[0] == line[i]) {
+							matched = true;
+							j=1;
+							r=0;
+							while (matched == true) {
+								if (j == i) {
+									r++;
+									j=0;
+								} else if (line[j] != line[(r+1)*i+j]) {
+								//no length comparison needed because of \n at end
+									matched = false;
+									if (aggressive_trim) {
+										length = r*i+j;
+									} else {
+										length = r*i;
+									}
+									if (length > longest_region && r>0) {
+										longest_region = length;
+									}
+								} else {
+									j++;
+								}
+							}
+							
+						}
+					}
+					leftTrim = longest_region;
+
+					//find rightTrim
+					longest_region = 0;
+					for (i=1; i<=i_max && i<=line_length/2; i++) { //size of repeat
+						if (line[line_length-2] == line[line_length-2-i]) {
+							matched = true;
+							j=1;
+							r=0;
+							while (matched == true) {
+								if (j == i) {
+									r++;
+									j=0;
+								} else if ((line[line_length-2-j] != line[line_length-2-(r+1)*i-j]) 
+									|| line_length-2-(r+1)*i-j == leftTrim) {
+									matched = false;
+									if (aggressive_trim) {
+										length = r*i+j;
+									} else {
+										length = r*i;
+									}
+									if (length > longest_region && r>0) {
+										longest_region = length;
+									}
+								} else {
+									j++;
+								}
+							}
+						}
+					}
+					rightTrim = line_length-longest_region-1;
+					
+					//print trimmed line
+					line[rightTrim] = '\n';
+					line[rightTrim+1] = '\0';
+					fputs(line+leftTrim, stdout);
+					break;
+				
+				//+
+				case 3:
+					fputs(line, stdout);
+					break;
+				
+				//read qualities
+				case 4:
+					count = 0; //reset to read title
+					line[rightTrim] = '\n';
+					line[rightTrim+1] = '\0';
+					fputs(line+leftTrim, stdout);
+					break;
+				default:
+					break; 
+			}
+		}
+		free(line);
+		fclose(file);
+	} else {
+		perror(argv[1]);
+	}
+    return 0;
+}
\ No newline at end of file