view myTools/trtr.c @ 6:f9f71cf4c3e3

Uploaded
author mrvollger
date Thu, 11 Dec 2014 15:42:44 -0500
parents
children
line wrap: on
line source

/**********************************************************************
* Author: Jonathan Richards
* Date: 12/11/2014
*
* This tool removes tandem repeats from ends of unaligned sequencing
* reads (leaving one copy). This prevents reads that don't span the
* repeated region from overlapping and leading to innaccurate SNPs
* calls.
* 
* The maximimum repeat length is adjustable (use 1 to trim only
* homopolymers).
*
* The "aggressive" option should not be touched in general. Setting to
* 0 will prevent the program from trimming to exactly 1 copy of the
* repeat, instead leaving between 1 and 2 copies. Why this would be
* useful, I don't know.
* 
* This program could also be a useful first step before assembly. More
* testing needs to be done.
*
* Special thanks to my advisor, Professor Alison Gammie, for bringing
* this problem to my attention and to my reseach partner, Mitchell
* Vollger, for help testing and finalizing.
**********************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <sys/types.h>
#include <assert.h>
#include <errno.h>

//use my getline for portability
//adapted from getline.c written by Jan Brittenson, bson@gnu.ai.mit.edu
//http://www.opensource.apple.com/source/cvs/cvs-19/cvs/lib/getline.c
ssize_t getline(char** lineptr, size_t* n, FILE* stream) {
	size_t nchars_avail;
	char* read_pos;
	int save_errno;
	ssize_t ret;
	register int c;

	if (!lineptr || !n || !stream) {
		errno = EINVAL;
		return -1;
	}
	if (!*lineptr) {
		*n = 128;
		*lineptr = malloc(*n);
		if (!*lineptr) {
			errno = ENOMEM;
			return -1;
		}
	}
	nchars_avail = *n;
	read_pos = *lineptr;
	for (;;) {
		c = getc(stream);
		save_errno = errno;
		if (c != '\r') { //for portability...
			assert((*lineptr+*n)==(read_pos+nchars_avail));
			if (nchars_avail < 2) {
				*n *= 2;
				nchars_avail = *n + *lineptr - read_pos;
				*lineptr = realloc(*lineptr, *n);
				if (!*lineptr) {
					errno = ENOMEM;
					return -1;
				}
				read_pos = *n - nchars_avail + *lineptr;
				assert((*lineptr+*n) == (read_pos+nchars_avail));
			}
			if (ferror(stream)) {
				errno = save_errno;
				return -1;
			}
			if (c == EOF) {
				if (read_pos == *lineptr)
					return -1;
				else
					break;
			}
			*read_pos++ = c;
			nchars_avail--;
			if (c == '\n')
				break;
		}
	}
	*read_pos = '\0';
	ret = read_pos - *lineptr;
	return ret;
}

int main(int argc, char *argv[]) {
	char *line = NULL;
	size_t len = 0;
	ssize_t line_length;

	int count = 0;
	size_t leftTrim = 0;
	size_t rightTrim = 0;
	size_t i;
	size_t i_max = 10;
	size_t j;
	size_t r;
	size_t length;
	size_t longest_region;
	char *ptr;
	bool matched = false;
	bool aggressive_trim = true;
	
	FILE *file = fopen(argv[1], "r");

	if (argc >= 3) {
		i_max = strtol(argv[2], &ptr, 10);
		if (argc >= 4) {
			aggressive_trim = strtol(argv[3], &ptr, 10);
		}
	}
	if (file != NULL) {
		while ((line_length = getline(&line, &len, file)) != -1) {
			count++;
			switch (count) {

				//read name
				case 1:
					fputs(line, stdout);
					break;
				
				//read sequence
				case 2:
					//find leftTrim
					longest_region = 0;
					for (i=1; i<=i_max && i<=line_length/2; i++) { //size of repeat
						if (line[0] == line[i]) {
							matched = true;
							j=1;
							r=0;
							while (matched == true) {
								if (j == i) {
									r++;
									j=0;
								} else if (line[j] != line[(r+1)*i+j]) {
								//no length comparison needed because of \n at end
									matched = false;
									if (aggressive_trim) {
										length = r*i+j;
									} else {
										length = r*i;
									}
									if (length > longest_region && r>0) {
										longest_region = length;
									}
								} else {
									j++;
								}
							}
							
						}
					}
					leftTrim = longest_region;

					//find rightTrim
					longest_region = 0;
					for (i=1; i<=i_max && i<=line_length/2; i++) { //size of repeat
						if (line[line_length-2] == line[line_length-2-i]) {
							matched = true;
							j=1;
							r=0;
							while (matched == true) {
								if (j == i) {
									r++;
									j=0;
								} else if ((line[line_length-2-j] != line[line_length-2-(r+1)*i-j]) 
									|| line_length-2-(r+1)*i-j == leftTrim) {
									matched = false;
									if (aggressive_trim) {
										length = r*i+j;
									} else {
										length = r*i;
									}
									if (length > longest_region && r>0) {
										longest_region = length;
									}
								} else {
									j++;
								}
							}
						}
					}
					rightTrim = line_length-longest_region-1;
					
					//print trimmed line
					line[rightTrim] = '\n';
					line[rightTrim+1] = '\0';
					fputs(line+leftTrim, stdout);
					break;
				
				//+
				case 3:
					fputs(line, stdout);
					break;
				
				//read qualities
				case 4:
					count = 0; //reset to read title
					line[rightTrim] = '\n';
					line[rightTrim+1] = '\0';
					fputs(line+leftTrim, stdout);
					break;
				default:
					break; 
			}
		}
		free(line);
		fclose(file);
	} else {
		perror(argv[1]);
	}
    return 0;
}