Mercurial > repos > portiahollyoak > fastuniq
view source/fastq_uniq.c @ 0:816cb55b5a2d draft default tip
planemo upload for repository https://github.com/portiahollyoak/Tools commit c4769fd68ad9583d4b9dbdf212e4ecb5968cef1c-dirty
| author | portiahollyoak | 
|---|---|
| date | Thu, 02 Jun 2016 11:34:51 -0400 | 
| parents | |
| children | 
line wrap: on
 line source
/* This program was used to remove duplicates in paired FASTQ sequences, * which is usually appeared in mate pair libraries. * * This file and its partner was written by Haibin Xu, December 2011. */ #ifndef MAX_FILE_NUMBER #define MAX_FILE_NUMBER 1000 #endif #include <unistd.h> #include "fastq_pair_array.h" void fastq_uniq_usage() { fprintf(stderr, "-i : The input file list of paired FSATQ sequence files [FILE IN]\n"); fprintf(stderr, " Maximum 1000 pairs\n"); fprintf(stderr, "\n"); fprintf(stderr, " This parameter is used to specify a list of paired sequence files in\n"); fprintf(stderr, " FASTQ format as input, in which two adjacent files with reads in the\n"); fprintf(stderr, " same order belong to a pair.\n"); fprintf(stderr, "\n"); fprintf(stderr, "-t : Output sequence format [q/f/p]\n"); fprintf(stderr, " q : FASTQ format into TWO output files\n"); fprintf(stderr, " f : FASTA format into TWO output files\n"); fprintf(stderr, " p : FASTA format into ONE output file\n"); fprintf(stderr, " default = q\n"); fprintf(stderr, "\n"); fprintf(stderr, " This parameter is used to specify sequence format in output file(s).\n"); fprintf(stderr, " FastUniq could output read pairs into two files in either FASTQ [q]\n"); fprintf(stderr, " or FASTA [f] format, in which reads in the same order belonging to a\n"); fprintf(stderr, " pair. FastUniq could also output read pairs into a single file in\n"); fprintf(stderr, " FASTA format [p], in which adjacent reads belonging to a pair.\n"); fprintf(stderr, "\n"); fprintf(stderr, "-o : The first output file [FILE OUT]\n"); fprintf(stderr, "\n"); fprintf(stderr, "-p : The second output file [FILE OUT]\n"); fprintf(stderr, " Optional. ONLY required when output sequence format(-t) is specify as\n"); fprintf(stderr, " [q] or [f].\n"); fprintf(stderr, "\n"); fprintf(stderr, "-c : Types of sequence descriptions for output [0/1]\n"); fprintf(stderr, " 0 : The raw descriptions\n"); fprintf(stderr, " 1 : New serial numbers assigned by FastUniq\n"); fprintf(stderr, " default = 0\n"); fprintf(stderr, "\n"); return; } int main (int argc, const char * argv[]) { FILE *fp_in_list, *fp_in_left, *fp_in_right, *fp_out_left, *fp_out_right; char str_in_left[MAX_FILE_NUMBER][1000], str_in_right[MAX_FILE_NUMBER][1000]; char str_in_list[1000], str_out_left[1000], str_out_right[1000]; char s_left[1000], s_right[1000]; char output_format; int description_type; int flag_i=0, flag_o=0, flag_t=0, flag_p=0, flag_c=0; char ch; FASTQ_PAIR *fq_pair; FASTQ_PAIR_ARRAY *fq_pair_array, *temp_fq_pair_array; long i, seq_pair_count; if(argc==1) { fastq_uniq_usage(); return 1; } /* initializing */ for(i=0;i<MAX_FILE_NUMBER;i++) { str_in_left[i][0]='\0'; str_in_right[i][0]='\0'; } str_in_list[0]='\0'; str_out_left[0]='\0'; str_out_right[0]='\0'; output_format='\0'; /* obtain inputted arguments */ while((ch=getopt(argc, argv, "i:t:o:p:c:"))!=-1) { switch(ch) { case 'i': strcpy(str_in_list,optarg); if(strcmp(str_in_list,"")!=0) flag_i=1; else { fastq_uniq_usage(); return 1; } break; case 't': if(strlen(optarg)==1) { if(optarg[0]=='q') { output_format='q'; flag_t=1; break; } else if(optarg[0]=='f') { output_format='f'; flag_t=1; break; } else if(optarg[0]=='p') { output_format='p'; flag_t=1; break; } else { fastq_uniq_usage(); return 1; } } fastq_uniq_usage(); return 1; case 'o': strcpy(str_out_left,optarg); if(strcmp(str_out_left,"")!=0) flag_o=1; else { fastq_uniq_usage(); return 1; } break; case 'p': strcpy(str_out_right,optarg); if(strcmp(str_out_right,"")!=0) flag_p=1; else { fastq_uniq_usage(); return 1; } break; case 'c': if(strlen(optarg)==1) { if(optarg[0]=='0') { description_type=0; flag_c=1; break; } else if(optarg[0]=='1') { description_type=1; flag_c=1; break; } else { fastq_uniq_usage(); return 1; } } fastq_uniq_usage(); return 1; default: fastq_uniq_usage(); break; } } /* check inputted arguments */ if(flag_i==0) { fprintf(stderr, "Error in input the name of FASTQ file list!\n"); return 1; } if(flag_t==0) output_format='q'; if(flag_o==0 || (output_format!='p' && flag_p==0)) { fprintf(stderr, "Error in output sequence file name!\n"); return 1; } if(flag_c==0) description_type=0; /* get pair-end FASTQ file list */ if((fp_in_list=fopen(str_in_list, "r"))==NULL) { fprintf(stderr, "Error in open FASTQ file list %s for read!\n", str_in_list); return 1; } for(i=0; !feof(fp_in_list) && i<MAX_FILE_NUMBER;) { /* get the file store left FASTQ sequences */ s_left[0]='\0'; fgets(s_left, 1000, fp_in_list); if(s_left[0]=='\0') continue; else if(strlen(s_left)>=2 && s_left[strlen(s_left)-1]=='\n') s_left[strlen(s_left)-1]='\0'; else { fprintf(stderr, "Error in read from FASTQ file list!\n"); return 1; } /* get the file store right FASTQ sequences */ s_right[0]='\0'; fgets(s_right, 1000, fp_in_list); if(strlen(s_right)>=2) { if(s_right[strlen(s_right)-1]=='\n') s_right[strlen(s_right)-1]='\0'; } else { fprintf(stderr, "Error in read from FASTQ file list!\n"); return 1; } /* append the fiel name to list array */ strcpy(str_in_left[i], s_left); strcpy(str_in_right[i++], s_right); } fclose(fp_in_list); /* check the status of pair-end FASTQ files */ for(i=0;i<MAX_FILE_NUMBER;i++) { /* check whether list reached the end */ if(str_in_left[i][0]=='\0') break; /* check file status */ if((fp_in_left=fopen(str_in_left[i], "r"))==NULL) { fprintf(stderr, "Error in open left fastq file %s for read!\n", str_in_left[i]); return 1; } fclose(fp_in_left); if((fp_in_right=fopen(str_in_right[i], "r"))==NULL) { fprintf(stderr, "Error in open right fastq file %s for read!\n", str_in_right[i]); return 1; } fclose(fp_in_right); } /* read all pair-end FASTQ sequences into memory */ seq_pair_count=0; if((fq_pair_array=fastq_pair_array_create())==NULL) { fprintf(stderr, "Error in allocate enough memory!\n"); return 1; } if((temp_fq_pair_array=fastq_pair_array_create())==NULL) { fprintf(stderr, "Error in allocate enough memory!\n"); return 1; } for(i=0;i<MAX_FILE_NUMBER;i++) { /* check whether list reached the end */ if(str_in_left[i][0]=='\0') break; /* open inputted pair-end FASTQ file */ if((fp_in_left=fopen(str_in_left[i], "r"))==NULL) { fprintf(stderr, "Error in open left fastq file %s for read!\n", str_in_left[i]); return 1; } if((fp_in_right=fopen(str_in_right[i], "r"))==NULL) { fprintf(stderr, "Error in open right fastq file %s for read!\n", str_in_right[i]); return 1; } /* read sequences */ for(;!feof(fp_in_left) && !feof(fp_in_right);) { fq_pair=NULL; if((fq_pair=fastq_pair_create())==NULL) { fprintf(stderr, "Error in allocate enough memory!\n"); return 1; } if(output_format=='f' || output_format=='p') { /* NOT require quality */ if(fastq_pair_scanf(fq_pair, fp_in_left, fp_in_right, description_type==0?1:0, 0)!=0) { fastq_pair_remove(fq_pair); break; } } else { /* require quality */ if(fastq_pair_scanf(fq_pair, fp_in_left, fp_in_right, description_type==0?1:0, 1)!=0) { fastq_pair_remove(fq_pair); break; } } fastq_pair_array_append(fq_pair, fq_pair_array); fastq_pair_array_append(fq_pair, temp_fq_pair_array); seq_pair_count++; } if(!feof(fp_in_left) && !feof(fp_in_right)) { fprintf(stderr, "Error in Reading pair-end FASTQ sequence!\n"); return 1; } } /* create memory address index for each BLOCK in a FASTQ_PAIR_ARRAY */ fastq_pair_array_generate_index(fq_pair_array); fastq_pair_array_generate_index(temp_fq_pair_array); /* sort the pair-end FASTQ sequences */ fastq_pair_array_sort(fq_pair_array, temp_fq_pair_array, 1, seq_pair_count); /* open output fastq file */ if((fp_out_left=fopen(str_out_left, "w"))==NULL) { fprintf(stderr, "Error in open left fastq file %s for write!\n", str_out_left); return 1; } if(str_out_right[0]!='\0') { if((fp_out_right=fopen(str_out_right, "w"))==NULL) { fprintf(stderr, "Error in open right fastq file %s for write!\n", str_out_right); return 1; } } /* output the sequence in specific format */ if(output_format=='f') fastq_pair_array_printf(fq_pair_array, fp_out_left, fp_out_right, "fa", description_type, 1); else if(output_format=='p') fastq_pair_array_printf(fq_pair_array, fp_out_left, NULL, "fa", description_type, 1); else fastq_pair_array_printf(fq_pair_array, fp_out_left, fp_out_right, "fq", description_type, 1); /* close output files */ fclose(fp_out_left); if(str_out_right[0]!='\0') fclose(fp_out_right); // /* free memory */ // fastq_pair_array_remove(fq_pair_array); return 0; }
