Mercurial > repos > portiahollyoak > fastuniq
comparison source/fastq_uniq.c @ 0:816cb55b5a2d draft default tip
planemo upload for repository https://github.com/portiahollyoak/Tools commit c4769fd68ad9583d4b9dbdf212e4ecb5968cef1c-dirty
| author | portiahollyoak |
|---|---|
| date | Thu, 02 Jun 2016 11:34:51 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:816cb55b5a2d |
|---|---|
| 1 /* This program removes duplicates from paired FASTQ sequences, | |
| 2 * which commonly appear in mate pair libraries. | |
| 3 * | |
| 4 * This file and its partner were written by Haibin Xu, December 2011. | |
| 5 */ | |
| 6 | |
| 7 #ifndef MAX_FILE_NUMBER | |
| 8 #define MAX_FILE_NUMBER 1000 | |
| 9 #endif | |
| 10 | |
| 11 #include <unistd.h> | |
| 12 #include "fastq_pair_array.h" | |
| 13 | |
/*
 * Print the command-line help for FastUniq to stderr.
 *
 * The text is kept in a single table so that it is written verbatim (no
 * format-string processing) and is easy to proofread. Takes no arguments
 * and returns nothing; the caller decides the exit status.
 */
void fastq_uniq_usage(void)
{
    static const char *const usage_lines[] = {
        "-i : The input file list of paired FASTQ sequence files [FILE IN]\n",
        " Maximum 1000 pairs\n",
        "\n",
        " This parameter is used to specify a list of paired sequence files in\n",
        " FASTQ format as input, in which two adjacent files with reads in the\n",
        " same order belong to a pair.\n",
        "\n",
        "-t : Output sequence format [q/f/p]\n",
        " q : FASTQ format into TWO output files\n",
        " f : FASTA format into TWO output files\n",
        " p : FASTA format into ONE output file\n",
        " default = q\n",
        "\n",
        " This parameter is used to specify sequence format in output file(s).\n",
        " FastUniq could output read pairs into two files in either FASTQ [q]\n",
        " or FASTA [f] format, in which reads in the same order belonging to a\n",
        " pair. FastUniq could also output read pairs into a single file in\n",
        " FASTA format [p], in which adjacent reads belonging to a pair.\n",
        "\n",
        "-o : The first output file [FILE OUT]\n",
        "\n",
        "-p : The second output file [FILE OUT]\n",
        " Optional. ONLY required when output sequence format(-t) is specified as\n",
        " [q] or [f].\n",
        "\n",
        "-c : Types of sequence descriptions for output [0/1]\n",
        " 0 : The raw descriptions\n",
        " 1 : New serial numbers assigned by FastUniq\n",
        " default = 0\n",
        "\n"
    };
    size_t line;

    for(line = 0; line < sizeof usage_lines / sizeof usage_lines[0]; line++)
        fputs(usage_lines[line], stderr);
}
| 48 | |
| 49 int main (int argc, const char * argv[]) | |
| 50 { | |
| 51 FILE *fp_in_list, *fp_in_left, *fp_in_right, *fp_out_left, *fp_out_right; | |
| 52 char str_in_left[MAX_FILE_NUMBER][1000], str_in_right[MAX_FILE_NUMBER][1000]; | |
| 53 char str_in_list[1000], str_out_left[1000], str_out_right[1000]; | |
| 54 char s_left[1000], s_right[1000]; | |
| 55 char output_format; | |
| 56 int description_type; | |
| 57 int flag_i=0, flag_o=0, flag_t=0, flag_p=0, flag_c=0; | |
| 58 char ch; | |
| 59 FASTQ_PAIR *fq_pair; | |
| 60 FASTQ_PAIR_ARRAY *fq_pair_array, *temp_fq_pair_array; | |
| 61 long i, seq_pair_count; | |
| 62 | |
| 63 if(argc==1) | |
| 64 { | |
| 65 fastq_uniq_usage(); | |
| 66 return 1; | |
| 67 } | |
| 68 | |
| 69 /* initializing */ | |
| 70 for(i=0;i<MAX_FILE_NUMBER;i++) | |
| 71 { | |
| 72 str_in_left[i][0]='\0'; | |
| 73 str_in_right[i][0]='\0'; | |
| 74 } | |
| 75 str_in_list[0]='\0'; | |
| 76 str_out_left[0]='\0'; | |
| 77 str_out_right[0]='\0'; | |
| 78 output_format='\0'; | |
| 79 | |
| 80 /* obtain inputted arguments */ | |
| 81 while((ch=getopt(argc, argv, "i:t:o:p:c:"))!=-1) | |
| 82 { | |
| 83 switch(ch) | |
| 84 { | |
| 85 case 'i': | |
| 86 strcpy(str_in_list,optarg); | |
| 87 if(strcmp(str_in_list,"")!=0) | |
| 88 flag_i=1; | |
| 89 else | |
| 90 { | |
| 91 fastq_uniq_usage(); | |
| 92 return 1; | |
| 93 } | |
| 94 break; | |
| 95 case 't': | |
| 96 if(strlen(optarg)==1) | |
| 97 { | |
| 98 if(optarg[0]=='q') | |
| 99 { | |
| 100 output_format='q'; | |
| 101 flag_t=1; | |
| 102 break; | |
| 103 } | |
| 104 else if(optarg[0]=='f') | |
| 105 { | |
| 106 output_format='f'; | |
| 107 flag_t=1; | |
| 108 break; | |
| 109 } | |
| 110 else if(optarg[0]=='p') | |
| 111 { | |
| 112 output_format='p'; | |
| 113 flag_t=1; | |
| 114 break; | |
| 115 } | |
| 116 else | |
| 117 { | |
| 118 fastq_uniq_usage(); | |
| 119 return 1; | |
| 120 } | |
| 121 } | |
| 122 fastq_uniq_usage(); | |
| 123 return 1; | |
| 124 case 'o': | |
| 125 strcpy(str_out_left,optarg); | |
| 126 if(strcmp(str_out_left,"")!=0) | |
| 127 flag_o=1; | |
| 128 else | |
| 129 { | |
| 130 fastq_uniq_usage(); | |
| 131 return 1; | |
| 132 } | |
| 133 break; | |
| 134 case 'p': | |
| 135 strcpy(str_out_right,optarg); | |
| 136 if(strcmp(str_out_right,"")!=0) | |
| 137 flag_p=1; | |
| 138 else | |
| 139 { | |
| 140 fastq_uniq_usage(); | |
| 141 return 1; | |
| 142 } | |
| 143 break; | |
| 144 case 'c': | |
| 145 if(strlen(optarg)==1) | |
| 146 { | |
| 147 if(optarg[0]=='0') | |
| 148 { | |
| 149 description_type=0; | |
| 150 flag_c=1; | |
| 151 break; | |
| 152 } | |
| 153 else if(optarg[0]=='1') | |
| 154 { | |
| 155 description_type=1; | |
| 156 flag_c=1; | |
| 157 break; | |
| 158 } | |
| 159 else | |
| 160 { | |
| 161 fastq_uniq_usage(); | |
| 162 return 1; | |
| 163 } | |
| 164 } | |
| 165 fastq_uniq_usage(); | |
| 166 return 1; | |
| 167 default: | |
| 168 fastq_uniq_usage(); | |
| 169 break; | |
| 170 } | |
| 171 } | |
| 172 | |
| 173 /* check inputted arguments */ | |
| 174 if(flag_i==0) | |
| 175 { | |
| 176 fprintf(stderr, "Error in input the name of FASTQ file list!\n"); | |
| 177 return 1; | |
| 178 } | |
| 179 if(flag_t==0) | |
| 180 output_format='q'; | |
| 181 if(flag_o==0 || (output_format!='p' && flag_p==0)) | |
| 182 { | |
| 183 fprintf(stderr, "Error in output sequence file name!\n"); | |
| 184 return 1; | |
| 185 } | |
| 186 if(flag_c==0) | |
| 187 description_type=0; | |
| 188 | |
| 189 /* get pair-end FASTQ file list */ | |
| 190 if((fp_in_list=fopen(str_in_list, "r"))==NULL) | |
| 191 { | |
| 192 fprintf(stderr, "Error in open FASTQ file list %s for read!\n", | |
| 193 str_in_list); | |
| 194 return 1; | |
| 195 } | |
| 196 for(i=0; !feof(fp_in_list) && i<MAX_FILE_NUMBER;) | |
| 197 { | |
| 198 /* get the file store left FASTQ sequences */ | |
| 199 s_left[0]='\0'; | |
| 200 fgets(s_left, 1000, fp_in_list); | |
| 201 if(s_left[0]=='\0') | |
| 202 continue; | |
| 203 else if(strlen(s_left)>=2 && s_left[strlen(s_left)-1]=='\n') | |
| 204 s_left[strlen(s_left)-1]='\0'; | |
| 205 else | |
| 206 { | |
| 207 fprintf(stderr, "Error in read from FASTQ file list!\n"); | |
| 208 return 1; | |
| 209 } | |
| 210 | |
| 211 /* get the file store right FASTQ sequences */ | |
| 212 s_right[0]='\0'; | |
| 213 fgets(s_right, 1000, fp_in_list); | |
| 214 if(strlen(s_right)>=2) | |
| 215 { | |
| 216 if(s_right[strlen(s_right)-1]=='\n') | |
| 217 s_right[strlen(s_right)-1]='\0'; | |
| 218 } | |
| 219 else | |
| 220 { | |
| 221 fprintf(stderr, "Error in read from FASTQ file list!\n"); | |
| 222 return 1; | |
| 223 } | |
| 224 | |
| 225 /* append the fiel name to list array */ | |
| 226 strcpy(str_in_left[i], s_left); | |
| 227 strcpy(str_in_right[i++], s_right); | |
| 228 } | |
| 229 fclose(fp_in_list); | |
| 230 | |
| 231 /* check the status of pair-end FASTQ files */ | |
| 232 for(i=0;i<MAX_FILE_NUMBER;i++) | |
| 233 { | |
| 234 /* check whether list reached the end */ | |
| 235 if(str_in_left[i][0]=='\0') | |
| 236 break; | |
| 237 | |
| 238 /* check file status */ | |
| 239 if((fp_in_left=fopen(str_in_left[i], "r"))==NULL) | |
| 240 { | |
| 241 fprintf(stderr, "Error in open left fastq file %s for read!\n", | |
| 242 str_in_left[i]); | |
| 243 return 1; | |
| 244 } | |
| 245 fclose(fp_in_left); | |
| 246 | |
| 247 if((fp_in_right=fopen(str_in_right[i], "r"))==NULL) | |
| 248 { | |
| 249 fprintf(stderr, "Error in open right fastq file %s for read!\n", | |
| 250 str_in_right[i]); | |
| 251 return 1; | |
| 252 } | |
| 253 fclose(fp_in_right); | |
| 254 } | |
| 255 | |
| 256 | |
| 257 /* read all pair-end FASTQ sequences into memory */ | |
| 258 seq_pair_count=0; | |
| 259 if((fq_pair_array=fastq_pair_array_create())==NULL) | |
| 260 { | |
| 261 fprintf(stderr, "Error in allocate enough memory!\n"); | |
| 262 return 1; | |
| 263 } | |
| 264 if((temp_fq_pair_array=fastq_pair_array_create())==NULL) | |
| 265 { | |
| 266 fprintf(stderr, "Error in allocate enough memory!\n"); | |
| 267 return 1; | |
| 268 } | |
| 269 for(i=0;i<MAX_FILE_NUMBER;i++) | |
| 270 { | |
| 271 /* check whether list reached the end */ | |
| 272 if(str_in_left[i][0]=='\0') | |
| 273 break; | |
| 274 | |
| 275 /* open inputted pair-end FASTQ file */ | |
| 276 if((fp_in_left=fopen(str_in_left[i], "r"))==NULL) | |
| 277 { | |
| 278 fprintf(stderr, "Error in open left fastq file %s for read!\n", | |
| 279 str_in_left[i]); | |
| 280 return 1; | |
| 281 } | |
| 282 if((fp_in_right=fopen(str_in_right[i], "r"))==NULL) | |
| 283 { | |
| 284 fprintf(stderr, "Error in open right fastq file %s for read!\n", | |
| 285 str_in_right[i]); | |
| 286 return 1; | |
| 287 } | |
| 288 | |
| 289 /* read sequences */ | |
| 290 for(;!feof(fp_in_left) && !feof(fp_in_right);) | |
| 291 { | |
| 292 fq_pair=NULL; | |
| 293 if((fq_pair=fastq_pair_create())==NULL) | |
| 294 { | |
| 295 fprintf(stderr, "Error in allocate enough memory!\n"); | |
| 296 return 1; | |
| 297 } | |
| 298 | |
| 299 if(output_format=='f' || output_format=='p') | |
| 300 { | |
| 301 /* NOT require quality */ | |
| 302 if(fastq_pair_scanf(fq_pair, fp_in_left, fp_in_right, description_type==0?1:0, 0)!=0) | |
| 303 { | |
| 304 fastq_pair_remove(fq_pair); | |
| 305 break; | |
| 306 } | |
| 307 } | |
| 308 else | |
| 309 { | |
| 310 /* require quality */ | |
| 311 if(fastq_pair_scanf(fq_pair, fp_in_left, fp_in_right, description_type==0?1:0, 1)!=0) | |
| 312 { | |
| 313 fastq_pair_remove(fq_pair); | |
| 314 break; | |
| 315 } | |
| 316 } | |
| 317 | |
| 318 fastq_pair_array_append(fq_pair, fq_pair_array); | |
| 319 fastq_pair_array_append(fq_pair, temp_fq_pair_array); | |
| 320 seq_pair_count++; | |
| 321 } | |
| 322 | |
| 323 if(!feof(fp_in_left) && !feof(fp_in_right)) | |
| 324 { | |
| 325 fprintf(stderr, "Error in Reading pair-end FASTQ sequence!\n"); | |
| 326 return 1; | |
| 327 } | |
| 328 } | |
| 329 | |
| 330 /* create memory address index for each BLOCK in a FASTQ_PAIR_ARRAY */ | |
| 331 fastq_pair_array_generate_index(fq_pair_array); | |
| 332 fastq_pair_array_generate_index(temp_fq_pair_array); | |
| 333 | |
| 334 /* sort the pair-end FASTQ sequences */ | |
| 335 fastq_pair_array_sort(fq_pair_array, temp_fq_pair_array, 1, seq_pair_count); | |
| 336 | |
| 337 /* open output fastq file */ | |
| 338 if((fp_out_left=fopen(str_out_left, "w"))==NULL) | |
| 339 { | |
| 340 fprintf(stderr, "Error in open left fastq file %s for write!\n", | |
| 341 str_out_left); | |
| 342 return 1; | |
| 343 } | |
| 344 | |
| 345 if(str_out_right[0]!='\0') | |
| 346 { | |
| 347 if((fp_out_right=fopen(str_out_right, "w"))==NULL) | |
| 348 { | |
| 349 fprintf(stderr, "Error in open right fastq file %s for write!\n", | |
| 350 str_out_right); | |
| 351 return 1; | |
| 352 } | |
| 353 } | |
| 354 | |
| 355 /* output the sequence in specific format */ | |
| 356 if(output_format=='f') | |
| 357 fastq_pair_array_printf(fq_pair_array, fp_out_left, fp_out_right, "fa", description_type, 1); | |
| 358 else if(output_format=='p') | |
| 359 fastq_pair_array_printf(fq_pair_array, fp_out_left, NULL, "fa", description_type, 1); | |
| 360 else | |
| 361 fastq_pair_array_printf(fq_pair_array, fp_out_left, fp_out_right, "fq", description_type, 1); | |
| 362 | |
| 363 /* close output files */ | |
| 364 fclose(fp_out_left); | |
| 365 if(str_out_right[0]!='\0') | |
| 366 fclose(fp_out_right); | |
| 367 | |
| 368 // /* free memory */ | |
| 369 // fastq_pair_array_remove(fq_pair_array); | |
| 370 | |
| 371 return 0; | |
| 372 } | |
| 373 |
