# HG changeset patch # User czlab # Date 1526607070 14400 # Node ID 14e7247c1fa01845294a48c39b9451b137cc38ec Uploaded diff -r 000000000000 -r 14e7247c1fa0 fastqFilter.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastqFilter.xml Thu May 17 21:31:10 2018 -0400 @@ -0,0 +1,101 @@ + + + + fastq_filter.pl -v + #if $sampleIndex.filterBySampleIndex == "yes": + -index $sampleIndex.sequence + #end if + -maxN $maxN -if sanger -f $filterString -of $outputFormat $inputfile $outputfile + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +**What this tool does** + +This tool extracts reads passing quality filters. + +It takes as input Sanger FASTQ files and outputs FASTQ/A files of filtered reads. + +----- + +**FASTQ format** + +Check that the quality scores in the FASTQ file are encoded in the right format. + +Reference https://en.wikipedia.org/wiki/FASTQ_format#Quality : + +* Sanger format can encode a Phred quality score from 0 to 93 using ASCII 33 to 126. +* Solexa/Illumina 1.0 format can encode a Solexa/Illumina quality score from -5 to 62 using ASCII 59 to 126. + +See http://www.asciitable.com/ for ASCII table. + +----- + +**Filter by sample index (optional)** + +For users who would like to start from a FASTQ file consisting of multiple libraries. + +For example: + +If you have six samples with indexes GTCA, GCAT, ACTG, AGCT, GCAT, TCGA, you can extract reads for each library with indicated index sequences (e.g. GTCA, etc.) starting from position 0 in the read. For example, you could specify 0:GTCA, etc. + +----- + +**How to set the filter** + +You can apply multiple filtering criteria based on the quality scores for each read. They are separated by commas. + +Each criterion is composed of four components (e.g. method1:start1-end1:score1,method2:start2-end2:score2) + +1. Method: min or mean, which means requirement on minimal or mean score of a region +2. Start: the first nucleotide to consider (0-based) +3. 
End: the last nucleotide to consider (0-based) +4. Score: the threshold required + +**Parameter suggestion** + +For example: + +* For Standard CLIP protocol filtering: mean:0-29:20 (this specifies a mean score of 20 or above in the first 30 bases, which includes 5 positions with sample indexes and the random barcode, followed by 25 positions with the actual CLIP tag). +* For iCLIP/BrdU CLIP filtering: mean:0-38:20 (this specifies a mean score of 20 or above in the first 39 bases, which includes 14 positions with sample indexes and the random barcode, followed by 25 positions with the actual CLIP tag). + +The reason for filtering this way is that low-quality reads can introduce mapping errors and background. They will inflate the number of unique tags after removal of PCR duplicates. + + + + +