Mercurial > repos > devteam > fastq_stats

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastq_stats.py	Mon Jan 27 09:29:03 2014 -0500
@@ -0,0 +1,48 @@
+#Dan Blankenberg
+import sys
+from galaxy_utils.sequence.fastq import fastqReader, fastqAggregator
+
+VALID_NUCLEOTIDES = [ 'A', 'C', 'G', 'T', 'N' ]
+VALID_COLOR_SPACE = map( str, range( 7 ) ) + [ '.' ]
+SUMMARY_STAT_ORDER = ['read_count', 'min_score', 'max_score', 'sum_score', 'mean_score', 'q1', 'med_score', 'q3', 'iqr', 'left_whisker', 'right_whisker' ]
+
+def main():
+    input_filename = sys.argv[1]
+    output_filename = sys.argv[2]
+    input_type = sys.argv[3] or 'sanger'
+
+    aggregator = fastqAggregator()
+    num_reads = None
+    fastq_read = None
+    for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ):
+        aggregator.consume_read( fastq_read )
+    out = open( output_filename, 'wb' )
+    valid_nucleotides = VALID_NUCLEOTIDES
+    if fastq_read:
+        if fastq_read.sequence_space == 'base':
+            out.write( '#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\tA_Count\tC_Count\tG_Count\tT_Count\tN_Count\tother_bases\tother_base_count\n' )
+        else:
+            out.write( '#column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW\toutliers\t0_Count\t1_Count\t2_Count\t3_Count\t4_Count\t5_Count\t6_Count\t._Count\tother_bases\tother_base_count\n' )
+            valid_nucleotides = VALID_COLOR_SPACE
+    for i in range( aggregator.get_max_read_length() ):
+        column_stats = aggregator.get_summary_statistics_for_column( i )
+        out.write( '%i\t' % ( i + 1 ) )
+        out.write( '%s\t' * len( SUMMARY_STAT_ORDER ) % tuple( [ column_stats[ key ] for key in SUMMARY_STAT_ORDER ] ) )
+        out.write( '%s\t' % ','.join( map( str, column_stats['outliers'] ) ) )
+        base_counts = aggregator.get_base_counts_for_column( i )
+        for nuc in valid_nucleotides:
+            out.write( "%s\t" % base_counts.get( nuc, 0 ) )
+        extra_nucs = sorted( [ nuc for nuc in base_counts.keys() if nuc not in valid_nucleotides ] )
+        out.write( "%s\t%s\n" % ( ','.join( extra_nucs ), ','.join( str( base_counts[nuc] ) for nuc in extra_nucs ) ) )
+    out.close()
+    if num_reads is None:
+        print "No valid fastq reads could be processed."
+    else:
+        print "%i fastq reads were processed." % ( num_reads + 1 )
+        print "Based upon quality values and sequence characters, the input data is valid for: %s" % ( ", ".join( aggregator.get_valid_formats() ) or "None" )
+        ascii_range = aggregator.get_ascii_range()
+        decimal_range =  aggregator.get_decimal_range()
+        print "Input ASCII range: %s(%i) - %s(%i)" % ( repr( ascii_range[0] ), ord( ascii_range[0] ), repr( ascii_range[1] ), ord( ascii_range[1] ) ) #print using repr, since \x00 (null) causes info truncation in galaxy when printed
+        print "Input decimal range: %i - %i" % ( decimal_range[0], decimal_range[1] )
+
+if __name__ == "__main__": main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastq_stats.xml	Mon Jan 27 09:29:03 2014 -0500
@@ -0,0 +1,74 @@
+<tool id="fastq_stats" name="FASTQ Summary Statistics" version="1.0.0">
+  <description>by column</description>
+  <requirements>
+    <requirement type="package" version="1.0.0">galaxy_sequence_utils</requirement>
+  </requirements>
+  <command interpreter="python">fastq_stats.py '$input_file' '$output_file' '${input_file.extension[len( 'fastq' ):]}'</command>
+  <inputs>
+    <param name="input_file" type="data" format="fastqsanger,fastqillumina,fastqsolexa,fastqcssanger" label="FASTQ File"/>
+  </inputs>
+  <outputs>
+    <data name="output_file" format="tabular" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="input_file" value="fastq_stats1.fastq" ftype="fastqsanger" />
+      <output name="output_file" file="fastq_stats_1_out.tabular" />
+    </test>
+  </tests>
+  <help>
+This tool creates summary statistics on a FASTQ file.
+
+.. class:: infomark
+
+**TIP:** This statistics report can be used as input for the **Boxplot** tools.
+
+-----
+
+**The output file will contain the following fields:**
+
+* column      = column number (1 to 36 for a 36-cycles read Solexa file)
+* count       = number of bases found in this column.
+* min         = Lowest quality score value found in this column.
+* max         = Highest quality score value found in this column.
+* sum         = Sum of quality score values for this column.
+* mean        = Mean quality score value for this column.
+* Q1          = 1st quartile quality score.
+* med         = Median quality score.
+* Q3          = 3rd quartile quality score.
+* IQR         = Inter-Quartile range (Q3-Q1).
+* lW          = 'Left-Whisker' value (for boxplotting).
+* rW          = 'Right-Whisker' value (for boxplotting).
+* outliers    = Scores falling beyond the left and right whiskers (comma separated list).
+* A_Count     = Count of 'A' nucleotides found in this column.
+* C_Count     = Count of 'C' nucleotides found in this column.
+* G_Count     = Count of 'G' nucleotides found in this column.
+* T_Count     = Count of 'T' nucleotides found in this column.
+* N_Count     = Count of 'N' nucleotides found in this column.
+* Other_Nucs  = Comma separated list of other nucleotides found in this column.
+* Other_Count = Comma separated count of other nucleotides found in this column.
+
+For example::
+
+  #column   count   min max sum mean    Q1  med Q3  IQR lW  rW  outliers    A_Count C_Count G_Count T_Count N_Count other_bases other_base_count
+  1   14336356    2   33  450600675   31.4306281875   32.0    33.0    33.0    1.0 31  33  2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30    4482314 2199633 4425957 3208745 19707
+  2   14336356    2   34  441135033   30.7703737965   30.0    33.0    33.0    3.0 26  34  2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25   4419184 2170537 4627987 3118567 81
+  3   14336356    2   34  433659182   30.2489127642   29.0    32.0    33.0    4.0 23  34  2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22    4310988 2941988 3437467 3645784 129
+  4   14336356    2   34  433635331   30.2472490917   29.0    32.0    33.0    4.0 23  34  2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22    4110637 3007028 3671749 3546839 103
+  5   14336356    2   34  432498583   30.167957813    29.0    32.0    33.0    4.0 23  34  2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22    4348275 2935903 3293025 3759029 124
+
+-----
+
+.. class:: warningmark
+
+Adapter bases in color space reads are excluded from statistics.
+
+------
+
+**Citation**
+
+If you use this tool, please cite `Blankenberg D, Gordon A, Von Kuster G, Coraor N, Taylor J, Nekrutenko A; Galaxy Team. Manipulation of FASTQ data with Galaxy. Bioinformatics. 2010 Jul 15;26(14):1783-5. &lt;http://www.ncbi.nlm.nih.gov/pubmed/20562416&gt;`_
+
+
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fastq_stats1.fastq	Mon Jan 27 09:29:03 2014 -0500
@@ -0,0 +1,36 @@
+@CSHL_3_FC042AGLLWW:1:2:7:203
+GTACGCATGACCGAACCCCCCNCCCCCCAATTGGTT
++CSHL_3_FC042AGLLWW:1:2:7:203
+BBC?7?B6>ABB?B;BBBCC9&;BCBBBBBBBB>>A
+@CSHL_3_FC042AGLLWW:1:2:7:33
+CAATGCCTCCAATTGGTTAATCCCCCTATATATACT
++CSHL_3_FC042AGLLWW:1:2:7:33
+8BBB?B;BB8?6@9B8BB=8.&1?,&;931&&&(BB
+@CSHL_3_FC042AGLLWW:1:2:7:169
+GCAGCAGGCGCGTCAGAGAGCCCCCCCCCCCCCCCC
++CSHL_3_FC042AGLLWW:1:2:7:169
+B@.?B=6BBB@.@BBBBBBBBBBBBBBB7=;6(663
+@CSHL_3_FC042AGLLWW:1:2:7:1436
+AATTATTTATTAAATTTTAATAATATGGGAGACACT
++CSHL_3_FC042AGLLWW:1:2:7:1436
+B?BBBBBBBBBBBBBBB@6ABBBBB@4@BBBBB77<
+@CSHL_3_FC042AGLLWW:1:2:7:292
+GGAGAAATACACACAATTGGTTAATCCCCCTATATA
++CSHL_3_FC042AGLLWW:1:2:7:292
+CBCBBBBBBB6.BBBBBBBBBBB=9&66&1@>6&3&
+@CSHL_3_FC042AGLLWW:1:2:7:1819
+AATTCAAACCACCCCAACCCACACACAGAGATACAA
++CSHL_3_FC042AGLLWW:1:2:7:1819
+B==2777-BB-0&96866&,66-&.6&66,6-*2.6
+@CSHL_3_FC042AGLLWW:1:2:7:1875
+GCAAAAGAGTAGTGTACCCCCCCCCCCCCCCCCCCC
++CSHL_3_FC042AGLLWW:1:2:7:1875
+BBBBBBBBB9699&9BBBBBA@;BBBBBBBBB9&96
+@CSHL_3_FC042AGLLWW:1:2:8:624
+ACTGCAATTGGTTAATCCCCCTATATAGCGCTGTGG
++CSHL_3_FC042AGLLWW:1:2:8:624
+BB<4?A9ABB@>>009.6?@<.6@BBBBBBBBBBBB
+@CSHL_3_FC042AGLLWW:1:2:8:250
+TGCCGCGCACACTGATGCAATTGGTTAATCCCCCTA
++CSHL_3_FC042AGLLWW:1:2:8:250
+BBBBBBBB?BBBBBBCCC<,91&6<39;?+6,3,9&
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fastq_stats_1_out.tabular	Mon Jan 27 09:29:03 2014 -0500
@@ -0,0 +1,37 @@
+#column	count	min	max	sum	mean	Q1	med	Q3	IQR	lW	rW	outliers	A_Count	C_Count	G_Count	T_Count	N_Count	other_bases	other_base_count
+1	9	23	34	288	32.0	33.0	33.0	33.0	0.0	33	33	23,34	3	1	4	1	0
+2	9	28	33	287	31.8888888889	30.5	33.0	33.0	2.5	28	33		3	3	2	1	0
+3	9	13	34	268	29.7777777778	27.5	33.0	33.5	6.0	27	34	13	5	1	0	3	0
+4	9	17	33	261	29.0	24.5	33.0	33.0	8.5	17	33		1	2	3	3	0
+5	9	22	33	269	29.8888888889	26.0	33.0	33.0	7.0	22	33		3	3	3	0	0
+6	9	22	33	277	30.7777777778	29.0	33.0	33.0	4.0	28	33	22	5	3	0	1	0
+7	9	21	33	258	28.6666666667	23.0	33.0	33.0	10.0	21	33		4	1	3	1	0
+8	9	12	33	263	29.2222222222	26.5	33.0	33.0	6.5	21	33	12	2	1	1	5	0
+9	9	29	33	290	32.2222222222	31.5	33.0	33.0	1.5	30	33	29	3	3	2	1	0
+10	9	23	33	277	30.7777777778	28.0	33.0	33.0	5.0	23	33		1	4	2	2	0
+11	9	12	33	245	27.2222222222	21.0	31.0	33.0	12.0	12	33		5	2	1	1	0
+12	9	13	33	214	23.7777777778	14.0	24.0	33.0	19.0	13	33		2	4	2	1	0
+13	9	5	33	249	27.6666666667	26.5	31.0	33.0	6.5	24	33	5	2	1	1	5	0
+14	9	5	33	233	25.8888888889	19.5	33.0	33.0	13.5	5	33		3	3	2	1	0
+15	9	15	33	251	27.8888888889	22.5	33.0	33.0	10.5	15	33		5	1	1	2	0
+16	9	23	34	269	29.8888888889	23.5	33.0	33.0	9.5	23	34		3	1	2	3	0
+17	9	13	34	266	29.5555555556	27.0	33.0	33.0	6.0	21	34	13	2	3	1	3	0
+18	9	21	34	272	30.2222222222	26.0	33.0	33.0	7.0	21	34		0	5	1	3	0
+19	9	5	34	244	27.1111111111	24.0	30.0	33.0	9.0	21	34	5	4	4	1	0	0
+20	9	11	34	241	26.7777777778	17.0	32.0	33.0	16.0	11	34		3	4	2	0	0
+21	9	13	33	240	26.6666666667	22.5	27.0	33.0	10.5	13	33		1	4	0	4	0
+22	9	5	33	190	21.1111111111	9.0	21.0	33.0	24.0	5	33		1	4	0	3	1
+23	9	5	33	205	22.7777777778	14.0	26.0	33.0	19.0	5	33		4	4	1	0	0
+24	9	5	33	247	27.4444444444	24.5	31.0	33.0	8.5	21	33	5	1	5	1	2	0
+25	9	11	34	241	26.7777777778	18.5	33.0	33.0	14.5	11	34		3	4	0	2	0
+26	9	5	33	212	23.5555555556	11.5	31.0	33.0	21.5	5	33		0	6	0	3	0
+27	9	5	33	227	25.2222222222	20.0	26.0	33.0	13.0	5	33		3	4	1	1	0
+28	9	21	33	255	28.3333333333	22.5	31.0	33.0	10.5	21	33		2	4	3	0	0
+29	9	5	33	228	25.3333333333	19.5	30.0	33.0	13.5	5	33		2	4	1	2	0
+30	9	10	33	213	23.6666666667	13.5	28.0	33.0	19.5	10	33		3	4	2	0	0
+31	9	5	33	236	26.2222222222	21.0	31.0	33.0	12.0	5	33		1	4	1	3	0
+32	9	5	33	210	23.3333333333	11.5	29.0	33.0	21.5	5	33		3	3	0	3	0
+33	9	5	33	183	20.3333333333	8.0	21.0	33.0	25.0	5	33		1	4	2	2	0
+34	9	5	33	150	16.6666666667	6.0	17.0	25.5	19.5	5	33		3	4	1	1	0
+35	9	13	33	217	24.1111111111	19.5	24.0	31.0	11.5	13	33		1	4	1	3	0
+36	9	5	33	195	21.6666666667	11.5	21.0	32.5	21.0	5	33		3	2	1	3	0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Mon Jan 27 09:29:03 2014 -0500
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+  <package name="galaxy_sequence_utils" version="1.0.0">
+      <repository changeset_revision="0643676ad5f7" name="package_galaxy_utils_1_0" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>