# HG changeset patch # User peterjc # Date 1431529738 14400 # Node ID 8ff0ac66f1a30e477c1a12d5926038882db8a253 # Parent 1d773da0ccf0467c38bb662b0304ac1ff787498b v0.0.4; Report FASTQ counts; misc internal changes diff -r 1d773da0ccf0 -r 8ff0ac66f1a3 tools/seq_filter_by_mapping/README.rst --- a/tools/seq_filter_by_mapping/README.rst Tue Jan 27 08:31:13 2015 -0500 +++ b/tools/seq_filter_by_mapping/README.rst Wed May 13 11:08:58 2015 -0400 @@ -1,7 +1,7 @@ Galaxy tool to filter FASTA, FASTQ or SFF sequences by SAM/BAM mapping ====================================================================== -This tool is copyright 2014 by Peter Cock, The James Hutton Institute +This tool is copyright 2014-2015 by Peter Cock, The James Hutton Institute (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. See the licence text below. @@ -62,6 +62,10 @@ ------- ---------------------------------------------------------------------- v0.0.1 - Initial version. v0.0.2 - Fixed some error messages. +v0.0.3 - Report counts for FASTQ as done for FASTA and SFF files. +v0.0.4 - Use the ``format_source=...`` tag. + - Reorder XML elements (internal change only). + - Planemo for Tool Shed upload (``.shed.yml``, internal change only). ======= ====================================================================== @@ -74,22 +78,31 @@ Much of the code was copied from my older tool: https://github.com/peterjc/pico_galaxy/tree/master/tools/seq_filter_by_id -For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use -the following command from the Galaxy root folder:: +For pushing a release to the test or main "Galaxy Tool Shed", use the following +Planemo commands (which requires you have set your Tool Shed access details in +``~/.planemo.yml`` and that you have access rights on the Tool Shed):: + + $ planemo shed_upload --shed_target testtoolshed --check_diff ~/repositories/pico_galaxy/tools/seq_filter_by_mapping/ + ... + +or:: - $ tar -czf seq_filter_by_mapping.tar.gz tools/seq_filter_by_mapping/README.rst tools/seq_filter_by_mapping/seq_filter_by_mapping.py tools/seq_filter_by_mapping/seq_filter_by_mapping.xml tools/seq_filter_by_mapping/tool_dependencies.xml test-data/SRR639755_mito_pairs.fastq.gz test-data/SRR639755_sample_by_coord.sam test-data/SRR639755_sample_strict.fastq test-data/SRR639755_sample_lax.fastq + $ planemo shed_upload --shed_target toolshed --check_diff ~/repositories/pico_galaxy/tools/seq_filter_by_mapping/ + ... + +To just build and check the tar ball, use:: -Check this worked:: - - $ tar -tzf seq_filter_by_mapping.tar.gz + $ planemo shed_upload --tar_only ~/repositories/pico_galaxy/tools/seq_filter_by_mapping/ + ... + $ tar -tzf shed_upload.tar.gz + test-data/SRR639755_mito_pairs.fastq.gz + test-data/SRR639755_sample_by_coord.sam + test-data/SRR639755_sample_lax.fastq + test-data/SRR639755_sample_strict.fastq tools/seq_filter_by_mapping/README.rst tools/seq_filter_by_mapping/seq_filter_by_mapping.py tools/seq_filter_by_mapping/seq_filter_by_mapping.xml tools/seq_filter_by_mapping/tool_dependencies.xml - test-data/SRR639755_mito_pairs.fastq.gz - test-data/SRR639755_sample_by_coord.sam - test-data/SRR639755_sample_strict.fastq - test-data/SRR639755_sample_lax.fastq Licence (MIT) diff -r 1d773da0ccf0 -r 8ff0ac66f1a3 tools/seq_filter_by_mapping/seq_filter_by_mapping.py --- a/tools/seq_filter_by_mapping/seq_filter_by_mapping.py Tue Jan 27 08:31:13 2015 -0500 +++ b/tools/seq_filter_by_mapping/seq_filter_by_mapping.py Wed May 13 11:08:58 2015 -0400 @@ -64,7 +64,7 @@ options, args = parser.parse_args() if options.version: - print "v0.0.2" + print "v0.0.3" sys.exit(0) in_file = options.input @@ -282,6 +282,7 @@ def fastq_filter(in_file, pos_file, neg_file, wanted): """FASTQ filter.""" from Bio.SeqIO.QualityIO import FastqGeneralIterator + pos_count = neg_count = 0 handle = open(in_file, "r") if out_positive_file is not None and out_negative_file is not None: print "Generating two FASTQ files" @@ -292,8 +293,10 @@ # print("%s --> %s" % (title, clean_name(title.split(None, 1)[0]))) if clean_name(title.split(None, 1)[0]) in ids: positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) + pos_count += 1 else: negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) + neg_count += 1 positive_handle.close() negative_handle.close() elif out_positive_file is not None: @@ -302,16 +305,23 @@ for title, seq, qual in FastqGeneralIterator(handle): if clean_name(title.split(None, 1)[0]) in ids: positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) + pos_count += 1 + else: + neg_count += 1 positive_handle.close() elif out_negative_file is not None: print "Generating non-matching FASTQ file" negative_handle = open(out_negative_file, "w") for title, seq, qual in FastqGeneralIterator(handle): - if clean_name(title.split(None, 1)[0]) not in ids: + if clean_name(title.split(None, 1)[0]) in ids: + pos_count += 1 + else: negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) + neg_count += 1 negative_handle.close() handle.close() - # This does not currently bother to record record counts (faster) + return pos_count, neg_count + def sff_filter(in_file, pos_file, neg_file, wanted): """SFF filter.""" @@ -353,18 +363,15 @@ if seq_format.lower()=="sff": - # Now write filtered SFF file based on IDs wanted - pos_count, neg_count = sff_filter(in_file, out_positive_file, out_negative_file, ids) - # At the time of writing, Galaxy doesn't show SFF file read counts, - # so it is useful to put them in stdout and thus shown in job info. - print "%i with and %i without specified IDs" % (pos_count, neg_count) + sequence_filter = sff_filter elif seq_format.lower()=="fasta": - # Write filtered FASTA file based on IDs from tabular file - pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids) - print "%i with and %i without specified IDs" % (pos_count, neg_count) + sequence_filter = fasta_filter elif seq_format.lower().startswith("fastq"): - #Write filtered FASTQ file based on IDs from mapping file - fastq_filter(in_file, out_positive_file, out_negative_file, ids) - # This does not currently track the counts + sequence_filter = fastq_filter else: sys_exit("Unsupported file type %r" % seq_format) + +pos_count, neg_count = sequence_filter(in_file, out_positive_file, out_negative_file, ids) +print("%i mapped and %i unmapped reads." % (pos_count, neg_count)) +fraction = float(pos_count) * 100.0 / float(pos_count + neg_count) +print("In total %i reads, of which %0.1f%% mapped." % (pos_count + neg_count, fraction)) diff -r 1d773da0ccf0 -r 8ff0ac66f1a3 tools/seq_filter_by_mapping/seq_filter_by_mapping.xml --- a/tools/seq_filter_by_mapping/seq_filter_by_mapping.xml Tue Jan 27 08:31:13 2015 -0500 +++ b/tools/seq_filter_by_mapping/seq_filter_by_mapping.xml Wed May 13 11:08:58 2015 -0400 @@ -1,4 +1,4 @@ - + from SAM/BAM file biopython @@ -6,6 +6,11 @@ samtools samtools + + + + + seq_filter_by_mapping.py --version seq_filter_by_mapping.py -i "$input_file" -f "$input_file.ext" -m $pair_mode @@ -19,11 +24,6 @@ ## Now loop over all the mapping files #for i in $mapping_file#${i} #end for# - - - - - @@ -47,10 +47,10 @@ - + output_choice_cond["output_choice"] != "neg" - + output_choice_cond["output_choice"] != "pos" diff -r 1d773da0ccf0 -r 8ff0ac66f1a3 tools/seq_filter_by_mapping/tool_dependencies.xml --- a/tools/seq_filter_by_mapping/tool_dependencies.xml Tue Jan 27 08:31:13 2015 -0500 +++ b/tools/seq_filter_by_mapping/tool_dependencies.xml Wed May 13 11:08:58 2015 -0400 @@ -4,6 +4,6 @@ - +