Mercurial > repos > devteam > fastq_paired_end_joiner
diff fastq_paired_end_joiner.py @ 1:270a8ed8a300 draft
Uploaded tool version 2.0.0 by Simone Leo.
author | devteam |
---|---|
date | Mon, 07 Jul 2014 15:37:28 -0400 |
parents | 2793d1d765b9 |
children | 6a7f5da7c76d |
line wrap: on
line diff
--- a/fastq_paired_end_joiner.py Mon Jan 27 09:25:44 2014 -0500
+++ b/fastq_paired_end_joiner.py Mon Jul 07 15:37:28 2014 -0400
@@ -1,6 +1,114 @@
-#Dan Blankenberg
-import sys, os, shutil
-from galaxy_utils.sequence.fastq import fastqReader, fastqNamedReader, fastqWriter, fastqJoiner
+"""
+Extended version of Dan Blankenberg's fastq joiner ( adds support for
+recent Illumina headers ).
+"""
+
+import sys, re
+import galaxy_utils.sequence.fastq as fq
+
+
+class IDManager( object ):
+
+    def __init__( self, sep="\t" ):
+        """
+        Recent Illumina FASTQ header format::
+
+          @<COORDS> <FLAGS>
+          COORDS = <Instrument>:<Run #>:<Flowcell ID>:<Lane>:<Tile>:<X>:<Y>
+          FLAGS = <Read>:<Is Filtered>:<Control Number>:<Index Sequence>
+
+        where the whitespace character between <COORDS> and <FLAGS> can be
+        either a space or a tab.
+        """
+        self.sep = sep
+
+    def parse_id( self, identifier ):
+        try:
+            coords, flags = identifier.strip()[1:].split( self.sep, 1 )
+        except ValueError:
+            raise RuntimeError( "bad identifier: %r" % ( identifier, ))
+        return coords.split( ":" ), flags.split( ":" )
+
+    def join_id( self, parsed_id ):
+        coords, flags = parsed_id
+        return "@%s%s%s" % ( ":".join( coords ), self.sep, ":".join( flags ))
+
+    def get_read_number( self, parsed_id ):
+        return int( parsed_id[1][0] )
+
+    def set_read_number( self, parsed_id, n ):
+        parsed_id[1][0] = "%d" % n
+
+    def get_paired_identifier( self, read ):
+        t = self.parse_id( read.identifier )
+        n = self.get_read_number( t )
+        if n == 1:
+            pn = 2
+        elif n == 2:
+            pn = 1
+        else:
+            raise RuntimeError( "Unknown read number '%d'" % n )
+        self.set_read_number( t, pn )
+        return self.join_id( t )
+
+
+class FastqJoiner( fq.fastqJoiner ):
+
+    def __init__( self, format, force_quality_encoding=None, sep="\t" ):
+        super( FastqJoiner, self ).__init__( format, force_quality_encoding )
+        self.id_manager = IDManager( sep )
+
+    def join( self, read1, read2 ):
+        force_quality_encoding = self.force_quality_encoding
+        if not force_quality_encoding:
+            if read1.is_ascii_encoded():
+                force_quality_encoding = 'ascii'
+            else:
+                force_quality_encoding = 'decimal'
+        read1 = read1.convert_read_to_format( self.format, force_quality_encoding=force_quality_encoding )
+        read2 = read2.convert_read_to_format( self.format, force_quality_encoding=force_quality_encoding )
+        #--
+        t1, t2 = [ self.id_manager.parse_id( r.identifier ) for r in ( read1, read2 ) ]
+        if self.id_manager.get_read_number( t1 ) == 2:
+            if not self.id_manager.get_read_number( t2 ) == 1:
+                raise RuntimeError( "input files are not from mated pairs" )
+            read1, read2 = read2, read1
+            t1, t2 = t2, t1
+        #--
+        rval = fq.FASTQ_FORMATS[self.format]()
+        rval.identifier = read1.identifier
+        rval.description = "+"
+        if len( read1.description ) > 1:
+            rval.description += rval.identifier[1:]
+        if rval.sequence_space == 'color':
+            # convert to nuc space, join, then convert back
+            rval.sequence = rval.convert_base_to_color_space(
+                read1.convert_color_to_base_space( read1.sequence ) +
+                read2.convert_color_to_base_space( read2.sequence )
+            )
+        else:
+            rval.sequence = read1.sequence + read2.sequence
+        if force_quality_encoding == 'ascii':
+            rval.quality = read1.quality + read2.quality
+        else:
+            rval.quality = "%s %s" % (
+                read1.quality.strip(), read2.quality.strip()
+            )
+        return rval
+
+    def get_paired_identifier( self, read ):
+        return self.id_manager.get_paired_identifier( read )
+
+
+def sniff_sep( fastq_fn ):
+    header = ""
+    with open( fastq_fn ) as f:
+        while header == "":
+            try:
+                header = f.next().strip()
+            except StopIteration:
+                raise RuntimeError( "%r: empty file" % ( fastq_fn, ) )
+    return re.search( r"\s", header ).group()
 
 def main():
     #Read command line arguments
@@ -10,16 +118,22 @@
     input2_type = sys.argv[4] or 'sanger'
     output_filename = sys.argv[5]
 
+    fastq_style = sys.argv[6] or 'old'
+    #--
     if input1_type != input2_type:
         print "WARNING: You are trying to join files of two different types: %s and %s." % ( input1_type, input2_type )
 
-    input2 = fastqNamedReader( open( input2_filename, 'rb' ), input2_type )
-    joiner = fastqJoiner( input1_type )
-    out = fastqWriter( open( output_filename, 'wb' ), format = input1_type )
-
+    if fastq_style == 'new':
+        sep = sniff_sep( input1_filename )
+        joiner = FastqJoiner( input1_type, sep=sep )
+    else:
+        joiner = fq.fastqJoiner( input1_type )
+    #--
+    input2 = fq.fastqNamedReader( open( input2_filename, 'rb' ), input2_type )
+    out = fq.fastqWriter( open( output_filename, 'wb' ), format=input1_type )
     i = None
     skip_count = 0
-    for i, fastq_read in enumerate( fastqReader( open( input1_filename, 'rb' ), format = input1_type ) ):
+    for i, fastq_read in enumerate( fq.fastqReader( open( input1_filename, 'rb' ), format=input1_type ) ):
         identifier = joiner.get_paired_identifier( fastq_read )
         fastq_paired = input2.get( identifier )
         if fastq_paired is None:
@@ -32,7 +146,7 @@
         print "Your file contains no valid FASTQ reads."
     else:
         print input2.has_data()
-        print 'Joined %s of %s read pairs (%.2f%%).' % ( i - skip_count + 1, i + 1, float( i - skip_count + 1 ) / float( i + 1 ) * 100.0 )
+        print 'Joined %s of %s read pairs (%.2f%%).' % ( i - skip_count + 1, i + 1, ( i - skip_count + 1 ) / ( i + 1 ) * 100.0 )
 
 if __name__ == "__main__":
     main()