1
|
1 """
|
|
2 Extended version of Dan Blankenberg's fastq joiner ( adds support for
|
|
3 recent Illumina headers ).
|
|
4 """
|
|
5
|
|
6 import sys, re
|
|
7 import galaxy_utils.sequence.fastq as fq
|
|
8
|
|
9
|
|
class IDManager(object):
    """Parse, edit and rebuild read identifiers of the form ``@<COORDS><sep><FLAGS>``."""

    # Read number of the mate for each read number accepted by
    # get_paired_identifier().
    _MATE_OF = {1: 2, 2: 1}

    def __init__(self, sep="\t"):
        """
        Remember the separator found between the two halves of a header.

        Recent Illumina FASTQ headers look like::

          @<COORDS> <FLAGS>
          COORDS = <Instrument>:<Run #>:<Flowcell ID>:<Lane>:<Tile>:<X>:<Y>
          FLAGS = <Read>:<Is Filtered>:<Control Number>:<Index Sequence>

        The whitespace character separating <COORDS> from <FLAGS> may be
        a space or a tab; ``sep`` is the one this instance splits and
        joins on.
        """
        self.sep = sep

    def parse_id(self, identifier):
        """
        Split an identifier into ``(coords, flags)``, two lists of strings.

        Surrounding whitespace and the first character are dropped, the
        rest is cut at the first occurrence of ``self.sep`` and each half
        is split on ``":"``.  Raises RuntimeError when the cut does not
        yield exactly two parts.
        """
        try:
            halves = identifier.strip()[1:].split(self.sep, 1)
            coords_text, flags_text = halves
        except ValueError:
            raise RuntimeError("bad identifier: %r" % (identifier,))
        return coords_text.split(":"), flags_text.split(":")

    def join_id(self, parsed_id):
        """Rebuild the identifier string from a ``(coords, flags)`` pair."""
        coords, flags = parsed_id
        coords_text = ":".join(coords)
        flags_text = ":".join(flags)
        return "@%s%s%s" % (coords_text, self.sep, flags_text)

    def get_read_number(self, parsed_id):
        """Return the first flag field converted to an int."""
        flags = parsed_id[1]
        return int(flags[0])

    def set_read_number(self, parsed_id, n):
        """Overwrite the first flag field, in place, with ``n`` as decimal text."""
        number_text = "%d" % n
        parsed_id[1][0] = number_text

    def get_paired_identifier(self, read):
        """
        Return ``read.identifier`` with its read number swapped (1 <-> 2).

        Raises RuntimeError for any other read number.
        """
        parsed = self.parse_id(read.identifier)
        number = self.get_read_number(parsed)
        mate_number = self._MATE_OF.get(number)
        if mate_number is None:
            raise RuntimeError("Unknown read number '%d'" % number)
        self.set_read_number(parsed, mate_number)
        return self.join_id(parsed)
|
|
53
|
|
54
|
|
class FastqJoiner(fq.fastqJoiner):
    """
    Joiner for reads whose mate number is stored in the <FLAGS> part of the
    header; identifiers are parsed and rebuilt by an ``IDManager``.
    """

    def __init__(self, format, force_quality_encoding=None, sep="\t"):
        """
        ``format`` and ``force_quality_encoding`` are handed to the base
        class unchanged; ``sep`` is the <COORDS>/<FLAGS> separator given to
        the ``IDManager``.
        """
        super(FastqJoiner, self).__init__(format, force_quality_encoding)
        self.id_manager = IDManager(sep)

    def join(self, read1, read2):
        """
        Return one read made of ``read1`` followed by ``read2``.

        Both reads are first converted to ``self.format``.  If ``read1``
        carries read number 2 the two reads are swapped, provided that
        ``read2`` carries read number 1; otherwise RuntimeError is raised.
        The result takes the identifier of the (possibly swapped) first
        read and the concatenated sequences and qualities.
        """
        encoding = self.force_quality_encoding
        if not encoding:
            # No encoding was forced: follow whatever read1 uses.
            encoding = 'ascii' if read1.is_ascii_encoded() else 'decimal'
        read1 = read1.convert_read_to_format(self.format, force_quality_encoding=encoding)
        read2 = read2.convert_read_to_format(self.format, force_quality_encoding=encoding)

        # Both identifiers are parsed up front, so a malformed header on
        # either read is reported regardless of the order check below.
        ids = self.id_manager
        first_id = ids.parse_id(read1.identifier)
        second_id = ids.parse_id(read2.identifier)
        if ids.get_read_number(first_id) == 2:
            if ids.get_read_number(second_id) != 1:
                raise RuntimeError("input files are not from mated pairs")
            read1, read2 = read2, read1

        joined = fq.FASTQ_FORMATS[self.format]()
        joined.identifier = read1.identifier
        joined.description = "+"
        if len(read1.description) > 1:
            # read1 repeats its identifier on the "+" line; do the same.
            joined.description += joined.identifier[1:]

        if joined.sequence_space == 'color':
            # Color space: go through base space, concatenate, convert back.
            bases = (read1.convert_color_to_base_space(read1.sequence) +
                     read2.convert_color_to_base_space(read2.sequence))
            joined.sequence = joined.convert_base_to_color_space(bases)
        else:
            joined.sequence = read1.sequence + read2.sequence

        if encoding == 'ascii':
            joined.quality = read1.quality + read2.quality
        else:
            # Decimal scores are space separated, so keep a single space
            # between the two quality strings.
            joined.quality = "%s %s" % (read1.quality.strip(), read2.quality.strip())
        return joined

    def get_paired_identifier(self, read):
        """Return the identifier expected for the mate of ``read``."""
        return self.id_manager.get_paired_identifier(read)
|
|
101
|
|
102
|
|
def sniff_sep(fastq_fn):
    """
    Return the first whitespace character found in the first non-blank
    line of ``fastq_fn``.

    Lines are stripped before being examined, so only whitespace inside
    the header counts.  For headers of the form ``@<COORDS> <FLAGS>`` this
    is the character separating the two parts.

    Raises RuntimeError if the file has no non-blank line, or if its first
    non-blank line contains no whitespace at all.
    """
    header = ""
    with open(fastq_fn) as f:
        # Plain iteration works on both Python 2 and 3 (f.next() is
        # Python 2 only) and simply ends on an empty file.
        for line in f:
            header = line.strip()
            if header:
                break
    if not header:
        raise RuntimeError("%r: empty file" % (fastq_fn,))
    match = re.search(r"\s", header)
    if match is None:
        # re.search() returns None when nothing matches; report it instead
        # of failing with AttributeError on None.group().
        raise RuntimeError(
            "%r: no whitespace separator in header %r" % (fastq_fn, header)
        )
    return match.group()
|
0
|
112
|
|
def main():
    """
    Join the reads of two FASTQ files pair by pair; driven by ``sys.argv``.

    Positional arguments:

      1. first input file
      2. type of the first input (empty string means 'sanger')
      3. second input file
      4. type of the second input (empty string means 'sanger')
      5. output file
      6. header style, optional: 'new' selects ``FastqJoiner`` with a
         separator sniffed from the first input; anything else (or
         nothing) selects ``fq.fastqJoiner``

    For every read of the first input, the mate is looked up by identifier
    in the second input; reads without a mate are counted as skipped.  A
    summary is printed at the end.
    """
    # Read command line arguments
    input1_filename = sys.argv[1]
    input1_type = sys.argv[2] or 'sanger'
    input2_filename = sys.argv[3]
    input2_type = sys.argv[4] or 'sanger'
    output_filename = sys.argv[5]
    # The style argument has a default, so it may also be left out entirely.
    fastq_style = (sys.argv[6] if len(sys.argv) > 6 else '') or 'old'

    if input1_type != input2_type:
        print("WARNING: You are trying to join files of two different types: %s and %s." % (input1_type, input2_type))

    if fastq_style == 'new':
        sep = sniff_sep(input1_filename)
        joiner = FastqJoiner(input1_type, sep=sep)
    else:
        joiner = fq.fastqJoiner(input1_type)

    input1_file = open(input1_filename, 'rb')
    input2_file = open(input2_filename, 'rb')
    try:
        input2 = fq.fastqNamedReader(input2_file, input2_type)
        out = fq.fastqWriter(open(output_filename, 'wb'), format=input1_type)
        total = 0
        skip_count = 0
        try:
            for fastq_read in fq.fastqReader(input1_file, format=input1_type):
                total += 1
                identifier = joiner.get_paired_identifier(fastq_read)
                fastq_paired = input2.get(identifier)
                if fastq_paired is None:
                    skip_count += 1
                else:
                    out.write(joiner.join(fastq_read, fastq_paired))
        finally:
            out.close()

        if total == 0:
            print("Your file contains no valid FASTQ reads.")
        else:
            # has_data() is queried while the second input is still open.
            # NOTE(review): presumably reports leftover reads in the second
            # input -- confirm against galaxy_utils.
            print(input2.has_data())
            joined = total - skip_count
            # Multiply by 100.0 first: under Python 2, int / int truncates
            # and the percentage would always come out as 0 or 100.
            percent = 100.0 * joined / total
            print('Joined %s of %s read pairs (%.2f%%).' % (joined, total, percent))
    finally:
        # The input handles were previously left for the interpreter to
        # clean up.
        input2_file.close()
        input1_file.close()
|
0
|
150
|
|
# Run the joiner only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|