| 
0
 | 
     1 #!/usr/bin/env python
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 """
 | 
| 
 | 
     4 convert fastqsolexa file to separated sequence and quality files.
 | 
| 
 | 
     5 
 | 
| 
 | 
     6 assume each sequence and quality score are contained in one line
 | 
| 
 | 
     7 the order should be:
 | 
| 
 | 
     8 1st line: @title_of_seq
 | 
| 
 | 
     9 2nd line: nucleotides
 | 
| 
 | 
    10 3rd line: +title_of_qualityscore (might be skipped)
 | 
| 
 | 
    11 4th line: quality scores 
 | 
| 
 | 
    12 (in three forms: a. digits, b. ASCII codes, the first char as the coding base, c. ASCII codes without the first char.)
 | 
| 
 | 
    13 
 | 
| 
 | 
    14 Usage:
 | 
| 
 | 
    15 %python fastqsolexa_to_fasta_qual.py <your_fastqsolexa_filename> <output_seq_filename> <output_score_filename>
 | 
| 
 | 
    16 """
 | 
| 
 | 
    17 
 | 
| 
 | 
    18 import sys, os
 | 
| 
 | 
    19 from math import *
 | 
| 
 | 
    20 
 | 
| 
 | 
    21 assert sys.version_info[:2] >= ( 2, 4 )
 | 
| 
 | 
    22 
 | 
| 
 | 
    23 def stop_err( msg ):
 | 
| 
 | 
    24     sys.stderr.write( "%s" % msg )
 | 
| 
 | 
    25     sys.exit()
 | 
| 
 | 
    26 
 | 
| 
 | 
    27 def __main__():
 | 
| 
 | 
    28     infile_name = sys.argv[1]
 | 
| 
 | 
    29     outfile_seq = open( sys.argv[2], 'w' )
 | 
| 
 | 
    30     outfile_score = open( sys.argv[3], 'w' )
 | 
| 
 | 
    31     datatype = sys.argv[4]
 | 
| 
 | 
    32     seq_title_startswith = ''
 | 
| 
 | 
    33     qual_title_startswith = ''
 | 
| 
 | 
    34     default_coding_value = 64
 | 
| 
 | 
    35     fastq_block_lines = 0
 | 
| 
 | 
    36     
 | 
| 
 | 
    37     for i, line in enumerate( file( infile_name ) ):
 | 
| 
 | 
    38         line = line.rstrip()
 | 
| 
 | 
    39         if not line or line.startswith( '#' ):
 | 
| 
 | 
    40             continue
 | 
| 
 | 
    41         fastq_block_lines = ( fastq_block_lines + 1 ) % 4
 | 
| 
 | 
    42         line_startswith = line[0:1]
 | 
| 
 | 
    43         if fastq_block_lines == 1:
 | 
| 
 | 
    44             # first line is @title_of_seq
 | 
| 
 | 
    45             if not seq_title_startswith:
 | 
| 
 | 
    46                 seq_title_startswith = line_startswith
 | 
| 
 | 
    47             if line_startswith != seq_title_startswith:
 | 
| 
 | 
    48                 outfile_seq.close()
 | 
| 
 | 
    49                 outfile_score.close()
 | 
| 
 | 
    50                 stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) )
 | 
| 
 | 
    51             read_title = line[1:]
 | 
| 
 | 
    52             outfile_seq.write( '>%s\n' % line[1:] )
 | 
| 
 | 
    53         elif fastq_block_lines == 2:
 | 
| 
 | 
    54             # second line is nucleotides
 | 
| 
 | 
    55             read_length = len( line )
 | 
| 
 | 
    56             outfile_seq.write( '%s\n' % line )
 | 
| 
 | 
    57         elif fastq_block_lines == 3:
 | 
| 
 | 
    58             # third line is +title_of_qualityscore ( might be skipped )
 | 
| 
 | 
    59             if not qual_title_startswith:
 | 
| 
 | 
    60                 qual_title_startswith = line_startswith
 | 
| 
 | 
    61             if line_startswith != qual_title_startswith:
 | 
| 
 | 
    62                 outfile_seq.close()
 | 
| 
 | 
    63                 outfile_score.close()
 | 
| 
 | 
    64                 stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) )    
 | 
| 
 | 
    65             quality_title = line[1:]
 | 
| 
 | 
    66             if quality_title and read_title != quality_title:
 | 
| 
 | 
    67                 outfile_seq.close()
 | 
| 
 | 
    68                 outfile_score.close()
 | 
| 
 | 
    69                 stop_err( 'Invalid fastqsolexa format at line %d: sequence title "%s" differes from score title "%s".' % ( i + 1, read_title, quality_title ) )
 | 
| 
 | 
    70             if not quality_title:
 | 
| 
 | 
    71                 outfile_score.write( '>%s\n' % read_title )
 | 
| 
 | 
    72             else:
 | 
| 
 | 
    73                 outfile_score.write( '>%s\n' % line[1:] )
 | 
| 
 | 
    74         else:
 | 
| 
 | 
    75             # fourth line is quality scores
 | 
| 
 | 
    76             qual = ''
 | 
| 
 | 
    77             fastq_integer = True
 | 
| 
 | 
    78             # peek: ascii or digits?
 | 
| 
 | 
    79             val = line.split()[0]
 | 
| 
 | 
    80             try: 
 | 
| 
 | 
    81                 check = int( val )
 | 
| 
 | 
    82                 fastq_integer = True
 | 
| 
 | 
    83             except:
 | 
| 
 | 
    84                 fastq_integer = False
 | 
| 
 | 
    85                 
 | 
| 
 | 
    86             if fastq_integer:
 | 
| 
 | 
    87                 # digits
 | 
| 
 | 
    88                 qual = line
 | 
| 
 | 
    89             else:
 | 
| 
 | 
    90                 # ascii
 | 
| 
 | 
    91                 quality_score_length = len( line )
 | 
| 
 | 
    92                 if quality_score_length == read_length + 1:
 | 
| 
 | 
    93                     # first char is qual_score_startswith
 | 
| 
 | 
    94                     qual_score_startswith = ord( line[0:1] )
 | 
| 
 | 
    95                     line = line[1:]
 | 
| 
 | 
    96                 elif quality_score_length == read_length:
 | 
| 
 | 
    97                     qual_score_startswith = default_coding_value
 | 
| 
 | 
    98                 else:
 | 
| 
 | 
    99                     stop_err( 'Invalid fastqsolexa format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % ( i + 1, quality_score_length, read_length ) )
 | 
| 
 | 
   100                 for j, char in enumerate( line ):
 | 
| 
 | 
   101                     score = ord( char ) - qual_score_startswith    # 64
 | 
| 
 | 
   102                     qual = "%s%s " % ( qual, str( score ) )
 | 
| 
 | 
   103             outfile_score.write( '%s\n' % qual )
 | 
| 
 | 
   104               
 | 
| 
 | 
   105     outfile_seq.close()
 | 
| 
 | 
   106     outfile_score.close()
 | 
| 
 | 
   107 
 | 
| 
 | 
   108 if __name__ == "__main__": __main__() 
 | 
| 
 | 
   109      |