diff tools/next_gen_conversion/solid2fastq.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/next_gen_conversion/solid2fastq.py	Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,214 @@
+#!/usr/bin/env python
+
+import sys
+import string
+import optparse
+import tempfile
+import sqlite3
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+    
+def solid2sanger( quality_string, min_qual = 0 ):
+    sanger = ""
+    quality_string = quality_string.rstrip( " " )
+    for qv in quality_string.split(" "):
+        try:
+            if int( qv ) < 0:
+                qv = '0'
+            if int( qv ) < min_qual:
+                return False
+                break
+            sanger += chr( int( qv ) + 33 )
+        except:
+            pass    
+    return sanger
+
+def Translator(frm='', to='', delete='', keep=None):
+    allchars = string.maketrans('','')
+    if len(to) == 1:
+        to = to * len(frm)
+    trans = string.maketrans(frm, to)
+    if keep is not None:
+        delete = allchars.translate(allchars, keep.translate(allchars, delete))
+    def callable(s):
+        return s.translate(trans, delete)
+    return callable
+    
+def merge_reads_qual( f_reads, f_qual, f_out, trim_name=False, out='fastq', double_encode = False, trim_first_base = False, pair_end_flag = '', min_qual = 0, table_name=None ):
+ 
+    # Reads from two files f_csfasta (reads) and f_qual (quality values) and produces output in three formats depending on out parameter,
+    # which can have three values: fastq, txt, and db
+    # fastq = fastq format
+    # txt = space delimited format with defline, reads, and qvs
+    # dp = dump data into sqlite3 db. 
+    # IMPORTNAT! If out = db two optins must be provided:
+    #   1. f_out must be a db connection object initialized with sqlite3.connect()
+    #   2. table_name must be provided
+    
+    if out == 'db':
+        cursor = f_out.cursor()
+        sql = "create table %s (name varchar(50) not null, read blob, qv blob)" % table_name
+        cursor.execute(sql)
+    
+    lines = []
+    line = " "
+    while line:
+        for f in [ f_reads, f_qual ]:
+            line = f.readline().rstrip( '\n\r' )
+            while line.startswith( '#' ):
+                line = f.readline().rstrip( '\n\r' )
+            lines.append( line )
+    
+            
+        if lines[0].startswith( '>' ) and lines[1].startswith( '>' ):
+            
+            if lines[0] != lines[1]:
+                stop_err('Files reads and quality score files are out of sync and likely corrupted. Please, check your input data')
+            
+            defline = lines[0][1:]
+            if trim_name and ( defline[ len( defline )-3: ] == "_F3" or defline[ len( defline )-3: ] == "_R3" ):
+                defline = defline[ : len( defline )-3 ]
+                
+        elif ( not lines[0].startswith( '>' ) and not lines[1].startswith( '>' ) and len( lines[0] ) > 0 and len( lines[1] ) > 0 ):
+
+            if trim_first_base:
+                lines[0] = lines[0][1:]
+            if double_encode:
+                de = Translator(frm="0123.", to="ACGTN")
+                lines[0] = de(lines[0])
+            qual = solid2sanger( lines[1], int( min_qual ) )
+            if qual:
+                if out == 'fastq':
+                    f_out.write( "@%s%s\n%s\n+\n%s\n" % ( defline, pair_end_flag, lines[0], qual ) ) 
+                if out == 'txt':
+                    f_out.write( '%s %s %s\n' % (defline, lines[0], qual ) )
+                if out == 'db':
+                    cursor.execute('insert into %s values("%s","%s","%s")' % (table_name, defline, lines[0], qual ) )     
+        lines = []
+
+def main():
+
+    usage = "%prog --fr F3.csfasta --fq R3.csfasta --fout fastq_output_file [option]"
+    parser = optparse.OptionParser(usage=usage)
+    
+        
+    parser.add_option(
+        '--fr','--f_reads',
+        metavar="F3_CSFASTA_FILE",
+        dest='fr',
+        help='Name of F3 file with color space reads')
+        
+    parser.add_option(
+        '--fq','--f_qual',
+        metavar="F3_QUAL_FILE",
+        dest='fq',
+        help='Name of F3 file with color quality values')
+        
+    parser.add_option(
+        '--fout','--f3_fastq_output',
+        metavar="F3_OUTPUT",
+        dest='fout',
+        help='Name for F3 output file')
+                
+    parser.add_option(
+        '--rr','--r_reads',
+        metavar="R3_CSFASTA_FILE",
+        dest='rr',
+        default = False,
+        help='Name of R3 file with color space reads')
+        
+    parser.add_option(
+        '--rq','--r_qual',
+        metavar="R3_QUAL_FILE",
+        dest='rq',
+        default = False,
+        help='Name of R3 file with color quality values')
+        
+    parser.add_option(
+        '--rout',
+        metavar="R3_OUTPUT",
+        dest='rout',
+        help='Name for F3 output file')
+    
+    parser.add_option(
+        '-q','--min_qual',
+        dest='min_qual',
+        default = '-1000',
+        help='Minimum quality threshold for printing reads. If a read contains a single call with QV lower than this value, it will not be reported. Default is -1000')
+        
+    parser.add_option(
+        '-t','--trim_name',
+        dest='trim_name',
+        action='store_true',
+        default = False,
+        help='Trim _R3 and _F3 off read names. Default is False')
+    
+    parser.add_option(
+        '-f','--trim_first_base',
+        dest='trim_first_base',
+        action='store_true',
+        default = False,
+        help='Remove the first base of reads in color-space. Default is False')
+        
+    parser.add_option(
+        '-d','--double_encode',
+        dest='de',
+        action='store_true',
+        default = False,
+        help='Double encode color calls as nucleotides: 0123. becomes ACGTN. Default is False')
+    
+    options, args = parser.parse_args()
+    
+    if not ( options.fout and options.fr and options.fq ):
+        parser.error("""
+        One or more of the three required paremetrs is missing:
+        (1) --fr F3.csfasta file
+        (2) --fq F3.qual file
+        (3) --fout name of output file
+        Use --help for more info
+        """)
+
+    fr =  open ( options.fr , 'r' ) 
+    fq =  open ( options.fq , 'r' ) 
+    f_out = open ( options.fout , 'w' )
+    
+    if options.rr and options.rq:
+        rr =  open ( options.rr , 'r' ) 
+        rq =  open ( options.rq , 'r' ) 
+        if not options.rout:
+            parser.error("Provide the name for f3 output using --rout option. Use --help for more info")
+        r_out = open ( options.rout, 'w' )
+    
+        db = tempfile.NamedTemporaryFile()
+        
+        try:
+            con = sqlite3.connect(db.name)
+            cur = con.cursor()
+        except:
+            stop_err('Cannot connect to %s\n') % db.name
+    
+         
+        merge_reads_qual( fr, fq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="f3" )
+        merge_reads_qual( rr, rq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="r3" )
+        cur.execute('create index f3_name on f3( name )')
+        cur.execute('create index r3_name on r3( name )')
+         
+        cur.execute('select * from f3,r3 where f3.name = r3.name')
+        for item in cur:
+            f_out.write( "@%s%s\n%s\n+\n%s\n" % (item[0], "/1", item[1], item[2]) )
+            r_out.write( "@%s%s\n%s\n+\n%s\n" % (item[3], "/2", item[4], item[5]) )
+        
+            
+    else:
+        merge_reads_qual( fr, fq, f_out, trim_name=options.trim_name, out='fastq', double_encode = options.de, trim_first_base = options.trim_first_base, min_qual=options.min_qual )
+        
+    
+      
+    f_out.close()
+
+if __name__ == "__main__":
+    main()
+    
+ 
\ No newline at end of file