diff tools/regVariation/qv_to_bqv.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/qv_to_bqv.py	Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+
+"""
+Adapted from bx/scripts/qv_to_bqv.py
+
+Convert a qual (qv) file to several BinnedArray files for fast seek.
+This script takes approximately 4 seconds per 1 million base pairs.
+
+The input format is fasta style quality -- fasta headers followed by 
+whitespace separated integers.
+
+usage: %prog qual_file output_file
+"""
+
+import pkg_resources 
+pkg_resources.require( "bx-python" )
+pkg_resources.require( "numpy" )
+import string
+import psyco_full
+import sys, re, os, tempfile
+from bx.binned_array import BinnedArrayWriter
+from bx.cookbook import *
+import fileinput
+
+def load_scores_ba_dir( dir ):
+    """
+    Return a dict-like object (keyed by chromosome) that returns 
+    FileBinnedArray objects created from "key.ba" files in `dir`
+    """
+    return FileBinnedArrayDir( dir )
+
+def main():
+    args = sys.argv[1:]
+    try:
+        qual_file_dir = args[0]
+        #mydir="/home/gua110/Desktop/chimp_quality_scores/chr22.qa"
+        mydir="/home/gua110/Desktop/rhesus_quality_scores/rheMac2.qual.qv"
+        qual_file_dir = mydir.replace(mydir.split("/")[-1], "")
+        output_file = args[ 1 ]
+        fo = open(output_file,"w")
+    except:
+        print "usage: qual_file output_file"
+        sys.exit()
+    
+    tmpfile = tempfile.NamedTemporaryFile()
+    cmdline = "ls " + qual_file_dir + "*.qa | cat >> " + tmpfile.name
+    os.system (cmdline)
+    for qual_file in tmpfile.readlines():
+        qual = fileinput.FileInput( qual_file.strip() )
+        outfile = None
+        outbin = None
+        base_count = 0
+        mega_count = 0
+    
+        for line in qual:
+            line = line.rstrip("\r\n")
+            if line.startswith(">"):
+                # close old
+                if outbin and outfile:
+                    print "\nFinished region " + region + " at " + str(base_count) + " base pairs."
+                    outbin.finish()
+                    outfile.close()
+                # start new file
+                region = line.lstrip(">")
+                #outfname = output_file + "." + region + ".bqv" #CHANGED
+                outfname = qual_file.strip() + ".bqv"
+                print >>fo, "Writing region " + region + " to file " + outfname
+                outfile = open( outfname , "wb")
+                outbin = BinnedArrayWriter(outfile, typecode='b', default=0)
+                base_count = 0
+                mega_count = 0
+            else:
+                if outfile and outbin:
+                    nums = line.split()
+                    for val in nums:
+                        outval = int(val)
+                        assert outval <= 255 and outval >= 0
+                        outbin.write(outval)
+                        base_count += 1
+                    if (mega_count * 1000000) <= base_count:
+                        sys.stdout.write(str(mega_count)+" ")
+                        sys.stdout.flush()
+                        mega_count = base_count // 1000000 + 1
+        if outbin and outfile:
+            print "\nFinished region " + region + " at " + str(base_count) + " base pairs."
+            outbin.finish()
+            outfile.close()
+
+if __name__ == "__main__":
+    main()