0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 Adapted from bx/scripts/qv_to_bqv.py
|
|
5
|
|
6 Convert a qual (qv) file to several BinnedArray files for fast seek.
|
|
7 This script takes approximately 4 seconds per 1 million base pairs.
|
|
8
|
|
9 The input format is fasta style quality -- fasta headers followed by
|
|
10 whitespace separated integers.
|
|
11
|
|
12 usage: %prog qual_file output_file
|
|
13 """
|
|
14
|
|
15 import pkg_resources
|
|
16 pkg_resources.require( "bx-python" )
|
|
17 pkg_resources.require( "numpy" )
|
|
18 import string
|
|
19 import psyco_full
|
|
20 import sys, re, os, tempfile
|
|
21 from bx.binned_array import BinnedArrayWriter
|
|
22 from bx.cookbook import *
|
|
23 import fileinput
|
|
24
|
|
25 def load_scores_ba_dir( dir ):
|
|
26 """
|
|
27 Return a dict-like object (keyed by chromosome) that returns
|
|
28 FileBinnedArray objects created from "key.ba" files in `dir`
|
|
29 """
|
|
30 return FileBinnedArrayDir( dir )
|
|
31
|
|
32 def main():
|
|
33 args = sys.argv[1:]
|
|
34 try:
|
|
35 qual_file_dir = args[0]
|
|
36 #mydir="/home/gua110/Desktop/chimp_quality_scores/chr22.qa"
|
|
37 mydir="/home/gua110/Desktop/rhesus_quality_scores/rheMac2.qual.qv"
|
|
38 qual_file_dir = mydir.replace(mydir.split("/")[-1], "")
|
|
39 output_file = args[ 1 ]
|
|
40 fo = open(output_file,"w")
|
|
41 except:
|
|
42 print "usage: qual_file output_file"
|
|
43 sys.exit()
|
|
44
|
|
45 tmpfile = tempfile.NamedTemporaryFile()
|
|
46 cmdline = "ls " + qual_file_dir + "*.qa | cat >> " + tmpfile.name
|
|
47 os.system (cmdline)
|
|
48 for qual_file in tmpfile.readlines():
|
|
49 qual = fileinput.FileInput( qual_file.strip() )
|
|
50 outfile = None
|
|
51 outbin = None
|
|
52 base_count = 0
|
|
53 mega_count = 0
|
|
54
|
|
55 for line in qual:
|
|
56 line = line.rstrip("\r\n")
|
|
57 if line.startswith(">"):
|
|
58 # close old
|
|
59 if outbin and outfile:
|
|
60 print "\nFinished region " + region + " at " + str(base_count) + " base pairs."
|
|
61 outbin.finish()
|
|
62 outfile.close()
|
|
63 # start new file
|
|
64 region = line.lstrip(">")
|
|
65 #outfname = output_file + "." + region + ".bqv" #CHANGED
|
|
66 outfname = qual_file.strip() + ".bqv"
|
|
67 print >>fo, "Writing region " + region + " to file " + outfname
|
|
68 outfile = open( outfname , "wb")
|
|
69 outbin = BinnedArrayWriter(outfile, typecode='b', default=0)
|
|
70 base_count = 0
|
|
71 mega_count = 0
|
|
72 else:
|
|
73 if outfile and outbin:
|
|
74 nums = line.split()
|
|
75 for val in nums:
|
|
76 outval = int(val)
|
|
77 assert outval <= 255 and outval >= 0
|
|
78 outbin.write(outval)
|
|
79 base_count += 1
|
|
80 if (mega_count * 1000000) <= base_count:
|
|
81 sys.stdout.write(str(mega_count)+" ")
|
|
82 sys.stdout.flush()
|
|
83 mega_count = base_count // 1000000 + 1
|
|
84 if outbin and outfile:
|
|
85 print "\nFinished region " + region + " at " + str(base_count) + " base pairs."
|
|
86 outbin.finish()
|
|
87 outfile.close()
|
|
88
|
|
89 if __name__ == "__main__":
|
|
90 main()
|