Mercurial > repos > xuebing > sharplabtool
comparison tools/solid_tools/solid_qual_stats.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 #!/usr/bin/env python | |
2 #Guruprasad Ananda | |
3 | |
4 import sys, os, zipfile, tempfile | |
5 | |
6 QUAL_UPPER_BOUND = 41 | |
7 QUAL_LOWER_BOUND = 1 | |
8 | |
9 def stop_err( msg ): | |
10 sys.stderr.write( "%s\n" % msg ) | |
11 sys.exit() | |
12 | |
13 def unzip( filename ): | |
14 zip_file = zipfile.ZipFile( filename, 'r' ) | |
15 tmpfilename = tempfile.NamedTemporaryFile().name | |
16 for name in zip_file.namelist(): | |
17 file( tmpfilename, 'a' ).write( zip_file.read( name ) ) | |
18 zip_file.close() | |
19 return tmpfilename | |
20 | |
21 def __main__(): | |
22 | |
23 infile_score_name = sys.argv[1].strip() | |
24 fout = open(sys.argv[2].strip(),'r+w') | |
25 | |
26 infile_is_zipped = False | |
27 if zipfile.is_zipfile( infile_score_name ): | |
28 infile_is_zipped = True | |
29 infile_name = unzip( infile_score_name ) | |
30 else: | |
31 infile_name = infile_score_name | |
32 | |
33 readlen = None | |
34 invalid_lines = 0 | |
35 j = 0 | |
36 for line in file( infile_name ): | |
37 line = line.strip() | |
38 if not(line) or line.startswith("#") or line.startswith(">"): | |
39 continue | |
40 elems = line.split() | |
41 try: | |
42 for item in elems: | |
43 int(item) | |
44 if not readlen: | |
45 readlen = len(elems) | |
46 if len(elems) != readlen: | |
47 print "Note: Reads in the input dataset are of variable lengths." | |
48 j += 1 | |
49 except ValueError: | |
50 invalid_lines += 1 | |
51 if j > 10: | |
52 break | |
53 | |
54 position_dict = {} | |
55 print >>fout, "column\tcount\tmin\tmax\tsum\tmean\tQ1\tmed\tQ3\tIQR\tlW\trW" | |
56 for k,line in enumerate(file( infile_name )): | |
57 line = line.strip() | |
58 if not(line) or line.startswith("#") or line.startswith(">"): | |
59 continue | |
60 elems = line.split() | |
61 if position_dict == {}: | |
62 for pos in range(readlen): | |
63 position_dict[pos] = [0]*QUAL_UPPER_BOUND | |
64 if len(elems) != readlen: | |
65 invalid_lines += 1 | |
66 continue | |
67 for ind,item in enumerate(elems): | |
68 try: | |
69 item = int(item) | |
70 position_dict[ind][item]+=1 | |
71 except: | |
72 pass | |
73 | |
74 invalid_positions = 0 | |
75 for pos in position_dict: | |
76 carr = position_dict[pos] #count array for position pos | |
77 total = sum(carr) #number of bases found in this column. | |
78 med_elem = int(round(total/2.0)) | |
79 lowest = None #Lowest quality score value found in this column. | |
80 highest = None #Highest quality score value found in this column. | |
81 median = None #Median quality score value found in this column. | |
82 qsum = 0.0 #Sum of quality score values for this column. | |
83 q1 = None #1st quartile quality score. | |
84 q3 = None #3rd quartile quality score. | |
85 q1_elem = int(round((total+1)/4.0)) | |
86 q3_elem = int(round((total+1)*3/4.0)) | |
87 | |
88 try: | |
89 for ind,cnt in enumerate(carr): | |
90 qsum += ind*cnt | |
91 | |
92 if cnt!=0: | |
93 highest = ind | |
94 | |
95 if lowest==None and cnt!=0: #first non-zero count | |
96 lowest = ind | |
97 | |
98 if q1==None: | |
99 if sum(carr[:ind+1]) >= q1_elem: | |
100 q1 = ind | |
101 | |
102 if median==None: | |
103 if sum(carr[:ind+1]) < med_elem: | |
104 continue | |
105 median = ind | |
106 if total%2 == 0: #even number of elements | |
107 median2 = median | |
108 if sum(carr[:ind+1]) < med_elem+1: | |
109 for ind2,elem in enumerate(carr[ind+1:]): | |
110 if elem != 0: | |
111 median2 = ind+ind2+1 | |
112 break | |
113 median = (median + median2)/2.0 | |
114 | |
115 | |
116 if q3==None: | |
117 if sum(carr[:ind+1]) >= q3_elem: | |
118 q3 = ind | |
119 | |
120 | |
121 mean = qsum/total #Mean quality score value for this column. | |
122 iqr = q3-q1 | |
123 left_whisker = max(q1 - 1.5*iqr,lowest) | |
124 right_whisker = min(q3 + 1.5*iqr,highest) | |
125 | |
126 print >>fout,"%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" %(pos+1,total,lowest,highest,qsum,mean,q1,median,q3,iqr,left_whisker,right_whisker) | |
127 except: | |
128 invalid_positions += 1 | |
129 nullvals = ['NA']*11 | |
130 print >>fout,"%s\t%s" %(pos+1,'\t'.join(nullvals)) | |
131 | |
132 if invalid_lines: | |
133 print "Skipped %d reads as invalid." %invalid_lines | |
134 if invalid_positions: | |
135 print "Skipped stats computation for %d read positions." %invalid_positions | |
136 | |
137 if __name__=="__main__": | |
138 __main__() | |
139 | |
140 |