Mercurial > repos > artbio > lumpy_sv
comparison pairend_distro.py @ 0:796552c157de draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/lumpy-sv commit d06124e8a097f3f665b4955281f40fe811eaee64
author | artbio |
---|---|
date | Mon, 24 Jul 2017 08:03:17 -0400 |
parents | |
children | 1ed8619a5611 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:796552c157de |
---|---|
1 #!/usr/bin/env python | |
2 # (c) 2012 - Ryan M. Layer | |
3 # Hall Laboratory | |
4 # Quinlan Laboratory | |
5 # Department of Computer Science | |
6 # Department of Biochemistry and Molecular Genetics | |
7 # Department of Public Health Sciences and Center for Public Health Genomics, | |
8 # University of Virginia | |
9 # rl6sf@virginia.edu | |
10 | |
11 import sys | |
12 import numpy as np | |
13 from operator import itemgetter | |
14 from optparse import OptionParser | |
15 | |
16 # some constants for sam/bam field ids | |
17 SAM_FLAG = 1 | |
18 SAM_REFNAME = 2 | |
19 SAM_MATE_REFNAME = 6 | |
20 SAM_ISIZE = 8 | |
21 | |
22 parser = OptionParser() | |
23 | |
24 parser.add_option("-r", | |
25 "--read_length", | |
26 type="int", | |
27 dest="read_length", | |
28 help="Read length") | |
29 | |
30 parser.add_option("-X", | |
31 dest="X", | |
32 type="int", | |
33 help="Number of stdevs from mean to extend") | |
34 | |
35 parser.add_option("-N", | |
36 dest="N", | |
37 type="int", | |
38 help="Number to sample") | |
39 | |
40 parser.add_option("-o", | |
41 dest="output_file", | |
42 help="Output file") | |
43 | |
44 parser.add_option("-m", | |
45 dest="mads", | |
46 type="int", | |
47 default=10, | |
48 help="Outlier cutoff in # of median absolute deviations (unscaled, upper only)") | |
49 | |
50 def unscaled_upper_mad(xs): | |
51 """Return a tuple consisting of the median of xs followed by the | |
52 unscaled median absolute deviation of the values in xs that lie | |
53 above the median. | |
54 """ | |
55 med = np.median(xs) | |
56 return med, np.median(xs[xs > med] - med) | |
57 | |
58 | |
59 (options, args) = parser.parse_args() | |
60 | |
61 if not options.read_length: | |
62 parser.error('Read length not given') | |
63 | |
64 if not options.X: | |
65 parser.error('X not given') | |
66 | |
67 if not options.N: | |
68 parser.error('N not given') | |
69 | |
70 if not options.output_file: | |
71 parser.error('Output file not given') | |
72 | |
73 | |
74 required = 97 | |
75 restricted = 3484 | |
76 flag_mask = required | restricted | |
77 | |
78 L = [] | |
79 c = 0 | |
80 | |
81 for l in sys.stdin: | |
82 if c >= options.N: | |
83 break | |
84 | |
85 A = l.rstrip().split('\t') | |
86 flag = int(A[SAM_FLAG]) | |
87 refname = A[SAM_REFNAME] | |
88 mate_refname = A[SAM_MATE_REFNAME] | |
89 isize = int(A[SAM_ISIZE]) | |
90 | |
91 want = mate_refname == "=" and flag & flag_mask == required and isize >= 0 | |
92 if want: | |
93 c += 1 | |
94 L.append(isize) | |
95 | |
96 # warn if very few elements in distribution | |
97 min_elements = 1000 | |
98 if len(L) < min_elements: | |
99 sys.stderr.write("Warning: only %s elements in distribution (min: %s)\n" % (len(L), min_elements)) | |
100 mean = "NA" | |
101 stdev = "NA" | |
102 | |
103 else: | |
104 # Remove outliers | |
105 L = np.array(L) | |
106 L.sort() | |
107 med, umad = unscaled_upper_mad(L) | |
108 upper_cutoff = med + options.mads * umad | |
109 L = L[L < upper_cutoff] | |
110 new_len = len(L) | |
111 removed = c - new_len | |
112 sys.stderr.write("Removed %d outliers with isize >= %d\n" % | |
113 (removed, upper_cutoff)) | |
114 c = new_len | |
115 | |
116 mean = np.mean(L) | |
117 stdev = np.std(L) | |
118 | |
119 start = options.read_length | |
120 end = int(mean + options.X*stdev) | |
121 | |
122 H = [0] * (end - start + 1) | |
123 s = 0 | |
124 | |
125 for x in L: | |
126 if (x >= start) and (x <= end): | |
127 j = int(x - start) | |
128 H[j] = H[ int(x - start) ] + 1 | |
129 s += 1 | |
130 | |
131 f = open(options.output_file, 'w') | |
132 | |
133 for i in range(end - start): | |
134 o = str(i) + "\t" + str(float(H[i])/float(s)) + "\n" | |
135 f.write(o) | |
136 | |
137 | |
138 f.close() | |
139 | |
140 print('mean:' + str(mean) + '\tstdev:' + str(stdev)) |