annotate peak_calling_script.py @ 5:6242a111983d draft

Uploaded
author nitrozyna
date Tue, 16 Jan 2018 15:15:36 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
6242a111983d Uploaded
nitrozyna
parents:
diff changeset
1
6242a111983d Uploaded
nitrozyna
parents:
diff changeset
2 from __future__ import print_function
6242a111983d Uploaded
nitrozyna
parents:
diff changeset
3 import sys
6242a111983d Uploaded
nitrozyna
parents:
diff changeset
4 import numpy
6242a111983d Uploaded
nitrozyna
parents:
diff changeset
5 import math
6242a111983d Uploaded
nitrozyna
parents:
diff changeset
6 import random
6242a111983d Uploaded
nitrozyna
parents:
diff changeset
7 import csv
6242a111983d Uploaded
nitrozyna
parents:
diff changeset
8 import matplotlib.pyplot as plt
6242a111983d Uploaded
nitrozyna
parents:
diff changeset
9 import pystache
6242a111983d Uploaded
nitrozyna
parents:
diff changeset
10 import json
6242a111983d Uploaded
nitrozyna
parents:
diff changeset
11 from sklearn import mixture
6242a111983d Uploaded
nitrozyna
parents:
diff changeset
12
6242a111983d Uploaded
nitrozyna
parents:
diff changeset
# Column data parsed from the input table: x collects the first column and
# y the second, one entry per data row.
x = []
y = []

# Command-line arguments, in order: input table, output table, HTML template.
toolInput = sys.argv[1]
toolOutput = sys.argv[2]
toolWebsite = sys.argv[3]

# The csv module needs a binary handle on Python 2 but a text handle opened
# with newline='' on Python 3, so choose the mode for the running interpreter.
if sys.version_info[0] < 3:
    csvfile = open(toolInput, 'rb')
else:
    csvfile = open(toolInput, 'r', newline='')

with csvfile:
    spamreader = csv.reader(csvfile, delimiter='\t')
    # The first row is always skipped (treated as a header).
    next(spamreader, None)
    for row in spamreader:
        if not row:
            # csv yields [] for blank lines (e.g. a trailing newline); skip
            # them instead of failing with an IndexError.
            continue
        x.append(int(row[0]))
        y.append(int(row[1]))
6242a111983d Uploaded
nitrozyna
parents:
diff changeset
26
6242a111983d Uploaded
nitrozyna
parents:
diff changeset
# Noise cut-off, set by hand: only bins whose count is strictly greater than
# this value contribute samples, so every bit of noise should sit below it.
threshold = 20
# Bins at index rightLimit and beyond are ignored.
rightLimit = 200

# Unravel the histogram into individual observations: a kept bin at index
# `position` holding `count` reads contributes `count` copies of `position`.
samples = []
for position, count in enumerate(y[:rightLimit]):
    count = int(round(count))
    if count > threshold:
        samples.extend([position] * count)

# total number of reads kept after filtering
totalAmp = len(samples)
6242a111983d Uploaded
nitrozyna
parents:
diff changeset
40
6242a111983d Uploaded
nitrozyna
parents:
diff changeset
# x as a single row; samples as a single column, i.e. many one-feature
# observations, which is the layout the mixture model is fitted on.
xArray = numpy.array(x).reshape(1, -1)
samplesArray = numpy.array(samples).reshape(-1, 1)

# Fit a two-component Bayesian Gaussian mixture to the unravelled samples.
gmm2 = mixture.BayesianGaussianMixture(n_components=2)
gmm2 = gmm2.fit(samplesArray)

# Each component centre is a fractional bin index; round it to the nearest
# bin to look up the corresponding x value.
componentCentres = [centre[0] for centre in gmm2.means_]
nearestBins = [int(round(centre)) for centre in componentCentres]

# x value at the (rounded) centre of each gaussian
means = [x[binIndex] for binIndex in nearestBins]

# signed difference between each fitted centre and the bin it was rounded to
roundErr = [centre - binIndex
            for centre, binIndex in zip(componentCentres, nearestBins)]

# mixture weight (coverage) of each gaussian
weights = gmm2.weights_
6242a111983d Uploaded
nitrozyna
parents:
diff changeset
56
6242a111983d Uploaded
nitrozyna
parents:
diff changeset
# The sample is identified by the path of its HTML report, which is the
# output path with ".html" appended.
sampleID = toolOutput + ".html"

# Write a two-row, tab-separated summary: a header row and one data row.
headerFields = ["sampleID", "Al1", "Al2", "frac1", "frac2"]
recordFields = [sampleID, means[0], means[1], weights[0], weights[1]]

with open(toolOutput, "w") as f:
    for fields in (headerFields, recordFields):
        # Every field keeps its trailing tab so the existing column layout is
        # unchanged; each row, including the data row, is now terminated with
        # a newline so the file no longer ends mid-line.
        f.write("".join(str(field) + "\t" for field in fields) + "\n")
6242a111983d Uploaded
nitrozyna
parents:
diff changeset
71
6242a111983d Uploaded
nitrozyna
parents:
diff changeset
# Values substituted into the HTML template. The raw x/y series are passed as
# JSON text so the template can embed them directly.
xJson = json.dumps(x)
yJson = json.dumps(y)
template_dir = {
    "sampleID": sampleID,
    "al1": means[0],
    "al2": means[1],
    "freq1": weights[0],
    "freq2": weights[1],
    "x": xJson,
    "y": yJson,
}

# Render the template file given on the command line and write the result to
# the report path held in sampleID.
with open(toolWebsite) as wt, open(sampleID, "w") as wr:
    templateText = wt.read()
    renderedPage = pystache.render(templateText, template_dir)
    wr.write(renderedPage)