Mercurial > repos > nitrozyna > dm1_genotypying
comparison peak_calling_script.py @ 8:15a3d5439e7b draft
Uploaded
author | nitrozyna |
---|---|
date | Thu, 29 Mar 2018 12:06:11 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
7:d10ac6a3f293 | 8:15a3d5439e7b |
---|---|
1 | |
2 from __future__ import print_function | |
3 import sys | |
4 import numpy | |
5 import math | |
6 import random | |
7 import csv | |
8 import matplotlib.pyplot as plt | |
9 import pystache | |
10 import json | |
11 from sklearn import mixture | |
12 | |
13 | |
14 x = [] | |
15 y = [] | |
16 | |
17 toolInput = sys.argv[1] | |
18 toolOutput = sys.argv[2] | |
19 toolWebsite = sys.argv[3] | |
20 inName = sys.argv[4] | |
21 | |
22 with open(toolInput, 'rb') as csvfile: | |
23 spamreader = csv.reader(csvfile, delimiter='\t') | |
24 for i, row in enumerate(spamreader): | |
25 if i != 0: | |
26 x.append(int(row[0])) | |
27 y.append(int(row[1])) | |
28 | |
29 csvfile.close() | |
30 | |
31 # you have to set this manually to weed out all the noise. Every bit of noise should be below it. | |
32 threshold = 20 | |
33 rightLimit = 200 | |
34 | |
35 # unravelling histogram into samples. | |
36 samples = [] | |
37 for no, value in enumerate([int(round(i)) for i in y]): | |
38 if value > threshold and no < rightLimit: | |
39 for _ in range(value): | |
40 samples.append(no) | |
41 | |
42 # total number of reads | |
43 totalAmp = len(samples) | |
44 | |
45 # reshaping numpy arrays to indicate that we pass a lot of samples, not a lot of features. | |
46 xArray = numpy.array(x).reshape(1, -1) | |
47 samplesArray = numpy.array(samples).reshape(-1, 1) | |
48 | |
49 # learning a gaussian mixture model. | |
50 gmm2 = mixture.BayesianGaussianMixture(n_components=2).fit(samplesArray) | |
51 | |
52 # getting the mean of each gaussian | |
53 means = [x[int(round(i[0]))] for i in gmm2.means_] | |
54 | |
55 # rounding errors | |
56 roundErr = [i[0] - int(round(i[0])) for i in gmm2.means_] | |
57 | |
58 # getting the coverage of each gaussian | |
59 weights = gmm2.weights_ | |
60 | |
61 sampleID = inName | |
62 | |
63 #with open(toolOutput, "w") as f: | |
64 # print("sampleID", file=f, end="\t") | |
65 # print("Al1", file=f, end="\t") | |
66 # print("Al2", file=f, end="\t") | |
67 # print("frac1", file=f, end="\t") | |
68 # print("frac2", file=f, end="\t") | |
69 # print(file=f) | |
70 # print(sampleID, file=f, end="\t") | |
71 # print(means[0], file=f, end="\t") | |
72 # print(means[1], file=f, end="\t") | |
73 # print(weights[0], file=f, end="\t") | |
74 # print(weights[1], file=f, end="\t") | |
75 | |
76 template_dir = { | |
77 "sampleID": sampleID, | |
78 "al1": means[0], | |
79 "al2": means[1], | |
80 "freq1": weights[0], | |
81 "freq2": weights[1], | |
82 "x": json.dumps(x), | |
83 "y": json.dumps(y) | |
84 } | |
85 with open(toolWebsite) as wt: | |
86 with open(toolOutput, "w") as wr: | |
87 wr.write(pystache.render(wt.read(), template_dir)) | |
88 | |
89 wt.close() | |
90 wr.close() |