annotate kmer_read_m3.py @ 0:1472b4f4fbfe draft

planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
author cstrittmatter
date Fri, 12 Apr 2019 14:58:18 -0400
parents
children 1434bc7b4786
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
1 #Reads fastq filenames, runs kmerread, and generates report
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
2 #MKM 1/1/2019
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
3
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
4 import re
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
5 import sys
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
6 import numpy as np
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
7 from subprocess import Popen, PIPE
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
8
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
9 if __name__ == '__main__':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
10 if len(sys.argv) > 2:
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
11 item = sys.argv[1].strip()
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
12 dirnm = sys.argv[2].strip()
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
13 if item == "-w":
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
14 wdir = dirnm
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
15 else:
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
16 print "w?"
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
17 if len(sys.argv) > 4:
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
18 item = sys.argv[3].strip()
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
19 dirnm = sys.argv[4].strip()
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
20 if item == "-d":
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
21 oname = dirnm + "/mitokmer_result.csv"
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
22 else:
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
23 print "d?"
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
24
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
25 if len(sys.argv) > 7:
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
26 item = sys.argv[5].strip()
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
27 filenm1 = sys.argv[6].strip()
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
28 filenm2 = sys.argv[7].strip()
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
29 if item == "-i":
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
30 file1 = filenm1
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
31 file2 = filenm2
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
32 else:
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
33 print "i?"
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
34
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
35 target = 0
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
36 wdir += "/"
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
37 reffile = wdir + "mitochondria_refkey.txt"
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
38 cname = wdir + "result.txt"
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
39 exclude_b = []
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
40
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
41 #read in reference names and counts
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
42 factor_list = []
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
43 name_list = []
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
44 in_use = []
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
45 data_file = open(reffile, 'r')
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
46 data = ''.join(data_file.readlines())
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
47 data_file.close()
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
48 lines = data.split('\n')
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
49 lines.pop(0) #header
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
50 for line in lines:
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
51 if len(line) > 1:
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
52 target, name, count, hit, tested, gsize, nstrains = line.split('\t')
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
53 row = name.split('_')
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
54 target = int(target)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
55 hit = float(hit)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
56 if nstrains != '0':
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
57 gensize = float(gsize) / float(nstrains)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
58 else:
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
59 gensize = 1.0
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
60 tested = float(tested)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
61 use = "1"
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
62 if target in exclude_b or count < 10.0 or hit < 10.0 or len(row) < 5:
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
63 use = "0"
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
64 in_use.append(int(use))
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
65 if use == "1":
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
66 name_list.append(name)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
67 factor_list.append(tested / hit / gensize)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
68 factor_arr = np.array(factor_list)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
69 num_targs = len(name_list)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
70 process = Popen([wdir + '/kmerread', '-wdir', wdir, '-f1', file1, '-f2', file2])
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
71
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
72 (stdout, stderr) = process.communicate()
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
73
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
74 #read in output files
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
75 noid_list = []
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
76 read_ct = []
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
77 num_cols = 1
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
78 m = np.zeros((num_targs,num_cols))
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
79 col = 0
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
80 data_file = open(cname, 'r')
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
81 data = ''.join(data_file.readlines())
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
82 data_file.close()
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
83 lines = data.split('\n')
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
84 read_ct.append(0.0)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
85 index = 0
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
86 for line in lines:
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
87 if len(line) > 1:
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
88 t_s, count, uniq = line.split(',')
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
89 target = int(t_s)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
90 read_ct[col] += float(count)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
91 if target > 0:
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
92 if in_use[target]:
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
93 m[index, col] = float(count)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
94 index += 1
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
95 else:
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
96 noid_list.append(int(count))
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
97
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
98 b = m * factor_arr[:,None] #normalize each row by kmer coverage
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
99 sums = np.sum(b, axis=0)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
100 b = b / sums[None,:]
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
101 b = b * 100.0
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
102 rowmax = b.max(axis=1)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
103
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
104 out_file = open(oname, 'w')
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
105 output = "taxid,reads,abundance\n"
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
106 out_file.write(output)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
107 output = "total,"
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
108 for i in range(num_cols):
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
109 output += str(read_ct[i]) + ",,"
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
110 output += "\n"
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
111 out_file.write(output)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
112 output = "no_id,"
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
113 for i in range(num_cols):
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
114 output += str(noid_list[i]) + ",,"
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
115 output += "\n"
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
116 out_file.write(output)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
117 for i in range(num_targs):
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
118 #l = order_row[i]
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
119 if rowmax[i] > 0.000: #only output non-zero results
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
120 output = name_list[i]
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
121 for j in range(num_cols):
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
122 output += ',' + "{0:.0f}".format(m[i,j]) + ',' + "{0:.3f}".format(b[i,j])
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
123 output += "\n"
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
124 out_file.write(output)
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
125 out_file.close()
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
126
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
127
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
128
1472b4f4fbfe planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
cstrittmatter
parents:
diff changeset
129