comparison kmer_read_m3.py @ 0:1472b4f4fbfe draft

planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
author cstrittmatter
date Fri, 12 Apr 2019 14:58:18 -0400
parents
children 1434bc7b4786
comparison
equal deleted inserted replaced
-1:000000000000 0:1472b4f4fbfe
1 #Reads fastq filenames, runs kmerread, and generates report
2 #MKM 1/1/2019
3
4 import re
5 import sys
6 import numpy as np
7 from subprocess import Popen, PIPE
8
if __name__ == '__main__':
    # Command line: kmer_read_m3.py -w <workdir> -d <outdir> -i <fastq1> <fastq2>
    # Flags are positional pairs; a mismatched flag prints a short "?" hint.
    if len(sys.argv) > 2:
        item = sys.argv[1].strip()
        dirnm = sys.argv[2].strip()
        if item == "-w":
            wdir = dirnm
        else:
            print("w?")
    if len(sys.argv) > 4:
        item = sys.argv[3].strip()
        dirnm = sys.argv[4].strip()
        if item == "-d":
            oname = dirnm + "/mitokmer_result.csv"
        else:
            print("d?")

    if len(sys.argv) > 7:
        item = sys.argv[5].strip()
        filenm1 = sys.argv[6].strip()
        filenm2 = sys.argv[7].strip()
        if item == "-i":
            file1 = filenm1
            file2 = filenm2
        else:
            print("i?")

    target = 0
    wdir += "/"
    reffile = wdir + "mitochondria_refkey.txt"
    cname = wdir + "result.txt"
    exclude_b = []  # taxon ids to drop from the report (none by default)

    # --- read in reference names and per-target kmer statistics ---
    factor_list = []  # normalization factor for each retained target
    name_list = []    # taxon name for each retained target
    in_use = []       # 0/1 retention flag, one entry per reference line
    with open(reffile, 'r') as data_file:
        lines = data_file.read().split('\n')
    lines.pop(0)  # skip header line
    for line in lines:
        if len(line) > 1:
            target, name, count, hit, tested, gsize, nstrains = line.split('\t')
            row = name.split('_')
            target = int(target)
            hit = float(hit)
            if nstrains != '0':
                # average genome size across strains
                gensize = float(gsize) / float(nstrains)
            else:
                gensize = 1.0
            tested = float(tested)
            use = "1"
            # BUGFIX: 'count' comes straight from split() as a *string*; the
            # original `count < 10.0` compared str to float, which in Python 2
            # is always False, so the low-count filter never fired (and would
            # raise TypeError in Python 3). Convert before comparing.
            if target in exclude_b or float(count) < 10.0 or hit < 10.0 or len(row) < 5:
                use = "0"
            in_use.append(int(use))
            if use == "1":
                name_list.append(name)
                factor_list.append(tested / hit / gensize)
    factor_arr = np.array(factor_list)
    num_targs = len(name_list)

    # Run the external kmerread binary on the two fastq inputs.
    process = Popen([wdir + '/kmerread', '-wdir', wdir, '-f1', file1, '-f2', file2])
    (stdout, stderr) = process.communicate()

    # --- read kmerread output: one "target,count,uniq" triple per line ---
    noid_list = []  # read counts that matched no target (target <= 0)
    read_ct = []    # total reads per column
    num_cols = 1
    m = np.zeros((num_targs, num_cols))
    col = 0
    with open(cname, 'r') as data_file:
        lines = data_file.read().split('\n')
    read_ct.append(0.0)
    index = 0
    for line in lines:
        if len(line) > 1:
            t_s, count, uniq = line.split(',')
            target = int(t_s)
            read_ct[col] += float(count)
            if target > 0:
                # NOTE(review): assumes the target id from result.txt indexes
                # in_use in reference-file line order — confirm against kmerread.
                if in_use[target]:
                    m[index, col] = float(count)
                    index += 1
            else:
                noid_list.append(int(count))
    if not noid_list:
        # Guard: without a target-0 line in result.txt, the report loop
        # below would hit an IndexError on noid_list[0].
        noid_list.append(0)

    b = m * factor_arr[:, None]  # normalize each row by kmer coverage
    sums = np.sum(b, axis=0)
    b = b / sums[None, :]        # convert to per-column fractions
    b = b * 100.0                # ... expressed as percentages
    rowmax = b.max(axis=1)

    # --- write the CSV abundance report ---
    with open(oname, 'w') as out_file:
        out_file.write("taxid,reads,abundance\n")
        output = "total,"
        for i in range(num_cols):
            output += str(read_ct[i]) + ",,"
        output += "\n"
        out_file.write(output)
        output = "no_id,"
        for i in range(num_cols):
            output += str(noid_list[i]) + ",,"
        output += "\n"
        out_file.write(output)
        for i in range(num_targs):
            if rowmax[i] > 0.000:  # only output non-zero results
                output = name_list[i]
                for j in range(num_cols):
                    output += ',' + "{0:.0f}".format(m[i, j]) + ',' + "{0:.3f}".format(b[i, j])
                output += "\n"
                out_file.write(output)