Mercurial > repos > cstrittmatter > mitokmer
diff kmer_read_m3.py @ 0:1472b4f4fbfe draft
planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
author | cstrittmatter |
---|---|
date | Fri, 12 Apr 2019 14:58:18 -0400 |
parents | |
children | 1434bc7b4786 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kmer_read_m3.py Fri Apr 12 14:58:18 2019 -0400 @@ -0,0 +1,129 @@ +#Reads fastq filenames, runs kmerread, and generates report +#MKM 1/1/2019 + +import re +import sys +import numpy as np +from subprocess import Popen, PIPE + +if __name__ == '__main__': + if len(sys.argv) > 2: + item = sys.argv[1].strip() + dirnm = sys.argv[2].strip() + if item == "-w": + wdir = dirnm + else: + print "w?" + if len(sys.argv) > 4: + item = sys.argv[3].strip() + dirnm = sys.argv[4].strip() + if item == "-d": + oname = dirnm + "/mitokmer_result.csv" + else: + print "d?" + + if len(sys.argv) > 7: + item = sys.argv[5].strip() + filenm1 = sys.argv[6].strip() + filenm2 = sys.argv[7].strip() + if item == "-i": + file1 = filenm1 + file2 = filenm2 + else: + print "i?" + + target = 0 + wdir += "/" + reffile = wdir + "mitochondria_refkey.txt" + cname = wdir + "result.txt" + exclude_b = [] + + #read in reference names and counts + factor_list = [] + name_list = [] + in_use = [] + data_file = open(reffile, 'r') + data = ''.join(data_file.readlines()) + data_file.close() + lines = data.split('\n') + lines.pop(0) #header + for line in lines: + if len(line) > 1: + target, name, count, hit, tested, gsize, nstrains = line.split('\t') + row = name.split('_') + target = int(target) + hit = float(hit) + if nstrains != '0': + gensize = float(gsize) / float(nstrains) + else: + gensize = 1.0 + tested = float(tested) + use = "1" + if target in exclude_b or count < 10.0 or hit < 10.0 or len(row) < 5: + use = "0" + in_use.append(int(use)) + if use == "1": + name_list.append(name) + factor_list.append(tested / hit / gensize) + factor_arr = np.array(factor_list) + num_targs = len(name_list) + process = Popen([wdir + '/kmerread', '-wdir', wdir, '-f1', file1, '-f2', file2]) + + (stdout, stderr) = process.communicate() + + #read in output files + noid_list = [] + read_ct = [] + num_cols = 1 + m = np.zeros((num_targs,num_cols)) + col = 0 + data_file = open(cname, 'r') + data = ''.join(data_file.readlines()) + data_file.close() + lines = data.split('\n') + read_ct.append(0.0) + index = 0 + for line in lines: + if len(line) > 1: + t_s, count, uniq = line.split(',') + target = int(t_s) + read_ct[col] += float(count) + if target > 0: + if in_use[target]: + m[index, col] = float(count) + index += 1 + else: + noid_list.append(int(count)) + + b = m * factor_arr[:,None] #normalize each row by kmer coverage + sums = np.sum(b, axis=0) + b = b / sums[None,:] + b = b * 100.0 + rowmax = b.max(axis=1) + + out_file = open(oname, 'w') + output = "taxid,reads,abundance\n" + out_file.write(output) + output = "total," + for i in range(num_cols): + output += str(read_ct[i]) + ",," + output += "\n" + out_file.write(output) + output = "no_id," + for i in range(num_cols): + output += str(noid_list[i]) + ",," + output += "\n" + out_file.write(output) + for i in range(num_targs): + #l = order_row[i] + if rowmax[i] > 0.000: #only output non-zero results + output = name_list[i] + for j in range(num_cols): + output += ',' + "{0:.0f}".format(m[i,j]) + ',' + "{0:.3f}".format(b[i,j]) + output += "\n" + out_file.write(output) + out_file.close() + + + +