diff kmer_read_m3.py @ 0:1472b4f4fbfe draft

planemo upload commit a4aeec0f52695b1cbda85ed1faf85d66ba897ab2
author cstrittmatter
date Fri, 12 Apr 2019 14:58:18 -0400
parents
children 1434bc7b4786
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kmer_read_m3.py	Fri Apr 12 14:58:18 2019 -0400
@@ -0,0 +1,129 @@
+#Reads fastq filenames, runs kmerread, and generates report
+#MKM 1/1/2019
+
+import re
+import sys
+import numpy as np
+from subprocess import Popen, PIPE
+
+if __name__ == '__main__':
+    if len(sys.argv) > 2:
+        item = sys.argv[1].strip()
+        dirnm = sys.argv[2].strip()
+        if item == "-w":
+            wdir = dirnm
+        else:
+            print "w?"
+    if len(sys.argv) > 4:
+        item = sys.argv[3].strip()
+        dirnm = sys.argv[4].strip()
+        if item == "-d":
+            oname = dirnm + "/mitokmer_result.csv"
+        else:
+            print "d?"
+
+    if len(sys.argv) > 7:
+        item = sys.argv[5].strip()
+        filenm1 = sys.argv[6].strip()
+        filenm2 = sys.argv[7].strip()
+        if item == "-i":
+            file1 = filenm1
+            file2 = filenm2
+        else:
+            print "i?"
+
+    target = 0
+    wdir += "/"
+    reffile = wdir + "mitochondria_refkey.txt"
+    cname = wdir + "result.txt"
+    exclude_b = []
+
+    #read in reference names and counts
+    factor_list = []
+    name_list = []
+    in_use = []
+    data_file = open(reffile, 'r')
+    data = ''.join(data_file.readlines())
+    data_file.close()
+    lines = data.split('\n')
+    lines.pop(0) #header
+    for line in lines:
+        if len(line) > 1:
+            target, name, count, hit, tested, gsize, nstrains  = line.split('\t')
+            row = name.split('_')
+            target = int(target)
+            hit = float(hit)
+            if nstrains != '0':
+	        gensize = float(gsize) / float(nstrains)
+            else:
+                gensize = 1.0
+            tested = float(tested)
+            use = "1"
+            if target in exclude_b or count < 10.0 or hit < 10.0 or len(row) < 5:
+                use = "0"
+            in_use.append(int(use))
+            if use == "1":
+                name_list.append(name)
+                factor_list.append(tested / hit / gensize)
+    factor_arr = np.array(factor_list)
+    num_targs = len(name_list)
+    process = Popen([wdir + '/kmerread', '-wdir', wdir, '-f1', file1, '-f2', file2])
+
+    (stdout, stderr) = process.communicate()
+
+    #read in output files
+    noid_list = []
+    read_ct = []
+    num_cols = 1
+    m = np.zeros((num_targs,num_cols))
+    col = 0
+    data_file = open(cname, 'r')
+    data = ''.join(data_file.readlines())
+    data_file.close()
+    lines = data.split('\n')
+    read_ct.append(0.0)
+    index = 0
+    for line in lines:
+        if len(line) > 1:
+            t_s, count, uniq = line.split(',')
+            target = int(t_s)
+            read_ct[col] += float(count) 
+            if target > 0:
+                if in_use[target]: 
+                    m[index, col] = float(count)
+                    index += 1
+            else:
+                noid_list.append(int(count))
+
+    b = m * factor_arr[:,None] #normalize each row by kmer coverage
+    sums = np.sum(b, axis=0)
+    b = b / sums[None,:]
+    b = b * 100.0
+    rowmax = b.max(axis=1)
+
+    out_file = open(oname, 'w')
+    output = "taxid,reads,abundance\n"
+    out_file.write(output)
+    output = "total,"
+    for i in range(num_cols):
+        output += str(read_ct[i]) + ",,"
+    output += "\n"
+    out_file.write(output)
+    output = "no_id,"
+    for i in range(num_cols):
+        output += str(noid_list[i]) + ",,"
+    output += "\n"
+    out_file.write(output)
+    for i in range(num_targs):
+        #l = order_row[i]
+        if rowmax[i] > 0.000: #only output non-zero results
+            output = name_list[i]
+            for j in range(num_cols):
+                output += ',' + "{0:.0f}".format(m[i,j]) + ',' + "{0:.3f}".format(b[i,j])
+            output += "\n"
+            out_file.write(output)
+    out_file.close()
+
+
+
+