Mercurial > repos > rnateam > graphclust_postprocessing
view addCdhitseqs.py @ 17:f93c868203cc draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/GraphClust/CollectResults commit 4406735e44aba20859c252be39f4e99df28c7a92
author | rnateam |
---|---|
date | Sat, 27 Oct 2018 13:23:06 -0400 |
parents | 869a6e807d76 |
children |
line wrap: on
line source
"""Append CD-HIT redundant sequences to the cluster result files.

Usage:
    addCdhitseqs.py <cdhit_cluster_file>

The CD-HIT cluster listing named on the command line is parsed into a
mapping of representative sequence id -> ids of the other sequences of the
same cluster.  Every file matching ``RESULTS/*.cluster.all`` is then
rewritten with tab-separated columns, and for each representative id that
occurs in a file one extra row per redundant sequence is appended.
"""
import glob
import re
import sys

CLUSTER_FILES_GLOB = "RESULTS/*.cluster.all"
CLUSTER_HEADER = ">Cluster"

# The representative member of a cluster is the line that ends with '*'.
_REP_MARKER_RE = re.compile(r"\*\s*$")


def _sequence_id(member_line):
    """Return the sequence id stored in the third whitespace-separated field.

    The field has the form ``>ID...``; the ``>`` and ``...`` markers are
    removed.  Raises IndexError if the line has fewer than three fields.
    """
    return member_line.split()[2].replace(">", "").replace("...", "")


def parse_cdhit_clusters(content):
    """Map each representative id to the other ids listed in its cluster.

    ``content`` is the full text of the cluster listing.  A line containing
    ``>Cluster`` starts a new cluster; inside a cluster the line ending in
    ``*`` is the representative and every other non-blank line is a
    redundant member.  The representative may appear at any position within
    its cluster.  Clusters without redundant members (or without a
    representative line) are left out of the result.  Lines that precede the
    first cluster header are ignored.
    """
    clusters = []  # one [rep_id, member_ids] pair per cluster header seen
    for line in content.splitlines():
        if CLUSTER_HEADER in line:
            clusters.append([None, []])
            continue
        if not clusters or not line.strip():
            continue
        if _REP_MARKER_RE.search(line):
            clusters[-1][0] = _sequence_id(line)
        else:
            clusters[-1][1].append(_sequence_id(line))
    return {
        rep_id: member_ids
        for rep_id, member_ids in clusters
        if rep_id is not None and member_ids
    }


def build_redundant_rows(file_lines, rep_to_members):
    """Return the extra rows (without newline) for one cluster result file.

    The first line of the file is used as a column template.  For every
    representative id found in the file, one row per redundant member is
    produced with column 3 set to ``---``, column 4 set to ``CD-Hit``,
    column 7 set to the member id and, when the template has more than nine
    columns, column 9 set to the member id up to its last underscore.
    Returns an empty list for an empty file.
    """
    if not file_lines:
        return []
    template = file_lines[0].split()
    has_tenth_column = len(template) > 9
    file_content = "\n".join(file_lines)
    rows = []
    for rep_id, member_ids in rep_to_members.items():
        # NOTE(review): plain substring test, so an id that is a prefix of
        # another id in the file also matches -- confirm whether an exact
        # column comparison is wanted.
        if rep_id not in file_content:
            continue
        for member_id in member_ids:
            cols = list(template)
            cols[3] = "---"
            cols[4] = "CD-Hit"
            cols[7] = str(member_id)
            if has_tenth_column:
                cols[9] = str(member_id.rsplit("_", 1)[0])
            rows.append("\t".join(cols))
    return rows


def update_cluster_file(path, rep_to_members):
    """Rewrite ``path`` tab-separated and append the redundant-member rows."""
    with open(path, "r+") as cl_file:
        file_lines = cl_file.readlines()
        normalized = ["\t".join(line.split()) + "\n" for line in file_lines]
        extra_rows = [
            row + "\n"
            for row in build_redundant_rows(file_lines, rep_to_members)
        ]
        # All rows are computed before the file is touched, so a malformed
        # file raises without leaving a half-written result behind.
        cl_file.seek(0)
        cl_file.writelines(normalized + extra_rows)
        cl_file.truncate()


def main(argv=None):
    """Run the script; ``argv`` defaults to the command-line arguments."""
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        sys.exit("usage: addCdhitseqs.py <cdhit_cluster_file>")
    # The listing is only read, so it does not need to be writable.
    with open(args[0], "r") as handle:
        rep_to_members = parse_cdhit_clusters(handle.read())
    for path in sorted(glob.glob(CLUSTER_FILES_GLOB)):
        update_cluster_file(path, rep_to_members)


if __name__ == "__main__":
    main()