Mercurial > repos > rnateam > graphclust_postprocessing
comparison addCdhitseqs.py @ 6:869a6e807d76 draft
planemo upload for repository https://github.com/eteriSokhoyan/galaxytools/tree/branchForIterations/tools/GraphClust/CollectResults commit 057c2fd398055dc86eb2c00d8a74f301d5c231d9-dirty
author | rnateam |
---|---|
date | Wed, 22 Feb 2017 16:51:06 -0500 |
parents | 79b9117aef01 |
children |
comparison
Legend (row status): equal, deleted, inserted, replaced
5:4310ac018d05 | 6:869a6e807d76 |
---|---|
1 import re | 1 import re |
2 import glob | 2 import glob |
3 import sys | 3 import sys |
4 | 4 |
5 cdhitcluster = sys.argv[1] | 5 cdhitcluster = sys.argv[1] |
6 #clusters = sys.argv[2] | |
7 | 6 |
8 cluster_seqs_stats_path = "RESULTS/*.cluster.all" | 7 cluster_seqs_stats_path = "RESULTS/*.cluster.all" |
9 cluster_seqs_stats_files = glob.glob(cluster_seqs_stats_path) | 8 cluster_seqs_stats_files = glob.glob(cluster_seqs_stats_path) |
10 | 9 |
11 #clusterFiles = clusters.split(',') | |
12 repSeqRedSeqdict = {} | 10 repSeqRedSeqdict = {} |
13 repLine = "" | 11 repLine = "" |
14 count = 0 | 12 count = 0 |
15 first = False | 13 first = False |
14 add_FullId = "" | |
15 k = 0 | |
16 | 16 |
17 with open(cdhitcluster, 'r+') as f: | 17 with open(cdhitcluster, 'r+') as f: |
18 lines = f.readlines() | 18 content = f.read() |
19 reps = re.compile("^.*\*$", re.MULTILINE).findall(content) | |
20 lines = content.split('\n') | |
21 | |
19 for i in range(0, len(lines)): | 22 for i in range(0, len(lines)): |
20 line = lines[i] | 23 line = lines[i] |
21 if ">Cluster" in line: | 24 if ">Cluster" in line: |
22 first = True | 25 first = True |
23 count = 0 | 26 count = 0 |
24 if i+1 < len(lines): | 27 repLine = reps[k] |
25 repLine = lines[i+1] | 28 k = k+1 |
26 continue | 29 continue |
27 elif not first: | 30 elif not first: |
28 count += 1 | 31 count += 1 |
29 first = False | 32 first = False |
30 else: | 33 else: |
31 first = False | 34 first = False |
32 lineArr = [] | 35 lineArr = [] |
33 if count > 0: | 36 if count > 0: |
34 repLine = repLine.strip() | 37 repLine = repLine.strip() |
35 rep_FullId = repLine.split()[2] | 38 rep_FullId = repLine.split()[2] |
36 rep_FullId = rep_FullId.replace(">", "") | 39 rep_FullId = rep_FullId.replace(">","") |
37 #rep_short_id = re.findall("_".join(["[^_]+"] * 2), rep_FullId)[0] | 40 rep_FullId = rep_FullId.replace("...","") |
38 rep_FullId = rep_FullId.replace("...", "") | 41 if "*" in line or not line.strip(): |
42 continue | |
39 line = line.strip() | 43 line = line.strip() |
40 add_FullId = line.split()[2] | 44 add_FullId = line.split()[2] |
41 add_FullId = add_FullId.replace(">", "") | 45 add_FullId = add_FullId.replace(">","") |
42 add_FullId = add_FullId.replace("...", "") | 46 add_FullId = add_FullId.replace("...","") |
43 #add_short_id = re.findall("_".join(["[^_]+"] * 2), add_FullId)[0] | |
44 lineArr.append(add_FullId) | 47 lineArr.append(add_FullId) |
45 repSeqRedSeqdict[rep_FullId] = lineArr | 48 repSeqRedSeqdict[rep_FullId] = lineArr |
46 #lineArr.append(add_short_id) | |
47 #repSeqRedSeqdict[rep_short_id] = lineArr | |
48 | 49 |
49 toWrite = "" | 50 toWrite = "" |
50 | |
51 for singleFile in sorted(cluster_seqs_stats_files): | 51 for singleFile in sorted(cluster_seqs_stats_files): |
52 with open(singleFile, "a+") as clFile: | 52 toWrite = "" |
53 file_content = clFile.read() | 53 with open(singleFile, "r+") as clFile: |
54 first_line = file_content.split('\n')[0] | 54 file_lines = clFile.readlines() |
55 for line in file_lines: | |
56 line = '\t'.join(line.split()) | |
57 toWrite += line + '\n' | |
58 clFile.seek(0) | |
59 clFile.write(toWrite) | |
60 clFile.truncate() | |
61 first_line = file_lines[0] | |
62 toWrite = "" | |
63 cols = first_line.split() | |
64 file_content = '\n'.join(file_lines) | |
55 for key, val in repSeqRedSeqdict.items(): | 65 for key, val in repSeqRedSeqdict.items(): |
56 if key in file_content: | 66 if key in file_content: |
67 | |
57 for i in val: | 68 for i in val: |
58 toWrite += first_line.split()[0] + " " + first_line.split()[1] + " " + first_line.split()[2] + " " + " - " + " " + "CD-Hit" + " " + first_line.split()[5] + " " + "ORIGID" + " " + str(i) + "\n" | 69 cols[3] = "---" |
70 cols[4] = "CD-Hit" | |
71 cols[7] = str(i) | |
72 if len(first_line.split()) > 9: | |
73 cols[9] = str(i.rsplit("_",1)[0]) | |
74 toWrite += '\t'.join(cols) | |
75 toWrite +="\n" | |
59 clFile.write(toWrite) | 76 clFile.write(toWrite) |