annotate lib/assembly_tools.py @ 0:1d1b9e1b2e2f draft

Uploaded
author petr-novak
date Thu, 19 Dec 2019 10:24:45 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python3
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
2 import sys
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
3 import logging
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
4 import subprocess
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
5 import os
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
6 import tempfile
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
7 import config
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
8 import shutil
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
9 import itertools
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
10 import pickle
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
11 import shlex
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
12 from lib.parallel.parallel import parallel2 as parallel
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
13 from lib.parallel.parallel import get_max_proc
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
14
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
15 REQUIRED_VERSION = (3, 4)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
16 MAX_BUFFER_SIZE = 100000
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
17 if sys.version_info < REQUIRED_VERSION:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
18 raise Exception("\n\npython 3.4 or higher is required!\n")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
19 LOGGER = logging.getLogger(__name__)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
20 MAX_FASTA_IN_DIRECTORY = 1000
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
21
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
22 def assembly(sequences, hitsort, clusters_info, assembly_dir,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
23 contigs_file, min_size_of_cluster_for_assembly = 0):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
24 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
25 Runs assembly on sequences (SequenceSet). Assembly is
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
26 performed on each cluster separatelly, clusters are taken
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
27 from hitsort(Graph)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
28 Cluster_listing - list of Clusters
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
29 if cluster.tandem_rank is 1 or 2 - no assembly is performed!!
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
30 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
31
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
32 # iterate over large clusters, assembly is performed on sequences stored in
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
33 # cluster_little[index].fasta_file_full - for annotated clusters
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
34 # sequences of small clusters are retrieved from database
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
35 fasta_seqs = [
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
36 i.fasta_file_full
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
37 for i in clusters_info
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
38 if not i.tandem_rank in config.SKIP_CAP3_ASSEMBLY_TANDEM_RANKS
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
39 ]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
40 prefixes = ["CL{}".format(i.index)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
41 for i in clusters_info
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
42 if not i.tandem_rank in config.SKIP_CAP3_ASSEMBLY_TANDEM_RANKS]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
43 LOGGER.info("Number of clusters for assembly: {}".format(hitsort.number_of_clusters))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
44 LOGGER.info("Assembling large clusters")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
45 assembly_files = parallel(cap3worker, fasta_seqs, prefixes)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
46 LOGGER.info("Large clusters assembled")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
47 # some clusters - tanem rank 1 assembled
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
48 j = 0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
49 for cl in clusters_info:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
50 if cl.tandem_rank in config.SKIP_CAP3_ASSEMBLY_TANDEM_RANKS:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
51 cl.assembly_files = {i: None for i in config.CAP3_FILES_MAPPING}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
52 consensus_file = cl.dir + "/tarean_contigs.fasta"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
53 cl.assembly_files["{}.{}.contigs"] = consensus_file
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
54 with open(cl.dir_tarean + "/tarean_contigs.fasta",
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
55 'r') as fin, open(consensus_file, 'w') as fout:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
56 for line in fin:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
57 if line[0] == ">":
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
58 line = ">CL{}Contig{}".format(cl.index, line[1:])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
59 fout.write(line)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
60 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
61 cl.assembly_files = assembly_files[j]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
62 j += 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
63 # assembly of small clusters:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
64 # connection to files were results will be concatenated
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
65 LOGGER.info("Assembly of small cluster - making tmp files")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
66
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
67 tmp_dir_root = tempfile.mkdtemp()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
68 tmp_subdir = tempfile.mkdtemp(dir=tmp_dir_root)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
69 nproc = get_max_proc()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
70 prefixes = [[] for i in range(nproc)]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
71 tmp_seq_files = [[] for i in range(nproc)]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
72
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
73 LOGGER.info("Assembly of small clusters - saving small cluster to tmp files")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
74 seq_dictionary = sequences.toDict()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
75 fasta_count = 0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
76 chunk_counter = itertools.cycle(range(nproc))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
77 for index in range(len(clusters_info) + 1, hitsort.number_of_clusters):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
78 chunk = next(chunk_counter)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
79 ids = hitsort.get_cluster_reads(index)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
80 if len(ids) < min_size_of_cluster_for_assembly:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
81 break
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
82 prefixes[chunk].append("CL{}".format(index))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
83 fasta_count += 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
84 if fasta_count > MAX_FASTA_IN_DIRECTORY:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
85 # create new subdir to keep number of files in directory low
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
86 fasta_count = 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
87 tmp_subdir = tempfile.mkdtemp(dir=tmp_dir_root)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
88 fasta_file_name = "{}/{}".format(tmp_subdir, index)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
89 write_seqDict_to_fasta(file_name=fasta_file_name, sequences=seq_dictionary, subset=ids)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
90 tmp_seq_files[chunk].append(fasta_file_name)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
91 del seq_dictionary
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
92 LOGGER.info("Assembly of small clusters running")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
93 pickled_fparts_small_contigs = parallel(cap3worker_multiple, tmp_seq_files, prefixes)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
94 LOGGER.info("Assembly of small clusters done")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
95 # append to all results
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
96 LOGGER.info("Assembly of small clusters - appending results")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
97 all_con = {}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
98 for fkey in config.CAP3_FILES_MAPPING:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
99 if config.CAP3_FILES_MAPPING[fkey]:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
100 all_con[fkey] = open("{}/{}".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
101 assembly_dir, config.CAP3_FILES_MAPPING[fkey]), 'w')
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
102 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
103 all_con[fkey] = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
104 for pickled_fparts_multiple in pickled_fparts_small_contigs:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
105 with open(pickled_fparts_multiple, 'rb') as fp:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
106 fparts_multiple = pickle.load(fp)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
107 os.unlink(pickled_fparts_multiple)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
108 for fparts in fparts_multiple:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
109 for fkey in fparts:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
110 if all_con[fkey]:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
111 with open(fparts[fkey]) as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
112 all_con[fkey].write(f.read())
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
113 os.unlink(fparts[fkey])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
114 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
115 os.unlink(fparts[fkey])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
116
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
117 for fkey in all_con:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
118 if all_con[fkey]:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
119 all_con[fkey].close()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
120 LOGGER.info("Assembling - copying files to final location")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
121 # copy all contigs to one location: - contigs_file
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
122 with open(contigs_file, 'w') as fout:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
123 for cl in clusters_info:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
124 # append contig from large clusters
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
125 with open(cl.assembly_files["{}.{}.contigs"], 'r') as fin:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
126 for i in fin:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
127 fout.write(i)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
128 # append contigs from small clusters
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
129 small_aln_file = "{}/{}".format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
130 assembly_dir, config.CAP3_FILES_MAPPING['{}.{}.aln'])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
131 # calculate profile on aln file of small clusters
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
132 file_base_name = small_aln_file[:-4]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
133 os.system("align_parsing.pl -i {fn} -o {out}.info.fasta -p {out}.profile 2>&1".format(fn=small_aln_file, out=file_base_name))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
134 os.system("select_and_sort_contigs.pl {fn}.info.fasta 5 2>&1".format(fn=file_base_name))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
135 small_contig_file = file_base_name + ".info.fasta"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
136 with open(small_contig_file, 'r') as fin:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
137 for i in fin:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
138 fout.write(i)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
139 shutil.rmtree(tmp_dir_root)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
140
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
141 def write_seqDict_to_fasta(file_name, sequences, subset):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
142 with open(file_name, 'w') as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
143 for i in subset:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
144 f.write(">{}\n{}\n".format(i, sequences[i]))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
145
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
146
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
147
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
148
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
149 def cap3worker(seqfile, prefix="cap", cap3args=" -p 80 -o 40 "):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
150 prefix2 = "cap"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
151 cmd = "cap3 " + seqfile + cap3args + " -x " + prefix2
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
152 with open(seqfile + "." + prefix2 + ".aln", "w") as aln:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
153 subprocess.check_call(shlex.split(cmd), shell=False, stdout=aln)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
154 # this generate three files
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
155 files_dict = {}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
156 for fkey in config.CAP3_FILENAMES:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
157 fn = fkey.format(seqfile, prefix2)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
158 fn_tmp = "{}.tmp".format(fn)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
159 files_dict[fkey] = fn
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
160 if config.CAP3_PATTERNS_REPLACE[fkey]:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
161 pattern, replacement = config.CAP3_PATTERNS_REPLACE[fkey]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
162 with open(fn, "r") as con_in, open(fn_tmp, 'w') as con_out:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
163 for line in con_in:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
164 con_out.write(line.replace(pattern, replacement.format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
165 prefix)))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
166 os.rename(fn_tmp, fn)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
167
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
168 # make new meaningful names here
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
169
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
170 for fkey in config.CAP3_FILES_GOODNAMES:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
171 config.CAP3_FILES_GOODNAMES[fkey]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
172 fn_goodname = os.path.dirname(files_dict[fkey]) + "/" + config.CAP3_FILES_GOODNAMES[fkey]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
173 os.rename(files_dict[fkey], fn_goodname)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
174 files_dict[fkey] = fn_goodname
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
175
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
176 aln_file = files_dict["{}.{}.aln"]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
177 file_base_name = aln_file[:-4]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
178 os.system("align_parsing.pl -i {fn} -o {out}.info.fasta -p {out}.profile 2>&1".format(fn=aln_file, out=file_base_name))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
179 os.system("select_and_sort_contigs.pl {fn}.info.fasta 5".format(fn=file_base_name))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
180 # TODO -add new file to files_dict
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
181 # replace simple fasta with info.fasta
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
182 files_dict["{}.{}.contigs"] = file_base_name + ".info.fasta"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
183 return files_dict
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
184
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
185 def cap3worker_multiple(many_seqfile, many_prefixes, cap3args=" -p 80 -o 40 "):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
186 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
187 purpose of this script is to run multiple assemblies within single process
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
188 avoiding running high number of short parallel subprocesses
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
189 As starting subprocess for each cap3 assembly was very ineffective,
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
190 all ap3 commands are written to single file and run using single subprocess
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
191 command -
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
192 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
193 cmd_file = tempfile.NamedTemporaryFile(mode="w",delete=False).name
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
194 with open(cmd_file,'w') as cmdf:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
195 for seqfile, prefix in zip(many_seqfile, many_prefixes):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
196 cmd = "cap3 " + seqfile + cap3args + " -x " + prefix + " > " + seqfile + "." + prefix + ".aln\n"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
197 cmdf.write(cmd)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
198 os.system("sh "+ cmd_file)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
199 # collect results:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
200 files_dict_many = []
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
201 for seqfile, prefix in zip(many_seqfile, many_prefixes):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
202 files_dict = {}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
203 for fkey in config.CAP3_FILENAMES:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
204 fn = fkey.format(seqfile, prefix)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
205 fn_tmp = "{}.tmp".format(fn)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
206 files_dict[fkey] = fn
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
207 if config.CAP3_PATTERNS_REPLACE[fkey]:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
208 pattern, replacement = config.CAP3_PATTERNS_REPLACE[fkey]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
209 with open(fn, "r") as con_in, open(fn_tmp, 'w') as con_out:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
210 for line in con_in:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
211 con_out.write(line.replace(pattern, replacement.format(
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
212 prefix)))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
213 os.rename(fn_tmp, fn)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
214 files_dict_many.append(files_dict)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
215 # this is too large to be return directly - use picking
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
216 f = tempfile.NamedTemporaryFile(delete=False)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
217 os.unlink(cmd_file)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
218 pickle.dump(files_dict_many,file=f)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
219 return f.name
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
220
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
221