Mercurial > repos > bioit_sciensano > phagetermvirome
comparison _modules/seq_processing.py @ 0:69e8f12c8b31 draft
"planemo upload"
author | bioit_sciensano |
---|---|
date | Fri, 11 Mar 2022 15:06:20 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:69e8f12c8b31 |
---|---|
1 ##@file seq_processing.py | |
2 # | |
3 # This file contains functions that are used when running phageterm on multiple machines on a calculation cluster. | |
4 # @param DR_Path directory path where to put DR content. | |
5 from __future__ import print_function | |
6 | |
7 from time import gmtime, strftime | |
8 import os | |
9 import numpy as np | |
10 from _modules.utilities import checkReportTitle | |
11 from _modules.readsCoverage_res import loadRCRes | |
12 from _modules.common_readsCoverage_processing import processCovValuesForSeq | |
13 #from SeqStats import SeqStats | |
def _print_timestamp():
    """Print the current UTC time; used to trace progress in the job logs."""
    print(strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()))


def sum_readsCoverage_for_seq(dir_cov_res, idx_refseq, nb_pieces, inDArgs, fParms, inRawDArgs, dir_seq_res, DR_path):
    """Sum the partial coverage results of one reference sequence and process them.

    Loads the ``nb_pieces`` files ``coverage<idx_refseq>_<i>.npz`` found in
    ``dir_cov_res``, adds their contents together and hands the totals to
    ``processCovValuesForSeq``. When the summed termini coverage is zero, a
    marker file is additionally written in ``dir_seq_res``.

    @param dir_cov_res directory containing the partial coverage files.
    @param idx_refseq index of the reference sequence; used in the file names
           and to index ``inDArgs.refseq_liste``.
    @param nb_pieces number of partial coverage files to load (must be >= 1).
    @param inDArgs object providing ``refseq_liste``, ``refseq_name`` and ``hostseq``.
    @param fParms object providing ``seed``, ``test_run``, ``edge``, ``test``,
           ``surrounding``, ``limit_preferred``, ``limit_fixed`` and ``Mu_threshold``.
    @param inRawDArgs object providing ``analysis_name``, ``tot_reads``,
           ``paired`` and ``host``.
    @param dir_seq_res directory where the "no match" marker file is written.
    @param DR_path directory path where to put DR content; created if missing.

    @return None. The value returned by ``processCovValuesForSeq`` is dropped.

    @raise ValueError if ``nb_pieces`` is lower than 1.
    @raise RuntimeError if ``DR_path`` exists but is not a directory.
    """
    # Without at least one piece none of the accumulators below would exist
    # and the function would fail later with an obscure UnboundLocalError.
    if nb_pieces < 1:
        raise ValueError("nb_pieces must be at least 1")

    if os.path.exists(DR_path):
        if not os.path.isdir(DR_path):
            raise RuntimeError("DR_path must point to a directory")
    else:
        os.mkdir(DR_path)

    DR = {"Headful (pac)": {}, "COS (5')": {}, "COS (3')": {}, "COS": {}, "DTR (short)": {}, "DTR (long)": {},
          "Mu-like": {}, "UNKNOWN": {}, "NEW": {}}

    print("going to load ", nb_pieces, " files for sequence ", idx_refseq)
    _print_timestamp()
    for i in range(0, nb_pieces):
        fic_name = os.path.join(dir_cov_res, "coverage" + str(idx_refseq) + "_" + str(i) + ".npz")
        print("loading file: ", fic_name)
        _print_timestamp()
        partial_res = loadRCRes(fic_name)
        if i == 0:
            # The first piece initialises the accumulators.
            termini_coverage = partial_res.termini_coverage
            whole_coverage = partial_res.whole_coverage
            paired_whole_coverage = partial_res.paired_whole_coverage
            phage_hybrid_coverage = partial_res.phage_hybrid_coverage
            host_hybrid_coverage = partial_res.host_hybrid_coverage
            host_whole_coverage = partial_res.host_whole_coverage
            list_hybrid = partial_res.list_hybrid
            insert = partial_res.insert
            # NOTE(review): accumulated but not used further down (it was only
            # consumed by a removed np.savez call); kept to preserve behavior.
            paired_missmatch = partial_res.paired_mismatch
            reads_tested = partial_res.reads_tested
        else:
            # Every following piece is added to the running totals.
            termini_coverage += partial_res.termini_coverage
            whole_coverage += partial_res.whole_coverage
            paired_whole_coverage += partial_res.paired_whole_coverage
            phage_hybrid_coverage += partial_res.phage_hybrid_coverage
            host_hybrid_coverage += partial_res.host_hybrid_coverage
            host_whole_coverage += partial_res.host_whole_coverage
            list_hybrid += partial_res.list_hybrid
            insert += partial_res.insert
            paired_missmatch += partial_res.paired_mismatch
            reads_tested += partial_res.reads_tested

    # processCovValuesForSeq is given plain lists rather than the loaded arrays.
    termini_coverage = termini_coverage.tolist()
    whole_coverage = whole_coverage.tolist()
    paired_whole_coverage = paired_whole_coverage.tolist()
    phage_hybrid_coverage = phage_hybrid_coverage.tolist()
    host_hybrid_coverage = host_hybrid_coverage.tolist()
    host_whole_coverage = host_whole_coverage.tolist()
    list_hybrid = list_hybrid.tolist()

    if sum(termini_coverage[0]) + sum(termini_coverage[1]) == 0:
        # NOTE(review): "no_natch" looks like a typo for "no_match"; the name is
        # kept unchanged because other code may look for this exact file name.
        no_match_file = "no_natch" + str(idx_refseq)
        # The original code used an undefined name `seq_name` here (NameError).
        # inDArgs.refseq_name is the list of names passed to
        # processCovValuesForSeq below -- presumably the intended source; confirm.
        with open(os.path.join(dir_seq_res, no_match_file), "w") as f:
            f.write(checkReportTitle(inDArgs.refseq_name[idx_refseq]))

    print("finished sum, calling processCovValuesForSeq")
    _print_timestamp()
    # TODO: having so many values in input and returned is ugly and bad for readability and maintainability.
    # Group those that are related in structures.
    refseq = inDArgs.refseq_liste[idx_refseq]
    # The returned statistics are only used when there is a single sequence.
    # This function is meant for viromes processed over several machines of a
    # cluster, where only the DR content written to file is useful, so the
    # return value is deliberately dropped.
    processCovValuesForSeq(refseq, inDArgs.hostseq, inDArgs.refseq_name, inDArgs.refseq_liste, fParms.seed,
                           inRawDArgs.analysis_name, inRawDArgs.tot_reads,
                           idx_refseq, fParms.test_run, inRawDArgs.paired, fParms.edge, inRawDArgs.host,
                           fParms.test, fParms.surrounding,
                           fParms.limit_preferred, fParms.limit_fixed, fParms.Mu_threshold, termini_coverage,
                           whole_coverage,
                           paired_whole_coverage, phage_hybrid_coverage, host_hybrid_coverage,
                           host_whole_coverage, insert, list_hybrid, reads_tested, DR, DR_path)
    print("exit sum_readsCoverage_for_seq")
    _print_timestamp()
91 | |
92 | |
93 | |
94 | |
95 |