comparison coverage_table.py @ 0:7e01297a3b4a draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
author iuc
date Sun, 13 Mar 2022 08:45:32 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:7e01297a3b4a
1 #!/usr/bin/env python
2
3 import argparse
4 import gzip
5 from functools import partial
6
7 from Bio import SeqIO
8
9
10 def generate_coverage_table(input_fasta, input_tabular, gzipped, output):
11 # Read input file into a dict and return everything
12 # in the table format required by CONCOCT.
13 gc_and_len_dict = get_gc_and_len_dict(input_fasta, gzipped)
14 assert(len(gc_and_len_dict) > 0)
15 bed_coverage_dict = get_bed_coverage_dict(input_tabular)
16
17 with open(output, 'w') as fh:
18 # Output the header.
19 fh.write("contig\tlength")
20 t = tuple(range(len(bed_coverage_dict)))
21 fh.write("\tcov_mean_sample_%d\n" % len(t))
22 # Output the content.
23 for acc in gc_and_len_dict:
24 # Fasta stats.
25 fh.write("%s\t%s" % (acc, gc_and_len_dict[acc]['length']))
26 # Mean
27 try:
28 # Coverage mean
29 fh.write("\t%f" % (bed_coverage_dict[acc]["cov_mean"]))
30 except KeyError:
31 # No reads mapped to this contig
32 fh.write("\t0")
33 fh.write("\n")
34
35
36 def get_bed_coverage_dict(input_tabular):
37 # Ddetermine mean coverage and percentage covered
38 # for each contig, returning a dict with fasta id
39 # as key and percentage covered and cov_mean as keys
40 # for the inner dict.
41 out_dict = {}
42
43 with open(input_tabular, 'r') as fh:
44 for line in fh:
45 line = line.rstrip('\r\n')
46 cols = line.split('\t')
47 try:
48 d = out_dict[cols[0]]
49 except KeyError:
50 d = {}
51 out_dict[cols[0]] = d
52 if int(cols[1]) == 0:
53 d["percentage_covered"] = 100 - float(cols[4]) * 100.0
54 else:
55 d["cov_mean"] = d.get("cov_mean", 0) + int(cols[1]) * float(cols[4])
56 return out_dict
57
58
59 def get_gc_and_len_dict(input_fasta, gzipped):
60 # Creates a dictionary with the fasta id as key
61 # and GC and length as keys for the inner dictionary.
62 if gzipped:
63 _open = partial(gzip.open, mode='rt')
64 else:
65 _open = open
66
67 out_dict = {}
68 with _open(input_fasta) as input_fh:
69 for rec in SeqIO.parse(input_fh, "fasta"):
70 out_dict[rec.id] = {}
71 out_dict[rec.id]["length"] = len(rec.seq)
72 return out_dict
73
74
75 parser = argparse.ArgumentParser(description=__doc__)
76 parser.add_argument('--input_tabular', action='store', dest='input_tabular', help='bedtools genomeCoverageBed bed file')
77 parser.add_argument('--input_fasta', action='store', dest='input_fasta', help='Contigs fasta file')
78 parser.add_argument("--gzipped", action="store_true", dest="gzipped", default=False, help="input_fasta is gzipped")
79 parser.add_argument('--output', action='store', dest='output', help='Output file')
80
81 args = parser.parse_args()
82
83 generate_coverage_table(args.input_fasta, args.input_tabular, args.gzipped, args.output)