Mercurial > repos > iuc > concoct_extract_fasta_bins
comparison coverage_table.py @ 1:a04028a8181d draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
author | iuc |
---|---|
date | Sun, 13 Mar 2022 08:44:08 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:1bc8fd1b3ed0 | 1:a04028a8181d |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import argparse | |
4 import gzip | |
5 from functools import partial | |
6 | |
7 from Bio import SeqIO | |
8 | |
9 | |
10 def generate_coverage_table(input_fasta, input_tabular, gzipped, output): | |
11 # Read input file into a dict and return everything | |
12 # in the table format required by CONCOCT. | |
13 gc_and_len_dict = get_gc_and_len_dict(input_fasta, gzipped) | |
14 assert(len(gc_and_len_dict) > 0) | |
15 bed_coverage_dict = get_bed_coverage_dict(input_tabular) | |
16 | |
17 with open(output, 'w') as fh: | |
18 # Output the header. | |
19 fh.write("contig\tlength") | |
20 t = tuple(range(len(bed_coverage_dict))) | |
21 fh.write("\tcov_mean_sample_%d\n" % len(t)) | |
22 # Output the content. | |
23 for acc in gc_and_len_dict: | |
24 # Fasta stats. | |
25 fh.write("%s\t%s" % (acc, gc_and_len_dict[acc]['length'])) | |
26 # Mean | |
27 try: | |
28 # Coverage mean | |
29 fh.write("\t%f" % (bed_coverage_dict[acc]["cov_mean"])) | |
30 except KeyError: | |
31 # No reads mapped to this contig | |
32 fh.write("\t0") | |
33 fh.write("\n") | |
34 | |
35 | |
36 def get_bed_coverage_dict(input_tabular): | |
37 # Ddetermine mean coverage and percentage covered | |
38 # for each contig, returning a dict with fasta id | |
39 # as key and percentage covered and cov_mean as keys | |
40 # for the inner dict. | |
41 out_dict = {} | |
42 | |
43 with open(input_tabular, 'r') as fh: | |
44 for line in fh: | |
45 line = line.rstrip('\r\n') | |
46 cols = line.split('\t') | |
47 try: | |
48 d = out_dict[cols[0]] | |
49 except KeyError: | |
50 d = {} | |
51 out_dict[cols[0]] = d | |
52 if int(cols[1]) == 0: | |
53 d["percentage_covered"] = 100 - float(cols[4]) * 100.0 | |
54 else: | |
55 d["cov_mean"] = d.get("cov_mean", 0) + int(cols[1]) * float(cols[4]) | |
56 return out_dict | |
57 | |
58 | |
59 def get_gc_and_len_dict(input_fasta, gzipped): | |
60 # Creates a dictionary with the fasta id as key | |
61 # and GC and length as keys for the inner dictionary. | |
62 if gzipped: | |
63 _open = partial(gzip.open, mode='rt') | |
64 else: | |
65 _open = open | |
66 | |
67 out_dict = {} | |
68 with _open(input_fasta) as input_fh: | |
69 for rec in SeqIO.parse(input_fh, "fasta"): | |
70 out_dict[rec.id] = {} | |
71 out_dict[rec.id]["length"] = len(rec.seq) | |
72 return out_dict | |
73 | |
74 | |
75 parser = argparse.ArgumentParser(description=__doc__) | |
76 parser.add_argument('--input_tabular', action='store', dest='input_tabular', help='bedtools genomeCoverageBed bed file') | |
77 parser.add_argument('--input_fasta', action='store', dest='input_fasta', help='Contigs fasta file') | |
78 parser.add_argument("--gzipped", action="store_true", dest="gzipped", default=False, help="input_fasta is gzipped") | |
79 parser.add_argument('--output', action='store', dest='output', help='Output file') | |
80 | |
81 args = parser.parse_args() | |
82 | |
83 generate_coverage_table(args.input_fasta, args.input_tabular, args.gzipped, args.output) |