Mercurial > repos > iuc > concoct_merge_cut_up_clustering
comparison coverage_table.py @ 0:b546422c9128 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
| author | iuc |
|---|---|
| date | Sun, 13 Mar 2022 08:44:34 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:b546422c9128 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import argparse | |
| 4 import gzip | |
| 5 from functools import partial | |
| 6 | |
| 7 from Bio import SeqIO | |
| 8 | |
| 9 | |
| 10 def generate_coverage_table(input_fasta, input_tabular, gzipped, output): | |
| 11 # Read input file into a dict and return everything | |
| 12 # in the table format required by CONCOCT. | |
| 13 gc_and_len_dict = get_gc_and_len_dict(input_fasta, gzipped) | |
| 14 assert(len(gc_and_len_dict) > 0) | |
| 15 bed_coverage_dict = get_bed_coverage_dict(input_tabular) | |
| 16 | |
| 17 with open(output, 'w') as fh: | |
| 18 # Output the header. | |
| 19 fh.write("contig\tlength") | |
| 20 t = tuple(range(len(bed_coverage_dict))) | |
| 21 fh.write("\tcov_mean_sample_%d\n" % len(t)) | |
| 22 # Output the content. | |
| 23 for acc in gc_and_len_dict: | |
| 24 # Fasta stats. | |
| 25 fh.write("%s\t%s" % (acc, gc_and_len_dict[acc]['length'])) | |
| 26 # Mean | |
| 27 try: | |
| 28 # Coverage mean | |
| 29 fh.write("\t%f" % (bed_coverage_dict[acc]["cov_mean"])) | |
| 30 except KeyError: | |
| 31 # No reads mapped to this contig | |
| 32 fh.write("\t0") | |
| 33 fh.write("\n") | |
| 34 | |
| 35 | |
| 36 def get_bed_coverage_dict(input_tabular): | |
| 37 # Ddetermine mean coverage and percentage covered | |
| 38 # for each contig, returning a dict with fasta id | |
| 39 # as key and percentage covered and cov_mean as keys | |
| 40 # for the inner dict. | |
| 41 out_dict = {} | |
| 42 | |
| 43 with open(input_tabular, 'r') as fh: | |
| 44 for line in fh: | |
| 45 line = line.rstrip('\r\n') | |
| 46 cols = line.split('\t') | |
| 47 try: | |
| 48 d = out_dict[cols[0]] | |
| 49 except KeyError: | |
| 50 d = {} | |
| 51 out_dict[cols[0]] = d | |
| 52 if int(cols[1]) == 0: | |
| 53 d["percentage_covered"] = 100 - float(cols[4]) * 100.0 | |
| 54 else: | |
| 55 d["cov_mean"] = d.get("cov_mean", 0) + int(cols[1]) * float(cols[4]) | |
| 56 return out_dict | |
| 57 | |
| 58 | |
| 59 def get_gc_and_len_dict(input_fasta, gzipped): | |
| 60 # Creates a dictionary with the fasta id as key | |
| 61 # and GC and length as keys for the inner dictionary. | |
| 62 if gzipped: | |
| 63 _open = partial(gzip.open, mode='rt') | |
| 64 else: | |
| 65 _open = open | |
| 66 | |
| 67 out_dict = {} | |
| 68 with _open(input_fasta) as input_fh: | |
| 69 for rec in SeqIO.parse(input_fh, "fasta"): | |
| 70 out_dict[rec.id] = {} | |
| 71 out_dict[rec.id]["length"] = len(rec.seq) | |
| 72 return out_dict | |
| 73 | |
| 74 | |
| 75 parser = argparse.ArgumentParser(description=__doc__) | |
| 76 parser.add_argument('--input_tabular', action='store', dest='input_tabular', help='bedtools genomeCoverageBed bed file') | |
| 77 parser.add_argument('--input_fasta', action='store', dest='input_fasta', help='Contigs fasta file') | |
| 78 parser.add_argument("--gzipped", action="store_true", dest="gzipped", default=False, help="input_fasta is gzipped") | |
| 79 parser.add_argument('--output', action='store', dest='output', help='Output file') | |
| 80 | |
| 81 args = parser.parse_args() | |
| 82 | |
| 83 generate_coverage_table(args.input_fasta, args.input_tabular, args.gzipped, args.output) |
