Mercurial > repos > public-health-bioinformatics > match_plasmid_to_reference
comparison match_plasmid_to_reference.py @ 0:c917ef6807d7 draft default tip
"planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
| author | public-health-bioinformatics |
|---|---|
| date | Tue, 12 Nov 2019 22:47:36 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c917ef6807d7 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 from __future__ import division, print_function | |
| 4 | |
| 5 import argparse | |
| 6 import csv | |
| 7 import errno | |
| 8 import os | |
| 9 import re | |
| 10 import shutil | |
| 11 | |
| 12 | |
| 13 MOB_TYPER_FIELDNAMES = [ | |
| 14 "file_id", | |
| 15 "num_contigs", | |
| 16 "total_length", | |
| 17 "gc", | |
| 18 "rep_type(s)", | |
| 19 "rep_type_accession(s)", | |
| 20 "relaxase_type(s)", | |
| 21 "relaxase_type_accession(s)", | |
| 22 "mpf_type", | |
| 23 "mpf_type_accession(s)", | |
| 24 "orit_type(s)", | |
| 25 "orit_accession(s)", | |
| 26 "PredictedMobility", | |
| 27 "mash_nearest_neighbor", | |
| 28 "mash_neighbor_distance", | |
| 29 "mash_neighbor_cluster", | |
| 30 "NCBI-HR-rank", | |
| 31 "NCBI-HR-Name", | |
| 32 "LitRepHRPlasmClass", | |
| 33 "LitPredDBHRRank", | |
| 34 "LitPredDBHRRankSciName", | |
| 35 "LitRepHRRankInPubs", | |
| 36 "LitRepHRNameInPubs", | |
| 37 "LitMeanTransferRate", | |
| 38 "LitClosestRefAcc", | |
| 39 "LitClosestRefDonorStrain", | |
| 40 "LitClosestRefRecipientStrain", | |
| 41 "LitClosestRefTransferRate", | |
| 42 "LitClosestConjugTemp", | |
| 43 "LitPMIDs", | |
| 44 "LitPMIDsNumber", | |
| 45 ] | |
| 46 | |
| 47 | |
| 48 def parse_mob_typer_report(mob_typer_report_path): | |
| 49 mob_typer_report = [] | |
| 50 | |
| 51 with open(mob_typer_report_path) as f: | |
| 52 reader = csv.DictReader(f, delimiter="\t", quotechar='"', fieldnames=MOB_TYPER_FIELDNAMES) | |
| 53 for row in reader: | |
| 54 mob_typer_report.append(row) | |
| 55 return mob_typer_report | |
| 56 | |
| 57 | |
| 58 def parse_genbank_accession(genbank_path): | |
| 59 with open(genbank_path, 'r') as f: | |
| 60 while True: | |
| 61 line = f.readline() | |
| 62 if line.startswith('ACCESSION'): | |
| 63 return line.strip().split()[1] | |
| 64 | |
| 65 | |
| 66 def parse_fasta_accession(fasta_path): | |
| 67 with open(fasta_path, 'r') as f: | |
| 68 while True: | |
| 69 line = f.readline() | |
| 70 if line.startswith('>'): | |
| 71 return line.strip().split()[0][1:] | |
| 72 | |
| 73 | |
| 74 def count_fasta_contigs(fasta_path): | |
| 75 contigs = 0 | |
| 76 with open(fasta_path, 'r') as f: | |
| 77 for line in f: | |
| 78 if line.startswith('>'): | |
| 79 contigs += 1 | |
| 80 return contigs | |
| 81 | |
| 82 | |
| 83 def count_fasta_bases(fasta_path): | |
| 84 bases = 0 | |
| 85 with open(fasta_path, 'r') as f: | |
| 86 for line in f: | |
| 87 line = line.strip() | |
| 88 if not line.startswith('>'): | |
| 89 bases += len(line) | |
| 90 return bases | |
| 91 | |
| 92 | |
| 93 def compute_fasta_gc_percent(fasta_path): | |
| 94 gc_count = 0 | |
| 95 total_bases_count = 0 | |
| 96 with open(fasta_path, 'r') as f: | |
| 97 for line in f: | |
| 98 if not line.startswith('>'): | |
| 99 line = line.strip() | |
| 100 line_c_count = line.count('c') + line.count('C') | |
| 101 line_g_count = line.count('g') + line.count('G') | |
| 102 line_total_bases_count = len(line) | |
| 103 gc_count += line_c_count + line_g_count | |
| 104 total_bases_count += line_total_bases_count | |
| 105 return 100 * (gc_count / total_bases_count) | |
| 106 | |
| 107 | |
| 108 def main(args): | |
| 109 | |
| 110 # create output directory | |
| 111 try: | |
| 112 os.mkdir(args.outdir) | |
| 113 except OSError as exc: | |
| 114 if exc.errno == errno.EEXIST and os.path.isdir(args.outdir): | |
| 115 pass | |
| 116 else: | |
| 117 raise | |
| 118 | |
| 119 # parse mob_typer report | |
| 120 mob_typer_report = parse_mob_typer_report(args.mob_typer_report) | |
| 121 num_plasmid_contigs = count_fasta_contigs(args.plasmid) | |
| 122 num_plasmid_bases = count_fasta_bases(args.plasmid) | |
| 123 plasmid_gc_percent = compute_fasta_gc_percent(args.plasmid) | |
| 124 | |
| 125 with open(os.path.join(args.outdir, 'mob_typer_record.tsv'), 'w') as f: | |
| 126 mob_typer_record_writer = csv.DictWriter(f, delimiter="\t", quotechar='"', fieldnames=MOB_TYPER_FIELDNAMES) | |
| 127 mob_typer_record_writer.writeheader() | |
| 128 for record in mob_typer_report: | |
| 129 # match the plasmid against three properties in the MOB-Typer report: | |
| 130 # 1. number of contigs | |
| 131 # 2. total length of all contigs | |
| 132 # 3. G/C percent (within +/-0.1%) | |
| 133 if num_plasmid_contigs == int(record['num_contigs']) and \ | |
| 134 num_plasmid_bases == int(record['total_length']) and \ | |
| 135 abs(plasmid_gc_percent - float(record['gc'])) < 0.1: | |
| 136 for reference_plasmid in args.reference_plasmids_genbank: | |
| 137 if parse_genbank_accession(reference_plasmid) == record['mash_nearest_neighbor']: | |
| 138 shutil.copy2(reference_plasmid, os.path.join(args.outdir, "reference_plasmid.gbk")) | |
| 139 | |
| 140 for reference_plasmid in args.reference_plasmids_fasta: | |
| 141 if re.match(record['mash_nearest_neighbor'], parse_fasta_accession(reference_plasmid)) is not None: | |
| 142 shutil.copy2(reference_plasmid, os.path.join(args.outdir, "reference_plasmid.fasta")) | |
| 143 mob_typer_record_writer.writerow(record) | |
| 144 | |
| 145 shutil.copy2(args.plasmid, os.path.join(args.outdir, "plasmid.fasta")) | |
| 146 | |
| 147 | |
| 148 if __name__ == '__main__': | |
| 149 parser = argparse.ArgumentParser() | |
| 150 parser.add_argument("--plasmid", help="plasmid assembly (fasta)") | |
| 151 parser.add_argument("--reference_plasmids_genbank", nargs='+', help="reference plasmids (genbank)") | |
| 152 parser.add_argument("--reference_plasmids_fasta", nargs='+', help="reference plasmids (fasta)") | |
| 153 parser.add_argument("--mob_typer_report", help="mob_typer reports (tsv)") | |
| 154 parser.add_argument("--outdir", dest="outdir", default=".", help="Output directory") | |
| 155 args = parser.parse_args() | |
| 156 main(args) |
