Mercurial > repos > public-health-bioinformatics > match_plasmid_to_reference
comparison match_plasmid_to_reference.py @ 0:c917ef6807d7 draft default tip
"planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
author | public-health-bioinformatics |
---|---|
date | Tue, 12 Nov 2019 22:47:36 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c917ef6807d7 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 from __future__ import division, print_function | |
4 | |
5 import argparse | |
6 import csv | |
7 import errno | |
8 import os | |
9 import re | |
10 import shutil | |
11 | |
12 | |
13 MOB_TYPER_FIELDNAMES = [ | |
14 "file_id", | |
15 "num_contigs", | |
16 "total_length", | |
17 "gc", | |
18 "rep_type(s)", | |
19 "rep_type_accession(s)", | |
20 "relaxase_type(s)", | |
21 "relaxase_type_accession(s)", | |
22 "mpf_type", | |
23 "mpf_type_accession(s)", | |
24 "orit_type(s)", | |
25 "orit_accession(s)", | |
26 "PredictedMobility", | |
27 "mash_nearest_neighbor", | |
28 "mash_neighbor_distance", | |
29 "mash_neighbor_cluster", | |
30 "NCBI-HR-rank", | |
31 "NCBI-HR-Name", | |
32 "LitRepHRPlasmClass", | |
33 "LitPredDBHRRank", | |
34 "LitPredDBHRRankSciName", | |
35 "LitRepHRRankInPubs", | |
36 "LitRepHRNameInPubs", | |
37 "LitMeanTransferRate", | |
38 "LitClosestRefAcc", | |
39 "LitClosestRefDonorStrain", | |
40 "LitClosestRefRecipientStrain", | |
41 "LitClosestRefTransferRate", | |
42 "LitClosestConjugTemp", | |
43 "LitPMIDs", | |
44 "LitPMIDsNumber", | |
45 ] | |
46 | |
47 | |
48 def parse_mob_typer_report(mob_typer_report_path): | |
49 mob_typer_report = [] | |
50 | |
51 with open(mob_typer_report_path) as f: | |
52 reader = csv.DictReader(f, delimiter="\t", quotechar='"', fieldnames=MOB_TYPER_FIELDNAMES) | |
53 for row in reader: | |
54 mob_typer_report.append(row) | |
55 return mob_typer_report | |
56 | |
57 | |
58 def parse_genbank_accession(genbank_path): | |
59 with open(genbank_path, 'r') as f: | |
60 while True: | |
61 line = f.readline() | |
62 if line.startswith('ACCESSION'): | |
63 return line.strip().split()[1] | |
64 | |
65 | |
66 def parse_fasta_accession(fasta_path): | |
67 with open(fasta_path, 'r') as f: | |
68 while True: | |
69 line = f.readline() | |
70 if line.startswith('>'): | |
71 return line.strip().split()[0][1:] | |
72 | |
73 | |
74 def count_fasta_contigs(fasta_path): | |
75 contigs = 0 | |
76 with open(fasta_path, 'r') as f: | |
77 for line in f: | |
78 if line.startswith('>'): | |
79 contigs += 1 | |
80 return contigs | |
81 | |
82 | |
83 def count_fasta_bases(fasta_path): | |
84 bases = 0 | |
85 with open(fasta_path, 'r') as f: | |
86 for line in f: | |
87 line = line.strip() | |
88 if not line.startswith('>'): | |
89 bases += len(line) | |
90 return bases | |
91 | |
92 | |
93 def compute_fasta_gc_percent(fasta_path): | |
94 gc_count = 0 | |
95 total_bases_count = 0 | |
96 with open(fasta_path, 'r') as f: | |
97 for line in f: | |
98 if not line.startswith('>'): | |
99 line = line.strip() | |
100 line_c_count = line.count('c') + line.count('C') | |
101 line_g_count = line.count('g') + line.count('G') | |
102 line_total_bases_count = len(line) | |
103 gc_count += line_c_count + line_g_count | |
104 total_bases_count += line_total_bases_count | |
105 return 100 * (gc_count / total_bases_count) | |
106 | |
107 | |
108 def main(args): | |
109 | |
110 # create output directory | |
111 try: | |
112 os.mkdir(args.outdir) | |
113 except OSError as exc: | |
114 if exc.errno == errno.EEXIST and os.path.isdir(args.outdir): | |
115 pass | |
116 else: | |
117 raise | |
118 | |
119 # parse mob_typer report | |
120 mob_typer_report = parse_mob_typer_report(args.mob_typer_report) | |
121 num_plasmid_contigs = count_fasta_contigs(args.plasmid) | |
122 num_plasmid_bases = count_fasta_bases(args.plasmid) | |
123 plasmid_gc_percent = compute_fasta_gc_percent(args.plasmid) | |
124 | |
125 with open(os.path.join(args.outdir, 'mob_typer_record.tsv'), 'w') as f: | |
126 mob_typer_record_writer = csv.DictWriter(f, delimiter="\t", quotechar='"', fieldnames=MOB_TYPER_FIELDNAMES) | |
127 mob_typer_record_writer.writeheader() | |
128 for record in mob_typer_report: | |
129 # match the plasmid against three properties in the MOB-Typer report: | |
130 # 1. number of contigs | |
131 # 2. total length of all contigs | |
132 # 3. G/C percent (within +/-0.1%) | |
133 if num_plasmid_contigs == int(record['num_contigs']) and \ | |
134 num_plasmid_bases == int(record['total_length']) and \ | |
135 abs(plasmid_gc_percent - float(record['gc'])) < 0.1: | |
136 for reference_plasmid in args.reference_plasmids_genbank: | |
137 if parse_genbank_accession(reference_plasmid) == record['mash_nearest_neighbor']: | |
138 shutil.copy2(reference_plasmid, os.path.join(args.outdir, "reference_plasmid.gbk")) | |
139 | |
140 for reference_plasmid in args.reference_plasmids_fasta: | |
141 if re.match(record['mash_nearest_neighbor'], parse_fasta_accession(reference_plasmid)) is not None: | |
142 shutil.copy2(reference_plasmid, os.path.join(args.outdir, "reference_plasmid.fasta")) | |
143 mob_typer_record_writer.writerow(record) | |
144 | |
145 shutil.copy2(args.plasmid, os.path.join(args.outdir, "plasmid.fasta")) | |
146 | |
147 | |
148 if __name__ == '__main__': | |
149 parser = argparse.ArgumentParser() | |
150 parser.add_argument("--plasmid", help="plasmid assembly (fasta)") | |
151 parser.add_argument("--reference_plasmids_genbank", nargs='+', help="reference plasmids (genbank)") | |
152 parser.add_argument("--reference_plasmids_fasta", nargs='+', help="reference plasmids (fasta)") | |
153 parser.add_argument("--mob_typer_report", help="mob_typer reports (tsv)") | |
154 parser.add_argument("--outdir", dest="outdir", default=".", help="Output directory") | |
155 args = parser.parse_args() | |
156 main(args) |