comparison match_plasmid_to_reference.py @ 0:c917ef6807d7 draft default tip

"planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
author public-health-bioinformatics
date Tue, 12 Nov 2019 22:47:36 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c917ef6807d7
1 #!/usr/bin/env python
2
3 from __future__ import division, print_function
4
5 import argparse
6 import csv
7 import errno
8 import os
9 import re
10 import shutil
11
12
13 MOB_TYPER_FIELDNAMES = [
14 "file_id",
15 "num_contigs",
16 "total_length",
17 "gc",
18 "rep_type(s)",
19 "rep_type_accession(s)",
20 "relaxase_type(s)",
21 "relaxase_type_accession(s)",
22 "mpf_type",
23 "mpf_type_accession(s)",
24 "orit_type(s)",
25 "orit_accession(s)",
26 "PredictedMobility",
27 "mash_nearest_neighbor",
28 "mash_neighbor_distance",
29 "mash_neighbor_cluster",
30 "NCBI-HR-rank",
31 "NCBI-HR-Name",
32 "LitRepHRPlasmClass",
33 "LitPredDBHRRank",
34 "LitPredDBHRRankSciName",
35 "LitRepHRRankInPubs",
36 "LitRepHRNameInPubs",
37 "LitMeanTransferRate",
38 "LitClosestRefAcc",
39 "LitClosestRefDonorStrain",
40 "LitClosestRefRecipientStrain",
41 "LitClosestRefTransferRate",
42 "LitClosestConjugTemp",
43 "LitPMIDs",
44 "LitPMIDsNumber",
45 ]
46
47
48 def parse_mob_typer_report(mob_typer_report_path):
49 mob_typer_report = []
50
51 with open(mob_typer_report_path) as f:
52 reader = csv.DictReader(f, delimiter="\t", quotechar='"', fieldnames=MOB_TYPER_FIELDNAMES)
53 for row in reader:
54 mob_typer_report.append(row)
55 return mob_typer_report
56
57
58 def parse_genbank_accession(genbank_path):
59 with open(genbank_path, 'r') as f:
60 while True:
61 line = f.readline()
62 if line.startswith('ACCESSION'):
63 return line.strip().split()[1]
64
65
66 def parse_fasta_accession(fasta_path):
67 with open(fasta_path, 'r') as f:
68 while True:
69 line = f.readline()
70 if line.startswith('>'):
71 return line.strip().split()[0][1:]
72
73
74 def count_fasta_contigs(fasta_path):
75 contigs = 0
76 with open(fasta_path, 'r') as f:
77 for line in f:
78 if line.startswith('>'):
79 contigs += 1
80 return contigs
81
82
83 def count_fasta_bases(fasta_path):
84 bases = 0
85 with open(fasta_path, 'r') as f:
86 for line in f:
87 line = line.strip()
88 if not line.startswith('>'):
89 bases += len(line)
90 return bases
91
92
93 def compute_fasta_gc_percent(fasta_path):
94 gc_count = 0
95 total_bases_count = 0
96 with open(fasta_path, 'r') as f:
97 for line in f:
98 if not line.startswith('>'):
99 line = line.strip()
100 line_c_count = line.count('c') + line.count('C')
101 line_g_count = line.count('g') + line.count('G')
102 line_total_bases_count = len(line)
103 gc_count += line_c_count + line_g_count
104 total_bases_count += line_total_bases_count
105 return 100 * (gc_count / total_bases_count)
106
107
108 def main(args):
109
110 # create output directory
111 try:
112 os.mkdir(args.outdir)
113 except OSError as exc:
114 if exc.errno == errno.EEXIST and os.path.isdir(args.outdir):
115 pass
116 else:
117 raise
118
119 # parse mob_typer report
120 mob_typer_report = parse_mob_typer_report(args.mob_typer_report)
121 num_plasmid_contigs = count_fasta_contigs(args.plasmid)
122 num_plasmid_bases = count_fasta_bases(args.plasmid)
123 plasmid_gc_percent = compute_fasta_gc_percent(args.plasmid)
124
125 with open(os.path.join(args.outdir, 'mob_typer_record.tsv'), 'w') as f:
126 mob_typer_record_writer = csv.DictWriter(f, delimiter="\t", quotechar='"', fieldnames=MOB_TYPER_FIELDNAMES)
127 mob_typer_record_writer.writeheader()
128 for record in mob_typer_report:
129 # match the plasmid against three properties in the MOB-Typer report:
130 # 1. number of contigs
131 # 2. total length of all contigs
132 # 3. G/C percent (within +/-0.1%)
133 if num_plasmid_contigs == int(record['num_contigs']) and \
134 num_plasmid_bases == int(record['total_length']) and \
135 abs(plasmid_gc_percent - float(record['gc'])) < 0.1:
136 for reference_plasmid in args.reference_plasmids_genbank:
137 if parse_genbank_accession(reference_plasmid) == record['mash_nearest_neighbor']:
138 shutil.copy2(reference_plasmid, os.path.join(args.outdir, "reference_plasmid.gbk"))
139
140 for reference_plasmid in args.reference_plasmids_fasta:
141 if re.match(record['mash_nearest_neighbor'], parse_fasta_accession(reference_plasmid)) is not None:
142 shutil.copy2(reference_plasmid, os.path.join(args.outdir, "reference_plasmid.fasta"))
143 mob_typer_record_writer.writerow(record)
144
145 shutil.copy2(args.plasmid, os.path.join(args.outdir, "plasmid.fasta"))
146
147
148 if __name__ == '__main__':
149 parser = argparse.ArgumentParser()
150 parser.add_argument("--plasmid", help="plasmid assembly (fasta)")
151 parser.add_argument("--reference_plasmids_genbank", nargs='+', help="reference plasmids (genbank)")
152 parser.add_argument("--reference_plasmids_fasta", nargs='+', help="reference plasmids (fasta)")
153 parser.add_argument("--mob_typer_report", help="mob_typer reports (tsv)")
154 parser.add_argument("--outdir", dest="outdir", default=".", help="Output directory")
155 args = parser.parse_args()
156 main(args)