annotate match_plasmid_to_reference.py @ 0:c917ef6807d7 draft default tip

"planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
author public-health-bioinformatics
date Tue, 12 Nov 2019 22:47:36 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
1 #!/usr/bin/env python
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
2
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
3 from __future__ import division, print_function
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
4
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
5 import argparse
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
6 import csv
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
7 import errno
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
8 import os
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
9 import re
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
10 import shutil
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
11
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
12
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
# Column names assigned, in order, to the tab-separated fields of each
# mob_typer report line; passed as `fieldnames` to csv.DictReader in
# parse_mob_typer_report().
# NOTE(review): presumably mirrors the column order emitted by the mob_typer
# release this tool targets -- confirm before changing the mob_typer version.
MOB_TYPER_FIELDNAMES = [
    "file_id",
    "num_contigs",
    "total_length",
    "gc",
    "rep_type(s)",
    "rep_type_accession(s)",
    "relaxase_type(s)",
    "relaxase_type_accession(s)",
    "mpf_type",
    "mpf_type_accession(s)",
    "orit_type(s)",
    "orit_accession(s)",
    "PredictedMobility",
    "mash_nearest_neighbor",
    "mash_neighbor_distance",
    "mash_neighbor_cluster",
    "NCBI-HR-rank",
    "NCBI-HR-Name",
    "LitRepHRPlasmClass",
    "LitPredDBHRRank",
    "LitPredDBHRRankSciName",
    "LitRepHRRankInPubs",
    "LitRepHRNameInPubs",
    "LitMeanTransferRate",
    "LitClosestRefAcc",
    "LitClosestRefDonorStrain",
    "LitClosestRefRecipientStrain",
    "LitClosestRefTransferRate",
    "LitClosestConjugTemp",
    "LitPMIDs",
    "LitPMIDsNumber",
]
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
46
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
47
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
def parse_mob_typer_report(mob_typer_report_path):
    """Load a tab-delimited mob_typer report as a list of row mappings.

    Each line of the file becomes one mapping keyed by MOB_TYPER_FIELDNAMES.
    Because the field names are supplied explicitly, csv.DictReader does not
    consume any line as a header: every line is returned as a data row.

    :param mob_typer_report_path: path to the mob_typer report file
    :return: list of per-line mappings, in file order
    """
    with open(mob_typer_report_path) as report_file:
        return list(
            csv.DictReader(
                report_file,
                fieldnames=MOB_TYPER_FIELDNAMES,
                delimiter="\t",
                quotechar='"',
            )
        )
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
56
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
57
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
def parse_genbank_accession(genbank_path):
    """Return the accession recorded in a GenBank flat file.

    The file is scanned for the first line that starts with ``ACCESSION``;
    the first whitespace-separated token after that keyword is returned.

    :param genbank_path: path to the GenBank file
    :return: the accession string, or None when the file has no
        ``ACCESSION`` line or that line carries no value
    """
    with open(genbank_path, 'r') as f:
        # Iterate the file instead of looping on readline(): readline()
        # returns '' forever at EOF, so a file without an ACCESSION line
        # would otherwise spin in an endless loop.
        for line in f:
            if line.startswith('ACCESSION'):
                fields = line.split()
                return fields[1] if len(fields) > 1 else None
    return None
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
64
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
65
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
def parse_fasta_accession(fasta_path):
    """Return the identifier of the first record in a FASTA file.

    The identifier is the first whitespace-separated token of the first
    header line (a line starting with ``>``), with the ``>`` removed.

    :param fasta_path: path to the FASTA file
    :return: the identifier string (possibly empty if the header is just
        ``>`` or has a space right after it), or None when the file
        contains no header line
    """
    with open(fasta_path, 'r') as f:
        # Iterate the file instead of looping on readline(): readline()
        # returns '' forever at EOF, so a file without any '>' line would
        # otherwise spin in an endless loop.
        for line in f:
            if line.startswith('>'):
                return line.strip().split()[0][1:]
    return None
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
72
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
73
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
def count_fasta_contigs(fasta_path):
    """Count the records in a FASTA file.

    A record is any line whose very first character is ``>``.

    :param fasta_path: path to the FASTA file
    :return: number of header lines found (0 for an empty file)
    """
    with open(fasta_path, 'r') as fasta_file:
        return sum(1 for entry in fasta_file if entry.startswith('>'))
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
81
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
82
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
def count_fasta_bases(fasta_path):
    """Sum the sequence lengths of every record in a FASTA file.

    Each line is stripped of surrounding whitespace first; stripped lines
    that begin with ``>`` are treated as headers and ignored, and the
    lengths of all remaining lines are added together.

    :param fasta_path: path to the FASTA file
    :return: total number of sequence characters (0 for an empty file)
    """
    with open(fasta_path, 'r') as fasta_file:
        trimmed_lines = (raw.strip() for raw in fasta_file)
        return sum(
            len(sequence)
            for sequence in trimmed_lines
            if not sequence.startswith('>')
        )
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
91
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
92
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
def compute_fasta_gc_percent(fasta_path):
    """Compute the GC content of a FASTA file as a percentage.

    Header lines (starting with ``>``) are skipped. On every other line,
    upper- and lower-case ``G``/``C`` characters are counted against the
    stripped line length.

    :param fasta_path: path to the FASTA file
    :return: GC percentage in the range 0-100; 0.0 when the file holds no
        sequence characters at all (empty file or headers only), instead
        of failing with ZeroDivisionError
    """
    gc_count = 0
    total_bases_count = 0
    with open(fasta_path, 'r') as f:
        for line in f:
            if line.startswith('>'):
                continue
            sequence = line.strip()
            gc_count += sum(sequence.count(base) for base in 'cCgG')
            total_bases_count += len(sequence)
    if total_bases_count == 0:
        # No sequence data: a GC fraction is undefined, report 0.0.
        return 0.0
    # True division here relies on the module-level
    # `from __future__ import division` when run under Python 2.
    return 100 * (gc_count / total_bases_count)
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
106
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
107
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
def main(args):
    """Match a plasmid assembly to its MOB-Typer record and reference plasmid.

    Every record of the MOB-Typer report is compared against the plasmid
    assembly on three properties: number of contigs, total length, and G/C
    percent (absolute difference below 0.1).  For each matching record the
    reference plasmid(s) whose accession corresponds to the record's
    ``mash_nearest_neighbor`` are copied into the output directory, and the
    record itself is written to ``mob_typer_record.tsv``.  The plasmid
    assembly is always copied to ``plasmid.fasta``.

    Args:
        args: parsed command-line namespace providing ``outdir``,
            ``mob_typer_report``, ``plasmid``, ``reference_plasmids_genbank``
            and ``reference_plasmids_fasta``.  The two reference lists may be
            ``None`` (option omitted); they are then treated as empty.

    Raises:
        OSError: if the output directory cannot be created for any reason
            other than already existing as a directory.
    """
    # create output directory (including missing parents); an existing
    # directory is accepted, every other failure is propagated
    try:
        os.makedirs(args.outdir)
    except OSError as exc:
        if not (exc.errno == errno.EEXIST and os.path.isdir(args.outdir)):
            raise

    # parse mob_typer report and summarise the plasmid assembly
    mob_typer_report = parse_mob_typer_report(args.mob_typer_report)
    num_plasmid_contigs = count_fasta_contigs(args.plasmid)
    num_plasmid_bases = count_fasta_bases(args.plasmid)
    plasmid_gc_percent = compute_fasta_gc_percent(args.plasmid)

    # argparse leaves an omitted nargs='+' option as None, which cannot be
    # iterated; treat that as "no reference plasmids supplied"
    reference_plasmids_genbank = args.reference_plasmids_genbank or []
    reference_plasmids_fasta = args.reference_plasmids_fasta or []

    with open(os.path.join(args.outdir, 'mob_typer_record.tsv'), 'w') as f:
        mob_typer_record_writer = csv.DictWriter(
            f, delimiter="\t", quotechar='"', fieldnames=MOB_TYPER_FIELDNAMES)
        mob_typer_record_writer.writeheader()
        for record in mob_typer_report:
            # match the plasmid against three properties in the MOB-Typer report:
            # 1. number of contigs
            # 2. total length of all contigs
            # 3. G/C percent (within +/-0.1%)
            is_match = (
                num_plasmid_contigs == int(record['num_contigs']) and
                num_plasmid_bases == int(record['total_length']) and
                abs(plasmid_gc_percent - float(record['gc'])) < 0.1
            )
            if not is_match:
                continue

            nearest_neighbor = record['mash_nearest_neighbor']

            for reference_plasmid in reference_plasmids_genbank:
                if parse_genbank_accession(reference_plasmid) == nearest_neighbor:
                    shutil.copy2(reference_plasmid, os.path.join(args.outdir, "reference_plasmid.gbk"))

            for reference_plasmid in reference_plasmids_fasta:
                # Literal prefix comparison: the accession must not be
                # interpreted as a regular expression ('.' would match any
                # character), and an empty accession must not match every
                # reference file.
                fasta_accession = parse_fasta_accession(reference_plasmid)
                if nearest_neighbor and fasta_accession.startswith(nearest_neighbor):
                    shutil.copy2(reference_plasmid, os.path.join(args.outdir, "reference_plasmid.fasta"))

            mob_typer_record_writer.writerow(record)

    shutil.copy2(args.plasmid, os.path.join(args.outdir, "plasmid.fasta"))
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
146
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
147
c917ef6807d7 "planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/tree/master/tools/match_plasmid_to_reference commit 0f3fff91eb329adf437224eb8f7449853083b01e"
public-health-bioinformatics
parents:
diff changeset
if __name__ == '__main__':
    # Script entry point: declare the command-line options in a table,
    # register them in order, then hand the parsed namespace to main().
    parser = argparse.ArgumentParser()
    for flag, options in (
        ("--plasmid", {"help": "plasmid assembly (fasta)"}),
        ("--reference_plasmids_genbank", {"nargs": '+', "help": "reference plasmids (genbank)"}),
        ("--reference_plasmids_fasta", {"nargs": '+', "help": "reference plasmids (fasta)"}),
        ("--mob_typer_report", {"help": "mob_typer reports (tsv)"}),
        ("--outdir", {"dest": "outdir", "default": ".", "help": "Output directory"}),
    ):
        parser.add_argument(flag, **options)
    args = parser.parse_args()
    main(args)