annotate vsnp_get_snps.py @ 7:57bd5b859e86 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit c38fd63f7980c70390d104a73ba4c72b266444c3
author iuc
date Fri, 10 Jun 2022 06:10:23 +0000
parents 532a11cdd818
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
1 #!/usr/bin/env python
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
2
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
3 # Collect quality parsimonious SNPs from vcf files
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
4 # and output alignment files in fasta format.
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
5
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
6 import argparse
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
7 import multiprocessing
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
8 import os
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
9 import queue
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
10 import shutil
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
11 import sys
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
12 import time
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
13 from collections import OrderedDict
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
14 from datetime import datetime
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
15
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
16 import pandas
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
17 import vcf
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
18
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
19
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
def get_time_stamp():
    # Format the current local time as 'YYYY-MM-DD HH-MM-SS'
    # (dashes, not colons, separate the time fields).
    seconds_since_epoch = time.time()
    moment = datetime.fromtimestamp(seconds_since_epoch)
    return moment.strftime('%Y-%m-%d %H-%M-%S')
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
22
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
23
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
def setup_all_vcfs(vcf_files, vcf_dirs):
    # Create the all_vcf directory and symlink every input vcf
    # file into it for processing.
    #
    # vcf_files: iterable of paths to the input vcf files.
    # vcf_dirs: list of directory names; 'all_vcf' is appended
    #     to it in place.
    # Returns the same vcf_dirs list that was passed in.
    # Raises FileExistsError if all_vcf (or a link inside it)
    # already exists.
    all_vcfs_dir = 'all_vcf'
    os.makedirs(all_vcfs_dir)
    vcf_dirs.append(all_vcfs_dir)
    for vcf_file in vcf_files:
        # A relative symlink target is resolved against the directory
        # that holds the link, not the current working directory, so
        # link to the absolute path to keep relative inputs from
        # producing dangling links.
        src_file = os.path.abspath(vcf_file)
        dst_file = os.path.join(all_vcfs_dir, os.path.basename(vcf_file))
        os.symlink(src_file, dst_file)
    return vcf_dirs
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
35
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
36
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
37 class SnpFinder:
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
38
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
def __init__(self, num_files, dbkey, input_excel, all_isolates, ac, min_mq, quality_score_n_threshold, min_quality_score, input_vcf_dir, output_json_avg_mq_dir, output_json_snps_dir, output_snps_dir, output_summary):
    # Store the run configuration, reset the evolving state and
    # start the html summary.

    # --- Inputs ---
    # Number of input zero coverage vcf files.
    self.num_files = num_files
    # Directory of input zero coverage vcf files.
    self.input_vcf_dir = input_vcf_dir
    # Excel file for grouping.
    self.input_excel = input_excel
    self.dbkey = dbkey

    # --- Thresholds ---
    # Allele count.
    self.ac = ac
    # Minimum map quality value.
    self.min_mq = min_mq
    # Minimum quality score value.
    self.min_quality_score = min_quality_score
    # Quality score N threshold value.
    self.quality_score_n_threshold = quality_score_n_threshold

    # --- Output locations ---
    # Output directory for json average mq files.
    self.output_json_avg_mq_dir = output_json_avg_mq_dir
    # Output directory for json snps files.
    self.output_json_snps_dir = output_json_snps_dir
    # Output directory for snps files.
    self.output_snps_dir = output_snps_dir

    # --- Evolving state ---
    # Create a group that will contain all isolates.
    self.all_isolates = all_isolates
    # Evolving positions dictionary.
    self.all_positions = None
    # Isolate groups.
    self.groups = []

    # --- Summary bookkeeping ---
    self.start_time = get_time_stamp()
    self.summary_str = ""
    self.timer_start = datetime.now()
    # Must run last: by now every attribute above has been set.
    self.initiate_summary(output_summary)
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
71
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
def append_to_summary(self, html_str):
    # Extend the accumulated html summary text with html_str.
    # Both pieces go through str(), exactly as "%s" formatting does.
    existing = str(self.summary_str)
    addition = str(html_str)
    self.summary_str = existing + addition
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
75
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
def bin_input_files(self, filename, samples_groups_dict, defining_snps, inverted_defining_snps, found_positions, found_positions_mix):
    # Categorize an input file into closely related isolate groups
    # based on discovered SNPs, copy the file into each matching
    # group, and return the updated group dictionary.
    #
    # filename: path of the vcf file being binned.
    # samples_groups_dict: dict updated in place and returned; the
    #     sample name is mapped to its sorted list of groups.
    # defining_snps, inverted_defining_snps: dicts mapping an
    #     absolute position to a group.
    # found_positions, found_positions_mix: dicts keyed by absolute
    #     position; only their keys are consulted here.
    sample_groups_list = []
    table_name = self.get_sample_name(filename)
    defining_snp = False
    # Every absolute position found in the sample (plain or mixed).
    sample_positions = found_positions.keys() | found_positions_mix.keys()
    # The sample is flagged as mixed when any defining position was
    # found as a mixed call.  This does not depend on the position
    # being visited, so it is decided once up front; a mixed hit
    # guarantees the loop below runs at least once.
    if defining_snps.keys() & found_positions_mix.keys():
        table_name = '%s<font color="red">[[MIXED]]</font>' % table_name
    for abs_position in defining_snps.keys() & sample_positions:
        group = defining_snps[abs_position]
        sample_groups_list.append(group)
        self.check_add_group(group)
        self.copy_file(filename, group)
        defining_snp = True
    # Inverted groups apply only when the sample has none of the
    # inverted defining positions; it then joins all of them.
    if inverted_defining_snps.keys().isdisjoint(sample_positions):
        for group in inverted_defining_snps.values():
            sample_groups_list.append(group)
            self.check_add_group(group)
            self.copy_file(filename, group)
            defining_snp = True
    if defining_snp:
        samples_groups_dict[table_name] = sorted(sample_groups_list)
    else:
        samples_groups_dict[table_name] = ['<font color="red">No defining SNP</font>']
    return samples_groups_dict
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
105
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
def check_add_group(self, group):
    # Record the group unless it is already in the list.
    if group in self.groups:
        return
    self.groups.append(group)
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
110
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
def copy_file(self, filename, dir):
    # Copy filename into the directory dir, creating the directory
    # (and any missing parents) first.
    #
    # Creating the directory and tolerating "already exists" avoids
    # the check-then-create race that occurs when several workers
    # bin files into the same group directory at the same time.
    try:
        os.makedirs(dir)
    except FileExistsError:
        pass
    shutil.copy(filename, dir)
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
115
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
116 def decide_snps(self, filename):
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
117 # Find the SNPs in a vcf file to produce a pandas data
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
118 # frame and a dictionary containing sample map qualities.
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
119 positions_dict = self.all_positions
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
120 sample_map_qualities = {}
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
121 # Eliminate the path.
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
122 file_name_base = self.get_sample_name(filename)
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
123 vcf_reader = vcf.Reader(open(filename, 'r'))
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
124 sample_dict = {}
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
125 for record in vcf_reader:
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
126 alt = str(record.ALT[0])
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
127 record_position = "%s:%s" % (str(record.CHROM), str(record.POS))
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
128 if record_position in positions_dict:
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
129 if alt == "None":
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
130 sample_dict.update({record_position: "-"})
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
131 else:
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
132 # On rare occasions MQM gets called "NaN", thus passing
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
133 # a string when a number is expected when calculating average.
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
134 mq_val = self.get_mq_val(record.INFO, filename)
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
135 if str(mq_val).lower() not in ["nan"]:
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
136 sample_map_qualities.update({record_position: mq_val})
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
137 if len(alt) == 1:
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
138 qual_val = self.val_as_int(record.QUAL)
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
139 ac = record.INFO['AC'][0]
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
140 ref = str(record.REF[0])
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
141 if ac == 2 and qual_val > self.quality_score_n_threshold:
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
142 # Add the SNP to a group.
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
143 sample_dict.update({record_position: alt})
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
144 elif ac == 1 and qual_val > self.quality_score_n_threshold:
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
145 # The position is ambiguous.
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
146 alt_ref = "%s%s" % (alt, ref)
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
147 if alt_ref == "AG":
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
148 sample_dict.update({record_position: "R"})
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
149 elif alt_ref == "CT":
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
150 sample_dict.update({record_position: "Y"})
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
151 elif alt_ref == "GC":
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
152 sample_dict.update({record_position: "S"})
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
153 elif alt_ref == "AT":
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
154 sample_dict.update({record_position: "W"})
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
155 elif alt_ref == "GT":
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
156 sample_dict.update({record_position: "K"})
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
157 elif alt_ref == "AC":
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
158 sample_dict.update({record_position: "M"})
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
159 elif alt_ref == "GA":
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
160 sample_dict.update({record_position: "R"})
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
161 elif alt_ref == "TC":
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
162 sample_dict.update({record_position: "Y"})
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
163 elif alt_ref == "CG":
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
164 sample_dict.update({record_position: "S"})
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
165 elif alt_ref == "TA":
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
166 sample_dict.update({record_position: "W"})
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
167 elif alt_ref == "TG":
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
168 sample_dict.update({record_position: "K"})
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
169 elif alt_ref == "CA":
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
170 sample_dict.update({record_position: "M"})
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
171 else:
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
172 sample_dict.update({record_position: "N"})
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
173 # Poor calls
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
174 elif qual_val <= 50:
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
175 # Call the reference allele.
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
176 # Do not coerce record.REF[0] to a string!
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
177 sample_dict.update({record_position: record.REF[0]})
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
178 elif qual_val <= self.quality_score_n_threshold:
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
179 sample_dict.update({record_position: "N"})
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
180 else:
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
181 # Insurance -- Will still report on a possible
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
182 # SNP even if missed with above statements.
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
183 # Do not coerce record.REF[0] to a string!
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
184 sample_dict.update({record_position: record.REF[0]})
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
185 # Merge dictionaries and order
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
186 merge_dict = {}
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
187 merge_dict.update(positions_dict)
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
188 merge_dict.update(sample_dict)
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
189 sample_df = pandas.DataFrame(merge_dict, index=[file_name_base])
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
190 return sample_df, file_name_base, sample_map_qualities
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
191
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
def df_to_fasta(self, parsimonious_df, group):
    # Generate SNP alignment file from
    # the parsimonious_df data frame.
    #
    # The file is written to <self.output_snps_dir>/<group>.fasta, one
    # record per distinct row label: a ">label" header line followed by
    # the row's cells concatenated on a single line.  Nothing is written
    # when every cell in the frame is empty.
    #
    # Returns True if at least one cell has a non-zero length (and the
    # file was therefore written), otherwise False.
    snps_file = os.path.join(self.output_snps_dir, "%s.fasta" % group)
    # Stop scanning as soon as the first non-empty cell is found.
    has_sequence_data = any(
        len(pos) > 0
        for _, row in parsimonious_df.iterrows()
        for pos in row
    )
    if has_sequence_data:
        # Row labels already written; only the first row for a given
        # label is emitted.
        written_names = set()
        with open(snps_file, 'w') as fh:
            for name, row in parsimonious_df.iterrows():
                if name in written_names:
                    continue
                written_names.add(name)
                print(f'>{name}', file=fh)
                print("".join(str(pos) for pos in row), file=fh)
    return has_sequence_data
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
213
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
def find_initial_positions(self, filename):
    # Find SNP positions in a vcf file.
    #
    # Returns a tuple of two dictionaries, both keyed by "CHROM:POS" with
    # record.REF as the value:
    #   found_positions     - records whose AC equals self.ac
    #   found_positions_mix - records whose AC equals 1
    # In both cases the record must have an ALT call, a single-base REF,
    # a QUAL above self.min_quality_score and a map quality above
    # self.min_mq.
    found_positions = {}
    found_positions_mix = {}
    # The context manager guarantees the vcf file handle is closed once
    # all records have been read (or if parsing fails part way through).
    with open(filename, 'r') as fh:
        for record in vcf.Reader(fh):
            qual_val = self.val_as_int(record.QUAL)
            absolute_position = "%s:%s" % (str(record.CHROM), str(record.POS))
            if str(record.ALT[0]) == "None":
                # No alternate allele was called at this position.
                continue
            mq_val = self.get_mq_val(record.INFO, filename)
            ac = record.INFO['AC'][0]
            if ac != self.ac and ac != 1:
                # Neither dictionary can accept this record.
                continue
            if len(record.REF) == 1 and qual_val > self.min_quality_score and mq_val > self.min_mq:
                # Both checks are needed since self.ac may itself be 1.
                if ac == self.ac:
                    found_positions[absolute_position] = record.REF
                if ac == 1:
                    found_positions_mix[absolute_position] = record.REF
    return found_positions, found_positions_mix
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
233
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
def gather_and_filter(self, prefilter_df, mq_averages, group_dir):
    # Group a data frame of SNPs.
    #
    # When self.input_excel is set, the positions returned by
    # get_position_list (for "all" and for group_dir) are dropped from
    # prefilter_df.  The result is passed through get_parsimonious_df
    # and, if it holds at least 4 samples, written as fasta via
    # df_to_fasta.  When the fasta has sequence data, mq_averages and the
    # parsimonious data frame are saved as json (orient='split').
    # Otherwise a message is appended to the summary.  Returns None.
    if self.input_excel is None:
        filtered_all_df = prefilter_df
    else:
        # Filter positions to be removed from all.
        # Only the sheet names are needed, so the workbook is closed as
        # soon as they have been read.
        with pandas.ExcelFile(self.input_excel) as xl:
            sheet_names = xl.sheet_names
        # Use the first column to filter "all" positions.
        exclusion_list_all = self.get_position_list(sheet_names, 0)
        exclusion_list_group = self.get_position_list(sheet_names, group_dir)
        exclusion_list = exclusion_list_all + exclusion_list_group
        # Filters for all applied.
        filtered_all_df = prefilter_df.drop(columns=exclusion_list, errors='ignore')
    json_snps_file = os.path.join(self.output_json_snps_dir, "%s.json" % group_dir)
    parsimonious_df = self.get_parsimonious_df(filtered_all_df)
    samples_number = parsimonious_df.shape[0]
    if samples_number < 4:
        msg = "<br/>Too few samples to build tree"
    elif self.df_to_fasta(parsimonious_df, group_dir):
        # Sufficient samples have been found
        # to build a phylogenetic tree.
        json_avg_mq_file = os.path.join(self.output_json_avg_mq_dir, "%s.json" % group_dir)
        mq_averages.to_json(json_avg_mq_file, orient='split')
        parsimonious_df.to_json(json_snps_file, orient='split')
        return
    else:
        msg = "<br/>No sequence data"
    # Report why no outputs were produced for this group.
    if group_dir is not None:
        msg = "%s for group: %s" % (msg, group_dir)
    self.append_to_summary("%s<br/>\n" % msg)
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
270
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
def get_sample_name(self, file_path):
    # Return the sample part of a file name: the base name of
    # file_path with its last extension removed.
    name = os.path.basename(file_path)
    if name.find(".") <= 0:
        # No dot at all, or the first dot is the leading character:
        # the name is returned untouched.
        return name
    stem, _extension = os.path.splitext(name)
    return stem
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
278
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
def get_mq_val(self, record_info, filename):
    # Get the MQ (gatk) or MQM (freebayes) value
    # from the record.INFO component of the vcf file.
    #
    # 'MQM' is tried first, then 'MQ'; the first value that can be
    # looked up and passed through self.return_val is returned.  If
    # neither key works the program exits with a message naming the
    # offending INFO content and the file.
    for info_key in ('MQM', 'MQ'):
        try:
            return self.return_val(record_info[info_key])
        except Exception:
            # Any failure for this key moves on to the next candidate.
            continue
    sys.exit("Invalid or unsupported vcf header %s in file: %s\n" % (str(record_info), filename))
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
292
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
def get_parsimonious_df(self, filtered_all_df):
    # Get the parsimonious SNPs data frame from a data frame of
    # filtered SNPs: keep only the columns in which at least one
    # sample row differs from the first sample row, then append
    # the 'root' (reference) row restricted to those columns.
    #
    # filtered_all_df: data frame whose index may contain a
    #     'root' row holding the reference values.
    # Returns the reduced data frame with 'root' as its last row,
    # or without a reference row when the input has no 'root' row.
    if 'root' in filtered_all_df.index:
        ref_series = filtered_all_df.loc['root']
        # In all_vcf root needs to be removed.
        filtered_all_df = filtered_all_df.drop(['root'])
    else:
        # A missing 'root' row used to leave ref_series unbound,
        # which made the method fail with UnboundLocalError below.
        ref_series = None
    # True for every column that is not identical across all rows.
    informative = (filtered_all_df != filtered_all_df.iloc[0]).any()
    parse_df = filtered_all_df.loc[:, informative]
    if ref_series is None:
        return parse_df
    ref_df = ref_series.to_frame().T
    # join='inner' trims the reference row to the informative columns.
    return pandas.concat([parse_df, ref_df], join='inner')
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
309
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
def get_position_list(self, sheet_names, group):
    # Build the list of positions found in one column of the
    # excel file at self.input_excel.  The column is selected by
    # the name in group, with the second row of the sheet used as
    # the header.  A cell holding a single position is kept as
    # is; a cell holding "chrom:start-end" is expanded into one
    # "chrom:position" entry per position, both ends included.
    # Any ValueError raised while reading or parsing yields an
    # empty list.  The sheet_names argument is not used here.
    positions = []
    try:
        column_df = pandas.read_excel(self.input_excel, header=1, usecols=[group])
        for row in column_df.values:
            entry = str(row[0])
            if "-" not in entry.split(":")[-1]:
                # No range after the last colon, so a single position.
                positions.append(entry)
                continue
            try:
                chrom, sequence_range = entry.split(":")
            except Exception as e:
                sys.exit(str(e))
            bounds = sequence_range.split("-")
            # Thousands separators are tolerated in the range limits.
            first = int(bounds[0].replace(',', ''))
            last = int(bounds[1].replace(',', ''))
            positions.extend(chrom + ":" + str(position) for position in range(first, last + 1))
        return positions
    except ValueError:
        return []
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
330
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
def get_snps(self, task_queue, timeout):
    # Worker loop: take group directories from task_queue until
    # no item arrives within timeout seconds.  For each directory
    # the SNPs of all files in it are accumulated into a data
    # frame and handed to self.gather_and_filter() together with
    # the per-position mapping quality averages.
    while True:
        try:
            group_dir = task_queue.get(block=True, timeout=timeout)
        except queue.Empty:
            break
        # Absolute paths of every entry in the group directory.
        group_files = [
            os.path.abspath(os.path.join(group_dir, entry))
            for entry in os.listdir(os.path.abspath(group_dir))
        ]
        # First pass: collect the positions found in any file.
        positions_dict = {}
        for vcf_path in group_files:
            found_positions, _ = self.find_initial_positions(vcf_path)
            positions_dict.update(found_positions)
        # Sort the positions so the reference row lines up with
        # the ordering of the individual samples.
        # all_positions is abs_pos:REF
        self.all_positions = OrderedDict(sorted(positions_dict.items()))
        ref_positions_df = pandas.DataFrame(self.all_positions, index=['root'])
        # Second pass: one data frame and one set of mapping
        # qualities per file.
        sample_frames = []
        all_map_qualities = {}
        for vcf_path in group_files:
            sample_df, file_name_base, sample_map_qualities = self.decide_snps(vcf_path)
            sample_frames.append(sample_df)
            all_map_qualities[file_name_base] = sample_map_qualities
        all_sample_df = pandas.concat(sample_frames)
        # Put the reference on the top row, keeping only the
        # columns present in both frames.
        prefilter_df = pandas.concat([ref_positions_df, all_sample_df], join='inner')
        mq_averages = pandas.DataFrame.from_dict(all_map_qualities).mean(axis=1).astype(int)
        self.gather_and_filter(prefilter_df, mq_averages, group_dir)
        task_queue.task_done()
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
368
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
def group_vcfs(self, vcf_files):
    # Parse the first sheet of the excel file at self.input_excel
    # to get the defining SNPs, pass every vcf file through
    # self.bin_input_files() to build the sample-to-groups
    # dictionary, and append a grouping table to the summary.
    #
    # vcf_files: iterable of vcf file paths to be grouped.
    # Nothing is returned; results are written via
    # self.append_to_summary() and whatever self.bin_input_files() does.
    #
    # Open the workbook once and close it when done; it used to
    # be opened twice with the first handle never closed.
    with pandas.ExcelFile(self.input_excel) as xl:
        ws = pandas.read_excel(xl, sheet_name=xl.sheet_names[0])
    # The first data row maps each column label to a group name.
    # Series.items() is used because Series.iteritems() was
    # removed in pandas 2.0.
    defsnp_iterator = iter(ws.iloc[0].items())
    # The first column is skipped.
    next(defsnp_iterator)
    defining_snps = {}
    inverted_defining_snps = {}
    for abs_pos, group in defsnp_iterator:
        if '!' in abs_pos:
            # A '!' in the label marks an inverted defining SNP;
            # it is stored with the marker stripped.
            inverted_defining_snps[abs_pos.replace('!', '')] = group
        else:
            defining_snps[abs_pos] = group
    samples_groups_dict = {}
    for vcf_file in vcf_files:
        found_positions, found_positions_mix = self.find_initial_positions(vcf_file)
        samples_groups_dict = self.bin_input_files(vcf_file, samples_groups_dict, defining_snps, inverted_defining_snps, found_positions, found_positions_mix)
    # Output summary grouping table.
    self.append_to_summary('<br/>')
    self.append_to_summary('<b>Groupings with %d listed:</b><br/>\n' % len(samples_groups_dict))
    # NOTE(review): "cellspaging" looks like a typo for "cellspacing";
    # left unchanged because it is part of the emitted output.
    self.append_to_summary('<table cellpadding="5" cellspaging="5" border="1">\n')
    for key, value in samples_groups_dict.items():
        self.append_to_summary('<tr align="left"><th>Sample Name</th>\n')
        self.append_to_summary('<td>%s</td>' % key)
        for group in value:
            self.append_to_summary('<td>%s</td>\n' % group)
        self.append_to_summary('</tr>\n')
    self.append_to_summary('</table><br/>\n')
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
400
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
def initiate_summary(self, output_summary):
    # Start the html summary: emit the opening markup followed by
    # the start time, the number of VCF inputs, the reference and
    # the all-isolates setting, all via self.append_to_summary().
    # The output_summary argument is not used here.
    for markup in ('<html>\n', '<head></head>\n', '<body style=\"font-size:12px;">'):
        self.append_to_summary(markup)
    emit = self.append_to_summary
    emit("<b>Time started:</b> %s<br/>" % get_time_stamp())
    emit("<b>Number of VCF inputs:</b> %d<br/>" % self.num_files)
    emit("<b>Reference:</b> %s<br/>" % self.dbkey)
    emit("<b>All isolates:</b> %s<br/>" % str(self.all_isolates))
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
410
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
def return_val(self, val, index=0):
    # Unwrap a value that may arrive either bare or inside a
    # list: a list yields its element at position index (the
    # first one by default), anything else is returned unchanged.
    return val[index] if isinstance(val, list) else val
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
416
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
def val_as_int(self, val):
    # Convert val to an integer, falling back to 0 when int()
    # rejects its type (for example when val is None).
    try:
        converted = int(val)
    except TypeError:
        converted = 0
    return converted
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
424
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
425
e3016c6c5994 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 95b221f68d19702681babd765c67caeeb24e7f1d"
iuc
parents:
diff changeset
if __name__ == '__main__':
    # Script entry point: parse the command line, stage the input vcf
    # files into per-group directories, process those directories with
    # a set of worker processes and finally write the html summary.

    parser = argparse.ArgumentParser()

    parser.add_argument('--ac', action='store', dest='ac', type=int, help='Allele count value')
    parser.add_argument('--all_isolates', action='store_true', dest='all_isolates', help='Create table with all isolates')
    parser.add_argument('--input_excel', action='store', dest='input_excel', required=False, default=None, help='Optional Excel filter file')
    parser.add_argument('--input_vcf_dir', action='store', dest='input_vcf_dir', help='Input vcf directory')
    parser.add_argument('--min_mq', action='store', dest='min_mq', type=int, help='Minimum map quality value')
    parser.add_argument('--min_quality_score', action='store', dest='min_quality_score', type=int, help='Minimum quality score value')
    parser.add_argument('--output_json_avg_mq_dir', action='store', dest='output_json_avg_mq_dir', help='Output json average mq directory')
    parser.add_argument('--output_json_snps_dir', action='store', dest='output_json_snps_dir', help='Output json snps directory')
    parser.add_argument('--output_snps_dir', action='store', dest='output_snps_dir', help='Output snps directory')
    parser.add_argument('--output_summary', action='store', dest='output_summary', help='Output summary html file')
    parser.add_argument('--processes', action='store', dest='processes', type=int, help='Configured processes for job')
    parser.add_argument('--quality_score_n_threshold', action='store', dest='quality_score_n_threshold', type=int, help='Minimum quality score N value for alleles')
    parser.add_argument('--dbkey', action='store', dest='dbkey', help='Galaxy genome build dbkey')

    args = parser.parse_args()

    # Build the list of all input zero coverage vcf
    # files, both the samples and the "database".
    vcf_files = [
        os.path.abspath(os.path.join(args.input_vcf_dir, file_name))
        for file_name in os.listdir(args.input_vcf_dir)
    ]

    multiprocessing.set_start_method('spawn')
    queue1 = multiprocessing.JoinableQueue()
    num_files = len(vcf_files)
    # Set a timeout for get()s in the queue.
    timeout = 0.05

    # Initialize the snp_finder object.
    snp_finder = SnpFinder(num_files, args.dbkey, args.input_excel, args.all_isolates, args.ac, args.min_mq, args.quality_score_n_threshold, args.min_quality_score, args.input_vcf_dir, args.output_json_avg_mq_dir, args.output_json_snps_dir, args.output_snps_dir, args.output_summary)

    # Define and make the set of directories into which the input_zc_vcf
    # files will be placed.  Selected input values (e.g., the use of
    # an Excel file for grouping and filtering, creating a group with
    # all isolates) are used to define the directories.
    vcf_dirs = []
    if args.input_excel is None:
        vcf_dirs = setup_all_vcfs(vcf_files, vcf_dirs)
    else:
        if args.all_isolates:
            vcf_dirs = setup_all_vcfs(vcf_files, vcf_dirs)
        # Parse the Excel file to determine groups for filtering.
        snp_finder.group_vcfs(vcf_files)
        # Append the list of group directories created by
        # the above call to the set of directories containing
        # vcf files for analysis.
        group_dirs = [d for d in os.listdir(os.getcwd()) if os.path.isdir(d) and d in snp_finder.groups]
        vcf_dirs.extend(group_dirs)

    # Populate the queue for job splitting.
    for vcf_dir in vcf_dirs:
        queue1.put(vcf_dir)

    # Always start at least one worker.  A missing --processes value
    # (None) would make range() raise TypeError, and a value <= 0 would
    # start no workers at all, leaving queue1.join() below blocked
    # forever on a non-empty queue (presumably get_snps is what marks
    # queued items as done - confirm against its implementation).
    num_processes = max(1, args.processes or 1)

    # Complete the get_snps task.
    processes = [multiprocessing.Process(target=snp_finder.get_snps, args=(queue1, timeout, )) for _ in range(num_processes)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    queue1.join()

    # Finish summary log.
    snp_finder.append_to_summary("<br/><b>Time finished:</b> %s<br/>\n" % get_time_stamp())
    total_run_time = datetime.now() - snp_finder.timer_start
    snp_finder.append_to_summary("<br/><b>Total run time:</b> %s<br/>\n" % str(total_run_time))
    snp_finder.append_to_summary('</body>\n</html>\n')
    with open(args.output_summary, "w") as fh:
        fh.write("%s" % snp_finder.summary_str)