Mercurial > repos > greg > vsnp_get_snps
comparison vsnp_get_snps.py @ 9:0fe292b20b9d draft
"planemo upload for repository https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/sequence_analysis/vsnp/vsnp_get_snps commit 3b7fef2d17fec96647345e89c774d4af417d23d7"
author | greg |
---|---|
date | Thu, 29 Jul 2021 13:16:03 +0000 |
parents | 5e4595b9f63c |
children | be5875f29ea4 |
comparison
equal
deleted
inserted
replaced
8:5e4595b9f63c | 9:0fe292b20b9d |
---|---|
2 | 2 |
3 # Collect quality parsimonious SNPs from vcf files | 3 # Collect quality parsimonious SNPs from vcf files |
4 # and output alignment files in fasta format. | 4 # and output alignment files in fasta format. |
5 | 5 |
6 import argparse | 6 import argparse |
7 import multiprocessing | |
7 import os | 8 import os |
9 import queue | |
8 import shutil | 10 import shutil |
9 import sys | 11 import sys |
10 import time | 12 import time |
11 from collections import OrderedDict | 13 from collections import OrderedDict |
12 from datetime import datetime | 14 from datetime import datetime |
15 import vcf | 17 import vcf |
16 | 18 |
17 | 19 |
def get_time_stamp():
    """Return the current local time formatted as 'YYYY-MM-DD HH-MM-SS'."""
    # datetime.now() is the direct equivalent of the former
    # datetime.fromtimestamp(time.time()) round trip.
    return datetime.now().strftime('%Y-%m-%d %H-%M-%S')
22 | |
23 | |
def set_num_cpus(num_files, processes):
    """Return the number of worker processes to spawn.

    Never spawns more processes than there are input files.  When the
    host has fewer CPUs than the requested ``processes`` value, the
    count is capped at half of the available CPUs so the machine is
    not saturated.

    :param num_files: number of input vcf directories/files to process
    :param processes: requested maximum number of processes
    :return: number of worker processes to start
    """
    # cpu_count() already returns an int; no conversion needed.
    num_cpus = multiprocessing.cpu_count()
    if num_files < num_cpus and num_files < processes:
        return num_files
    if num_cpus < processes:
        # Host cannot satisfy the request; use at most half the CPUs.
        half_cpus = num_cpus // 2
        if num_files < half_cpus:
            return num_files
        return half_cpus
    return processes
20 | 34 |
21 | 35 |
22 def setup_all_vcfs(vcf_files, vcf_dirs): | 36 def setup_all_vcfs(vcf_files, vcf_dirs): |
23 # Create the all_vcfs directory and link | 37 # Create the all_vcfs directory and link |
24 # all input vcf files into it for processing. | 38 # all input vcf files into it for processing. |
324 exclusion_list.append(chrom + ":" + str(position)) | 338 exclusion_list.append(chrom + ":" + str(position)) |
325 return exclusion_list | 339 return exclusion_list |
326 except ValueError: | 340 except ValueError: |
327 return [] | 341 return [] |
328 | 342 |
def get_snps(self, task_queue, timeout):
    """Worker loop: consume group directories from ``task_queue`` and
    build a SNP alignment for each one.

    The loop ends once the queue stays empty for ``timeout`` seconds.
    """
    while True:
        try:
            group_dir = task_queue.get(block=True, timeout=timeout)
        except queue.Empty:
            # No work arrived within the timeout window; stop this worker.
            break
        # Absolute paths of every vcf file in this group directory.
        group_files = [os.path.abspath(os.path.join(group_dir, entry))
                       for entry in os.listdir(os.path.abspath(group_dir))]
        # Parse all vcf files, accumulating the SNP positions.
        accumulated_positions = {}
        for vcf_path in group_files:
            found_positions, found_positions_mix = self.find_initial_positions(vcf_path)
            accumulated_positions.update(found_positions)
        # Order before adding to file to match the ordering of the
        # individual samples.  all_positions maps abs_pos -> REF.
        self.all_positions = OrderedDict(sorted(accumulated_positions.items()))
        ref_positions_df = pandas.DataFrame(self.all_positions, index=['root'])
        all_map_qualities = {}
        sample_frames = []
        for vcf_path in group_files:
            sample_df, file_name_base, sample_map_qualities = self.decide_snps(vcf_path)
            sample_frames.append(sample_df)
            all_map_qualities[file_name_base] = sample_map_qualities
        all_sample_df = pandas.concat(sample_frames)
        # All positions have now been selected for each sample, so keep
        # parsimony-informative SNPs (inner join drops columns where all
        # fields are the same) with the reference row on top.
        prefilter_df = pandas.concat([ref_positions_df, all_sample_df], join='inner')
        all_mq_df = pandas.DataFrame.from_dict(all_map_qualities)
        mq_averages = all_mq_df.mean(axis=1).astype(int)
        self.gather_and_filter(prefilter_df, mq_averages, group_dir)
        task_queue.task_done()
360 | 380 |
361 def group_vcfs(self, vcf_files): | 381 def group_vcfs(self, vcf_files): |
362 # Parse an excel file to produce a | 382 # Parse an excel file to produce a |
363 # grouping dictionary for SNPs. | 383 # grouping dictionary for SNPs. |
364 xl = pandas.ExcelFile(self.input_excel) | 384 xl = pandas.ExcelFile(self.input_excel) |
439 # files, both the samples and the "database". | 459 # files, both the samples and the "database". |
440 vcf_files = [] | 460 vcf_files = [] |
441 for file_name in os.listdir(args.input_vcf_dir): | 461 for file_name in os.listdir(args.input_vcf_dir): |
442 file_path = os.path.abspath(os.path.join(args.input_vcf_dir, file_name)) | 462 file_path = os.path.abspath(os.path.join(args.input_vcf_dir, file_name)) |
443 vcf_files.append(file_path) | 463 vcf_files.append(file_path) |
464 | |
465 multiprocessing.set_start_method('spawn') | |
466 queue1 = multiprocessing.JoinableQueue() | |
444 num_files = len(vcf_files) | 467 num_files = len(vcf_files) |
468 cpus = set_num_cpus(num_files, args.processes) | |
469 # Set a timeout for get()s in the queue. | |
470 timeout = 0.05 | |
445 | 471 |
446 # Initialize the snp_finder object. | 472 # Initialize the snp_finder object. |
447 snp_finder = SnpFinder(num_files, args.dbkey, args.input_excel, args.all_isolates, args.ac, args.min_mq, args.quality_score_n_threshold, args.min_quality_score, args.input_vcf_dir, args.output_json_avg_mq_dir, args.output_json_snps_dir, args.output_snps_dir, args.output_summary) | 473 snp_finder = SnpFinder(num_files, args.dbkey, args.input_excel, args.all_isolates, args.ac, args.min_mq, args.quality_score_n_threshold, args.min_quality_score, args.input_vcf_dir, args.output_json_avg_mq_dir, args.output_json_snps_dir, args.output_snps_dir, args.output_summary) |
448 | 474 |
449 # Define and make the set of directories into which the input_zc_vcf | 475 # Define and make the set of directories into which the input_zc_vcf |
462 # the above call to the set of directories containing | 488 # the above call to the set of directories containing |
463 # vcf files for analysis. | 489 # vcf files for analysis. |
464 group_dirs = [d for d in os.listdir(os.getcwd()) if os.path.isdir(d) and d in snp_finder.groups] | 490 group_dirs = [d for d in os.listdir(os.getcwd()) if os.path.isdir(d) and d in snp_finder.groups] |
465 vcf_dirs.extend(group_dirs) | 491 vcf_dirs.extend(group_dirs) |
466 | 492 |
493 # Populate the queue for job splitting. | |
467 for vcf_dir in vcf_dirs: | 494 for vcf_dir in vcf_dirs: |
468 snp_finder.get_snps(vcf_dir) | 495 queue1.put(vcf_dir) |
496 | |
497 # Complete the get_snps task. | |
498 processes = [multiprocessing.Process(target=snp_finder.get_snps, args=(queue1, timeout, )) for _ in range(cpus)] | |
499 for p in processes: | |
500 p.start() | |
501 for p in processes: | |
502 p.join() | |
503 queue1.join() | |
469 | 504 |
470 # Finish summary log. | 505 # Finish summary log. |
471 snp_finder.append_to_summary("<br/><b>Time finished:</b> %s<br/>\n" % get_time_stamp()) | 506 snp_finder.append_to_summary("<br/><b>Time finished:</b> %s<br/>\n" % get_time_stamp()) |
472 total_run_time = datetime.now() - snp_finder.timer_start | 507 total_run_time = datetime.now() - snp_finder.timer_start |
473 snp_finder.append_to_summary("<br/><b>Total run time:</b> %s<br/>\n" % str(total_run_time)) | 508 snp_finder.append_to_summary("<br/><b>Total run time:</b> %s<br/>\n" % str(total_run_time)) |