comparison vsnp_get_snps.py @ 8:5e4595b9f63c draft

"planemo upload for repository https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/sequence_analysis/vsnp/vsnp_get_snps commit 7423e5bb852a786195c095b9f663aac0ec9c8fd9"
author greg
date Thu, 29 Jul 2021 12:50:01 +0000
parents 14285a94fb13
children 0fe292b20b9d
comparison of 7:2286f3a13e4d and 8:5e4595b9f63c (lines removed in the new revision are prefixed with "-", lines added with "+"; "..." marks elided unchanged code)
# Collect quality parsimonious SNPs from vcf files
# and output alignment files in fasta format.

import argparse
-import multiprocessing
import os
-import queue
import shutil
import sys
import time
from collections import OrderedDict
from datetime import datetime
...
import vcf


def get_time_stamp():
    return datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H-%M-%S')
-
-
-def set_num_cpus(num_files, processes):
-    num_cpus = int(multiprocessing.cpu_count())
-    if num_files < num_cpus and num_files < processes:
-        return num_files
-    if num_cpus < processes:
-        half_cpus = int(num_cpus / 2)
-        if num_files < half_cpus:
-            return num_files
-        return half_cpus
-    return processes


def setup_all_vcfs(vcf_files, vcf_dirs):
    # Create the all_vcfs directory and link
    # all input vcf files into it for processing.
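For reference, the deleted set_num_cpus() heuristic roughly chose the number of input files when that was the smallest quantity, half the machine's CPUs when more processes were requested than CPUs exist, and the requested process count otherwise. A minimal sketch that exercises that logic; cpu_count() is replaced by a num_cpus parameter purely so the example is deterministic, and the value 8 is an assumption for illustration, not something from the tool:

    def set_num_cpus(num_files, processes, num_cpus=8):
        # Same branching as the deleted function, with cpu_count() parameterized.
        if num_files < num_cpus and num_files < processes:
            return num_files
        if num_cpus < processes:
            half_cpus = int(num_cpus / 2)
            if num_files < half_cpus:
                return num_files
            return half_cpus
        return processes

    print(set_num_cpus(2, 8))    # 2: fewer files than CPUs and than requested processes
    print(set_num_cpus(20, 16))  # 4: request exceeds the 8 assumed CPUs, so half the CPUs
    print(set_num_cpus(20, 4))   # 4: otherwise the requested process count is used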
...
                exclusion_list.append(chrom + ":" + str(position))
            return exclusion_list
        except ValueError:
            return []

-    def get_snps(self, task_queue, timeout):
-        while True:
-            try:
-                group_dir = task_queue.get(block=True, timeout=timeout)
-            except queue.Empty:
-                break
-            # Parse all vcf files to accumulate
-            # the SNPs into a data frame.
-            positions_dict = {}
-            group_files = []
-            for file_name in os.listdir(os.path.abspath(group_dir)):
-                file_path = os.path.abspath(os.path.join(group_dir, file_name))
-                group_files.append(file_path)
-            for file_name in group_files:
-                found_positions, found_positions_mix = self.find_initial_positions(file_name)
-                positions_dict.update(found_positions)
-            # Order before adding to file to match
-            # with ordering of individual samples.
-            # all_positions is abs_pos:REF
-            self.all_positions = OrderedDict(sorted(positions_dict.items()))
-            ref_positions_df = pandas.DataFrame(self.all_positions, index=['root'])
-            all_map_qualities = {}
-            df_list = []
-            for file_name in group_files:
-                sample_df, file_name_base, sample_map_qualities = self.decide_snps(file_name)
-                df_list.append(sample_df)
-                all_map_qualities.update({file_name_base: sample_map_qualities})
-            all_sample_df = pandas.concat(df_list)
-            # All positions have now been selected for each sample,
-            # so select parsimony informative SNPs. This removes
-            # columns where all fields are the same.
-            # Add reference to top row.
-            prefilter_df = pandas.concat([ref_positions_df, all_sample_df], join='inner')
-            all_mq_df = pandas.DataFrame.from_dict(all_map_qualities)
-            mq_averages = all_mq_df.mean(axis=1).astype(int)
-            self.gather_and_filter(prefilter_df, mq_averages, group_dir)
-            task_queue.task_done()
+    def get_snps(self, group_dir):
+        # Parse all vcf files to accumulate
+        # the SNPs into a data frame.
+        positions_dict = {}
+        group_files = []
+        for file_name in os.listdir(os.path.abspath(group_dir)):
+            file_path = os.path.abspath(os.path.join(group_dir, file_name))
+            group_files.append(file_path)
+        for file_name in group_files:
+            found_positions, found_positions_mix = self.find_initial_positions(file_name)
+            positions_dict.update(found_positions)
+        # Order before adding to file to match
+        # with ordering of individual samples.
+        # all_positions is abs_pos:REF
+        self.all_positions = OrderedDict(sorted(positions_dict.items()))
+        ref_positions_df = pandas.DataFrame(self.all_positions, index=['root'])
+        all_map_qualities = {}
+        df_list = []
+        for file_name in group_files:
+            sample_df, file_name_base, sample_map_qualities = self.decide_snps(file_name)
+            df_list.append(sample_df)
+            all_map_qualities.update({file_name_base: sample_map_qualities})
+        all_sample_df = pandas.concat(df_list)
+        # All positions have now been selected for each sample,
+        # so select parsimony informative SNPs. This removes
+        # columns where all fields are the same.
+        # Add reference to top row.
+        prefilter_df = pandas.concat([ref_positions_df, all_sample_df], join='inner')
+        all_mq_df = pandas.DataFrame.from_dict(all_map_qualities)
+        mq_averages = all_mq_df.mean(axis=1).astype(int)
+        self.gather_and_filter(prefilter_df, mq_averages, group_dir)
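Two of the pandas calls at the tail of get_snps() are easy to misread, and they are identical in both revisions: pandas.concat(..., join='inner') keeps only the columns (positions) common to every frame, with the reference row on top, and all_mq_df.mean(axis=1).astype(int) averages map qualities per position across samples. A self-contained sketch with toy values; the position labels and quality numbers below are made up for illustration:

    import pandas

    # Stand-ins for ref_positions_df and a single per-sample frame.
    ref_positions_df = pandas.DataFrame({'chrom:10': ['A'], 'chrom:25': ['G']}, index=['root'])
    sample_df = pandas.DataFrame({'chrom:10': ['T'], 'chrom:99': ['C']}, index=['sample1'])

    # join='inner' keeps only columns present in every frame, with the
    # reference as the top row, mirroring the prefilter_df step above.
    prefilter_df = pandas.concat([ref_positions_df, sample_df], join='inner')
    print(prefilter_df)  # only chrom:10 survives, rows 'root' then 'sample1'

    # Per-position averaging of map qualities, mirroring the mq_averages step.
    all_mq_df = pandas.DataFrame({'sample1': {'chrom:10': 58.0}, 'sample2': {'chrom:10': 61.0}})
    print(all_mq_df.mean(axis=1).astype(int))  # chrom:10 -> 59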
    def group_vcfs(self, vcf_files):
        # Parse an excel file to produce a
        # grouping dictionary for SNPs.
        xl = pandas.ExcelFile(self.input_excel)
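group_vcfs() appears here only as unchanged context; the grouping logic itself sits outside the displayed hunk. For readers unfamiliar with the API it starts from, pandas.ExcelFile simply wraps a workbook so individual sheets can be parsed into data frames. A generic usage sketch, not the tool's actual parsing code; the file name is hypothetical and an Excel engine such as openpyxl must be installed:

    import pandas

    # Hypothetical workbook; not the tool's actual grouping file.
    xl = pandas.ExcelFile('groupings.xlsx')
    first_sheet = xl.parse(xl.sheet_names[0], header=None)
    print(xl.sheet_names)
    print(first_sheet.shape)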
...
# files, both the samples and the "database".
vcf_files = []
for file_name in os.listdir(args.input_vcf_dir):
    file_path = os.path.abspath(os.path.join(args.input_vcf_dir, file_name))
    vcf_files.append(file_path)
-
-multiprocessing.set_start_method('spawn')
-queue1 = multiprocessing.JoinableQueue()
num_files = len(vcf_files)
-cpus = set_num_cpus(num_files, args.processes)
-# Set a timeout for get()s in the queue.
-timeout = 0.05

# Initialize the snp_finder object.
snp_finder = SnpFinder(num_files, args.dbkey, args.input_excel, args.all_isolates, args.ac, args.min_mq, args.quality_score_n_threshold, args.min_quality_score, args.input_vcf_dir, args.output_json_avg_mq_dir, args.output_json_snps_dir, args.output_snps_dir, args.output_summary)

# Define and make the set of directories into which the input_zc_vcf
...
# the above call to the set of directories containing
# vcf files for analysis.
group_dirs = [d for d in os.listdir(os.getcwd()) if os.path.isdir(d) and d in snp_finder.groups]
vcf_dirs.extend(group_dirs)

-# Populate the queue for job splitting.
for vcf_dir in vcf_dirs:
-    queue1.put(vcf_dir)
+    snp_finder.get_snps(vcf_dir)
-
-# Complete the get_snps task.
-processes = [multiprocessing.Process(target=snp_finder.get_snps, args=(queue1, timeout, )) for _ in range(cpus)]
-for p in processes:
-    p.start()
-for p in processes:
-    p.join()
-queue1.join()

# Finish summary log.
snp_finder.append_to_summary("<br/><b>Time finished:</b> %s<br/>\n" % get_time_stamp())
total_run_time = datetime.now() - snp_finder.timer_start
snp_finder.append_to_summary("<br/><b>Total run time:</b> %s<br/>\n" % str(total_run_time))
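For readers tracing the machinery removed above: group directories were pushed onto a multiprocessing.JoinableQueue and a set of spawned worker processes drained it, each worker running the old two-argument get_snps() loop until a queue.Empty timeout. A stripped-down, self-contained sketch of that pattern with a generic worker function standing in for SnpFinder; the directory names and the worker count are illustrative only:

    import multiprocessing
    import queue

    def worker(task_queue, timeout):
        # Mirrors the old get_snps() loop: pull directories until the queue
        # is drained, then exit on queue.Empty.
        while True:
            try:
                group_dir = task_queue.get(block=True, timeout=timeout)
            except queue.Empty:
                break
            print('processing %s' % group_dir)  # stand-in for the real per-directory work
            task_queue.task_done()

    if __name__ == '__main__':
        multiprocessing.set_start_method('spawn')
        task_queue = multiprocessing.JoinableQueue()
        for vcf_dir in ['group1', 'group2', 'all_vcf']:  # hypothetical directory names
            task_queue.put(vcf_dir)
        processes = [multiprocessing.Process(target=worker, args=(task_queue, 0.05)) for _ in range(2)]
        for p in processes:
            p.start()
        for p in processes:
            p.join()
        task_queue.join()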