Mercurial > repos > davidvanzessen > demultiplex_emc
changeset 0:36c79869620b draft
Uploaded
author | davidvanzessen |
---|---|
date | Fri, 09 Nov 2018 05:37:11 -0500 |
parents | |
children | 9cb2a61a5a1e |
files | demultiplex.py demultiplex.xml |
diffstat | 2 files changed, 233 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/demultiplex.py Fri Nov 09 05:37:11 2018 -0500 @@ -0,0 +1,187 @@ +import argparse +import csv +import logging +import os +import sys +from collections import defaultdict +from collections import namedtuple + +from Bio import SeqIO +from Bio.Alphabet import generic_dna +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord + + +def sniff_format(file_path): + """ + Try to guess the file format (fastq/fasta) by looking at the first character of the first line. + Should be '@' for fastq and '>' for fasta. + """ + with open(file_path, 'rU') as file_handle: + for line in file_handle: + if line.startswith("@"): + return "fastq" + if line.startswith(">"): + return "fasta" + break + return None + + +def search_barcode_in_first_half(sequence, barcode): + if type(sequence) is Seq: + sequence = str(sequence) + elif type(sequence) is SeqRecord: + sequence = str(sequence.seq) + return sequence.find(barcode, 0, int(len(sequence) / 2)) + + +def search_barcode_in_second_half(sequence, barcode): + if type(sequence) is Seq: + sequence = str(sequence) + elif type(sequence) is SeqRecord: + sequence = str(sequence.seq) + return sequence.find(barcode, int(len(sequence) / 2)) + + +def search_barcodes_in_sequence(barcode_datas, sequence): + for barcode_data in barcode_datas: + barcode_search_position = search_barcode_in_first_half(sequence, barcode_data.barcode) + if barcode_search_position != -1: + return barcode_search_position, barcode_data, False + barcode_search_position = search_barcode_in_second_half(sequence, barcode_data.barcode_reverse) + if barcode_search_position != -1: + return barcode_search_position, barcode_data, True + + barcode_search_position = search_barcode_in_first_half(sequence, barcode_data.barcode_reverse) + if barcode_search_position != -1: + return barcode_search_position, barcode_data, True + + barcode_search_position = search_barcode_in_second_half(sequence, barcode_data.barcode) + if barcode_search_position != -1: + return barcode_search_position, barcode_data, False + + return -1, None, None + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input", help="The input file") + parser.add_argument("-f", "--format", help="The format of the input file (fastq/fasta)", default="auto", choices=["fasta", "fastq", "auto"]) + parser.add_argument("-o", "--output-dir", help="The output dir") + parser.add_argument("-m", "--mapping-file", help="A tab seperated file containing two columns, ID and barcode (no header)") + + args = parser.parse_args() + + input_file_path = args.input + basename_input_file_path = os.path.basename(input_file_path) + input_basename_no_ext, input_extension = os.path.splitext(basename_input_file_path) + + input_format = args.format + + logging.basicConfig(stream=sys.stdout, level=logging.INFO) + + if input_format == "auto": + if input_extension in [".fasta", ".fa"]: + input_format = "fasta" + elif input_extension in [".fastq", ".fq"]: + input_format = "fastq" + else: + logging.info("Can't auto detect input format based on extension: {0}".format(input_extension)) + logging.info("Sniffing format...") + input_format = sniff_format(input_file_path) + if not input_format: + logging.error("Can't auto detect input format") + sys.exit(1) + logging.info("Sniffed '{0}' as format.".format(input_format)) + + output_dir = args.output_dir + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + with open(args.mapping_file, newline="") as handle: + ID_barcode_mapping = list(csv.DictReader(handle, fieldnames=['ID', 'barcode'], delimiter="\t")) + + logging.info("Input: {0}".format(input_file_path)) + logging.info("Format: {0}".format(input_format)) + logging.info("Output: {0}".format(output_dir)) + logging.info("Mapping: {0} ({1} mappings)".format(args.mapping_file, len(ID_barcode_mapping))) + + BarcodeData = namedtuple("BarcodeData", [ + "ID", + "barcode", + "barcode_reverse", + "output_file_path", + "output_file_handle" + ]) + + barcode_data_dict = defaultdict(list) + ID_file_handle_dict = {} + + for ID_barcode in ID_barcode_mapping: + ID = ID_barcode["ID"] + barcode = ID_barcode["barcode"] + + output_file_path = os.path.join( + output_dir, + "{0}_{1}.{2}".format(input_basename_no_ext, ID, input_format) + ) + + if ID not in ID_file_handle_dict: + ID_file_handle = open(output_file_path, 'w') + ID_file_handle_dict[ID] = ID_file_handle + + ID_file_handle = ID_file_handle_dict[ID] + + barcode_data_dict[ID] += [BarcodeData( + ID=ID, + barcode=barcode, + barcode_reverse=str(Seq(barcode, generic_dna).reverse_complement()), + output_file_path=output_file_path, + output_file_handle=ID_file_handle + )] + + discarded_output_file_path = os.path.join( + output_dir, + "{0}_{1}.{2}".format(basename_input_file_path, "discarded", input_format) + ) + + total_sequences = 0 + sequences_assigned_by_id = defaultdict(int) + + with open(input_file_path, 'rU') as input_file_handle, open(discarded_output_file_path, 'w') as discarded_output_handle: + for record in SeqIO.parse(input_file_handle, input_format): + total_sequences += 1 + for ID, barcode_datas in barcode_data_dict.items(): + barcode_position, barcode_data, reverse = search_barcodes_in_sequence(barcode_datas, record) + if barcode_position == -1: + continue + barcode = barcode_data.barcode if not reverse else barcode_data.barcode_reverse + sequences_assigned_by_id[barcode_data.ID] += 1 + logging.debug(str(record.seq)) + logging.debug(" " * barcode_position + barcode) + SeqIO.write(record, ID_file_handle_dict[ID], input_format) + break + else: # no match # TODO fuzzy match ? + SeqIO.write(record, discarded_output_handle, input_format) + if total_sequences % 10000 == 0: + assigned_sequences = sum(sequences_assigned_by_id.values()) + logging.info("Processed {0} sequences, assigned {1} ({2}%)".format( + total_sequences, + assigned_sequences, + round(assigned_sequences / total_sequences * 100) + )) + + for file_handle in ID_file_handle_dict.values(): + file_handle.close() + + assigned_sequences = sum(sequences_assigned_by_id.values()) + logging.info("Processed {0} sequences, assigned {1} ({2}%)".format( + total_sequences, + assigned_sequences, + round(assigned_sequences / total_sequences * 100) + )) + + +if __name__ == "__main__": + main() +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/demultiplex.xml Fri Nov 09 05:37:11 2018 -0500 @@ -0,0 +1,46 @@ +<tool id="demultiplex-emc" name="Demultiplex" version="1.0.0"> + <requirements> + <requirement type="package" version="3.7.0">python</requirement> + <requirement type="package" version="1.7.2">biopython</requirement> + </requirements> + <description></description> + <command> + mkdir outputs; + python3 $__tool_directory__/demultiplex.py + --input $input + --format auto + --output-dir ./outputs + --mapping-file $mapping + </command> + <inputs> + <param name="input" type="data" format='fasta,fastq' label="The input fasta/fastq"/> + <param name="mapping" type="data" format='tabular' label="The mapping file"/> + </inputs> + <outputs> + <data name="debug" format="txt" label="debug"/> + <collection name='demultiplex_out' format_source='input' type='list'> + <discover_datasets pattern="__name_and_ext__" directory="outputs" format_source='input'/> + </collection> + </outputs> + <tests> + <!-- + <test> + <param name="input1" value="1.bed"/> + <param name="input2" value="2.bed"/> + <output name="out_file1" file="cat_wrapper_out1.bed"/> + </test> + TODO: if possible, enhance the underlying test code to handle this test + the problem is multiple params with the same name "input2" + <test> + <param name="input1" value="1.bed"/> + <param name="input2" value="2.bed"/> + <param name="input2" value="3.bed"/> + <output name="out_file1" file="cat_wrapper_out2.bed"/> + </test> + --> + </tests> + <help> +There is no help + </help> +</tool> +