Mercurial > repos > davidvanzessen > demultiplex_emc
changeset 4:146bbd9d58f6 draft default tip
Added tests
author | davidvanzessen |
---|---|
date | Mon, 12 Nov 2018 09:33:34 -0500 |
parents | b6d63b9efb8f |
children | |
files | demultiplex.py demultiplex.xml test-data/input.fasta test-data/input.fastq test-data/mapping.txt test-data/sequence1.fasta test-data/sequence1.fastq test-data/sequence2.fasta test-data/sequence2.fastq |
diffstat | 9 files changed, 142 insertions(+), 25 deletions(-) [+] |
line wrap: on
line diff
--- a/demultiplex.py Fri Nov 09 05:52:15 2018 -0500 +++ b/demultiplex.py Mon Nov 12 09:33:34 2018 -0500 @@ -29,17 +29,17 @@ def search_barcode_in_first_half(sequence, barcode): if type(sequence) is Seq: - sequence = str(sequence) + sequence = str(sequence).lower() elif type(sequence) is SeqRecord: - sequence = str(sequence.seq) + sequence = str(sequence.seq).lower() return sequence.find(barcode, 0, int(len(sequence) / 2)) def search_barcode_in_second_half(sequence, barcode): if type(sequence) is Seq: - sequence = str(sequence) + sequence = str(sequence).lower() elif type(sequence) is SeqRecord: - sequence = str(sequence.seq) + sequence = str(sequence.seq).lower() return sequence.find(barcode, int(len(sequence) / 2)) @@ -65,10 +65,10 @@ def main(): parser = argparse.ArgumentParser() - parser.add_argument("-i", "--input", help="The input file") + parser.add_argument("-i", "--input", help="The input file", required=True) parser.add_argument("-f", "--format", help="The format of the input file (fastq/fasta)", default="auto", choices=["fasta", "fastq", "auto"]) - parser.add_argument("-o", "--output-dir", help="The output dir") - parser.add_argument("-m", "--mapping-file", help="A tab seperated file containing two columns, ID and barcode (no header)") + parser.add_argument("-o", "--output-dir", help="The output dir", required=True) + parser.add_argument("-m", "--mapping-file", help="A tab seperated file containing two columns, ID and barcode (no header)", required=True) args = parser.parse_args() @@ -121,9 +121,11 @@ ID = ID_barcode["ID"] barcode = ID_barcode["barcode"] + logging.info("{0}:\t\t{1}".format(ID, barcode)) + output_file_path = os.path.join( output_dir, - "{0}_{1}.{2}".format(input_basename_no_ext, ID, input_format) + "{0}.{1}".format(ID, input_format) ) if ID not in ID_file_handle_dict: @@ -134,15 +136,15 @@ barcode_data_dict[ID] += [BarcodeData( ID=ID, - barcode=barcode, - barcode_reverse=str(Seq(barcode, generic_dna).reverse_complement()), + barcode=barcode.lower(), + barcode_reverse=str(Seq(barcode, generic_dna).reverse_complement()).lower(), output_file_path=output_file_path, output_file_handle=ID_file_handle )] discarded_output_file_path = os.path.join( output_dir, - "{0}_{1}.{2}".format(basename_input_file_path, "discarded", input_format) + "{0}.{1}".format("discarded", input_format) ) total_sequences = 0
--- a/demultiplex.xml Fri Nov 09 05:52:15 2018 -0500 +++ b/demultiplex.xml Mon Nov 12 09:33:34 2018 -0500 @@ -1,9 +1,10 @@ <tool id="demultiplex-emc" name="Demultiplex" version="1.0.0"> + <description></description> <requirements> <requirement type="package" version="3.7.0">python</requirement> <requirement type="package" version="1.72">biopython</requirement> </requirements> - <description></description> + <command> mkdir outputs; python3 $__tool_directory__/demultiplex.py @@ -18,29 +19,39 @@ </inputs> <outputs> <!--<data name="debug" format="txt" label="debug"/>--> - <collection name='demultiplex_out' format_source='input' type='list'> - <discover_datasets pattern="__name_and_ext__" directory="outputs" format_source='input'/> + <collection name='demultiplex_out' format_source='input' type="list"> + <discover_datasets pattern="__designation_and_ext__" directory="outputs"/> + <!--<discover_datasets pattern="(?P<designation>.+)\.(?P<ext>.+)" directory="outputs"/>--> </collection> </outputs> <tests> - <!-- <test> - <param name="input1" value="1.bed"/> - <param name="input2" value="2.bed"/> - <output name="out_file1" file="cat_wrapper_out1.bed"/> + <param name="input" value="input.fastq"/> + <param name="mapping" value="mapping.txt"/> + <output_collection name="demultiplex_out" type="list"> + <element name="sequence1" file="sequence1.fastq"/> + <element name="sequence2" file="sequence2.fastq"/> + </output_collection> </test> - TODO: if possible, enhance the underlying test code to handle this test - the problem is multiple params with the same name "input2" <test> - <param name="input1" value="1.bed"/> - <param name="input2" value="2.bed"/> - <param name="input2" value="3.bed"/> - <output name="out_file1" file="cat_wrapper_out2.bed"/> + <param name="input" value="input.fasta"/> + <param name="mapping" value="mapping.txt"/> + <output_collection name="demultiplex_out" type="list"> + <element name="sequence1" file="sequence1.fasta"/> + <element name="sequence2" file="sequence2.fasta"/> + </output_collection> </test> - --> </tests> <help> There is no help </help> + <citations> + <citation type="bibtex">@misc{Demultplex-EMC, + author = {Erasmus MC}, + title = {Demultiplex-EMC}, + year = {2018}, + howpublished = {https://github.com/ErasmusMC-Bioinformatics/Demultiplex} + }</citation> + </citations> </tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/input.fasta Mon Nov 12 09:33:34 2018 -0500 @@ -0,0 +1,24 @@ +>sequence1_upper +GTACAACAATATTTGTTAGTCACCTTTGGGTCACGATCTCCCACCTTACTGGAATTTAGTCCCTGCTATAATTTGCCTTG +CATATAAGTTGCGTTACTTCAGCGTCCTAACCGCACCCTTAGCACGAAGACAGATTTGTTCATTCCCATACTCCGGCGTT +GGCAGGGGGTTCGCATGTCCCACGTGAAACGTTGCTAAAC +>sequence1_lower +gtacaacaatatttgttagtcacctttgggtcacgatctcccaccttactggaatttagtccctgctataatttgccttg +catataagttgcgttacttcagcgtcctaaccgcacccttagcacgaagacagatttgttcattcccatactccggcgtt +ggcagggggttcgcatgtcccacgtgaaacgttgctaaac +>sequence1_mix +gTACaacAaTATTTGtTaGtCAccttTgGgTCACGATCtCcCaccttACtggAAtTTaGTcCCTGCTATAAtTtgCcttG +CAtATAaGtTgcgttaCTtCaGCgtccTAaCcgcAccCTTagCaCgaAGacaGaTttGTTCATtCccATACTCcggCgtT +ggCagGGGgtTCgCatgtCccACGtGaAACgTTgCtAAac +>sequence2_upper +CCTCAGGTTTCTGAGCGACAAAAGCTTTAAACGGGAGTTCGCGCTCATAACTTGGTCCGAATGCGGGTTCTTGCATCGTT +CGACTGAGTTTGTTTCATGTAGAACGGGCGCAAAGTATACTTAGTTCAATCTTCAATACCTCGTATCATTGTACACCTGC +CGGTCACCACCCAACGATGTGGGGACGGCGTTGCAACTTC +>sequence2_lower +cctcaggtttctgagcgacaaaagctttaaacgggagttcgcgctcataacttggtccgaatgcgggttcttgcatcgtt +cgactgagtttgtttcatgtagaacgggcgcaaagtatacttagttcaatcttcaatacctcgtatcattgtacacctgc +cggtcaccacccaacgatgtggggacggcgttgcaacttc +>sequence1_mix +CctCaGgTTTctGaGCGAcAAAagCTttAAaCgGGaGtTCgcGCtcAtAaCTtggTcCGAATgcgGGtTcTTgCAtCGtt +CgaCtgaGTTTgtttCatGTAgAacGGGCgCAAagTatACttaGtTCaATCTtCaatACCtcgtAtcATTgTACaCCtGC +CGgTcAccaCcCaAcgATgTGGggACggCGtTgCAAcTTC \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/input.fastq Mon Nov 12 09:33:34 2018 -0500 @@ -0,0 +1,24 @@ +@sequence1_upper +GTACAACAATATTTGTTAGTCACCTTTGGGTCACGATCTCCCACCTTACTGGAATTTAGTCCCTGCTATAATTTGCCTTGCATATAAGTTGCGTTACTTCAGCGTCCTAACCGCACCCTTAGCACGAAGACAGATTTGTTCATTCCCATACTCCGGCGTTGGCAGGGGGTTCGCATGTCCCACGTGAAACGTTGCTAAAC ++ +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +@sequence1_lower +gtacaacaatatttgttagtcacctttgggtcacgatctcccaccttactggaatttagtccctgctataatttgccttgcatataagttgcgttacttcagcgtcctaaccgcacccttagcacgaagacagatttgttcattcccatactccggcgttggcagggggttcgcatgtcccacgtgaaacgttgctaaac ++ +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +@sequence1_mix +gTACaacAaTATTTGtTaGtCAccttTgGgTCACGATCtCcCaccttACtggAAtTTaGTcCCTGCTATAAtTtgCcttGCAtATAaGtTgcgttaCTtCaGCgtccTAaCcgcAccCTTagCaCgaAGacaGaTttGTTCATtCccATACTCcggCgtTggCagGGGgtTCgCatgtCccACGtGaAACgTTgCtAAac ++ +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +@sequence2_upper +CCTCAGGTTTCTGAGCGACAAAAGCTTTAAACGGGAGTTCGCGCTCATAACTTGGTCCGAATGCGGGTTCTTGCATCGTTCGACTGAGTTTGTTTCATGTAGAACGGGCGCAAAGTATACTTAGTTCAATCTTCAATACCTCGTATCATTGTACACCTGCCGGTCACCACCCAACGATGTGGGGACGGCGTTGCAACTTC ++ +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +@sequence2_lower +cctcaggtttctgagcgacaaaagctttaaacgggagttcgcgctcataacttggtccgaatgcgggttcttgcatcgttcgactgagtttgtttcatgtagaacgggcgcaaagtatacttagttcaatcttcaatacctcgtatcattgtacacctgccggtcaccacccaacgatgtggggacggcgttgcaacttc ++ +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +@sequence1_mix +CctCaGgTTTctGaGCGAcAAAagCTttAAaCgGGaGtTCgcGCtcAtAaCTtggTcCGAATgcgGGtTcTTgCAtCGttCgaCtgaGTTTgtttCatGTAgAacGGGCgCAAagTatACttaGtTCaATCTtCaatACCtcgtAtcATTgTACaCCtGCCGgTcAccaCcCaAcgATgTGGggACggCGtTgCAAcTTC ++ +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mapping.txt Mon Nov 12 09:33:34 2018 -0500 @@ -0,0 +1,2 @@ +sequence1 CAATATTTGT +sequence2 tttctgagcg
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sequence1.fasta Mon Nov 12 09:33:34 2018 -0500 @@ -0,0 +1,15 @@ +>sequence1_upper +GTACAACAATATTTGTTAGTCACCTTTGGGTCACGATCTCCCACCTTACTGGAATTTAGT +CCCTGCTATAATTTGCCTTGCATATAAGTTGCGTTACTTCAGCGTCCTAACCGCACCCTT +AGCACGAAGACAGATTTGTTCATTCCCATACTCCGGCGTTGGCAGGGGGTTCGCATGTCC +CACGTGAAACGTTGCTAAAC +>sequence1_lower +gtacaacaatatttgttagtcacctttgggtcacgatctcccaccttactggaatttagt +ccctgctataatttgccttgcatataagttgcgttacttcagcgtcctaaccgcaccctt +agcacgaagacagatttgttcattcccatactccggcgttggcagggggttcgcatgtcc +cacgtgaaacgttgctaaac +>sequence1_mix +gTACaacAaTATTTGtTaGtCAccttTgGgTCACGATCtCcCaccttACtggAAtTTaGT +cCCTGCTATAAtTtgCcttGCAtATAaGtTgcgttaCTtCaGCgtccTAaCcgcAccCTT +agCaCgaAGacaGaTttGTTCATtCccATACTCcggCgtTggCagGGGgtTCgCatgtCc +cACGtGaAACgTTgCtAAac
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sequence1.fastq Mon Nov 12 09:33:34 2018 -0500 @@ -0,0 +1,12 @@ +@sequence1_upper +GTACAACAATATTTGTTAGTCACCTTTGGGTCACGATCTCCCACCTTACTGGAATTTAGTCCCTGCTATAATTTGCCTTGCATATAAGTTGCGTTACTTCAGCGTCCTAACCGCACCCTTAGCACGAAGACAGATTTGTTCATTCCCATACTCCGGCGTTGGCAGGGGGTTCGCATGTCCCACGTGAAACGTTGCTAAAC ++ +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +@sequence1_lower +gtacaacaatatttgttagtcacctttgggtcacgatctcccaccttactggaatttagtccctgctataatttgccttgcatataagttgcgttacttcagcgtcctaaccgcacccttagcacgaagacagatttgttcattcccatactccggcgttggcagggggttcgcatgtcccacgtgaaacgttgctaaac ++ +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +@sequence1_mix +gTACaacAaTATTTGtTaGtCAccttTgGgTCACGATCtCcCaccttACtggAAtTTaGTcCCTGCTATAAtTtgCcttGCAtATAaGtTgcgttaCTtCaGCgtccTAaCcgcAccCTTagCaCgaAGacaGaTttGTTCATtCccATACTCcggCgtTggCagGGGgtTCgCatgtCccACGtGaAACgTTgCtAAac ++ +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sequence2.fasta Mon Nov 12 09:33:34 2018 -0500 @@ -0,0 +1,15 @@ +>sequence2_upper +CCTCAGGTTTCTGAGCGACAAAAGCTTTAAACGGGAGTTCGCGCTCATAACTTGGTCCGA +ATGCGGGTTCTTGCATCGTTCGACTGAGTTTGTTTCATGTAGAACGGGCGCAAAGTATAC +TTAGTTCAATCTTCAATACCTCGTATCATTGTACACCTGCCGGTCACCACCCAACGATGT +GGGGACGGCGTTGCAACTTC +>sequence2_lower +cctcaggtttctgagcgacaaaagctttaaacgggagttcgcgctcataacttggtccga +atgcgggttcttgcatcgttcgactgagtttgtttcatgtagaacgggcgcaaagtatac +ttagttcaatcttcaatacctcgtatcattgtacacctgccggtcaccacccaacgatgt +ggggacggcgttgcaacttc +>sequence1_mix +CctCaGgTTTctGaGCGAcAAAagCTttAAaCgGGaGtTCgcGCtcAtAaCTtggTcCGA +ATgcgGGtTcTTgCAtCGttCgaCtgaGTTTgtttCatGTAgAacGGGCgCAAagTatAC +ttaGtTCaATCTtCaatACCtcgtAtcATTgTACaCCtGCCGgTcAccaCcCaAcgATgT +GGggACggCGtTgCAAcTTC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sequence2.fastq Mon Nov 12 09:33:34 2018 -0500 @@ -0,0 +1,12 @@ +@sequence2_upper +CCTCAGGTTTCTGAGCGACAAAAGCTTTAAACGGGAGTTCGCGCTCATAACTTGGTCCGAATGCGGGTTCTTGCATCGTTCGACTGAGTTTGTTTCATGTAGAACGGGCGCAAAGTATACTTAGTTCAATCTTCAATACCTCGTATCATTGTACACCTGCCGGTCACCACCCAACGATGTGGGGACGGCGTTGCAACTTC ++ +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +@sequence2_lower +cctcaggtttctgagcgacaaaagctttaaacgggagttcgcgctcataacttggtccgaatgcgggttcttgcatcgttcgactgagtttgtttcatgtagaacgggcgcaaagtatacttagttcaatcttcaatacctcgtatcattgtacacctgccggtcaccacccaacgatgtggggacggcgttgcaacttc ++ +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +@sequence1_mix +CctCaGgTTTctGaGCGAcAAAagCTttAAaCgGGaGtTCgcGCtcAtAaCTtggTcCGAATgcgGGtTcTTgCAtCGttCgaCtgaGTTTgtttCatGTAgAacGGGCgCAAagTatACttaGtTCaATCTtCaatACCtcgtAtcATTgTACaCCtGCCGgTcAccaCcCaAcgATgTGGggACggCGtTgCAAcTTC ++ +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa