Mercurial > repos > davidvanzessen > demultiplex_emc
comparison demultiplex.py @ 4:146bbd9d58f6 draft default tip
Added tests
author | davidvanzessen |
---|---|
date | Mon, 12 Nov 2018 09:33:34 -0500 |
parents | b6d63b9efb8f |
children |
comparison
equal
deleted
inserted
replaced
3:b6d63b9efb8f | 4:146bbd9d58f6 |
---|---|
27 return None | 27 return None |
28 | 28 |
29 | 29 |
30 def search_barcode_in_first_half(sequence, barcode): | 30 def search_barcode_in_first_half(sequence, barcode): |
31 if type(sequence) is Seq: | 31 if type(sequence) is Seq: |
32 sequence = str(sequence) | 32 sequence = str(sequence).lower() |
33 elif type(sequence) is SeqRecord: | 33 elif type(sequence) is SeqRecord: |
34 sequence = str(sequence.seq) | 34 sequence = str(sequence.seq).lower() |
35 return sequence.find(barcode, 0, int(len(sequence) / 2)) | 35 return sequence.find(barcode, 0, int(len(sequence) / 2)) |
36 | 36 |
37 | 37 |
38 def search_barcode_in_second_half(sequence, barcode): | 38 def search_barcode_in_second_half(sequence, barcode): |
39 if type(sequence) is Seq: | 39 if type(sequence) is Seq: |
40 sequence = str(sequence) | 40 sequence = str(sequence).lower() |
41 elif type(sequence) is SeqRecord: | 41 elif type(sequence) is SeqRecord: |
42 sequence = str(sequence.seq) | 42 sequence = str(sequence.seq).lower() |
43 return sequence.find(barcode, int(len(sequence) / 2)) | 43 return sequence.find(barcode, int(len(sequence) / 2)) |
44 | 44 |
45 | 45 |
46 def search_barcodes_in_sequence(barcode_datas, sequence): | 46 def search_barcodes_in_sequence(barcode_datas, sequence): |
47 for barcode_data in barcode_datas: | 47 for barcode_data in barcode_datas: |
63 return -1, None, None | 63 return -1, None, None |
64 | 64 |
65 | 65 |
66 def main(): | 66 def main(): |
67 parser = argparse.ArgumentParser() | 67 parser = argparse.ArgumentParser() |
68 parser.add_argument("-i", "--input", help="The input file") | 68 parser.add_argument("-i", "--input", help="The input file", required=True) |
69 parser.add_argument("-f", "--format", help="The format of the input file (fastq/fasta)", default="auto", choices=["fasta", "fastq", "auto"]) | 69 parser.add_argument("-f", "--format", help="The format of the input file (fastq/fasta)", default="auto", choices=["fasta", "fastq", "auto"]) |
70 parser.add_argument("-o", "--output-dir", help="The output dir") | 70 parser.add_argument("-o", "--output-dir", help="The output dir", required=True) |
71 parser.add_argument("-m", "--mapping-file", help="A tab seperated file containing two columns, ID and barcode (no header)") | 71 parser.add_argument("-m", "--mapping-file", help="A tab seperated file containing two columns, ID and barcode (no header)", required=True) |
72 | 72 |
73 args = parser.parse_args() | 73 args = parser.parse_args() |
74 | 74 |
75 input_file_path = args.input | 75 input_file_path = args.input |
76 basename_input_file_path = os.path.basename(input_file_path) | 76 basename_input_file_path = os.path.basename(input_file_path) |
119 | 119 |
120 for ID_barcode in ID_barcode_mapping: | 120 for ID_barcode in ID_barcode_mapping: |
121 ID = ID_barcode["ID"] | 121 ID = ID_barcode["ID"] |
122 barcode = ID_barcode["barcode"] | 122 barcode = ID_barcode["barcode"] |
123 | 123 |
124 logging.info("{0}:\t\t{1}".format(ID, barcode)) | |
125 | |
124 output_file_path = os.path.join( | 126 output_file_path = os.path.join( |
125 output_dir, | 127 output_dir, |
126 "{0}_{1}.{2}".format(input_basename_no_ext, ID, input_format) | 128 "{0}.{1}".format(ID, input_format) |
127 ) | 129 ) |
128 | 130 |
129 if ID not in ID_file_handle_dict: | 131 if ID not in ID_file_handle_dict: |
130 ID_file_handle = open(output_file_path, 'w') | 132 ID_file_handle = open(output_file_path, 'w') |
131 ID_file_handle_dict[ID] = ID_file_handle | 133 ID_file_handle_dict[ID] = ID_file_handle |
132 | 134 |
133 ID_file_handle = ID_file_handle_dict[ID] | 135 ID_file_handle = ID_file_handle_dict[ID] |
134 | 136 |
135 barcode_data_dict[ID] += [BarcodeData( | 137 barcode_data_dict[ID] += [BarcodeData( |
136 ID=ID, | 138 ID=ID, |
137 barcode=barcode, | 139 barcode=barcode.lower(), |
138 barcode_reverse=str(Seq(barcode, generic_dna).reverse_complement()), | 140 barcode_reverse=str(Seq(barcode, generic_dna).reverse_complement()).lower(), |
139 output_file_path=output_file_path, | 141 output_file_path=output_file_path, |
140 output_file_handle=ID_file_handle | 142 output_file_handle=ID_file_handle |
141 )] | 143 )] |
142 | 144 |
143 discarded_output_file_path = os.path.join( | 145 discarded_output_file_path = os.path.join( |
144 output_dir, | 146 output_dir, |
145 "{0}_{1}.{2}".format(basename_input_file_path, "discarded", input_format) | 147 "{0}.{1}".format("discarded", input_format) |
146 ) | 148 ) |
147 | 149 |
148 total_sequences = 0 | 150 total_sequences = 0 |
149 sequences_assigned_by_id = defaultdict(int) | 151 sequences_assigned_by_id = defaultdict(int) |
150 | 152 |