comparison demultiplex.py @ 4:146bbd9d58f6 draft default tip

Added tests
author davidvanzessen
date Mon, 12 Nov 2018 09:33:34 -0500
parents b6d63b9efb8f
children
comparison
equal deleted inserted replaced
3:b6d63b9efb8f 4:146bbd9d58f6
27 return None 27 return None
28 28
29 29
30 def search_barcode_in_first_half(sequence, barcode): 30 def search_barcode_in_first_half(sequence, barcode):
31 if type(sequence) is Seq: 31 if type(sequence) is Seq:
32 sequence = str(sequence) 32 sequence = str(sequence).lower()
33 elif type(sequence) is SeqRecord: 33 elif type(sequence) is SeqRecord:
34 sequence = str(sequence.seq) 34 sequence = str(sequence.seq).lower()
35 return sequence.find(barcode, 0, int(len(sequence) / 2)) 35 return sequence.find(barcode, 0, int(len(sequence) / 2))
36 36
37 37
38 def search_barcode_in_second_half(sequence, barcode): 38 def search_barcode_in_second_half(sequence, barcode):
39 if type(sequence) is Seq: 39 if type(sequence) is Seq:
40 sequence = str(sequence) 40 sequence = str(sequence).lower()
41 elif type(sequence) is SeqRecord: 41 elif type(sequence) is SeqRecord:
42 sequence = str(sequence.seq) 42 sequence = str(sequence.seq).lower()
43 return sequence.find(barcode, int(len(sequence) / 2)) 43 return sequence.find(barcode, int(len(sequence) / 2))
44 44
45 45
46 def search_barcodes_in_sequence(barcode_datas, sequence): 46 def search_barcodes_in_sequence(barcode_datas, sequence):
47 for barcode_data in barcode_datas: 47 for barcode_data in barcode_datas:
63 return -1, None, None 63 return -1, None, None
64 64
65 65
66 def main(): 66 def main():
67 parser = argparse.ArgumentParser() 67 parser = argparse.ArgumentParser()
68 parser.add_argument("-i", "--input", help="The input file") 68 parser.add_argument("-i", "--input", help="The input file", required=True)
69 parser.add_argument("-f", "--format", help="The format of the input file (fastq/fasta)", default="auto", choices=["fasta", "fastq", "auto"]) 69 parser.add_argument("-f", "--format", help="The format of the input file (fastq/fasta)", default="auto", choices=["fasta", "fastq", "auto"])
70 parser.add_argument("-o", "--output-dir", help="The output dir") 70 parser.add_argument("-o", "--output-dir", help="The output dir", required=True)
71 parser.add_argument("-m", "--mapping-file", help="A tab seperated file containing two columns, ID and barcode (no header)") 71 parser.add_argument("-m", "--mapping-file", help="A tab seperated file containing two columns, ID and barcode (no header)", required=True)
72 72
73 args = parser.parse_args() 73 args = parser.parse_args()
74 74
75 input_file_path = args.input 75 input_file_path = args.input
76 basename_input_file_path = os.path.basename(input_file_path) 76 basename_input_file_path = os.path.basename(input_file_path)
119 119
120 for ID_barcode in ID_barcode_mapping: 120 for ID_barcode in ID_barcode_mapping:
121 ID = ID_barcode["ID"] 121 ID = ID_barcode["ID"]
122 barcode = ID_barcode["barcode"] 122 barcode = ID_barcode["barcode"]
123 123
124 logging.info("{0}:\t\t{1}".format(ID, barcode))
125
124 output_file_path = os.path.join( 126 output_file_path = os.path.join(
125 output_dir, 127 output_dir,
126 "{0}_{1}.{2}".format(input_basename_no_ext, ID, input_format) 128 "{0}.{1}".format(ID, input_format)
127 ) 129 )
128 130
129 if ID not in ID_file_handle_dict: 131 if ID not in ID_file_handle_dict:
130 ID_file_handle = open(output_file_path, 'w') 132 ID_file_handle = open(output_file_path, 'w')
131 ID_file_handle_dict[ID] = ID_file_handle 133 ID_file_handle_dict[ID] = ID_file_handle
132 134
133 ID_file_handle = ID_file_handle_dict[ID] 135 ID_file_handle = ID_file_handle_dict[ID]
134 136
135 barcode_data_dict[ID] += [BarcodeData( 137 barcode_data_dict[ID] += [BarcodeData(
136 ID=ID, 138 ID=ID,
137 barcode=barcode, 139 barcode=barcode.lower(),
138 barcode_reverse=str(Seq(barcode, generic_dna).reverse_complement()), 140 barcode_reverse=str(Seq(barcode, generic_dna).reverse_complement()).lower(),
139 output_file_path=output_file_path, 141 output_file_path=output_file_path,
140 output_file_handle=ID_file_handle 142 output_file_handle=ID_file_handle
141 )] 143 )]
142 144
143 discarded_output_file_path = os.path.join( 145 discarded_output_file_path = os.path.join(
144 output_dir, 146 output_dir,
145 "{0}_{1}.{2}".format(basename_input_file_path, "discarded", input_format) 147 "{0}.{1}".format("discarded", input_format)
146 ) 148 )
147 149
148 total_sequences = 0 150 total_sequences = 0
149 sequences_assigned_by_id = defaultdict(int) 151 sequences_assigned_by_id = defaultdict(int)
150 152