0
|
1 import argparse
|
|
2 import csv
|
|
3 import logging
|
|
4 import os
|
|
5 import sys
|
|
6 from collections import defaultdict
|
|
7 from collections import namedtuple
|
|
8
|
|
9 from Bio import SeqIO
|
|
10 from Bio.Alphabet import generic_dna
|
|
11 from Bio.Seq import Seq
|
|
12 from Bio.SeqRecord import SeqRecord
|
|
13
|
|
14
|
|
15 def sniff_format(file_path):
|
|
16 """
|
|
17 Try to guess the file format (fastq/fasta) by looking at the first character of the first line.
|
|
18 Should be '@' for fastq and '>' for fasta.
|
|
19 """
|
3
|
20 with open(file_path, 'r') as file_handle:
|
0
|
21 for line in file_handle:
|
|
22 if line.startswith("@"):
|
|
23 return "fastq"
|
|
24 if line.startswith(">"):
|
|
25 return "fasta"
|
|
26 break
|
|
27 return None
|
|
28
|
3
|
29
|
0
|
30 def search_barcode_in_first_half(sequence, barcode):
|
|
31 if type(sequence) is Seq:
|
|
32 sequence = str(sequence)
|
|
33 elif type(sequence) is SeqRecord:
|
|
34 sequence = str(sequence.seq)
|
|
35 return sequence.find(barcode, 0, int(len(sequence) / 2))
|
|
36
|
|
37
|
|
38 def search_barcode_in_second_half(sequence, barcode):
|
|
39 if type(sequence) is Seq:
|
|
40 sequence = str(sequence)
|
|
41 elif type(sequence) is SeqRecord:
|
|
42 sequence = str(sequence.seq)
|
|
43 return sequence.find(barcode, int(len(sequence) / 2))
|
|
44
|
|
45
|
|
46 def search_barcodes_in_sequence(barcode_datas, sequence):
|
|
47 for barcode_data in barcode_datas:
|
|
48 barcode_search_position = search_barcode_in_first_half(sequence, barcode_data.barcode)
|
|
49 if barcode_search_position != -1:
|
|
50 return barcode_search_position, barcode_data, False
|
|
51 barcode_search_position = search_barcode_in_second_half(sequence, barcode_data.barcode_reverse)
|
|
52 if barcode_search_position != -1:
|
|
53 return barcode_search_position, barcode_data, True
|
|
54
|
|
55 barcode_search_position = search_barcode_in_first_half(sequence, barcode_data.barcode_reverse)
|
|
56 if barcode_search_position != -1:
|
|
57 return barcode_search_position, barcode_data, True
|
|
58
|
|
59 barcode_search_position = search_barcode_in_second_half(sequence, barcode_data.barcode)
|
|
60 if barcode_search_position != -1:
|
|
61 return barcode_search_position, barcode_data, False
|
|
62
|
|
63 return -1, None, None
|
|
64
|
|
65
|
|
66 def main():
|
|
67 parser = argparse.ArgumentParser()
|
|
68 parser.add_argument("-i", "--input", help="The input file")
|
|
69 parser.add_argument("-f", "--format", help="The format of the input file (fastq/fasta)", default="auto", choices=["fasta", "fastq", "auto"])
|
|
70 parser.add_argument("-o", "--output-dir", help="The output dir")
|
|
71 parser.add_argument("-m", "--mapping-file", help="A tab seperated file containing two columns, ID and barcode (no header)")
|
|
72
|
|
73 args = parser.parse_args()
|
|
74
|
|
75 input_file_path = args.input
|
|
76 basename_input_file_path = os.path.basename(input_file_path)
|
|
77 input_basename_no_ext, input_extension = os.path.splitext(basename_input_file_path)
|
|
78
|
|
79 input_format = args.format
|
|
80
|
|
81 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
|
82
|
|
83 if input_format == "auto":
|
|
84 if input_extension in [".fasta", ".fa"]:
|
|
85 input_format = "fasta"
|
|
86 elif input_extension in [".fastq", ".fq"]:
|
|
87 input_format = "fastq"
|
|
88 else:
|
|
89 logging.info("Can't auto detect input format based on extension: {0}".format(input_extension))
|
|
90 logging.info("Sniffing format...")
|
|
91 input_format = sniff_format(input_file_path)
|
|
92 if not input_format:
|
|
93 logging.error("Can't auto detect input format")
|
|
94 sys.exit(1)
|
|
95 logging.info("Sniffed '{0}' as format.".format(input_format))
|
|
96
|
|
97 output_dir = args.output_dir
|
|
98 if not os.path.exists(output_dir):
|
|
99 os.makedirs(output_dir)
|
|
100
|
|
101 with open(args.mapping_file, newline="") as handle:
|
|
102 ID_barcode_mapping = list(csv.DictReader(handle, fieldnames=['ID', 'barcode'], delimiter="\t"))
|
|
103
|
|
104 logging.info("Input: {0}".format(input_file_path))
|
|
105 logging.info("Format: {0}".format(input_format))
|
|
106 logging.info("Output: {0}".format(output_dir))
|
|
107 logging.info("Mapping: {0} ({1} mappings)".format(args.mapping_file, len(ID_barcode_mapping)))
|
|
108
|
|
109 BarcodeData = namedtuple("BarcodeData", [
|
|
110 "ID",
|
|
111 "barcode",
|
|
112 "barcode_reverse",
|
|
113 "output_file_path",
|
|
114 "output_file_handle"
|
|
115 ])
|
|
116
|
|
117 barcode_data_dict = defaultdict(list)
|
|
118 ID_file_handle_dict = {}
|
|
119
|
|
120 for ID_barcode in ID_barcode_mapping:
|
|
121 ID = ID_barcode["ID"]
|
|
122 barcode = ID_barcode["barcode"]
|
|
123
|
|
124 output_file_path = os.path.join(
|
|
125 output_dir,
|
|
126 "{0}_{1}.{2}".format(input_basename_no_ext, ID, input_format)
|
|
127 )
|
|
128
|
|
129 if ID not in ID_file_handle_dict:
|
|
130 ID_file_handle = open(output_file_path, 'w')
|
|
131 ID_file_handle_dict[ID] = ID_file_handle
|
|
132
|
|
133 ID_file_handle = ID_file_handle_dict[ID]
|
|
134
|
|
135 barcode_data_dict[ID] += [BarcodeData(
|
|
136 ID=ID,
|
|
137 barcode=barcode,
|
|
138 barcode_reverse=str(Seq(barcode, generic_dna).reverse_complement()),
|
|
139 output_file_path=output_file_path,
|
|
140 output_file_handle=ID_file_handle
|
|
141 )]
|
|
142
|
|
143 discarded_output_file_path = os.path.join(
|
|
144 output_dir,
|
|
145 "{0}_{1}.{2}".format(basename_input_file_path, "discarded", input_format)
|
|
146 )
|
|
147
|
|
148 total_sequences = 0
|
|
149 sequences_assigned_by_id = defaultdict(int)
|
|
150
|
3
|
151 with open(input_file_path, 'r') as input_file_handle, open(discarded_output_file_path, 'w') as discarded_output_handle:
|
0
|
152 for record in SeqIO.parse(input_file_handle, input_format):
|
|
153 total_sequences += 1
|
|
154 for ID, barcode_datas in barcode_data_dict.items():
|
|
155 barcode_position, barcode_data, reverse = search_barcodes_in_sequence(barcode_datas, record)
|
|
156 if barcode_position == -1:
|
|
157 continue
|
|
158 barcode = barcode_data.barcode if not reverse else barcode_data.barcode_reverse
|
|
159 sequences_assigned_by_id[barcode_data.ID] += 1
|
|
160 logging.debug(str(record.seq))
|
|
161 logging.debug(" " * barcode_position + barcode)
|
|
162 SeqIO.write(record, ID_file_handle_dict[ID], input_format)
|
|
163 break
|
|
164 else: # no match # TODO fuzzy match ?
|
|
165 SeqIO.write(record, discarded_output_handle, input_format)
|
|
166 if total_sequences % 10000 == 0:
|
|
167 assigned_sequences = sum(sequences_assigned_by_id.values())
|
|
168 logging.info("Processed {0} sequences, assigned {1} ({2}%)".format(
|
|
169 total_sequences,
|
|
170 assigned_sequences,
|
|
171 round(assigned_sequences / total_sequences * 100)
|
|
172 ))
|
|
173
|
|
174 for file_handle in ID_file_handle_dict.values():
|
|
175 file_handle.close()
|
|
176
|
|
177 assigned_sequences = sum(sequences_assigned_by_id.values())
|
|
178 logging.info("Processed {0} sequences, assigned {1} ({2}%)".format(
|
|
179 total_sequences,
|
|
180 assigned_sequences,
|
|
181 round(assigned_sequences / total_sequences * 100)
|
|
182 ))
|
|
183
|
|
184
|
|
185 if __name__ == "__main__":
|
|
186 main()
|
|
187
|