2
|
1 #!/usr/bin/python -tt
|
|
2
|
|
3 ###########################################################
|
|
4 # Graeme Fox - 26/09/2015 - graeme.fox@manchester.ac.uk
|
|
5 # Tested on (L)ubuntu 15.04 only, with Python 2.7
|
|
6 ###########################################################
|
|
7 # PROGRAM DESCRIPTION
|
|
8 # Program to pick optimum loci from the output of pal_finder_v0.02.04
|
|
9 # and to predict the fragment length.
|
|
10 #
|
|
11 # This program can be used to filter pal_finder output to choose the 'optimum' loci.
|
|
12 #
|
|
13 # Additionally this program can assemble the two paired end reads,
|
|
14 # find the primers within HQ assembly, log their positions
|
|
15 # and calculate the difference to give the fragment length.
|
|
16 #
|
|
17 # For best results in your PCR, I suggest doing both.
|
|
18 #
|
|
19 ###########################################################
|
|
20 # Requirements:
|
|
21 # Must have Biopython (www.biopython.org).
|
|
22 #
|
|
23 # If you with to perform the assembly QC step, you must have:
|
|
24 # PandaSeq (https://github.com/neufeld/pandaseq)
|
|
25 # PandaSeq must be in your path / able to run from anywhere
|
|
26 ###########################################################
|
|
27 # Required options:
|
|
28 # -i forward_paired_ends.fastQ
|
|
29 # -j reverse_paired_ends.fastQ
|
|
30 # -p pal_finder output - the "(microsatellites with read IDs and primer pairs)" file
|
|
31 #
|
|
32 # By default it does nothing.
|
|
33 #
|
|
34 # Non-required options:
|
|
35 # -assembly: turn on the pandaseq assembly QC step
|
|
36 # -primers: filter microsatellite loci to just those which have primers designed
|
|
37 # -occurrences: filter microsatellite loci to those with primers which appear only once in the dataset
|
|
38 # -rankmotifs: filter microsatellite loci to just those with perfect motifs. Rank the output by size of motif (largest first)
|
|
39 #
|
|
40 ###########################################################
|
|
41 # For repeat analysis, the following extra non-required options may be useful:
|
|
42 # PandaSeq Assembly, and fastq -> fasta conversion are slow.
|
|
43 #
|
|
44 # Do them the first time, generate the files and then skip either, or both steps with the following:
|
|
45 # -a: skip assembly step
|
|
46 # -c: skip fastq -> fasta conversion step
|
|
47 #
|
|
48 # # Just make sure to keep the files in the correct directory with the correct filename
|
|
49 ###########################################################
|
|
50 #
|
|
51 # Example usage:
|
|
52 # pal_finder_filter_and_assembly.py -i R1.fastq -j R2.fastq -p pal_finder_output_(microsatellites with read IDs and primer pairs).tabular -primers -occurrences -rankmotifs -assembly
|
|
53 #
|
|
54 ###########################################################
|
|
55 import Bio, subprocess, argparse, csv, os, re, time
|
|
56 from Bio import SeqIO
|
|
57
|
|
58 # Get values for all the required and optional arguments
|
|
59
|
|
60 parser = argparse.ArgumentParser(description='Frag_length_finder')
|
|
61 parser.add_argument('-i','--input1', help='Forward paired-end fastq file', required=True)
|
|
62 parser.add_argument('-j','--input2', help='Reverse paired-end fastq file', required=True)
|
|
63 parser.add_argument('-p','--pal_finder', help='Output from pal_finder ', required=True)
|
|
64 parser.add_argument('-assembly','--assembly_QC', help='Perform the PandaSeq based QC', nargs='?', const=1, type=int, required=False)
|
|
65 parser.add_argument('-a','--skip_assembly', help='If the assembly has already been run, skip it with -a', nargs='?', const=1, type=int, required=False)
|
|
66 parser.add_argument('-c','--skip_conversion', help='If the fastq to fasta conversion has already been run, skip it with -c', nargs='?', const=1, type=int, required=False)
|
|
67 parser.add_argument('-primers','--filter_by_primer_column', help='Filter pal_finder output to just those loci which have primers designed', nargs='?', const=1, type=int, required=False)
|
|
68 parser.add_argument('-occurrences','--filter_by_occurrences_column', help='Filter pal_finder output to just loci with primers which only occur once in the dataset', nargs='?', const=1, type=int, required=False)
|
|
69 parser.add_argument('-rankmotifs','--filter_and_rank_by_motif_size', help='Filter pal_finder output to just loci which are a perfect repeat unit. Also, rank the loci by motif size (largest first)', nargs='?', const=1, type=int, required=False)
|
|
70 args = vars(parser.parse_args())
|
|
71
|
|
72 # parse arguments
|
|
73
|
|
74 R1_input = args['input1'];
|
|
75 R2_input = args['input2'];
|
|
76 pal_finder_output = args['pal_finder'];
|
|
77 perform_assembly = args['assembly_QC']
|
|
78 skip_assembly = args['skip_assembly'];
|
|
79 skip_conversion = args['skip_conversion'];
|
|
80 filter_primers = args['filter_by_primer_column'];
|
|
81 filter_occurrences = args['filter_by_occurrences_column'];
|
|
82 filter_rank_motifs = args['filter_and_rank_by_motif_size'];
|
|
83
|
|
84 # set default values for arguments
|
|
85 if perform_assembly is None:
|
|
86 perform_assembly = 0
|
|
87 if skip_assembly is None:
|
|
88 skip_assembly = 0
|
|
89 if skip_conversion is None:
|
|
90 skip_conversion = 0
|
|
91 if filter_primers is None:
|
|
92 filter_primers = 0
|
|
93 if filter_occurrences is None:
|
|
94 filter_occurrences = 0
|
|
95 if filter_rank_motifs is None:
|
|
96 filter_rank_motifs = 0
|
|
97
|
|
98 ############################################################
|
|
99 # Function List #
|
|
100 ############################################################
|
|
101 # Reverse complement a sequence
|
|
102 def ReverseComplement1(seq):
|
|
103 seq_dict = {'A':'T','T':'A','G':'C','C':'G'}
|
|
104 return "".join([seq_dict[base] for base in reversed(seq)])
|
|
105
|
|
106 # Convert a .fastq to a .fasta, filter to just lines we want and strip MiSeq barcodes
|
|
107 def fastq_to_fasta(file):
|
|
108 file_name = os.path.splitext(os.path.basename(file))[0]
|
|
109 with open(file_name + "_filtered.fasta", "w") as out:
|
|
110 for record in SeqIO.parse(file, "fastq"):
|
|
111 ID = str(record.id)
|
|
112 SEQ = str(record.seq)
|
|
113 if ID in wanted:
|
|
114 out.write(">" + ID + "\n" + SEQ + "\n")
|
|
115
|
|
116 # strip the miseq barcodes from a fasta file
|
|
117 def strip_barcodes(file):
|
|
118 file_name = os.path.splitext(os.path.basename(file))[0]
|
|
119 with open(file_name + "_adapters_removed.fasta", "w") as out:
|
|
120 for record in SeqIO.parse(file, "fasta"):
|
|
121 match = re.search(r'\S*:', record.id)
|
|
122 if match:
|
|
123 correct = match.group().rstrip(":")
|
|
124 else:
|
|
125 correct = str(record.id)
|
|
126 SEQ = str(record.seq)
|
|
127 if correct in wanted:
|
|
128 out.write(">" + correct + "\n" + SEQ + "\n")
|
|
129
|
|
130 ############################################################
|
|
131 # MAIN PROGRAM #
|
|
132 ############################################################
|
|
133
|
|
134 if (perform_assembly == 0 and filter_primers == 0 and filter_occurrences == 0 and filter_rank_motifs == 0):
|
|
135 print "\nNo optional arguments supplied."
|
|
136 print "\nBy default this program does nothing."
|
|
137 print "\nNo files produced and no modifications made."
|
|
138 print "\nFinished.\n"
|
|
139 exit()
|
|
140
|
|
141 else:
|
|
142 print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
|
|
143 print "Checking supplied filtering parameters:"
|
|
144 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
|
|
145 if filter_primers == 1:
|
|
146 print "-primers flag supplied. Filtering pal_finder output on the \"Primers found (1=y,0=n)\" column."
|
|
147 if filter_occurrences == 1:
|
|
148 print "-occurrences flag supplied. Filtering pal_finder output on the \"Occurances of Forward Primer in Reads\" and \"Occurances of Reverse Primer in Reads\" columns."
|
|
149 if filter_rank_motifs == 1:
|
|
150 print "-rankmotifs flag supplied. Filtering pal_finder output on the \"Motifs(bases)\" column to just those with perfect repeats. Ranking output by size of motif (largest first)."
|
|
151
|
|
152 # index the raw fastq files so that the sequences can be pulled out and added to the filtered output file
|
|
153 R1fastq_sequences_index = SeqIO.index(R1_input,'fastq')
|
|
154 R2fastq_sequences_index = SeqIO.index(R2_input,'fastq')
|
|
155
|
|
156 # create a set to hold the filtered output
|
|
157 wanted_lines = set()
|
|
158
|
|
159 # get lines from the pal_finder output which meet filter settings
|
|
160
|
|
161 # read the pal_finder output file into a csv reader
|
|
162 with open (pal_finder_output) as csvfile_infile:
|
|
163 csv_f = csv.reader(csvfile_infile, delimiter='\t')
|
|
164 header = csv_f.next()
|
|
165 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + ".filtered", 'w') as csvfile_outfile:
|
|
166 # write the header line for the output file
|
|
167 csvfile_outfile.write('\t'.join(header) + "\tR1_Sequence_ID\tR1_Sequence\tR2_Sequence_ID\tR2_Sequence\n")
|
|
168 for row in csv_f:
|
|
169 # get the sequence ID
|
|
170 seq_ID = row[0]
|
|
171 # get the raw sequence reads and convert to a format that can go into a tsv file
|
|
172 R1_sequence = R1fastq_sequences_index[seq_ID].format("fasta").replace("\n","\t",1).replace("\n","")
|
|
173 R2_sequence = R2fastq_sequences_index[seq_ID].format("fasta").replace("\n","\t",1).replace("\n","")
|
|
174 seq_info = "\t" + R1_sequence + "\t" + R2_sequence + "\n"
|
|
175 # navigate through all different combinations of filter options
|
|
176 # if the primer filter is switched on
|
|
177 if filter_primers == 1:
|
|
178 # check the occurrences of primers field
|
|
179 if row[5] == "1":
|
|
180 # if filter occurrences of primers is switched on
|
|
181 if filter_occurrences == 1:
|
|
182 # check the occurrences of primers field
|
|
183 if (row[15] == "1" and row[16] == "1"):
|
|
184 # if rank by motif is switched on
|
|
185 if filter_rank_motifs == 1:
|
|
186 # check for perfect motifs
|
|
187 if row[1].count('(') == 1:
|
|
188 # all 3 filter switched on - write line out to output
|
|
189 csvfile_outfile.write('\t'.join(row) + seq_info)
|
|
190
|
|
191 else:
|
|
192 csvfile_outfile.write('\t'.join(row) + seq_info)
|
|
193 elif filter_rank_motifs == 1:
|
|
194 if row[1].count('(') == 1:
|
|
195 csvfile_outfile.write('\t'.join(row) + seq_info)
|
|
196 else:
|
|
197 csvfile_outfile.write('\t'.join(row) + seq_info)
|
|
198 elif filter_occurrences == 1:
|
|
199 if (row[15] == "1" and row[16] == "1"):
|
|
200 if filter_rank_motifs == 1:
|
|
201 if row[1].count('(') == 1:
|
|
202 csvfile_outfile.write('\t'.join(row) + seq_info)
|
|
203 else:
|
|
204 csvfile_outfile.write('\t'.join(row) + seq_info)
|
|
205 elif filter_rank_motifs == 1:
|
|
206 if row[1].count('(') == 1:
|
|
207 csvfile_outfile.write('\t'.join(row) + seq_info)
|
|
208 else:
|
|
209 csvfile_outfile.write('\t'.join(row) + seq_info)
|
|
210
|
|
211 # if filter_rank_motifs is active, order the file by the size of the motif
|
|
212 if filter_rank_motifs == 1:
|
|
213 rank_motif = []
|
|
214 ranked_list = []
|
|
215 # read in the non-ordered file and add every entry to rank_motif list
|
|
216 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + ".filtered") as csvfile_ranksize:
|
|
217 csv_rank = csv.reader(csvfile_ranksize, delimiter='\t')
|
|
218 header = csv_rank.next()
|
|
219 for line in csv_rank:
|
|
220 rank_motif.append(line)
|
|
221
|
|
222 # open the file ready to write the ordered list
|
|
223 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + ".filtered", 'w') as rank_outfile:
|
|
224 rankwriter = csv.writer(rank_outfile, delimiter='\t', lineterminator='\n')
|
|
225 rankwriter.writerow(header)
|
|
226 count = 2
|
|
227 while count < 10:
|
|
228 for row in rank_motif:
|
|
229 # count size of motif
|
|
230 motif = re.search(r'[ATCG]*',row[1])
|
|
231 if motif:
|
|
232 the_motif = motif.group()
|
|
233 # rank it and write into ranked_list
|
|
234 if len(the_motif) == count:
|
|
235 ranked_list.insert(0, row)
|
|
236 count = count + 1
|
|
237 # write out the ordered list, into the .filtered file
|
|
238 for row in ranked_list:
|
|
239 rankwriter.writerow(row)
|
|
240
|
|
241 print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
|
|
242 print "Checking assembly flags supplied:"
|
|
243 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
|
|
244 if perform_assembly == 0:
|
|
245 print "Assembly flag not supplied. Not performing assembly QC.\n"
|
|
246 if perform_assembly == 1:
|
|
247 print "-assembly flag supplied: Performing PandaSeq assembly quality checks."
|
|
248 time.sleep(5)
|
|
249
|
|
250 # Get readID, F primers, R primers and motifs from the filtered pal_finder output
|
|
251 seqIDs = []
|
|
252 motif = []
|
|
253 F_primers = []
|
|
254 R_primers = []
|
|
255 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + ".filtered") as input_csv:
|
|
256 pal_finder_csv = csv.reader(input_csv, delimiter='\t')
|
|
257 header = pal_finder_csv.next()
|
|
258 for row in pal_finder_csv:
|
|
259 seqIDs.append(row[0])
|
|
260 motif.append(row[1])
|
|
261 F_primers.append(row[7])
|
|
262 R_primers.append(row[9])
|
|
263
|
|
264 # Get a list of just the unique IDs we want
|
|
265 wanted = set()
|
|
266 for line in seqIDs:
|
|
267 wanted.add(line)
|
|
268
|
|
269 # Assemble the paired end reads into overlapping contigs using PandaSeq
|
|
270 # (can be skipped with the -a flag if assembly has already been run
|
|
271 # and the appropriate files are in the same directory as the script,
|
|
272 # and named "Assembly.fasta" and "Assembly_adapters_removed.fasta")
|
|
273 #
|
|
274 # I suggest you run this script WITHOUT -a first, and then use the -a flag in
|
|
275 # any subsequent reanalysis
|
|
276
|
|
277 if skip_assembly == 0:
|
|
278 pandaseq_command = 'pandaseq -A pear -f ' + R1_input + ' -r ' + R2_input + ' -o 25 -t 0.95 -w Assembly.fasta'
|
|
279 subprocess.call(pandaseq_command, shell=True)
|
|
280 strip_barcodes("Assembly.fasta")
|
|
281 print "\nPaired end reads been assembled into overlapping reads."
|
|
282 print "\nFor future analysis, you can skip this assembly step using the -a flag, provided that the assembly.fasta file is intact and in the same location."
|
|
283 else:
|
|
284 print "\n(Skipping the assembly step as you provided the -a flag)"
|
|
285
|
|
286 # FastQ files need to be converted to fasta. Again, I suggest running this section
|
|
287 # the first time WITHOUT the -c flag and then skipping it later using the -c flag
|
|
288 # Make sure the fasta files are in the same location and do not change the filenames
|
|
289
|
|
290 if skip_conversion ==0:
|
|
291 fastq_to_fasta(R1_input)
|
|
292 fastq_to_fasta(R2_input)
|
|
293 print "\nThe input fastq files have been converted to the fasta format."
|
|
294 print "\nFor any future analysis, you can skip this conversion step using the -c flag, provided that the fasta files are intact and in the same location."
|
|
295 else:
|
|
296 print "\n(Skipping the fastq -> fasta conversion as you provided the -c flag)"
|
|
297
|
|
298 # get the files and everything else we will need
|
|
299 assembly_file = "Assembly_adapters_removed.fasta" # Assembled fasta file
|
|
300 R1_fasta = os.path.splitext(os.path.basename(R1_input))[0] + "_filtered.fasta" # filtered R1 reads
|
|
301 R2_fasta = os.path.splitext(os.path.basename(R2_input))[0] + "_filtered.fasta" # filtered R2 reads
|
|
302 outputfilename = os.path.splitext(os.path.basename(R1_input))[0]
|
|
303
|
|
304 # parse the files with SeqIO
|
|
305 assembly_sequences = SeqIO.parse(assembly_file,'fasta')
|
|
306 R1fasta_sequences = SeqIO.parse(R1_fasta,'fasta')
|
|
307
|
|
308 # create some empty lists to hold the ID tags we are interested in
|
|
309 assembly_IDs = []
|
|
310 fasta_IDs = []
|
|
311
|
|
312 # populate the above lists with sequence IDs
|
|
313 for sequence in assembly_sequences:
|
|
314 assembly_IDs.append(sequence.id)
|
|
315 for sequence in R1fasta_sequences:
|
|
316 fasta_IDs.append(sequence.id)
|
|
317
|
|
318 # Index the assembly fasta file
|
|
319 assembly_sequences_index = SeqIO.index(assembly_file,'fasta')
|
|
320 R1fasta_sequences_index = SeqIO.index(R1_fasta,'fasta')
|
|
321 R2fasta_sequences_index = SeqIO.index(R2_fasta,'fasta')
|
|
322
|
|
323 # prepare the output file
|
|
324 with open (outputfilename + "_pal_finder_assembly_output.txt", 'w') as outputfile:
|
|
325 outputfile.write("readPairID\t Forward Primer\t F Primer Position in Assembled Read\t Reverse Primer\t R Primer Position in Assembled Read\t Predicted Amplicon Size (bp)\t Motifs(bases)\t Assembled Read ID\t Assembled Read Sequence\t Raw Forward Read ID\t Raw Forward Read Sequence\t Raw Reverse Read ID\t Raw Reverse Read Sequence\n")
|
|
326
|
|
327 # cycle through parameters from the pal_finder output
|
|
328 for x, y, z, a in zip(seqIDs, F_primers, R_primers, motif):
|
|
329 if str(x) in assembly_IDs:
|
|
330 # get the raw sequences ready to go into the output file
|
|
331 assembly_seq = (assembly_sequences_index.get_raw(x).decode())
|
|
332 # fasta entries need to be converted to single line so they sit nicely in the output
|
|
333 assembly_output = assembly_seq.replace("\n","\t")
|
|
334 R1_fasta_seq = (R1fasta_sequences_index.get_raw(x).decode())
|
|
335 R1_output = R1_fasta_seq.replace("\n","\t",1).replace("\n","")
|
|
336 #R1_output = R1_output.replace("\n","")
|
|
337 R2_fasta_seq = (R2fasta_sequences_index.get_raw(x).decode())
|
|
338 R2_output = R2_fasta_seq.replace("\n","\t",1).replace("\n","")
|
|
339 #R2_output = R2_output.replace("\n","")
|
|
340 assembly_no_id = '\n'.join(assembly_seq.split('\n')[1:])
|
|
341
|
|
342 # check that both primer sequences can be seen in the assembled contig
|
|
343 if y or ReverseComplement1(y) in assembly_no_id and z or ReverseComplement1(z) in assembly_no_id:
|
|
344 if y in assembly_no_id:
|
|
345 # get the positions of the primers in the assembly to predict fragment length
|
|
346 F_position = assembly_no_id.index(y)+len(y)+1
|
|
347 if ReverseComplement1(y) in assembly_no_id:
|
|
348 F_position = assembly_no_id.index(ReverseComplement1(y))+len(ReverseComplement1(y))+1
|
|
349 if z in assembly_no_id:
|
|
350 R_position = assembly_no_id.index(z)+1
|
|
351 if ReverseComplement1(z) in assembly_no_id:
|
|
352 R_position = assembly_no_id.index(ReverseComplement1(z))+1
|
|
353 # calculate fragment length
|
|
354 fragment_length = R_position-F_position
|
|
355
|
|
356 # write everything out into the output file
|
|
357 output = (str(x) + "\t" + y + "\t" + str(F_position) + "\t" + (z) + "\t" + str(R_position) + "\t" + str(fragment_length) + "\t" + a + "\t" + assembly_output + R1_output + "\t" + R2_output + "\n")
|
|
358 outputfile.write(output)
|
|
359 print "\nFinished\n"
|
|
360 else:
|
|
361 if (skip_assembly == 1 or skip_conversion == 1):
|
|
362 print "\nERROR: You cannot supply the -a flag or the -c flag without also supplying the -assembly flag.\n"
|
|
363 print "\nFinished\n"
|