comparison pal_filter.py @ 4:cb56cc1d5c39 draft

Updates to the pal_filter.py utility.
author pjbriggs
date Mon, 21 Mar 2016 06:52:43 -0400
parents e1a14ed7a9d6
children 8159dab5dbdb
3:e1a14ed7a9d6 4:cb56cc1d5c39
1 #!/usr/bin/python -tt 1 #!/usr/bin/python -tt
2 # 2 """
3 # pal_filter 3 pal_filter
4 # https://github.com/graemefox/pal_filter 4 https://github.com/graemefox/pal_filter
5 # 5
6 ################################################################################ 6 Graeme Fox - 03/03/2016 - graeme.fox@manchester.ac.uk
7 # Graeme Fox - 15/02/2016 - graeme.fox@manchester.ac.uk 7 Tested on 64-bit Ubuntu, with Python 2.7
8 # Tested on 64-bit Ubuntu, with Python 2.7 8
9 # 9 ~~~~~~~~~~~~~~~~~~~
10 ################################################################################ 10 PROGRAM DESCRIPTION
11 # PROGRAM DESCRIPTION 11
12 # 12 Program to pick optimum loci from the output of pal_finder_v0.02.04
13 # Program to pick optimum loci from the output of pal_finder_v0.02.04 13
14 # 14 This program can be used to filter output from pal_finder and choose the
15 # This program can be used to filter output from pal_finder and choose the 15 'optimum' loci.
16 # 'optimum' loci. 16
17 # 17 For the paper referencing this workflow, see Griffiths et al.
18 # For the paper referencing this workflow, see Griffiths et al. 18 (unpublished as of 15/02/2016) (sarah.griffiths-5@postgrad.manchester.ac.uk)
19 # (unpublished as of 15/02/2016) (sarah.griffiths-5@postgrad.manchester.ac.uk) 19
20 # 20 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
21 ################################################################################ 21 This program also contains a quality-check method to improve the rate of PCR
22 # 22 success. For this QC method, paired end reads are assembled using
23 # This program also contains a quality-check method to improve the rate of PCR 23 PANDAseq so you must have PANDAseq installed.
24 # success. For this QC method, paired end reads are assembled using 24
25 # PANDAseq so you must have PANDAseq installed. 25 For the paper referencing this assembly-QC method see Fox et al.
26 # 26 (unpublished as of 15/02/2016) (graeme.fox@manchester.ac.uk)
27 # For the paper referencing this assembly-QC method see Fox et al. 27
28 # (unpublished as of 15/02/2016) (graeme.fox@manchester.ac.uk) 28 For best results in PCR for marker development, I suggest enabling all the
29 # 29 filter options AND the assembly based QC
30 # For best results in PCR for marker development, I suggest enabling all the 30
31 # filter options AND the assembly based QC 31 ~~~~~~~~~~~~
32 # 32 REQUIREMENTS
33 ################################################################################ 33
34 # REQUIREMENTS 34 Must have Biopython installed (www.biopython.org).
35 # 35
36 # Must have Biopython installed (www.biopython.org). 36 If you wish to perform the assembly QC step, you must have:
37 # 37 PandaSeq (https://github.com/neufeld/pandaseq)
38 # If you wish to perform the assembly QC step, you must have: 38 PandaSeq must be in your $PATH / able to run from anywhere
39 # PandaSeq (https://github.com/neufeld/pandaseq) 39
40 # PandaSeq must be in your $PATH / able to run from anywhere 40 ~~~~~~~~~~~~~~~~
41 ################################################################################ 41 REQUIRED OPTIONS
42 # REQUIRED OPTIONS 42
43 # 43 -i forward_paired_ends.fastQ
44 # -i forward_paired_ends.fastQ 44 -j reverse_paired_ends.fastQ
45 # -j reverse_paired_ends.fastQ 45 -p pal_finder output - by default pal_finder names this "x_PAL_summary.txt"
46 # -p pal_finder output - the "(microsatellites with read IDs and 46
47 # primer pairs)" file 47 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
48 # 48 BY DEFAULT THIS PROGRAM DOES NOTHING. ENABLE SOME OF THE OPTIONS BELOW.
49 # By default the program does nothing.... 49 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
50 # 50 NON-REQUIRED OPTIONS
51 # NON-REQUIRED OPTIONS 51
52 # 52 -assembly: turn on the pandaseq assembly QC step
53 # -assembly: turn on the pandaseq assembly QC step 53
54 # -primers: filter microsatellite loci to just those which 54 -primers: filter microsatellite loci to just those which have primers designed
55 # have primers designed 55
56 # 56 -occurrences: filter microsatellite loci to those with primers
57 # -occurrences: filter microsatellite loci to those with primers 57 which appear only once in the dataset
58 # which appear only once in the dataset 58
59 # 59 -rankmotifs: filter microsatellite loci to just those with perfect motifs.
60 # -rankmotifs: filter microsatellite loci to just those with perfect motifs. 60 Rank the output by size of motif (largest first)
61 # Rank the output by size of motif (largest first) 61
62 # 62 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
63 ########################################################### 63 For repeat analysis, the following extra non-required options may be useful:
64 # For repeat analysis, the following extra non-required options may be useful: 64
65 # 65 Since PandaSeq Assembly, and fastq -> fasta conversion are slow, do them the
66 # Since PandaSeq Assembly, and fastq -> fasta conversion are slow, do them the 66 first time, generate the files and then skip either, or both steps with
67 # first time, generate the files and then skip either, or both steps with 67 the following:
68 # the following: 68
69 # 69 -a: skip assembly step
70 # -a: skip assembly step 70 -c: skip fastq -> fasta conversion step
71 # -c: skip fastq -> fasta conversion step 71
72 # 72 Just make sure to keep the assembled/converted files in the correct directory
73 # Just make sure to keep the assembled/converted files in the correct directory 73 with the correct filename(s)
74 # with the correct filename(s) 74
75 ########################################################### 75 ~~~~~~~~~~~~~~~
76 # 76 EXAMPLE USAGE:
77 # EXAMPLE USAGE: 77
78 # pal_filter.py -i R1.fastq -j R2.fastq 78 pal_filter.py -i R1.fastq -j R2.fastq
79 # pal_filtery.py -i R1.fastq -j R2.fastq 79 -p pal_finder_output.tabular -primers -occurrences -rankmotifs -assembly
80 # -p pal_finder_output.tabular -primers -occurrences -rankmotifs -assembly 80
81 # 81 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
82 ########################################################### 82 """
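The REQUIREMENTS block above notes that PandaSeq must be reachable on $PATH for the -assembly QC step. As a minimal, hypothetical pre-flight check (not part of pal_filter.py, and assuming the executable is named "pandaseq"), a user could verify this before enabling the option:

    # Hypothetical pre-flight check, not part of pal_filter.py:
    # confirm the 'pandaseq' executable can be found on $PATH.
    import os

    def pandaseq_on_path():
        for directory in os.environ.get("PATH", "").split(os.pathsep):
            candidate = os.path.join(directory, "pandaseq")
            if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
                return True
        return False

    if not pandaseq_on_path():
        print "WARNING: pandaseq not found on $PATH; the -assembly QC step would fail."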
83 import Bio, subprocess, argparse, csv, os, re, time 83 import Bio, subprocess, argparse, csv, os, re, time
84 from Bio import SeqIO 84 from Bio import SeqIO
85 85 __version__ = "1.0.0"
86 # Get values for all the required and optional arguments
87
88 parser = argparse.ArgumentParser(description='pal_filter')
89 parser.add_argument('-i','--input1', help='Forward paired-end fastq file', \
90 required=True)
91
92 parser.add_argument('-j','--input2', help='Reverse paired-end fastq file', \
93 required=True)
94
95 parser.add_argument('-p','--pal_finder', help='Output from pal_finder ', \
96 required=True)
97
98 parser.add_argument('-assembly','--assembly_QC', help='Perform the PandaSeq \
99 based QC', nargs='?', const=1, type=int, required=False)
100
101 parser.add_argument('-a','--skip_assembly', help='If the assembly has already \
102 been run, skip it with -a', nargs='?', const=1, \
103 type=int, required=False)
104
105 parser.add_argument('-c','--skip_conversion', help='If the fastq to fasta \
106 conversion has already been run, skip it with -c', \
107 nargs='?', const=1, type=int, required=False)
108
109 parser.add_argument('-primers','--filter_by_primer_column', help='Filter \
110 pal_finder output to just those loci which have primers \
111 designed', nargs='?', const=1, type=int, required=False)
112
113 parser.add_argument('-occurrences','--filter_by_occurrences_column', \
114 help='Filter pal_finder output to just loci with primers \
115 which only occur once in the dataset', nargs='?', \
116 const=1, type=int, required=False)
117
118 parser.add_argument('-rankmotifs','--filter_and_rank_by_motif_size', \
119 help='Filter pal_finder output to just loci which are a \
120 perfect repeat unit. Also, rank the loci by motif size \
121 (largest first)', nargs='?', const=1, type=int, \
122 required=False)
123
124 args = vars(parser.parse_args())
125
126 # parse arguments
127
128 R1_input = args['input1'];
129 R2_input = args['input2'];
130 pal_finder_output = args['pal_finder'];
131 perform_assembly = args['assembly_QC']
132 skip_assembly = args['skip_assembly'];
133 skip_conversion = args['skip_conversion'];
134 filter_primers = args['filter_by_primer_column'];
135 filter_occurrences = args['filter_by_occurrences_column'];
136 filter_rank_motifs = args['filter_and_rank_by_motif_size'];
137
138 # set default values for arguments
139 if perform_assembly is None:
140 perform_assembly = 0
141 if skip_assembly is None:
142 skip_assembly = 0
143 if skip_conversion is None:
144 skip_conversion = 0
145 if filter_primers is None:
146 filter_primers = 0
147 if filter_occurrences is None:
148 filter_occurrences = 0
149 if filter_rank_motifs is None:
150 filter_rank_motifs = 0
151
152 ############################################################ 86 ############################################################
153 # Function List # 87 # Function List #
154 ############################################################ 88 ############################################################
155
156 # Reverse complement a sequence
157
158 def ReverseComplement1(seq): 89 def ReverseComplement1(seq):
90 """
91 take a nucleotide sequence and reverse-complement it
92 """
159 seq_dict = {'A':'T','T':'A','G':'C','C':'G'} 93 seq_dict = {'A':'T','T':'A','G':'C','C':'G'}
160 return "".join([seq_dict[base] for base in reversed(seq)]) 94 return "".join([seq_dict[base] for base in reversed(seq)])
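A quick doctest-style illustration of the helper above (hypothetical interactive session, not part of the script); note that the lookup table only covers A/C/G/T, so an ambiguous base such as 'N' would raise a KeyError:

    >>> ReverseComplement1("ATGC")
    'GCAT'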
161 95
162 # Convert a .fastq to a .fasta, filter to just lines we want 96 def fastq_to_fasta(input_file, wanted_set):
163 # and strip MiSeq barcodes 97 """
164 98 take a file in fastq format, convert to fasta format and filter on
165 def fastq_to_fasta(file): 99 the set of sequences that we want to keep
166 file_name = os.path.splitext(os.path.basename(file))[0] 100 """
101 file_name = os.path.splitext(os.path.basename(input_file))[0]
167 with open(file_name + "_filtered.fasta", "w") as out: 102 with open(file_name + "_filtered.fasta", "w") as out:
168 for record in SeqIO.parse(file, "fastq"): 103 for record in SeqIO.parse(input_file, "fastq"):
169 ID = str(record.id) 104 ID = str(record.id)
170 SEQ = str(record.seq) 105 SEQ = str(record.seq)
171 if ID in wanted: 106 if ID in wanted_set:
172 out.write(">" + ID + "\n" + SEQ + "\n") 107 out.write(">" + ID + "\n" + SEQ + "\n")
173 108
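For reference, a minimal call of the converter above (file name hypothetical; the script itself invokes it on the -i/-j inputs further down):

    # Hypothetical usage: writes R1_filtered.fasta containing only the
    # records whose IDs appear in the 'wanted' set of sequence IDs.
    fastq_to_fasta("R1.fastq", wanted)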
174 # strip the miseq barcodes from a fasta file 109 def strip_barcodes(input_file, wanted_set):
175 110 """
176 def strip_barcodes(file): 111 take fastq data containing sequencing barcodes and strip the barcode
177 file_name = os.path.splitext(os.path.basename(file))[0] 112 from each sequence. Filter on the set of sequences that we want to keep
113 """
114 file_name = os.path.splitext(os.path.basename(input_file))[0]
178 with open(file_name + "_adapters_removed.fasta", "w") as out: 115 with open(file_name + "_adapters_removed.fasta", "w") as out:
179 for record in SeqIO.parse(file, "fasta"): 116 for record in SeqIO.parse(input_file, "fasta"):
180 match = re.search(r'\S*:', record.id) 117 match = re.search(r'\S*:', record.id)
181 if match: 118 if match:
182 correct = match.group().rstrip(":") 119 correct = match.group().rstrip(":")
183 else: 120 else:
184 correct = str(record.id) 121 correct = str(record.id)
185 SEQ = str(record.seq) 122 SEQ = str(record.seq)
186 if correct in wanted: 123 if correct in wanted_set:
187 out.write(">" + correct + "\n" + SEQ + "\n") 124 out.write(">" + correct + "\n" + SEQ + "\n")
188 125
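To illustrate what strip_barcodes keeps: the greedy r'\S*:' match extends to the last colon in the record ID, so everything after that final colon (the barcode/index field) is dropped. A doctest-style sketch with a made-up MiSeq-like ID:

    >>> import re
    >>> record_id = "M00123:18:000000000-A1B2C:1:1101:15589:1332:TAAGGCGA"  # hypothetical ID
    >>> re.search(r'\S*:', record_id).group().rstrip(":")
    'M00123:18:000000000-A1B2C:1:1101:15589:1332'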
189 ############################################################ 126 ############################################################
190 # MAIN PROGRAM # 127 # MAIN PROGRAM #
191 ############################################################ 128 ############################################################
192 print "\n~~~~~~~~~~" 129 print "\n~~~~~~~~~~"
193 print "pal_filter" 130 print "pal_filter"
194 print "~~~~~~~~~~\n" 131 print "~~~~~~~~~~"
132 print "Version: " + __version__
195 time.sleep(1) 133 time.sleep(1)
196 print "\"Find the optimum loci in your pal_finder output and increase "\ 134 print "\nFind the optimum loci in your pal_finder output and increase "\
197 "the rate of successful microsatellite marker development\"" 135 "the rate of successful microsatellite marker development"
198 print "\nSee Griffiths et al. (currently unpublished) for more details......" 136 print "\nSee Griffiths et al. (currently unpublished) for more details......"
199 time.sleep(2) 137 time.sleep(2)
200 138
201 if (perform_assembly == 0 and filter_primers == 0 and filter_occurrences == 0 \ 139 # Get values for all the required and optional arguments
202 and filter_rank_motifs == 0): 140
141 parser = argparse.ArgumentParser(description='pal_filter')
142 parser.add_argument('-i','--input1', help='Forward paired-end fastq file', \
143 required=True)
144
145 parser.add_argument('-j','--input2', help='Reverse paired-end fastq file', \
146 required=True)
147
148 parser.add_argument('-p','--pal_finder', help='Output from pal_finder ', \
149 required=True)
150
151 parser.add_argument('-assembly', help='Perform the PandaSeq based QC', \
152 action='store_true')
153
154 parser.add_argument('-a','--skip_assembly', help='If the assembly has already \
155 been run, skip it with -a', action='store_true')
156
157 parser.add_argument('-c','--skip_conversion', help='If the fastq to fasta \
158 conversion has already been run, skip it with -c', \
159 action='store_true')
160
161 parser.add_argument('-primers', help='Filter \
162 pal_finder output to just those loci which have primers \
163 designed', action='store_true')
164
165 parser.add_argument('-occurrences', \
166 help='Filter pal_finder output to just loci with primers \
167 which only occur once in the dataset', action='store_true')
168
169 parser.add_argument('-rankmotifs', \
170 help='Filter pal_finder output to just loci which are a \
171 perfect repeat unit. Also, rank the loci by motif size \
172 (largest first)', action='store_true')
173
174 parser.add_argument('-v', '--get_version', help='Print the version number of \
175 this pal_filter script', action='store_true')
176
177 args = parser.parse_args()
178
179 if not args.assembly and not args.primers and not args.occurrences \
180 and not args.rankmotifs:
203 print "\nNo optional arguments supplied." 181 print "\nNo optional arguments supplied."
204 print "\nBy default this program does nothing." 182 print "\nBy default this program does nothing."
205 print "\nNo files produced and no modifications made." 183 print "\nNo files produced and no modifications made."
206 print "\nFinished.\n" 184 print "\nFinished.\n"
207 exit() 185 exit()
208
209 else: 186 else:
210 print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" 187 print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
211 print "Checking supplied filtering parameters:" 188 print "Checking supplied filtering parameters:"
212 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" 189 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
213 time.sleep(2) 190 time.sleep(2)
214 if filter_primers == 1: 191 if args.get_version:
192 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
193 print "pal_filter version is " + __version__ + " (03/03/2016)"
194 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
195 if args.primers:
215 print "-primers flag supplied." 196 print "-primers flag supplied."
216 print "Filtering pal_finder output on the \"Primers found (1=y,0=n)\"" \ 197 print "Filtering pal_finder output on the \"Primers found (1=y,0=n)\"" \
217 " column." 198 " column."
218 print "Only rows where primers have successfully been designed will"\ 199 print "Only rows where primers have successfully been designed will"\
219 " pass the filter.\n" 200 " pass the filter.\n"
220 time.sleep(2) 201 time.sleep(2)
221 if filter_occurrences == 1: 202 if args.occurrences:
222 print "-occurrences flag supplied." 203 print "-occurrences flag supplied."
223 print "Filtering pal_finder output on the \"Occurences of Forward" \ 204 print "Filtering pal_finder output on the \"Occurrences of Forward" \
224 " Primer in Reads\" and \"Occurences of Reverse Primer" \ 205 " Primer in Reads\" and \"Occurrences of Reverse Primer" \
225 " in Reads\" columns." 206 " in Reads\" columns."
226 print "Only rows where both primers occur only a single time in the"\ 207 print "Only rows where both primers occur only a single time in the"\
227 " reads pass the filter.\n" 208 " reads pass the filter.\n"
228 time.sleep(2) 209 time.sleep(2)
229 if filter_rank_motifs == 1: 210 if args.rankmotifs:
230 print "-rankmotifs flag supplied." 211 print "-rankmotifs flag supplied."
231 print "Filtering pal_finder output on the \"Motifs(bases)\" column to" \ 212 print "Filtering pal_finder output on the \"Motifs(bases)\" column to" \
232 " just those with perfect repeats." 213 " just those with perfect repeats."
233 print "Only rows containing 'perfect' repeats will pass the filter." 214 print "Only rows containing 'perfect' repeats will pass the filter."
234 print "Also, ranking output by size of motif (largest first).\n" 215 print "Also, ranking output by size of motif (largest first).\n"
235 time.sleep(2) 216 time.sleep(2)
217
236 # index the raw fastq files so that the sequences can be pulled out and 218 # index the raw fastq files so that the sequences can be pulled out and
237 # added to the filtered output file 219 # added to the filtered output file
238
239 # Indexing input sequence files
240 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" 220 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
241 print "Indexing FastQ files....." 221 print "Indexing FastQ files....."
242 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" 222 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
243 R1fastq_sequences_index = SeqIO.index(R1_input,'fastq') 223 R1fastq_sequences_index = SeqIO.index(args.input1,'fastq')
244 R2fastq_sequences_index = SeqIO.index(R2_input,'fastq') 224 R2fastq_sequences_index = SeqIO.index(args.input2,'fastq')
245 print "Indexing complete." 225 print "Indexing complete."
246 226
247 # create a set to hold the filtered output 227 # create a set to hold the filtered output
248 wanted_lines = set() 228 wanted_lines = set()
249 229
250 # get lines from the pal_finder output which meet filter settings 230 # get lines from the pal_finder output which meet filter settings
251
252 # read the pal_finder output file into a csv reader 231 # read the pal_finder output file into a csv reader
253 with open (pal_finder_output) as csvfile_infile: 232 with open (args.pal_finder) as csvfile_infile:
254 csv_f = csv.reader(csvfile_infile, delimiter='\t') 233 csv_f = csv.reader(csvfile_infile, delimiter='\t')
255 header = csv_f.next() 234 header = csv_f.next()
256 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \ 235 header.extend(("R1_Sequence_ID", \
257 ".filtered", 'w') as csvfile_outfile: 236 "R1_Sequence", \
258 # write the header line for the output file 237 "R2_Sequence_ID", \
259 csvfile_outfile.write('\t'.join(header) + "\tR1_Sequence_ID\t" 238 "R2_Sequence" + "\n"))
260 "R1_Sequence\tR2_Sequence_ID\tR2_Sequence\n") 239 with open( \
240 os.path.splitext(os.path.basename(args.pal_finder))[0] + \
241 ".filtered", 'w') as csvfile_outfile:
242 # write the header line for the output file
243 csvfile_outfile.write('\t'.join(header))
261 for row in csv_f: 244 for row in csv_f:
262 # get the sequence ID 245 # get the sequence ID
263 seq_ID = row[0] 246 seq_ID = row[0]
264 # get the raw sequence reads and convert to a format that can 247 # get the raw sequence reads and convert to a format that can
265 # go into a tsv file 248 # go into a tsv file
266 R1_sequence = R1fastq_sequences_index[seq_ID].format("fasta").\ 249 R1_sequence = R1fastq_sequences_index[seq_ID].format("fasta").\
267 replace("\n","\t",1).replace("\n","") 250 replace("\n","\t",1).replace("\n","")
268 R2_sequence = R2fastq_sequences_index[seq_ID].format("fasta").\ 251 R2_sequence = R2fastq_sequences_index[seq_ID].format("fasta").\
269 replace("\n","\t",1).replace("\n","") 252 replace("\n","\t",1).replace("\n","")
270 seq_info = "\t" + R1_sequence + "\t" + R2_sequence + "\n" 253 seq_info = "\t" + R1_sequence + "\t" + R2_sequence + "\n"
271 # navigate through all different combinations of filter options 254 # navigate through all different combinations of filter options
272 # if the primer filter is switched on 255 # if the primer filter is switched on
273 if filter_primers == 1: 256 if args.primers:
274 # check the occurrences of primers field 257 # check the occurrences of primers field
275 if row[5] == "1": 258 if row[5] == "1":
276 # if filter occurrences of primers is switched on 259 # if filter occurrences of primers is switched on
277 if filter_occurrences == 1: 260 if args.occurrences:
278 # check the occurrences of primers field 261 # check the occurrences of primers field
279 if (row[15] == "1" and row[16] == "1"): 262 if (row[15] == "1" and row[16] == "1"):
280 # if rank by motif is switched on 263 # if rank by motif is switched on
281 if filter_rank_motifs == 1: 264 if args.rankmotifs:
282 # check for perfect motifs 265 # check for perfect motifs
283 if row[1].count('(') == 1: 266 if row[1].count('(') == 1:
284 # all 3 filters switched on 267 # all 3 filters switched on
285 # write line out to output 268 # write line out to output
286 csvfile_outfile.write('\t'.join(row) + \ 269 csvfile_outfile.write('\t'.join(row) + \
287 seq_info) 270 seq_info)
288
289 else: 271 else:
290 csvfile_outfile.write('\t'.join(row) + seq_info) 272 csvfile_outfile.write('\t'.join(row) + seq_info)
291 elif filter_rank_motifs == 1: 273 elif args.rankmotifs:
292 if row[1].count('(') == 1: 274 if row[1].count('(') == 1:
293 csvfile_outfile.write('\t'.join(row) + seq_info) 275 csvfile_outfile.write('\t'.join(row) + seq_info)
294 else: 276 else:
295 csvfile_outfile.write('\t'.join(row) + seq_info) 277 csvfile_outfile.write('\t'.join(row) + seq_info)
296 elif filter_occurrences == 1: 278 elif args.occurrences:
297 if (row[15] == "1" and row[16] == "1"): 279 if (row[15] == "1" and row[16] == "1"):
298 if filter_rank_motifs == 1: 280 if args.rankmotifs:
299 if row[1].count('(') == 1: 281 if row[1].count('(') == 1:
300 csvfile_outfile.write('\t'.join(row) + seq_info) 282 csvfile_outfile.write('\t'.join(row) + seq_info)
301 else: 283 else:
302 csvfile_outfile.write('\t'.join(row) + seq_info) 284 csvfile_outfile.write('\t'.join(row) + seq_info)
303 elif filter_rank_motifs == 1: 285 elif args.rankmotifs:
304 if row[1].count('(') == 1: 286 if row[1].count('(') == 1:
305 csvfile_outfile.write('\t'.join(row) + seq_info) 287 csvfile_outfile.write('\t'.join(row) + seq_info)
306 else: 288 else:
307 csvfile_outfile.write('\t'.join(row) + seq_info) 289 csvfile_outfile.write('\t'.join(row) + seq_info)
308 290
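Every path through the nested branches above applies the same rule: a row is written out only if each enabled filter accepts it (primers designed, both primers unique in the reads, perfect motif). A hypothetical, behaviour-equivalent predicate (not part of this changeset) makes that reading explicit:

    # Hypothetical helper, equivalent to the nested branches above:
    # a pal_finder row passes only if every enabled filter accepts it.
    def passes_filters(row, args):
        if args.primers and row[5] != "1":               # "Primers found (1=y,0=n)"
            return False
        if args.occurrences and not (row[15] == "1" and row[16] == "1"):
            return False                                 # primer occurrence counts
        if args.rankmotifs and row[1].count('(') != 1:   # perfect-motif check
            return False
        return True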
309 # if filter_rank_motifs is active, order the file by the size of the motif 291 # if filter_rank_motifs is active, order the file by the size of the motif
310 if filter_rank_motifs == 1: 292 if args.rankmotifs:
311 rank_motif = [] 293 rank_motif = []
312 ranked_list = [] 294 ranked_list = []
313 # read in the non-ordered file and add every entry to rank_motif list 295 # read in the non-ordered file and add every entry to rank_motif list
314 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \ 296 with open( \
315 ".filtered") as csvfile_ranksize: 297 os.path.splitext(os.path.basename(args.pal_finder))[0] + \
298 ".filtered") as csvfile_ranksize:
316 csv_rank = csv.reader(csvfile_ranksize, delimiter='\t') 299 csv_rank = csv.reader(csvfile_ranksize, delimiter='\t')
317 header = csv_rank.next() 300 header = csv_rank.next()
318 for line in csv_rank: 301 for line in csv_rank:
319 rank_motif.append(line) 302 rank_motif.append(line)
320 303
321 # open the file ready to write the ordered list 304 # open the file ready to write the ordered list
322 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \ 305 with open( \
323 ".filtered", 'w') as rank_outfile: 306 os.path.splitext(os.path.basename(args.pal_finder))[0] + \
307 ".filtered", 'w') as rank_outfile:
324 rankwriter = csv.writer(rank_outfile, delimiter='\t', \ 308 rankwriter = csv.writer(rank_outfile, delimiter='\t', \
325 lineterminator='\n') 309 lineterminator='\n')
326 rankwriter.writerow(header) 310 rankwriter.writerow(header)
327 count = 2 311 count = 2
328 while count < 10: 312 while count < 10:
329 for row in rank_motif: 313 for row in rank_motif:
330 # count size of motif 314 # count size of motif
331 motif = re.search(r'[ATCG]*',row[1]) 315 motif = re.search(r'[ATCG]*',row[1])
332 if motif: 316 if motif:
333 the_motif = motif.group() 317 the_motif = motif.group()
334 # rank it and write into ranked_list 318 # rank it and write into ranked_list
335 if len(the_motif) == count: 319 if len(the_motif) == count:
336 ranked_list.insert(0, row) 320 ranked_list.insert(0, row)
337 count = count + 1 321 count = count + 1
338 # write out the ordered list, into the .filtered file 322 # write out the ordered list, into the .filtered file
339 for row in ranked_list: 323 for row in ranked_list:
340 rankwriter.writerow(row) 324 rankwriter.writerow(row)
341 325
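The count-from-2-to-9 loop above orders the filtered rows by motif length, largest first. A roughly equivalent single-pass sketch (not byte-for-byte identical: the loop also discards motifs outside the 2-9 bp range and reverses the relative order of equal-length ties) could use sorted():

    # Hypothetical alternative to the ranking loop, assuming the same
    # rank_motif list of rows: sort by motif length, largest first.
    def motif_length(row):
        match = re.search(r'[ATCG]*', row[1])
        return len(match.group()) if match else 0

    ranked_list = sorted(rank_motif, key=motif_length, reverse=True)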
342 print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" 326 print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
343 print "Checking assembly flags supplied:" 327 print "Checking assembly flags supplied:"
344 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" 328 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
345 if perform_assembly == 0: 329 if not args.assembly:
346 print "Assembly flag not supplied. Not performing assembly QC.\n" 330 print "Assembly flag not supplied. Not performing assembly QC.\n"
347 if perform_assembly == 1: 331 if args.assembly:
348 print "-assembly flag supplied: Performing PandaSeq assembly quality checks." 332 print "-assembly flag supplied: Perform PandaSeq assembly quality checks."
349 print "See Fox et al. (currently unpublished) for full details on the"\ 333 print "See Fox et al. (currently unpublished) for full details on the"\
350 " quality-check process.\n" 334 " quality-check process.\n"
351 time.sleep(5) 335 time.sleep(5)
352 336
353 # Get readID, F primers, R primers and motifs from filtered pal_finder output 337 # Get readID, F primers, R primers and motifs from filtered pal_finder output
354 seqIDs = [] 338 seqIDs = []
355 motif = [] 339 motif = []
356 F_primers = [] 340 F_primers = []
357 R_primers = [] 341 R_primers = []
358 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \ 342 with open( \
359 ".filtered") as input_csv: 343 os.path.splitext(os.path.basename(args.pal_finder))[0] + \
344 ".filtered") as input_csv:
360 pal_finder_csv = csv.reader(input_csv, delimiter='\t') 345 pal_finder_csv = csv.reader(input_csv, delimiter='\t')
361 header = pal_finder_csv.next() 346 header = pal_finder_csv.next()
362 for row in pal_finder_csv: 347 for row in pal_finder_csv:
363 seqIDs.append(row[0]) 348 seqIDs.append(row[0])
364 motif.append(row[1]) 349 motif.append(row[1])
367 352
368 # Get a list of just the unique IDs we want 353 # Get a list of just the unique IDs we want
369 wanted = set() 354 wanted = set()
370 for line in seqIDs: 355 for line in seqIDs:
371 wanted.add(line) 356 wanted.add(line)
372 357 """
373 # Assemble the paired end reads into overlapping contigs using PandaSeq 358 Assemble the paired end reads into overlapping contigs using PandaSeq
374 # (can be skipped with the -a flag if assembly has already been run 359 (can be skipped with the -a flag if assembly has already been run
375 # and the appropriate files are in the same directory as the script, 360 and the appropriate files are in the same directory as the script,
376 # and named "Assembly.fasta" and "Assembly_adapters_removed.fasta") 361 and named "Assembly.fasta" and "Assembly_adapters_removed.fasta")
377 # 362
378 # The first time you run the script you MUST not enable the -a flag, 363 The first time you run the script you MUST not enable the -a flag,
379 # but you are able to skip the assembly in subsequent analysis using the 364 but you are able to skip the assembly in subsequent analysis using the
380 # -a flag. 365 -a flag.
381 366 """
382 if skip_assembly == 0: 367 if not args.skip_assembly:
383 pandaseq_command = 'pandaseq -A pear -f ' + R1_input + ' -r ' + \ 368 pandaseq_command = 'pandaseq -A pear -f ' + args.input1 + ' -r ' + \
384 R2_input + ' -o 25 -t 0.95 -w Assembly.fasta' 369 args.input2 + ' -o 25 -t 0.95 -w Assembly.fasta'
385 subprocess.call(pandaseq_command, shell=True) 370 subprocess.call(pandaseq_command, shell=True)
386 strip_barcodes("Assembly.fasta") 371 strip_barcodes("Assembly.fasta", wanted)
387 print "\nPaired-end reads have been assembled into overlapping contigs." 372 print "\nPaired-end reads have been assembled into overlapping contigs."
388 print "\nFor future analysis, you can skip this assembly step using \ 373 print "\nFor future analysis, you can skip this assembly step using" \
389 the -a flag, provided that the assembly.fasta file is \ 374 " the -a flag, provided that the assembly.fasta file" \
390 intact and in the same location." 375 " is intact and in the same location."
391 else: 376 else:
392 print "\n(Skipping the assembly step as you provided the -a flag)" 377 print "\n(Skipping the assembly step as you provided the -a flag)"
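With the example inputs from the usage section (R1.fastq and R2.fastq), the PandaSeq command built above expands to the following (a sketch assuming those file names):

    pandaseq -A pear -f R1.fastq -r R2.fastq -o 25 -t 0.95 -w Assembly.fasta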
393 378 """
394 # Fastq files need to be converted to fasta. The first time you run the script 379 Fastq files need to be converted to fasta. The first time you run the script
395 # you MUST not enable the -c flag, but you are able to skip the conversion 380 you MUST not enable the -c flag, but you are able to skip the conversion
396 # later using the -c flag. Make sure the fasta files are in the same location 381 later using the -c flag. Make sure the fasta files are in the same location
397 #and do not change the filenames 382 and do not change the filenames
398 383 """
399 if skip_conversion ==0: 384 if not args.skip_conversion:
400 fastq_to_fasta(R1_input) 385 fastq_to_fasta(args.input1, wanted)
401 fastq_to_fasta(R2_input) 386 fastq_to_fasta(args.input2, wanted)
402 print "\nThe input fastq files have been converted to the fasta format." 387 print "\nThe input fastq files have been converted to the fasta format."
403 print "\nFor any future analysis, you can skip this conversion step \ 388 print "\nFor any future analysis, you can skip this conversion step" \
404 using the -c flag, provided that the fasta files \ 389 " using the -c flag, provided that the fasta files" \
405 are intact and in the same location." 390 " are intact and in the same location."
406 else: 391 else:
407 print "\n(Skipping the fastq -> fasta conversion as you provided the" \ 392 print "\n(Skipping the fastq -> fasta conversion as you provided the" \
408 " -c flag).\n" 393 " -c flag).\n"
409 394
410 # get the files and everything else we will need 395 # get the files and everything else needed
411 396 # Assembled fasta file
412 # Assembled fasta file
413 assembly_file = "Assembly_adapters_removed.fasta" 397 assembly_file = "Assembly_adapters_removed.fasta"
414 # filtered R1 reads 398 # filtered R1 reads
415 R1_fasta = os.path.splitext(os.path.basename(R1_input))[0] + \ 399 R1_fasta = os.path.splitext( \
416 "_filtered.fasta" 400 os.path.basename(args.input1))[0] + "_filtered.fasta"
417 # filtered R2 reads 401 # filtered R2 reads
418 R2_fasta = os.path.splitext(os.path.basename(R2_input))[0] + \ 402 R2_fasta = os.path.splitext( \
419 "_filtered.fasta" 403 os.path.basename(args.input2))[0] + "_filtered.fasta"
420 404 outputfilename = os.path.splitext(os.path.basename(args.input1))[0]
421 outputfilename = os.path.splitext(os.path.basename(R1_input))[0] 405 # parse the files with SeqIO
422
423 # parse the files with SeqIO
424 assembly_sequences = SeqIO.parse(assembly_file,'fasta') 406 assembly_sequences = SeqIO.parse(assembly_file,'fasta')
425 R1fasta_sequences = SeqIO.parse(R1_fasta,'fasta') 407 R1fasta_sequences = SeqIO.parse(R1_fasta,'fasta')
426 408
427 # create some empty lists to hold the ID tags we are interested in 409 # create some empty lists to hold the ID tags we are interested in
428 assembly_IDs = [] 410 assembly_IDs = []
429 fasta_IDs = [] 411 fasta_IDs = []
430 412
431 # populate the above lists with sequence IDs 413 # populate the above lists with sequence IDs
432 for sequence in assembly_sequences: 414 for sequence in assembly_sequences:
433 assembly_IDs.append(sequence.id) 415 assembly_IDs.append(sequence.id)
434 for sequence in R1fasta_sequences: 416 for sequence in R1fasta_sequences:
435 fasta_IDs.append(sequence.id) 417 fasta_IDs.append(sequence.id)
436 418
437 # Index the assembly fasta file 419 # Index the assembly fasta file
438 assembly_sequences_index = SeqIO.index(assembly_file,'fasta') 420 assembly_sequences_index = SeqIO.index(assembly_file,'fasta')
439 R1fasta_sequences_index = SeqIO.index(R1_fasta,'fasta') 421 R1fasta_sequences_index = SeqIO.index(R1_fasta,'fasta')
440 R2fasta_sequences_index = SeqIO.index(R2_fasta,'fasta') 422 R2fasta_sequences_index = SeqIO.index(R2_fasta,'fasta')
441 423
442 # prepare the output file 424 # prepare the output file
443 with open (outputfilename + "_pal_filter_assembly_output.txt", 'w') \ 425 with open ( \
444 as outputfile: 426 outputfilename + "_pal_filter_assembly_output.txt", 'w') \
445 # write the headers for the output file 427 as outputfile:
446 outputfile.write("readPairID\t Forward Primer\t F Primer Position in " 428 # write the headers for the output file
447 "Assembled Read\t Reverse Primer\t R Primer Position in " 429 output_header = ("readPairID", \
448 "Assembled Read\t Motifs(bases)\t Assembled Read ID\t " 430 "Forward Primer",\
449 "Assembled Read Sequence\t Raw Forward Read ID\t Raw " 431 "F Primer Position in Assembled Read", \
450 "Forward Read Sequence\t Raw Reverse Read ID\t Raw Reverse " 432 "Reverse Primer", \
451 "Read Sequence\n") 433 "R Primer Position in Assembled Read", \
452 434 "Motifs(bases)", \
453 # cycle through parameters from the pal_finder output 435 "Assembled Read ID", \
436 "Assembled Read Sequence", \
437 "Raw Forward Read ID", \
438 "Raw Forward Read Sequence", \
439 "Raw Reverse Read ID", \
440 "Raw Reverse Read Sequence\n")
441 outputfile.write("\t".join(output_header))
442
443 # cycle through parameters from the pal_finder output
454 for x, y, z, a in zip(seqIDs, F_primers, R_primers, motif): 444 for x, y, z, a in zip(seqIDs, F_primers, R_primers, motif):
455 if str(x) in assembly_IDs: 445 if str(x) in assembly_IDs:
456 # get the raw sequences ready to go into the output file 446 # get the raw sequences ready to go into the output file
457 assembly_seq = (assembly_sequences_index.get_raw(x).decode()) 447 assembly_seq = (assembly_sequences_index.get_raw(x).decode())
458 # fasta entries need to be converted to a single line so they sit nicely in the output 448 # fasta entries need to be converted to a single line so they sit
459 assembly_output = assembly_seq.replace("\n","\t") 449 # nicely in the output
450 assembly_output = assembly_seq.replace("\n","\t").strip('\t')
460 R1_fasta_seq = (R1fasta_sequences_index.get_raw(x).decode()) 451 R1_fasta_seq = (R1fasta_sequences_index.get_raw(x).decode())
461 R1_output = R1_fasta_seq.replace("\n","\t",1).replace("\n","") 452 R1_output = R1_fasta_seq.replace("\n","\t",1).replace("\n","")
462 R2_fasta_seq = (R2fasta_sequences_index.get_raw(x).decode()) 453 R2_fasta_seq = (R2fasta_sequences_index.get_raw(x).decode())
463 R2_output = R2_fasta_seq.replace("\n","\t",1).replace("\n","") 454 R2_output = R2_fasta_seq.replace("\n","\t",1).replace("\n","")
464 assembly_no_id = '\n'.join(assembly_seq.split('\n')[1:]) 455 assembly_no_id = '\n'.join(assembly_seq.split('\n')[1:])
465 456
466 # check that both primer sequences can be seen in the assembled contig 457 # check that both primer sequences can be seen in the
458 # assembled contig
467 if (y in assembly_no_id or ReverseComplement1(y) in assembly_no_id) \ 459 if (y in assembly_no_id or ReverseComplement1(y) in assembly_no_id) \
468 and (z in assembly_no_id or ReverseComplement1(z) in assembly_no_id): 460 and (z in assembly_no_id or ReverseComplement1(z) in assembly_no_id):
469 if y in assembly_no_id: 461 if y in assembly_no_id:
470 # get the positions of the primers in the assembly to predict fragment length 462 # get the positions of the primers in the assembly
463 # (can be used to predict fragment length)
471 F_position = assembly_no_id.index(y)+len(y)+1 464 F_position = assembly_no_id.index(y)+len(y)+1
472 if ReverseComplement1(y) in assembly_no_id: 465 if ReverseComplement1(y) in assembly_no_id:
473 F_position = assembly_no_id.index(ReverseComplement1(y)\ 466 F_position = assembly_no_id.index( \
474 )+len(ReverseComplement1(y))+1 467 ReverseComplement1(y))+len(ReverseComplement1(y))+1
475 if z in assembly_no_id: 468 if z in assembly_no_id:
476 R_position = assembly_no_id.index(z)+1 469 R_position = assembly_no_id.index(z)+1
477 if ReverseComplement1(z) in assembly_no_id: 470 if ReverseComplement1(z) in assembly_no_id:
478 R_position = assembly_no_id.index(ReverseComplement1(z)\ 471 R_position = assembly_no_id.index( \
479 )+1 472 ReverseComplement1(z))+1
480 473 output = (str(x),
481 # write everything out into the output file 474 str(y),
482 output = (str(x) + "\t" + y + "\t" + str(F_position) \ 475 str(F_position),
483 + "\t" + (z) + "\t" + str(R_position) \ 476 str(z),
484 + "\t" + a + "\t" + assembly_output \ 477 str(R_position),
485 + R1_output + "\t" + R2_output + "\n") 478 str(a),
486 outputfile.write(output) 479 str(assembly_output),
480 str(R1_output),
481 str(R2_output + "\n"))
482 outputfile.write("\t".join(output))
487 print "\nPANDAseq quality check complete." 483 print "\nPANDAseq quality check complete."
488 print "Results from PANDAseq quality check (and filtering, if any" \ 484 print "Results from PANDAseq quality check (and filtering, if any" \
489 " any filters enabled) written to output file" \ 485 " any filters enabled) written to output file" \
490 " ending \"_pal_filter_assembly_output.txt\".\n\n" 486 " ending \"_pal_filter_assembly_output.txt\".\n"
491
492 print "Filtering of pal_finder results complete." 487 print "Filtering of pal_finder results complete."
493 print "Filtered results written to output file ending \".filtered\"." 488 print "Filtered results written to output file ending \".filtered\"."
494 print "\nFinished\n" 489 print "\nFinished\n"
495 else: 490 else:
496 if (skip_assembly == 1 or skip_conversion == 1): 491 if args.skip_assembly or args.skip_conversion:
497 print "\nERROR: You cannot supply the -a flag or the -c flag without \ 492 print "\nERROR: You cannot supply the -a flag or the -c flag without \
498 also supplying the -assembly flag.\n" 493 also supplying the -assembly flag.\n"
499
500 print "\nProgram Finished\n" 494 print "\nProgram Finished\n"