Mercurial > repos > pjbriggs > pal_finder
comparison pal_filter.py @ 4:cb56cc1d5c39 draft
Updates to the palfilter.py utility.
author | pjbriggs |
---|---|
date | Mon, 21 Mar 2016 06:52:43 -0400 |
parents | e1a14ed7a9d6 |
children | 8159dab5dbdb |
comparison
equal
deleted
inserted
replaced
3:e1a14ed7a9d6 | 4:cb56cc1d5c39 |
---|---|
1 #!/usr/bin/python -tt | 1 #!/usr/bin/python -tt |
2 # | 2 """ |
3 # pal_filter | 3 pal_filter |
4 # https://github.com/graemefox/pal_filter | 4 https://github.com/graemefox/pal_filter |
5 # | 5 |
6 ################################################################################ | 6 Graeme Fox - 03/03/2016 - graeme.fox@manchester.ac.uk |
7 # Graeme Fox - 15/02/2016 - graeme.fox@manchester.ac.uk | 7 Tested on 64-bit Ubuntu, with Python 2.7 |
8 # Tested on 64-bit Ubuntu, with Python 2.7 | 8 |
9 # | 9 ~~~~~~~~~~~~~~~~~~~ |
10 ################################################################################ | 10 PROGRAM DESCRIPTION |
11 # PROGRAM DESCRIPTION | 11 |
12 # | 12 Program to pick optimum loci from the output of pal_finder_v0.02.04 |
13 # Program to pick optimum loci from the output of pal_finder_v0.02.04 | 13 |
14 # | 14 This program can be used to filter output from pal_finder and choose the |
15 # This program can be used to filter output from pal_finder and choose the | 15 'optimum' loci. |
16 # 'optimum' loci. | 16 |
17 # For the paper referencing this workflow, see Griffiths et al. | 17 For the paper referencing this workflow, see Griffiths et al. |
18 # For the paper referncing this workflow, see Griffiths et al. | 18 (unpublished as of 15/02/2016) (sarah.griffiths-5@postgrad.manchester.ac.uk) |
19 # (unpublished as of 15/02/2016) (sarah.griffiths-5@postgrad.manchester.ac.uk) | 19 |
20 # | 20 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
21 ################################################################################ | 21 This program also contains a quality-check method to improve the rate of PCR |
22 # | 22 success. For this QC method, paired end reads are assembled using |
23 # This program also contains a quality-check method to improve the rate of PCR | 23 PANDAseq so you must have PANDAseq installed. |
24 # success. For this QC method, paired end reads are assembled using | 24 |
25 # PANDAseq so you must have PANDAseq installed. | 25 For the paper referencing this assembly-QC method see Fox et al. |
26 # | 26 (unpublished as of 15/02/2016) (graeme.fox@manchester.ac.uk) |
27 # For the paper referencing this assembly-QC method see Fox et al. | 27 |
28 # (unpublished as of 15/02/2016) (graeme.fox@manchester.ac.uk) | 28 For best results in PCR for marker development, I suggest enabling all the |
29 # | 29 filter options AND the assembly based QC |
30 # For best results in PCR for marker development, I suggest enabling all the | 30 |
31 # filter options AND the assembly based QC | 31 ~~~~~~~~~~~~ |
32 # | 32 REQUIREMENTS |
33 ################################################################################ | 33 |
34 # REQUIREMENTS | 34 Must have Biopython installed (www.biopython.org). |
35 # | 35 |
36 # Must have Biopython installed (www.biopython.org). | 36 If you wish to perform the assembly QC step, you must have: |
37 # | 37 PandaSeq (https://github.com/neufeld/pandaseq) |
38 # If you wish to perform the assembly QC step, you must have: | 38 PandaSeq must be in your $PATH / able to run from anywhere |
39 # PandaSeq (https://github.com/neufeld/pandaseq) | 39 |
40 # PandaSeq must be in your $PATH / able to run from anywhere | 40 ~~~~~~~~~~~~~~~~ |
41 ################################################################################ | 41 REQUIRED OPTIONS |
42 # REQUIRED OPTIONS | 42 |
43 # | 43 -i forward_paired_ends.fastQ |
44 # -i forward_paired_ends.fastQ | 44 -j reverse_paired_ends.fastQ |
45 # -j reverse_paired_ends.fastQ | 45 -p pal_finder output - by default pal_finder names this "x_PAL_summary.txt" |
46 # -p pal_finder output - the "(microsatellites with read IDs and | 46 |
47 # primer pairs)" file | 47 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
48 # | 48 BY DEFAULT THIS PROGRAM DOES NOTHING. ENABLE SOME OF THE OPTIONS BELOW. |
49 # By default the program does nothing.... | 49 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
50 # | 50 NON-REQUIRED OPTIONS |
51 # NON-REQUIRED OPTIONS | 51 |
52 # | 52 -assembly: turn on the pandaseq assembly QC step |
53 # -assembly: turn on the pandaseq assembly QC step | 53 |
54 # -primers: filter microsatellite loci to just those which | 54 -primers: filter microsatellite loci to just those which have primers designed |
55 # have primers designed | 55 |
56 # | 56 -occurrences: filter microsatellite loci to those with primers |
57 # -occurrences: filter microsatellite loci to those with primers | 57 which appear only once in the dataset |
58 # which appear only once in the dataset | 58 |
59 # | 59 -rankmotifs: filter microsatellite loci to just those with perfect motifs. |
60 # -rankmotifs: filter microsatellite loci to just those with perfect motifs. | 60 Rank the output by size of motif (largest first) |
61 # Rank the output by size of motif (largest first) | 61 |
62 # | 62 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
63 ########################################################### | 63 For repeat analysis, the following extra non-required options may be useful: |
64 # For repeat analysis, the following extra non-required options may be useful: | 64 |
65 # | 65 Since PandaSeq Assembly, and fastq -> fasta conversion are slow, do them the |
66 # Since PandaSeq Assembly, and fastq -> fasta conversion are slow, do them the | 66 first time, generate the files and then skip either, or both steps with |
67 # first time, generate the files and then skip either, or both steps with | 67 the following: |
68 # the following: | 68 |
69 # | 69 -a: skip assembly step |
70 # -a: skip assembly step | 70 -c: skip fastq -> fasta conversion step |
71 # -c: skip fastq -> fasta conversion step | 71 |
72 # | 72 Just make sure to keep the assembled/converted files in the correct directory |
73 # Just make sure to keep the assembled/converted files in the correct directory | 73 with the correct filename(s) |
74 # with the correct filename(s) | 74 |
75 ########################################################### | 75 ~~~~~~~~~~~~~~~ |
76 # | 76 EXAMPLE USAGE: |
77 # EXAMPLE USAGE: | 77 |
78 # pal_filter.py -i R1.fastq -j R2.fastq | 78 pal_filter.py -i R1.fastq -j R2.fastq |
79 # pal_filtery.py -i R1.fastq -j R2.fastq | 79 -p pal_finder_output.tabular -primers -occurrences -rankmotifs -assembly |
80 # -p pal_finder_output.tabular -primers -occurrences -rankmotifs -assembly | 80 |
81 # | 81 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
82 ########################################################### | 82 """ |
83 import Bio, subprocess, argparse, csv, os, re, time | 83 import Bio, subprocess, argparse, csv, os, re, time |
84 from Bio import SeqIO | 84 from Bio import SeqIO |
85 | 85 __version__ = "1.0.0" |
86 # Get values for all the required and optional arguments | |
87 | |
88 parser = argparse.ArgumentParser(description='pal_filter') | |
89 parser.add_argument('-i','--input1', help='Forward paired-end fastq file', \ | |
90 required=True) | |
91 | |
92 parser.add_argument('-j','--input2', help='Reverse paired-end fastq file', \ | |
93 required=True) | |
94 | |
95 parser.add_argument('-p','--pal_finder', help='Output from pal_finder ', \ | |
96 required=True) | |
97 | |
98 parser.add_argument('-assembly','--assembly_QC', help='Perform the PandaSeq \ | |
99 based QC', nargs='?', const=1, type=int, required=False) | |
100 | |
101 parser.add_argument('-a','--skip_assembly', help='If the assembly has already \ | |
102 been run, skip it with -a', nargs='?', const=1, \ | |
103 type=int, required=False) | |
104 | |
105 parser.add_argument('-c','--skip_conversion', help='If the fastq to fasta \ | |
106 conversion has already been run, skip it with -c', \ | |
107 nargs='?', const=1, type=int, required=False) | |
108 | |
109 parser.add_argument('-primers','--filter_by_primer_column', help='Filter \ | |
110 pal_finder output to just those loci which have primers \ | |
111 designed', nargs='?', const=1, type=int, required=False) | |
112 | |
113 parser.add_argument('-occurrences','--filter_by_occurrences_column', \ | |
114 help='Filter pal_finder output to just loci with primers \ | |
115 which only occur once in the dataset', nargs='?', \ | |
116 const=1, type=int, required=False) | |
117 | |
118 parser.add_argument('-rankmotifs','--filter_and_rank_by_motif_size', \ | |
119 help='Filter pal_finder output to just loci which are a \ | |
120 perfect repeat unit. Also, rank the loci by motif size \ | |
121 (largest first)', nargs='?', const=1, type=int, \ | |
122 required=False) | |
123 | |
124 args = vars(parser.parse_args()) | |
125 | |
126 # parse arguments | |
127 | |
128 R1_input = args['input1']; | |
129 R2_input = args['input2']; | |
130 pal_finder_output = args['pal_finder']; | |
131 perform_assembly = args['assembly_QC'] | |
132 skip_assembly = args['skip_assembly']; | |
133 skip_conversion = args['skip_conversion']; | |
134 filter_primers = args['filter_by_primer_column']; | |
135 filter_occurrences = args['filter_by_occurrences_column']; | |
136 filter_rank_motifs = args['filter_and_rank_by_motif_size']; | |
137 | |
138 # set default values for arguments | |
139 if perform_assembly is None: | |
140 perform_assembly = 0 | |
141 if skip_assembly is None: | |
142 skip_assembly = 0 | |
143 if skip_conversion is None: | |
144 skip_conversion = 0 | |
145 if filter_primers is None: | |
146 filter_primers = 0 | |
147 if filter_occurrences is None: | |
148 filter_occurrences = 0 | |
149 if filter_rank_motifs is None: | |
150 filter_rank_motifs = 0 | |
151 | |
152 ############################################################ | 86 ############################################################ |
153 # Function List # | 87 # Function List # |
154 ############################################################ | 88 ############################################################ |
155 | |
156 # Reverse complement a sequence | |
157 | |
158 def ReverseComplement1(seq): | 89 def ReverseComplement1(seq): |
90 """ | |
91 take a nucleotide sequence and reverse-complement it | |
92 """ | |
159 seq_dict = {'A':'T','T':'A','G':'C','C':'G'} | 93 seq_dict = {'A':'T','T':'A','G':'C','C':'G'} |
160 return "".join([seq_dict[base] for base in reversed(seq)]) | 94 return "".join([seq_dict[base] for base in reversed(seq)]) |
161 | 95 |
162 # Convert a .fastq to a .fasta, filter to just lines we want | 96 def fastq_to_fasta(input_file, wanted_set): |
163 # and strip MiSeq barcodes | 97 """ |
164 | 98 take a file in fastq format, convert to fasta format and filter on |
165 def fastq_to_fasta(file): | 99 the set of sequences that we want to keep |
166 file_name = os.path.splitext(os.path.basename(file))[0] | 100 """ |
101 file_name = os.path.splitext(os.path.basename(input_file))[0] | |
167 with open(file_name + "_filtered.fasta", "w") as out: | 102 with open(file_name + "_filtered.fasta", "w") as out: |
168 for record in SeqIO.parse(file, "fastq"): | 103 for record in SeqIO.parse(input_file, "fastq"): |
169 ID = str(record.id) | 104 ID = str(record.id) |
170 SEQ = str(record.seq) | 105 SEQ = str(record.seq) |
171 if ID in wanted: | 106 if ID in wanted_set: |
172 out.write(">" + ID + "\n" + SEQ + "\n") | 107 out.write(">" + ID + "\n" + SEQ + "\n") |
173 | 108 |
174 # strip the miseq barcodes from a fasta file | 109 def strip_barcodes(input_file, wanted_set): |
175 | 110 """ |
176 def strip_barcodes(file): | 111 take fastq data containing sequencing barcodes and strip the barcode |
177 file_name = os.path.splitext(os.path.basename(file))[0] | 112 from each sequence. Filter on the set of sequences that we want to keep |
113 """ | |
114 file_name = os.path.splitext(os.path.basename(input_file))[0] | |
178 with open(file_name + "_adapters_removed.fasta", "w") as out: | 115 with open(file_name + "_adapters_removed.fasta", "w") as out: |
179 for record in SeqIO.parse(file, "fasta"): | 116 for record in SeqIO.parse(input_file, "fasta"): |
180 match = re.search(r'\S*:', record.id) | 117 match = re.search(r'\S*:', record.id) |
181 if match: | 118 if match: |
182 correct = match.group().rstrip(":") | 119 correct = match.group().rstrip(":") |
183 else: | 120 else: |
184 correct = str(record.id) | 121 correct = str(record.id) |
185 SEQ = str(record.seq) | 122 SEQ = str(record.seq) |
186 if correct in wanted: | 123 if correct in wanted_set: |
187 out.write(">" + correct + "\n" + SEQ + "\n") | 124 out.write(">" + correct + "\n" + SEQ + "\n") |
188 | 125 |
189 ############################################################ | 126 ############################################################ |
190 # MAIN PROGRAM # | 127 # MAIN PROGRAM # |
191 ############################################################ | 128 ############################################################ |
192 print "\n~~~~~~~~~~" | 129 print "\n~~~~~~~~~~" |
193 print "pal_filter" | 130 print "pal_filter" |
194 print "~~~~~~~~~~\n" | 131 print "~~~~~~~~~~" |
132 print "Version: " + __version__ | |
195 time.sleep(1) | 133 time.sleep(1) |
196 print "\"Find the optimum loci in your pal_finder output and increase "\ | 134 print "\nFind the optimum loci in your pal_finder output and increase "\ |
197 "the rate of successful microsatellite marker development\"" | 135 "the rate of successful microsatellite marker development" |
198 print "\nSee Griffiths et al. (currently unpublished) for more details......" | 136 print "\nSee Griffiths et al. (currently unpublished) for more details......" |
199 time.sleep(2) | 137 time.sleep(2) |
200 | 138 |
201 if (perform_assembly == 0 and filter_primers == 0 and filter_occurrences == 0 \ | 139 # Get values for all the required and optional arguments |
202 and filter_rank_motifs == 0): | 140 |
141 parser = argparse.ArgumentParser(description='pal_filter') | |
142 parser.add_argument('-i','--input1', help='Forward paired-end fastq file', \ | |
143 required=True) | |
144 | |
145 parser.add_argument('-j','--input2', help='Reverse paired-end fastq file', \ | |
146 required=True) | |
147 | |
148 parser.add_argument('-p','--pal_finder', help='Output from pal_finder ', \ | |
149 required=True) | |
150 | |
151 parser.add_argument('-assembly', help='Perform the PandaSeq based QC', \ | |
152 action='store_true') | |
153 | |
154 parser.add_argument('-a','--skip_assembly', help='If the assembly has already \ | |
155 been run, skip it with -a', action='store_true') | |
156 | |
157 parser.add_argument('-c','--skip_conversion', help='If the fastq to fasta \ | |
158 conversion has already been run, skip it with -c', \ | |
159 action='store_true') | |
160 | |
161 parser.add_argument('-primers', help='Filter \ | |
162 pal_finder output to just those loci which have primers \ | |
163 designed', action='store_true') | |
164 | |
165 parser.add_argument('-occurrences', \ | |
166 help='Filter pal_finder output to just loci with primers \ | |
167 which only occur once in the dataset', action='store_true') | |
168 | |
169 parser.add_argument('-rankmotifs', \ | |
170 help='Filter pal_finder output to just loci which are a \ | |
171 perfect repeat unit. Also, rank the loci by motif size \ | |
172 (largest first)', action='store_true') | |
173 | |
174 parser.add_argument('-v', '--get_version', help='Print the version number of \ | |
175 this pal_filter script', action='store_true') | |
176 | |
177 args = parser.parse_args() | |
178 | |
179 if not args.assembly and not args.primers and not args.occurrences \ | |
180 and not args.rankmotifs: | |
203 print "\nNo optional arguments supplied." | 181 print "\nNo optional arguments supplied." |
204 print "\nBy default this program does nothing." | 182 print "\nBy default this program does nothing." |
205 print "\nNo files produced and no modifications made." | 183 print "\nNo files produced and no modifications made." |
206 print "\nFinished.\n" | 184 print "\nFinished.\n" |
207 exit() | 185 exit() |
208 | |
209 else: | 186 else: |
210 print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" | 187 print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" |
211 print "Checking supplied filtering parameters:" | 188 print "Checking supplied filtering parameters:" |
212 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" | 189 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" |
213 time.sleep(2) | 190 time.sleep(2) |
214 if filter_primers == 1: | 191 if args.get_version: |
192 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" | |
193 print "pal_filter version is " + __version__ + " (03/03/2016)" | |
194 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" | |
195 if args.primers: | |
215 print "-primers flag supplied." | 196 print "-primers flag supplied." |
216 print "Filtering pal_finder output on the \"Primers found (1=y,0=n)\"" \ | 197 print "Filtering pal_finder output on the \"Primers found (1=y,0=n)\"" \ |
217 " column." | 198 " column." |
218 print "Only rows where primers have successfully been designed will"\ | 199 print "Only rows where primers have successfully been designed will"\ |
219 " pass the filter.\n" | 200 " pass the filter.\n" |
220 time.sleep(2) | 201 time.sleep(2) |
221 if filter_occurrences == 1: | 202 if args.occurrences: |
222 print "-occurrences flag supplied." | 203 print "-occurrences flag supplied." |
223 print "Filtering pal_finder output on the \"Occurences of Forward" \ | 204 print "Filtering pal_finder output on the \"Occurrences of Forward" \ |
224 " Primer in Reads\" and \"Occurences of Reverse Primer" \ | 205 " Primer in Reads\" and \"Occurrences of Reverse Primer" \ |
225 " in Reads\" columns." | 206 " in Reads\" columns." |
226 print "Only rows where both primers occur only a single time in the"\ | 207 print "Only rows where both primers occur only a single time in the"\ |
227 " reads pass the filter.\n" | 208 " reads pass the filter.\n" |
228 time.sleep(2) | 209 time.sleep(2) |
229 if filter_rank_motifs == 1: | 210 if args.rankmotifs: |
230 print "-rankmotifs flag supplied." | 211 print "-rankmotifs flag supplied." |
231 print "Filtering pal_finder output on the \"Motifs(bases)\" column to" \ | 212 print "Filtering pal_finder output on the \"Motifs(bases)\" column to" \ |
232 " just those with perfect repeats." | 213 " just those with perfect repeats." |
233 print "Only rows containing 'perfect' repeats will pass the filter." | 214 print "Only rows containing 'perfect' repeats will pass the filter." |
234 print "Also, ranking output by size of motif (largest first).\n" | 215 print "Also, ranking output by size of motif (largest first).\n" |
235 time.sleep(2) | 216 time.sleep(2) |
217 | |
236 # index the raw fastq files so that the sequences can be pulled out and | 218 # index the raw fastq files so that the sequences can be pulled out and |
237 # added to the filtered output file | 219 # added to the filtered output file |
238 | |
239 # Indexing input sequence files | |
240 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" | 220 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" |
241 print "Indexing FastQ files....." | 221 print "Indexing FastQ files....." |
242 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" | 222 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" |
243 R1fastq_sequences_index = SeqIO.index(R1_input,'fastq') | 223 R1fastq_sequences_index = SeqIO.index(args.input1,'fastq') |
244 R2fastq_sequences_index = SeqIO.index(R2_input,'fastq') | 224 R2fastq_sequences_index = SeqIO.index(args.input2,'fastq') |
245 print "Indexing complete." | 225 print "Indexing complete." |
246 | 226 |
247 # create a set to hold the filtered output | 227 # create a set to hold the filtered output |
248 wanted_lines = set() | 228 wanted_lines = set() |
249 | 229 |
250 # get lines from the pal_finder output which meet filter settings | 230 # get lines from the pal_finder output which meet filter settings |
251 | |
252 # read the pal_finder output file into a csv reader | 231 # read the pal_finder output file into a csv reader |
253 with open (pal_finder_output) as csvfile_infile: | 232 with open (args.pal_finder) as csvfile_infile: |
254 csv_f = csv.reader(csvfile_infile, delimiter='\t') | 233 csv_f = csv.reader(csvfile_infile, delimiter='\t') |
255 header = csv_f.next() | 234 header = csv_f.next() |
256 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \ | 235 header.extend(("R1_Sequence_ID", \ |
257 ".filtered", 'w') as csvfile_outfile: | 236 "R1_Sequence", \ |
258 # write the header line for the output file | 237 "R2_Sequence_ID", \ |
259 csvfile_outfile.write('\t'.join(header) + "\tR1_Sequence_ID\t" | 238 "R2_Sequence" + "\n")) |
260 "R1_Sequence\tR2_Sequence_ID\tR2_Sequence\n") | 239 with open( \ |
240 os.path.splitext(os.path.basename(args.pal_finder))[0] + \ | |
241 ".filtered", 'w') as csvfile_outfile: | |
242 # write the header line for the output file | |
243 csvfile_outfile.write('\t'.join(header)) | |
261 for row in csv_f: | 244 for row in csv_f: |
262 # get the sequence ID | 245 # get the sequence ID |
263 seq_ID = row[0] | 246 seq_ID = row[0] |
264 # get the raw sequence reads and convert to a format that can | 247 # get the raw sequence reads and convert to a format that can |
265 # go into a tsv file | 248 # go into a tsv file |
266 R1_sequence = R1fastq_sequences_index[seq_ID].format("fasta").\ | 249 R1_sequence = R1fastq_sequences_index[seq_ID].format("fasta").\ |
267 replace("\n","\t",1).replace("\n","") | 250 replace("\n","\t",1).replace("\n","") |
268 R2_sequence = R2fastq_sequences_index[seq_ID].format("fasta").\ | 251 R2_sequence = R2fastq_sequences_index[seq_ID].format("fasta").\ |
269 replace("\n","\t",1).replace("\n","") | 252 replace("\n","\t",1).replace("\n","") |
270 seq_info = "\t" + R1_sequence + "\t" + R2_sequence + "\n" | 253 seq_info = "\t" + R1_sequence + "\t" + R2_sequence + "\n" |
271 # navigate through all different combinations of filter options | 254 # navigate through all different combinations of filter options |
272 # if the primer filter is switched on | 255 # if the primer filter is switched on |
273 if filter_primers == 1: | 256 if args.primers: |
274 # check the occurrences of primers field | 257 # check the occurrences of primers field |
275 if row[5] == "1": | 258 if row[5] == "1": |
276 # if filter occurrences of primers is switched on | 259 # if filter occurrences of primers is switched on |
277 if filter_occurrences == 1: | 260 if args.occurrences: |
278 # check the occurrences of primers field | 261 # check the occurrences of primers field |
279 if (row[15] == "1" and row[16] == "1"): | 262 if (row[15] == "1" and row[16] == "1"): |
280 # if rank by motif is switched on | 263 # if rank by motif is switched on |
281 if filter_rank_motifs == 1: | 264 if args.rankmotifs: |
282 # check for perfect motifs | 265 # check for perfect motifs |
283 if row[1].count('(') == 1: | 266 if row[1].count('(') == 1: |
284 # all 3 filter switched on | 267 # all 3 filter switched on |
285 # write line out to output | 268 # write line out to output |
286 csvfile_outfile.write('\t'.join(row) + \ | 269 csvfile_outfile.write('\t'.join(row) + \ |
287 seq_info) | 270 seq_info) |
288 | |
289 else: | 271 else: |
290 csvfile_outfile.write('\t'.join(row) + seq_info) | 272 csvfile_outfile.write('\t'.join(row) + seq_info) |
291 elif filter_rank_motifs == 1: | 273 elif args.rankmotifs: |
292 if row[1].count('(') == 1: | 274 if row[1].count('(') == 1: |
293 csvfile_outfile.write('\t'.join(row) + seq_info) | 275 csvfile_outfile.write('\t'.join(row) + seq_info) |
294 else: | 276 else: |
295 csvfile_outfile.write('\t'.join(row) + seq_info) | 277 csvfile_outfile.write('\t'.join(row) + seq_info) |
296 elif filter_occurrences == 1: | 278 elif args.occurrences: |
297 if (row[15] == "1" and row[16] == "1"): | 279 if (row[15] == "1" and row[16] == "1"): |
298 if filter_rank_motifs == 1: | 280 if args.rankmotifs: |
299 if row[1].count('(') == 1: | 281 if row[1].count('(') == 1: |
300 csvfile_outfile.write('\t'.join(row) + seq_info) | 282 csvfile_outfile.write('\t'.join(row) + seq_info) |
301 else: | 283 else: |
302 csvfile_outfile.write('\t'.join(row) + seq_info) | 284 csvfile_outfile.write('\t'.join(row) + seq_info) |
303 elif filter_rank_motifs == 1: | 285 elif args.rankmotifs: |
304 if row[1].count('(') == 1: | 286 if row[1].count('(') == 1: |
305 csvfile_outfile.write('\t'.join(row) + seq_info) | 287 csvfile_outfile.write('\t'.join(row) + seq_info) |
306 else: | 288 else: |
307 csvfile_outfile.write('\t'.join(row) + seq_info) | 289 csvfile_outfile.write('\t'.join(row) + seq_info) |
308 | 290 |
309 # if filter_rank_motifs is active, order the file by the size of the motif | 291 # if filter_rank_motifs is active, order the file by the size of the motif |
310 if filter_rank_motifs == 1: | 292 if args.rankmotifs: |
311 rank_motif = [] | 293 rank_motif = [] |
312 ranked_list = [] | 294 ranked_list = [] |
313 # read in the non-ordered file and add every entry to rank_motif list | 295 # read in the non-ordered file and add every entry to rank_motif list |
314 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \ | 296 with open( \ |
315 ".filtered") as csvfile_ranksize: | 297 os.path.splitext(os.path.basename(args.pal_finder))[0] + \ |
298 ".filtered") as csvfile_ranksize: | |
316 csv_rank = csv.reader(csvfile_ranksize, delimiter='\t') | 299 csv_rank = csv.reader(csvfile_ranksize, delimiter='\t') |
317 header = csv_rank.next() | 300 header = csv_rank.next() |
318 for line in csv_rank: | 301 for line in csv_rank: |
319 rank_motif.append(line) | 302 rank_motif.append(line) |
320 | 303 |
321 # open the file ready to write the ordered list | 304 # open the file ready to write the ordered list |
322 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \ | 305 with open( \ |
323 ".filtered", 'w') as rank_outfile: | 306 os.path.splitext(os.path.basename(args.pal_finder))[0] + \ |
307 ".filtered", 'w') as rank_outfile: | |
324 rankwriter = csv.writer(rank_outfile, delimiter='\t', \ | 308 rankwriter = csv.writer(rank_outfile, delimiter='\t', \ |
325 lineterminator='\n') | 309 lineterminator='\n') |
326 rankwriter.writerow(header) | 310 rankwriter.writerow(header) |
327 count = 2 | 311 count = 2 |
328 while count < 10: | 312 while count < 10: |
329 for row in rank_motif: | 313 for row in rank_motif: |
330 # count size of motif | 314 # count size of motif |
331 motif = re.search(r'[ATCG]*',row[1]) | 315 motif = re.search(r'[ATCG]*',row[1]) |
332 if motif: | 316 if motif: |
333 the_motif = motif.group() | 317 the_motif = motif.group() |
334 # rank it and write into ranked_list | 318 # rank it and write into ranked_list |
335 if len(the_motif) == count: | 319 if len(the_motif) == count: |
336 ranked_list.insert(0, row) | 320 ranked_list.insert(0, row) |
337 count = count + 1 | 321 count = count + 1 |
338 # write out the ordered list, into the .filtered file | 322 # write out the ordered list, into the .filtered file |
339 for row in ranked_list: | 323 for row in ranked_list: |
340 rankwriter.writerow(row) | 324 rankwriter.writerow(row) |
341 | 325 |
342 print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" | 326 print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" |
343 print "Checking assembly flags supplied:" | 327 print "Checking assembly flags supplied:" |
344 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" | 328 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" |
345 if perform_assembly == 0: | 329 if not args.assembly: |
346 print "Assembly flag not supplied. Not performing assembly QC.\n" | 330 print "Assembly flag not supplied. Not performing assembly QC.\n" |
347 if perform_assembly == 1: | 331 if args.assembly: |
348 print "-assembly flag supplied: Performing PandaSeq assembly quality checks." | 332 print "-assembly flag supplied: Perform PandaSeq assembly quality checks." |
349 print "See Fox et al. (currently unpublished) for full details on the"\ | 333 print "See Fox et al. (currently unpublished) for full details on the"\ |
350 " quality-check process.\n" | 334 " quality-check process.\n" |
351 time.sleep(5) | 335 time.sleep(5) |
352 | 336 |
353 # Get readID, F primers, R primers and motifs from filtered pal_finder output | 337 # Get readID, F primers, R primers and motifs from filtered pal_finder output |
354 seqIDs = [] | 338 seqIDs = [] |
355 motif = [] | 339 motif = [] |
356 F_primers = [] | 340 F_primers = [] |
357 R_primers = [] | 341 R_primers = [] |
358 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \ | 342 with open( \ |
359 ".filtered") as input_csv: | 343 os.path.splitext(os.path.basename(args.pal_finder))[0] + \ |
344 ".filtered") as input_csv: | |
360 pal_finder_csv = csv.reader(input_csv, delimiter='\t') | 345 pal_finder_csv = csv.reader(input_csv, delimiter='\t') |
361 header = pal_finder_csv.next() | 346 header = pal_finder_csv.next() |
362 for row in pal_finder_csv: | 347 for row in pal_finder_csv: |
363 seqIDs.append(row[0]) | 348 seqIDs.append(row[0]) |
364 motif.append(row[1]) | 349 motif.append(row[1]) |
367 | 352 |
368 # Get a list of just the unique IDs we want | 353 # Get a list of just the unique IDs we want |
369 wanted = set() | 354 wanted = set() |
370 for line in seqIDs: | 355 for line in seqIDs: |
371 wanted.add(line) | 356 wanted.add(line) |
372 | 357 """ |
373 # Assemble the paired end reads into overlapping contigs using PandaSeq | 358 Assemble the paired end reads into overlapping contigs using PandaSeq |
374 # (can be skipped with the -a flag if assembly has already been run | 359 (can be skipped with the -a flag if assembly has already been run |
375 # and the appropriate files are in the same directory as the script, | 360 and the appropriate files are in the same directory as the script, |
376 # and named "Assembly.fasta" and "Assembly_adapters_removed.fasta") | 361 and named "Assembly.fasta" and "Assembly_adapters_removed.fasta") |
377 # | 362 |
378 # The first time you run the script you MUST not enable the -a flag, | 363 The first time you run the script you MUST not enable the -a flag, |
379 # but you are able to skip the assembly in subsequent analysis using the | 364 but you are able to skip the assembly in subsequent analysis using the |
380 # -a flag. | 365 -a flag. |
381 | 366 """ |
382 if skip_assembly == 0: | 367 if not args.skip_assembly: |
383 pandaseq_command = 'pandaseq -A pear -f ' + R1_input + ' -r ' + \ | 368 pandaseq_command = 'pandaseq -A pear -f ' + args.input1 + ' -r ' + \ |
384 R2_input + ' -o 25 -t 0.95 -w Assembly.fasta' | 369 args.input2 + ' -o 25 -t 0.95 -w Assembly.fasta' |
385 subprocess.call(pandaseq_command, shell=True) | 370 subprocess.call(pandaseq_command, shell=True) |
386 strip_barcodes("Assembly.fasta") | 371 strip_barcodes("Assembly.fasta", wanted) |
387 print "\nPaired end reads been assembled into overlapping reads." | 372 print "\nPaired end reads been assembled into overlapping reads." |
388 print "\nFor future analysis, you can skip this assembly step using \ | 373 print "\nFor future analysis, you can skip this assembly step using" \ |
389 the -a flag, provided that the assembly.fasta file is \ | 374 " the -a flag, provided that the assembly.fasta file" \ |
390 intact and in the same location." | 375 " is intact and in the same location." |
391 else: | 376 else: |
392 print "\n(Skipping the assembly step as you provided the -a flag)" | 377 print "\n(Skipping the assembly step as you provided the -a flag)" |
393 | 378 """ |
394 # Fastq files need to be converted to fasta. The first time you run the script | 379 Fastq files need to be converted to fasta. The first time you run the script |
395 # you MUST not enable the -c flag, but you are able to skip the conversion | 380 you MUST not enable the -c flag, but you are able to skip the conversion |
396 # later using the -c flag. Make sure the fasta files are in the same location | 381 later using the -c flag. Make sure the fasta files are in the same location |
397 #and do not change the filenames | 382 and do not change the filenames |
398 | 383 """ |
399 if skip_conversion ==0: | 384 if not args.skip_conversion: |
400 fastq_to_fasta(R1_input) | 385 fastq_to_fasta(args.input1, wanted) |
401 fastq_to_fasta(R2_input) | 386 fastq_to_fasta(args.input2, wanted) |
402 print "\nThe input fastq files have been converted to the fasta format." | 387 print "\nThe input fastq files have been converted to the fasta format." |
403 print "\nFor any future analysis, you can skip this conversion step \ | 388 print "\nFor any future analysis, you can skip this conversion step" \ |
404 using the -c flag, provided that the fasta files \ | 389 " using the -c flag, provided that the fasta files" \ |
405 are intact and in the same location." | 390 " are intact and in the same location." |
406 else: | 391 else: |
407 print "\n(Skipping the fastq -> fasta conversion as you provided the" \ | 392 print "\n(Skipping the fastq -> fasta conversion as you provided the" \ |
408 " -c flag).\n" | 393 " -c flag).\n" |
409 | 394 |
410 # get the files and everything else we will need | 395 # get the files and everything else needed |
411 | 396 # Assembled fasta file |
412 # Assembled fasta file | |
413 assembly_file = "Assembly_adapters_removed.fasta" | 397 assembly_file = "Assembly_adapters_removed.fasta" |
414 # filtered R1 reads | 398 # filtered R1 reads |
415 R1_fasta = os.path.splitext(os.path.basename(R1_input))[0] + \ | 399 R1_fasta = os.path.splitext( \ |
416 "_filtered.fasta" | 400 os.path.basename(args.input1))[0] + "_filtered.fasta" |
417 # filtered R2 reads | 401 # filtered R2 reads |
418 R2_fasta = os.path.splitext(os.path.basename(R2_input))[0] + \ | 402 R2_fasta = os.path.splitext( \ |
419 "_filtered.fasta" | 403 os.path.basename(args.input2))[0] + "_filtered.fasta" |
420 | 404 outputfilename = os.path.splitext(os.path.basename(args.input1))[0] |
421 outputfilename = os.path.splitext(os.path.basename(R1_input))[0] | 405 # parse the files with SeqIO |
422 | |
423 # parse the files with SeqIO | |
424 assembly_sequences = SeqIO.parse(assembly_file,'fasta') | 406 assembly_sequences = SeqIO.parse(assembly_file,'fasta') |
425 R1fasta_sequences = SeqIO.parse(R1_fasta,'fasta') | 407 R1fasta_sequences = SeqIO.parse(R1_fasta,'fasta') |
426 | 408 |
427 # create some empty lists to hold the ID tags we are interested in | 409 # create some empty lists to hold the ID tags we are interested in |
428 assembly_IDs = [] | 410 assembly_IDs = [] |
429 fasta_IDs = [] | 411 fasta_IDs = [] |
430 | 412 |
431 # populate the above lists with sequence IDs | 413 # populate the above lists with sequence IDs |
432 for sequence in assembly_sequences: | 414 for sequence in assembly_sequences: |
433 assembly_IDs.append(sequence.id) | 415 assembly_IDs.append(sequence.id) |
434 for sequence in R1fasta_sequences: | 416 for sequence in R1fasta_sequences: |
435 fasta_IDs.append(sequence.id) | 417 fasta_IDs.append(sequence.id) |
436 | 418 |
437 # Index the assembly fasta file | 419 # Index the assembly fasta file |
438 assembly_sequences_index = SeqIO.index(assembly_file,'fasta') | 420 assembly_sequences_index = SeqIO.index(assembly_file,'fasta') |
439 R1fasta_sequences_index = SeqIO.index(R1_fasta,'fasta') | 421 R1fasta_sequences_index = SeqIO.index(R1_fasta,'fasta') |
440 R2fasta_sequences_index = SeqIO.index(R2_fasta,'fasta') | 422 R2fasta_sequences_index = SeqIO.index(R2_fasta,'fasta') |
441 | 423 |
442 # prepare the output file | 424 # prepare the output file |
443 with open (outputfilename + "_pal_filter_assembly_output.txt", 'w') \ | 425 with open ( \ |
444 as outputfile: | 426 outputfilename + "_pal_filter_assembly_output.txt", 'w') \ |
445 # write the headers for the output file | 427 as outputfile: |
446 outputfile.write("readPairID\t Forward Primer\t F Primer Position in " | 428 # write the headers for the output file |
447 "Assembled Read\t Reverse Primer\t R Primer Position in " | 429 output_header = ("readPairID", \ |
448 "Assembled Read\t Motifs(bases)\t Assembled Read ID\t " | 430 "Forward Primer",\ |
449 "Assembled Read Sequence\t Raw Forward Read ID\t Raw " | 431 "F Primer Position in Assembled Read", \ |
450 "Forward Read Sequence\t Raw Reverse Read ID\t Raw Reverse " | 432 "Reverse Primer", \ |
451 "Read Sequence\n") | 433 "R Primer Position in Assembled Read", \ |
452 | 434 "Motifs(bases)", \ |
453 # cycle through parameters from the pal_finder output | 435 "Assembled Read ID", \ |
436 "Assembled Read Sequence", \ | |
437 "Raw Forward Read ID", \ | |
438 "Raw Forward Read Sequence", \ | |
439 "Raw Reverse Read ID", \ | |
440 "Raw Reverse Read Sequence\n") | |
441 outputfile.write("\t".join(output_header)) | |
442 | |
443 # cycle through parameters from the pal_finder output | |
454 for x, y, z, a in zip(seqIDs, F_primers, R_primers, motif): | 444 for x, y, z, a in zip(seqIDs, F_primers, R_primers, motif): |
455 if str(x) in assembly_IDs: | 445 if str(x) in assembly_IDs: |
456 # get the raw sequences ready to go into the output file | 446 # get the raw sequences ready to go into the output file |
457 assembly_seq = (assembly_sequences_index.get_raw(x).decode()) | 447 assembly_seq = (assembly_sequences_index.get_raw(x).decode()) |
458 # fasta entries need to be converted to single line so sit nicely in the output | 448 # fasta entries need to be converted to single line so sit |
459 assembly_output = assembly_seq.replace("\n","\t") | 449 # nicely in the output |
450 assembly_output = assembly_seq.replace("\n","\t").strip('\t') | |
460 R1_fasta_seq = (R1fasta_sequences_index.get_raw(x).decode()) | 451 R1_fasta_seq = (R1fasta_sequences_index.get_raw(x).decode()) |
461 R1_output = R1_fasta_seq.replace("\n","\t",1).replace("\n","") | 452 R1_output = R1_fasta_seq.replace("\n","\t",1).replace("\n","") |
462 R2_fasta_seq = (R2fasta_sequences_index.get_raw(x).decode()) | 453 R2_fasta_seq = (R2fasta_sequences_index.get_raw(x).decode()) |
463 R2_output = R2_fasta_seq.replace("\n","\t",1).replace("\n","") | 454 R2_output = R2_fasta_seq.replace("\n","\t",1).replace("\n","") |
464 assembly_no_id = '\n'.join(assembly_seq.split('\n')[1:]) | 455 assembly_no_id = '\n'.join(assembly_seq.split('\n')[1:]) |
465 | 456 |
466 # check that both primer sequences can be seen in the assembled contig | 457 # check that both primer sequences can be seen in the |
458 # assembled contig | |
467 if y or ReverseComplement1(y) in assembly_no_id and z or \ | 459 if y or ReverseComplement1(y) in assembly_no_id and z or \ |
468 ReverseComplement1(z) in assembly_no_id: | 460 ReverseComplement1(z) in assembly_no_id: |
469 if y in assembly_no_id: | 461 if y in assembly_no_id: |
470 # get the positions of the primers in the assembly to predict fragment length | 462 # get the positions of the primers in the assembly |
463 # (can be used to predict fragment length) | |
471 F_position = assembly_no_id.index(y)+len(y)+1 | 464 F_position = assembly_no_id.index(y)+len(y)+1 |
472 if ReverseComplement1(y) in assembly_no_id: | 465 if ReverseComplement1(y) in assembly_no_id: |
473 F_position = assembly_no_id.index(ReverseComplement1(y)\ | 466 F_position = assembly_no_id.index( \ |
474 )+len(ReverseComplement1(y))+1 | 467 ReverseComplement1(y))+len(ReverseComplement1(y))+1 |
475 if z in assembly_no_id: | 468 if z in assembly_no_id: |
476 R_position = assembly_no_id.index(z)+1 | 469 R_position = assembly_no_id.index(z)+1 |
477 if ReverseComplement1(z) in assembly_no_id: | 470 if ReverseComplement1(z) in assembly_no_id: |
478 R_position = assembly_no_id.index(ReverseComplement1(z)\ | 471 R_position = assembly_no_id.index( \ |
479 )+1 | 472 ReverseComplement1(z))+1 |
480 | 473 output = (str(x), |
481 # write everything out into the output file | 474 str(y), |
482 output = (str(x) + "\t" + y + "\t" + str(F_position) \ | 475 str(F_position), |
483 + "\t" + (z) + "\t" + str(R_position) \ | 476 str(z), |
484 + "\t" + a + "\t" + assembly_output \ | 477 str(R_position), |
485 + R1_output + "\t" + R2_output + "\n") | 478 str(a), |
486 outputfile.write(output) | 479 str(assembly_output), |
480 str(R1_output), | |
481 str(R2_output + "\n")) | |
482 outputfile.write("\t".join(output)) | |
487 print "\nPANDAseq quality check complete." | 483 print "\nPANDAseq quality check complete." |
488 print "Results from PANDAseq quality check (and filtering, if any" \ | 484 print "Results from PANDAseq quality check (and filtering, if any" \ |
489 " filters enabled) written to output file" \ | 485 " filters enabled) written to output file" \ |
490 " ending \"_pal_filter_assembly_output.txt\".\n\n" | 486 " ending \"_pal_filter_assembly_output.txt\".\n" |
491 | |
492 print "Filtering of pal_finder results complete." | 487 print "Filtering of pal_finder results complete." |
493 print "Filtered results written to output file ending \".filtered\"." | 488 print "Filtered results written to output file ending \".filtered\"." |
494 print "\nFinished\n" | 489 print "\nFinished\n" |
495 else: | 490 else: |
496 if (skip_assembly == 1 or skip_conversion == 1): | 491 if args.skip_assembly or args.skip_conversion: |
497 print "\nERROR: You cannot supply the -a flag or the -c flag without \ | 492 print "\nERROR: You cannot supply the -a flag or the -c flag without \ |
498 also supplying the -assembly flag.\n" | 493 also supplying the -assembly flag.\n" |
499 | |
500 print "\nProgram Finished\n" | 494 print "\nProgram Finished\n" |