annotate pal_filter.py @ 3:e1a14ed7a9d6 draft

Updated to version 0.02.04.4 (new pal_filter script)
author pjbriggs
date Wed, 24 Feb 2016 08:25:17 -0500
parents
children cb56cc1d5c39
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
1 #!/usr/bin/python -tt
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
2 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
3 # pal_filter
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
4 # https://github.com/graemefox/pal_filter
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
5 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
6 ################################################################################
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
7 # Graeme Fox - 15/02/2016 - graeme.fox@manchester.ac.uk
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
8 # Tested on 64-bit Ubuntu, with Python 2.7
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
9 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
10 ################################################################################
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
11 # PROGRAM DESCRIPTION
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
12 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
13 # Program to pick optimum loci from the output of pal_finder_v0.02.04
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
14 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
15 # This program can be used to filter output from pal_finder and choose the
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
16 # 'optimum' loci.
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
17 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
18 # For the paper referncing this workflow, see Griffiths et al.
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
19 # (unpublished as of 15/02/2016) (sarah.griffiths-5@postgrad.manchester.ac.uk)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
20 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
21 ################################################################################
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
22 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
23 # This program also contains a quality-check method to improve the rate of PCR
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
24 # success. For this QC method, paired end reads are assembled using
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
25 # PANDAseq so you must have PANDAseq installed.
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
26 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
27 # For the paper referencing this assembly-QC method see Fox et al.
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
28 # (unpublished as of 15/02/2016) (graeme.fox@manchester.ac.uk)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
29 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
30 # For best results in PCR for marker development, I suggest enabling all the
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
31 # filter options AND the assembly based QC
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
32 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
33 ################################################################################
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
34 # REQUIREMENTS
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
35 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
36 # Must have Biopython installed (www.biopython.org).
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
37 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
38 # If you with to perform the assembly QC step, you must have:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
39 # PandaSeq (https://github.com/neufeld/pandaseq)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
40 # PandaSeq must be in your $PATH / able to run from anywhere
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
41 ################################################################################
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
42 # REQUIRED OPTIONS
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
43 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
44 # -i forward_paired_ends.fastQ
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
45 # -j reverse_paired_ends.fastQ
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
46 # -p pal_finder output - the "(microsatellites with read IDs and
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
47 # primer pairs)" file
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
48 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
49 # By default the program does nothing....
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
50 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
51 # NON-REQUIRED OPTIONS
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
52 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
53 # -assembly: turn on the pandaseq assembly QC step
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
54 # -primers: filter microsatellite loci to just those which
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
55 # have primers designed
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
56 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
57 # -occurrences: filter microsatellite loci to those with primers
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
58 # which appear only once in the dataset
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
59 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
60 # -rankmotifs: filter microsatellite loci to just those with perfect motifs.
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
61 # Rank the output by size of motif (largest first)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
62 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
63 ###########################################################
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
64 # For repeat analysis, the following extra non-required options may be useful:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
65 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
66 # Since PandaSeq Assembly, and fastq -> fasta conversion are slow, do them the
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
67 # first time, generate the files and then skip either, or both steps with
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
68 # the following:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
69 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
70 # -a: skip assembly step
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
71 # -c: skip fastq -> fasta conversion step
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
72 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
73 # Just make sure to keep the assembled/converted files in the correct directory
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
74 # with the correct filename(s)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
75 ###########################################################
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
76 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
77 # EXAMPLE USAGE:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
78 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
79 # pal_filtery.py -i R1.fastq -j R2.fastq
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
80 # -p pal_finder_output.tabular -primers -occurrences -rankmotifs -assembly
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
81 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
82 ###########################################################
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
83 import Bio, subprocess, argparse, csv, os, re, time
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
84 from Bio import SeqIO
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
85
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
86 # Get values for all the required and optional arguments
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
87
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
88 parser = argparse.ArgumentParser(description='pal_filter')
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
89 parser.add_argument('-i','--input1', help='Forward paired-end fastq file', \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
90 required=True)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
91
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
92 parser.add_argument('-j','--input2', help='Reverse paired-end fastq file', \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
93 required=True)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
94
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
95 parser.add_argument('-p','--pal_finder', help='Output from pal_finder ', \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
96 required=True)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
97
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
98 parser.add_argument('-assembly','--assembly_QC', help='Perform the PandaSeq \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
99 based QC', nargs='?', const=1, type=int, required=False)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
100
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
101 parser.add_argument('-a','--skip_assembly', help='If the assembly has already \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
102 been run, skip it with -a', nargs='?', const=1, \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
103 type=int, required=False)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
104
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
105 parser.add_argument('-c','--skip_conversion', help='If the fastq to fasta \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
106 conversion has already been run, skip it with -c', \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
107 nargs='?', const=1, type=int, required=False)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
108
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
109 parser.add_argument('-primers','--filter_by_primer_column', help='Filter \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
110 pal_finder output to just those loci which have primers \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
111 designed', nargs='?', const=1, type=int, required=False)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
112
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
113 parser.add_argument('-occurrences','--filter_by_occurrences_column', \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
114 help='Filter pal_finder output to just loci with primers \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
115 which only occur once in the dataset', nargs='?', \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
116 const=1, type=int, required=False)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
117
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
118 parser.add_argument('-rankmotifs','--filter_and_rank_by_motif_size', \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
119 help='Filter pal_finder output to just loci which are a \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
120 perfect repeat unit. Also, rank the loci by motif size \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
121 (largest first)', nargs='?', const=1, type=int, \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
122 required=False)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
123
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
124 args = vars(parser.parse_args())
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
125
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
126 # parse arguments
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
127
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
128 R1_input = args['input1'];
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
129 R2_input = args['input2'];
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
130 pal_finder_output = args['pal_finder'];
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
131 perform_assembly = args['assembly_QC']
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
132 skip_assembly = args['skip_assembly'];
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
133 skip_conversion = args['skip_conversion'];
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
134 filter_primers = args['filter_by_primer_column'];
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
135 filter_occurrences = args['filter_by_occurrences_column'];
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
136 filter_rank_motifs = args['filter_and_rank_by_motif_size'];
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
137
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
138 # set default values for arguments
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
139 if perform_assembly is None:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
140 perform_assembly = 0
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
141 if skip_assembly is None:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
142 skip_assembly = 0
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
143 if skip_conversion is None:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
144 skip_conversion = 0
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
145 if filter_primers is None:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
146 filter_primers = 0
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
147 if filter_occurrences is None:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
148 filter_occurrences = 0
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
149 if filter_rank_motifs is None:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
150 filter_rank_motifs = 0
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
151
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
152 ############################################################
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
153 # Function List #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
154 ############################################################
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
155
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
156 # Reverse complement a sequence
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
157
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
158 def ReverseComplement1(seq):
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
159 seq_dict = {'A':'T','T':'A','G':'C','C':'G'}
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
160 return "".join([seq_dict[base] for base in reversed(seq)])
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
161
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
162 # Convert a .fastq to a .fasta, filter to just lines we want
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
163 # and strip MiSeq barcodes
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
164
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
165 def fastq_to_fasta(file):
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
166 file_name = os.path.splitext(os.path.basename(file))[0]
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
167 with open(file_name + "_filtered.fasta", "w") as out:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
168 for record in SeqIO.parse(file, "fastq"):
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
169 ID = str(record.id)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
170 SEQ = str(record.seq)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
171 if ID in wanted:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
172 out.write(">" + ID + "\n" + SEQ + "\n")
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
173
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
174 # strip the miseq barcodes from a fasta file
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
175
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
176 def strip_barcodes(file):
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
177 file_name = os.path.splitext(os.path.basename(file))[0]
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
178 with open(file_name + "_adapters_removed.fasta", "w") as out:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
179 for record in SeqIO.parse(file, "fasta"):
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
180 match = re.search(r'\S*:', record.id)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
181 if match:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
182 correct = match.group().rstrip(":")
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
183 else:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
184 correct = str(record.id)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
185 SEQ = str(record.seq)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
186 if correct in wanted:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
187 out.write(">" + correct + "\n" + SEQ + "\n")
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
188
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
189 ############################################################
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
190 # MAIN PROGRAM #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
191 ############################################################
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
192 print "\n~~~~~~~~~~"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
193 print "pal_filter"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
194 print "~~~~~~~~~~\n"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
195 time.sleep(1)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
196 print "\"Find the optimum loci in your pal_finder output and increase "\
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
197 "the rate of successful microsatellite marker development\""
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
198 print "\nSee Griffiths et al. (currently unpublished) for more details......"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
199 time.sleep(2)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
200
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
201 if (perform_assembly == 0 and filter_primers == 0 and filter_occurrences == 0 \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
202 and filter_rank_motifs == 0):
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
203 print "\nNo optional arguments supplied."
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
204 print "\nBy default this program does nothing."
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
205 print "\nNo files produced and no modifications made."
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
206 print "\nFinished.\n"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
207 exit()
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
208
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
209 else:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
210 print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
211 print "Checking supplied filtering parameters:"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
212 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
213 time.sleep(2)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
214 if filter_primers == 1:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
215 print "-primers flag supplied."
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
216 print "Filtering pal_finder output on the \"Primers found (1=y,0=n)\"" \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
217 " column."
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
218 print "Only rows where primers have successfully been designed will"\
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
219 " pass the filter.\n"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
220 time.sleep(2)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
221 if filter_occurrences == 1:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
222 print "-occurrences flag supplied."
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
223 print "Filtering pal_finder output on the \"Occurences of Forward" \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
224 " Primer in Reads\" and \"Occurences of Reverse Primer" \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
225 " in Reads\" columns."
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
226 print "Only rows where both primers occur only a single time in the"\
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
227 " reads pass the filter.\n"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
228 time.sleep(2)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
229 if filter_rank_motifs == 1:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
230 print "-rankmotifs flag supplied."
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
231 print "Filtering pal_finder output on the \"Motifs(bases)\" column to" \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
232 " just those with perfect repeats."
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
233 print "Only rows containing 'perfect' repeats will pass the filter."
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
234 print "Also, ranking output by size of motif (largest first).\n"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
235 time.sleep(2)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
236 # index the raw fastq files so that the sequences can be pulled out and
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
237 # added to the filtered output file
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
238
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
239 # Indexing input sequence files
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
240 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
241 print "Indexing FastQ files....."
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
242 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
243 R1fastq_sequences_index = SeqIO.index(R1_input,'fastq')
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
244 R2fastq_sequences_index = SeqIO.index(R2_input,'fastq')
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
245 print "Indexing complete."
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
246
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
247 # create a set to hold the filtered output
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
248 wanted_lines = set()
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
249
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
250 # get lines from the pal_finder output which meet filter settings
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
251
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
252 # read the pal_finder output file into a csv reader
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
253 with open (pal_finder_output) as csvfile_infile:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
254 csv_f = csv.reader(csvfile_infile, delimiter='\t')
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
255 header = csv_f.next()
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
256 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
257 ".filtered", 'w') as csvfile_outfile:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
258 # write the header line for the output file
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
259 csvfile_outfile.write('\t'.join(header) + "\tR1_Sequence_ID\t"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
260 "R1_Sequence\tR2_Sequence_ID\tR2_Sequence\n")
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
261 for row in csv_f:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
262 # get the sequence ID
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
263 seq_ID = row[0]
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
264 # get the raw sequence reads and convert to a format that can
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
265 # go into a tsv file
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
266 R1_sequence = R1fastq_sequences_index[seq_ID].format("fasta").\
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
267 replace("\n","\t",1).replace("\n","")
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
268 R2_sequence = R2fastq_sequences_index[seq_ID].format("fasta").\
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
269 replace("\n","\t",1).replace("\n","")
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
270 seq_info = "\t" + R1_sequence + "\t" + R2_sequence + "\n"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
271 # navigate through all different combinations of filter options
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
272 # if the primer filter is switched on
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
273 if filter_primers == 1:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
274 # check the occurrences of primers field
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
275 if row[5] == "1":
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
276 # if filter occurrences of primers is switched on
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
277 if filter_occurrences == 1:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
278 # check the occurrences of primers field
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
279 if (row[15] == "1" and row[16] == "1"):
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
280 # if rank by motif is switched on
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
281 if filter_rank_motifs == 1:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
282 # check for perfect motifs
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
283 if row[1].count('(') == 1:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
284 # all 3 filter switched on
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
285 # write line out to output
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
286 csvfile_outfile.write('\t'.join(row) + \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
287 seq_info)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
288
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
289 else:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
290 csvfile_outfile.write('\t'.join(row) + seq_info)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
291 elif filter_rank_motifs == 1:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
292 if row[1].count('(') == 1:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
293 csvfile_outfile.write('\t'.join(row) + seq_info)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
294 else:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
295 csvfile_outfile.write('\t'.join(row) + seq_info)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
296 elif filter_occurrences == 1:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
297 if (row[15] == "1" and row[16] == "1"):
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
298 if filter_rank_motifs == 1:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
299 if row[1].count('(') == 1:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
300 csvfile_outfile.write('\t'.join(row) + seq_info)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
301 else:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
302 csvfile_outfile.write('\t'.join(row) + seq_info)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
303 elif filter_rank_motifs == 1:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
304 if row[1].count('(') == 1:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
305 csvfile_outfile.write('\t'.join(row) + seq_info)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
306 else:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
307 csvfile_outfile.write('\t'.join(row) + seq_info)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
308
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
309 # if filter_rank_motifs is active, order the file by the size of the motif
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
310 if filter_rank_motifs == 1:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
311 rank_motif = []
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
312 ranked_list = []
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
313 # read in the non-ordered file and add every entry to rank_motif list
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
314 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
315 ".filtered") as csvfile_ranksize:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
316 csv_rank = csv.reader(csvfile_ranksize, delimiter='\t')
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
317 header = csv_rank.next()
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
318 for line in csv_rank:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
319 rank_motif.append(line)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
320
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
321 # open the file ready to write the ordered list
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
322 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
323 ".filtered", 'w') as rank_outfile:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
324 rankwriter = csv.writer(rank_outfile, delimiter='\t', \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
325 lineterminator='\n')
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
326 rankwriter.writerow(header)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
327 count = 2
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
328 while count < 10:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
329 for row in rank_motif:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
330 # count size of motif
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
331 motif = re.search(r'[ATCG]*',row[1])
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
332 if motif:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
333 the_motif = motif.group()
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
334 # rank it and write into ranked_list
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
335 if len(the_motif) == count:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
336 ranked_list.insert(0, row)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
337 count = count + 1
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
338 # write out the ordered list, into the .filtered file
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
339 for row in ranked_list:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
340 rankwriter.writerow(row)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
341
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
342 print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
343 print "Checking assembly flags supplied:"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
344 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
345 if perform_assembly == 0:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
346 print "Assembly flag not supplied. Not performing assembly QC.\n"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
347 if perform_assembly == 1:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
348 print "-assembly flag supplied: Performing PandaSeq assembly quality checks."
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
349 print "See Fox et al. (currently unpublished) for full details on the"\
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
350 " quality-check process.\n"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
351 time.sleep(5)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
352
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
353 # Get readID, F primers, R primers and motifs from filtered pal_finder output
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
354 seqIDs = []
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
355 motif = []
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
356 F_primers = []
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
357 R_primers = []
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
358 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
359 ".filtered") as input_csv:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
360 pal_finder_csv = csv.reader(input_csv, delimiter='\t')
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
361 header = pal_finder_csv.next()
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
362 for row in pal_finder_csv:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
363 seqIDs.append(row[0])
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
364 motif.append(row[1])
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
365 F_primers.append(row[7])
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
366 R_primers.append(row[9])
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
367
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
368 # Get a list of just the unique IDs we want
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
369 wanted = set()
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
370 for line in seqIDs:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
371 wanted.add(line)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
372
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
373 # Assemble the paired end reads into overlapping contigs using PandaSeq
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
374 # (can be skipped with the -a flag if assembly has already been run
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
375 # and the appropriate files are in the same directory as the script,
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
376 # and named "Assembly.fasta" and "Assembly_adapters_removed.fasta")
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
377 #
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
378 # The first time you riun the script you MUST not enable the -a flag.t
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
379 # but you are able to skip the assembly in subsequent analysis using the
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
380 # -a flag.
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
381
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
382 if skip_assembly == 0:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
383 pandaseq_command = 'pandaseq -A pear -f ' + R1_input + ' -r ' + \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
384 R2_input + ' -o 25 -t 0.95 -w Assembly.fasta'
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
385 subprocess.call(pandaseq_command, shell=True)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
386 strip_barcodes("Assembly.fasta")
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
387 print "\nPaired end reads been assembled into overlapping reads."
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
388 print "\nFor future analysis, you can skip this assembly step using \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
389 the -a flag, provided that the assembly.fasta file is \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
390 intact and in the same location."
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
391 else:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
392 print "\n(Skipping the assembly step as you provided the -a flag)"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
393
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
394 # Fastq files need to be converted to fasta. The first time you run the script
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
395 # you MUST not enable the -c flag, but you are able to skip the conversion
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
396 # later using the -c flag. Make sure the fasta files are in the same location
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
397 #and do not change the filenames
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
398
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
399 if skip_conversion ==0:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
400 fastq_to_fasta(R1_input)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
401 fastq_to_fasta(R2_input)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
402 print "\nThe input fastq files have been converted to the fasta format."
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
403 print "\nFor any future analysis, you can skip this conversion step \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
404 using the -c flag, provided that the fasta files \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
405 are intact and in the same location."
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
406 else:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
407 print "\n(Skipping the fastq -> fasta conversion as you provided the" \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
408 " -c flag).\n"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
409
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
410 # get the files and everything else we will need
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
411
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
412 # Assembled fasta file
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
413 assembly_file = "Assembly_adapters_removed.fasta"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
414 # filtered R1 reads
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
415 R1_fasta = os.path.splitext(os.path.basename(R1_input))[0] + \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
416 "_filtered.fasta"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
417 # filtered R2 reads
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
418 R2_fasta = os.path.splitext(os.path.basename(R2_input))[0] + \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
419 "_filtered.fasta"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
420
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
421 outputfilename = os.path.splitext(os.path.basename(R1_input))[0]
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
422
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
423 # parse the files with SeqIO
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
424 assembly_sequences = SeqIO.parse(assembly_file,'fasta')
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
425 R1fasta_sequences = SeqIO.parse(R1_fasta,'fasta')
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
426
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
427 # create some empty lists to hold the ID tags we are interested in
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
428 assembly_IDs = []
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
429 fasta_IDs = []
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
430
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
431 # populate the above lists with sequence IDs
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
432 for sequence in assembly_sequences:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
433 assembly_IDs.append(sequence.id)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
434 for sequence in R1fasta_sequences:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
435 fasta_IDs.append(sequence.id)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
436
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
437 # Index the assembly fasta file
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
438 assembly_sequences_index = SeqIO.index(assembly_file,'fasta')
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
439 R1fasta_sequences_index = SeqIO.index(R1_fasta,'fasta')
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
440 R2fasta_sequences_index = SeqIO.index(R2_fasta,'fasta')
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
441
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
442 # prepare the output file
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
443 with open (outputfilename + "_pal_filter_assembly_output.txt", 'w') \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
444 as outputfile:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
445 # write the headers for the output file
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
446 outputfile.write("readPairID\t Forward Primer\t F Primer Position in "
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
447 "Assembled Read\t Reverse Primer\t R Primer Position in "
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
448 "Assembled Read\t Motifs(bases)\t Assembled Read ID\t "
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
449 "Assembled Read Sequence\t Raw Forward Read ID\t Raw "
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
450 "Forward Read Sequence\t Raw Reverse Read ID\t Raw Reverse "
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
451 "Read Sequence\n")
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
452
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
453 # cycle through parameters from the pal_finder output
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
454 for x, y, z, a in zip(seqIDs, F_primers, R_primers, motif):
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
455 if str(x) in assembly_IDs:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
456 # get the raw sequences ready to go into the output file
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
457 assembly_seq = (assembly_sequences_index.get_raw(x).decode())
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
458 # fasta entries need to be converted to single line so sit nicely in the output
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
459 assembly_output = assembly_seq.replace("\n","\t")
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
460 R1_fasta_seq = (R1fasta_sequences_index.get_raw(x).decode())
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
461 R1_output = R1_fasta_seq.replace("\n","\t",1).replace("\n","")
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
462 R2_fasta_seq = (R2fasta_sequences_index.get_raw(x).decode())
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
463 R2_output = R2_fasta_seq.replace("\n","\t",1).replace("\n","")
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
464 assembly_no_id = '\n'.join(assembly_seq.split('\n')[1:])
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
465
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
466 # check that both primer sequences can be seen in the assembled contig
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
467 if y or ReverseComplement1(y) in assembly_no_id and z or \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
468 ReverseComplement1(z) in assembly_no_id:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
469 if y in assembly_no_id:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
470 # get the positions of the primers in the assembly to predict fragment length
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
471 F_position = assembly_no_id.index(y)+len(y)+1
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
472 if ReverseComplement1(y) in assembly_no_id:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
473 F_position = assembly_no_id.index(ReverseComplement1(y)\
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
474 )+len(ReverseComplement1(y))+1
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
475 if z in assembly_no_id:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
476 R_position = assembly_no_id.index(z)+1
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
477 if ReverseComplement1(z) in assembly_no_id:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
478 R_position = assembly_no_id.index(ReverseComplement1(z)\
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
479 )+1
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
480
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
481 # write everything out into the output file
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
482 output = (str(x) + "\t" + y + "\t" + str(F_position) \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
483 + "\t" + (z) + "\t" + str(R_position) \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
484 + "\t" + a + "\t" + assembly_output \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
485 + R1_output + "\t" + R2_output + "\n")
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
486 outputfile.write(output)
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
487 print "\nPANDAseq quality check complete."
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
488 print "Results from PANDAseq quality check (and filtering, if any" \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
489 " any filters enabled) written to output file" \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
490 " ending \"_pal_filter_assembly_output.txt\".\n\n"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
491
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
492 print "Filtering of pal_finder results complete."
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
493 print "Filtered results written to output file ending \".filtered\"."
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
494 print "\nFinished\n"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
495 else:
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
496 if (skip_assembly == 1 or skip_conversion == 1):
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
497 print "\nERROR: You cannot supply the -a flag or the -c flag without \
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
498 also supplying the -assembly flag.\n"
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
499
e1a14ed7a9d6 Updated to version 0.02.04.4 (new pal_filter script)
pjbriggs
parents:
diff changeset
500 print "\nProgram Finished\n"