annotate pal_finder_filter_and_assembly.py @ 2:b6ccc7dd7b02 draft

Version 0.02.04.3.
author pjbriggs
date Fri, 04 Dec 2015 07:43:30 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
1 #!/usr/bin/python -tt
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
2
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
3 ###########################################################
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
4 # Graeme Fox - 26/09/2015 - graeme.fox@manchester.ac.uk
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
5 # Tested on (L)ubuntu 15.04 only, with Python 2.7
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
6 ###########################################################
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
7 # PROGRAM DESCRIPTION
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
8 # Program to pick optimum loci from the output of pal_finder_v0.02.04
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
9 # and to predict the fragment length.
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
10 #
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
11 # This program can be used to filter pal_finder output to choose the 'optimum' loci.
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
12 #
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
13 # Additionally this program can assemble the two paired end reads,
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
14 # find the primers within HQ assembly, log their positions
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
15 # and calculate the difference to give the fragment length.
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
16 #
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
17 # For best results in your PCR, I suggest doing both.
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
18 #
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
19 ###########################################################
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
20 # Requirements:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
21 # Must have Biopython (www.biopython.org).
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
22 #
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
23 # If you with to perform the assembly QC step, you must have:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
24 # PandaSeq (https://github.com/neufeld/pandaseq)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
25 # PandaSeq must be in your path / able to run from anywhere
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
26 ###########################################################
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
27 # Required options:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
28 # -i forward_paired_ends.fastQ
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
29 # -j reverse_paired_ends.fastQ
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
30 # -p pal_finder output - the "(microsatellites with read IDs and primer pairs)" file
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
31 #
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
32 # By default it does nothing.
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
33 #
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
34 # Non-required options:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
35 # -assembly: turn on the pandaseq assembly QC step
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
36 # -primers: filter microsatellite loci to just those which have primers designed
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
37 # -occurrences: filter microsatellite loci to those with primers which appear only once in the dataset
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
38 # -rankmotifs: filter microsatellite loci to just those with perfect motifs. Rank the output by size of motif (largest first)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
39 #
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
40 ###########################################################
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
41 # For repeat analysis, the following extra non-required options may be useful:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
42 # PandaSeq Assembly, and fastq -> fasta conversion are slow.
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
43 #
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
44 # Do them the first time, generate the files and then skip either, or both steps with the following:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
45 # -a: skip assembly step
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
46 # -c: skip fastq -> fasta conversion step
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
47 #
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
48 # # Just make sure to keep the files in the correct directory with the correct filename
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
49 ###########################################################
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
50 #
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
51 # Example usage:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
52 # pal_finder_filter_and_assembly.py -i R1.fastq -j R2.fastq -p pal_finder_output_(microsatellites with read IDs and primer pairs).tabular -primers -occurrences -rankmotifs -assembly
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
53 #
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
54 ###########################################################
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
55 import Bio, subprocess, argparse, csv, os, re, time
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
56 from Bio import SeqIO
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
57
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
58 # Get values for all the required and optional arguments
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
59
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
60 parser = argparse.ArgumentParser(description='Frag_length_finder')
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
61 parser.add_argument('-i','--input1', help='Forward paired-end fastq file', required=True)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
62 parser.add_argument('-j','--input2', help='Reverse paired-end fastq file', required=True)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
63 parser.add_argument('-p','--pal_finder', help='Output from pal_finder ', required=True)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
64 parser.add_argument('-assembly','--assembly_QC', help='Perform the PandaSeq based QC', nargs='?', const=1, type=int, required=False)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
65 parser.add_argument('-a','--skip_assembly', help='If the assembly has already been run, skip it with -a', nargs='?', const=1, type=int, required=False)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
66 parser.add_argument('-c','--skip_conversion', help='If the fastq to fasta conversion has already been run, skip it with -c', nargs='?', const=1, type=int, required=False)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
67 parser.add_argument('-primers','--filter_by_primer_column', help='Filter pal_finder output to just those loci which have primers designed', nargs='?', const=1, type=int, required=False)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
68 parser.add_argument('-occurrences','--filter_by_occurrences_column', help='Filter pal_finder output to just loci with primers which only occur once in the dataset', nargs='?', const=1, type=int, required=False)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
69 parser.add_argument('-rankmotifs','--filter_and_rank_by_motif_size', help='Filter pal_finder output to just loci which are a perfect repeat unit. Also, rank the loci by motif size (largest first)', nargs='?', const=1, type=int, required=False)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
70 args = vars(parser.parse_args())
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
71
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
72 # parse arguments
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
73
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
74 R1_input = args['input1'];
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
75 R2_input = args['input2'];
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
76 pal_finder_output = args['pal_finder'];
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
77 perform_assembly = args['assembly_QC']
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
78 skip_assembly = args['skip_assembly'];
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
79 skip_conversion = args['skip_conversion'];
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
80 filter_primers = args['filter_by_primer_column'];
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
81 filter_occurrences = args['filter_by_occurrences_column'];
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
82 filter_rank_motifs = args['filter_and_rank_by_motif_size'];
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
83
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
84 # set default values for arguments
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
85 if perform_assembly is None:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
86 perform_assembly = 0
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
87 if skip_assembly is None:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
88 skip_assembly = 0
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
89 if skip_conversion is None:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
90 skip_conversion = 0
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
91 if filter_primers is None:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
92 filter_primers = 0
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
93 if filter_occurrences is None:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
94 filter_occurrences = 0
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
95 if filter_rank_motifs is None:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
96 filter_rank_motifs = 0
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
97
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
98 ############################################################
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
99 # Function List #
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
100 ############################################################
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
101 # Reverse complement a sequence
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
102 def ReverseComplement1(seq):
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
103 seq_dict = {'A':'T','T':'A','G':'C','C':'G'}
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
104 return "".join([seq_dict[base] for base in reversed(seq)])
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
105
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
106 # Convert a .fastq to a .fasta, filter to just lines we want and strip MiSeq barcodes
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
107 def fastq_to_fasta(file):
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
108 file_name = os.path.splitext(os.path.basename(file))[0]
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
109 with open(file_name + "_filtered.fasta", "w") as out:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
110 for record in SeqIO.parse(file, "fastq"):
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
111 ID = str(record.id)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
112 SEQ = str(record.seq)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
113 if ID in wanted:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
114 out.write(">" + ID + "\n" + SEQ + "\n")
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
115
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
116 # strip the miseq barcodes from a fasta file
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
117 def strip_barcodes(file):
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
118 file_name = os.path.splitext(os.path.basename(file))[0]
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
119 with open(file_name + "_adapters_removed.fasta", "w") as out:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
120 for record in SeqIO.parse(file, "fasta"):
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
121 match = re.search(r'\S*:', record.id)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
122 if match:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
123 correct = match.group().rstrip(":")
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
124 else:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
125 correct = str(record.id)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
126 SEQ = str(record.seq)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
127 if correct in wanted:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
128 out.write(">" + correct + "\n" + SEQ + "\n")
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
129
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
130 ############################################################
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
131 # MAIN PROGRAM #
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
132 ############################################################
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
133
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
134 if (perform_assembly == 0 and filter_primers == 0 and filter_occurrences == 0 and filter_rank_motifs == 0):
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
135 print "\nNo optional arguments supplied."
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
136 print "\nBy default this program does nothing."
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
137 print "\nNo files produced and no modifications made."
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
138 print "\nFinished.\n"
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
139 exit()
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
140
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
141 else:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
142 print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
143 print "Checking supplied filtering parameters:"
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
144 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
145 if filter_primers == 1:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
146 print "-primers flag supplied. Filtering pal_finder output on the \"Primers found (1=y,0=n)\" column."
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
147 if filter_occurrences == 1:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
148 print "-occurrences flag supplied. Filtering pal_finder output on the \"Occurances of Forward Primer in Reads\" and \"Occurances of Reverse Primer in Reads\" columns."
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
149 if filter_rank_motifs == 1:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
150 print "-rankmotifs flag supplied. Filtering pal_finder output on the \"Motifs(bases)\" column to just those with perfect repeats. Ranking output by size of motif (largest first)."
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
151
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
152 # index the raw fastq files so that the sequences can be pulled out and added to the filtered output file
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
153 R1fastq_sequences_index = SeqIO.index(R1_input,'fastq')
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
154 R2fastq_sequences_index = SeqIO.index(R2_input,'fastq')
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
155
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
156 # create a set to hold the filtered output
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
157 wanted_lines = set()
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
158
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
159 # get lines from the pal_finder output which meet filter settings
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
160
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
161 # read the pal_finder output file into a csv reader
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
162 with open (pal_finder_output) as csvfile_infile:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
163 csv_f = csv.reader(csvfile_infile, delimiter='\t')
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
164 header = csv_f.next()
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
165 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + ".filtered", 'w') as csvfile_outfile:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
166 # write the header line for the output file
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
167 csvfile_outfile.write('\t'.join(header) + "\tR1_Sequence_ID\tR1_Sequence\tR2_Sequence_ID\tR2_Sequence\n")
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
168 for row in csv_f:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
169 # get the sequence ID
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
170 seq_ID = row[0]
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
171 # get the raw sequence reads and convert to a format that can go into a tsv file
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
172 R1_sequence = R1fastq_sequences_index[seq_ID].format("fasta").replace("\n","\t",1).replace("\n","")
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
173 R2_sequence = R2fastq_sequences_index[seq_ID].format("fasta").replace("\n","\t",1).replace("\n","")
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
174 seq_info = "\t" + R1_sequence + "\t" + R2_sequence + "\n"
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
175 # navigate through all different combinations of filter options
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
176 # if the primer filter is switched on
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
177 if filter_primers == 1:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
178 # check the occurrences of primers field
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
179 if row[5] == "1":
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
180 # if filter occurrences of primers is switched on
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
181 if filter_occurrences == 1:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
182 # check the occurrences of primers field
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
183 if (row[15] == "1" and row[16] == "1"):
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
184 # if rank by motif is switched on
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
185 if filter_rank_motifs == 1:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
186 # check for perfect motifs
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
187 if row[1].count('(') == 1:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
188 # all 3 filter switched on - write line out to output
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
189 csvfile_outfile.write('\t'.join(row) + seq_info)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
190
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
191 else:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
192 csvfile_outfile.write('\t'.join(row) + seq_info)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
193 elif filter_rank_motifs == 1:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
194 if row[1].count('(') == 1:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
195 csvfile_outfile.write('\t'.join(row) + seq_info)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
196 else:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
197 csvfile_outfile.write('\t'.join(row) + seq_info)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
198 elif filter_occurrences == 1:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
199 if (row[15] == "1" and row[16] == "1"):
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
200 if filter_rank_motifs == 1:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
201 if row[1].count('(') == 1:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
202 csvfile_outfile.write('\t'.join(row) + seq_info)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
203 else:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
204 csvfile_outfile.write('\t'.join(row) + seq_info)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
205 elif filter_rank_motifs == 1:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
206 if row[1].count('(') == 1:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
207 csvfile_outfile.write('\t'.join(row) + seq_info)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
208 else:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
209 csvfile_outfile.write('\t'.join(row) + seq_info)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
210
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
211 # if filter_rank_motifs is active, order the file by the size of the motif
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
212 if filter_rank_motifs == 1:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
213 rank_motif = []
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
214 ranked_list = []
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
215 # read in the non-ordered file and add every entry to rank_motif list
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
216 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + ".filtered") as csvfile_ranksize:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
217 csv_rank = csv.reader(csvfile_ranksize, delimiter='\t')
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
218 header = csv_rank.next()
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
219 for line in csv_rank:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
220 rank_motif.append(line)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
221
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
222 # open the file ready to write the ordered list
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
223 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + ".filtered", 'w') as rank_outfile:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
224 rankwriter = csv.writer(rank_outfile, delimiter='\t', lineterminator='\n')
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
225 rankwriter.writerow(header)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
226 count = 2
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
227 while count < 10:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
228 for row in rank_motif:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
229 # count size of motif
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
230 motif = re.search(r'[ATCG]*',row[1])
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
231 if motif:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
232 the_motif = motif.group()
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
233 # rank it and write into ranked_list
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
234 if len(the_motif) == count:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
235 ranked_list.insert(0, row)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
236 count = count + 1
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
237 # write out the ordered list, into the .filtered file
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
238 for row in ranked_list:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
239 rankwriter.writerow(row)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
240
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
241 print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
242 print "Checking assembly flags supplied:"
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
243 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
244 if perform_assembly == 0:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
245 print "Assembly flag not supplied. Not performing assembly QC.\n"
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
246 if perform_assembly == 1:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
247 print "-assembly flag supplied: Performing PandaSeq assembly quality checks."
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
248 time.sleep(5)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
249
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
250 # Get readID, F primers, R primers and motifs from the filtered pal_finder output
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
251 seqIDs = []
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
252 motif = []
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
253 F_primers = []
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
254 R_primers = []
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
255 with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + ".filtered") as input_csv:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
256 pal_finder_csv = csv.reader(input_csv, delimiter='\t')
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
257 header = pal_finder_csv.next()
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
258 for row in pal_finder_csv:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
259 seqIDs.append(row[0])
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
260 motif.append(row[1])
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
261 F_primers.append(row[7])
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
262 R_primers.append(row[9])
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
263
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
264 # Get a list of just the unique IDs we want
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
265 wanted = set()
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
266 for line in seqIDs:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
267 wanted.add(line)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
268
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
269 # Assemble the paired end reads into overlapping contigs using PandaSeq
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
270 # (can be skipped with the -a flag if assembly has already been run
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
271 # and the appropriate files are in the same directory as the script,
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
272 # and named "Assembly.fasta" and "Assembly_adapters_removed.fasta")
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
273 #
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
274 # I suggest you run this script WITHOUT -a first, and then use the -a flag in
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
275 # any subsequent reanalysis
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
276
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
277 if skip_assembly == 0:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
278 pandaseq_command = 'pandaseq -A pear -f ' + R1_input + ' -r ' + R2_input + ' -o 25 -t 0.95 -w Assembly.fasta'
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
279 subprocess.call(pandaseq_command, shell=True)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
280 strip_barcodes("Assembly.fasta")
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
281 print "\nPaired end reads been assembled into overlapping reads."
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
282 print "\nFor future analysis, you can skip this assembly step using the -a flag, provided that the assembly.fasta file is intact and in the same location."
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
283 else:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
284 print "\n(Skipping the assembly step as you provided the -a flag)"
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
285
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
286 # FastQ files need to be converted to fasta. Again, I suggest running this section
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
287 # the first time WITHOUT the -c flag and then skipping it later using the -c flag
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
288 # Make sure the fasta files are in the same location and do not change the filenames
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
289
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
290 if skip_conversion ==0:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
291 fastq_to_fasta(R1_input)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
292 fastq_to_fasta(R2_input)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
293 print "\nThe input fastq files have been converted to the fasta format."
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
294 print "\nFor any future analysis, you can skip this conversion step using the -c flag, provided that the fasta files are intact and in the same location."
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
295 else:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
296 print "\n(Skipping the fastq -> fasta conversion as you provided the -c flag)"
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
297
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
298 # get the files and everything else we will need
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
299 assembly_file = "Assembly_adapters_removed.fasta" # Assembled fasta file
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
300 R1_fasta = os.path.splitext(os.path.basename(R1_input))[0] + "_filtered.fasta" # filtered R1 reads
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
301 R2_fasta = os.path.splitext(os.path.basename(R2_input))[0] + "_filtered.fasta" # filtered R2 reads
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
302 outputfilename = os.path.splitext(os.path.basename(R1_input))[0]
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
303
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
304 # parse the files with SeqIO
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
305 assembly_sequences = SeqIO.parse(assembly_file,'fasta')
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
306 R1fasta_sequences = SeqIO.parse(R1_fasta,'fasta')
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
307
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
308 # create some empty lists to hold the ID tags we are interested in
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
309 assembly_IDs = []
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
310 fasta_IDs = []
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
311
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
312 # populate the above lists with sequence IDs
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
313 for sequence in assembly_sequences:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
314 assembly_IDs.append(sequence.id)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
315 for sequence in R1fasta_sequences:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
316 fasta_IDs.append(sequence.id)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
317
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
318 # Index the assembly fasta file
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
319 assembly_sequences_index = SeqIO.index(assembly_file,'fasta')
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
320 R1fasta_sequences_index = SeqIO.index(R1_fasta,'fasta')
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
321 R2fasta_sequences_index = SeqIO.index(R2_fasta,'fasta')
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
322
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
323 # prepare the output file
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
324 with open (outputfilename + "_pal_finder_assembly_output.txt", 'w') as outputfile:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
325 outputfile.write("readPairID\t Forward Primer\t F Primer Position in Assembled Read\t Reverse Primer\t R Primer Position in Assembled Read\t Predicted Amplicon Size (bp)\t Motifs(bases)\t Assembled Read ID\t Assembled Read Sequence\t Raw Forward Read ID\t Raw Forward Read Sequence\t Raw Reverse Read ID\t Raw Reverse Read Sequence\n")
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
326
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
327 # cycle through parameters from the pal_finder output
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
328 for x, y, z, a in zip(seqIDs, F_primers, R_primers, motif):
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
329 if str(x) in assembly_IDs:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
330 # get the raw sequences ready to go into the output file
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
331 assembly_seq = (assembly_sequences_index.get_raw(x).decode())
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
332 # fasta entries need to be converted to single line so they sit nicely in the output
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
333 assembly_output = assembly_seq.replace("\n","\t")
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
334 R1_fasta_seq = (R1fasta_sequences_index.get_raw(x).decode())
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
335 R1_output = R1_fasta_seq.replace("\n","\t",1).replace("\n","")
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
336 #R1_output = R1_output.replace("\n","")
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
337 R2_fasta_seq = (R2fasta_sequences_index.get_raw(x).decode())
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
338 R2_output = R2_fasta_seq.replace("\n","\t",1).replace("\n","")
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
339 #R2_output = R2_output.replace("\n","")
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
340 assembly_no_id = '\n'.join(assembly_seq.split('\n')[1:])
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
341
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
342 # check that both primer sequences can be seen in the assembled contig
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
343 if y or ReverseComplement1(y) in assembly_no_id and z or ReverseComplement1(z) in assembly_no_id:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
344 if y in assembly_no_id:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
345 # get the positions of the primers in the assembly to predict fragment length
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
346 F_position = assembly_no_id.index(y)+len(y)+1
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
347 if ReverseComplement1(y) in assembly_no_id:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
348 F_position = assembly_no_id.index(ReverseComplement1(y))+len(ReverseComplement1(y))+1
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
349 if z in assembly_no_id:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
350 R_position = assembly_no_id.index(z)+1
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
351 if ReverseComplement1(z) in assembly_no_id:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
352 R_position = assembly_no_id.index(ReverseComplement1(z))+1
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
353 # calculate fragment length
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
354 fragment_length = R_position-F_position
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
355
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
356 # write everything out into the output file
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
357 output = (str(x) + "\t" + y + "\t" + str(F_position) + "\t" + (z) + "\t" + str(R_position) + "\t" + str(fragment_length) + "\t" + a + "\t" + assembly_output + R1_output + "\t" + R2_output + "\n")
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
358 outputfile.write(output)
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
359 print "\nFinished\n"
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
360 else:
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
361 if (skip_assembly == 1 or skip_conversion == 1):
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
362 print "\nERROR: You cannot supply the -a flag or the -c flag without also supplying the -assembly flag.\n"
b6ccc7dd7b02 Version 0.02.04.3.
pjbriggs
parents:
diff changeset
363 print "\nFinished\n"