Mercurial > repos > pjbriggs > pal_finder
diff pal_filter.py @ 3:e1a14ed7a9d6 draft
Updated to version 0.02.04.4 (new pal_filter script)
author | pjbriggs |
---|---|
date | Wed, 24 Feb 2016 08:25:17 -0500 |
parents | |
children | cb56cc1d5c39 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pal_filter.py Wed Feb 24 08:25:17 2016 -0500 @@ -0,0 +1,500 @@ +#!/usr/bin/python -tt +# +# pal_filter +# https://github.com/graemefox/pal_filter +# +################################################################################ +# Graeme Fox - 15/02/2016 - graeme.fox@manchester.ac.uk +# Tested on 64-bit Ubuntu, with Python 2.7 +# +################################################################################ +# PROGRAM DESCRIPTION +# +# Program to pick optimum loci from the output of pal_finder_v0.02.04 +# +# This program can be used to filter output from pal_finder and choose the +# 'optimum' loci. +# +# For the paper referncing this workflow, see Griffiths et al. +# (unpublished as of 15/02/2016) (sarah.griffiths-5@postgrad.manchester.ac.uk) +# +################################################################################ +# +# This program also contains a quality-check method to improve the rate of PCR +# success. For this QC method, paired end reads are assembled using +# PANDAseq so you must have PANDAseq installed. +# +# For the paper referencing this assembly-QC method see Fox et al. +# (unpublished as of 15/02/2016) (graeme.fox@manchester.ac.uk) +# +# For best results in PCR for marker development, I suggest enabling all the +# filter options AND the assembly based QC +# +################################################################################ +# REQUIREMENTS +# +# Must have Biopython installed (www.biopython.org). +# +# If you with to perform the assembly QC step, you must have: +# PandaSeq (https://github.com/neufeld/pandaseq) +# PandaSeq must be in your $PATH / able to run from anywhere +################################################################################ +# REQUIRED OPTIONS +# +# -i forward_paired_ends.fastQ +# -j reverse_paired_ends.fastQ +# -p pal_finder output - the "(microsatellites with read IDs and +# primer pairs)" file +# +# By default the program does nothing.... +# +# NON-REQUIRED OPTIONS +# +# -assembly: turn on the pandaseq assembly QC step +# -primers: filter microsatellite loci to just those which +# have primers designed +# +# -occurrences: filter microsatellite loci to those with primers +# which appear only once in the dataset +# +# -rankmotifs: filter microsatellite loci to just those with perfect motifs. +# Rank the output by size of motif (largest first) +# +########################################################### +# For repeat analysis, the following extra non-required options may be useful: +# +# Since PandaSeq Assembly, and fastq -> fasta conversion are slow, do them the +# first time, generate the files and then skip either, or both steps with +# the following: +# +# -a: skip assembly step +# -c: skip fastq -> fasta conversion step +# +# Just make sure to keep the assembled/converted files in the correct directory +# with the correct filename(s) +########################################################### +# +# EXAMPLE USAGE: +# +# pal_filtery.py -i R1.fastq -j R2.fastq +# -p pal_finder_output.tabular -primers -occurrences -rankmotifs -assembly +# +########################################################### +import Bio, subprocess, argparse, csv, os, re, time +from Bio import SeqIO + +# Get values for all the required and optional arguments + +parser = argparse.ArgumentParser(description='pal_filter') +parser.add_argument('-i','--input1', help='Forward paired-end fastq file', \ + required=True) + +parser.add_argument('-j','--input2', help='Reverse paired-end fastq file', \ + required=True) + +parser.add_argument('-p','--pal_finder', help='Output from pal_finder ', \ + required=True) + +parser.add_argument('-assembly','--assembly_QC', help='Perform the PandaSeq \ + based QC', nargs='?', const=1, type=int, required=False) + +parser.add_argument('-a','--skip_assembly', help='If the assembly has already \ + been run, skip it with -a', nargs='?', const=1, \ + type=int, required=False) + +parser.add_argument('-c','--skip_conversion', help='If the fastq to fasta \ + conversion has already been run, skip it with -c', \ + nargs='?', const=1, type=int, required=False) + +parser.add_argument('-primers','--filter_by_primer_column', help='Filter \ + pal_finder output to just those loci which have primers \ + designed', nargs='?', const=1, type=int, required=False) + +parser.add_argument('-occurrences','--filter_by_occurrences_column', \ + help='Filter pal_finder output to just loci with primers \ + which only occur once in the dataset', nargs='?', \ + const=1, type=int, required=False) + +parser.add_argument('-rankmotifs','--filter_and_rank_by_motif_size', \ + help='Filter pal_finder output to just loci which are a \ + perfect repeat unit. Also, rank the loci by motif size \ + (largest first)', nargs='?', const=1, type=int, \ + required=False) + +args = vars(parser.parse_args()) + +# parse arguments + +R1_input = args['input1']; +R2_input = args['input2']; +pal_finder_output = args['pal_finder']; +perform_assembly = args['assembly_QC'] +skip_assembly = args['skip_assembly']; +skip_conversion = args['skip_conversion']; +filter_primers = args['filter_by_primer_column']; +filter_occurrences = args['filter_by_occurrences_column']; +filter_rank_motifs = args['filter_and_rank_by_motif_size']; + +# set default values for arguments +if perform_assembly is None: + perform_assembly = 0 +if skip_assembly is None: + skip_assembly = 0 +if skip_conversion is None: + skip_conversion = 0 +if filter_primers is None: + filter_primers = 0 +if filter_occurrences is None: + filter_occurrences = 0 +if filter_rank_motifs is None: + filter_rank_motifs = 0 + +############################################################ +# Function List # +############################################################ + +# Reverse complement a sequence + +def ReverseComplement1(seq): + seq_dict = {'A':'T','T':'A','G':'C','C':'G'} + return "".join([seq_dict[base] for base in reversed(seq)]) + +# Convert a .fastq to a .fasta, filter to just lines we want +# and strip MiSeq barcodes + +def fastq_to_fasta(file): + file_name = os.path.splitext(os.path.basename(file))[0] + with open(file_name + "_filtered.fasta", "w") as out: + for record in SeqIO.parse(file, "fastq"): + ID = str(record.id) + SEQ = str(record.seq) + if ID in wanted: + out.write(">" + ID + "\n" + SEQ + "\n") + +# strip the miseq barcodes from a fasta file + +def strip_barcodes(file): + file_name = os.path.splitext(os.path.basename(file))[0] + with open(file_name + "_adapters_removed.fasta", "w") as out: + for record in SeqIO.parse(file, "fasta"): + match = re.search(r'\S*:', record.id) + if match: + correct = match.group().rstrip(":") + else: + correct = str(record.id) + SEQ = str(record.seq) + if correct in wanted: + out.write(">" + correct + "\n" + SEQ + "\n") + +############################################################ +# MAIN PROGRAM # +############################################################ +print "\n~~~~~~~~~~" +print "pal_filter" +print "~~~~~~~~~~\n" +time.sleep(1) +print "\"Find the optimum loci in your pal_finder output and increase "\ + "the rate of successful microsatellite marker development\"" +print "\nSee Griffiths et al. (currently unpublished) for more details......" +time.sleep(2) + +if (perform_assembly == 0 and filter_primers == 0 and filter_occurrences == 0 \ + and filter_rank_motifs == 0): + print "\nNo optional arguments supplied." + print "\nBy default this program does nothing." + print "\nNo files produced and no modifications made." + print "\nFinished.\n" + exit() + +else: + print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + print "Checking supplied filtering parameters:" + print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + time.sleep(2) + if filter_primers == 1: + print "-primers flag supplied." + print "Filtering pal_finder output on the \"Primers found (1=y,0=n)\"" \ + " column." + print "Only rows where primers have successfully been designed will"\ + " pass the filter.\n" + time.sleep(2) + if filter_occurrences == 1: + print "-occurrences flag supplied." + print "Filtering pal_finder output on the \"Occurences of Forward" \ + " Primer in Reads\" and \"Occurences of Reverse Primer" \ + " in Reads\" columns." + print "Only rows where both primers occur only a single time in the"\ + " reads pass the filter.\n" + time.sleep(2) + if filter_rank_motifs == 1: + print "-rankmotifs flag supplied." + print "Filtering pal_finder output on the \"Motifs(bases)\" column to" \ + " just those with perfect repeats." + print "Only rows containing 'perfect' repeats will pass the filter." + print "Also, ranking output by size of motif (largest first).\n" + time.sleep(2) +# index the raw fastq files so that the sequences can be pulled out and +# added to the filtered output file + +# Indexing input sequence files +print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +print "Indexing FastQ files....." +print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +R1fastq_sequences_index = SeqIO.index(R1_input,'fastq') +R2fastq_sequences_index = SeqIO.index(R2_input,'fastq') +print "Indexing complete." + +# create a set to hold the filtered output +wanted_lines = set() + +# get lines from the pal_finder output which meet filter settings + +# read the pal_finder output file into a csv reader +with open (pal_finder_output) as csvfile_infile: + csv_f = csv.reader(csvfile_infile, delimiter='\t') + header = csv_f.next() + with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \ + ".filtered", 'w') as csvfile_outfile: +# write the header line for the output file + csvfile_outfile.write('\t'.join(header) + "\tR1_Sequence_ID\t" + "R1_Sequence\tR2_Sequence_ID\tR2_Sequence\n") + for row in csv_f: +# get the sequence ID + seq_ID = row[0] +# get the raw sequence reads and convert to a format that can +# go into a tsv file + R1_sequence = R1fastq_sequences_index[seq_ID].format("fasta").\ + replace("\n","\t",1).replace("\n","") + R2_sequence = R2fastq_sequences_index[seq_ID].format("fasta").\ + replace("\n","\t",1).replace("\n","") + seq_info = "\t" + R1_sequence + "\t" + R2_sequence + "\n" +# navigate through all different combinations of filter options +# if the primer filter is switched on + if filter_primers == 1: +# check the occurrences of primers field + if row[5] == "1": +# if filter occurrences of primers is switched on + if filter_occurrences == 1: +# check the occurrences of primers field + if (row[15] == "1" and row[16] == "1"): +# if rank by motif is switched on + if filter_rank_motifs == 1: +# check for perfect motifs + if row[1].count('(') == 1: +# all 3 filter switched on +# write line out to output + csvfile_outfile.write('\t'.join(row) + \ + seq_info) + + else: + csvfile_outfile.write('\t'.join(row) + seq_info) + elif filter_rank_motifs == 1: + if row[1].count('(') == 1: + csvfile_outfile.write('\t'.join(row) + seq_info) + else: + csvfile_outfile.write('\t'.join(row) + seq_info) + elif filter_occurrences == 1: + if (row[15] == "1" and row[16] == "1"): + if filter_rank_motifs == 1: + if row[1].count('(') == 1: + csvfile_outfile.write('\t'.join(row) + seq_info) + else: + csvfile_outfile.write('\t'.join(row) + seq_info) + elif filter_rank_motifs == 1: + if row[1].count('(') == 1: + csvfile_outfile.write('\t'.join(row) + seq_info) + else: + csvfile_outfile.write('\t'.join(row) + seq_info) + +# if filter_rank_motifs is active, order the file by the size of the motif +if filter_rank_motifs == 1: + rank_motif = [] + ranked_list = [] +# read in the non-ordered file and add every entry to rank_motif list + with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \ + ".filtered") as csvfile_ranksize: + csv_rank = csv.reader(csvfile_ranksize, delimiter='\t') + header = csv_rank.next() + for line in csv_rank: + rank_motif.append(line) + +# open the file ready to write the ordered list + with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \ + ".filtered", 'w') as rank_outfile: + rankwriter = csv.writer(rank_outfile, delimiter='\t', \ + lineterminator='\n') + rankwriter.writerow(header) + count = 2 + while count < 10: + for row in rank_motif: +# count size of motif + motif = re.search(r'[ATCG]*',row[1]) + if motif: + the_motif = motif.group() +# rank it and write into ranked_list + if len(the_motif) == count: + ranked_list.insert(0, row) + count = count + 1 +# write out the ordered list, into the .filtered file + for row in ranked_list: + rankwriter.writerow(row) + +print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +print "Checking assembly flags supplied:" +print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" +if perform_assembly == 0: + print "Assembly flag not supplied. Not performing assembly QC.\n" +if perform_assembly == 1: + print "-assembly flag supplied: Performing PandaSeq assembly quality checks." + print "See Fox et al. (currently unpublished) for full details on the"\ + " quality-check process.\n" + time.sleep(5) + +# Get readID, F primers, R primers and motifs from filtered pal_finder output + seqIDs = [] + motif = [] + F_primers = [] + R_primers = [] + with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \ + ".filtered") as input_csv: + pal_finder_csv = csv.reader(input_csv, delimiter='\t') + header = pal_finder_csv.next() + for row in pal_finder_csv: + seqIDs.append(row[0]) + motif.append(row[1]) + F_primers.append(row[7]) + R_primers.append(row[9]) + +# Get a list of just the unique IDs we want + wanted = set() + for line in seqIDs: + wanted.add(line) + +# Assemble the paired end reads into overlapping contigs using PandaSeq +# (can be skipped with the -a flag if assembly has already been run +# and the appropriate files are in the same directory as the script, +# and named "Assembly.fasta" and "Assembly_adapters_removed.fasta") +# +# The first time you riun the script you MUST not enable the -a flag.t +# but you are able to skip the assembly in subsequent analysis using the +# -a flag. + + if skip_assembly == 0: + pandaseq_command = 'pandaseq -A pear -f ' + R1_input + ' -r ' + \ + R2_input + ' -o 25 -t 0.95 -w Assembly.fasta' + subprocess.call(pandaseq_command, shell=True) + strip_barcodes("Assembly.fasta") + print "\nPaired end reads been assembled into overlapping reads." + print "\nFor future analysis, you can skip this assembly step using \ + the -a flag, provided that the assembly.fasta file is \ + intact and in the same location." + else: + print "\n(Skipping the assembly step as you provided the -a flag)" + +# Fastq files need to be converted to fasta. The first time you run the script +# you MUST not enable the -c flag, but you are able to skip the conversion +# later using the -c flag. Make sure the fasta files are in the same location +#and do not change the filenames + + if skip_conversion ==0: + fastq_to_fasta(R1_input) + fastq_to_fasta(R2_input) + print "\nThe input fastq files have been converted to the fasta format." + print "\nFor any future analysis, you can skip this conversion step \ + using the -c flag, provided that the fasta files \ + are intact and in the same location." + else: + print "\n(Skipping the fastq -> fasta conversion as you provided the" \ + " -c flag).\n" + +# get the files and everything else we will need + +# Assembled fasta file + assembly_file = "Assembly_adapters_removed.fasta" +# filtered R1 reads + R1_fasta = os.path.splitext(os.path.basename(R1_input))[0] + \ + "_filtered.fasta" +# filtered R2 reads + R2_fasta = os.path.splitext(os.path.basename(R2_input))[0] + \ + "_filtered.fasta" + + outputfilename = os.path.splitext(os.path.basename(R1_input))[0] + +# parse the files with SeqIO + assembly_sequences = SeqIO.parse(assembly_file,'fasta') + R1fasta_sequences = SeqIO.parse(R1_fasta,'fasta') + +# create some empty lists to hold the ID tags we are interested in + assembly_IDs = [] + fasta_IDs = [] + +# populate the above lists with sequence IDs + for sequence in assembly_sequences: + assembly_IDs.append(sequence.id) + for sequence in R1fasta_sequences: + fasta_IDs.append(sequence.id) + +# Index the assembly fasta file + assembly_sequences_index = SeqIO.index(assembly_file,'fasta') + R1fasta_sequences_index = SeqIO.index(R1_fasta,'fasta') + R2fasta_sequences_index = SeqIO.index(R2_fasta,'fasta') + +# prepare the output file + with open (outputfilename + "_pal_filter_assembly_output.txt", 'w') \ + as outputfile: +# write the headers for the output file + outputfile.write("readPairID\t Forward Primer\t F Primer Position in " + "Assembled Read\t Reverse Primer\t R Primer Position in " + "Assembled Read\t Motifs(bases)\t Assembled Read ID\t " + "Assembled Read Sequence\t Raw Forward Read ID\t Raw " + "Forward Read Sequence\t Raw Reverse Read ID\t Raw Reverse " + "Read Sequence\n") + +# cycle through parameters from the pal_finder output + for x, y, z, a in zip(seqIDs, F_primers, R_primers, motif): + if str(x) in assembly_IDs: + # get the raw sequences ready to go into the output file + assembly_seq = (assembly_sequences_index.get_raw(x).decode()) +# fasta entries need to be converted to single line so sit nicely in the output + assembly_output = assembly_seq.replace("\n","\t") + R1_fasta_seq = (R1fasta_sequences_index.get_raw(x).decode()) + R1_output = R1_fasta_seq.replace("\n","\t",1).replace("\n","") + R2_fasta_seq = (R2fasta_sequences_index.get_raw(x).decode()) + R2_output = R2_fasta_seq.replace("\n","\t",1).replace("\n","") + assembly_no_id = '\n'.join(assembly_seq.split('\n')[1:]) + +# check that both primer sequences can be seen in the assembled contig + if y or ReverseComplement1(y) in assembly_no_id and z or \ + ReverseComplement1(z) in assembly_no_id: + if y in assembly_no_id: +# get the positions of the primers in the assembly to predict fragment length + F_position = assembly_no_id.index(y)+len(y)+1 + if ReverseComplement1(y) in assembly_no_id: + F_position = assembly_no_id.index(ReverseComplement1(y)\ + )+len(ReverseComplement1(y))+1 + if z in assembly_no_id: + R_position = assembly_no_id.index(z)+1 + if ReverseComplement1(z) in assembly_no_id: + R_position = assembly_no_id.index(ReverseComplement1(z)\ + )+1 + +# write everything out into the output file + output = (str(x) + "\t" + y + "\t" + str(F_position) \ + + "\t" + (z) + "\t" + str(R_position) \ + + "\t" + a + "\t" + assembly_output \ + + R1_output + "\t" + R2_output + "\n") + outputfile.write(output) + print "\nPANDAseq quality check complete." + print "Results from PANDAseq quality check (and filtering, if any" \ + " any filters enabled) written to output file" \ + " ending \"_pal_filter_assembly_output.txt\".\n\n" + + print "Filtering of pal_finder results complete." + print "Filtered results written to output file ending \".filtered\"." + print "\nFinished\n" +else: + if (skip_assembly == 1 or skip_conversion == 1): + print "\nERROR: You cannot supply the -a flag or the -c flag without \ + also supplying the -assembly flag.\n" + + print "\nProgram Finished\n"