#
# Pyrocleaner
#
# Copyright (C) 2009 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# ChangeLog
# v1.2 (05/2011) : Fix a bug in the --clean-pairends option introduced when adding the
#                  --clean-quality option
# v1.1 (02/2011) : Add the --clean-quality option to clean reads based on their base qualities
#                  Add the --aggressive option in order to keep only one read per cluster
#                  Modify the duplication strategy so that the longest read is kept
# v1.0 (09/2009) : Pyrocleaner first version
#

__author__ = 'Plateforme bioinformatique Midi Pyrenees'
__copyright__ = 'Copyright (C) 2009 INRA'
__license__ = 'GNU General Public License'
__version__ = '1.2'
__email__ = 'support.genopole@toulouse.inra.fr'
__status__ = 'beta'
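
# Pyrocleaner reads 454 data (sff, fasta or fastq), applies the cleaning
# filters selected on the command line (length, Ns, complexity, quality,
# duplicated reads, pairends) and writes the surviving reads back in the
# input format. Typical invocation (taken from the option parser below):
#   pyrocleaner.py -i file.sff -o /path/to/output -f sff --clean-pairends \
#       --clean-length-std --clean-ns --clean-duplicated-reads --clean-complexity-win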
from Bio import SeqIO
from igraph import *
from optparse import *
import os, math, datetime, zlib, sys, re, glob, string, gzip


def create_roche_pairends_spacer_file(out_dir):
    """
    Create the Roche pairends spacer fasta file
      @param out_dir : the directory where the fasta file will be written
    """
    spacer_file = open(os.path.join(out_dir, "roche_spacers.fna"), 'w')
    spacer_file.write(">flx\nGTTGGAACCGAAAGGGTTTGAATTCAAACCCTTTCGGTTCCAAC\n")
    spacer_file.write(">titanium1\nTCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACG\n")
    spacer_file.write(">titanium2\nCGTAATAACTTCGTATAGCATACATTATACGAAGTTATACGA\n")
    spacer_file.close()
    return spacer_file.name

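# Note: the spacer sequences above correspond to the linkers that Roche inserts
# between the two halves of a 454 paired-end read (flx for the FLX chemistry,
# titanium1/titanium2 for Titanium); filter_pairends() below locates them in
# each read with cross_match.
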
def version_string ():
    """
    Return the pyrocleaner version
    """
    return "pyrocleaner " + __version__


def reads_to_sff (sff_file, seqs, output_file):
    """
    Extract the seqs reads from the sff_file into a new sff file
      @dependencies sfffile : the sfffile program is required to execute this function
      @param sff_file       : the input sff file
      @param seqs           : table of seqs to extract
      @param output_file    : the name of the output sff file
    """
    # First create the file listing the reads to keep
    tmp_file = os.path.join(os.path.dirname(output_file), "reads_to_sff.txt")
    to_write_file = open(tmp_file, 'w')
    for read in seqs:
        to_write_file.write(read.id + "\n")
    to_write_file.close()
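    # Illustrative shape of the command built below (paths are examples only):
    #   sfffile -i /out/reads_to_sff.txt -o /out/input.clean.sff input.sff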
    # Use the sfffile tool to write the selected reads into a new sff file
    cmd = "sfffile -i " + tmp_file + " -o " + output_file + " " + sff_file
    os.system(cmd)
    # Clean temporary files
    try: os.remove(tmp_file)
    except: pass


def filter_reads (seqs, options):
    """
    Filter input seqs by length, Ns, complexity and quality, according to the options asked by the user
      @param seqs    : table of seqs to filter
      @param options : the options asked by the user
    """
    reads_id = []
    reads_length = []
    reads_ok = []
    del_by_ns = 0
    del_by_complexity = 0
    del_by_length = 0
    del_by_quality = 0
    log.write("## Start Basic cleaning (" + str(datetime.datetime.now()) + ")\n")

    # Go through all sequences
    for i, reads_record in enumerate(seqs) :
        reads_id.append(reads_record.id)
        reads_ok.append(0)

        # If asked to clean sequences by length using the standard
        # deviation, save the length to compute some statistics later
        if options.clean_length_std :
            reads_length.append(len(reads_record))

        # If asked to clean sequences by length using a window
        # and the sequence has not been flagged as deleted
        if options.clean_length_win and reads_ok[i] == 0:
            # Check if the sequence is longer than the requested min, if not flag it as deleted
            if len(reads_record) < int(options.min):
                reads_ok[i] = 1
                del_by_length += 1
                log.write(reads_id[i] + " deleted -> Length ( " + str(len(reads_record)) + "<" + str(options.min) + " )\n")
            # Check if the sequence is shorter than the requested max, if not flag it as deleted
            elif len(reads_record) > int(options.max):
                reads_ok[i] = 1
                del_by_length += 1
                log.write(reads_id[i] + " deleted -> Length ( " + str(len(reads_record)) + ">" + str(options.max) + " )\n")

        # If asked to clean sequences with too many Ns
        # and the sequence has not been flagged as deleted
        if options.clean_ns and reads_ok[i] == 0:
            # Compute the rate of Ns in the current sequence
            nb_n = (float(reads_record.seq.count("n")+reads_record.seq.count("N"))/float(len(reads_record)))*float(100)
            # If the rate is higher than the threshold, flag the sequence as deleted
            if nb_n > float(options.ns_percent) :
                reads_ok[i] = 1
                del_by_ns += 1
                log.write(reads_id[i] + " deleted -> Ns ( Reads containing " + str(nb_n) + "% of Ns > to the limit : " + str(options.ns_percent) + "% )\n")

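        # Note on the complexity heuristic used below: "complexity" is the
        # zlib-compressed size of the sequence expressed as a percentage of its
        # raw length. Low-complexity stretches (homopolymers, simple repeats)
        # compress well and therefore get a low ratio; the read is kept only if
        # at least one window reaches the --complexity threshold (default 40).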
        # If asked to clean sequences with low complexity using a sliding window
        # and the sequence has not been flagged as deleted
        if options.clean_complexity_win and reads_ok[i] == 0:
            is_complex = False
            # For each window
            for win in range(0, len(reads_record)-options.window, options.step):
                start = win
                stop = start + options.window
                # Compute the window complexity
                w_cplx = (float(len(zlib.compress(str(reads_record.seq[start:stop]))))/float(stop-start+1))*float(100)
                # If the window complexity is higher than or equal to the threshold, flag the whole sequence as complex
                if w_cplx >= float(options.complexity):
                    is_complex = True
                    break
            # If no window has been flagged as complex, flag the sequence as deleted
            if not is_complex:
                reads_ok[i] = 1
                del_by_complexity += 1
                log.write(reads_id[i] + " deleted -> Complexity ( No window complexity > " + str(options.complexity) + " found )\n")

        # If asked to clean sequences with low complexity working on the whole sequence
        # and the sequence has not been flagged as deleted
        if options.clean_complexity_full and reads_ok[i] == 0:
            # Compute the complexity on the whole sequence
            cplx = (float(len(zlib.compress(str(reads_record.seq))))/float(len(reads_record)))*float(100)
            # If the complexity is lower than the threshold, flag the sequence as deleted
            if cplx < float(options.complexity) :
                reads_ok[i] = 1
                del_by_complexity += 1
                log.write(reads_id[i] + " deleted -> Complexity ( Reads complexity " + str(cplx) + " < to the minimum : " + str(options.complexity) + " )\n")

        # If asked to clean sequences with low quality, quality information is available
        # and the sequence has not been flagged as deleted
        if options.clean_quality and reads_ok[i] == 0 and reads_record.letter_annotations.has_key("phred_quality"):
            # Delete the read unless at least one base has a quality score reaching the threshold
            max_qual = max(reads_record.letter_annotations["phred_quality"])
            if max_qual < int(options.quality_threshold) :
                reads_ok[i] = 1
                del_by_quality += 1
                log.write(reads_id[i] + " deleted -> Quality ( Reads maximum quality " + str(max_qual) + " < to the threshold : " + str(options.quality_threshold) + " )\n")

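    # The standard deviation filter below keeps a read iff
    #   mean - std*sigma <= length <= mean + std*sigma
    # where mean and sigma are computed over all input reads and std is the
    # --std command line parameter (default 2).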
    # If asked to clean sequences by length using the standard deviation
    if options.clean_length_std :
        # Compute the mean and the standard deviation
        mean = float(sum(reads_length)) / len(reads_length)
        mq = mean**2
        s = sum([ x**2 for x in reads_length])
        var = float(s)/len(reads_length) - mq
        etype = math.sqrt(var)
        # For each sequence
        for i, id in enumerate(reads_id):
            # If the sequence has not been flagged as deleted
            if  reads_ok[i] == 0 :
                # If the sequence length is higher than the upper threshold, flag the sequence as deleted
                if reads_length[i] > mean + options.std*etype:
                    reads_ok[i] = 1
                    del_by_length += 1
                    log.write(reads_id[i] + " deleted -> Length ( " + str(reads_length[i]) + ">" + str(mean) + "+" + str(options.std) + "*" + str(etype) + " )\n")
                # If the sequence length is smaller than the lower threshold, flag the sequence as deleted
                elif reads_length[i] < mean - options.std*etype:
                    reads_ok[i] = 1
                    del_by_length += 1
                    log.write(reads_id[i] + " deleted -> Length ( " + str(reads_length[i]) + "<" + str(mean) + "-" + str(options.std) + "*" + str(etype) + " )\n")

    seqs_to_return = []
    # Then keep only the sequences not flagged as deleted
    for i, reads_record in enumerate(seqs) :
        if reads_ok[i] == 0:
            seqs_to_return.append(reads_record)

    # Return values
    return [seqs_to_return, del_by_length, del_by_ns, del_by_complexity, del_by_quality]


def filter_same_reads (seqs, options):
    """
    Filter duplicated reads: if sequences are too similar, keep only one to represent the cluster
      @param seqs    : table of seqs to filter
      @param options : the options asked by the user
    """

    megablast_input = os.path.join(options.output, "megablast_input.fasta")
    log.write("## Start cleaning duplicated reads (" + str(datetime.datetime.now()) + ")\n")
    log.write("## formatdb the fasta file (" + str(datetime.datetime.now()) + ")\n")

    # First write down seqs
    fasta = open(megablast_input, "w")
    SeqIO.write(seqs, fasta, "fasta")
    fasta.close()

    # Then formatdb the fasta file (no formatdb utils in Biopython)
    cmd = "formatdb -i %s -p F" % megablast_input
    os.system(cmd)
    log.write("## megablast (" + str(datetime.datetime.now()) + ")\n")

    # In case of pairends, use words of 50 so megablast cannot connect reads through the spacers
    opts = ""
    if options.clean_pairends:
        opts = " -W 50 -H 10 "

    # Megablast the fasta file against itself
    cmd = "megablast -d " + megablast_input + " -i " + megablast_input + opts + " -p 98 -a " + str(options.nb_cpus) + " -M 500000 -s 100 -D 3 | grep -v '^#' | perl -lne 'chomp; split; if ($_[0] ne $_[1]) { if (($_[6] == 1 ) && ($_[8] == 1) && ($_{$_[0]} < 30)) { print $_;$_{$_[0]}++; }}' > " + megablast_input + ".res"
    os.system(cmd)
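    # The inlined perl filter above keeps tabular (-D 3) megablast hits where:
    #   - query and subject are different reads ($_[0] ne $_[1]),
    #   - both alignments start at position 1 ($_[6] == 1 and $_[8] == 1),
    #   - at most 30 hits are kept per query (the $_{...} counter),
    # which is enough to spot reads duplicated from their first base.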

    # Get the reads lengths from the input and create the graph vertices
    log.write("## Parsing the megablast file (" + str(datetime.datetime.now()) + ")\n")
    gseqs = {}
    vertices = []
    for reads_record in seqs :
        vertices.append({'name': reads_record.id})
        gseqs[reads_record.id] = len(reads_record)

    # Connect reads from hits starting at 1 and with ends close to each other
    log.write("## Creating the graph (" + str(datetime.datetime.now()) + ")\n")
    edges = []
    for read in open(megablast_input + ".res", 'rU').readlines() :
        parts = read.rstrip().split()
        len1 = gseqs[parts[0]]
        len2 = gseqs[parts[1]]
        if options.clean_aggressive :
            edges.append({'source': parts[0], 'target': parts[1]})
        elif math.fabs(len1-len2) < options.duplication_limit:
            if int(parts[7]) > (len1 - options.duplication_limit) and int(parts[9]) > len2 - options.duplication_limit:
                # These alignments are really similar -> may be the same read -> link them in the graph
                edges.append({'source': parts[0], 'target': parts[1]})

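    # Each connected component of the graph is treated as one cluster of
    # duplicated reads; below, the longest member of each cluster is kept as
    # the "cluster leader" and all the others are discarded.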
    # Then get connected components and extract one read per component as cluster leader
    log.write("## Compute connected components (" + str(datetime.datetime.now()) + ")\n")
    gr = Graph.DictList(vertices, edges)
    connex = gr.clusters().membership

    clusters = {}
    for i, vertex in enumerate(vertices):
        cluster_id = connex[i]
        try:
            clusters[cluster_id].append(seqs[i])
        except:
            clusters[cluster_id] = []
            clusters[cluster_id].append(seqs[i])

    del_by_duplicat = 0
    # Write down into the log the composition of each cluster
    clusters_stats = {}
    seqs_to_return = []
    for cluster_id in clusters:
        cl_elts = ""
        del_by_duplicat += len(clusters[cluster_id]) - 1
        longest_value = 0
        cluster_leader = None
        for seq_record in clusters[cluster_id]:
            # Find out which sequence is the longest
            if len(seq_record.seq) > longest_value:
                longest_value = len(seq_record.seq)
                cluster_leader = seq_record

        seqs_to_return.append(cluster_leader)
        for seq_record in clusters[cluster_id]:
            if seq_record.id != cluster_leader.id:
                log.write(seq_record.id + " deleted -> Duplicated ( flagged as " + cluster_leader.id + " duplicate )\n")
            cl_elts += seq_record.id + " "
        log.write("## cluster leader: " + cluster_leader.id + " of cluster composed of : " + cl_elts + "\n")

        try :
            clusters_stats[len(clusters[cluster_id])] += 1
        except:
            clusters_stats[len(clusters[cluster_id])] = 1

    # Write down a summary of what has been done
    log_header = "## header (duplicated) : "
    log_summary = "## summary (duplicated) : "
    for stat in sorted(clusters_stats.keys()):
        log_header += str(stat) + "\t"
        log_summary += str(clusters_stats[stat]) + "\t"
    log.write(log_header + "\n")
    log.write(log_summary + "\n")

    # Clean temporary files
    try:
        os.remove(megablast_input)
        os.remove(megablast_input+".nhr")
        os.remove(megablast_input+".nin")
        os.remove(megablast_input+".nsq")
        os.remove(megablast_input+".res")
        os.remove("formatdb.log")
        os.remove("error.log")
    except:
        pass

    # Return results
    return [seqs_to_return, del_by_duplicat]


def filter_pairends(seqs, options):
    """
    Filter pairends sequences and write sequences without a spacer into a separate shotgun fasta file
      @param seqs    : the table of sequences to filter
      @param options : the options asked by the user
    """

    # Setup output files
    shotgun_ffile = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".shotgun.clean.fasta")
    shotgun_qfile = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".shotgun.clean.qual")
    crossmatch_input = os.path.join(options.output, os.path.basename(options.input_file)+".cross_match_input.fasta")
    split_pairends_fasta = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".pairends.splited.clean.fasta")
    split_pairends_qual = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".pairends.splited.clean.qual")

    # First write down seqs
    fasta = open(crossmatch_input, "w")
    SeqIO.write(seqs, fasta, "fasta")
    fasta.close()

    # Write down the qual file for cross_match if possible
    try:
        qual = open(crossmatch_input+".qual", "w")
        SeqIO.write(seqs, qual, "qual")
        qual.close()
    except:
        os.remove(crossmatch_input+".qual")

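    # The two patterns below parse cross_match alignment lines, which (in
    # standard cross_match output) look like:
    #   score %sub %del %ins query qstart qend (qtail) [C] spacer ...
    # the optional C flag marking hits on the reverse strand, with the spacer
    # coordinates swapped accordingly.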
    # Cross_match reverse matches pattern
    rev_regex = re.compile(r"(\s+)?\d+\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+C\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)")
    # Cross_match forward matches pattern
    fwd_regex = re.compile(r"(\s+)?\d+\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+")

    # Write the spacer file and execute cross_match against the input sequences
    spacers_file = create_roche_pairends_spacer_file(options.output)
    cmd = "cross_match " + crossmatch_input + " " + spacers_file + " -minmatch 10 -minscore 25 > " + crossmatch_input + ".cross_match.res"
    os.system(cmd)

    # Parse the cross_match file
    cross_match_tab = {}
    block_found = False
    for line in open(crossmatch_input + ".cross_match.res", 'r'):
        save_line = False
        rm = rev_regex.match(line)
        fm = fwd_regex.match(line)
        if rm != None: # If it's a reverse match
            block_found = True
            (percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, endSecondMatch, startSecondMatch)=(float(rm.group(2)), rm.group(3), int(rm.group(4)), int(rm.group(5)), rm.group(6), int(rm.group(7)), int(rm.group(8)))
            save_line = True
        elif fm != None: # If it's a forward match
            block_found = True
            (percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, startSecondMatch, endSecondMatch)=(float(fm.group(2)), fm.group(3), int(fm.group(4)), int(fm.group(5)), fm.group(6), int(fm.group(7)), int(fm.group(8)))
            save_line = True
        else : save_line = False

        if line.startswith("Discrepancy summary:"): # This is the end of the section
            break

        # Save the hit (entries are [spacer_id, read_start, read_end, spacer_start, spacer_end])
        if save_line:
            try:
                cross_match_tab[primary_match].append([secondary_match, int(startFirstMatch), int(endFirstMatch), int(startSecondMatch), int(endSecondMatch)])
            except:
                cross_match_tab[primary_match] = []
                cross_match_tab[primary_match].append([secondary_match, int(startFirstMatch), int(endFirstMatch), int(startSecondMatch), int(endSecondMatch)])

    # Then get the spacers length
    spacers_length = {}
    for seq_record in SeqIO.parse(open(spacers_file, "rU"), "fasta") :
        spacers_length[seq_record.id] = len(seq_record.seq)

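    # Decision table applied below, based on the spacer hits found in each read:
    #   no spacer hit                               -> kept as a shotgun read
    #   several spacer hits                         -> read deleted
    #   partial spacer at one extremity             -> trimmed, kept as a shotgun read
    #   partial spacer in the middle                -> read deleted
    #   full spacer, both borders >= border_limit   -> kept as a pairend read
    #   full spacer, both borders < border_limit    -> read deleted
    #   full spacer, one border too short           -> long side kept as a shotgun read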
    # Finally go through all sequences
    seqs_to_return = []
    shotgun_seqs = []
    del_by_pairends = 0
    pe_splited = []

    for i, seq_record in enumerate(seqs) :
        if not cross_match_tab.has_key(seq_record.id):
            # No spacer found -> just add it to the shotgun_seqs tab
            shotgun_seqs.append(seq_record)
            log.write(seq_record.id + " shotgun -> no spacer found\n")
        elif len(cross_match_tab[seq_record.id]) > 1:
            # If multiple spacers -> delete the whole sequence
            log.write(seq_record.id + " deleted -> multiple spacers found : " + str(len(cross_match_tab[seq_record.id])) + "\n")
            del_by_pairends += 1
        elif (cross_match_tab[seq_record.id][0][2]-cross_match_tab[seq_record.id][0][1] < (spacers_length[cross_match_tab[seq_record.id][0][0]]-options.missmatch)):
            if (cross_match_tab[seq_record.id][0][1] == 1):
                shotgun_seqs.append(seq_record[cross_match_tab[seq_record.id][0][2]:])
                log.write(seq_record.id + " shotgun -> spacer found at the beginning\n")
            elif (cross_match_tab[seq_record.id][0][2] == len(seq_record)):
                shotgun_seqs.append(seq_record[:cross_match_tab[seq_record.id][0][1]])
                log.write(seq_record.id + " shotgun -> spacer found at the end\n")
            else :
                log.write(seq_record.id + " deleted -> partial spacer found in the middle of the read\n")
                del_by_pairends += 1
        elif cross_match_tab[seq_record.id][0][1] >= options.border_limit and len(seq_record)-cross_match_tab[seq_record.id][0][2] >= options.border_limit:
            seqs_to_return.append(seq_record)
            if options.split_pairends:
                sub_seq1 = seq_record[cross_match_tab[seq_record.id][0][2]:]
                sub_seq1.id = seq_record.id + ".r"
                pe_splited.append(sub_seq1)
                sub_seq2 = seq_record[:cross_match_tab[seq_record.id][0][1]]
                sub_seq2.id = seq_record.id + ".f"
                pe_splited.append(sub_seq2)
        elif cross_match_tab[seq_record.id][0][1] < options.border_limit and len(seq_record)-cross_match_tab[seq_record.id][0][2] < options.border_limit:
            log.write(seq_record.id + " deleted -> both borders < to the border limit " + str(options.border_limit) + "\n")
            del_by_pairends += 1
        elif cross_match_tab[seq_record.id][0][1] < options.border_limit:
            shotgun_seqs.append(seq_record[cross_match_tab[seq_record.id][0][2]:])
            log.write(seq_record.id + " shotgun -> spacer found : left border too short, deleted\n")
        elif len(seq_record)-cross_match_tab[seq_record.id][0][2] < options.border_limit:
            shotgun_seqs.append(seq_record[:cross_match_tab[seq_record.id][0][1]])
            log.write(seq_record.id + " shotgun -> spacer found : right border too short, deleted\n")

    # If required, write down the split pairends files
    if options.split_pairends:
        handle = open(split_pairends_fasta, "w")
        SeqIO.write(pe_splited, handle, "fasta")
        handle.close()
        handle = open(split_pairends_qual, "w")
        SeqIO.write(pe_splited, handle, "qual")
        handle.close()

    # Finally clean up the new shotgun seqs
    [shotgun_seqs_clean, del_by_length, del_by_ns, del_by_complexity, del_by_quality] = filter_reads(shotgun_seqs, options)
    handle = open(shotgun_ffile, "w")
    SeqIO.write(shotgun_seqs_clean, handle, "fasta")
    handle.close()
    in_shot_gun = 0
    try:
        handle = open(shotgun_qfile, "w")
        SeqIO.write(shotgun_seqs_clean, handle, "qual")
        handle.close()
        in_shot_gun = len(shotgun_seqs_clean)
    except:
        pass

    log.write("## header (pairends) : pairends\ttotal shotgun\n")
    log.write("## summary (pairends) : " + str(len(seqs_to_return)) + "\t" + str(len(shotgun_seqs_clean)) + "\n")

    # Clean temporary files
    try:
        os.remove(crossmatch_input)
        os.remove(spacers_file)
        os.remove(crossmatch_input+".cross_match.res")
        os.remove(crossmatch_input+".log")
        os.remove(crossmatch_input+".qual")
    except:
        pass

    # Then return the pairends ones
    return [seqs_to_return, in_shot_gun, del_by_length, del_by_ns, del_by_complexity, del_by_quality, del_by_pairends]


def get_seqs (options):
    """
    Convert the input file into a table of Biopython seq records
      @param options : the options asked by the user
    """

    # First get the fasta and/or qual input files
    qual_file = ""
    if options.format == "sff":
        sff_file = options.input_file

        if sff_file.endswith(".gz"):
            # Gunzip the given file into the output directory
            r_file = gzip.GzipFile(sff_file, 'r')
            write_file = os.path.join(options.output, os.path.splitext(os.path.basename(sff_file))[0])
            w_file = open(write_file, 'w')
            w_file.write(r_file.read())
            w_file.close()
            r_file.close()
            sff_file = write_file

        base = os.path.basename(sff_file)
        fasta_file = os.path.join(options.output, base + '.fasta')
        qual_file = os.path.join(options.output, base + '.qual')
        xml_file = os.path.join(options.output, base + '.xml')
        format = "fasta"
        if not os.path.isfile(fasta_file) or not os.path.isfile(qual_file) or not os.path.isfile(xml_file):
            # First extract the sff file to fasta, qual and xml files
            cmd = "sff_extract.py -c -s " + fasta_file + " -q " + qual_file + " -x " + xml_file + " " + sff_file + " >> " + options.output+"/"+options.log_file
            os.system(cmd)
        else :
            log = open(os.path.join(options.output, options.log_file), "a+")
            log.write(fasta_file + ", " + qual_file + ", " + xml_file + " already exist, working on existing files\n")
            log.close()
    elif options.format == "fasta" or options.format == "fastq":
        format = options.format
        fasta_file = options.input_file
    else :
        print "Error : " + options.format + " is not a supported format!"
        sys.exit(1)

    if options.qual_file:
        qual_file = options.qual_file

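    # When a qual file is available, base qualities are attached to each record
    # as record.letter_annotations["phred_quality"], which is what the
    # --clean-quality filter in filter_reads() relies on.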
    # If we got a quality file
    if qual_file :
        quals = {}
        # If the file is gzipped
        if qual_file.endswith(".gz"):
            for qual in SeqIO.parse(gzip.open(qual_file), "qual") :
                quals[qual.id] = qual
        else :
            for qual in SeqIO.parse(open(qual_file), "qual") :
                quals[qual.id] = qual

    # If the fasta file is gzipped
    if fasta_file.endswith(".gz"):
        seqs = []
        for record in SeqIO.parse(gzip.open(fasta_file), format) :
            if qual_file :
                record.letter_annotations["phred_quality"] = quals[record.id].letter_annotations["phred_quality"]
            seqs.append(record)
    else :
        seqs = []
        for record in SeqIO.parse(open(fasta_file), format) :
            if qual_file :
                record.letter_annotations["phred_quality"] = quals[record.id].letter_annotations["phred_quality"]
            seqs.append(record)

    # Clean temporary files
    if options.format == "sff":
        try:
            os.remove(fasta_file)
            os.remove(qual_file)
            os.remove(xml_file)
        except:
            pass

    # Return the table of sequences
    return [seqs]

def which (program):
    """
    Return the full path of the requested program if it exists in the user PATH, None otherwise
      @param program : the program to look for
    """
    def is_exe(fpath):
        return os.path.exists(fpath) and os.access(fpath, os.X_OK)
    fpath, fname = os.path.split(program)
    if fpath:
        if is_exe(program):
            return program
    else:
        for path in os.environ["PATH"].split(os.pathsep):
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file
    return None

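# Illustrative use of which() (the paths are hypothetical):
#   which("megablast") -> "/usr/bin/megablast" if found on the PATH
#   which("megablast") -> None otherwise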
def pyrocleaner_depts_error(options):
    """
    Return the list of dependencies missing to run the pyrocleaner
    """
    error = ""
    if which("formatdb") == None:
        error += " - formatdb is missing\n"
    if which("megablast") == None:
        error += " - megablast is missing\n"
    if which("sff_extract.py") == None:
        error += " - sff_extract.py is missing\n"
    if which("cross_match") == None and options.clean_pairends:
        error += " - cross_match is missing\n"
    if error != "":
        error = "Cannot execute pyrocleaner, the following dependencies are missing :\n" + error + "Please install them before running the pyrocleaner!"
    return error


if __name__ == "__main__":

    usage = "usage: %prog -i file -o output -f format"
    desc = " Clean sequences from 454 SFF files \n"\
           " ex : pyrocleaner.py -i file.sff -o /path/to/output -f sff --clean-pairends --clean-length-std --clean-ns --clean-duplicated-reads --clean-complexity-win"
    parser = OptionParser(usage = usage, version = version_string(), description = desc)

    igroup = OptionGroup(parser, "Input files options","")
    igroup.add_option("-i", "--in", dest="input_file",
                      help="The file to clean, can be [sff|fastq|fasta]", metavar="FILE")
    igroup.add_option("-q", "--qual", dest="qual_file",
                      help="The quality file to use if the input file is fasta", metavar="FILE")
    igroup.add_option("-f", "--format", dest="format",
                      help="The format of the input file [sff|fastq|fasta], default is sff", type="string", default="sff")
    parser.add_option_group(igroup)

    ogroup = OptionGroup(parser, "Output files options","")
    ogroup.add_option("-o", "--out", dest="output",
                      help="The output folder where to store results")
    ogroup.add_option("-g", "--log", dest="log_file",
                      help="The log file name (default: pyrocleaner.log)", metavar="FILE", default="pyrocleaner.log")
    ogroup.add_option("-z", "--split-pairends", action="store_true", dest="split_pairends",
                      default=False, help="Write split pairends sequences into a fasta file")
    parser.add_option_group(ogroup)

    cgroup = OptionGroup(parser, "Cleaning options","")
    cgroup.add_option("-p", "--clean-pairends", action="store_true", dest="clean_pairends",
                      default=False, help="Clean pairends")
    cgroup.add_option("-l", "--clean-length-std", action="store_true", dest="clean_length_std",
                      default=False, help="Filter reads shorter than the mean minus x standard deviations or longer than the mean plus x standard deviations")
    cgroup.add_option("-w", "--clean-length-win", action="store_true", dest="clean_length_win",
                      default=False, help="Filter reads with a length outside [min:max]")
    cgroup.add_option("-n", "--clean-ns", action="store_true", dest="clean_ns",
                      default=False, help="Filter reads with too many Ns")
    cgroup.add_option("-d", "--clean-duplicated-reads", action="store_true", dest="clean_duplicated_reads",
                      default=False, help="Filter duplicated reads")
    cgroup.add_option("-c", "--clean-complexity-win", action="store_true", dest="clean_complexity_win",
                      default=False, help="Filter low complexity reads, computed on a sliding window")
    cgroup.add_option("-u", "--clean-complexity-full", action="store_true", dest="clean_complexity_full",
                      default=False, help="Filter low complexity reads, computed on the whole sequence")
    cgroup.add_option("-k", "--clean-quality", action="store_true", dest="clean_quality",
                      default=False, help="Filter low quality reads")
    parser.add_option_group(cgroup)

    pgroup = OptionGroup(parser, "Processing options","")
    pgroup.add_option("-a", "--acpus", dest="nb_cpus",
                      help="Number of processors to use", type="int", default=1)
    pgroup.add_option("-r", "--recursion", dest="recursion",
                      help="Recursion limit when computing duplicated reads", type="int", default=1000)
    parser.add_option_group(pgroup)

    cpgroup = OptionGroup(parser, "Cleaning parameters","")
    cpgroup.add_option("-b", "--border-limit", dest="border_limit",
                      help="Minimal length between the spacer and the read extremity (used with --clean-pairends option, default=70)", type="int", default=70)
    cpgroup.add_option("-m", "--aggressive", action="store_true", dest="clean_aggressive",
                      default=False, help="Filter all duplicated reads gathered in a cluster and keep only one (used with --clean-duplicated-reads, default=False)")
    cpgroup.add_option("-e", "--missmatch", dest="missmatch",
                      help="Maximum number of mismatched nucleotides with the spacer (used with --clean-pairends option, default=10)", type="int", default=10)
    cpgroup.add_option("-j", "--std", dest="std",
                      help="Number of standard deviations to use (used with --clean-length-std option, default=2)", type="int", default=2)
    cpgroup.add_option("-1", "--min", dest="min",
                      help="Minimal length (used with --clean-length-win option, default=200)", type="int", default=200)
    cpgroup.add_option("-2", "--max", dest="max",
                       help="Maximal length (used with --clean-length-win option, default=600)", type="int", default=600)
    cpgroup.add_option("-3", "--quality-threshold", dest="quality_threshold",
                       help="At least one base quality score has to be equal to or higher than this value (used with --clean-quality, default=35)", type="int", default=35)
    cpgroup.add_option("-s", "--ns_percent", dest="ns_percent",
                      help="Percentage of Ns above which a read is filtered (used with --clean-ns option, default=4)", type="int", default=4)
    cpgroup.add_option("-t", "--duplication_limit", dest="duplication_limit",
                      help="Maximum size difference between reads considered duplicated (used with --clean-duplicated-reads, default=70)", type="int", default=70)
    cpgroup.add_option("-v", "--window", dest="window",
                      help="The window size (used with --clean-complexity-win, default=100)", type="int", default=100)
    cpgroup.add_option("-x", "--step", dest="step",
                      help="The window step (used with --clean-complexity-win, default=5)", type="int", default=5)
    cpgroup.add_option("-y", "--complexity", dest="complexity",
                      help="Minimal complexity/length ratio (used with --clean-complexity-win and --clean-complexity-full, default=40)", type="int", default=40)
    parser.add_option_group(cpgroup)

    (options, args) = parser.parse_args()

    sys.setrecursionlimit(options.recursion)

    if not pyrocleaner_depts_error(options):

        if options.input_file == None or options.output == None or options.format == None:
            parser.print_help()
            sys.exit(1)
        elif not options.clean_pairends and not options.clean_length_std and not options.clean_length_win and not options.clean_ns and not options.clean_duplicated_reads and not options.clean_complexity_win and not options.clean_complexity_full and not options.clean_quality :
            print "Please select at least one cleaning option between [--clean-pairends|--clean-length-std|--clean-length-win|--clean-ns|--clean-duplicated-reads|--clean-complexity-win|--clean-complexity-full|--clean-quality], for help use the option -h or --help"
            sys.exit(1)
        else:

            # Create the output directory if it doesn't exist
            if not os.path.isdir(options.output) : os.mkdir(options.output)

            global log
            log = open(options.output+"/"+options.log_file, "w")
            log.write("## Start processing (" + str(datetime.datetime.now()) + ")\n")
            log.write("## with the following options: \n")
            opts = ""
            if options.clean_complexity_full:
                opts += " - Clean complexity based on the whole sequence with " + str(options.complexity) + " as complexity/length ratio limit.\n"
            if options.clean_complexity_win:
                opts += " - Clean complexity based on a sliding window with a size of " + str(options.window) + ", a step of " + str(options.step) + " and " + str(options.complexity) + " as complexity/length ratio limit.\n"
            if options.clean_duplicated_reads and not options.clean_aggressive:
                opts += " - Clean duplicated reads using " + str(options.duplication_limit) + " as limit for the difference between reads ends to consider them as duplicates.\n"
            elif options.clean_duplicated_reads and options.clean_aggressive :
                opts += " - Clean duplicated reads using " + str(options.duplication_limit) + " as limit for the difference between reads ends to consider them as duplicates, and keep only one read per cluster.\n"
            if options.clean_length_std:
                opts += " - Clean reads shorter than the reads length mean minus " + str(options.std) + " standard deviations and reads longer than the reads length mean plus " + str(options.std) + " standard deviations.\n"
            if options.clean_length_win:
                opts += " - Clean reads with a length not in between [" + str(options.min) + ";" + str(options.max) + "].\n"
            if options.clean_ns:
                opts += " - Clean reads with a percentage of Ns higher than " + str(options.ns_percent) + "%.\n"
            if options.clean_pairends:
                opts += " - Clean pairends reads if the sequence size between the spacer and the read beginning/end is higher than " + str(options.border_limit) + " nucleotides or if " + str(options.missmatch) + " nucleotides mismatch with the spacer.\n"
            if options.clean_quality and (options.format == "sff" or options.format == "fastq" or (options.format == "fasta" and options.qual_file)) :
                opts += " - Clean reads if no base quality has a score higher than " + str(options.quality_threshold) + ".\n"
            elif options.clean_quality :
                print "No quality check will be performed because there is no quality file provided."
            log.write(opts)
            log.close()

            # 1 - First get inputs from options
            [seqs] = get_seqs(options)
            before_cleaning = len(seqs)
            del_by_length = 0
            del_by_ns = 0
            del_by_complexity = 0
            del_by_duplicat = 0
            del_by_pairends = 0
            del_by_quality = 0
            in_shot_gun = 0

            log = open(options.output+"/"+options.log_file, "a+")
            # 2 - Clean reads
            # 2.1 - Clean reads by length, Ns, complexity and quality
            if options.clean_ns or options.clean_length_std or options.clean_length_win or options.clean_complexity_full or options.clean_complexity_win or options.clean_quality:
                [seqs, del_by_length, del_by_ns, del_by_complexity, del_by_quality] = filter_reads(seqs, options)

            # 2.2 - Clean duplicated reads
            if options.clean_duplicated_reads:
                [seqs, del_by_duplicat] = filter_same_reads(seqs, options)

            # 3 - Clean pairends
            if options.clean_pairends:
                [seqs, in_shot_gun, del_by_length_pe, del_by_ns_pe, del_by_complexity_pe, del_by_quality_pe, del_by_pairends] = filter_pairends(seqs, options)
                del_by_length += del_by_length_pe
                del_by_ns += del_by_ns_pe
                del_by_complexity += del_by_complexity_pe
                del_by_quality += del_by_quality_pe

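            # Output naming scheme (derived from the code below): for an input
            # reads.sff the cleaned file is written as reads.clean.sff, or as
            # reads.pairends.clean.sff when --clean-pairends is used.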
            # 4 - Create the output in the same format as the input
            after_cleaning = len(seqs) + in_shot_gun
            if options.input_file.endswith(".gz"):
                ifile = os.path.splitext(options.input_file)[0]
            else : ifile = options.input_file
            if options.clean_pairends: output = os.path.join(options.output, os.path.splitext(os.path.basename(ifile))[0]+".pairends.clean")
            else : output = os.path.join(options.output, os.path.splitext(os.path.basename(ifile))[0]+".clean")
            output += "." + options.format
            if options.format == "sff" :
                if options.input_file.endswith(".gz"):
                    sfffile = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0])
                else :
                    sfffile = options.input_file
                if which("sfffile") == None:
                    # Without the sfffile tool, fall back to writing a fastq file
                    fastq = open(output, "w")
                    SeqIO.write(seqs, fastq, "fastq")
                    fastq.close()
                else :
                    reads_to_sff(sfffile, seqs, output)
                    # Clean temporary files
                    if options.input_file.endswith(".gz"):
                        os.remove(sfffile)

            elif options.format == "fasta" or options.format == "fastq":
                fasta = open(output, "w")
                SeqIO.write(seqs, fasta, options.format)
                fasta.close()
                if options.format == "fasta" and options.qual_file != None:
                    qual = open(os.path.splitext(output)[0]+".qual", "w")
                    SeqIO.write(seqs, qual, "qual")
                    qual.close()

            # 5 - Write down the analysis summary
            log.write("## header (global) : nb reads at beginning\tnb reads after clean up\tlength\tns\tcomplexity\tduplicate\tpairends\tquality\n")
            log.write("## summary (global) : " + str(before_cleaning) + "\t" + str(after_cleaning)  + "\t" + str(del_by_length) + "\t" + str(del_by_ns) + "\t" + str(del_by_complexity) + "\t" + str(del_by_duplicat) + "\t" + str(del_by_pairends) + "\t" + str(del_by_quality) + "\n")
            log.write("## Ended with code 0 (" + str(datetime.datetime.now()) + ")\n")
            log.close()
            sys.exit(0)
    else :
        print pyrocleaner_depts_error(options)
        sys.exit(1)