annotate pyrocleaner_galaxy_tool_V1.2/pyrocleaner.py @ 0:ef5dd11c01e6 default tip

pyrocleaner v1.2
author g2cmnty@test-web1.g2.bx.psu.edu
date Thu, 09 Jun 2011 06:09:09 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
1 #
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
2 # Pyrocleaner
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
3 # Copyright (C) 2009 INRA
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
4 #
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
5 # This program is free software: you can redistribute it and/or modify
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
6 # it under the terms of the GNU General Public License as published by
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
7 # the Free Software Foundation, either version 3 of the License, or
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
8 # (at your option) any later version.
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
9 #
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
10 # This program is distributed in the hope that it will be useful,
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
13 # GNU General Public License for more details.
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
14 #
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
15 # You should have received a copy of the GNU General Public License
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
16 # along with this program. If not, see <http://www.gnu.org/licenses/>.
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
17 #
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
18 # ChangeLog
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
19 # v1.2 (05/2011) : Correct a bug with --clean-pairends option introduced when adding the
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
20 # --clean-quality option
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
21 # v1.1 (02/2011) : Add the option --clean-quality to clean reads based on their bases quality
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
22 # Add the --aggressive option in order to keep only one read per cluster
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
23 # Modify the duplication strategy so the longer read is keept
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
24 # v1.0 (09/2009) : Pyrocleaner first version
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
25 #
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
26
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
27 __author__ = 'Plateforme bioinformatique Midi Pyrenees'
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
28 __copyright__ = 'Copyright (C) 2009 INRA'
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
29 __license__ = 'GNU General Public License'
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
30 __version__ = '1.2'
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
31 __email__ = 'support.genopole@toulouse.inra.fr'
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
32 __status__ = 'beta'
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
33
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
34 from Bio import SeqIO
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
35 from igraph import *
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
36 from optparse import *
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
37 import os, math, datetime, zlib, sys, re, glob, string, gzip
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
38
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
39
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
40 def create_roche_pairends_spacer_file(out_dir):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
41 """
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
42 Create the Roche pairends fasta file
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
43 @param out_dir : the directory where will be writen the fasta file
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
44 """
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
45 file = open(os.path.join(out_dir, "roche_spacers.fna"), 'wr')
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
46 file.write(">flx\nGTTGGAACCGAAAGGGTTTGAATTCAAACCCTTTCGGTTCCAAC\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
47 file.write(">titanium1\nTCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACG\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
48 file.write(">titanium2\nCGTAATAACTTCGTATAGCATACATTATACGAAGTTATACGA\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
49 file.close()
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
50 return file.name
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
51
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
52
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
53 def version_string ():
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
54 """
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
55 Return the pyrocleaner version
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
56 """
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
57 return "pyrocleaner " + __version__
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
58
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
59
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
60 def reads_to_sff (sff_file, seqs, output_file):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
61 """
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
62 Extract seqs reads from the sff_file
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
63 @dependences sfffile : the sff software is required to execute this function
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
64 @param sff_file : the input sff file
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
65 @param seqs : table of seqs to extract
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
66 @param output_file : the name of the output sff file
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
67 """
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
68 # First creates the to_write file
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
69 tmp_file = os.path.join(os.path.dirname(output_file), "reads_to_sff.txt")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
70 to_write_file = open(tmp_file, 'wr')
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
71 for read in seqs:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
72 to_write_file.write(read.id + "\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
73 to_write_file.close()
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
74 # Use the sfffile tools
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
75 cmd = "sfffile -i " + tmp_file + " -o " + output_file + " " + sff_file
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
76 os.system(cmd)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
77 # Clean temporary files
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
78 try: os.remove(tmp_file)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
79 except: pass
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
80
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
81
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
82 def filter_reads (seqs, options):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
83 """
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
84 Filter input seqs by length, ns and complexity if options are asked by user
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
85 @param seqs : table of seqs to filter
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
86 @param options : the options asked by the user
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
87 """
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
88 reads_id = []
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
89 reads_length = []
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
90 reads_ok = []
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
91 del_by_ns = 0
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
92 del_by_complexity = 0
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
93 del_by_length = 0
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
94 del_by_quality = 0
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
95 log.write("## Start Basic cleaning (" + str(datetime.datetime.now()) + ")\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
96
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
97 # Go throught all sequences
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
98 for i, reads_record in enumerate(seqs) :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
99 reads_id.append(reads_record.id)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
100 reads_ok.append(0)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
101
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
102 # If is asked to clean sequences by length using the standard
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
103 # deviation, save length to compute some statistics
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
104 if options.clean_length_std :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
105 reads_length.append(len(reads_record))
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
106
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
107 # If is asked to clean sequences by length using a window
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
108 # and the sequence has not been flagged as deleted
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
109 if options.clean_length_win and reads_ok[i] == 0:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
110 # Check if the sequence is longer than the min asked, if not flagged it as deleted
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
111 if len(reads_record) < int(options.min):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
112 reads_ok[i] = 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
113 del_by_length += 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
114 log.write(reads_id[i] + " deleted -> Length ( " + str(len(reads_record)) + "<" + str(options.min) + " )\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
115 # Check if the sequence is smaller than the max asked, if not flagged it as deleted
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
116 elif len(reads_record) > int(options.max):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
117 reads_ok[i] = 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
118 del_by_length += 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
119 log.write(reads_id[i] + " deleted -> Length ( " + str(len(reads_record)) + ">" + str(options.max) + " )\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
120
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
121 # If is asked to clean sequences with too much Ns
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
122 # and the sequence has not been flagged as deleted
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
123 if options.clean_ns and reads_ok[i] == 0:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
124 # Compute the rate of Ns into the current sequence
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
125 nb_n = (float(reads_record.seq.count("n")+reads_record.seq.count("N"))/float(len(reads_record)))*float(100)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
126 # If the rate is higher than the threshold flagged it as deleted
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
127 if nb_n > float(options.ns_percent) :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
128 reads_ok[i] = 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
129 del_by_ns += 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
130 log.write(reads_id[i] + " deleted -> Ns ( Reads containing " + str(nb_n) + "% of Ns > to the limit : " + str(options.ns_percent) + "% )\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
131
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
132 # If is asked to clean sequences with low complexity using a sliding window
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
133 # and the sequence has not been flagged as deleted
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
134 if options.clean_complexity_win and reads_ok[i] == 0:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
135 is_complex = False
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
136 # For each window
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
137 for win in range(0, len(reads_record)-options.window, options.step):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
138 start = win
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
139 stop = start + options.window
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
140 # Compute the window complexity
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
141 w_cplx = (float(len(zlib.compress(str(reads_record.seq[start:stop]))))/float(stop-start+1))*float(100)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
142 # if the window complexity is higher to the threshold, flag the whole sequence as complex
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
143 if w_cplx >= float(options.complexity):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
144 is_complex = True
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
145 break
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
146 # If no window has been flagged as complex, then flagg the sequence as deleted
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
147 if not is_complex:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
148 reads_ok[i] = 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
149 del_by_complexity += 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
150 log.write(reads_id[i] + " deleted -> Complexity ( No window complexity > " + str(options.complexity) + " found )\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
151
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
152 # If is asked to clean sequences with low complexity working on the whole sequence
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
153 # and the sequence has not been flagged as deleted
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
154 if options.clean_complexity_full and reads_ok[i] == 0:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
155 # Compute the complexity on the whole sequence
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
156 cplx = (float(len(zlib.compress(str(reads_record.seq))))/float(len(reads_record)))*float(100)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
157 # If the complexity is higher to the threshold, flag the sequence as deleted
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
158 if cplx < float(options.complexity) :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
159 reads_ok[i] = 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
160 del_by_complexity += 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
161 log.write(reads_id[i] + " deleted -> Complexity ( Reads complexity " + str(cplx) + " < to the minimum : " + str(options.complexity) + " )\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
162
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
163 # If is asked to clean sequences with low quality and quality information is available
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
164 # and the sequence has not been flagged as deleted
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
165 if options.clean_quality and reads_ok[i] == 0 and reads_record.letter_annotations.has_key("phred_quality"):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
166 # If at least one base has a quality score higher than the threashold
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
167 maw_qual = max(reads_record.letter_annotations["phred_quality"])
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
168 if maw_qual < int(options.quality_threshold) :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
169 reads_ok[i] = 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
170 del_by_quality += 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
171 log.write(reads_id[i] + " deleted -> Quality ( Reads minimum quality " + str(maw_qual) + " < to the threshold : " + str(options.quality_threshold) + " )\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
172
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
173 # If is asked to clean sequences by length using the standard deviation
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
174 if options.clean_length_std :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
175 # Compute the mean and the standard deviation
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
176 mean = sum(reads_length) / len(reads_length)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
177 mq = mean**2
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
178 s = sum([ x**2 for x in reads_length])
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
179 var = s/len(reads_length) - mq
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
180 etype = math.sqrt(var)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
181 # For each sequences
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
182 for i, id in enumerate(reads_id):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
183 # If the sequence has not been flagged as deleted
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
184 if reads_ok[i] == 0 :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
185 # If the sequence length is higher than the upper threshold, flag the sequence as deleted
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
186 if reads_length[i] > mean + options.std*etype:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
187 reads_ok[i] = 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
188 del_by_length += 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
189 log.write(reads_id[i] + " deleted -> Length ( " + str(reads_length[i]) + ">" + str(mean) + "+" + str(options.std) + "*" + str(etype) + " )\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
190 # If the sequence length is smaller than the lower threshold, flag the sequence as deleted
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
191 elif reads_length[i] < mean - options.std*etype:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
192 reads_ok[i] = 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
193 del_by_length += 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
194 log.write(reads_id[i] + " deleted -> Length ( " + str(reads_length[i]) + "<" + str(mean) + "+" + str(options.std) + "*" + str(etype) + " )\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
195
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
196 seqs_to_return = []
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
197 # Then get only sequences not flagged to be deleted
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
198 for i, reads_record in enumerate(seqs) :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
199 if reads_ok[i] == 0:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
200 seqs_to_return.append(reads_record)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
201
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
202 # Return values
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
203 return [seqs_to_return, del_by_length, del_by_ns, del_by_complexity, del_by_quality]
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
204
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
205
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
206 def filter_same_reads (seqs, options):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
207 """
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
208 Filter input seqs by duplicat, if sequences are too similar keep only one to represent the cluster
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
209 @param seqs : table of seqs to filter
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
210 @param options : the options asked by the user
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
211 """
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
212
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
213 megablast_input = os.path.join(options.output, "megablast_input.fasta")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
214 log.write("## Start cleaning duplicated reads (" + str(datetime.datetime.now()) + ")\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
215 log.write("## formatdb the fasta file (" + str(datetime.datetime.now()) + ")\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
216
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
217 # First write down seqs
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
218 fasta = open(megablast_input, "w")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
219 SeqIO.write(seqs, fasta, "fasta")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
220 fasta.close()
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
221
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
222 # Then formatdb the fasta file (no formatdb utils in Biopython)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
223 cmd = "formatdb -i %s -p F" % megablast_input
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
224 os.system(cmd)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
225 log.write("## megablast (" + str(datetime.datetime.now()) + ")\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
226
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
227 # In case of pairends, use words of 50, so megablast cannot connect with spacers
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
228 opts = ""
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
229 if options.clean_pairends:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
230 opts = " -W 50 -H 10 "
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
231
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
232 # Megablast the fasta file versus itself
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
233 cmd = "megablast -d " + megablast_input + " -i " + megablast_input + opts + " -p 98 -a " + str(options.nb_cpus) + " -M 500000 -s 100 -D 3 | grep -v '^#' | perl -lne 'chomp; split; if ($_[0] ne $_[1]) { if (($_[6] == 1 ) && ($_[8] == 1) && ($_{$_[0]} < 30)) { print $_;$_{$_[0]}++; }}' > " + megablast_input + ".res"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
234 os.system(cmd)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
235
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
236 # Let's get the reads length with the fasta file and creates the graph
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
237 log.write("## Parsing the megablast file (" + str(datetime.datetime.now()) + ")\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
238 gseqs = {}
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
239 vertices = []
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
240 for reads_record in seqs :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
241 vertices.append({'name': reads_record.id})
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
242 gseqs[reads_record.id] = len(reads_record)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
243
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
244 # Connects reads from hits starting at 1 and with ends closed to each others
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
245 log.write("## Creating the graph (" + str(datetime.datetime.now()) + ")\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
246 edges = []
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
247 for read in open(megablast_input + ".res", 'rU').readlines() :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
248 parts = read.rstrip().split()
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
249 len1 = gseqs[parts[0]]
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
250 len2 = gseqs[parts[1]]
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
251 if options.clean_aggressive :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
252 edges.append({'source': parts[0], 'target': parts[1]})
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
253 elif math.fabs(len1-len2) < options.duplication_limit:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
254 if int(parts[7]) > (len1 - options.duplication_limit) and int(parts[9]) > len2 - options.duplication_limit:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
255 # This alignments are realy similar -> may be the same -> link them into the graph
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
256 edges.append({'source': parts[0], 'target': parts[1]})
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
257
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
258 # Then get connected components and extract one of them as cluster leader
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
259 log.write("## Comput connected components (" + str(datetime.datetime.now()) + ")\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
260 gr = Graph.DictList(vertices, edges)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
261 connex = gr.clusters().membership
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
262
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
263 clusters = {}
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
264 for i, vertex in enumerate(vertices):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
265 cluster_id = connex[i]
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
266 try:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
267 clusters[cluster_id].append(seqs[i])
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
268 except:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
269 clusters[cluster_id] = []
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
270 clusters[cluster_id].append(seqs[i])
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
271
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
272 del_by_duplicat = 0
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
273 # Write down into the log the composition of each cluster
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
274 clusters_stats = {}
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
275 seqs_to_return = []
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
276 for cluster_id in clusters:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
277 cl_elts = ""
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
278 del_by_duplicat += len(clusters[cluster_id]) - 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
279 longest_value = 0
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
280 cluster_leader = None
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
281 for seq_record in clusters[cluster_id]:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
282 # Find out which sequence is the longest
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
283 if len(seq_record.seq) > longest_value:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
284 cluster_leader = seq_record
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
285
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
286 seqs_to_return.append(cluster_leader)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
287 for seq_record in clusters[cluster_id]:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
288 if seq_record.id != cluster_leader.id:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
289 log.write(seq_record.id + " deleted -> Duplicated ( flagged as " + cluster_leader.id + " duplicat )\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
290 cl_elts += seq_record.id + " "
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
291 log.write("## cluster leader: " + cluster_leader.id + " of cluster composed by : " + cl_elts + "\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
292
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
293 try :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
294 clusters_stats[len(clusters[cluster_id])] += 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
295 except:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
296 clusters_stats[len(clusters[cluster_id])] = 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
297
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
298 # Write down a summary of what has been done
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
299 log_header = "## header (duplicated) : "
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
300 log_summary = "## summary (duplicated) : "
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
301 for stat in sorted(clusters_stats.keys()):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
302 log_header += str(stat) + "\t"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
303 log_summary += str(clusters_stats[stat]) + "\t"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
304 log.write(log_header + "\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
305 log.write(log_summary + "\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
306
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
307 # Clean temporary files
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
308 try:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
309 os.remove(megablast_input)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
310 os.remove(megablast_input+".nhr")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
311 os.remove(megablast_input+".nin")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
312 os.remove(megablast_input+".nsq")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
313 os.remove(megablast_input+".res")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
314 os.remove("formatdb.log")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
315 os.remove("error.log")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
316 except:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
317 pass
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
318
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
319 # Returns results
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
320 return [seqs_to_return, del_by_duplicat]
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
321
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
322
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
323 def filter_pairends(seqs, options):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
324 """
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
325 Filter pairends sequences and split sequences without pairends into a fasta file
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
326 @param seqs : the table of sequences to filter
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
327 @param options : the options asked by the user
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
328 """
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
329
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
330 # Setup output files
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
331 shotgun_ffile = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".shotgun.clean.fasta")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
332 shotgun_qfile = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".shotgun.clean.qual")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
333 crossmatch_input = os.path.join(options.output, os.path.basename(options.input_file)+".cross_match_input.fasta")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
334 split_pairends_fasta = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".pairends.splited.clean.fasta")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
335 split_pairends_qual = os.path.join(options.output, os.path.splitext(os.path.basename(options.input_file))[0]+".pairends.splited.clean.qual")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
336
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
337 # First write down seqs
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
338 fasta = open(crossmatch_input, "w")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
339 SeqIO.write(seqs, fasta, "fasta")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
340 fasta.close()
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
341
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
342 # Write down the qual file for cross_match if possible
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
343 try:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
344 qual = open(crossmatch_input+".qual", "w")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
345 SeqIO.write(seqs, qual, "qual")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
346 qual.close()
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
347 except:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
348 os.remove(crossmatch_input+".qual")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
349
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
350 # Cross_match reverse matches pattern
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
351 rev_regex = re.compile("(\s+)?\d+\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+C\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
352 # Cross_match forward matches pattern
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
353 fwd_regex = re.compile("(\s+)?\d+\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+\s+(\S+)\s+(\S+)\s+(\S+)\s+\S+")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
354
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
355 # Write the spacer file and execute cross_match against the input sequences
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
356 spacers_file = create_roche_pairends_spacer_file(options.output)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
357 cmd = "cross_match " + crossmatch_input + " " + spacers_file + " -minmatch 10 -minscore 25 > " + crossmatch_input + ".cross_match.res"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
358 os.system(cmd)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
359
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
360 # Parse the cross_match file
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
361 cross_match_tab = {}
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
362 block_found = False
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
363 for line in open(crossmatch_input + ".cross_match.res", 'r'):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
364 save_line = False
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
365 rm = rev_regex.match(line)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
366 fm = fwd_regex.match(line)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
367 if rm != None: # If it's a reverse matches
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
368 block_found = True
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
369 (percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, endSecondMatch, startSecondMatch)=(float(rm.group(2)), rm.group(3), int(rm.group(4)), int(rm.group(5)), rm.group(6), int(rm.group(7)), int(rm.group(8)))
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
370 save_line = True
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
371 elif fm != None: # If it's a forward matches
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
372 block_found = True
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
373 (percentMis, primary_match, startFirstMatch, endFirstMatch, secondary_match, startSecondMatch, endSecondMatch)=(float(fm.group(2)), fm.group(3), int(fm.group(4)), int(fm.group(5)), fm.group(6), int(fm.group(7)), int(fm.group(8)))
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
374 save_line = True
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
375 else : save_line = False
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
376
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
377 if line.startswith("Discrepancy summary:"): # This is the end of the section
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
378 break
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
379
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
380 # Save the line
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
381 if save_line:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
382 try:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
383 cross_match_tab[primary_match][secondary_match].append([startFirstMatch, endFirstMatch, startSecondMatch, endSecondMatch])
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
384 except:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
385 cross_match_tab[primary_match] = []
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
386 cross_match_tab[primary_match].append([secondary_match, int(startFirstMatch), int(endFirstMatch), int(startSecondMatch), int(endSecondMatch)])
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
387
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
388 # Then get the spacers_length
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
389 spacers_length = {}
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
390 for seq_record in SeqIO.parse(open(spacers_file, "rU"), "fasta") :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
391 spacers_length[seq_record.id] = len(seq_record.seq.tostring())
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
392
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
393 # Finaly go throught all sequences
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
394 seqs_to_return = []
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
395 shotgun_seqs = []
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
396 del_by_pairends = 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
397 pe_splited = []
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
398
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
399 for i, seq_record in enumerate(seqs) :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
400 if not cross_match_tab.has_key(seq_record.id):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
401 # No spacers found -> just add it to the shotgun_seqs tab
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
402 shotgun_seqs.append(seq_record)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
403 log.write(seq_record.id + " shotgun -> no spacer found\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
404 elif len(cross_match_tab[seq_record.id]) > 1:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
405 # If multiple linker -> delete the whole sequence
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
406 log.write(seq_record.id + " deleted -> multiple spacers found : " + str(len(cross_match_tab[seq_record.id])) + "\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
407 del_by_pairends += 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
408 elif (cross_match_tab[seq_record.id][0][2]-cross_match_tab[seq_record.id][0][1] < (spacers_length[cross_match_tab[seq_record.id][0][0]]-options.missmatch)):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
409 if (cross_match_tab[seq_record.id][0][1] == 1):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
410 shotgun_seqs.append(seq_record[cross_match_tab[seq_record.id][0][2]:])
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
411 log.write(seq_record.id + " shotgun -> spacer found at the begining\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
412 elif (cross_match_tab[seq_record.id][0][2] == len(seq_record)):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
413 shotgun_seqs.append(seq_record[:cross_match_tab[seq_record.id][0][1]])
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
414 log.write(seq_record.id + " shotgun -> spacer found at the end\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
415 else :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
416 log.write(seq_record.id + " deleted -> partiel spacer found in the middle of the read \n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
417 del_by_pairends += 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
418 elif cross_match_tab[seq_record.id][0][1] >= options.border_limit and len(seq_record)-cross_match_tab[seq_record.id][0][2] >= options.border_limit:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
419 seqs_to_return.append(seq_record)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
420 if options.split_pairends:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
421 sub_seq1 = seq_record[cross_match_tab[seq_record.id][0][2]:]
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
422 sub_seq1.id = seq_record.id + ".r"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
423 pe_splited.append(sub_seq1)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
424 sub_seq2 = seq_record[:cross_match_tab[seq_record.id][0][1]]
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
425 sub_seq2.id = seq_record.id + ".f"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
426 pe_splited.append(sub_seq2)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
427 elif cross_match_tab[seq_record.id][0][1] < options.border_limit and len(seq_record)-cross_match_tab[seq_record.id][0][2] < options.border_limit:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
428 log.write(seq_record.id + " deleted -> both borders < to the border limit " + str(options.border_limit) + "\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
429 del_by_pairends += 1
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
430 elif cross_match_tab[seq_record.id][0][1] < options.border_limit:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
431 shotgun_seqs.append(seq_record[cross_match_tab[seq_record.id][0][2]:])
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
432 log.write(seq_record.id + " shotgun -> spacer found : left border way too short deleted \n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
433 elif len(seq_record)-cross_match_tab[seq_record.id][0][2] < options.border_limit:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
434 shotgun_seqs.append(seq_record[:cross_match_tab[seq_record.id][0][1]])
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
435 log.write(seq_record.id + " shotgun -> spacer found : right border way too short deleted \n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
436
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
437 # Write down if required the splited pairends file
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
438 if options.split_pairends:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
439 handle = open(split_pairends_fasta, "w")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
440 SeqIO.write(pe_splited, handle, "fasta")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
441 handle.close()
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
442 handle = open(split_pairends_qual, "w")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
443 SeqIO.write(pe_splited, handle, "qual")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
444 handle.close()
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
445
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
446 # Finaly clean up new shotguns seqs
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
447 [shotgun_seqs_clean, del_by_length, del_by_ns, del_by_complexity, del_by_quality] = filter_reads(shotgun_seqs, options)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
448 handle = open(shotgun_ffile, "w")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
449 SeqIO.write(shotgun_seqs_clean, handle, "fasta")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
450 handle.close()
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
451 in_shot_gun = 0
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
452 try:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
453 handle = open(shotgun_qfile, "w")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
454 SeqIO.write(shotgun_seqs_clean, handle, "qual")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
455 handle.close()
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
456 in_shot_gun = len(shotgun_seqs_clean)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
457 except:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
458 pass
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
459
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
460 log.write("## header (pairends) : pairends\ttotal shotgun\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
461 log.write("## summary (pairends) : " + str(len(seqs_to_return)) + "\t" + str(len(shotgun_seqs_clean)) + "\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
462
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
463 # Clean temporary files
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
464 try:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
465 os.remove(crossmatch_input)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
466 os.remove(spacers_file)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
467 os.remove(crossmatch_input+".cross_match.res")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
468 os.remove(crossmatch_input+".log")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
469 os.remove(crossmatch_input+".qual")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
470 except:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
471 pass
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
472
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
473 # Then returned pairends ones
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
474 return [seqs_to_return, in_shot_gun, del_by_length, del_by_ns, del_by_complexity, del_by_quality, del_by_pairends]
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
475
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
476
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
477 def get_seqs (options):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
478 """
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
479 Converts input seqs in a BioPython seq table
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
480 @param options : the options asked by the user
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
481 """
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
482
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
483 # First get fasta or/and qual input files
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
484 qual_file = ""
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
485 if options.format == "sff":
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
486 sff_file = options.input_file
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
487
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
488 if sff_file.endswith(".gz"):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
489 '''Gunzip the given file and then remove the file.'''
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
490 r_file = gzip.GzipFile(sff_file, 'r')
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
491 write_file = os.path.join(options.output, string.rstrip(os.path.basename(sff_file), '.gz'))
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
492 w_file = open(write_file, 'w')
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
493 w_file.write(r_file.read())
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
494 w_file.close()
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
495 r_file.close()
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
496 sff_file = write_file
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
497
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
498 base = os.path.basename(sff_file)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
499 fasta_file = os.path.join(options.output, base + '.fasta')
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
500 qual_file = os.path.join(options.output, base + '.qual')
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
501 xml_file = os.path.join(options.output, base + '.xml')
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
502 format = "fasta"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
503 if not os.path.isfile(fasta_file) or not os.path.isfile(qual_file) or not os.path.isfile(xml_file):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
504 #First extract the sff file to fasta, qual and xml file
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
505 cmd = "sff_extract.py -c -s " + fasta_file + " -q " + qual_file + " -x " + xml_file + " " + sff_file + " >> " + options.output+"/"+options.log_file
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
506 os.system(cmd)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
507 else :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
508 log = open(options.log_file, "a+")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
509 log.write(fasta_file + ", " + qual_file + ", " + xml_file + " already exist, working on existing files\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
510 log.close()
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
511 elif options.format == "fasta" or options.format == "fastq":
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
512 format = options.format
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
513 fasta_file = options.input_file
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
514 else :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
515 print "Error : " + options.format + " is not a supported format!"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
516 sys.exit(1)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
517
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
518 if options.qual_file:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
519 qual_file = options.qual_file
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
520
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
521 # If we got a quality file
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
522 if qual_file :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
523 quals = {}
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
524 # If the file is gziped
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
525 if qual_file.endswith(".gz"):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
526 for qual in SeqIO.parse(gzip.open(qual_file), "qual") :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
527 quals[qual.id] = qual
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
528 else :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
529 for qual in SeqIO.parse(open(qual_file), "qual") :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
530 quals[qual.id] = qual
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
531
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
532 # If the fasta file is gziped
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
533 if fasta_file.endswith(".gz"):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
534 seqs = []
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
535 for record in SeqIO.parse(gzip.open(fasta_file), format) :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
536 if qual_file :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
537 record.letter_annotations["phred_quality"] = quals[record.id].letter_annotations["phred_quality"]
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
538 seqs.append(record)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
539 else :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
540 seqs = []
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
541 for record in SeqIO.parse(open(fasta_file), format) :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
542 if qual_file :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
543 record.letter_annotations["phred_quality"] = quals[record.id].letter_annotations["phred_quality"]
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
544 seqs.append(record)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
545
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
546 # Clean temporary files
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
547 if options.format == "sff":
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
548 try:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
549 os.remove(fasta_file)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
550 os.remove(qual_file)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
551 os.remove(xml_file)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
552 except:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
553 pass
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
554
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
555 # Returns the table of sequences
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
556 return [seqs]
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
557
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
558 def which (program):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
559 """
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
560 Return if the asked program exist in the user path
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
561 @param options : the options asked by the user
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
562 """
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
563 import os
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
564 def is_exe(fpath):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
565 return os.path.exists(fpath) and os.access(fpath, os.X_OK)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
566 fpath, fname = os.path.split(program)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
567 if fpath:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
568 if is_exe(program):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
569 return program
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
570 else:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
571 for path in os.environ["PATH"].split(os.pathsep):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
572 exe_file = os.path.join(path, program)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
573 if is_exe(exe_file):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
574 return exe_file
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
575 return None
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
576
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
577 def pyrocleaner_depts_error(options):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
578 """
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
579 Return the list of dependencies missing to run the pyrocleaner
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
580 """
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
581 error = ""
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
582 if which("formatdb") == None:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
583 error += " - formatdb is missing\n"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
584 if which("megablast") == None:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
585 error += " - megablast is missing\n"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
586 if which("sff_extract.py") == None:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
587 error += " - sff_extract.py\n"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
588 if which("cross_match") == None and options.clean_pairends != None:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
589 error += " - cross_match\n"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
590 if error != "":
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
591 error = "Cannot execute pyrocleaner, following dependencies are missing :\n" + error + "Please install them before to run the pyrocleaner!"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
592 return error
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
593
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
594
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
595 if __name__ == "__main__":
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
596
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
597 parser = OptionParser(usage="Usage: pyrocleaner.py -i FILE [options]")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
598
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
599 usage = "usage: %prog -i file -o output -f format"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
600 desc = " Clean sequences from 454 SFF files \n"\
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
601 " ex : pyrocleaner.py -i file.sff -o /path/to/output -f sff --clean-pairends --clean-length-std --clean-ns --clean-duplicated-reads --clean-complexity-win"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
602 parser = OptionParser(usage = usage, version = version_string(), description = desc)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
603
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
604 igroup = OptionGroup(parser, "Input files options","")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
605 igroup.add_option("-i", "--in", dest="input_file",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
606 help="The file to clean, can be [sff|fastq|fasta]", metavar="FILE")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
607 igroup.add_option("-q", "--qual", dest="qual_file",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
608 help="The quality file to use if input file is fasta", metavar="FILE")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
609 igroup.add_option("-f", "--format", dest="format",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
610 help="The format of the input file [sff|fastq|fasta] default is sff", type="string", default="sff")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
611 parser.add_option_group(igroup)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
612
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
613 ogroup = OptionGroup(parser, "Output files options","")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
614 ogroup.add_option("-o", "--out", dest="output",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
615 help="The output folder where to store results")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
616 ogroup.add_option("-g", "--log", dest="log_file",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
617 help="The log file name (default:pyrocleaner.log)", metavar="FILE", default="pyrocleaner.log")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
618 ogroup.add_option("-z", "--split-pairends", action="store_true", dest="split_pairends",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
619 default=False, help="Write splited pairends sequences into a fasta file")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
620 parser.add_option_group(ogroup)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
621
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
622 cgroup = OptionGroup(parser, "Cleaning options","")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
623 cgroup.add_option("-p", "--clean-pairends", action="store_true", dest="clean_pairends",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
624 default=False, help="Clean pairends")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
625 cgroup.add_option("-l", "--clean-length-std", action="store_true", dest="clean_length_std",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
626 default=False, help="Filter short reads shorter than mean less x*standard deviation and long reads longer than mean plus x*standard deviation")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
627 cgroup.add_option("-w", "--clean-length-win", action="store_true", dest="clean_length_win",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
628 default=False, help="Filter reads with a legnth in between [x:y]")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
629 cgroup.add_option("-n", "--clean-ns", action="store_true", dest="clean_ns",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
630 default=False, help="Filter reads with too much N")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
631 cgroup.add_option("-d", "--clean-duplicated-reads", action="store_true", dest="clean_duplicated_reads",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
632 default=False, help="Filter duplicated reads")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
633 cgroup.add_option("-c", "--clean-complexity-win", action="store_true", dest="clean_complexity_win",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
634 default=False, help="Filter low complexity reads computed on a sliding window")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
635 cgroup.add_option("-u", "--clean-complexity-full", action="store_true", dest="clean_complexity_full",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
636 default=False, help="Filter low complexity reads computed on the whole sequence")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
637 cgroup.add_option("-k", "--clean-quality", action="store_true", dest="clean_quality",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
638 default=False, help="Filter low quality reads")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
639 parser.add_option_group(cgroup)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
640
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
641 pgroup = OptionGroup(parser, "Processing options","")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
642 pgroup.add_option("-a", "--acpus", dest="nb_cpus",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
643 help="Number of processors to use", type="int", default=1)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
644 pgroup.add_option("-r", "--recursion", dest="recursion",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
645 help="Recursion limit when computing duplicated reads", type="int", default=1000)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
646 parser.add_option_group(pgroup)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
647
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
648 cpgroup = OptionGroup(parser, "Cleaning parameters","")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
649 cpgroup.add_option("-b", "--border-limit", dest="border_limit",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
650 help="Minimal length between the spacer and the read extremity (used with --clean-pairends option, default=70)", type="int", default=70)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
651 cpgroup.add_option("-m", "--aggressive", action="store_true", dest="clean_aggressive",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
652 default=False, help="Filter all duplication reads gathered in a cluster to keep one (used with --clean-duplicated-reads, default=False)")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
653 cpgroup.add_option("-e", "--missmatch", dest="missmatch",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
654 help="Limit of missmatch nucleotide (used with --clean-pairends option, default=10)", type="int", default=10)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
655 cpgroup.add_option("-j", "--std", dest="std",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
656 help="Number standard deviation to use (used with --clean-length-std option, default=2)", type="int", default=2)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
657 cpgroup.add_option("-1", "--min", dest="min",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
658 help="Minimal length (used with --clean-length-win option, default=200)", type="int", default=200)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
659 cpgroup.add_option("-2", "--max", dest="max",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
660 help="Maximal length (used with --clean-length-win option, default=600)", type="int", default=600)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
661 cpgroup.add_option("-3", "--quality-threshold", dest="quality_threshold",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
662 help="At least one base pair has to be equal or higher than this value (used with --clean-quality, default=35)", type="int", default=35)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
663 cpgroup.add_option("-s", "--ns_percent", dest="ns_percent",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
664 help="Percentage of N to use to filter reads (used with --clean-ns option, default=4)", type="int", default=4)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
665 cpgroup.add_option("-t", "--duplication_limit", dest="duplication_limit",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
666 help="Limit size difference (used with --clean-duplicated-reads, default=70)", type="int", default=70)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
667 cpgroup.add_option("-v", "--window", dest="window",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
668 help="The window size (used with --clean-complexity-win, default=100)", type="int", default=100)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
669 cpgroup.add_option("-x", "--step", dest="step",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
670 help="The window step (used with --clean-complexity-win, default=5)", type="int", default=5)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
671 cpgroup.add_option("-y", "--complexity", dest="complexity",
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
672 help="Minimal complexity/length ratio (used with --clean-complexity-win and --clean-complexity-full, default=40)", type="int", default=40)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
673 parser.add_option_group(cpgroup)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
674
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
675 (options, args) = parser.parse_args()
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
676
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
677 sys.setrecursionlimit(options.recursion)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
678
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
679 if not pyrocleaner_depts_error(options):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
680
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
681 if options.input_file == None or options.output == None and options.format == None:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
682 parser.print_help()
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
683 sys.exit(1)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
684 elif not options.clean_pairends and not options.clean_length_std and not options.clean_length_win and not options.clean_ns and not options.clean_duplicated_reads and not options.clean_complexity_win and not options.clean_complexity_full and not options.clean_quality :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
685 print "Please select at least one cleaning option between [--clean-pairends|--clean-length-std|--clean-length-win|--clean-ns|--clean-duplicated-reads|--clean-complexity-win|--clean-complexity-full|--clean-quality] for help use the option -h or --help"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
686 sys.exit(1)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
687 else:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
688
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
689 # Create output directory if doesn't exist
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
690 if not os.path.isdir(options.output) : os.mkdir(options.output )
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
691
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
692 global log
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
693 log = open(options.output+"/"+options.log_file, "w")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
694 log.write("## Start processing (" + str(datetime.datetime.now()) + ")\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
695 log.write("## with the following options: \n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
696 opts = ""
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
697 if options.clean_complexity_full:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
698 opts += " - Clean complexity based on the whole sequence with " + str(options.complexity) + " as complexity/length ratio limit.\n"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
699 if options.clean_complexity_win:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
700 opts += " - Clean complexity based on a sliding window with a size of " + str(options.window) + ", a step of " + str(options.step) + " and " + str(options.complexity) + " as complexity/length ratio limit.\n"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
701 if options.clean_duplicated_reads and not options.clean_aggressive:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
702 opts += " - Clean duplicated reads using " + str(options.duplication_limit) + " as limit for the difference between reads ends to consider them as duplicat.\n"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
703 elif options.clean_duplicated_reads and options.clean_aggressive :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
704 opts += " - Clean duplicated reads using " + str(options.duplication_limit) + " as limit for the difference between reads ends to consider them as duplicat and keep only one read per cluster.\n"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
705 if options.clean_length_std:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
706 opts += " - Clean reads shorter than reads length mean minus " + str(options.std) + " standard deviation and reads longer than reads length mean plus " + str(options.std) + " standard deviation.\n"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
707 if options.clean_length_win:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
708 opts += " - Clean reads with a length not in between [" + str(options.min) + ";" + str(options.max) + "].\n"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
709 if options.clean_ns:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
710 opts += " - Clean reads with a percentage of Ns higher than " + str(options.ns_percent) + "%.\n"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
711 if options.clean_pairends:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
712 opts += " - Clean pairends reads if the sequence size between the spacer and the read begning/end is higher than " + str(options.border_limit) + " nucleaotides or if " + str(options.missmatch) + " nucleotides missmatch with the spacer.\n"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
713 if options.clean_quality and (options.format == "sff" or options.format == "fastq" or (options.format == "fasta" and options.qual_file)) :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
714 opts += " - Clean reads if no bases quality has a score higher than " + str(options.quality_threshold) + ".\n"
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
715 elif options.clean_quality :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
716 print "No quality check will be performed beacause there is no quality file provided."
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
717 log.write(opts)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
718 log.close()
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
719
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
720 # 1 - First get inputs from options
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
721 [seqs] = get_seqs(options)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
722 before_cleaning = len(seqs)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
723 del_by_length = 0
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
724 del_by_ns = 0
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
725 del_by_complexity = 0
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
726 del_by_duplicat = 0
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
727 del_by_pairends = 0
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
728 del_by_quality = 0
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
729 in_shot_gun = 0
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
730
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
731 log = open(options.output+"/"+options.log_file, "a+")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
732 # 2 - clean reads
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
733 # 2.1 - Clean reads by length, ns, complexity
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
734 if options.clean_ns or options.clean_length_std or options.clean_length_win or options.clean_complexity_full or options.clean_complexity_win or options.clean_quality:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
735 if options.format == "sff" :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
736 [seqs, del_by_length, del_by_ns, del_by_complexity, del_by_quality] = filter_reads(seqs, options)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
737 elif options.format == "fasta" or options.format == "fastq":
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
738 [seqs, del_by_length, del_by_ns, del_by_complexity, del_by_quality] = filter_reads(seqs, options)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
739
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
740 # 2.2 - Clean reads by duplicat
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
741 if options.clean_duplicated_reads:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
742 [seqs, del_by_duplicat] = filter_same_reads(seqs, options)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
743
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
744 # 3 - Clean pairends
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
745 if options.clean_pairends:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
746 [seqs, in_shot_gun, del_by_length_pe, del_by_ns_pe, del_by_complexity_pe, del_by_quality_pe, del_by_pairends] = filter_pairends(seqs, options)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
747 del_by_length += del_by_length_pe
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
748 del_by_ns += del_by_ns_pe
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
749 del_by_complexity += del_by_complexity_pe
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
750 del_by_quality += del_by_quality_pe
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
751
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
752 # 4 - Create the output in the same format than the input
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
753 after_cleaning = len(seqs) + in_shot_gun
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
754 if options.input_file.endswith(".gz"):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
755 ifile = string.rstrip(options.input_file, '.gz')
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
756 else : ifile = options.input_file
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
757 if options.clean_pairends: output = os.path.join(options.output, os.path.splitext(os.path.basename(ifile))[0]+".pairends.clean")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
758 else : output = os.path.join(options.output, os.path.splitext(os.path.basename(ifile))[0]+".clean")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
759 output += "." + options.format
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
760 if options.format == "sff" :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
761 if options.input_file.endswith(".gz"):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
762 sfffile = os.path.join(options.output, string.rstrip(os.path.basename(options.input_file), '.gz'))
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
763 else :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
764 sfffile = options.input_file
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
765 if which("sfffile") == None:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
766 fastq = open(output, "w")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
767 SeqIO.write(seqs, fastq, "fastq")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
768 fastq.close()
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
769 else :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
770 reads_to_sff(sfffile, seqs, output)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
771 # Clean temporary files
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
772 if options.input_file.endswith(".gz"):
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
773 os.remove(sfffile)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
774
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
775 elif options.format == "fasta" or options.format == "fastq":
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
776 fasta = open(output, "w")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
777 SeqIO.write(seqs, fasta, options.format)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
778 fasta.close()
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
779 if options.format == "fasta" and options.qual_file != None:
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
780 qual = open(os.path.splitext(output)[0]+".qual", "w")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
781 SeqIO.write(seqs, qual, "qual")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
782 qual.close()
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
783
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
784 # 5 - Write down the analyze sumary
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
785 log.write("## header (global) : nb reads at begining\tnb reads after clean up\tlength\tns\tcomplexity\tduplicat\tpairends\tquality\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
786 log.write("## summary (global) : " + str(before_cleaning) + "\t" + str(after_cleaning) + "\t" + str(del_by_length) + "\t" + str(del_by_ns) + "\t" + str(del_by_complexity) + "\t" + str(del_by_duplicat) + "\t" + str(del_by_pairends) + "\t" + str(del_by_quality) + "\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
787 log.write("## Ended with code 0 (" + str(datetime.datetime.now()) + ")\n")
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
788 log.close()
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
789 sys.exit(0)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
790 else :
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
791 print pyrocleaner_depts_error(options)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
792 sys.exit(1)
ef5dd11c01e6 pyrocleaner v1.2
g2cmnty@test-web1.g2.bx.psu.edu
parents:
diff changeset
793