Mercurial > repos > drosofff > yac_clipper
annotate yac.py @ 1:8a8f62b4bf27 draft
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
author | drosofff |
---|---|
date | Wed, 24 Jun 2015 17:06:30 -0400 |
parents | 307cd074fa95 |
children |
rev | line source |
---|---|
0
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
1 #!/usr/bin/python |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
2 # yac = yet another clipper |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
3 # v 1.2.1 - 23-08-2014 - Support FastQ output |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
4 # v 1.1.0 - 23-08-2014 - argparse implementation |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
5 # Usage yac.py $input $output $adapter_to_clip $min $max $Nmode |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
6 # Christophe Antoniewski <drosofff@gmail.com> |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
7 |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
8 import sys |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
9 import string |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
10 import argparse |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
11 from itertools import islice |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
12 |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
13 |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
14 def Parser(): |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
15 the_parser = argparse.ArgumentParser() |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
16 the_parser.add_argument( |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
17 '--input', action="store", nargs='+', help="input fastq files") |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
18 the_parser.add_argument( |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
19 '--output', action="store", type=str, help="output, clipped fasta file") |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
20 the_parser.add_argument( |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
21 '--output_format', action="store", type=str, help="output format, fasta or fastq") |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
22 the_parser.add_argument( |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
23 '--adapter_to_clip', action="store", type=str, help="adapter sequence to clip") |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
24 the_parser.add_argument( |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
25 '--min', action="store", type=int, help="minimal size of clipped sequence to keep") |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
26 the_parser.add_argument( |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
27 '--max', action="store", type=int, help="maximal size of clipped sequence to keep") |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
28 the_parser.add_argument('--Nmode', action="store", type=str, choices=[ |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
29 "accept", "reject"], help="accept or reject sequences with N for clipping") |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
30 args = the_parser.parse_args() |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
31 args.adapter_to_clip = args.adapter_to_clip.upper() |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
32 return args |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
33 |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
34 |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
35 class Clip: |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
36 |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
37 def __init__(self, inputfile, outputfile, output_format, adapter, minsize, maxsize, Nmode): |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
38 self.inputfile = inputfile |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
39 self.outputfile = outputfile |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
40 self.output_format = output_format |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
41 self.adapter = adapter |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
42 self.minsize = int(minsize) |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
43 self.maxsize = int(maxsize) |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
44 self.Nmode = Nmode |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
45 |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
46 def motives(sequence): |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
47 '''return a list of motives for perfect (6nt) or imperfect (7nt with one mismatch) search on import string module''' |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
48 sequencevariants = [ |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
49 sequence[0:6]] # initializes the list with the 6mer perfect match |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
50 dicsubst = {"A": "TGCN", "T": "AGCN", "G": "TACN", "C": "GATN"} |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
51 for pos in enumerate(sequence[:6]): |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
52 for subst in dicsubst[pos[1]]: |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
53 sequencevariants.append( |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
54 sequence[:pos[0]] + subst + sequence[pos[0] + 1:7]) |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
55 return sequencevariants |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
56 self.adaptmotifs = motives(self.adapter) |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
57 |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
58 def scanadapt(self, adaptmotives=[], sequence="", qscore=""): |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
59 '''scans sequence for adapter motives''' |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
60 match_position = sequence.rfind(adaptmotives[0]) |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
61 if match_position != -1: |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
62 return sequence[:match_position], qscore[:match_position] |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
63 for motif in adaptmotives[1:]: |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
64 match_position = sequence.rfind(motif) |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
65 if match_position != -1: |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
66 return sequence[:match_position], qscore[:match_position] |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
67 return sequence, qscore |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
68 |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
69 def write_output(self, id, read, qscore, output): |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
70 if self.output_format == "fasta": |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
71 block = ">{0}\n{1}\n".format(id, read) |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
72 else: |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
73 block = "@HWI-{0}\n{1}\n+\n{2}\n".format(id, read, qscore) |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
74 output.write(block) |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
75 |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
76 def handle_io(self): |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
77 '''Open input file, pass read sequence and read qscore to clipping function. |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
78 Pass clipped read and qscore to output function.''' |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
79 id = 0 |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
80 output = open(self.outputfile, "a") |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
81 with open(self.inputfile, "r") as input: |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
82 block_gen = islice(input, 1, None, 2) |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
83 for i, line in enumerate(block_gen): |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
84 if i % 2: |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
85 qscore = line.rstrip() |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
86 else: |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
87 read = line.rstrip() |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
88 continue |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
89 trimmed_read, trimmed_qscore = self.scanadapt( |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
90 self.adaptmotifs, read, qscore) |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
91 if self.minsize <= len(trimmed_read) <= self.maxsize: |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
92 if (self.Nmode == "reject") and ("N" in trimmed_read): |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
93 continue |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
94 id += 1 |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
95 self.write_output(id, trimmed_read, trimmed_qscore, output) |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
96 output.close() |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
97 |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
98 |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
99 def main(*argv): |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
100 instanceClip = Clip(*argv) |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
101 instanceClip.handle_io() |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
102 |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
103 if __name__ == "__main__": |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
104 args = Parser() |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
105 id = 0 |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
106 for inputfile in args.input: |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
107 main(inputfile, args.output, args.output_format, |
307cd074fa95
planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff
changeset
|
108 args.adapter_to_clip, args.min, args.max, args.Nmode) |