annotate yac.py @ 1:8a8f62b4bf27 draft

planemo upload for repository https://bitbucket.org/drosofff/gedtools/
author drosofff
date Wed, 24 Jun 2015 17:06:30 -0400
parents 307cd074fa95
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
1 #!/usr/bin/python
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
2 # yac = yet another clipper
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
3 # v 1.2.1 - 23-08-2014 - Support FastQ output
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
4 # v 1.1.0 - 23-08-2014 - argparse implementation
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
5 # Usage yac.py $input $output $adapter_to_clip $min $max $Nmode
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
6 # Christophe Antoniewski <drosofff@gmail.com>
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
7
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
8 import sys
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
9 import string
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
10 import argparse
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
11 from itertools import islice
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
12
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
13
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
14 def Parser():
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
15 the_parser = argparse.ArgumentParser()
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
16 the_parser.add_argument(
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
17 '--input', action="store", nargs='+', help="input fastq files")
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
18 the_parser.add_argument(
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
19 '--output', action="store", type=str, help="output, clipped fasta file")
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
20 the_parser.add_argument(
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
21 '--output_format', action="store", type=str, help="output format, fasta or fastq")
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
22 the_parser.add_argument(
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
23 '--adapter_to_clip', action="store", type=str, help="adapter sequence to clip")
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
24 the_parser.add_argument(
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
25 '--min', action="store", type=int, help="minimal size of clipped sequence to keep")
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
26 the_parser.add_argument(
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
27 '--max', action="store", type=int, help="maximal size of clipped sequence to keep")
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
28 the_parser.add_argument('--Nmode', action="store", type=str, choices=[
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
29 "accept", "reject"], help="accept or reject sequences with N for clipping")
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
30 args = the_parser.parse_args()
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
31 args.adapter_to_clip = args.adapter_to_clip.upper()
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
32 return args
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
33
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
34
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
35 class Clip:
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
36
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
37 def __init__(self, inputfile, outputfile, output_format, adapter, minsize, maxsize, Nmode):
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
38 self.inputfile = inputfile
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
39 self.outputfile = outputfile
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
40 self.output_format = output_format
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
41 self.adapter = adapter
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
42 self.minsize = int(minsize)
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
43 self.maxsize = int(maxsize)
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
44 self.Nmode = Nmode
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
45
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
46 def motives(sequence):
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
47 '''return a list of motives for perfect (6nt) or imperfect (7nt with one mismatch) search on import string module'''
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
48 sequencevariants = [
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
49 sequence[0:6]] # initializes the list with the 6mer perfect match
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
50 dicsubst = {"A": "TGCN", "T": "AGCN", "G": "TACN", "C": "GATN"}
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
51 for pos in enumerate(sequence[:6]):
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
52 for subst in dicsubst[pos[1]]:
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
53 sequencevariants.append(
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
54 sequence[:pos[0]] + subst + sequence[pos[0] + 1:7])
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
55 return sequencevariants
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
56 self.adaptmotifs = motives(self.adapter)
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
57
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
58 def scanadapt(self, adaptmotives=[], sequence="", qscore=""):
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
59 '''scans sequence for adapter motives'''
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
60 match_position = sequence.rfind(adaptmotives[0])
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
61 if match_position != -1:
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
62 return sequence[:match_position], qscore[:match_position]
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
63 for motif in adaptmotives[1:]:
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
64 match_position = sequence.rfind(motif)
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
65 if match_position != -1:
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
66 return sequence[:match_position], qscore[:match_position]
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
67 return sequence, qscore
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
68
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
69 def write_output(self, id, read, qscore, output):
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
70 if self.output_format == "fasta":
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
71 block = ">{0}\n{1}\n".format(id, read)
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
72 else:
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
73 block = "@HWI-{0}\n{1}\n+\n{2}\n".format(id, read, qscore)
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
74 output.write(block)
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
75
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
76 def handle_io(self):
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
77 '''Open input file, pass read sequence and read qscore to clipping function.
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
78 Pass clipped read and qscore to output function.'''
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
79 id = 0
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
80 output = open(self.outputfile, "a")
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
81 with open(self.inputfile, "r") as input:
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
82 block_gen = islice(input, 1, None, 2)
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
83 for i, line in enumerate(block_gen):
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
84 if i % 2:
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
85 qscore = line.rstrip()
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
86 else:
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
87 read = line.rstrip()
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
88 continue
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
89 trimmed_read, trimmed_qscore = self.scanadapt(
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
90 self.adaptmotifs, read, qscore)
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
91 if self.minsize <= len(trimmed_read) <= self.maxsize:
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
92 if (self.Nmode == "reject") and ("N" in trimmed_read):
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
93 continue
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
94 id += 1
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
95 self.write_output(id, trimmed_read, trimmed_qscore, output)
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
96 output.close()
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
97
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
98
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
99 def main(*argv):
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
100 instanceClip = Clip(*argv)
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
101 instanceClip.handle_io()
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
102
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
103 if __name__ == "__main__":
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
104 args = Parser()
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
105 id = 0
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
106 for inputfile in args.input:
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
107 main(inputfile, args.output, args.output_format,
307cd074fa95 planemo upload for repository https://bitbucket.org/drosofff/gedtools/
drosofff
parents:
diff changeset
108 args.adapter_to_clip, args.min, args.max, args.Nmode)