0
|
1 #!/usr/bin/env python3
|
|
2
|
|
3 import logging
|
|
4 logger = logging.getLogger(__name__)
|
|
5 import itertools
|
|
6 import os
|
|
7 import sys
|
|
8 import random
|
|
9 import sqlite3
|
|
10 import subprocess
|
|
11 import shlex # for command line arguments split
|
|
12 import operator
|
|
13
|
|
14 REQUIRED_VERSION = (3, 4)
|
|
15 MAX_PRINT = 10
|
|
16 MEGABLAST = "-task megablast"
|
|
17 HITSORT = "-task megablast"
|
|
18
|
|
19 if sys.version_info < REQUIRED_VERSION:
|
|
20 raise Exception("\n\npython 3.4 or higher is required!\n")
|
|
21
|
|
22 # additional functions
|
|
23
|
|
24
|
|
25 class Sequence:
|
|
26
|
|
27 def __init__(self, seq, name="", paired=False):
|
|
28 # the mode os seq storage can be changed later to make it more
|
|
29 # memory efficient
|
|
30 self._seq = bytes(str(seq), "ascii")
|
|
31 self.name = str(name)
|
|
32
|
|
33 @property
|
|
34 def seq(self):
|
|
35 return self._seq.decode("utf-8")
|
|
36
|
|
37 @seq.setter
|
|
38 def seq(self, value):
|
|
39 self._seq = bytes(str(value), "ascii")
|
|
40
|
|
41 def __str__(self):
|
|
42 return "{0} : {1}".format(self.name, self.seq)
|
|
43
|
|
44 @staticmethod
|
|
45 def read_fasta(fasta_file_name):
|
|
46 '''
|
|
47 generator - reads sequences from fasta file
|
|
48 return sequence one by one
|
|
49 '''
|
|
50 with open(fasta_file_name, 'r') as f:
|
|
51 header = None
|
|
52 seqstr = None
|
|
53 for rawline in f:
|
|
54 line = rawline.strip()
|
|
55 if line == "":
|
|
56 continue
|
|
57 if line[0] == ">":
|
|
58 if header and seqstr:
|
|
59 yield Sequence(seqstr, header)
|
|
60 # reset
|
|
61 seqstr = None
|
|
62 header = line[1:]
|
|
63 elif seqstr:
|
|
64 Warning("sequence was not preceeded by header")
|
|
65 else:
|
|
66 header = line[1:]
|
|
67 else:
|
|
68 seqstr = line if not seqstr else seqstr + line
|
|
69 # skip empty lines:
|
|
70 if header and seqstr:
|
|
71 yield Sequence(seqstr, header)
|
|
72 return
|
|
73
|
|
74 def write2fasta(self, file_object):
|
|
75 file_object.write(">{0}\n{1}\n".format(self.name, self.seq))
|
|
76
|
|
77
|
|
78 def get_kmers(string, width=11):
|
|
79 L = len(string)
|
|
80 parts = [string[i:i + width] for i in range(L - width + 0)]
|
|
81 return parts
|
|
82
|
|
83
|
|
84 def count_kmers_from_file(f, width=11):
|
|
85 counts = {}
|
|
86 for i in Sequence.read_fasta(f):
|
|
87 a = get_kmers(i.seq, width)
|
|
88 for km in a:
|
|
89 if "N" in km:
|
|
90 continue
|
|
91 if km in counts:
|
|
92 counts[km] += 1
|
|
93 else:
|
|
94 counts[km] = 1
|
|
95 sorted_counts = sorted(counts.items(),
|
|
96 key=operator.itemgetter(1),
|
|
97 reverse=True)
|
|
98 return sorted_counts
|
|
99
|
|
100
|
|
101 if __name__ == "__main__":
|
|
102 L = len(sys.argv) - 1
|
|
103 kmer_length = int(sys.argv[-1])
|
|
104 files = sys.argv[1:-1]
|
|
105 for fin in files:
|
|
106 counts = count_kmers_from_file(fin, kmer_length)
|
|
107 fout = "{}_{}.kmers".format(fin, kmer_length)
|
|
108 with open(fout, "w") as f:
|
|
109 for i in counts:
|
|
110 f.write("{}\t{}\n".format(*i))
|
|
111 print(fout)
|