comparison lib/tarean/kmer_counting.py @ 0:1d1b9e1b2e2f draft

Uploaded
author petr-novak
date Thu, 19 Dec 2019 10:24:45 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1d1b9e1b2e2f
1 #!/usr/bin/env python3
2
3 import logging
4 logger = logging.getLogger(__name__)
5 import itertools
6 import os
7 import sys
8 import random
9 import sqlite3
10 import subprocess
11 import shlex # for command line arguments split
12 import operator
13
14 REQUIRED_VERSION = (3, 4)
15 MAX_PRINT = 10
16 MEGABLAST = "-task megablast"
17 HITSORT = "-task megablast"
18
19 if sys.version_info < REQUIRED_VERSION:
20 raise Exception("\n\npython 3.4 or higher is required!\n")
21
22 # additional functions
23
24
25 class Sequence:
26
27 def __init__(self, seq, name="", paired=False):
28 # the mode os seq storage can be changed later to make it more
29 # memory efficient
30 self._seq = bytes(str(seq), "ascii")
31 self.name = str(name)
32
33 @property
34 def seq(self):
35 return self._seq.decode("utf-8")
36
37 @seq.setter
38 def seq(self, value):
39 self._seq = bytes(str(value), "ascii")
40
41 def __str__(self):
42 return "{0} : {1}".format(self.name, self.seq)
43
44 @staticmethod
45 def read_fasta(fasta_file_name):
46 '''
47 generator - reads sequences from fasta file
48 return sequence one by one
49 '''
50 with open(fasta_file_name, 'r') as f:
51 header = None
52 seqstr = None
53 for rawline in f:
54 line = rawline.strip()
55 if line == "":
56 continue
57 if line[0] == ">":
58 if header and seqstr:
59 yield Sequence(seqstr, header)
60 # reset
61 seqstr = None
62 header = line[1:]
63 elif seqstr:
64 Warning("sequence was not preceeded by header")
65 else:
66 header = line[1:]
67 else:
68 seqstr = line if not seqstr else seqstr + line
69 # skip empty lines:
70 if header and seqstr:
71 yield Sequence(seqstr, header)
72 return
73
74 def write2fasta(self, file_object):
75 file_object.write(">{0}\n{1}\n".format(self.name, self.seq))
76
77
78 def get_kmers(string, width=11):
79 L = len(string)
80 parts = [string[i:i + width] for i in range(L - width + 0)]
81 return parts
82
83
84 def count_kmers_from_file(f, width=11):
85 counts = {}
86 for i in Sequence.read_fasta(f):
87 a = get_kmers(i.seq, width)
88 for km in a:
89 if "N" in km:
90 continue
91 if km in counts:
92 counts[km] += 1
93 else:
94 counts[km] = 1
95 sorted_counts = sorted(counts.items(),
96 key=operator.itemgetter(1),
97 reverse=True)
98 return sorted_counts
99
100
101 if __name__ == "__main__":
102 L = len(sys.argv) - 1
103 kmer_length = int(sys.argv[-1])
104 files = sys.argv[1:-1]
105 for fin in files:
106 counts = count_kmers_from_file(fin, kmer_length)
107 fout = "{}_{}.kmers".format(fin, kmer_length)
108 with open(fout, "w") as f:
109 for i in counts:
110 f.write("{}\t{}\n".format(*i))
111 print(fout)