Mercurial > repos > petr-novak > repeatrxplorer
comparison lib/tarean/kmer_counting.py @ 0:1d1b9e1b2e2f draft
Uploaded
author | petr-novak |
---|---|
date | Thu, 19 Dec 2019 10:24:45 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1d1b9e1b2e2f |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 import logging | |
4 logger = logging.getLogger(__name__) | |
5 import itertools | |
6 import os | |
7 import sys | |
8 import random | |
9 import sqlite3 | |
10 import subprocess | |
11 import shlex # for command line arguments split | |
12 import operator | |
13 | |
14 REQUIRED_VERSION = (3, 4) | |
15 MAX_PRINT = 10 | |
16 MEGABLAST = "-task megablast" | |
17 HITSORT = "-task megablast" | |
18 | |
19 if sys.version_info < REQUIRED_VERSION: | |
20 raise Exception("\n\npython 3.4 or higher is required!\n") | |
21 | |
22 # additional functions | |
23 | |
24 | |
25 class Sequence: | |
26 | |
27 def __init__(self, seq, name="", paired=False): | |
28 # the mode os seq storage can be changed later to make it more | |
29 # memory efficient | |
30 self._seq = bytes(str(seq), "ascii") | |
31 self.name = str(name) | |
32 | |
33 @property | |
34 def seq(self): | |
35 return self._seq.decode("utf-8") | |
36 | |
37 @seq.setter | |
38 def seq(self, value): | |
39 self._seq = bytes(str(value), "ascii") | |
40 | |
41 def __str__(self): | |
42 return "{0} : {1}".format(self.name, self.seq) | |
43 | |
44 @staticmethod | |
45 def read_fasta(fasta_file_name): | |
46 ''' | |
47 generator - reads sequences from fasta file | |
48 return sequence one by one | |
49 ''' | |
50 with open(fasta_file_name, 'r') as f: | |
51 header = None | |
52 seqstr = None | |
53 for rawline in f: | |
54 line = rawline.strip() | |
55 if line == "": | |
56 continue | |
57 if line[0] == ">": | |
58 if header and seqstr: | |
59 yield Sequence(seqstr, header) | |
60 # reset | |
61 seqstr = None | |
62 header = line[1:] | |
63 elif seqstr: | |
64 Warning("sequence was not preceeded by header") | |
65 else: | |
66 header = line[1:] | |
67 else: | |
68 seqstr = line if not seqstr else seqstr + line | |
69 # skip empty lines: | |
70 if header and seqstr: | |
71 yield Sequence(seqstr, header) | |
72 return | |
73 | |
74 def write2fasta(self, file_object): | |
75 file_object.write(">{0}\n{1}\n".format(self.name, self.seq)) | |
76 | |
77 | |
78 def get_kmers(string, width=11): | |
79 L = len(string) | |
80 parts = [string[i:i + width] for i in range(L - width + 0)] | |
81 return parts | |
82 | |
83 | |
84 def count_kmers_from_file(f, width=11): | |
85 counts = {} | |
86 for i in Sequence.read_fasta(f): | |
87 a = get_kmers(i.seq, width) | |
88 for km in a: | |
89 if "N" in km: | |
90 continue | |
91 if km in counts: | |
92 counts[km] += 1 | |
93 else: | |
94 counts[km] = 1 | |
95 sorted_counts = sorted(counts.items(), | |
96 key=operator.itemgetter(1), | |
97 reverse=True) | |
98 return sorted_counts | |
99 | |
100 | |
101 if __name__ == "__main__": | |
102 L = len(sys.argv) - 1 | |
103 kmer_length = int(sys.argv[-1]) | |
104 files = sys.argv[1:-1] | |
105 for fin in files: | |
106 counts = count_kmers_from_file(fin, kmer_length) | |
107 fout = "{}_{}.kmers".format(fin, kmer_length) | |
108 with open(fout, "w") as f: | |
109 for i in counts: | |
110 f.write("{}\t{}\n".format(*i)) | |
111 print(fout) |