annotate lib/tarean/kmer_counting.py @ 0:1d1b9e1b2e2f draft

Uploaded
author petr-novak
date Thu, 19 Dec 2019 10:24:45 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python3
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
2
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
3 import logging
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
4 logger = logging.getLogger(__name__)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
5 import itertools
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
6 import os
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
7 import sys
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
8 import random
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
9 import sqlite3
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
10 import subprocess
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
11 import shlex # for command line arguments split
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
12 import operator
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
13
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
14 REQUIRED_VERSION = (3, 4)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
15 MAX_PRINT = 10
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
16 MEGABLAST = "-task megablast"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
17 HITSORT = "-task megablast"
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
18
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
19 if sys.version_info < REQUIRED_VERSION:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
20 raise Exception("\n\npython 3.4 or higher is required!\n")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
21
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
22 # additional functions
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
23
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
24
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
25 class Sequence:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
26
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
27 def __init__(self, seq, name="", paired=False):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
28 # the mode os seq storage can be changed later to make it more
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
29 # memory efficient
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
30 self._seq = bytes(str(seq), "ascii")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
31 self.name = str(name)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
32
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
33 @property
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
34 def seq(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
35 return self._seq.decode("utf-8")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
36
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
37 @seq.setter
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
38 def seq(self, value):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
39 self._seq = bytes(str(value), "ascii")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
40
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
41 def __str__(self):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
42 return "{0} : {1}".format(self.name, self.seq)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
43
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
44 @staticmethod
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
45 def read_fasta(fasta_file_name):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
46 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
47 generator - reads sequences from fasta file
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
48 return sequence one by one
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
49 '''
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
50 with open(fasta_file_name, 'r') as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
51 header = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
52 seqstr = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
53 for rawline in f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
54 line = rawline.strip()
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
55 if line == "":
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
56 continue
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
57 if line[0] == ">":
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
58 if header and seqstr:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
59 yield Sequence(seqstr, header)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
60 # reset
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
61 seqstr = None
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
62 header = line[1:]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
63 elif seqstr:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
64 Warning("sequence was not preceeded by header")
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
65 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
66 header = line[1:]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
67 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
68 seqstr = line if not seqstr else seqstr + line
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
69 # skip empty lines:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
70 if header and seqstr:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
71 yield Sequence(seqstr, header)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
72 return
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
73
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
74 def write2fasta(self, file_object):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
75 file_object.write(">{0}\n{1}\n".format(self.name, self.seq))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
76
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
77
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
78 def get_kmers(string, width=11):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
79 L = len(string)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
80 parts = [string[i:i + width] for i in range(L - width + 0)]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
81 return parts
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
82
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
83
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
84 def count_kmers_from_file(f, width=11):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
85 counts = {}
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
86 for i in Sequence.read_fasta(f):
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
87 a = get_kmers(i.seq, width)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
88 for km in a:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
89 if "N" in km:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
90 continue
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
91 if km in counts:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
92 counts[km] += 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
93 else:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
94 counts[km] = 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
95 sorted_counts = sorted(counts.items(),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
96 key=operator.itemgetter(1),
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
97 reverse=True)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
98 return sorted_counts
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
99
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
100
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
101 if __name__ == "__main__":
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
102 L = len(sys.argv) - 1
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
103 kmer_length = int(sys.argv[-1])
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
104 files = sys.argv[1:-1]
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
105 for fin in files:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
106 counts = count_kmers_from_file(fin, kmer_length)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
107 fout = "{}_{}.kmers".format(fin, kmer_length)
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
108 with open(fout, "w") as f:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
109 for i in counts:
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
110 f.write("{}\t{}\n".format(*i))
1d1b9e1b2e2f Uploaded
petr-novak
parents:
diff changeset
111 print(fout)