Mercurial > repos > petr-novak > repeat_annotation_pipeline3
annotate reAnnotate.py @ 11:5366d5ea04bc draft
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
author | petr-novak |
---|---|
date | Fri, 04 Aug 2023 12:35:32 +0000 |
parents | |
children |
rev | line source |
---|---|
11
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
1 #!/usr/bin/env python |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
2 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
3 parse blast output table to gff file |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
4 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
5 import argparse |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
6 import itertools |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
7 import os |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
8 import re |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
9 import shutil |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
10 import subprocess |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
11 import sys |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
12 import tempfile |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
13 from collections import defaultdict |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
14 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
15 # check version of python, must be at least 3.7 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
16 if sys.version_info < (3, 10): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
17 sys.exit("Python 3.10 or a more recent version is required.") |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
18 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
19 def make_temp_files(number_of_files): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
20 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
21 Make named temporary files, file will not be deleted upon exit! |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
22 :param number_of_files: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
23 :return: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
24 filepaths |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
25 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
26 temp_files = [] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
27 for i in range(number_of_files): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
28 temp_files.append(tempfile.NamedTemporaryFile(delete=False).name) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
29 os.remove(temp_files[-1]) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
30 return temp_files |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
31 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
32 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
33 def split_fasta_to_chunks(fasta_file, chunk_size=100000000, overlap=100000): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
34 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
35 Split fasta file to chunks, sequences longe than chuck size are split to overlaping |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
36 peaces. If sequences are shorter, chunck with multiple sequences are created. |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
37 :param fasta_file: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
38 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
39 :param fasta_file: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
40 :param chunk_size: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
41 :param overlap: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
42 :return: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
43 fasta_file_split |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
44 matching_table (list of lists [header,chunk_number, start, end, new_header]) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
45 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
46 min_chunk_size = chunk_size * 2 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
47 fasta_sizes_dict = read_fasta_sequence_size(fasta_file) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
48 # calculate size of items in fasta_dist dictionary |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
49 fasta_size = sum(fasta_sizes_dict.values()) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
50 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
51 # calculates ranges for splitting of fasta files and store them in list |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
52 matching_table = [] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
53 fasta_file_split = tempfile.NamedTemporaryFile(delete=False).name |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
54 for header, size in fasta_sizes_dict.items(): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
55 print(header, size, min_chunk_size) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
56 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
57 if size > min_chunk_size: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
58 number_of_chunks = int(size / chunk_size) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
59 print("number_of_chunks", number_of_chunks) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
60 print("size", size) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
61 print("chunk_size", chunk_size) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
62 print("-----------------------------------------") |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
63 adjusted_chunk_size = int(size / number_of_chunks) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
64 for i in range(number_of_chunks): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
65 start = i * adjusted_chunk_size |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
66 end = ((i + 1) * |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
67 adjusted_chunk_size |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
68 + overlap) if i + 1 < number_of_chunks else size |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
69 new_header = header + '_' + str(i) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
70 matching_table.append([header, i, start, end, new_header]) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
71 else: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
72 new_header = header + '_0' |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
73 matching_table.append([header, 0, 0, size, new_header]) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
74 # read sequences from fasta files and split them to chunks according to matching table |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
75 # open output and input files, use with statement to close files |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
76 number_of_temp_files = len(matching_table) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
77 print('number of temp files', number_of_temp_files) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
78 fasta_dict = read_single_fasta_to_dictionary(open(fasta_file, 'r')) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
79 with open(fasta_file_split, 'w') as fh_out: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
80 for header in fasta_dict: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
81 matching_table_part = [x for x in matching_table if x[0] == header] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
82 for header2, i, start, end, new_header in matching_table_part: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
83 fh_out.write('>' + new_header + '\n') |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
84 fh_out.write(fasta_dict[header][start:end] + '\n') |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
85 temp_files_fasta = make_temp_files(number_of_temp_files) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
86 fasta_seq_size = read_fasta_sequence_size(fasta_file_split) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
87 seq_id_size_sorted = [i[0] for i in sorted( |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
88 fasta_seq_size.items(), key=lambda x: int(x[1]), reverse=True |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
89 )] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
90 seq_id_file_dict = dict(zip(seq_id_size_sorted, itertools.cycle(temp_files_fasta))) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
91 # write sequences to temporary files |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
92 with open(fasta_file_split, 'r') as f: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
93 first = True |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
94 for line in f: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
95 if line[0] == '>': |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
96 # close previous file if it is not the first sequence |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
97 if not first: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
98 fout.close() |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
99 first = False |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
100 header = line.strip().split(' ')[0][1:] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
101 fout = open(seq_id_file_dict[header],'a') |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
102 fout.write(line) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
103 else: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
104 fout.write(line) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
105 os.remove(fasta_file_split) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
106 return temp_files_fasta, matching_table |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
107 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
108 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
109 def read_fasta_sequence_size(fasta_file): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
110 """Read size of sequence into dictionary""" |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
111 fasta_dict = {} |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
112 with open(fasta_file, 'r') as f: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
113 for line in f: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
114 if line[0] == '>': |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
115 header = line.strip().split(' ')[0][1:] # remove part of name after space |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
116 fasta_dict[header] = 0 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
117 else: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
118 fasta_dict[header] += len(line.strip()) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
119 return fasta_dict |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
120 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
121 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
122 def read_single_fasta_to_dictionary(fh): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
123 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
124 Read fasta file into dictionary |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
125 :param fh: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
126 :return: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
127 fasta_dict |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
128 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
129 fasta_dict = {} |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
130 for line in fh: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
131 if line[0] == '>': |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
132 header = line.strip().split(' ')[0][1:] # remove part of name after space |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
133 fasta_dict[header] = [] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
134 else: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
135 fasta_dict[header] += [line.strip()] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
136 fasta_dict = {k: ''.join(v) for k, v in fasta_dict.items()} |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
137 return fasta_dict |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
138 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
139 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
140 def overlap(a, b): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
141 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
142 check if two intervals overlap |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
143 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
144 return max(a[0], b[0]) <= min(a[1], b[1]) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
145 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
146 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
147 def blast2disjoint( |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
148 blastfile, seqid_counts=None, start_column=6, end_column=7, class_column=1, |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
149 bitscore_column=11, pident_column=2, canonical_classification=True |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
150 ): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
151 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
152 find all interval beginning and ends in blast file and create bed file |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
153 input blastfile is tab separated file with columns: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
154 'qaccver saccver pident length mismatch gapopen qstart qend sstart send |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
155 evalue bitscore' (default outfmt 6 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
156 blast must be sorted on qseqid and qstart |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
157 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
158 # assume all in one chromosome! |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
159 starts_ends = {} |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
160 intervals = {} |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
161 if canonical_classification: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
162 # make regular expression for canonical classification |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
163 # to match: Name#classification |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
164 # e.g. "Name_of_sequence#LTR/Ty1_copia/Angela" |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
165 regex = re.compile(r"(.*)[#](.*)") |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
166 group = 2 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
167 else: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
168 # make regular expression for non-canonical classification |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
169 # to match: Classification__Name |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
170 # e.g. "LTR/Ty1_copia/Angela__Name_of_sequence" |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
171 regex = re.compile(r"(.*)__(.*)") |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
172 group = 1 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
173 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
174 # identify continuous intervals |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
175 with open(blastfile, "r") as f: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
176 for seqid in sorted(seqid_counts.keys()): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
177 n_lines = seqid_counts[seqid] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
178 starts_ends[seqid] = set() |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
179 for i in range(n_lines): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
180 items = f.readline().strip().split() |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
181 # note 1s and 2s labels are used to distinguish between start and end and |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
182 # guarantee that with same coordinated start will be before end when |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
183 # sorting (1s < 2e) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
184 starts_ends[seqid].add((int(items[start_column]), '1s')) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
185 starts_ends[seqid].add((int(items[end_column]), '2e')) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
186 intervals[seqid] = [] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
187 for p1, p2 in itertools.pairwise(sorted(starts_ends[seqid])): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
188 if p1[1] == '1s': |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
189 sp = 0 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
190 else: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
191 sp = 1 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
192 if p2[1] == '2e': |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
193 ep = 0 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
194 else: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
195 ep = 1 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
196 intervals[seqid].append((p1[0] + sp, p2[0] - ep)) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
197 # scan each blast hit against continuous region and record hit with best score |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
198 with open(blastfile, "r") as f: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
199 disjoint_regions = [] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
200 for seqid in sorted(seqid_counts.keys()): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
201 n_lines = seqid_counts[seqid] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
202 idx_of_overlaps = {} |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
203 best_pident = defaultdict(lambda: 0.0) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
204 best_bitscore = defaultdict(lambda: 0.0) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
205 best_hit_name = defaultdict(lambda: "") |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
206 i1 = 0 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
207 for i in range(n_lines): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
208 items = f.readline().strip().split() |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
209 start = int(items[start_column]) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
210 end = int(items[end_column]) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
211 pident = float(items[pident_column]) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
212 bitscore = float(items[bitscore_column]) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
213 classification = items[class_column] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
214 j = 0 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
215 done = False |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
216 while True: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
217 # beginning of searched region - does it overlap? |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
218 c_ovl = overlap(intervals[seqid][i1], (start, end)) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
219 if c_ovl: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
220 # if overlap is detected, add to dictionary |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
221 idx_of_overlaps[i] = [i1] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
222 if best_bitscore[i1] < bitscore: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
223 best_pident[i1] = pident |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
224 best_bitscore[i1] = bitscore |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
225 best_hit_name[i1] = classification |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
226 # add search also downstream |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
227 while True: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
228 j += 1 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
229 if j + i1 >= len(intervals[seqid]): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
230 done = True |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
231 break |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
232 c_ovl = overlap(intervals[seqid][i1 + j], (start, end)) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
233 if c_ovl: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
234 idx_of_overlaps[i].append(i1 + j) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
235 if best_bitscore[i1 + j] < bitscore: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
236 best_pident[i1 + j] = pident |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
237 best_bitscore[i1 + j] = bitscore |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
238 best_hit_name[i1 + j] = classification |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
239 else: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
240 done = True |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
241 break |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
242 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
243 else: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
244 # does no overlap - search next interval |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
245 i1 += 1 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
246 if done or i1 >= (len(intervals[seqid]) - 1): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
247 break |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
248 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
249 for i in sorted(best_pident.keys()): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
250 try: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
251 classification = re.match(regex, best_hit_name[i]).group(group) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
252 except AttributeError: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
253 classification = best_hit_name[i] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
254 record = ( |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
255 seqid, intervals[seqid][i][0], intervals[seqid][i][1], best_pident[i], |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
256 classification) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
257 disjoint_regions.append(record) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
258 return disjoint_regions |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
259 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
260 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
261 def remove_short_interrupting_regions(regions, min_len=10, max_gap=2): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
262 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
263 remove intervals shorter than min_len which are directly adjacent to other |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
264 regions on both sides which are longer than min_len and has same classification |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
265 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
266 regions_to_remove = [] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
267 for i in range(1, len(regions) - 1): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
268 if regions[i][2] - regions[i][1] < min_len: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
269 c1 = regions[i - 1][2] - regions[i - 1][1] > min_len |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
270 c2 = regions[i + 1][2] - regions[i + 1][1] > min_len |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
271 c3 = regions[i - 1][4] == regions[i + 1][4] # same classification |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
272 c4 = regions[i + 1][4] != regions[i][4] # different classification |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
273 c5 = regions[i][1] - regions[i - 1][2] < max_gap # max gap between regions |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
274 c6 = regions[i + 1][1] - regions[i][2] < max_gap # max gap between regions |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
275 if c1 and c2 and c3 & c4 and c5 and c6: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
276 regions_to_remove.append(i) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
277 for i in sorted(regions_to_remove, reverse=True): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
278 del regions[i] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
279 return regions |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
280 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
281 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
282 def remove_short_regions(regions, min_l_score=600): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
283 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
284 remove intervals shorter than min_len |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
285 min_l_score is the minimum score for a region to be considered |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
286 l_score = length * PID |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
287 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
288 regions_to_remove = [] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
289 for i in range(len(regions)): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
290 l_score = (regions[i][3] - 50) * (regions[i][2] - regions[i][1]) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
291 if l_score < min_l_score: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
292 regions_to_remove.append(i) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
293 for i in sorted(regions_to_remove, reverse=True): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
294 del regions[i] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
295 return regions |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
296 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
297 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
298 def join_disjoint_regions_by_classification(disjoint_regions, max_gap=0): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
299 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
300 merge neighboring intervals with same classification and calculate mean weighted score |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
301 weight correspond to length of the interval |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
302 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
303 merged_regions = [] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
304 for seqid, start, end, score, classification in disjoint_regions: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
305 score_length = (end - start + 1) * score |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
306 if len(merged_regions) == 0: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
307 merged_regions.append([seqid, start, end, score_length, classification]) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
308 else: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
309 cond_same_class = merged_regions[-1][4] == classification |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
310 cond_same_seqid = merged_regions[-1][0] == seqid |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
311 cond_neighboring = start - merged_regions[-1][2] + 1 <= max_gap |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
312 if cond_same_class and cond_same_seqid and cond_neighboring: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
313 # extend region |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
314 merged_regions[-1] = [merged_regions[-1][0], merged_regions[-1][1], end, |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
315 merged_regions[-1][3] + score_length, |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
316 merged_regions[-1][4]] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
317 else: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
318 merged_regions.append([seqid, start, end, score_length, classification]) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
319 # recalculate length weighted score |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
320 for record in merged_regions: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
321 record[3] = record[3] / (record[2] - record[1] + 1) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
322 return merged_regions |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
323 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
324 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
325 def write_merged_regions_to_gff3(merged_regions, outfile): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
326 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
327 write merged regions to gff3 file |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
328 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
329 with open(outfile, "w") as f: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
330 # write header |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
331 f.write("##gff-version 3\n") |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
332 for seqid, start, end, score, classification in merged_regions: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
333 attributes = "Name={};score={}".format(classification, score) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
334 f.write( |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
335 "\t".join( |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
336 [seqid, "blast_parsed", "repeat_region", str(start), str(end), |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
337 str(round(score,2)), ".", ".", attributes] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
338 ) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
339 ) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
340 f.write("\n") |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
341 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
342 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
343 def sort_blast_table( |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
344 blastfile, seqid_column=0, start_column=6, cpu=1 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
345 ): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
346 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
347 split blast table by seqid and sort by start position |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
348 stores output in temp files |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
349 columns are indexed from 0 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
350 but cut uses 1-based indexing! |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
351 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
352 blast_sorted = tempfile.NamedTemporaryFile().name |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
353 # create sorted dictionary seqid counts |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
354 seq_id_counts = {} |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
355 # sort blast file on disk using sort on seqid and start (numeric) position columns |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
356 # using sort command as blast output could be very large |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
357 cmd = "sort -k {0},{0} -k {1},{1}n --parallel {4} {2} > {3}".format( |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
358 seqid_column + 1, start_column + 1, blastfile, blast_sorted, cpu |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
359 ) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
360 subprocess.check_call(cmd, shell=True) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
361 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
362 # count seqids using uniq command |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
363 cmd = "cut -f {0} {1} | uniq -c > {2}".format( |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
364 seqid_column + 1, blast_sorted, blast_sorted + ".counts" |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
365 ) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
366 subprocess.check_call(cmd, shell=True) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
367 # read counts file and create dictionary |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
368 with open(blast_sorted + ".counts", "r") as f: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
369 for line in f: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
370 line = line.strip().split() |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
371 seq_id_counts[line[1]] = int(line[0]) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
372 # remove counts file |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
373 subprocess.call(["rm", blast_sorted + ".counts"]) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
374 # return sorted dictionary and sorted blast file |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
375 return seq_id_counts, blast_sorted |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
376 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
377 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
378 def run_blastn( |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
379 query, db, blastfile, evalue=1e-3, max_target_seqs=999999999, gapopen=2, |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
380 gapextend=1, reward=1, penalty=-1, word_size=9, num_threads=1, outfmt="6" |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
381 ): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
382 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
383 run blastn |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
384 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
385 # create temporary blast database: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
386 db_formated = tempfile.NamedTemporaryFile().name |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
387 cmd = "makeblastdb -in {0} -dbtype nucl -out {1}".format(db, db_formated) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
388 subprocess.check_call(cmd, shell=True) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
389 # if query is smaller than 1GB, run blast on single file |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
390 size = os.path.getsize(query) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
391 print("query size: {} bytes".format(size)) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
392 max_size = 1e6 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
393 overlap = 50000 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
394 if size < max_size: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
395 cmd = ("blastn -task rmblastn -query {0} -db {1} -out {2} -evalue {3} " |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
396 "-max_target_seqs {4} " |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
397 "-gapopen {5} -gapextend {6} -word_size {7} -num_threads " |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
398 "{8} -outfmt '{9}' -reward {10} -penalty {11} -dust no").format( |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
399 query, db_formated, blastfile, evalue, max_target_seqs, gapopen, gapextend, |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
400 word_size, num_threads, outfmt, reward, penalty |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
401 ) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
402 subprocess.check_call(cmd, shell=True) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
403 # if query is larger than 1GB, split query in chunks and run blast on each chunk |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
404 else: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
405 print(f"query is larger than {max_size}, splitting query in chunks") |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
406 query_parts, matching_table = split_fasta_to_chunks(query, max_size, overlap) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
407 print(query_parts) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
408 for i, part in enumerate(query_parts): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
409 print(f"running blast on chunk {i}") |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
410 print(part) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
411 cmd = ("blastn -task rmblastn -query {0} -db {1} -out {2} -evalue {3} " |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
412 "-max_target_seqs {4} " |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
413 "-gapopen {5} -gapextend {6} -word_size {7} -num_threads " |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
414 "{8} -outfmt '{9}' -reward {10} -penalty {11} -dust no").format( |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
415 part, db_formated, f'{blastfile}.{i}', evalue, max_target_seqs, gapopen, |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
416 gapextend, |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
417 word_size, num_threads, outfmt, reward, penalty |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
418 ) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
419 subprocess.check_call(cmd, shell=True) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
420 print(cmd) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
421 # remove part file |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
422 # os.unlink(part) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
423 # merge blast results and recalculate start, end positions and header |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
424 merge_blast_results(blastfile, matching_table, n_parts=len(query_parts)) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
425 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
426 # remove temporary blast database |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
427 os.unlink(db_formated + ".nhr") |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
428 os.unlink(db_formated + ".nin") |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
429 os.unlink(db_formated + ".nsq") |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
430 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
431 def merge_blast_results(blastfile, matching_table, n_parts): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
432 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
433 Merge blast tables and recalculate start, end positions based on |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
434 matching table |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
435 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
436 with open(blastfile, "w") as f: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
437 matching_table_dict = {i[4]: i for i in matching_table} |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
438 print(matching_table_dict) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
439 for i in range(n_parts): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
440 with open(f'{blastfile}.{i}', "r") as f2: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
441 for line in f2: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
442 line = line.strip().split("\t") |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
443 # seqid (header) is in column 1 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
444 seqid = line[0] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
445 line[0] = matching_table_dict[seqid][0] |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
446 # increase coordinates by start position of chunk |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
447 line[6] = str(int(line[6]) + matching_table_dict[seqid][2]) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
448 line[7] = str(int(line[7]) + matching_table_dict[seqid][2]) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
449 f.write("\t".join(line) + "\n") |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
450 # remove temporary blast file |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
451 # os.unlink(f'{blastfile}.{i}') |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
452 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
453 def main(): |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
454 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
455 main function |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
456 """ |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
457 # get command line arguments |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
458 parser = argparse.ArgumentParser( |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
459 description="""This script is used to parse blast output table to gff file""", |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
460 formatter_class=argparse.RawTextHelpFormatter |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
461 ) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
462 parser.add_argument( |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
463 '-i', '--input', default=None, required=True, help="input file", type=str, |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
464 action='store' |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
465 ) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
466 parser.add_argument( |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
467 '-d', '--db', default=None, required=False, |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
468 help="Fasta file with repeat database", type=str, action='store' |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
469 ) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
470 parser.add_argument( |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
471 '-o', '--output', default=None, required=True, help="output file name", type=str, |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
472 action='store' |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
473 ) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
474 parser.add_argument( |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
475 '-a', '--alternative_classification_coding', default=False, |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
476 help="Use alternative classification coding", action='store_true' |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
477 ) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
478 parser.add_argument( |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
479 '-f', '--fasta_input', default=False, |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
480 help="Input is fasta file instead of blast table", action='store_true' |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
481 ) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
482 parser.add_argument( |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
483 '-c', '--cpu', default=1, help="Number of cpu to use", type=int |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
484 ) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
485 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
486 args = parser.parse_args() |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
487 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
488 if args.fasta_input: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
489 # run blast using blastn |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
490 blastfile = tempfile.NamedTemporaryFile().name |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
491 if args.db: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
492 run_blastn(args.input, args.db, blastfile, num_threads=args.cpu) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
493 else: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
494 sys.exit("No repeat database provided") |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
495 else: |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
496 blastfile = args.input |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
497 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
498 # sort blast table |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
499 seq_id_counts, blast_sorted = sort_blast_table(blastfile, cpu=args.cpu) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
500 disjoin_regions = blast2disjoint( |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
501 blast_sorted, seq_id_counts, |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
502 canonical_classification=not args.alternative_classification_coding |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
503 ) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
504 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
505 # remove short regions |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
506 disjoin_regions = remove_short_interrupting_regions(disjoin_regions) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
507 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
508 # join neighboring regions with same classification |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
509 merged_regions = join_disjoint_regions_by_classification(disjoin_regions) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
510 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
511 # remove short regions again |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
512 merged_regions = remove_short_interrupting_regions(merged_regions) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
513 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
514 # merge again neighboring regions with same classification |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
515 merged_regions = join_disjoint_regions_by_classification(merged_regions, max_gap=10) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
516 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
517 # remove short weak regions |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
518 merged_regions = remove_short_regions(merged_regions) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
519 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
520 # last merge |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
521 merged_regions = join_disjoint_regions_by_classification(merged_regions, max_gap=20) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
522 write_merged_regions_to_gff3(merged_regions, args.output) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
523 # remove temporary files |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
524 os.remove(blast_sorted) |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
525 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
526 |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
527 if __name__ == "__main__": |
5366d5ea04bc
planemo upload commit 9d1b19f98d8b7f0a0d1baf2da63a373d155626f8-dirty
petr-novak
parents:
diff
changeset
|
528 main() |