Mercurial > repos > petr-novak > profrep
diff gff.py @ 0:a5f1638b73be draft
Uploaded
author | petr-novak |
---|---|
date | Wed, 26 Jun 2019 08:01:42 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff.py Wed Jun 26 08:01:42 2019 -0400 @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +""" sequence repetitive profile conversion to GFF3 format """ + +import time +import configuration +from operator import itemgetter +from itertools import groupby +from tempfile import NamedTemporaryFile +import os + + +def create_gff(THRESHOLD, THRESHOLD_SEGMENT, OUTPUT_GFF, files_dict, headers): + seq_id = None + exclude = set(['ALL']) + gff_tmp_list = [] + for repeat in sorted(set(files_dict.keys()).difference(exclude)): + gff_tmp = NamedTemporaryFile(delete=False) + pos_seq_dict = {} + with open(files_dict[repeat][0], "r") as repeat_f, open( + files_dict[repeat][2], "r") as quality_f: + for line_r, line_q in zip(repeat_f, quality_f): + if "chrom" in line_r: + idx_ranges(THRESHOLD_SEGMENT, seq_id, gff_tmp, + configuration.REPEATS_FEATURE, repeat, + pos_seq_dict) + seq_id = line_r.rstrip().split("chrom=")[1] + pos_seq_dict = {} + else: + hits = int(line_r.rstrip().split("\t")[1]) + if hits >= THRESHOLD: + position = int(line_r.rstrip().split("\t")[0]) + pos_seq_dict[position] = line_q.rstrip().split("\t")[1] + idx_ranges(THRESHOLD_SEGMENT, seq_id, gff_tmp, + configuration.REPEATS_FEATURE, repeat, pos_seq_dict) + gff_tmp_list.append(gff_tmp.name) + gff_tmp.close() + sort_records(gff_tmp_list, headers, OUTPUT_GFF) + for tmp in gff_tmp_list: + os.unlink(tmp) + + +def idx_ranges(THRESHOLD_SEGMENT, seq_id, gff_file, feature, repeat, + pos_seq_dict): + indices = sorted(pos_seq_dict.keys(), key=int) + for key, group in groupby( + enumerate(indices), + lambda index_item: index_item[0] - index_item[1]): + group = list(map(itemgetter(1), group)) + if len(group) > THRESHOLD_SEGMENT: + sum_qual = 0 + for position in group: + sum_qual += int(pos_seq_dict[position]) + qual_per_reg = sum_qual / len(group) + # Take boundaries of the group vectors + gff_file.write( + "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Average_PID={}\n".format( + seq_id, configuration.SOURCE_PROFREP, feature, group[ + 0], group[-1], configuration.GFF_EMPTY, + configuration.GFF_EMPTY, configuration.GFF_EMPTY, repeat, + round(qual_per_reg)).encode("utf-8")) + + +def idx_ranges_N(indices, THRESHOLD_SEGMENT, seq_id, gff_file, feature, + att_name): + for key, group in groupby( + enumerate(indices), + lambda index_item: index_item[0] - index_item[1]): + group = list(map(itemgetter(1), group)) + if len(group) > THRESHOLD_SEGMENT: + # Take boundaries of the group vectors + gff_file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format( + seq_id, configuration.SOURCE_PROFREP, feature, group[0], group[ + -1], configuration.GFF_EMPTY, configuration.GFF_EMPTY, + configuration.GFF_EMPTY, att_name)) + + +def sort_records(gff_tmp_list, headers, OUTPUT_GFF): + opened_files = [open(i, "r") for i in gff_tmp_list] + files_lines = dict((key, "") for key in opened_files) + count_without_line = 0 + count_seq = 0 + #################################################################### + present_seqs = headers + #################################################################### + with open(OUTPUT_GFF, "w") as final_gff: + final_gff.write("{}\n".format(configuration.HEADER_GFF)) + while True: + for file_name in opened_files: + if not files_lines[file_name]: + line = file_name.readline() + if line: + files_lines[file_name] = line + else: + count_without_line += 1 + if count_without_line == len(opened_files): + break + count_without_line = 0 + count = 0 + lowest_pos = float("inf") + for file_key in files_lines.keys(): + if files_lines[file_key].split("\t")[0] == present_seqs[ + count_seq]: + count += 1 + start_pos = int(files_lines[file_key].split("\t")[3]) + if start_pos < lowest_pos: + lowest_pos = start_pos + record_to_write = files_lines[file_key] + lowest_file_key = file_key + if count == 0: + count_seq += 1 + else: + final_gff.write(record_to_write) + files_lines[lowest_file_key] = "" + + +def main(): + pass + + +if __name__ == "__main__": + main()