Mercurial > repos > cstrittmatter > test_eurl_vtec_wgs_pt
view scripts/ReMatCh/utils/gffParser.py @ 3:0cbed1c0a762 draft default tip
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author | cstrittmatter |
---|---|
date | Tue, 28 Jan 2020 10:42:31 -0500 |
parents | 965517909457 |
children |
line wrap: on
line source
#!/usr/bin/env python3 import argparse import os from Bio import SeqIO from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord import ntpath version = '1.0' def parse_id(filename): # get wanted feature IDs gff_ids = [] with open(filename, 'r') as in_handle: for line in in_handle: line = line.strip() gff_ids.append(line) return gff_ids def retrieve_seq_file(fasta_file, coord_file, extra_seq, filename, output_dir): # Parsing the sequence file, using the provided txt file containing the contig ID and positions to retrieve sequences. handle = open(fasta_file, "rU") records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) handle.close() seq_2_get = {} with open(coord_file, 'r') as sequeces2get: for line in sequeces2get: line = line.split(',') coords = (int(line[-2]), int(line[-1])) contig_id = line[0] if contig_id in list(seq_2_get.keys()): seq_2_get[contig_id].append(coords) else: seq_2_get[contig_id] = [coords] with open(output_dir + '/' + filename + '.fasta', 'w') as output_handle: fails = 0 successes = 0 records = [] for contig, listCoords in list(seq_2_get.items()): contig_seq = records_dict[contig].seq for coord in listCoords: coord1 = coord[0] - extra_seq coord2 = coord[1] + extra_seq if coord1 < 0 or coord2 > len(contig_seq): fail_log = open(output_dir + '/' + filename + '_fails.txt', 'a') fail_log.write(contig + ',' + str(coord[0]) + ',' + str(coord[1]) + '\n') fail_log.close() fails += 1 else: geneseq = str(contig_seq[coord1:coord2]) record = SeqRecord(Seq(geneseq), id=str(str(contig) + '#' + str(coord1) + '_' + str(coord2)), description='') records.append(record) successes += 1 SeqIO.write(records, output_handle, "fasta") print('Retrived %s features successfully from %s with %s bp as extra' ' sequence.' % (str(successes), filename, str(extra_seq))) if fails > 0: print('%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename)) def retrieve_seq(fasta_file, gff_features, extra_seq, filename, output_dir): # parsing the sequence file into a SeqIO dictionary. one contig per entry handle = open(fasta_file, "rU") records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) handle.close() with open(output_dir + '/' + filename + '.fasta', 'w') as output_handle: fails = 0 successes = 0 records = [] for locus, location in list(gff_features.items()): # print locus contig_seq = records_dict[location[0]].seq coord1 = location[1] - extra_seq coord2 = location[2] + extra_seq if coord1 < 0 or coord2 > len(contig_seq): fail_log = open(output_dir + '/' + filename + '_fails.txt', 'a') fail_log.write(locus + '\n') fail_log.close() fails += 1 else: geneseq = str(contig_seq[coord1:coord2]) if location[3] == '-': seq = Seq(geneseq) geneseq = str(seq.reverse_complement()) record = SeqRecord(Seq(geneseq), id=str(locus + '-' + str(location[0]) + '#' + str(location[1]) + '_' + str(location[2])), description='') records.append(record) successes += 1 SeqIO.write(records, output_handle, "fasta") print('Retrived %s features successfully from %s with %s bp as extra' ' sequence.' % (str(successes), filename, str(extra_seq))) if fails > 0: print('%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename)) def parse_features(temp_gff): # parsing the feature file into a dictionary gff_features = {} with open(temp_gff, 'r') as temp_genes: for line in temp_genes: line = line.split('\t') if "CDS" in line[2]: id = line[-1].split(';') locus_id = str(id[0].split('=')[1]) contig = line[0] begining = int(line[3]) - 1 # to get the full sequence end = int(line[4]) strand = line[6] location = [contig, begining, end, strand] gff_features[locus_id] = location return gff_features def gff_parser(gff_file, extra_seq=0, output_dir='.', keep_temporary_files=False, ids=None, coord_file=None): filename = ntpath.basename(gff_file).replace('.gff', '') # cleaning temp files if they exist if os.path.isfile(output_dir + '/' + filename + '_features.gff'): os.remove(output_dir + '/' + filename + '_features.gff') if os.path.isfile(output_dir + '/' + filename + '_sequence.fasta'): os.remove(output_dir + '/' + filename + '_sequence.fasta') # cleaning fails file if it exists if os.path.isfile(output_dir + '/' + filename + '_fails.txt'): os.remove(output_dir + '/' + filename + '_fails.txt') if coord_file is None: if ids is not None: select_ids = parse_id(ids) else: select_ids = None # separating the gff into 2 different files: one with the features and another with the conting sequences with open(gff_file, 'r') as in_handle, open(output_dir + '/' + filename + '_features.gff', 'a') as temp_genes, \ open(output_dir + '/' + filename + '_sequence.fasta', 'a') as temp_contigs: for line in in_handle: if not line.startswith('##'): if '\t' in line: if select_ids is not None: items = line.split('\t') id = items[-1].split(';')[0] id = id.split('=')[1] if id in select_ids: temp_genes.write(line) else: temp_genes.write(line) else: temp_contigs.write(line) gff_files = parse_features(output_dir + '/' + filename + '_features.gff') retrieve_seq(output_dir + '/' + filename + '_sequence.fasta', gff_files, extra_seq, filename, output_dir) else: with open(gff_file, 'r') as in_handle, \ open(output_dir + '/' + filename + '_sequence.fasta', 'a') as temp_contigs: for line in in_handle: if not line.startswith('##'): if '\t' in line: pass else: temp_contigs.write(line) retrieve_seq_file(output_dir + '/' + filename + '_sequence.fasta', coord_file, extra_seq, filename, output_dir) # removing temp files if not keep_temporary_files: try: os.remove(output_dir + '/' + filename + '_features.gff') except: pass os.remove(output_dir + '/' + filename + '_sequence.fasta') def main(): parser = argparse.ArgumentParser(prog='gffParser.py', description='GFF3 parser for feature sequence retrival.', epilog='by C I Mendes (cimendes@medicina.ulisboa.pt)') parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version)) parser.add_argument('-i', '--input', help='GFF3 file to parse, containing both sequences and annotations (like the one obtained from' ' PROKKA).', type=argparse.FileType('r'), required=True) parser.add_argument('-x', '--extraSeq', help='Extra sequence to retrieve per feature in gff.', default=0, type=int, required=False) parser.add_argument('-k', '--keepTemporaryFiles', help='Keep temporary gff(without sequence) and fasta files.', action='store_true') parser.add_argument('-o', '--outputDir', help='Path to where the output is to be saved.', default='.', required=False) parser_optional_selected_regions_exclusive = parser.add_mutually_exclusive_group() parser_optional_selected_regions_exclusive.add_argument('-s', '--select', help='txt file with the IDs of interest, one per line', type=argparse.FileType('r'), required=False) parser_optional_selected_regions_exclusive.add_argument('-f', '--fromFile', help='Sequence coordinates to be retrieved. Requires contig' ' ID and coords (contig,strart,end) in a csv file. One' ' per line.', type=argparse.FileType('r'), required=False) args = parser.parse_args() args.outputDir = os.path.abspath(args.outputDir) if not os.path.isdir(args.outputDir): os.makedirs(args.outputDir) gff_parser(os.path.abspath(args.input.name), args.extraSeq, os.path.abspath(args.outputDir), args.keepTemporaryFiles, os.path.abspath(args.select.name) if args.select is not None else None, os.path.abspath(args.fromFile.name) if args.fromFile is not None else None) if __name__ == "__main__": main()