Mercurial > repos > cstrittmatter > test_eurl_vtec_wgs_pt
diff scripts/ReMatCh/utils/gffParser.py @ 3:0cbed1c0a762 draft default tip
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author | cstrittmatter |
---|---|
date | Tue, 28 Jan 2020 10:42:31 -0500 |
parents | 965517909457 |
children |
line wrap: on
line diff
--- a/scripts/ReMatCh/utils/gffParser.py Wed Jan 22 09:10:12 2020 -0500 +++ b/scripts/ReMatCh/utils/gffParser.py Tue Jan 28 10:42:31 2020 -0500 @@ -1,194 +1,224 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 -import argparse, sys, os +import argparse +import os from Bio import SeqIO from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord -from Bio.Alphabet import IUPAC import ntpath -def parseID(filename): - #get wanted feature IDs - gffIDs=[] - with open(filename, 'r') as in_handle: - for line in in_handle: - line=line.strip() - gffIDs.append(line) - return gffIDs +version = '1.0' + + +def parse_id(filename): + # get wanted feature IDs + gff_ids = [] + with open(filename, 'r') as in_handle: + for line in in_handle: + line = line.strip() + gff_ids.append(line) + return gff_ids + -def retrieveSeq_File(fastaFile, coordFile, extraSeq, filename, outputDir): - #Parsing the sequence file, using the provided txt file containing the contig ID and positions to retrieve sequences. - handle = open(fastaFile, "rU") - records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) - handle.close() +def retrieve_seq_file(fasta_file, coord_file, extra_seq, filename, output_dir): + # Parsing the sequence file, using the provided txt file containing the contig ID and positions to retrieve sequences. + handle = open(fasta_file, "rU") + records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) + handle.close() - Seq2Get={} - with open(coordFile, 'r') as sequeces2get: - for line in sequeces2get: - line=line.split(',') - coords=(int(line[-2]), int(line[-1])) - contigID=line[0] - if contigID in Seq2Get.keys(): - Seq2Get[contigID].append(coords) - else: - Seq2Get[contigID]=[coords] + seq_2_get = {} + with open(coord_file, 'r') as sequeces2get: + for line in sequeces2get: + line = line.split(',') + coords = (int(line[-2]), int(line[-1])) + contig_id = line[0] + if contig_id in list(seq_2_get.keys()): + seq_2_get[contig_id].append(coords) + else: + seq_2_get[contig_id] = [coords] - with open(outputDir+'/'+filename+'.fasta','w') as output_handle: - fails=0 - successes=0 - records=[] - for contig, listCoords in Seq2Get.items(): - contigSeq=records_dict[contig].seq - for coord in listCoords: - coord1=coord[0]-extraSeq - coord2=coord[1]+extraSeq - if coord1 < 0 or coord2 > len(contigSeq): - fail_log=open(outputDir+'/'+filename+'_fails.txt', 'a') - fail_log.write(contig + ',' + str(coord[0])+','+ str(coord[1])+'\n') - fail_log.close() - fails+=1 - else: - geneseq=str(contigSeq[coord1:coord2]) - record = SeqRecord(Seq(geneseq), id=str(str(contig)+'#'+str(coord1)+'_'+str(coord2)), description='') - records.append(record) - successes+=1 - SeqIO.write(records, output_handle, "fasta") + with open(output_dir + '/' + filename + '.fasta', 'w') as output_handle: + fails = 0 + successes = 0 + records = [] + for contig, listCoords in list(seq_2_get.items()): + contig_seq = records_dict[contig].seq + for coord in listCoords: + coord1 = coord[0] - extra_seq + coord2 = coord[1] + extra_seq + if coord1 < 0 or coord2 > len(contig_seq): + fail_log = open(output_dir + '/' + filename + '_fails.txt', 'a') + fail_log.write(contig + ',' + str(coord[0]) + ',' + str(coord[1]) + '\n') + fail_log.close() + fails += 1 + else: + geneseq = str(contig_seq[coord1:coord2]) + record = SeqRecord(Seq(geneseq), id=str(str(contig) + '#' + str(coord1) + '_' + str(coord2)), + description='') + records.append(record) + successes += 1 + SeqIO.write(records, output_handle, "fasta") - print 'Retrived %s features successfully from %s with %s bp as extra sequence.' % (str(successes), filename, str(extraSeq)) - if fails>0: - print '%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename) + print('Retrived %s features successfully from %s with %s bp as extra' + ' sequence.' % (str(successes), filename, str(extra_seq))) + if fails > 0: + print('%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename)) + -def retrieveSeq(fastaFile, gffFeatures, extraSeq, filename, outputDir): - #parsing the sequence file into a SeqIO dictionary. one contig per entry - handle = open(fastaFile, "rU") - records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) - handle.close() +def retrieve_seq(fasta_file, gff_features, extra_seq, filename, output_dir): + # parsing the sequence file into a SeqIO dictionary. one contig per entry + handle = open(fasta_file, "rU") + records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) + handle.close() - with open(outputDir+'/'+filename+'.fasta','w') as output_handle: - fails=0 - successes=0 - records=[] - for locus, location in gffFeatures.items(): - #print locus - contigSeq=records_dict[location[0]].seq - coord1=location[1]-extraSeq - coord2=location[2]+extraSeq - if coord1 < 0 or coord2 > len(contigSeq): - fail_log=open(outputDir+'/'+filename+'_fails.txt', 'a') - fail_log.write(locus+'\n') - fail_log.close() - fails+=1 - else: - geneseq=str(contigSeq[coord1:coord2]) - if location[3] == '-': - seq = Seq(geneseq) - geneseq =str(seq.reverse_complement()) - record = SeqRecord(Seq(geneseq), id=str(locus+'-'+str(location[0])+'#'+str(location[1])+'_'+str(location[2])), description='') - records.append(record) - successes+=1 - SeqIO.write(records, output_handle, "fasta") - print 'Retrived %s features successfully from %s with %s bp as extra sequence.' % (str(successes), filename, str(extraSeq)) - if fails>0: - print '%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename) + with open(output_dir + '/' + filename + '.fasta', 'w') as output_handle: + fails = 0 + successes = 0 + records = [] + for locus, location in list(gff_features.items()): + # print locus + contig_seq = records_dict[location[0]].seq + coord1 = location[1] - extra_seq + coord2 = location[2] + extra_seq + if coord1 < 0 or coord2 > len(contig_seq): + fail_log = open(output_dir + '/' + filename + '_fails.txt', 'a') + fail_log.write(locus + '\n') + fail_log.close() + fails += 1 + else: + geneseq = str(contig_seq[coord1:coord2]) + if location[3] == '-': + seq = Seq(geneseq) + geneseq = str(seq.reverse_complement()) + record = SeqRecord(Seq(geneseq), + id=str(locus + '-' + str(location[0]) + '#' + str(location[1]) + '_' + + str(location[2])), + description='') + records.append(record) + successes += 1 + SeqIO.write(records, output_handle, "fasta") + print('Retrived %s features successfully from %s with %s bp as extra' + ' sequence.' % (str(successes), filename, str(extra_seq))) + if fails > 0: + print('%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename)) -def parseFeatures(tempGFF): - #parsing the feature file into a dictionary - gffFeatures={} + +def parse_features(temp_gff): + # parsing the feature file into a dictionary + gff_features = {} - with open(tempGFF, 'r') as temp_genes: - for line in temp_genes: - line=line.split('\t') - if "CDS" in line[2]: - ID=line[-1].split(';') - locusID=str(ID[0].split('=')[1]) - contig=line[0] - begining=int(line[3])-1 #to get the full sequence - end=int(line[4]) - strand=line[6] - location=[contig, begining, end, strand] - gffFeatures[locusID]=location - return gffFeatures + with open(temp_gff, 'r') as temp_genes: + for line in temp_genes: + line = line.split('\t') + if "CDS" in line[2]: + id = line[-1].split(';') + locus_id = str(id[0].split('=')[1]) + contig = line[0] + begining = int(line[3]) - 1 # to get the full sequence + end = int(line[4]) + strand = line[6] + location = [contig, begining, end, strand] + gff_features[locus_id] = location + return gff_features -def gffParser(gffFile, extraSeq=0, outputDir='.', keepTemporaryFiles=False, IDs=None, coordFile=None): - filename=ntpath.basename(gffFile).replace('.gff', '') +def gff_parser(gff_file, extra_seq=0, output_dir='.', keep_temporary_files=False, ids=None, coord_file=None): + filename = ntpath.basename(gff_file).replace('.gff', '') - #cleaning temp files if they exist - if os.path.isfile(outputDir+'/'+filename+'_features.gff'): - os.remove(outputDir+'/'+filename+'_features.gff') - if os.path.isfile(outputDir+'/'+filename+'_sequence.fasta'): - os.remove(outputDir+'/'+filename+'_sequence.fasta') + # cleaning temp files if they exist + if os.path.isfile(output_dir + '/' + filename + '_features.gff'): + os.remove(output_dir + '/' + filename + '_features.gff') + if os.path.isfile(output_dir + '/' + filename + '_sequence.fasta'): + os.remove(output_dir + '/' + filename + '_sequence.fasta') - #cleaning fails file if it exists - if os.path.isfile(outputDir+'/'+filename+'_fails.txt'): - os.remove(outputDir+'/'+filename+'_fails.txt') + # cleaning fails file if it exists + if os.path.isfile(output_dir + '/' + filename + '_fails.txt'): + os.remove(output_dir + '/' + filename + '_fails.txt') - if coordFile is None: + if coord_file is None: + + if ids is not None: + select_ids = parse_id(ids) + else: + select_ids = None - if IDs is not None: - selectIDs=parseID(IDs) - else: - selectIDs=None - - #separating the gff into 2 different files: one with the features and another with the conting sequences - with open(gffFile, 'r') as in_handle, open(outputDir+'/'+filename+'_features.gff', 'a') as temp_genes, open(outputDir+'/'+filename+'_sequence.fasta', 'a') as temp_contigs: - for line in in_handle: - if not line.startswith('##'): - if '\t' in line: - if selectIDs is not None: - items=line.split('\t') - ID=items[-1].split(';')[0] - ID=ID.split('=')[1] - if ID in selectIDs: - temp_genes.write(line) - else: - temp_genes.write(line) - else: - temp_contigs.write(line) + # separating the gff into 2 different files: one with the features and another with the conting sequences + with open(gff_file, 'r') as in_handle, open(output_dir + '/' + filename + '_features.gff', 'a') as temp_genes, \ + open(output_dir + '/' + filename + '_sequence.fasta', 'a') as temp_contigs: + for line in in_handle: + if not line.startswith('##'): + if '\t' in line: + if select_ids is not None: + items = line.split('\t') + id = items[-1].split(';')[0] + id = id.split('=')[1] + if id in select_ids: + temp_genes.write(line) + else: + temp_genes.write(line) + else: + temp_contigs.write(line) + + gff_files = parse_features(output_dir + '/' + filename + '_features.gff') + + retrieve_seq(output_dir + '/' + filename + '_sequence.fasta', gff_files, extra_seq, filename, output_dir) - gffFiles=parseFeatures(outputDir+'/'+filename+'_features.gff') + else: + with open(gff_file, 'r') as in_handle, \ + open(output_dir + '/' + filename + '_sequence.fasta', 'a') as temp_contigs: + for line in in_handle: + if not line.startswith('##'): + if '\t' in line: + pass + else: + temp_contigs.write(line) - retrieveSeq(outputDir+'/'+filename+'_sequence.fasta', gffFiles, extraSeq, filename, outputDir) - - else: - with open(gffFile, 'r') as in_handle, open(outputDir+'/'+filename+'_sequence.fasta', 'a') as temp_contigs: - for line in in_handle: - if not line.startswith('##'): - if '\t' in line: - pass - else: - temp_contigs.write(line) + retrieve_seq_file(output_dir + '/' + filename + '_sequence.fasta', coord_file, extra_seq, filename, output_dir) - retrieveSeq_File(outputDir+'/'+filename+'_sequence.fasta', coordFile, extraSeq, filename, outputDir) + # removing temp files + if not keep_temporary_files: + try: + os.remove(output_dir + '/' + filename + '_features.gff') + except: + pass + os.remove(output_dir + '/' + filename + '_sequence.fasta') - #removing temp files - if not keepTemporaryFiles: - try: - os.remove(outputDir+'/'+filename+'_features.gff') - except: - pass - os.remove(outputDir+'/'+filename+'_sequence.fasta') def main(): - - version='1.0.0' + parser = argparse.ArgumentParser(prog='gffParser.py', description='GFF3 parser for feature sequence retrival.', + epilog='by C I Mendes (cimendes@medicina.ulisboa.pt)') + parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version)) - parser = argparse.ArgumentParser(prog='gffParser.py', description='GFF3 parser for feature sequence retrival.', epilog='by C I Mendes (cimendes@medicina.ulisboa.pt)') - parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version)) + parser.add_argument('-i', '--input', + help='GFF3 file to parse, containing both sequences and annotations (like the one obtained from' + ' PROKKA).', type=argparse.FileType('r'), required=True) + parser.add_argument('-x', '--extraSeq', help='Extra sequence to retrieve per feature in gff.', default=0, type=int, + required=False) + parser.add_argument('-k', '--keepTemporaryFiles', help='Keep temporary gff(without sequence) and fasta files.', + action='store_true') + parser.add_argument('-o', '--outputDir', help='Path to where the output is to be saved.', default='.', + required=False) - parser.add_argument('-i', '--input', help='GFF3 file to parse, containing both sequences and annotations (like the one obtained from PROKKA).', type=argparse.FileType('r'), required=True) - parser.add_argument('-x', '--extraSeq', help='Extra sequence to retrieve per feature in gff.', default=0, type=int, required=False) - parser.add_argument('-k','--keepTemporaryFiles', help='Keep temporary gff(without sequence) and fasta files.', action='store_true') - parser.add_argument('-o', '--outputDir', help='Path to where the output is to be saved.', default='.', required=False) - + parser_optional_selected_regions_exclusive = parser.add_mutually_exclusive_group() + parser_optional_selected_regions_exclusive.add_argument('-s', '--select', + help='txt file with the IDs of interest, one per line', + type=argparse.FileType('r'), required=False) + parser_optional_selected_regions_exclusive.add_argument('-f', '--fromFile', + help='Sequence coordinates to be retrieved. Requires contig' + ' ID and coords (contig,strart,end) in a csv file. One' + ' per line.', type=argparse.FileType('r'), + required=False) - parser_optional_selected_regions_exclusive = parser.add_mutually_exclusive_group() - parser_optional_selected_regions_exclusive.add_argument('-s', '--select', help='txt file with the IDs of interest, one per line', type=argparse.FileType('r'), required=False) - parser_optional_selected_regions_exclusive.add_argument('-f', '--fromFile', help='Sequence coordinates to be retrieved. Requires contig ID and coords (contig,strart,end) in a csv file. One per line.', type=argparse.FileType('r'), required=False) + args = parser.parse_args() - args = parser.parse_args() - - gffParser(os.path.abspath(args.input.name), args.extraSeq, os.path.abspath(args.outputDir), args.keepTemporaryFiles, os.path.abspath(args.select.name) if args.select is not None else None, os.path.abspath(args.fromFile.name) if args.fromFile is not None else None) + args.outputDir = os.path.abspath(args.outputDir) + if not os.path.isdir(args.outputDir): + os.makedirs(args.outputDir) + + gff_parser(os.path.abspath(args.input.name), args.extraSeq, os.path.abspath(args.outputDir), + args.keepTemporaryFiles, + os.path.abspath(args.select.name) if args.select is not None else None, + os.path.abspath(args.fromFile.name) if args.fromFile is not None else None) if __name__ == "__main__":