diff scripts/ReMatCh/utils/gffParser.py @ 0:965517909457 draft

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Wed, 22 Jan 2020 08:41:44 -0500
parents
children 0cbed1c0a762
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/ReMatCh/utils/gffParser.py	Wed Jan 22 08:41:44 2020 -0500
@@ -0,0 +1,195 @@
+#!/usr/bin/env python
+
+import argparse, sys, os
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio.Alphabet import IUPAC
+import ntpath
+
+def parseID(filename):
+	#get wanted feature IDs
+	gffIDs=[]
+	with open(filename, 'r') as in_handle:
+		for line in in_handle:
+			line=line.strip()
+			gffIDs.append(line)
+	return gffIDs
+
+def retrieveSeq_File(fastaFile, coordFile, extraSeq, filename, outputDir):
+	#Parsing the sequence file, using the provided txt file containing the contig ID and positions to retrieve sequences.
+	handle = open(fastaFile, "rU")
+	records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
+	handle.close()
+
+	Seq2Get={}
+	with open(coordFile, 'r') as sequeces2get:
+		for line in sequeces2get:
+			line=line.split(',')
+			coords=(int(line[-2]), int(line[-1]))
+			contigID=line[0]
+			if contigID in Seq2Get.keys():
+				Seq2Get[contigID].append(coords)
+			else:
+				Seq2Get[contigID]=[coords]
+
+	with open(outputDir+'/'+filename+'.fasta','w') as output_handle:
+		fails=0
+		successes=0
+		records=[]
+		for contig, listCoords in Seq2Get.items():
+			contigSeq=records_dict[contig].seq
+			for coord in listCoords:
+				coord1=coord[0]-extraSeq
+				coord2=coord[1]+extraSeq
+				if coord1 < 0 or coord2 > len(contigSeq):
+					fail_log=open(outputDir+'/'+filename+'_fails.txt', 'a')
+					fail_log.write(contig + ',' + str(coord[0])+','+ str(coord[1])+'\n')
+					fail_log.close()
+					fails+=1
+				else:
+					geneseq=str(contigSeq[coord1:coord2])
+					record = SeqRecord(Seq(geneseq), id=str(str(contig)+'#'+str(coord1)+'_'+str(coord2)), description='')
+					records.append(record)
+					successes+=1
+		SeqIO.write(records, output_handle, "fasta")
+
+	print 'Retrived %s features successfully from %s with %s bp as extra sequence.' % (str(successes), filename, str(extraSeq))
+	if fails>0:
+		print '%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename)
+
+def retrieveSeq(fastaFile, gffFeatures, extraSeq, filename, outputDir):
+	#parsing the sequence file into a SeqIO dictionary. one contig per entry
+	handle = open(fastaFile, "rU")
+	records_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
+	handle.close()
+
+	with open(outputDir+'/'+filename+'.fasta','w') as output_handle:
+		fails=0
+		successes=0
+		records=[]
+		for locus, location in gffFeatures.items():
+			#print locus
+			contigSeq=records_dict[location[0]].seq
+			coord1=location[1]-extraSeq
+			coord2=location[2]+extraSeq
+			if coord1 < 0 or coord2 > len(contigSeq):
+				fail_log=open(outputDir+'/'+filename+'_fails.txt', 'a')
+				fail_log.write(locus+'\n')
+				fail_log.close()
+				fails+=1
+			else:
+				geneseq=str(contigSeq[coord1:coord2])
+				if location[3] == '-':
+					seq = Seq(geneseq)
+					geneseq =str(seq.reverse_complement())
+				record = SeqRecord(Seq(geneseq), id=str(locus+'-'+str(location[0])+'#'+str(location[1])+'_'+str(location[2])), description='')
+				records.append(record)
+				successes+=1
+		SeqIO.write(records, output_handle, "fasta")
+	print 'Retrived %s features successfully from %s with %s bp as extra sequence.' % (str(successes), filename, str(extraSeq))
+	if fails>0:
+		print '%s featrued failed to retrieve. Check %s_fails.txt file.' % (str(fails), filename)
+
+def parseFeatures(tempGFF):
+	#parsing the feature file into a dictionary
+	gffFeatures={}
+
+	with open(tempGFF, 'r') as temp_genes:
+		for line in temp_genes:
+			line=line.split('\t')
+			if "CDS" in line[2]:
+				ID=line[-1].split(';')
+				locusID=str(ID[0].split('=')[1])
+				contig=line[0]
+				begining=int(line[3])-1 #to get the full sequence
+				end=int(line[4])
+				strand=line[6]
+				location=[contig, begining, end, strand]
+				gffFeatures[locusID]=location
+	return gffFeatures
+
+def gffParser(gffFile, extraSeq=0, outputDir='.', keepTemporaryFiles=False, IDs=None, coordFile=None):
+
+	filename=ntpath.basename(gffFile).replace('.gff', '')
+
+	#cleaning temp files if they exist
+	if os.path.isfile(outputDir+'/'+filename+'_features.gff'):
+		os.remove(outputDir+'/'+filename+'_features.gff')
+	if os.path.isfile(outputDir+'/'+filename+'_sequence.fasta'):
+		os.remove(outputDir+'/'+filename+'_sequence.fasta')
+
+	#cleaning fails file if it exists
+	if os.path.isfile(outputDir+'/'+filename+'_fails.txt'):
+		os.remove(outputDir+'/'+filename+'_fails.txt')
+
+	if coordFile is None:
+
+		if IDs is not None:
+			selectIDs=parseID(IDs)
+		else:
+			selectIDs=None
+	
+		#separating the gff into 2 different files: one with the features and another with the conting sequences
+		with open(gffFile, 'r') as in_handle, open(outputDir+'/'+filename+'_features.gff', 'a') as temp_genes, open(outputDir+'/'+filename+'_sequence.fasta', 'a') as temp_contigs:
+			for line in in_handle: 
+				if not line.startswith('##'):
+					if '\t' in line:
+						if selectIDs is not None:
+							items=line.split('\t')
+							ID=items[-1].split(';')[0]
+							ID=ID.split('=')[1]
+							if ID in selectIDs:
+								temp_genes.write(line)
+						else:
+							temp_genes.write(line)
+					else:
+						temp_contigs.write(line)
+
+		gffFiles=parseFeatures(outputDir+'/'+filename+'_features.gff')
+
+		retrieveSeq(outputDir+'/'+filename+'_sequence.fasta', gffFiles, extraSeq, filename, outputDir)
+	
+	else:
+		with open(gffFile, 'r') as in_handle, open(outputDir+'/'+filename+'_sequence.fasta', 'a') as temp_contigs:
+			for line in in_handle:
+				if not line.startswith('##'):
+					if '\t' in line:
+						pass
+					else:
+						temp_contigs.write(line)
+
+		retrieveSeq_File(outputDir+'/'+filename+'_sequence.fasta', coordFile, extraSeq, filename, outputDir)
+
+	#removing temp files
+	if not keepTemporaryFiles:
+		try:
+			os.remove(outputDir+'/'+filename+'_features.gff')
+		except:
+			pass
+		os.remove(outputDir+'/'+filename+'_sequence.fasta')
+
+def main():
+
+	version='1.0.0'
+
+	parser = argparse.ArgumentParser(prog='gffParser.py', description='GFF3 parser for feature sequence retrival.', epilog='by C I Mendes (cimendes@medicina.ulisboa.pt)')
+	parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version))
+
+	parser.add_argument('-i', '--input', help='GFF3 file to parse, containing both sequences and annotations (like the one obtained from PROKKA).', type=argparse.FileType('r'), required=True)
+	parser.add_argument('-x', '--extraSeq', help='Extra sequence to retrieve per feature in gff.', default=0, type=int, required=False)
+	parser.add_argument('-k','--keepTemporaryFiles', help='Keep temporary gff(without sequence) and fasta files.', action='store_true')
+	parser.add_argument('-o', '--outputDir', help='Path to where the output is to be saved.', default='.', required=False)
+	
+
+	parser_optional_selected_regions_exclusive = parser.add_mutually_exclusive_group()
+	parser_optional_selected_regions_exclusive.add_argument('-s', '--select', help='txt file with the IDs of interest, one per line', type=argparse.FileType('r'), required=False)
+	parser_optional_selected_regions_exclusive.add_argument('-f', '--fromFile', help='Sequence coordinates to be retrieved. Requires contig ID and coords (contig,strart,end) in a csv file. One per line.', type=argparse.FileType('r'), required=False)
+
+	args = parser.parse_args()
+	
+	gffParser(os.path.abspath(args.input.name), args.extraSeq, os.path.abspath(args.outputDir), args.keepTemporaryFiles, os.path.abspath(args.select.name) if args.select is not None else None, os.path.abspath(args.fromFile.name) if args.fromFile is not None else None)
+
+
+if __name__ == "__main__":
+    main()