view scaffremodler/convert2X.py @ 0:66885fa414c8 draft default tip

Uploaded
author gdroc
date Mon, 14 Nov 2016 08:31:23 -0500
parents
children
line wrap: on
line source


#
#  Copyright 2014 CIRAD
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, see <http://www.gnu.org/licenses/> or
#  write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#
#


import optparse, os, shutil, subprocess, sys, tempfile, fileinput, ConfigParser, operator

from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

def __main__():
	#Parse Command Line
	parser = optparse.OptionParser(usage="python %prog [options]\n\nProgram designed by Guillaume MARTIN : guillaume.martin@cirad.fr\n\n"
	"This program replace specified regions in the provided table file by X. These X will be used to split scaffold using SplitOnX.py"
	"The table file should be formated has in the example:"
	"scaffold83 93565 93586"
	"scaffold120 330181 330183"
	"scaffold120 380870 383428")
	# Wrapper options. 
	parser.add_option( '', '--table', dest='table', default='not_filled', help='The table file with region to convert to X')
	parser.add_option( '', '--fasta', dest='fasta', default='not_filled', help='The multifasta sequence file')
	parser.add_option( '', '--out', dest='out', default='X_converted.fasta', help='The output file name, [default: %default]')
	(options, args) = parser.parse_args()
	
	
	
	if options.table == 'not_filled':
		sys.exit('--table argument is missing')
	if options.fasta == 'not_filled':
		sys.exit('--fasta argument is missing')
	
	#loading sequences
	record_dict = SeqIO.index(options.fasta, "fasta")
	file = open(options.table)
	dic = {}
	for line in file:
		data = line.split()
		if data:
			if data[0] in dic:
				if len(data) == 2:
					dic[data[0]].add(int(data[1])-1)
				else:
					i = int(data[1])
					while i <= int(data[2]):
						dic[data[0]].add(i-1)
						i += 1
			else:
				dic[data[0]] = set()
				if len(data) == 2:
					dic[data[0]].add(int(data[1])-1)
				else:
					i = int(data[1])
					while i <= int(data[2]):
						dic[data[0]].add(i-1)
						i += 1
	file.close()
	
	outfile = open(options.out,'w')
	for n in record_dict:
		if n in dic:
			sequence = list(str(record_dict[n].seq))
			for k in dic[n]:
				sequence[k] = 'X'
			SeqIO.write(SeqRecord(Seq(''.join(sequence), generic_dna), id = n, description=''),outfile, "fasta")
		else:
			SeqIO.write(SeqRecord(record_dict[n].seq, id = n, description=''),outfile, "fasta")

if __name__ == "__main__": __main__()