view Contra/scripts/convert_gene_coordinate.py @ 15:f4345d10e1ad

Uploaded
author fcaramia
date Tue, 11 Dec 2012 18:52:28 -0500
parents 7564f3b1e675
children
line wrap: on
line source

# ----------------------------------------------------------------------#
# Copyright (c) 2011, Richard Lupat & Jason Li.
#
# > Source License <
# This file is part of CONTRA.
#
#    CONTRA is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    CONTRA is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with CONTRA.  If not, see <http://www.gnu.org/licenses/>.
#
# 
#-----------------------------------------------------------------------#
# Last Updated : 03 Oct 2011 11:00AM

import sys

def outputwrite(output, gene,chr,a,b,c,id,n,sOri, eOri):
        id = str(id)
	n = str(n)
        output.write(gene+"\t"+chr+"\t"+a+"\t"+b+"\t"+c+"\t"+id+"\t"+n+"\t"+sOri+"\t"+eOri+"\n")

def outputwrite2(output2, chr,c,sOri, eOri):
        output2.write(chr+"\t"+sOri+"\t"+eOri+"\t"+c+"\n")


def convertGeneCoordinate(targetList, bufLocFolder): 
	inputfile2 = bufLocFolder + "chr/"
	outputfile = bufLocFolder + "geneRefCoverage.txt"
	outputfile2= bufLocFolder + "geneRefCoverage2.txt"

	output= open(outputfile,"w")
	output2 = open(outputfile2 , "w")

	rn = 1
	prevchr = ""
	for target in targetList:
		chr = target.chr
		starts = target.oriStart.split(",")
		ends = target.oriEnd.split(",")

		if ((len(chr) > 5) or (chr[len(chr)-1] == "M")):
			continue

		if (prevchr != chr):
			print chr #progress checking
			prevchr = chr
			t = 0
			covFile = file.readlines(open(inputfile2+chr+".txt","r"))

		for n in range(0,target.numberExon):
			if t >= len(covFile):
				break
			cov = covFile[t].split()
			while  ((int(starts[n]) < int(cov[1])) or (int(starts[n]) >= int(cov[2]))):
				if (int(cov[1]) > int(starts[n])):
					t-=1
				else:
					t+=1
				cov = covFile[t].split() 
	
			while  ((int(ends[n]) < int(cov[1])) or (int(ends[n]) >  int(cov[2]))):
                		# print output #
				if (rn == 1):
					prev = target.id
                		nID = target.id
                		if (nID != prev):
					rn = 1
				ref1 = str(rn)
				ref2 = str(int(cov[2]) - int(starts[n]) + rn)
				outputwrite(output, target.gene,chr,ref1,ref2,cov[3],target.id,n,starts[n],cov[2])

				outputwrite2(output2, chr,cov[3],starts[n],cov[2])


				rn = int(ref2)
				prev = nID
				# -- #

				# get to the next line of inputfile#
				t+= 1
				cov = covFile[t].split()
				starts[n] = cov[1]	
		
		#print output #
		if (t == 0) and (t >= len(covFile)):
			continue

		if (rn == 1):
			prev = target.id
		nID = target.id
		if (nID != prev):
			rn = 1
		ref1 = str(rn)
		ref2 = str(int(ends[n]) - int(starts[n]) + rn)	
		outputwrite(output, target.gene, chr, ref1, ref2, cov[3], target.id, n, starts[n], ends[n])	

		outputwrite2(output2, chr, cov[3], starts[n], ends[n])


		rn = int(ref2)
		prev = nID
		# -- #
	output.close()	
	output2.close()


def convertGeneCoordinate2(targetList, bufLocFolder):
        inputfile2 = bufLocFolder + "chr/"
        outputfile = bufLocFolder + "geneRefCoverage.txt"
        outputfile_avg = bufLocFolder + "geneRefCoverage_targetAverage.txt"

        output= open(outputfile,"w")
        output_avg = open(outputfile_avg,"w")

        rn = 1
        prevchr = ""
        for target in targetList:
                chr = target.chr
                starts = target.oriStart.split(",")
                ends = target.oriEnd.split(",")
                target_ttl_readdepth = 0
                starts_leftmost = starts[0]


                if ((len(chr) > 5) or (chr[len(chr)-1] == "M")):
                        continue

                if (prevchr != chr):
                        print chr #progress checking
                        prevchr = chr
                        t = 0
                        covFile = file.readlines(open(inputfile2+chr+".txt","r"))

                for n in range(0,target.numberExon):
                        if t >= len(covFile):
                                break
                        cov = covFile[t].split()
                        while  ((int(starts[n]) < int(cov[1])) or (int(starts[n]) >= int(cov[2]))):
                                if (int(cov[1]) > int(starts[n])):                                        t-=1
                                else:
                                        t+=1
                                cov = covFile[t].split()

                        while  ((int(ends[n]) < int(cov[1])) or (int(ends[n]) >  int(cov[2]))):
                                # print output #
                                if (rn == 1):
                                        prev = target.id
                                nID = target.id
                                if (nID != prev):
                                        rn = 1
                                ref1 = str(rn)
                                ref2 = str(int(cov[2]) - int(starts[n]) + rn)
                                outputwrite2(output, chr,cov[3],starts[n],cov[2])
                                tmprange=int(cov[2])-int(starts[n])+1
                                target_ttl_readdepth+=int(cov[3])*tmprange
                                #target_length+=tmprange

                                rn = int(ref2)
                                prev = nID
                                # -- #

                                # get to the next line of inputfile#
                                t+= 1
                                cov = covFile[t].split()
                                starts[n] = cov[1]

                #print output #
                if (t == 0) and (t >= len(covFile)):
                        continue

                if (rn == 1):
                        prev = target.id
                nID = target.id
                if (nID != prev):
                        rn = 1
                ref1 = str(rn)
                ref2 = str(int(ends[n]) - int(starts[n]) + rn)
                outputwrite2(output, chr, cov[3], starts[n], ends[n])
                tmprange=int(ends[n])-int(starts[n])+1
                target_ttl_readdepth+=int(cov[3])*tmprange
                #target_length+=tmprange
                target_length = int(ends[n])-int(starts_leftmost)+1
                output_avg.write("\t".join([chr,starts_leftmost,ends[n],str(target_ttl_readdepth/target_length),str(target_length)])+"\n")

                rn = int(ref2)
                prev = nID
                # -- #
        output.close()
        output_avg.close()