# HG changeset patch
# User fcaramia
# Date 1347517903 14400
# Node ID 7564f3b1e6756df4b201e6e336c57c76fd38aeb7
Uploaded
diff -r 000000000000 -r 7564f3b1e675 Contra/BEDTools-User-Manual.v4.pdf
Binary file Contra/BEDTools-User-Manual.v4.pdf has changed
diff -r 000000000000 -r 7564f3b1e675 Contra/BEDTools.v2.11.2.tar.gz
Binary file Contra/BEDTools.v2.11.2.tar.gz has changed
diff -r 000000000000 -r 7564f3b1e675 Contra/CONTRA_User_Guide.2.0.pdf
Binary file Contra/CONTRA_User_Guide.2.0.pdf has changed
diff -r 000000000000 -r 7564f3b1e675 Contra/baseline.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Contra/baseline.py Thu Sep 13 02:31:43 2012 -0400
@@ -0,0 +1,253 @@
+#!/usr/bin/python
+
+import os
+import sys
+import fnmatch
+import shlex
+import subprocess
+import shutil
+import math
+from optparse import OptionParser
+from scripts.split_chromosome import splitByChromosome3 as splitByChromosome
+from scripts.get_chr_length import *
+from scripts.convert_targeted_regions import *
+from scripts.convert_gene_coordinate import convertGeneCoordinate2 as convertGeneCoordinate
+from multiprocessing import Pool
+
+class Params:
+ """
+ Class for top-level system parameters:
+ """
+
+ def __init__(self):
+
+ def mult_files(option, opt_str, value, parser):
+ # optparse callback: collect every following argument up to the next
+ # option flag as a file name for this option.
+ args=[]
+ for arg in parser.rargs:
+ if arg[0] != "-":
+ args.append(arg)
+ else:
+ break
+ # consume the collected file names from rargs, even when they are
+ # the last arguments on the command line
+ del parser.rargs[:len(args)]
+ if getattr(parser.values, option.dest):
+ args.extend(getattr(parser.values, option.dest))
+ setattr(parser.values, option.dest, args)
+
+
+ #command-line option definition
+ self.parser = OptionParser()
+ self.parser.add_option("-t", "--target",
+ help="Target region definition file [REQUIRED] [BED Format]",
+ action="store", type="string", dest="target")
+ self.parser.add_option("-f", "--files",
+ help="Files to be converted to baselines [REQUIRED] [BAM]",
+ action="callback", callback=mult_files, dest="files")
+ self.parser.add_option("-o", "--output",
+ help="Output folder [REQUIRED]",
+ action="store", type="string", dest="output")
+ self.parser.add_option("-c","--trim",
+ help="Portion of outliers to be removed before calculating average [Default: 0.2]",
+ action="store", dest="trim", default="0.2")
+ self.parser.add_option("-n","--name",
+ help="Output baseline name [Default: baseline]",
+ action="store", dest="name", default="baseline")
+
+ # required parameters list:
+ self.ERRORLIST = []
+
+ #change system parameters based on any command line arguments
+ (options, args) = self.parser.parse_args()
+ if options.target:
+ self.TARGET=options.target
+ else:
+ self.ERRORLIST.append("target")
+
+ if options.files:
+ self.FILES=options.files
+ else:
+ self.ERRORLIST.append("files")
+
+ if options.output:
+ self.OUTPUT=options.output
+ else:
+ self.ERRORLIST.append("output")
+
+ if options.trim:
+ self.TRIM=float(options.trim)
+
+ if options.name:
+ self.NAME=options.name
+
+ if len(self.ERRORLIST) != 0:
+ self.parser.print_help()
+ self.parser.error("Missing required parameters")
+
+
+ def repeat(self):
+ #params test
+ print "target :", self.TARGET
+ print "files :", self.FILES
+ print "output :", self.OUTPUT
+ print "trim :", self.TRIM
+ print "name :", self.NAME
+
+
+# option handling
+params = Params()
+#params.repeat()
+targetFile = params.TARGET
+infiles = params.FILES
+output_dir = params.OUTPUT
+
+#Debug
+print " ------ baseline.py ------- "
+print "Target:", targetFile
+for files in infiles:
+ print "File:", files
+print "Output Directory: ", output_dir
+
+def make_new_directory(outdir):
+ print outdir
+ if not os.path.exists(outdir):
+ os.mkdir(outdir)
+
+print " ----- creating output directory -----"
+make_new_directory(output_dir)
+outdir = os.path.join(output_dir, "buf")
+make_new_directory(outdir)
+
+targetFile2 = os.path.join(outdir, os.path.basename(targetFile)+".sorted")
+os.system("sort -k1,1 -k2n %s > %s" %(targetFile,targetFile2))
+
+def processInFile(infile):
+ infilename = os.path.basename(infile)
+ s_outdir = os.path.join(outdir, infilename)
+ make_new_directory(s_outdir)
+ genomeFile = os.path.join(s_outdir,infilename +'.chrsummary')
+ get_genome(infile, genomeFile)
+
+ bedgraph = os.path.join(s_outdir, infilename + ".BEDGRAPH")
+ args = shlex.split("genomeCoverageBed -ibam %s -bga -g %s" %(infile, genomeFile))
+ iOutFile = open(bedgraph, "w")
+ output = subprocess.Popen(args, stdout = iOutFile).wait()
+ iOutFile.close()
+
+ targetList = convertTarget(targetFile2)
+
+ splitByChromosome(bedgraph)
+
+ convertGeneCoordinate(targetList, os.path.dirname(bedgraph)+"/")
+ bedgraph_tgtonly = bedgraph+".TARGETONLY"
+ bedgraph_tgtonly_avg = bedgraph+".TARGETONLY.AVERAGE"
+ os.rename(os.path.join(os.path.dirname(bedgraph),"geneRefCoverage.txt"),bedgraph_tgtonly)
+ os.rename(os.path.join(os.path.dirname(bedgraph),"geneRefCoverage_targetAverage.txt"),bedgraph_tgtonly_avg)
+ shutil.copy(bedgraph_tgtonly,outdir)
+ shutil.copy(bedgraph_tgtonly_avg,outdir)
+
+print "----- Processing Files -----"
+pool = Pool(5)
+pool.map(processInFile,infiles)
+
+
+# STEP 2 -> Create Union BedGraph
+allfiles_names = [x for x in os.listdir(outdir) if x.endswith("TARGETONLY")]
+allfiles_path = [os.path.join(outdir,x) for x in allfiles_names]
+
+args=["unionBedGraphs","-header","-i"]+allfiles_path+["-names"]+allfiles_names
+
+print (str(args))
+fo = os.path.join(outdir,"TARGETONLY.union.txt")
+foh = open(fo,"w")
+
+subprocess.Popen(args,stdout=foh).wait()
+foh.close()
+
+# STEP 3 -> POOL METHOD
+TRIM = params.TRIM #TRIM = 0.2
+f = fo
+fh = open(f)
+
+fo=os.path.join(output_dir, params.NAME+".pooled2_TRIM"+str(TRIM)+".txt")
+fo_libsize=fo+".LIBSIZE"
+
+
+foh=open(fo,'w')
+fo_libsize_h=open(fo_libsize,'w')
+
+
+fh.readline()
+
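+# First pass over the union bedgraph: accumulate each sample's library size
+# (per-base coverage multiplied by interval length, summed over all rows).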
+Ns = None
+nSamples = None
+
+lineCount=0
+for line in fh:
+ lineCount+=1
+ line_elems=line.rstrip().split("\t")
+ rangeLen=int(line_elems[2])-int(line_elems[1])
+ if not Ns:
+ nSamples=len(line_elems)-3
+ Ns = [0 for x in range(nSamples)]
+ for i in range(nSamples):
+ ind=i+3
+ Ns[i] += int(line_elems[ind])*rangeLen
+
+
+fh.close()
+fh=open(f)
+fh.readline()
+
+
+lineCount=0
+nSamples_exclude = int(math.floor(TRIM*nSamples))
+
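+# Library-size normalisation: each sample's coverage is later rescaled by
+# Ns_gmean/Ns[i] (the geometric mean of the library sizes divided by its own
+# library size) before the trimmed mean of each interval is taken.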
+def gm_mean(xs):
+ tmpprod = 1
+ p = 1.0/len(xs)
+ for x in xs:
+ tmpprod = tmpprod * math.pow(x,p)
+ return tmpprod
+
+Ns_gmean = gm_mean(Ns)
+
+def meanstdv(x):
+ from math import sqrt
+ n, mean, std = len(x), 0, 0
+ for a in x:
+ mean = mean+a
+ mean = mean / float(n)
+ for a in x:
+ std = std+(a-mean)**2
+ std=sqrt(std/float(n-1))
+ return mean, std
+
+
+libsize = 0
+for line in fh:
+ lineCount+=1
+ line_elems=line.rstrip().split("\t")
+ rangeLen=int(line_elems[2])-int(line_elems[1])
+ xs_tmp=[int(x) for x in line_elems[3:]]
+ xs = [float(xs_tmp[i])*Ns_gmean/Ns[i] for i in range(len(xs_tmp))]
+ xs.sort()
+ xs_trimmed=xs[nSamples_exclude:(nSamples-nSamples_exclude)]
+ #trimmed_mean=sum(xs_trimmed)/float(len(xs_trimmed))
+ trimmed_mean,std=meanstdv(xs_trimmed)
+ libsize+=rangeLen*trimmed_mean
+ foh.write("\t".join(line_elems[0:3]+[str(trimmed_mean),str(std)])+"\n")
+
+
+fo_libsize_h.write(str(int(round(libsize))))
+
+fh.close()
+foh.close()
+fo_libsize_h.close()
+
+def removeTempFolder(tempFolderPath):
+ import shutil
+
+ shutil.rmtree(tempFolderPath)
+ print "Temp Folder Removed"
+
+# Removed Temp Folder
+removeTempFolder(outdir)
diff -r 000000000000 -r 7564f3b1e675 Contra/baseline.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Contra/baseline.xml Thu Sep 13 02:31:43 2012 -0400
@@ -0,0 +1,69 @@
+
+ : Control files for Contra
+
+ bedtools
+
+
+
+ baseline_wrapper.pl
+
+ ##Required files
+ "PLAYEROPTION::-t=$target_file"
+
+ #for $group in $file_group
+ "BAMLISTENTRY::${group.bam}"
+ #end for
+
+ "PLAYEROPTION::--name=$sampleName"
+ "PLAYEROPTION::--trim=$trim"
+
+ ##File to generate the bam list
+ "BASELINEOUTPUT::$baseline_output"
+
+
+|
+
+**Reference**
+ http://contra-cnv.sourceforge.net/
+
+-----
+
+**What it does**
+
+Creating a baseline control from multiple samples can be useful when a matched control is not available. On the CONTRA download page, we have provided several baseline files for some of the platforms that we have tried. Alternatively, the “baseline.py” script that comes with CONTRA can be used to generate a custom baseline file.
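+
+For example, a baseline can be generated directly with the baseline.py script; the file names below are hypothetical:
+
+::
+
+ baseline.py --target target_regions.bed --files control1.bam control2.bam control3.bam --output baseline_out --trim 0.2 --name baseline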
+
+-----
+
+**Parameters**
+
+::
+
+ -t, --target Target region definition file [REQUIRED] [BED format]
+
+ -f, --files Files to be converted to baselines [REQUIRED] [BAM]
+
+ -o, --output Output folder [REQUIRED]
+
+ -c, --trim Portion of outliers to be removed before calculating
+ average [Default: 0.2]
+
+ -n, --name Output baseline file name [Default: baseline]
+
+
+
+
+
+
diff -r 000000000000 -r 7564f3b1e675 Contra/baseline_wrapper.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Contra/baseline_wrapper.pl Thu Sep 13 02:31:43 2012 -0400
@@ -0,0 +1,68 @@
+#sshpass -p pmac1512 ssh -o StrictHostkeyChecking=no galaxy@pmc-bioinf03 "gmt music play $*"
+#echo "gmt music play $*"
+
+
+use strict;
+use warnings;
+use File::Basename;
+use Cwd;
+die qq(
+Bad number of inputs
+
+) if(!@ARGV);
+
+
+my $player_options = "";
+my $baseline_output;
+
+my $dir = getcwd;
+my $variable = "";
+my $files = "";
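+# Galaxy passes each argument as a TYPE::value token; route it to the option
+# string, the BAM file list, or the baseline output path.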
+foreach my $input (@ARGV)
+{
+ my @tmp = split "::", $input;
+
+ if($tmp[0] eq "PLAYEROPTION")
+ {
+ $variable = $tmp[1];
+ $variable =~ s/=/ /g;
+ $player_options = "$player_options $variable";
+ }
+ elsif($tmp[0] eq "BASELINEOUTPUT")
+ {
+ $baseline_output = $tmp[1];
+ }
+ elsif($tmp[0] eq "BAMLISTENTRY")
+ {
+ $files = "$files ${tmp[1]}";
+ }
+ else
+ {
+ die("Unknown Input: $input\n");
+ }
+}
+
+
+my $working_dir = "BASELINE_OUTPUT";
+#remove extension
+
+#Create Contra Output dir
+system ("mkdir $working_dir");
+
+#run baseline
+
+system ("baseline.py --file $files --output $working_dir $player_options > /dev/null");
+
+#Search control file in output dir
+opendir(DIR, $working_dir);
+my @FILES= readdir(DIR);
+foreach my $file (@FILES)
+{
+ my ($filename,$directory,$extension) = fileparse($file, qr/\.[^.]*/);
+ if ($extension eq ".txt")
+ {
+ system ("mv $working_dir/$file $baseline_output");
+ }
+}
+closedir(DIR);
+
diff -r 000000000000 -r 7564f3b1e675 Contra/bedtools_installation_guide.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Contra/bedtools_installation_guide.txt Thu Sep 13 02:31:43 2012 -0400
@@ -0,0 +1,6 @@
+Install BEDTools
+tar -zxvf BEDTools.tar.gz
+cd BEDTools
+make clean
+make all
+sudo cp bin/* /usr/local/bin/
\ No newline at end of file
diff -r 000000000000 -r 7564f3b1e675 Contra/contra.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Contra/contra.py Thu Sep 13 02:31:43 2012 -0400
@@ -0,0 +1,570 @@
+#!/usr/bin/python
+
+# ----------------------------------------------------------------------#
+# Copyright (c) 2011, Richard Lupat & Jason Li.
+#
+# > Source License <
+# This file is part of CONTRA.
+#
+# CONTRA is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# CONTRA is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with CONTRA. If not, see .
+#
+#
+#-----------------------------------------------------------------------#
+# Last Updated : 23 July 2012 16:43PM
+
+
+import os
+from optparse import OptionParser
+import sys
+import subprocess
+import shlex
+from multiprocessing import Process, Manager
+
+from scripts.assign_bin_number_v2 import *
+from scripts.average_count import *
+from scripts.cn_apply_threshold import *
+from scripts.convert_gene_coordinate import *
+from scripts.convert_targeted_regions import *
+from scripts.split_chromosome import *
+from scripts.vcf_out import *
+from scripts.get_chr_length import *
+from scripts.count_libsize import *
+from scripts.target_breakdown import *
+
+#Absolute Path
+scriptPath = os.path.realpath(os.path.dirname(sys.argv[0]))
+
+class Params:
+ """
+ Class for top-level system parameters
+ """
+
+ def __init__(self):
+ # command-line option definition
+ self.parser = OptionParser()
+ self.parser.add_option("-t", "--target",
+ help="Target region definition file [REQUIRED] [BED Format]",
+ action="store", type="string", dest="target")
+ self.parser.add_option("-s", "--test",
+ help="Alignment file for the test sample [REQUIRED] [BAM/SAM]",
+ action="store", type="string", dest="test")
+ self.parser.add_option("-c", "--control",
+ help="Alignment file for the control sample [REQUIRED] [BAM/SAM]",
+ action="store", type="string", dest="control")
+ self.parser.add_option("-f", "--fasta",
+ help="Reference Genome [REQUIRED][FASTA]",
+ action="store", type="string", dest="fasta")
+ self.parser.add_option("-o", "--outFolder",
+ help="the output folder path name to store the output of analysis [REQUIRED]",
+ action="store", type="string", dest="outFolder")
+ self.parser.add_option("--numBin",
+ help="Numbers of bins to group regions. User can specify multiple experiments with different number of bins (comma separated) [20]",
+ action="store", type="string", dest="numBin", default="20")
+ self.parser.add_option("--minReadDepth",
+ help="The threshold for minimum read depth for each base [10]",
+ action="store", type="string", dest="minReadDepth", default=10)
+ self.parser.add_option("--minNBases",
+ help="The threshold for minimum number of bases for each target region [10]",
+ action="store", type="string", dest="minNBases", default= 10)
+ self.parser.add_option("--sam",
+ help="If specified, the test and control samples are in SAM format [False]",
+ action="store_true", dest="sam", default="False")
+ self.parser.add_option("--bed",
+ help="If specified, the control will be in BED format [False]",
+ action="store_true", dest = "bedInput", default="False")
+ self.parser.add_option("--pval",
+ help="The p-value threshold for filtering [0.05]. Applies to Adjusted P-Value.",
+ action="store", type="string", dest="pval", default=0.05)
+ self.parser.add_option("--sampleName",
+ help="The name to be appended to the front of the default output name ['']",
+ action="store", type="string", dest="sampleName", default='')
+ self.parser.add_option("--nomultimapped",
+ help="The option to remove multi-mapped reads [False]",
+ action="store_true", dest="nomultimapped",default="False")
+ self.parser.add_option("-p", "--plot",
+ help="Plots log-ratio distribution for each bin [False]",
+ action="store_true", dest="plot", default="False")
+ self.parser.add_option("--minExon",
+ help="Minimum number of exons in one bin (if fewer than this, bins that contain a small number of exons "
+ +"will be merged into the adjacent bins) [2000]",
+ action="store", type="string", dest="minExon", default="2000")
+ self.parser.add_option("--minControlRdForCall",
+ help="Minimum control readdepth for call [5]",
+ action="store", type="string", dest="minControl", default="5")
+
+ self.parser.add_option("--minTestRdForCall",
+ help="Minimum test readdepth for call [0]",
+ action="store", type="string", dest="minTest", default="0")
+
+ self.parser.add_option("--minAvgForCall",
+ help="Minimum average coverage for call [20]",
+ action="store", type="string", dest="minAvg", default="20")
+
+ self.parser.add_option("--maxRegionSize",
+ help="Maximum region size in the target regions (for breaking large regions into smaller ones; by default, maxRegionSize 0 means no breakdown) [0]",
+ action="store", type="string", dest="maxRegionSize", default="0")
+
+ self.parser.add_option("--targetRegionSize",
+ help="Target region size for breakdown (if maxRegionSize is non-zero) [200]",
+ action="store", type="string", dest="targetRegionSize", default="200")
+
+ self.parser.add_option("-l", "--largeDeletion",
+ help="if specified, CONTRA will run large deletion analysis (CBS). User must have DNAcopy R-library installed to run the analysis. [False]",
+ action="store_true", dest = "large", default="False")
+
+ self.parser.add_option("--smallSegment",
+ help="CBS segment size for calling large variations [1]",
+ action="store", type="string", dest="smallSegment", default="1")
+
+ self.parser.add_option("--largeSegment",
+ help="CBS segment size for calling large variations [25]",
+ action="store", type="string", dest="largeSegment", default="25")
+
+ self.parser.add_option("--lrCallStart",
+ help="Log ratios start range that will be used to call CNV [-0.3]",
+ action="store", type="string", dest="lrs", default="-0.3")
+
+ self.parser.add_option("--lrCallEnd",
+ help="Log ratios end range that will be used to call CNV [0.3]",
+ action="store", type="string", dest="lre", default="0.3")
+
+ self.parser.add_option("--passSize",
+ help="Size of exons that passed the p-value threshold compared to the original exon size [0.35]",
+ action="store", type="string", dest="passSize", default="0.35")
+
+
+ # required parameters list
+ self.ERRORLIST = []
+
+ # change system parameters based on any command line
+ (options, args) = self.parser.parse_args()
+ if options.target:
+ self.TARGET = options.target
+ else:
+ #self.parser.print_help()
+ #self.parser.error("--target not supplied")
+ self.ERRORLIST.append("target")
+
+ if options.test:
+ self.TEST = options.test
+ else:
+ #self.parser.error("--test not supplied")
+ self.ERRORLIST.append("test")
+
+ if options.control:
+ self.CONTROL = options.control
+ else:
+ #self.parser.error("--control not supplied")
+ self.ERRORLIST.append("control")
+
+ if options.fasta:
+ self.FASTA = options.fasta
+ else:
+ #self.parser.error("--fasta not supplied")
+ self.ERRORLIST.append("fasta")
+
+ if options.outFolder:
+ self.OUTFOLDER = options.outFolder
+ else:
+ #self.parser.error("--outFolder not supplied")
+ self.ERRORLIST.append("outfolder")
+
+ if len(self.ERRORLIST) != 0:
+ self.parser.print_help()
+ self.parser.error("Missing required parameters")
+
+ if options.numBin:
+ binsNumber = options.numBin.split(",")
+ try:
+ self.NUMBIN = [int(j) for j in binsNumber]
+ except:
+ self.NUMBIN = [20]
+ if options.minReadDepth:
+ self.MINREADDEPTH = int(options.minReadDepth)
+ if options.minNBases:
+ self.MINNBASES = int(options.minNBases)
+ if options.sam:
+ self.SAM = str(options.sam)
+ if options.pval:
+ self.PVAL = options.pval
+ if options.sampleName:
+ self.SAMPLENAME = options.sampleName
+ else:
+ self.SAMPLENAME = 'No-SampleName'
+ if options.nomultimapped:
+ self.NOMULTIMAPPED = str(options.nomultimapped)
+ if options.plot:
+ self.PLOT = str(options.plot)
+ if options.bedInput:
+ self.BEDINPUT = options.bedInput
+ if options.minExon:
+ self.MINEXON = int(options.minExon)
+ if options.minControl:
+ self.MINCONTROL = options.minControl
+ if options.minTest:
+ self.MINTEST = options.minTest
+ if options.minAvg:
+ self.MINAVG = options.minAvg
+ if options.maxRegionSize:
+ self.MAXREGIONSIZE = int(options.maxRegionSize)
+ if options.targetRegionSize:
+ self.TARGETREGIONSIZE = int(options.targetRegionSize)
+ if options.large:
+ self.LARGE = str(options.large)
+ if options.smallSegment:
+ self.SMALLSEGMENT = options.smallSegment
+ if options.largeSegment:
+ self.LARGESEGMENT = options.largeSegment
+ if options.lre:
+ self.LRE = options.lre
+ if options.lrs:
+ self.LRS = options.lrs
+ if options.passSize:
+ self.PASSSIZE = options.passSize
+
+ def repeat(self):
+ # params test
+ print "target :", self.TARGET
+ print "test :", self.TEST
+ print "control :", self.CONTROL
+ print "fasta :", self.FASTA
+ print "outfolder :", self.OUTFOLDER
+ print "numBin :", self.NUMBIN
+ print "minreaddepth :", self.MINREADDEPTH
+ print "minNBases :", self.MINNBASES
+ print "sam :", self.SAM
+ print "pval :", self.PVAL
+ print "sampleName :", self.SAMPLENAME
+ print "nomultimapped :", self.NOMULTIMAPPED
+ print "plot :", self.PLOT
+ print "bedInput :", self.BEDINPUT
+ print "minExon :", self.MINEXON
+ print "largeDeletion :", self.LARGE
+def checkOutputFolder(outF):
+ print "Creating Output Folder :",
+
+ if outF[len(outF)-1] == "/":
+ outF = outF[:len(outF)-1]
+
+ try:
+ os.mkdir(outF)
+ except:
+ print "cannot create folder '%s'" %outF
+ print "if the folder already exists, please specify another folder"
+ sys.exit(1)
+
+ try:
+ os.mkdir(outF+"/table")
+ os.mkdir(outF+"/plot")
+ os.mkdir(outF+"/buf")
+ os.mkdir(outF+"/buf/ctrData/")
+ os.mkdir(outF+"/buf/testData/")
+ except:
+ print "[ERROR: CANNOT CREATE SUBFOLDERS]"
+ sys.exit(1)
+
+ print " Done."
+
+ return outF
+
+
+#BEDINPUT
+def countTotalReads3(params, folder):
+ tempFileName = folder + "/temp.txt"
+ tempReadFile = open(tempFileName, "w")
+ libsize = get_libsize(params.BEDINPUT)
+ tempReadFile.write(libsize)
+ #tempReadFile.write(params.CONTROLREADCOUNT)
+ tempReadFile.close()
+
+
+def countTotalReads(params, folder):
+ if 'testData' in folder:
+ inF = params.TEST
+ else:
+ inF = params.CONTROL
+
+ # Get Total ReadCount
+ getreadcount = os.system("samtools view %s | wc -l > %s/temp.txt" %(inF,folder))
+
+def samToBam(samfile, bamfile):
+ args = shlex.split("samtools view -bS %s -o %s" %(samfile, bamfile))
+ samtobam = subprocess.call(args)
+
+ return bamfile
+
+def removeMultiMapped(inF, newBAM):
+ # Get New BAM Files with mapping quality > 0
+ args = shlex.split("samtools view -bq 1 %s -o %s" %(inF, newBAM))
+ removeMM = subprocess.call(args)
+ print "Multi mapped reads removed. "
+
+
+#BEDINPUT
+def convertBamSimple(params, folder, targetList, genomeFile):
+ if 'testData' in folder:
+ inF = params.TEST
+ print "Converting TEST Sample... "
+ else:
+ inF = params.CONTROL
+ print "Converting CONTROL Sample... "
+
+ #Copy file to working folder
+ os.system("cp %s %s" %(inF, folder+"sample.BEDGRAPH"))
+
+ # Split Bedgraph by its chromosomes
+ splitByChromosome(folder)
+
+ # Slice the coverage files to only cover the targeted regions
+ print "Getting targeted regions DOC..."
+ convertGeneCoordinate(targetList, folder)
+
+ # LIBSIZE
+ libsize = str(get_libsize(folder+"geneRefCoverage2.txt"))
+ tempLibSize = open(folder + "/temp.txt", "w")
+ tempLibSize.write(libsize)
+ tempLibSize.close()
+
+ print "Targeted regions pre-processing: Done"
+
+
+def convertBam(params, folder, targetList, genomeFile):
+ if 'testData' in folder:
+ inF = params.TEST
+ print "Converting TEST Sample... "
+ else:
+ inF = params.CONTROL
+ print "Converting CONTROL Sample... "
+
+ # Convert BAM Files to BEDGRAPH
+ bedgraph = folder + "sample.BEDGRAPH"
+ args = shlex.split("genomeCoverageBed -ibam %s -bga -g %s" %(inF, genomeFile))
+ #output = subprocess.Popen(args, stdout = subprocess.PIPE).communicate()[0]
+ iOutFile = open(bedgraph, "w")
+ #iOutFile.write(output)
+ output = subprocess.Popen(args, stdout = iOutFile).wait()
+ iOutFile.close()
+
+ # Split Bedgraph by its chromosomes
+ splitByChromosome(folder)
+
+ # Slice the coverage files to only cover the targeted regions
+ print "Getting targeted regions DOC..."
+ convertGeneCoordinate(targetList, folder)
+
+ # LIBSIZE
+ libsize = str(get_libsize(folder+"geneRefCoverage2.txt"))
+ tempLibSize = open(folder + "temp.txt", "w")
+ tempLibSize.write(libsize)
+ tempLibSize.close()
+
+ print "Targeted regions pre-processing: Done"
+
+def analysisPerBin(params, num_bin, outFolder, targetList):
+ import shutil
+
+ bufLoc = outFolder + "/buf"
+ # Assign bin number to the median and average file
+ numBin = assignBin(num_bin, bufLoc+"/average.txt", bufLoc+"/bin", targetList, params.MINEXON)
+
+ #copy bin_boundary to plot folder
+ #outBounFile = os.path.join(outFolder, "plot", "bin_boundary"+str(num_bin))
+ #curBounFile = os.path.join(bufLoc, "bin" + str(num_bin) + ".boundaries.txt")
+ #shutil.copy(curBounFile, outBounFile)
+
+
+ print "Significance Test ... "
+ rScriptName = os.path.join(scriptPath, "scripts", "cn_analysis.v3.R")
+ args = shlex.split("Rscript %s %s %s %s %s %s %s %s %s %s %s"
+ %(rScriptName, num_bin, params.MINREADDEPTH, params.MINNBASES, outFolder, params.SAMPLENAME,params.PLOT, numBin, params.MINCONTROL, params.MINTEST, params.MINAVG))
+ rscr = subprocess.call(args)
+
+
+ print "Generating Output Files ... "
+ # Analysis of CNV
+ tNameList = os.listdir(outFolder+"/table/")
+ if num_bin > 1:
+ tNameId = str(num_bin) + "bins"
+ else:
+ tNameId = str(num_bin) + "bin"
+ for tName in tNameList:
+ if tNameId in tName:
+ break
+
+ if "CNATable" in tName:
+ tName = tName[:len(tName)-4]
+ tableName = outFolder + "/table/" + tName
+ bufTable = bufLoc + "/" + tName
+ applyThreshold(tableName, bufTable, params.PVAL, 100000) #params.MAXGAP = 100000
+
+ # Large Region CBS
+ if (params.LARGE != "False"):
+ rScriptName2 = os.path.join(scriptPath, "scripts", "large_region_cbs.R")
+
+ args = shlex.split("Rscript %s %s %s %s %s %s %s %s %s"
+ %(rScriptName2, tableName+".txt", params.SMALLSEGMENT, params.LARGESEGMENT, params.PVAL, params.PASSSIZE, params.LRS, params.LRE, bufLoc))
+ rscr2 = subprocess.call(args)
+
+ # Generate the DNA sequence (for VCF file)
+ bedFile = bufTable + ".BED"
+ bedFasta = bufTable + ".fastaOut.txt"
+ fastaFile = params.FASTA
+ args = shlex.split("fastaFromBed -fi %s -bed %s -fo %s -name"
+ %(fastaFile, bedFile, bedFasta))
+ fastaBED = subprocess.call(args)
+
+ # Write VCF
+ print "Creating VCF file ... "
+ vcfFile = tableName + ".vcf"
+ vcf_out(bedFasta, vcfFile)
+
+ print "%s created. " %(vcfFile)
+
+ else:
+ print "Table not found"
+
+def removeTempFolder(tempFolderPath):
+ import shutil
+
+ shutil.rmtree(tempFolderPath)
+
+ print "Temp Folder Removed"
+
+
+def main():
+ # option handling
+ params = Params()
+ params.repeat()
+
+ # output folder handling
+ outFolder = checkOutputFolder(params.OUTFOLDER)
+ bufLoc = outFolder + "/buf"
+
+ # convert target file
+ sorted_target = os.path.join(bufLoc, "target.BED")
+ os.system("sort -k1,1 -k2n %s > %s" %(params.TARGET, sorted_target))
+
+ # target breakdown
+ if params.MAXREGIONSIZE > 0:
+ new_target = os.path.join(bufLoc, "target_breakdown.BED")
+ target_breakdown(sorted_target, params.MAXREGIONSIZE, params.TARGETREGIONSIZE, new_target)
+ sorted_target = new_target
+
+ targetList = convertTarget(sorted_target)
+
+ # convert sam to bam if -sam specified
+ if (params.SAM == "True"):
+ print "Pre-processing SAM files"
+
+ test_bam = bufLoc + "/test.BAM"
+ ctr_bam = bufLoc + "/control.BAM"
+
+ samTest = Process(target= samToBam, args=(params.TEST, test_bam))
+ if params.BEDINPUT == "False":
+ samCtr = Process(target= samToBam, args=(params.CONTROL, ctr_bam))
+
+ samTest.start()
+ if params.BEDINPUT == "False":
+ samCtr.start()
+
+ samTest.join()
+ if params.BEDINPUT == "False":
+ samCtr.join()
+
+ params.TEST = test_bam
+ if params.BEDINPUT == "False":
+ params.CONTROL = ctr_bam
+
+ # remove multi mapped reads if --nomultimapped is specified
+ if (params.NOMULTIMAPPED == "True"):
+ print "Removing multi-mapped reads"
+
+ test_bam = bufLoc + "/test_reliable.BAM"
+ ctr_bam = bufLoc + "/control_reliable.BAM"
+
+ bamTest = Process(target= removeMultiMapped, args=(params.TEST, test_bam))
+ if params.BEDINPUT == "False":
+ bamCtr = Process(target= removeMultiMapped, args=(params.CONTROL, ctr_bam))
+
+ bamTest.start()
+ if params.BEDINPUT == "False":
+ bamCtr.start()
+
+ bamTest.join()
+ if params.BEDINPUT == "False":
+ bamCtr.join()
+
+ params.TEST = test_bam
+ if params.BEDINPUT == "False":
+ params.CONTROL = ctr_bam
+
+ # Get Chromosome Length
+ genomeFile = bufLoc + '/sample.Genome'
+ get_genome(params.TEST, genomeFile)
+
+ # spawn bam converting scripts
+ pTest = Process(target= convertBam,
+ args=(params, bufLoc+'/testData/', targetList, genomeFile))
+
+ #BEDINPUT
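+ # A control supplied as a baseline file (--bed) is copied and split directly by
+ # convertBamSimple; otherwise the control BAM is converted like the test sample.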
+ if params.BEDINPUT == "False":
+
+ cTest = Process(target= convertBam,
+ args=(params, bufLoc+'/ctrData/' , targetList, genomeFile))
+ else:
+ cTest = Process(target= convertBamSimple,
+ args=(params, bufLoc+'/ctrData/', targetList, genomeFile))
+ # start the processes
+ pTest.start()
+ cTest.start()
+
+ # wait for all the processes to finish before continuing
+ pTest.join()
+ cTest.join()
+
+ # Get the read depth count from temporary folder
+ for folder in [bufLoc+'/testData/', bufLoc+'/ctrData/']:
+ if 'testData' in folder:
+ t1 = int(file.readlines(open(folder+"temp.txt"))[0].strip("\n"))
+ else:
+ n1 = int(file.readlines(open(folder+"temp.txt"))[0].strip("\n"))
+ print "Test file read depth = ", t1
+ print "Control file read depth = ", n1
+ print "Pre-processing Completed. "
+
+ # Get the Average of the Log Ratio
+ print "Getting the Log Ratio ... "
+ testListName = bufLoc + '/testData/geneRefCoverage.txt'
+ controlListName = bufLoc + '/ctrData/geneRefCoverage.txt'
+ avOut = bufLoc + "/average.txt"
+ averageCount(testListName, controlListName, avOut, t1, n1, params.MINREADDEPTH, params.MINNBASES)
+
+ # Analysis. [Bin, significance test, large deletion, vcf output]
+ print "Binning ... "
+ binProc = []
+ for numBin in params.NUMBIN:
+ binProc.append(Process(target= analysisPerBin, args=(params,numBin,outFolder,targetList)))
+
+ for proc in binProc:
+ proc.start()
+
+ for proc in binProc:
+ proc.join()
+
+ # Removed Temp Folder
+ removeTempFolder(bufLoc)
+
+if __name__ == "__main__":
+ main()
+ print "Done... "
diff -r 000000000000 -r 7564f3b1e675 Contra/contra.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Contra/contra.xml Thu Sep 13 02:31:43 2012 -0400
@@ -0,0 +1,286 @@
+
+ : Copy Number Analysis for Targeted Resequencing
+
+ bedtools
+
+
+
+ contra_wrapper.pl
+
+ ##Ref Genome
+ #if $genomeSource.refGenomeSource == "history":
+ "PLAYEROPTION::-f=${genomeSource.ownFile}"
+ #else:
+ ##use precomputed indexes
+ "PLAYEROPTION::-f=${genomeSource.indices.fields.path}"
+ #end if
+
+ ##Required files
+ "PLAYEROPTION::-t=$target_file"
+ "PLAYEROPTION::-s=$alignment_file"
+ #if $controlSource.refControlSource == "history":
+ "PLAYEROPTION::-c=${controlSource.control_file}"
+ #else:
+ ##use precomputed indexes
+ "PLAYEROPTION::-c=${controlSource.indices.fields.path}"
+ #end if
+
+ ##Optional parameter
+
+ #if $option.option == "modify_parameters":
+
+ "PLAYEROPTION::--numBin=$option.numBin"
+ "PLAYEROPTION::--minReadDepth=$option.minReadDepth"
+ "PLAYEROPTION::--minNBases=$option.minNbases"
+
+ #if str($option.sam) == "true":
+ "PLAYEROPTION::--sam"
+ #end if
+
+ #if str($option.bed) == "true":
+ "PLAYEROPTION::--bed"
+ #end if
+
+ "PLAYEROPTION::--pval=$option.pval"
+ "PLAYEROPTION::--sampleName=$option.sampleName"
+
+ #if str($option.nomultimapped) == "true":
+ "PLAYEROPTION::--nomultimapped"
+ #end if
+
+ #if str($option.plot) == "true":
+ "PLAYEROPTION::--plot"
+ #end if
+
+ "PLAYEROPTION::--minExon=$option.minExon"
+ "PLAYEROPTION::--minControlRdForCall=$option.minControlRdForCall"
+ "PLAYEROPTION::--minTestRdForCall=$option.minTestRdForCall"
+ "PLAYEROPTION::--minAvgForCall=$option.minAvgForCall"
+ "PLAYEROPTION::--maxRegionSize=$option.maxRegionSize"
+ "PLAYEROPTION::--targetRegionSize=$option.targetRegionSize"
+
+ #if str($option.largedeletion) == "true":
+ "PLAYEROPTION::--largedeletion"
+ #end if
+
+ "PLAYEROPTION::--smallSegment=$option.smallSegment"
+ "PLAYEROPTION::--targetRegionSize=$option.targetRegionSize"
+ "PLAYEROPTION::--largeSegment=$option.largeSegment"
+ "PLAYEROPTION::--lrCallStart=$option.lrCallStart"
+ "PLAYEROPTION::--lrCallEnd=$option.lrCallEnd"
+ "PLAYEROPTION::--passSize=$option.passSize"
+ #end if
+
+ ##File to generate the bam list
+ CONTRAOUTPUT::$html_file
+ CONTRADIR::$html_file.files_path
+
+
+
+
+|
+
+
+**Reference**
+ http://contra-cnv.sourceforge.net/
+
+-----
+
+**What it does**
+
+CONTRA is a tool for copy number variation (CNV) detection for targeted resequencing data, such as data from whole-exome capture. CONTRA calls copy number gains and losses for each target region; its key strategies include the use of base-level log-ratios to remove GC-content bias, correction for an imbalanced library-size effect on log-ratios, and the estimation of log-ratio variations via binning and interpolation. It takes standard alignment formats (BAM/SAM) and outputs in variant call format (VCF 4.0) for easy integration with other next-generation sequencing analysis packages.
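+
+For example, a typical run of the underlying contra.py script from the command line; the file names below are hypothetical:
+
+::
+
+ contra.py --target target_regions.bed --test tumour.bam --control normal.bam --fasta hg19.fa --outFolder contra_out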
+
+
+-----
+
+**Required Parameters**
+
+::
+
+ -t, --target Target region definition file [BED format]
+
+ -s, --test Alignment file for the test sample [BAM/SAM]
+
+ -c, --control Alignment file for the control sample
+ [BAM/SAM/BED – baseline file]
+
+ --bed **option has to be supplied for control
+ with baseline file.**
+
+ -f, --fasta Reference genome [FASTA]
+
+ -o, --outFolder The folder name (and its path) to store the output
+ of the analysis (this new folder will be created;
+ an error message occurs if the folder exists)
+
+-----
+
+**Optional Parameters**
+
+::
+
+ --numBin Numbers of bins to group the regions. User can
+ specify multiple experiments with different numbers
+ of bins (comma separated). [Default: 20]
+
+ --minReadDepth The threshold for minimum read depth for each base
+ (see Step 2 in CONTRA workflow) [Default: 10]
+
+ --minNBases The threshold for minimum number of bases for each
+ target region (see Step 2 in CONTRA workflow)
+ [Default: 10]
+
+ --sam If specified, the test and control samples are in
+ SAM format. [Default: False] (BAM input is assumed
+ by default)
+
+ --bed If specified, the control will be a baseline file in
+ BED format. [Default: False]
+ Please refer to the Baseline Script section for
+ instructions on how to create baseline files from a
+ set of BAM files. A set of baseline files for
+ different platforms has also been provided on the
+ CONTRA download page.
+
+ --pval The p-value threshold for filtering. Based on Adjusted
+ P-Values. Only regions that pass this threshold will
+ be included in the VCF file. [Default: 0.05]
+
+ --sampleName The name to be appended to the front of the default output
+ name. By default, there will be nothing appended.
+
+ --nomultimapped The option to remove multi-mapped reads
+ (using SAMtools with mapping quality > 0).
+ [default: FALSE]
+
+ -p, --plot If specified, plots of log-ratio distribution for each
+ bin will be included in the output folder [default: FALSE]
+
+ --minExon Minimum number of exons in one bin (if fewer than this
+ number, bins that contain a small number of exons will
+ be merged into the adjacent bins) [Default : 2000]
+
+ --minControlRdForCall Minimum Control ReadDepth for call [Default: 5]
+
+ --minTestRdForCall Minimum Test ReadDepth for call [Default: 0]
+
+ --minAvgForCall Minimum average coverage for call [Default: 20]
+
+ --maxRegionSize Maximum region size in target region (for breaking
+ large regions into smaller regions. By default,
+ maxRegionSize=0 means no breakdown). [Default : 0]
+
+ --targetRegionSize Target region size for breakdown (if maxRegionSize
+ is non-zero) [Default: 200]
+
+ -l, --largeDeletion If specified, CONTRA will run large deletion analysis (CBS).
+ User must have DNAcopy R-library installed to run the
+ analysis. [False]
+
+ --smallSegment CBS segment size for calling large variations [Default : 1]
+
+ --largeSegment CBS segment size for calling large variations [Default : 25]
+
+ --lrCallStart Log ratios start range that will be used to call CNV
+ [Default : -0.3]
+
+ --lrCallEnd Log ratios end range that will be used to call CNV
+ [Default : 0.3]
+
+ --passSize Size of exons that passed the p-value threshold compared
+ to the original exon size [Default: 0.35]
+
+
+
+
diff -r 000000000000 -r 7564f3b1e675 Contra/contra_wrapper.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Contra/contra_wrapper.pl Thu Sep 13 02:31:43 2012 -0400
@@ -0,0 +1,94 @@
+use strict;
+use warnings;
+use File::Basename;
+use Cwd;
+use File::Path qw(make_path remove_tree);
+die qq(
+Bad number of inputs
+
+) if(!@ARGV);
+
+
+my $player_options = "";
+my $contra_output;
+my $contra_dir;
+
+my $dir = getcwd;
+my $variable;
+
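+# Galaxy passes each argument as a TYPE::value token; route it to the option
+# string, the HTML report file, or the output files directory.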
+foreach my $input (@ARGV)
+{
+ my @tmp = split "::", $input;
+
+ if($tmp[0] eq "PLAYEROPTION")
+ {
+ $variable = $tmp[1];
+ $variable =~ s/=/ /g;
+ print "$variable\n";
+ $player_options = "$player_options $variable";
+ }
+ elsif($tmp[0] eq "CONTRAOUTPUT")
+ {
+ $contra_output = $tmp[1];
+ }
+ elsif($tmp[0] eq "CONTRADIR")
+ {
+ $contra_dir = $tmp[1];
+ }
+ else
+ {
+ die("Unknown Input: $input\n");
+ }
+}
+
+
+my $working_dir = "CONTRA_OUTPUT";
+make_path($contra_dir);
+#remove extension
+
+#run contra
+system ("contra.py -o $working_dir $player_options > /dev/null 2>&1");
+
+
+#set html
+#print "$contra_output - $working_dir\n";
+open(HTML, ">$contra_output");
+print HTML "Contra: Copy Number Analysis for Targeted Resequencing\n";
+print HTML "Contra Output Files:\n";
+move_files($working_dir);
+print HTML "\n";
+close(HTML);
+
+sub move_files
+{
+ my $local_dir = $_[0];
+ opendir(DIR, $local_dir);
+ #print ("Openning: $local_dir\n");
+ my @FILES= readdir(DIR);
+ closedir(DIR);
+ foreach my $file (@FILES)
+ {
+ if ($file eq "." || $file eq "..")
+ {
+ #print ("./ or ../ skipped\n");
+ }
+ elsif (-d "$local_dir/$file")
+ {
+ #print ("moving to: $local_dir/$file\n");
+ move_files("$local_dir/$file");
+ }
+ elsif (-f "$local_dir/$file")
+ {
+ #print ("mv $local_dir/$file $contra_dir\n");
+ print HTML "$file\n";
+ system ("mv $local_dir/$file $contra_dir");
+ }
+ else
+ {
+ die("Unrecognized file generated: $file\n");
+ }
+
+
+ }
+
+}
+
diff -r 000000000000 -r 7564f3b1e675 Contra/scripts/__init__.py
diff -r 000000000000 -r 7564f3b1e675 Contra/scripts/assign_bin_number_v2.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Contra/scripts/assign_bin_number_v2.py Thu Sep 13 02:31:43 2012 -0400
@@ -0,0 +1,153 @@
+# ----------------------------------------------------------------------#
+# Copyright (c) 2011, Richard Lupat & Jason Li.
+#
+# > Source License <
+# This file is part of CONTRA.
+#
+# CONTRA is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# CONTRA is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with CONTRA. If not, see .
+#
+#
+#-----------------------------------------------------------------------#
+# Last Updated : 12 October 2011 16:43PM
+
+
+def assignBin(binNumber, srcFile, binFile, targetList, minExons):
+#def assignBin(binNumber, srcFile, binFile, minExons):
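+ # Assign each target region to an equal-width bin on the log2 scale of its average
+ # control coverage; bins with fewer than minExons regions are merged into neighbours.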
+ from math import log
+
+ src = file.readlines(open(srcFile))
+ #binOut = open(binFile, "w")
+
+ minExons = int(minExons)
+ count = 0
+ logcov_list = []
+
+ # Get the Log Coverage List for the normal sample
+ for exon in src:
+ exon = exon.split()
+ cov1 = float(exon[7])
+ cov2 = float(exon[8])
+ cov = (cov1 + cov2) / 2
+ if (cov > 0):
+ logcov = log(cov, 2)
+ else:
+ logcov = 0
+ logcov_list.append(logcov)
+ #print "Len of logcov_list:", len(logcov_list)
+
+ # Calculate the boundaries of the bins
+ minLog = min(logcov_list)
+ maxLog = max(logcov_list)
+ boundary = (maxLog-minLog)/binNumber
+ #print "Min, Max, Boundary, BinNumber: ", minLog, maxLog, boundary, binNumber
+
+
+ # Split exons to different bins
+ bin_list = []
+ boundary_dict = {}
+ for logcov in logcov_list:
+ i = 1
+ set_boundary = minLog + boundary
+ while (logcov > set_boundary):
+ i += 1
+ set_boundary = minLog + (boundary * i)
+ #boundary_dict[i] = set_boundary
+ bin_list.append(i)
+
+ for i in range(binNumber+2):
+ boundary_dict[i] = minLog + (boundary * i)
+
+
+ # Check the number of exons for each bin
+ # Merge with the adjacent bins if it is too small
+ for z in range(1, binNumber+1):
+ element = bin_list.count(z)
+ #print "Bin", z, "has", element, "elements"
+ if (element < minExons):
+ while (bin_list.count(z) != 0):
+ if (z != binNumber):
+ bin_list[bin_list.index(z)]+=1
+ else:
+ bin_list[bin_list.index(z)]-=1
+
+
+ # Check the number of exons in the last bin
+ last_bin_number = sorted(set(bin_list))[-1]
+ if len(set(bin_list)) > 1:
+ second_last_bin = sorted(set(bin_list))[-2]
+ else:
+ second_last_bin = last_bin_number
+ element = bin_list.count(last_bin_number)
+ if (element < minExons):
+ while (bin_list.count(last_bin_number) != 0):
+ if (last_bin_number != 1):
+ #bin_list[bin_list.index(last_bin_number)] -= 1
+ bin_list[bin_list.index(last_bin_number)] = second_last_bin
+
+ final_number_of_bin = len(set(bin_list))
+
+ # List out the binning boundaries in a txt file
+ boundary_list = [boundary_dict[x] for x in sorted(set(bin_list))]
+ i = 1
+
+ #boundary_file = binFile + str(final_number_of_bin) + ".boundaries.txt"
+ boundary_file = binFile + str(binNumber) + ".boundaries.txt"
+ boOut = open(boundary_file, "w")
+ boOut.write("\t".join([str(0), str(minLog)])+"\n")
+ for z in boundary_list:
+ if (i==final_number_of_bin):
+ z = maxLog
+ boOut.write("\t".join([str(i), str(z)])+"\n")
+ i += 1
+ boOut.close()
+
+ # Re-sort the bin numbers - get rid of gaps
+ curr_z = 1
+ bin_number_dict = {}
+ for z in sorted(set(bin_list)):
+ bin_number_dict[z] = curr_z
+ curr_z += 1
+
+
+ # Append the bin number to the original file
+ #binFile = binFile + str(final_number_of_bin)+".txt"
+ binFile = binFile + str(binNumber) + ".txt"
+ binOut = open(binFile, "w")
+
+ for exons in src:
+ exon = exons.split()
+ id = int(exon[0])
+ gene = exon[1]
+ exonNumber = exon[5]
+
+ target = targetList[int(id)-1]
+ if target.id == id:
+ chr = target.chr
+ oriStart = target.start
+ oriEnd = target.end
+
+ else:
+ print "ERROR..."
+
+ f_bin_number = str(bin_number_dict[bin_list[count]])
+ binOut.write("\t".join([exons.strip("\n"), f_bin_number,chr, oriStart, oriEnd]))
+ binOut.write("\n")
+ count += 1
+
+ binOut.close()
+ print "End of assign.bin.number.py with %s exons in %s bins" %(count, final_number_of_bin)
+
+
+ return final_number_of_bin
+
diff -r 000000000000 -r 7564f3b1e675 Contra/scripts/average_count.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Contra/scripts/average_count.py Thu Sep 13 02:31:43 2012 -0400
@@ -0,0 +1,195 @@
+# ----------------------------------------------------------------------#
+# Copyright (c) 2011, Richard Lupat & Jason Li.
+#
+# > Source License <
+# This file is part of CONTRA.
+#
+# CONTRA is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# CONTRA is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with CONTRA. If not, see .
+#
+#
+#-----------------------------------------------------------------------#
+# Last Updated : 28 Sep 2011 11:00AM
+
+import sys
+import math
+
+def getAverage(list1):
+ if len(list1) > 0:
+ return float(sum(list1))/len(list1)
+
+ return 0.0
+
+def getStdDev(list1, avg):
+ var = 0.0
+ for x in list1:
+ var += (avg - x) ** 2
+
+ if (len(list1)-1) > 0:
+ var /= (len(list1)-1)
+
+ return math.sqrt(var)
+
+def getMinMax(list1):
+ length = len(list1)
+ if length != 0:
+ min = list1[0]
+ max = list1[length-1]
+ else:
+ min = 0
+ max = 0
+
+ return min, max
+
+def getMedian(list1):
+ length = len(list1)
+ if length == 0:
+ median = 0
+ elif length % 2 == 0:
+ median = (list1[length/2]+list1[(length/2) - 1])/2
+ else:
+ median = list1[length/2]
+ return median
+
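+# Build a per-base coverage dictionary for one target region (same id and exon),
+# dividing each value by the normalisation factor r and adding the offset;
+# returns the index of the first line belonging to the next region.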
+def createDataDict(count, list1, r, offset, id_check, exon_check):
+ tDict = {}
+ tDictOri = {}
+
+ while count < len(list1):
+ t = list1[count].split()
+ tId = t[5]
+ tExon = t[6]
+
+ if (tId != id_check) or (tExon != exon_check):
+ return count, tDict, tDictOri
+
+ tStart = int(t[2])
+ tEnd = int(t[3])
+ tCov = float(t[4]) / r + offset #GeoMean Normalisation
+ tCovOri = float(t[4]) + offset #without scaling
+
+ #filling dict
+ while tStart < tEnd:
+ tDict[tStart] = tCov
+ tDictOri[tStart] = tCovOri #without scaling
+ tStart += 1
+
+ count += 1
+
+ return count, tDict, tDictOri
+
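+# Geometric-mean library-size normalisation: dividing each read count by
+# sqrt(test * control) scales both samples to a comparable depth.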
+def getFactor (val1, val2):
+ r = math.sqrt(val1 * val2)
+ r1 = val1/r
+ r2 = val2/r
+ return r1, r2
+
+def averageCount(tFile, nFile, averageOut, tReadCount, nReadCount, rd_threshold, minNBases):
+ tList = file.readlines(open(tFile))
+ nList = file.readlines(open(nFile))
+ # constant & counter
+ OFF = 1
+ tCount = 0
+ nCount = 0
+
+ # create and open files
+ output = open(averageOut, "w")
+
+ # Offset and Ratio for Geometric Mean Normalisation
+ r1, r2 = getFactor(tReadCount, nReadCount)
+ if rd_threshold > 0:
+ #OFF = 0
+ OFF = 0.5
+
+ #big loop
+ while (nCount < len(nList)):
+ # initialisation, get the chr, geneID, geneName
+ init = tList[tCount].split()
+ initial = init[5]
+ _exon = init[6]
+ chr = init[1]
+ gene = init[0]
+ _start = int(init[2])
+
+ # check if t-gene and n-gene refer to the same gene
+ check_init = nList[nCount].split()
+ if check_init[5] != initial or check_init[6] != _exon:
+ print "Initial: ", initial
+ print "Check_Init.id: ", check_init[5]
+ print "_Exon: ", _exon
+ print "Check_Init.exon: ", check_init[6]
+ print "Error. Comparing different Gene"
+ sys.exit(1)
+
+ # create data dictionary for tumour and normal data (per each regions/ exon)
+ tCount, tDict, tDictOri = createDataDict(tCount, tList, r1, OFF, initial, _exon)
+ nCount, nDict, nDictOri = createDataDict(nCount, nList, r2, OFF, initial, _exon)
+ # check number of bases in the both gene dict
+ if len(nDict) != len(tDict):
+ print "N:", len(nDict)
+ print "T:", len(tDict)
+ print "Error. Different length of dict"
+ sys.exit(1)
+
+ # compare coverage
+ count = _start
+ _max = max(nDict.keys())
+ ratioList = []
+ tumourList = []
+ normalList = []
+ tumourOriList = []
+ normalOriList = []
+ while count <= _max:
+ # get ratio
+ if (nDict[count] < rd_threshold) and (tDict[count] < rd_threshold):
+ ratio = 0.0
+ else:
+ if tDict[count] == 0:
+ tDict[count] = 0.5
+
+ ratio = math.log((float(tDict[count]) / nDict[count]),2)
+ tumourList.append(tDict[count])
+ tumourOriList.append(tDictOri[count])
+ normalList.append(nDict[count])
+ normalOriList.append(nDictOri[count])
+ ratioList.append(ratio)
+
+ count += 1
+
+ ratioLen = len(ratioList)
+
+ # get average
+ avg = getAverage(ratioList)
+ sd = getStdDev(ratioList, avg)
+ tumourAvg= str(round(getAverage(tumourList),3))
+ normalAvg= str(round(getAverage(normalList),3))
+ tumourOriAvg = str(round(getAverage(tumourOriList),3))
+ normalOriAvg = str(round(getAverage(normalOriList),3))
+
+ # get median
+ ratioList.sort()
+ min_logratio, max_logratio = getMinMax(ratioList)
+ median = getMedian(ratioList)
+
+ # write output
+ if ratioLen >= minNBases:
+ output.write(initial + "\t" + gene + "\t" + str(ratioLen) + "\t")
+ output.write(str(round(avg,3))+ "\t"+ str(count)+ "\t" + _exon + "\t")
+ output.write(str(round(sd ,3))+ "\t"+ tumourAvg + "\t" + normalAvg +"\t")
+ output.write(tumourOriAvg + "\t" + normalOriAvg + "\t")
+ output.write(str(round(median,3)) + "\t" + str(round(min_logratio,3)) + "\t")
+ output.write(str(round(max_logratio,3)) + "\n")
+
+ output.close()
+
+ #print "End of averageCount.py with the last target = '%s'" %(initial)
diff -r 000000000000 -r 7564f3b1e675 Contra/scripts/cn_analysis.v3.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Contra/scripts/cn_analysis.v3.R Thu Sep 13 02:31:43 2012 -0400
@@ -0,0 +1,278 @@
+# ----------------------------------------------------------------------#
+# Copyright (c) 2011, Richard Lupat & Jason Li.
+#
+# > Source License <
+# This file is part of CONTRA.
+#
+# CONTRA is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# CONTRA is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with CONTRA. If not, see .
+#
+#
+#-----------------------------------------------------------------------#
+# Last Updated : 31 Oct 2011 17:00PM
+
+
+# Parameters Parsing (from Command Line)
+options <- commandArgs(trailingOnly = T)
+bins = as.integer(options[1])
+rd.cutoff = as.integer(options[2])
+min.bases = as.integer(options[3])
+outf = options[4]
+sample.name = options[5]
+plotoption = options[6]
+actual.bin = as.numeric(options[7])
+min_normal_rd_for_call = as.numeric(options[8])
+min_tumour_rd_for_call = as.numeric(options[9])
+min_avg_cov_for_call = as.numeric(options[10])
+
+if (sample.name == "No-SampleName")
+ sample.name = ""
+
+if (sample.name != "")
+ sample.name = paste(sample.name, ".", sep="")
+
+# Setup output name
+out.f = paste(outf, "/table/", sample.name, "CNATable.", rd.cutoff,"rd.", min.bases,"bases.", bins,"bins.txt", sep="")
+pdf.out.f = paste(outf, "/plot/", sample.name, "densityplot.", bins, "bins.pdf", sep="")
+
+# Open and read input files
+# cnAverageFile = paste("bin", bins, ".txt", sep="")
+cnAverageFile = paste(outf,"/buf/bin",bins,".txt",sep="")
+boundariesFile = paste(outf,"/buf/bin",bins,".boundaries.txt",sep="")
+print (cnAverageFile)
+cn.average = read.delim(cnAverageFile, as.is=F, header=F)
+cn.boundary= read.delim(boundariesFile,as.is=F, header=F)
+
+# Apply thresholds and data grouping
+cn.average.aboveTs = cn.average[cn.average$V3>min.bases,]
+cn.average.list = as.matrix(cn.average.aboveTs$V4)
+
+# Get the mean and sd for each bins
+cn.average.mean = c()
+cn.average.sd = c()
+cn.average.log= c()
+
+# Density Plots for each bins
+if (plotoption == "True"){
+ pdf(pdf.out.f)
+}
+for (j in 1:actual.bin){
+ cn.average.nth = as.matrix(cn.average.aboveTs[cn.average.aboveTs$V15==j,]$V4)
+ cn.coverage.nth = as.matrix(cn.average.aboveTs[cn.average.aboveTs$V15==j,]$V11)
+ boundary.end = cn.boundary[cn.boundary$V1==j,]$V2
+ boundary.start = cn.boundary[cn.boundary$V1==(j-1),]$V2
+ boundary.mid = (boundary.end+boundary.start)/2
+ if (plotoption == "True") {
+ plot_title = paste("density: bin", bins, sep="")
+ #plot(density(cn.average.nth),xlim=c(-5,5), title=plot_title)
+ plot(density(cn.average.nth),xlim=c(-5,5))
+ }
+ cn.average.mean = c(cn.average.mean, mean(cn.average.nth))
+# cn.average.sd = c(cn.average.sd, sd(cn.average.nth))
+ cn.average.sd = c(cn.average.sd, apply(cn.average.nth,2,sd))
+ #cn.average.log = c(cn.average.log, boundary.mid)
+ cn.average.log = c(cn.average.log, log(mean(cn.coverage.nth),2))
+}
+if (plotoption == "True"){
+ dev.off()
+}
+
+# for point outside of boundaries
+if (bins > 1) {
+ boundary.first = cn.boundary[cn.boundary$V1==0,]$V2
+ boundary.last = cn.boundary[cn.boundary$V1==bins,]$V2
+
+ b.mean.y2 = cn.average.mean[2]
+ b.mean.y1 = cn.average.mean[1]
+ b.sd.y2 = cn.average.sd[2]
+ b.sd.y1 = cn.average.sd[1]
+ b.x2 = cn.average.log[2]
+ b.x1 = cn.average.log[1]
+
+ boundary.f.mean = (((b.mean.y2- b.mean.y1)/(b.x2-b.x1))*(boundary.first-b.x1))+b.mean.y1
+ boundary.f.sd = (((b.sd.y2- b.sd.y1)/(b.x2-b.x1))*(boundary.first-b.x1))+b.sd.y1
+
+ if (boundary.f.sd < 0){
+ boundary.f.sd = 0
+ }
+
+ b.mean.y2 = cn.average.mean[bins]
+ b.mean.y1 = cn.average.mean[bins-1]
+ b.sd.y2 = cn.average.sd[bins]
+ b.sd.y1 = cn.average.sd[bins-1]
+ b.x2 = cn.average.log[bins]
+ b.x1 = cn.average.log[bins-1]
+
+ boundary.l.mean = (((b.mean.y2- b.mean.y1)/(b.x2-b.x1))*(boundary.last-b.x1))+b.mean.y1
+ boundary.l.sd = (((b.sd.y2- b.sd.y1)/(b.x2-b.x1))*(boundary.last-b.x1))+b.sd.y1
+
+ #cn.average.log = c(boundary.first, cn.average.log, boundary.last)
+ #cn.linear.mean = c(boundary.f.mean, cn.average.mean, boundary.l.mean)
+ #cn.linear.sd = c(boundary.f.sd, cn.average.sd, boundary.l.sd)
+
+ cn.average.log = c(boundary.first, cn.average.log)
+ cn.linear.mean = c(boundary.f.mean, cn.average.mean)
+ cn.linear.sd = c(boundary.f.sd, cn.average.sd)
+
+}
+
+# Linear Interpolation
+if (bins > 1 ){
+ #print(cn.average.log)
+ #print(cn.linear.mean)
+ #print(cn.linear.sd)
+ mean.fn <- approxfun(cn.average.log, cn.linear.mean, rule=2)
+ sd.fn <- approxfun(cn.average.log, cn.linear.sd, rule=2)
+}
+
+
+# Put the data's details into matrices
+ids = as.matrix(cn.average.aboveTs$V1)
+exons = as.matrix(cn.average.aboveTs$V6)
+exons.pos = as.matrix(cn.average.aboveTs$V5)
+gs = as.matrix(cn.average.aboveTs$V2)
+number.bases = as.matrix(cn.average.aboveTs$V3)
+mean = as.matrix(cn.average.aboveTs$V4)
+sd = as.matrix(cn.average.aboveTs$V7)
+tumour.rd = as.matrix(cn.average.aboveTs$V8)
+tumour.rd.ori = as.matrix(cn.average.aboveTs$V10)
+normal.rd = as.matrix(cn.average.aboveTs$V9)
+normal.rd.ori = as.matrix(cn.average.aboveTs$V11)
+median = as.matrix(cn.average.aboveTs$V12)
+MinLogRatio = as.matrix(cn.average.aboveTs$V13)
+MaxLogRatio = as.matrix(cn.average.aboveTs$V14)
+Bin = as.matrix(cn.average.aboveTs$V15)
+Chr = as.matrix(cn.average.aboveTs$V16)
+OriStCoordinate = as.matrix(cn.average.aboveTs$V17)
+OriEndCoordinate= as.matrix(cn.average.aboveTs$V18)
+
+# Linear Fit
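+# Regress log-ratios on log-coverage and subtract the fitted trend, correcting the
+# imbalanced library-size effect before per-bin standard deviations are estimated.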
+logratios.mean = mean
+logcov.mean = log2((normal.rd + tumour.rd)/2)
+fit.mean = lm(logratios.mean ~ logcov.mean)
+fit.x = fit.mean$coefficient[1]
+fit.y = fit.mean$coefficient[2]
+
+adjusted.lr = rep(NA, length(logratios.mean))
+for (j in 1:length(logratios.mean)){
+ fitted.mean = fit.x + fit.y * logcov.mean[j]
+ adjusted.lr[j] = logratios.mean[j] - fitted.mean
+}
+
+fit.mean2 = lm(adjusted.lr ~ logcov.mean)
+fit.mean.a = fit.mean2$coefficient[1]
+fit.mean.b = fit.mean2$coefficient[2]
+
+fit.mean.fn <- function(x, fit.a, fit.b){
+ result = fit.a + fit.b * x
+ return (result)
+}
+
+# Adjust SD based on the new adjusted log ratios
+logratios.sd = c()
+logcov.bins.mean= c()
+for (j in 1:actual.bin){
+ lr.bins.mean = as.matrix(adjusted.lr[cn.average.aboveTs$V15==j])
+# logratios.sd = c(logratios.sd, sd(lr.bins.mean))
+ logratios.sd = c(logratios.sd, apply(lr.bins.mean,2,sd))
+
+ cn.coverage.tumour.nth = as.matrix(cn.average.aboveTs[cn.average.aboveTs$V15==j,]$V8)
+ cn.coverage.normal.nth = as.matrix(cn.average.aboveTs[cn.average.aboveTs$V15==j,]$V9)
+ cn.coverage.nth = (cn.coverage.tumour.nth + cn.coverage.normal.nth) /2
+ logcov.bins.mean= c(logcov.bins.mean, log2(mean(cn.coverage.nth)))
+
+}
+
+logratios.sd.ori = logratios.sd
+if (length(logratios.sd) > 2) {
+ logratios.sd = logratios.sd[-length(logratios.sd)]
+}
+
+logcov.bins.mean.ori = logcov.bins.mean
+if (length(logcov.bins.mean) > 2){
+ logcov.bins.mean= logcov.bins.mean[-length(logcov.bins.mean)]
+}
+
+fit.sd = lm(log2(logratios.sd) ~ logcov.bins.mean)
+fit.sd.a = fit.sd$coefficient[1]
+fit.sd.b = fit.sd$coefficient[2]
+
+fit.sd.fn <- function(x, fit.a, fit.b){
+ result = 2 ^ (fit.mean.fn(x, fit.a, fit.b))
+ return (result)
+}
+
+# Get the p-values and call gain/loss
+# using the average and sd from each bin
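+# pnorm gives a lower-tail probability; values above 0.5 are flipped and labelled
+# "gain", and the smaller tail is doubled to give a two-sided p-value.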
+pVal.list = c()
+gain.loss = c()
+
+for (i in 1:nrow(cn.average.list)){
+ #print (i)
+ #logratio = cn.average.list[i]
+ #logcov = log(normal.rd.ori[i],2)
+ logratio = adjusted.lr[i]
+ logcov = logcov.mean[i]
+ exon.bin = Bin[i]
+
+ if (length(logratios.sd) > 1){
+ #pVal <- pnorm(logratio, fit.mean.fn(logcov, fit.mean.a, fit.mean.b), fit.sd.fn(logcov, fit.sd.a, fit.sd.b))
+ pVal <- pnorm(logratio, fit.mean.fn(logcov, fit.mean.a, fit.mean.b), sd.fn(logcov))
+ } else {
+ pVal <- pnorm(logratio, 0, logratios.sd[exon.bin])
+ }
+
+ if (pVal > 0.5){
+ pVal = 1-pVal
+ gain.loss = c(gain.loss, "gain")
+ } else {
+ gain.loss = c(gain.loss, "loss")
+ }
+ pVal.list = c(pVal.list, pVal*2)
+}
+
+# Get the adjusted P Values
+adjusted.pVal.list = p.adjust(pVal.list, method="BH")
+
+# Write the output into a tab-delimited text files
+outdf=data.frame(Targeted.Region.ID=ids,Exon.Number=exons,Gene.Sym=gs,Chr, OriStCoordinate, OriEndCoordinate, Mean.of.LogRatio=cn.average.list, Adjusted.Mean.of.LogRatio=adjusted.lr, SD.of.LogRatio=sd, Median.of.LogRatio=median, number.bases, P.Value=pVal.list ,Adjusted.P.Value=adjusted.pVal.list , gain.loss, tumour.rd, normal.rd, tumour.rd.ori, normal.rd.ori, MinLogRatio, MaxLogRatio, BinNumber = Bin)
+
+#min_normal_rd_for_call=5
+#min_tumour_rd_for_call=0
+#min_avg_cov_for_call=20
+outdf$tumour.rd.ori = outdf$tumour.rd.ori-0.5
+outdf$normal.rd.ori = outdf$normal.rd.ori-0.5
+wh.to.excl = outdf$normal.rd.ori < min_normal_rd_for_call
+wh.to.excl = wh.to.excl | outdf$tumour.rd.ori < min_tumour_rd_for_call
+wh.to.excl = wh.to.excl | (outdf$tumour.rd.ori+outdf$normal.rd.ori)/2 < min_avg_cov_for_call
+outdf$P.Value[wh.to.excl]=NA
+outdf$Adjusted.P.Value[wh.to.excl]=NA
+
+
+write.table(outdf,out.f,sep="\t",quote=F,row.names=F,col.names=T)
+
+#Plotting SD
+#a.sd.fn = rep(fit.sd.a, length(logratios.sd.ori))
+#b.sd.fn = rep(fit.sd.b, length(logratios.sd.ori))
+#sd.after.fit = fit.sd.fn(logcov.bins.mean.ori, fit.sd.a, fit.sd.b)
+#sd.out.f = paste(outf, "/plot/", sample.name, "sd.data_fit.", bins, "bins.txt", sep="")
+#sd.outdf = data.frame(SD.Before.Fit = logratios.sd.ori, Log.Coverage = logcov.bins.mean.ori, SD.After.Fit = sd.after.fit, a.for.fitting=a.sd.fn, b.for.fitting=b.sd.fn)
+#write.table(sd.outdf, sd.out.f,sep="\t", quote=F, row.names=F, col.names=T)
+
+
+#End of the script
+print ("End of cn_analysis.R")
+print (i)
+
+
+
diff -r 000000000000 -r 7564f3b1e675 Contra/scripts/cn_analysis.v4.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Contra/scripts/cn_analysis.v4.R Thu Sep 13 02:31:43 2012 -0400
@@ -0,0 +1,228 @@
+# ----------------------------------------------------------------------#
+# Copyright (c) 2011, Richard Lupat & Jason Li.
+#
+# > Source License <
+# This file is part of CONTRA.
+#
+# CONTRA is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# CONTRA is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with CONTRA. If not, see <http://www.gnu.org/licenses/>.
+#
+#
+#-----------------------------------------------------------------------#
+# Last Updated : 30 Sept 2011 17:00PM
+
+
+# Parameters Parsing (from Command Line)
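+# Positional arguments: number of bins, read-depth cutoff, minimum bases per
+# region, output folder, sample name, plot option, actual number of bins, and
+# the minimum normal, tumour and average read depths required to make a call.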
+options <- commandArgs(trailingOnly = T)
+bins = as.numeric(options[1])
+rd.cutoff = as.numeric(options[2])
+min.bases = as.numeric(options[3])
+outf = options[4]
+sample.name = options[5]
+plotoption = options[6]
+actual.bin = as.numeric(options[7])
+min_normal_rd_for_call = as.numeric(options[8])
+min_tumour_rd_for_call = as.numeric(options[9])
+min_avg_cov_for_call = as.numeric(options[10])
+
+if (sample.name == "No-SampleName")
+ sample.name = ""
+
+if (sample.name != "")
+ sample.name = paste(sample.name, ".", sep="")
+
+# Setup output name
+out.f = paste(outf, "/table/", sample.name, "CNATable.", rd.cutoff,"rd.", min.bases,"bases.", bins,"bins.txt", sep="")
+pdf.out.f = paste(outf, "/plot/", sample.name, "densityplot.", bins, "bins.pdf", sep="")
+
+# Open and read input files
+# cnAverageFile = paste("bin", bins, ".txt", sep="")
+cnAverageFile = paste(outf,"/buf/bin",bins,".txt",sep="")
+boundariesFile = paste(outf,"/buf/bin",bins,".boundaries.txt",sep="")
+print (cnAverageFile)
+cn.average = read.delim(cnAverageFile, as.is=F, header=F)
+cn.boundary= read.delim(boundariesFile,as.is=F, header=F)
+
+# Apply thresholds and data grouping
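+# Regions covering no more than min.bases bases (column V3) are excluded;
+# V4 holds the mean log-ratio of each remaining region.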
+cn.average.aboveTs = cn.average[cn.average$V3>min.bases,]
+cn.average.list = as.matrix(cn.average.aboveTs$V4)
+
+# Get the mean and SD for each bin
+cn.average.mean = c()
+cn.average.sd = c()
+cn.average.log= c()
+
+# Density plots for each bin
+if (plotoption == "True"){
+ pdf(pdf.out.f)
+}
+for (j in 1:actual.bin){
+ cn.average.nth = as.matrix(cn.average.aboveTs[cn.average.aboveTs$V15==j,]$V4)
+ cn.coverage.nth = as.matrix(cn.average.aboveTs[cn.average.aboveTs$V15==j,]$V11)
+ boundary.end = cn.boundary[cn.boundary$V1==j,]$V2
+ boundary.start = cn.boundary[cn.boundary$V1==(j-1),]$V2
+ boundary.mid = (boundary.end+boundary.start)/2
+ if (plotoption == "True") {
+ plot_title = paste("density: bin", bins, sep="")
+ #plot(density(cn.average.nth),xlim=c(-5,5), title=plot_title)
+ plot(density(cn.average.nth),xlim=c(-5,5))
+ }
+ cn.average.mean = c(cn.average.mean, mean(cn.average.nth))
+# cn.average.sd = c(cn.average.sd, sd(cn.average.nth))
+ cn.average.sd = c(cn.average.sd, apply(cn.average.nth,2,sd))
+ #cn.average.log = c(cn.average.log, boundary.mid)
+ cn.average.log = c(cn.average.log, log(mean(cn.coverage.nth),2))
+}
+if (plotoption == "True"){
+ dev.off()
+}
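+# cn.average.mean and cn.average.sd now hold the per-bin mean and SD of the
+# log-ratios, and cn.average.log the log2 of each bin's mean coverage.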
+
+# Put the data's details into matrices
+ids = as.matrix(cn.average.aboveTs$V1)
+exons = as.matrix(cn.average.aboveTs$V6)
+exons.pos = as.matrix(cn.average.aboveTs$V5)
+gs = as.matrix(cn.average.aboveTs$V2)
+number.bases = as.matrix(cn.average.aboveTs$V3)
+mean = as.matrix(cn.average.aboveTs$V4)
+sd = as.matrix(cn.average.aboveTs$V7)
+tumour.rd = as.matrix(cn.average.aboveTs$V8)
+tumour.rd.ori = as.matrix(cn.average.aboveTs$V10)
+normal.rd = as.matrix(cn.average.aboveTs$V9)
+normal.rd.ori = as.matrix(cn.average.aboveTs$V11)
+median = as.matrix(cn.average.aboveTs$V12)
+MinLogRatio = as.matrix(cn.average.aboveTs$V13)
+MaxLogRatio = as.matrix(cn.average.aboveTs$V14)
+Bin = as.matrix(cn.average.aboveTs$V15)
+Chr = as.matrix(cn.average.aboveTs$V16)
+OriStCoordinate = as.matrix(cn.average.aboveTs$V17)
+OriEndCoordinate= as.matrix(cn.average.aboveTs$V18)
+
+# Linear Fit
+logratios.mean = mean
+logcov.mean = log2((normal.rd + tumour.rd)/2)
+fit.mean = lm(logratios.mean ~ logcov.mean)
+fit.x = fit.mean$coefficient[1]
+fit.y = fit.mean$coefficient[2]
+
+adjusted.lr = rep(NA, length(logratios.mean))
+for (j in 1:length(logratios.mean)){
+ fitted.mean = fit.x + fit.y * logcov.mean[j]
+ adjusted.lr[j] = logratios.mean[j] - fitted.mean
+}
+
+fit.mean2 = lm(adjusted.lr ~ logcov.mean)
+fit.mean.a = fit.mean2$coefficient[1]
+fit.mean.b = fit.mean2$coefficient[2]
+
+fit.mean.fn <- function(x, fit.a, fit.b){
+ result = fit.a + fit.b * x
+ return (result)
+}
+
+# Adjust SD based on the new adjusted log ratios
+logratios.sd = c()
+logcov.bins.mean= c()
+for (j in 1:actual.bin){
+ lr.bins.mean = as.matrix(adjusted.lr[cn.average.aboveTs$V15==j])
+# logratios.sd = c(logratios.sd, sd(lr.bins.mean))
+ logratios.sd = c(logratios.sd, apply(lr.bins.mean,2,sd))
+
+ cn.coverage.tumour.nth = as.matrix(cn.average.aboveTs[cn.average.aboveTs$V15==j,]$V8)
+ cn.coverage.normal.nth = as.matrix(cn.average.aboveTs[cn.average.aboveTs$V15==j,]$V9)
+ cn.coverage.nth = (cn.coverage.tumour.nth + cn.coverage.normal.nth) /2
+ logcov.bins.mean= c(logcov.bins.mean, log2(mean(cn.coverage.nth)))
+
+}
+
+logratios.sd.ori = logratios.sd
+if (length(logratios.sd) > 2) {
+ logratios.sd = logratios.sd[-length(logratios.sd)]
+}
+
+logcov.bins.mean.ori = logcov.bins.mean
+if (length(logcov.bins.mean) > 2){
+ logcov.bins.mean= logcov.bins.mean[-length(logcov.bins.mean)]
+}
+
+fit.sd = lm(log2(logratios.sd) ~ logcov.bins.mean)
+fit.sd.a = fit.sd$coefficient[1]
+fit.sd.b = fit.sd$coefficient[2]
+
+fit.sd.fn <- function(x, fit.a, fit.b){
+ result = 2 ^ (fit.mean.fn(x, fit.a, fit.b))
+ return (result)
+}
+
+# Get the P values and call gain/loss
+# using the average and SD from each bin
+pVal.list = c()
+gain.loss = c()
+
+for (i in 1:nrow(cn.average.list)){
+ #print (i)
+ #logratio = cn.average.list[i]
+ #logcov = log(normal.rd.ori[i],2)
+ logratio = adjusted.lr[i]
+ logcov = logcov.mean[i]
+ exon.bin = Bin[i]
+
+ if (length(logratios.sd) > 1){
+ pVal <- pnorm(logratio, fit.mean.fn(logcov, fit.mean.a, fit.mean.b), fit.sd.fn(logcov, fit.sd.a, fit.sd.b))
+ } else {
+ pVal <- pnorm(logratio, 0, logratios.sd[exon.bin])
+ }
+
+ if (pVal > 0.5){
+ pVal = 1-pVal
+ gain.loss = c(gain.loss, "gain")
+ } else {
+ gain.loss = c(gain.loss, "loss")
+ }
+ pVal.list = c(pVal.list, pVal*2)
+}
+
+# Get the adjusted P Values
+adjusted.pVal.list = p.adjust(pVal.list, method="BH")
+
+# Write the output into a tab-delimited text file
+outdf=data.frame(Targeted.Region.ID=ids,Exon.Number=exons,Gene.Sym=gs,Chr, OriStCoordinate, OriEndCoordinate, Mean.of.LogRatio=cn.average.list, Adjusted.Mean.of.LogRatio=adjusted.lr, SD.of.LogRatio=sd, Median.of.LogRatio=median, number.bases, P.Value=pVal.list ,Adjusted.P.Value=adjusted.pVal.list , gain.loss, tumour.rd, normal.rd, tumour.rd.ori, normal.rd.ori, MinLogRatio, MaxLogRatio, BinNumber = Bin)
+
+#min_normal_rd_for_call=5
+#min_tumour_rd_for_call=0
+#min_avg_cov_for_call=20
+outdf$tumour.rd.ori = outdf$tumour.rd.ori-0.5
+outdf$normal.rd.ori = outdf$normal.rd.ori-0.5
+wh.to.excl = outdf$normal.rd.ori < min_normal_rd_for_call
+wh.to.excl = wh.to.excl | outdf$tumour.rd.ori < min_tumour_rd_for_call
+wh.to.excl = wh.to.excl | (outdf$tumour.rd.ori+outdf$normal.rd.ori)/2 < min_avg_cov_for_call
+outdf$P.Value[wh.to.excl]=NA
+outdf$Adjusted.P.Value[wh.to.excl]=NA
+
+
+write.table(outdf,out.f,sep="\t",quote=F,row.names=F,col.names=T)
+
+#Plotting SD
+#a.sd.fn = rep(fit.sd.a, length(logratios.sd.ori))
+#b.sd.fn = rep(fit.sd.b, length(logratios.sd.ori))
+#sd.after.fit = fit.sd.fn(logcov.bins.mean.ori, fit.sd.a, fit.sd.b)
+#sd.out.f = paste(outf, "/plot/", sample.name, "sd.data_fit.", bins, "bins.txt", sep="")
+#sd.outdf = data.frame(SD.Before.Fit = logratios.sd.ori, Log.Coverage = logcov.bins.mean.ori, SD.After.Fit = sd.after.fit, a.for.fitting=a.sd.fn, b.for.fitting=b.sd.fn)
+#write.table(sd.outdf, sd.out.f,sep="\t", quote=F, row.names=F, col.names=T)
+
+
+#End of the script
+print ("End of cn_analysis.R")
+print (i)
+
+
+
diff -r 000000000000 -r 7564f3b1e675 Contra/scripts/cn_apply_threshold.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Contra/scripts/cn_apply_threshold.py Thu Sep 13 02:31:43 2012 -0400
@@ -0,0 +1,111 @@
+# ----------------------------------------------------------------------#
+# Copyright (c) 2011, Richard Lupat & Jason Li.
+#
+# > Source License <
+# This file is part of CONTRA.
+#
+# CONTRA is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# CONTRA is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with CONTRA. If not, see <http://www.gnu.org/licenses/>.
+#
+#
+#-----------------------------------------------------------------------#
+# Last Updated : 12 Oct 2011 11:00AM
+
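+# applyThreshold keeps exons whose adjusted p-value is at or below the given
+# threshold, writes them to the filtered details table and a BED file, and
+# merges consecutive kept exons on the same chromosome with the same gain/loss
+# status (and a gap below maxGap) into the LargeVariations table.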
+def applyThreshold(outputName, bufTable, threshold, maxGap):
+ srcFile = outputName + ".txt"
+ outFile = bufTable + ".LargeVariations.txt"
+ bedFile = bufTable + ".BED"
+ fFile = outputName + ".DetailsFILTERED.txt"
+ ts = float(threshold)
+
+ # Read and open files
+ srcTable = file.readlines(open(srcFile))
+ outTable = open(outFile, "w")
+ bedOut = open(bedFile, "w")
+ filteredTable = open(fFile, "w")
+
+
+ #header
+ outTable.write("Chr \tStartCoordinate \tEndCoordinate \tGenes \tGain.Loss \n")
+ filteredTable.write(srcTable[0])
+
+ prevChr = ''
+ prevStatus = ''
+ prevEnd = -1
+ genes = []
+ chrList = []
+
+ for exons in srcTable:
+ exon = exons.split()
+ try:
+ adjPVal = float(exon[12])
+ except:
+ continue
+
+ if adjPVal <= ts:
+ chr = exon[3]
+ gene = exon[2]
+ status = exon[13]
+ start = exon[4]
+ end = exon[5]
+
+ # For first row
+ if prevEnd == -1:
+ gap = 0
+ else:
+				gap = int(start) - int(prevEnd)	# distance from the previous significant exon
+
+ # Write Filtered Table
+ filteredTable.write(exons)
+
+ # Write Bed File
+ bedOut.write(chr.strip("chr") +"\t" +start +"\t"+ end+"\t"+
+ chr.strip("chr")+":"+start+"-"+end+":"+str(adjPVal)+"\n")
+
+ if prevChr == '' and prevStatus == '':
+ if chr not in chrList:
+ print chr
+ chrList.append(chr)
+ elif (chr == prevChr) and (status == prevStatus) and (gap < maxGap):
+ start = prevStart
+ else:
+ outTable.write(prevChr +"\t" +prevStart +"\t" +prevEnd + "\t")
+ for gsym in genes:
+ outTable.write(gsym + ", ")
+ outTable.write("\t" + prevStatus + "\n")
+ genes=[]
+
+ if gene not in genes:
+ genes.append(gene)
+ prevChr = chr
+ prevStatus = status
+ prevStart = start
+ prevEnd = end
+ elif len(genes) > 0:
+ outTable.write(prevChr +"\t" +prevStart +"\t" +prevEnd + "\t")
+ for gsym in genes:
+ outTable.write(gsym + ", " )
+ outTable.write("\t" + prevStatus + "\n")
+ prevChr = ''
+ prevStatus = ''
+ genes = []
+
+ if len(genes) > 0:
+ outTable.write(prevChr +"\t" +prevStart +"\t" +prevEnd + "\t")
+ for gsym in genes:
+ outTable.write(gsym + ", ")
+ outTable.write("\t" + prevStatus + "\n")
+
+ filteredTable.close()
+ bedOut.close()
+ outTable.close()
diff -r 000000000000 -r 7564f3b1e675 Contra/scripts/convert_gene_coordinate.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Contra/scripts/convert_gene_coordinate.py Thu Sep 13 02:31:43 2012 -0400
@@ -0,0 +1,218 @@
+# ----------------------------------------------------------------------#
+# Copyright (c) 2011, Richard Lupat & Jason Li.
+#
+# > Source License <
+# This file is part of CONTRA.
+#
+# CONTRA is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# CONTRA is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with CONTRA. If not, see <http://www.gnu.org/licenses/>.
+#
+#
+#-----------------------------------------------------------------------#
+# Last Updated : 03 Oct 2011 11:00AM
+
+import sys
+
+def outputwrite(output, gene,chr,a,b,c,id,n,sOri, eOri):
+ id = str(id)
+ n = str(n)
+ output.write(gene+"\t"+chr+"\t"+a+"\t"+b+"\t"+c+"\t"+id+"\t"+n+"\t"+sOri+"\t"+eOri+"\n")
+
+def outputwrite2(output2, chr,c,sOri, eOri):
+ output2.write(chr+"\t"+sOri+"\t"+eOri+"\t"+c+"\n")
+
+
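+# Walk the per-chromosome coverage files under chr/ and remap each target's
+# exon coordinates onto a cumulative reference coordinate, writing per-interval
+# coverage to geneRefCoverage.txt and geneRefCoverage2.txt.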
+def convertGeneCoordinate(targetList, bufLocFolder):
+ inputfile2 = bufLocFolder + "chr/"
+ outputfile = bufLocFolder + "geneRefCoverage.txt"
+ outputfile2= bufLocFolder + "geneRefCoverage2.txt"
+
+ output= open(outputfile,"w")
+ output2 = open(outputfile2 , "w")
+
+ rn = 1
+ prevchr = ""
+ for target in targetList:
+ chr = target.chr
+ starts = target.oriStart.split(",")
+ ends = target.oriEnd.split(",")
+
+ if ((len(chr) > 5) or (chr[len(chr)-1] == "M")):
+ continue
+
+ if (prevchr != chr):
+ print chr #progress checking
+ prevchr = chr
+ t = 0
+ covFile = file.readlines(open(inputfile2+chr+".txt","r"))
+
+ for n in range(0,target.numberExon):
+ if t >= len(covFile):
+ break
+ cov = covFile[t].split()
+ while ((int(starts[n]) < int(cov[1])) or (int(starts[n]) >= int(cov[2]))):
+ if (int(cov[1]) > int(starts[n])):
+ t-=1
+ else:
+ t+=1
+ cov = covFile[t].split()
+
+ while ((int(ends[n]) < int(cov[1])) or (int(ends[n]) > int(cov[2]))):
+ # print output #
+ if (rn == 1):
+ prev = target.id
+ nID = target.id
+ if (nID != prev):
+ rn = 1
+ ref1 = str(rn)
+ ref2 = str(int(cov[2]) - int(starts[n]) + rn)
+ outputwrite(output, target.gene,chr,ref1,ref2,cov[3],target.id,n,starts[n],cov[2])
+
+ outputwrite2(output2, chr,cov[3],starts[n],cov[2])
+
+
+ rn = int(ref2)
+ prev = nID
+ # -- #
+
+ # get to the next line of inputfile#
+ t+= 1
+ cov = covFile[t].split()
+ starts[n] = cov[1]
+
+ #print output #
+ if (t == 0) and (t >= len(covFile)):
+ continue
+
+ if (rn == 1):
+ prev = target.id
+ nID = target.id
+ if (nID != prev):
+ rn = 1
+ ref1 = str(rn)
+ ref2 = str(int(ends[n]) - int(starts[n]) + rn)
+ outputwrite(output, target.gene, chr, ref1, ref2, cov[3], target.id, n, starts[n], ends[n])
+
+ outputwrite2(output2, chr, cov[3], starts[n], ends[n])
+
+
+ rn = int(ref2)
+ prev = nID
+ # -- #
+ output.close()
+ output2.close()
+
+
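+# Variant of convertGeneCoordinate that, in addition to the per-base coverage
+# output, accumulates the total read depth over each target and writes the
+# per-target average depth and length to geneRefCoverage_targetAverage.txt.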
+def convertGeneCoordinate2(targetList, bufLocFolder):
+ inputfile2 = bufLocFolder + "chr/"
+ outputfile = bufLocFolder + "geneRefCoverage.txt"
+ outputfile_avg = bufLocFolder + "geneRefCoverage_targetAverage.txt"
+
+ output= open(outputfile,"w")
+ output_avg = open(outputfile_avg,"w")
+
+ rn = 1
+ prevchr = ""
+ for target in targetList:
+ chr = target.chr
+ starts = target.oriStart.split(",")
+ ends = target.oriEnd.split(",")
+ target_ttl_readdepth = 0
+ starts_leftmost = starts[0]
+
+
+ if ((len(chr) > 5) or (chr[len(chr)-1] == "M")):
+ continue
+
+ if (prevchr != chr):
+ print chr #progress checking
+ prevchr = chr
+ t = 0
+ covFile = file.readlines(open(inputfile2+chr+".txt","r"))
+
+ for n in range(0,target.numberExon):
+ if t >= len(covFile):
+ break
+ cov = covFile[t].split()
+ while ((int(starts[n]) < int(cov[1])) or (int(starts[n]) >= int(cov[2]))):
+ if (int(cov[1]) > int(starts[n])): t-=1
+ else:
+ t+=1
+ cov = covFile[t].split()
+
+ while ((int(ends[n]) < int(cov[1])) or (int(ends[n]) > int(cov[2]))):
+ # print output #
+ if (rn == 1):
+ prev = target.id
+ nID = target.id
+ if (nID != prev):
+ rn = 1
+ ref1 = str(rn)
+ ref2 = str(int(cov[2]) - int(starts[n]) + rn)
+ outputwrite2(output, chr,cov[3],starts[n],cov[2])
+ tmprange=int(cov[2])-int(starts[n])+1
+ target_ttl_readdepth+=int(cov[3])*tmprange
+ #target_length+=tmprange
+
+ rn = int(ref2)
+ prev = nID
+ # -- #
+
+ # get to the next line of inputfile#
+ t+= 1
+ cov = covFile[t].split()
+ starts[n] = cov[1]
+
+ #print output #
+ if (t == 0) and (t >= len(covFile)):
+ continue
+
+ if (rn == 1):
+ prev = target.id
+ nID = target.id
+ if (nID != prev):
+ rn = 1
+ ref1 = str(rn)
+ ref2 = str(int(ends[n]) - int(starts[n]) + rn)
+ outputwrite2(output, chr, cov[3], starts[n], ends[n])
+ tmprange=int(ends[n])-int(starts[n])+1
+ target_ttl_readdepth+=int(cov[3])*tmprange
+ #target_length+=tmprange
+ target_length = int(ends[n])-int(starts_leftmost)+1
+ output_avg.write("\t".join([chr,starts_leftmost,ends[n],str(target_ttl_readdepth/target_length),str(target_length)])+"\n")
+
+ rn = int(ref2)
+ prev = nID
+ # -- #
+ output.close()
+ output_avg.close()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r 7564f3b1e675 Contra/scripts/convert_targeted_regions.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Contra/scripts/convert_targeted_regions.py Thu Sep 13 02:31:43 2012 -0400
@@ -0,0 +1,71 @@
+# ----------------------------------------------------------------------#
+# Copyright (c) 2011, Richard Lupat & Jason Li.
+#
+# > Source License <
+# This file is part of CONTRA.
+#
+# CONTRA is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# CONTRA is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with CONTRA. If not, see <http://www.gnu.org/licenses/>.
+#
+#
+#-----------------------------------------------------------------------#
+# Last Updated : 28 Mar 2011 11:00AM
+
+class Target:
+ """
+ Class for target regions
+ """
+
+ population = 0
+
+ def __init__(self):
+ self.id = 0
+ self.gene = "unknown"
+ self.chr = "chr1"
+ self.start = 0
+ self.end = 0
+ self.numberExon = 0
+ self.oriStart = 0
+ self.oriEnd = 0
+
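+# Read a BED-style target file and return a list of Target objects with
+# sequential ids; chromosome names are normalised to a "chr" prefix and the
+# gene symbol defaults to "unknown" when the fourth column is missing.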
+def convertTarget(target):
+ targets = open(target)
+
+ targetList = []
+
+ count = 0
+ for region in targets:
+ region = region.split()
+ chr = "chr" + region[0].strip("chr")
+ start = region[1]
+ end = region[2]
+ try:
+ gene = region[3]
+ except:
+ gene = "unknown"
+ count += 1
+
+ aTarget = Target()
+ aTarget.id = count
+ aTarget.gene = gene
+ aTarget.chr = chr
+ aTarget.start = start
+ aTarget.end = end
+ aTarget.numberExon = 1
+ aTarget.oriStart = start
+ aTarget.oriEnd = end
+
+ targetList.append(aTarget)
+
+
+ return targetList
diff -r 000000000000 -r 7564f3b1e675 Contra/scripts/count_libsize.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Contra/scripts/count_libsize.py Thu Sep 13 02:31:43 2012 -0400
@@ -0,0 +1,38 @@
+# ----------------------------------------------------------------------#
+# Copyright (c) 2011, Richard Lupat & Jason Li.
+#
+# > Source License <
+# This file is part of CONTRA.
+#
+# CONTRA is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# CONTRA is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with CONTRA. If not, see <http://www.gnu.org/licenses/>.
+#
+#
+#-----------------------------------------------------------------------#
+# Last Updated : 05 October 2011 16:43PM
+
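+# The library size is estimated as the coverage-weighted sum of interval
+# lengths over the whole BEDGRAPH file.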
+def get_libsize(bedgraph_file):
+ bedgraph = open(bedgraph_file)
+ libsize = 0
+ for line in bedgraph:
+ line = line.split()
+ chr = line[0]
+ start = int(line[1])
+ end = int(line[2])
+ cov = float(line[3])
+ #cov = int(line[3])
+
+ libsize += (end-start)*cov
+
+ bedgraph.close()
+ return int(libsize)
diff -r 000000000000 -r 7564f3b1e675 Contra/scripts/get_chr_length.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Contra/scripts/get_chr_length.py Thu Sep 13 02:31:43 2012 -0400
@@ -0,0 +1,43 @@
+# ----------------------------------------------------------------------#
+# Copyright (c) 2011, Richard Lupat & Jason Li.
+#
+# > Source License <
+# This file is part of CONTRA.
+#
+# CONTRA is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# CONTRA is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with CONTRA. If not, see <http://www.gnu.org/licenses/>.
+#
+#
+#-----------------------------------------------------------------------#
+# Last Updated : 05 October 2011 16:43PM
+
+
+import subprocess, shlex
+
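+# Extract the @SQ lines from the BAM header via "samtools view -H" and write
+# tab-separated chromosome name/length pairs to the genome file.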
+def get_genome(srcFile, genomeOut):
+ genome = open(genomeOut, "w")
+
+ args = shlex.split("samtools view -H %s" %(srcFile))
+ raw_header = subprocess.Popen(args, stdout = subprocess.PIPE).communicate()[0]
+ headers = raw_header.split("\n")
+
+ for header in headers:
+ header = header.split("\t")
+ if header[0][1:] != "SQ":
+ continue
+
+ genome.write(header[1].strip("SN:") + "\t" + header[2].strip("LN:") + "\n")
+
+ genome.close()
+
+
diff -r 000000000000 -r 7564f3b1e675 Contra/scripts/large_region_cbs.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Contra/scripts/large_region_cbs.R Thu Sep 13 02:31:43 2012 -0400
@@ -0,0 +1,148 @@
+# ----------------------------------------------------------------------#
+# Copyright (c) 2011, Richard Lupat & Jason Li.
+#
+# > Source License <
+# This file is part of CONTRA.
+#
+# CONTRA is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# CONTRA is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with CONTRA. If not, see <http://www.gnu.org/licenses/>.
+#
+#
+#-----------------------------------------------------------------------#
+# Last Updated : 12 October 2011 16:43PM
+
+library(DNAcopy)
+options <- commandArgs(trailingOnly = T)
+logcov.file = options[1]
+param.small.seg = as.numeric(options[2])
+param.large.seg = as.numeric(options[3])
+param.pval.cutoff = as.numeric(options[4])
+param.pvals.size = as.numeric(options[5])
+param.call.low = as.numeric(options[6])
+param.call.high = as.numeric(options[7])
+bufloc = options[8]
+
+#param.small.seg = 1
+#param.large.seg = 25
+#param.pval.cutoff = 0.1
+#param.pvals.size = 0.35
+
+#param.call.low = -0.3
+#param.call.high= 0.3
+
+
+dat = read.delim(logcov.file, as.is=F, header=T)
+t.id = dat$Targeted.Region.ID
+t.mean = dat$Adjusted.Mean.of.LogRatio
+t.chr = dat$Chr
+
+cna.dat <- CNA(t.mean, t.chr, t.id, data.type="logratio")
+smooth.cna.dat = smooth.CNA(cna.dat)
+
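+# Run CBS segmentation twice, once with the large and once with the small kmax.
+# A segment is called "CNV" only when enough of its targets pass the adjusted
+# p-value cutoff and its mean log-ratio falls outside the no-call band
+# [param.call.low, param.call.high].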
+kmax.range = c(param.large.seg, param.small.seg)
+for (i in kmax.range){
+ out.f = paste(bufloc, "/CBS", i ,".txt", sep="")
+ cna.segment = segment(smooth.cna.dat, kmax = i, verbose = 1)
+ #pdf(paste("segment_", "kmax", i, ".pdf", sep=""))
+ #for (chr.list in unique(t.chr)){
+ # plotSample(cna.segment, chromlist=chr.list)
+ #}
+ #dev.off()
+
+ #Get Data
+ cna.segment.out <- cna.segment$output
+ cna.segment.start = c()
+ cna.segment.end = c()
+ cna.segment.mean = c()
+ cna.segment.pvals.size = c()
+ cna.segment.calls = c()
+ for (n in 1:nrow(cna.segment.out)){
+ cna.start.id = cna.segment.out[n,]$loc.start
+ cna.end.id = cna.segment.out[n,]$loc.end
+ cna.start.coord = dat[dat$Targeted.Region.ID==(cna.start.id),]$OriStCoordinate
+ cna.end.coord = dat[dat$Targeted.Region.ID==(cna.end.id), ]$OriEndCoordinate
+
+ dat.inrange = dat[dat$Targeted.Region.ID<=cna.end.id&dat$Targeted.Region.ID>=cna.start.id,]
+ cna.segment.logratios = dat.inrange$Adjusted.Mean.of.LogRatio
+ cna.segment.pvalues = dat.inrange$Adjusted.P.Value
+ segment.pvals.above = dat.inrange[dat.inrange$Adjusted.P.Value<=param.pval.cutoff,]$Adjusted.P.Value
+
+ segment.pvals.size = length(segment.pvals.above)/length(cna.segment.pvalues)
+ cna.segment.mean = c(cna.segment.mean, mean(cna.segment.logratios))
+ cna.segment.start = c(cna.segment.start, cna.start.coord)
+ cna.segment.end = c(cna.segment.end , cna.end.coord)
+ cna.segment.pvals.size = c(cna.segment.pvals.size, segment.pvals.size)
+
+ if (segment.pvals.size < param.pvals.size){
+ #No-Call
+ cna.segment.calls = c(cna.segment.calls, "No")
+ } else {
+ m = mean(cna.segment.logratios)
+ if ((m > param.call.low) && (m < param.call.high)){
+ #No-Call
+ cna.segment.calls = c(cna.segment.calls, "No")
+ } else {
+ #Call
+ cna.segment.calls = c(cna.segment.calls, "CNV")
+ }
+ }
+ }
+
+
+ outdf = data.frame(Chr=cna.segment.out$chrom, Target.Start=cna.segment.out$loc.start, Target.End=cna.segment.out$loc.end, NumberOfTargets=cna.segment.out$num.mark, OriStCoordinate=cna.segment.start, OriEndCoordinate=cna.segment.end, CBS.Mean = cna.segment.out$seg.mean, LogRatios = cna.segment.mean, Above.PValues.Cutoff=cna.segment.pvals.size, Calls = cna.segment.calls)
+
+ write.table(outdf, out.f, sep="\t", quote=F, row.names=F, col.names=T)
+
+}
+
+small.dat.f = paste(bufloc, "/CBS", param.small.seg ,".txt", sep="")
+large.dat.f = paste(bufloc, "/CBS", param.large.seg ,".txt", sep="")
+
+small.dat = read.delim(small.dat.f, as.is=F, header=T)
+large.dat = read.delim(large.dat.f, as.is=F, header=T)
+
+large.nocall = large.dat[large.dat$Calls=="No",]
+small.call = small.dat[small.dat$Calls!="No",]
+included.segment= large.dat[large.dat$Calls!="No",]
+
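+# Small-kmax calls whose start lies inside a no-call segment from the
+# large-kmax run are merged into the final list of segments.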
+for (i in 1:nrow(small.call)){
+ small.segment = small.call[i, ]
+ chr.small = small.segment$Chr
+ start.small = small.segment$Target.Start
+ end.small = small.segment$Target.End
+ match.large.nocall = large.nocall[large.nocall$Chr==chr.small,]
+
+ match.large.nocall = match.large.nocall[match.large.nocall$Target.End>=start.small,]
+ match.large.nocall = match.large.nocall[match.large.nocall$Target.Start<=start.small,]
+ merged.hole = match.large.nocall[match.large.nocall$Target.End>=start.small,]
+
+ if (nrow(merged.hole) > 0){
+ included.segment = merge(included.segment, small.segment, all=T)
+ }
+}
+
+out.f = paste(logcov.file, ".LargeDeletion.txt", sep="")
+write.table(included.segment, out.f, sep="\t", quote=F, row.names=F)
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r 7564f3b1e675 Contra/scripts/split_chromosome.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Contra/scripts/split_chromosome.py Thu Sep 13 02:31:43 2012 -0400
@@ -0,0 +1,105 @@
+# ----------------------------------------------------------------------#
+# Copyright (c) 2011, Richard Lupat & Jason Li.
+#
+# > Source License <
+# This file is part of CONTRA.
+#
+# CONTRA is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# CONTRA is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with CONTRA. If not, see <http://www.gnu.org/licenses/>.
+#
+#
+#-----------------------------------------------------------------------#
+# Last Updated : 03 Sep 2011 15:00PM
+
+
+import os
+
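+# Split sample.BEDGRAPH into one file per chromosome under chr/, starting a new
+# output file whenever the chromosome column changes (the input is expected to
+# be grouped by chromosome).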
+def splitByChromosome(destFolder):
+
+ try:
+ os.mkdir(destFolder + "chr/")
+ except:
+		print "folder exists"
+
+ inputfile = destFolder + "sample.BEDGRAPH"
+ outputfile = destFolder + "chr/chr1.txt"
+ file = open(inputfile,"r")
+ output = open(outputfile,"w")
+ check = "1"
+
+ for row in file:
+ if row[0] == '#':
+ continue
+
+ cols = row.split()
+ chr = cols[0].strip("chr")
+ if (chr != check):
+ output.close()
+ check = chr
+ output = open(destFolder+ "chr/chr"+check+".txt","w")
+ output.write(row)
+
+ output.close()
+
+#JLMod
+def splitByChromosome3(infile):
+ destFolder = os.path.dirname(infile)+"/"
+
+ try:
+ os.mkdir(destFolder + "chr/")
+ except:
+		print "folder exists"
+
+ #inputfile = destFolder + "sample.BEDGRAPH"
+ inputfile=infile
+ outputfile = destFolder + "chr/chr1.txt"
+ file = open(inputfile,"r")
+ output = open(outputfile,"w")
+ check = "1"
+
+ for row in file:
+ cols = row.split()
+ chr = cols[0].strip("chr")
+ if (chr != check):
+ output.close()
+ check = chr
+ output = open(destFolder+ "chr/chr"+check+".txt","w")
+ output.write(row)
+
+ output.close()
+
+def splitByChromosome2(folder):
+
+ try:
+ os.mkdir(folder + "target/")
+ except:
+		print "folder exists"
+
+ inputfile = folder + "geneRefCoverage.txt"
+ outputfile = folder + "target/chr1.txt"
+ file = open(inputfile,"r")
+ output = open(outputfile,"w")
+ check = "1"
+
+ for row in file:
+ cols = row.split()
+ chr = cols[0].strip("chr")
+ if (chr != check):
+ output.close()
+ check = chr
+ output = open(folder+ "target/chr"+check+".txt","w")
+ output.write(row)
+
+ output.close()
+
+
diff -r 000000000000 -r 7564f3b1e675 Contra/scripts/target_breakdown.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Contra/scripts/target_breakdown.py Thu Sep 13 02:31:43 2012 -0400
@@ -0,0 +1,56 @@
+# ----------------------------------------------------------------------#
+# Copyright (c) 2011, Richard Lupat & Jason Li.
+#
+# > Source License <
+# This file is part of CONTRA.
+#
+# CONTRA is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# CONTRA is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with CONTRA. If not, see <http://www.gnu.org/licenses/>.
+#
+#
+#-----------------------------------------------------------------------#
+# Last Updated : 05 October 2011 16:43PM
+
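+# Targets longer than maxRegionSize are broken into roughly equal pieces of
+# about targetRegionSize bases, with "(SPLIT)" appended to their annotation;
+# shorter targets are written through unchanged.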
+def target_breakdown(filename, maxRegionSize, targetRegionSize, out_fname):
+ from math import ceil
+
+ a = open(filename)
+ output = open(out_fname, "w")
+
+ for x in a:
+ x = x.split()
+ chr = x[0]
+ start = int(x[1])
+ end = int(x[2])
+ info = ""
+ if (len(x) > 3):
+ info = x[3]
+
+ regionSize = end-start
+
+ if regionSize > maxRegionSize:
+ seg = ceil(regionSize/float(targetRegionSize))
+ limit = ceil(regionSize/float(seg))
+
+ s_end = start
+ for i in range(int(seg)):
+ s_start = s_end
+ s_end = int(start + (limit*(i+1)))
+ if s_end > end:
+ s_end = end
+ output.write("\t".join([chr, str(s_start), str(s_end), info+"(SPLIT)"])+"\n")
+ else:
+ output.write("\t".join([chr, str(start), str(end), info])+"\n")
+
+ output.close()
+
diff -r 000000000000 -r 7564f3b1e675 Contra/scripts/vcf_out.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Contra/scripts/vcf_out.py Thu Sep 13 02:31:43 2012 -0400
@@ -0,0 +1,64 @@
+# ----------------------------------------------------------------------#
+# Copyright (c) 2011, Richard Lupat & Jason Li.
+#
+# > Source License <
+# This file is part of CONTRA.
+#
+# CONTRA is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# CONTRA is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with CONTRA. If not, see <http://www.gnu.org/licenses/>.
+#
+#
+#-----------------------------------------------------------------------#
+# Last Updated : 09 Apr 2011 11:00AM
+
+def vcf_out(inF, outF):
+	import math
+	f = file.readlines(open(inF))
+	vcf = open(outF, "w")
+
+	#header
+	vcf.write("##fileformat=VCFv4.0\n")
+	vcf.write("##reference=1000GenomesPilot-NCBI36\n")
+	vcf.write('##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">\n')
+	vcf.write('##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant">\n')
+	vcf.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")
+
+	# The input is the FASTA built from the filtered BED file: header lines of
+	# the form ">chr:start-end:adjustedPValue", each followed by its sequence.
+	count = 0
+	for region in f:
+		if region[0] == ">":
+			region = region.strip(">\n")
+			region = region.split(":")
+			chr = region[0]
+
+			# Phred-scale the adjusted p-value for the QUAL column
+			adjPVal = float(region[2])
+			if adjPVal <= 0:
+				adjPVal = 0
+			else:
+				adjPVal = -10 * math.log(adjPVal, 10)
+			adjPVal = str(round(adjPVal,3))
+			region[1] = region[1].split("-")
+			start = region[1][0]
+			end = region[1][1]
+		else:
+			ref = f[count].strip("\n")
+			vcf.write(chr +"\t"+ start + "\t" + "." + "\t" + ref + "\t")
+			vcf.write("<CNV>" + "\t" + adjPVal + "\t" + "PASS" + "\t")
+			vcf.write("SVTYPE=CNV;END="+ end + "\n")
+		count += 1
+
+	vcf.close()
+
+
+
+