# HG changeset patch
# User xuebing
# Date 1333198386 14400
# Node ID 0e221dbd17b2c7f4bfed1e8f42bda964bd48c766
# Parent d1f0f85ee5bce9f1f05d2339b0776f410a9e691a
Uploaded
diff -r d1f0f85ee5bc -r 0e221dbd17b2 altschulEriksonDinuclShuffle.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/altschulEriksonDinuclShuffle.py Sat Mar 31 08:53:06 2012 -0400
@@ -0,0 +1,150 @@
+#! /usr/bin/env python
+
+# altschulEriksonDinuclShuffle.py
+# P. Clote, Oct 2003
+# NOTE: One cannot use function "count(s,word)" to count the number
+# of occurrences of dinucleotide word in string s, since the built-in
+# function counts only nonoverlapping words, presumably in a left to
+# right fashion.
+
+
+import sys,string,random
+
+
+
+def computeCountAndLists(s):
+ #WARNING: Use of function count(s,'UU') returns 1 on word UUU
+ #since it apparently counts only nonoverlapping words UU
+ #For this reason, we work with the indices.
+
+ #Initialize lists and mono- and dinucleotide dictionaries
+ List = {} #List is a dictionary of lists
+ List['A'] = []; List['C'] = [];
+ List['G'] = []; List['T'] = [];
+ nuclList = ["A","C","G","T"]
+ s = s.upper()
+ s = s.replace("U","T")
+ nuclCnt = {} #empty dictionary
+ dinuclCnt = {} #empty dictionary
+ for x in nuclList:
+ nuclCnt[x]=0
+ dinuclCnt[x]={}
+ for y in nuclList:
+ dinuclCnt[x][y]=0
+
+ #Compute count and lists
+ nuclCnt[s[0]] = 1
+ nuclTotal = 1
+ dinuclTotal = 0
+ for i in range(len(s)-1):
+ x = s[i]; y = s[i+1]
+ List[x].append( y )
+ nuclCnt[y] += 1; nuclTotal += 1
+ dinuclCnt[x][y] += 1; dinuclTotal += 1
+ assert (nuclTotal==len(s))
+ assert (dinuclTotal==len(s)-1)
+ return nuclCnt,dinuclCnt,List
+
+
+def chooseEdge(x,dinuclCnt):
+ numInList = 0
+ for y in ['A','C','G','T']:
+ numInList += dinuclCnt[x][y]
+ z = random.random()
+ denom=dinuclCnt[x]['A']+dinuclCnt[x]['C']+dinuclCnt[x]['G']+dinuclCnt[x]['T']
+ numerator = dinuclCnt[x]['A']
+ if z < float(numerator)/float(denom):
+ dinuclCnt[x]['A'] -= 1
+ return 'A'
+ numerator += dinuclCnt[x]['C']
+ if z < float(numerator)/float(denom):
+ dinuclCnt[x]['C'] -= 1
+ return 'C'
+ numerator += dinuclCnt[x]['G']
+ if z < float(numerator)/float(denom):
+ dinuclCnt[x]['G'] -= 1
+ return 'G'
+ dinuclCnt[x]['T'] -= 1
+ return 'T'
+
+def connectedToLast(edgeList,nuclList,lastCh):
+ D = {}
+ for x in nuclList: D[x]=0
+ for edge in edgeList:
+ a = edge[0]; b = edge[1]
+ if b==lastCh: D[a]=1
+ for i in range(2):
+ for edge in edgeList:
+ a = edge[0]; b = edge[1]
+ if D[b]==1: D[a]=1
+ ok = 0
+ for x in nuclList:
+ if x!=lastCh and D[x]==0: return 0
+ return 1
+
+
+
+def eulerian(s):
+ nuclCnt,dinuclCnt,List = computeCountAndLists(s)
+ #compute nucleotides appearing in s
+ nuclList = []
+ for x in ["A","C","G","T"]:
+ if x in s: nuclList.append(x)
+ #compute numInList[x] = number of dinucleotides beginning with x
+ numInList = {}
+ for x in nuclList:
+ numInList[x]=0
+ for y in nuclList:
+ numInList[x] += dinuclCnt[x][y]
+ #create dinucleotide shuffle L
+ firstCh = s[0] #start with first letter of s
+ lastCh = s[-1]
+ edgeList = []
+ for x in nuclList:
+ if x!= lastCh: edgeList.append( [x,chooseEdge(x,dinuclCnt)] )
+ ok = connectedToLast(edgeList,nuclList,lastCh)
+ return ok,edgeList,nuclList,lastCh
+
+
+def shuffleEdgeList(L):
+ n = len(L); barrier = n
+ for i in range(n-1):
+ z = int(random.random() * barrier)
+ tmp = L[z]
+ L[z]= L[barrier-1]
+ L[barrier-1] = tmp
+ barrier -= 1
+ return L
+
+def dinuclShuffle(s):
+ ok = 0
+ while not ok:
+ ok,edgeList,nuclList,lastCh = eulerian(s)
+ nuclCnt,dinuclCnt,List = computeCountAndLists(s)
+
+ #remove last edges from each vertex list, shuffle, then add back
+ #the removed edges at end of vertex lists.
+ for [x,y] in edgeList: List[x].remove(y)
+ for x in nuclList: shuffleEdgeList(List[x])
+ for [x,y] in edgeList: List[x].append(y)
+
+ #construct the eulerian path
+ L = [s[0]]; prevCh = s[0]
+ for i in range(len(s)-2):
+ ch = List[prevCh][0]
+ L.append( ch )
+ del List[prevCh][0]
+ prevCh = ch
+ L.append(s[-1])
+ t = string.join(L,"")
+ return t
+
+
+
+if __name__ == '__main__':
+ if len(sys.argv)!=3:
+ print "Usage: python altschulEriksonDinuclShuffle.py GCATCGA 5"
+ sys.exit(1)
+ s = sys.argv[1].upper()
+ for i in range(int(sys.argv[2])):
+ print dinuclShuffle(s)
diff -r d1f0f85ee5bc -r 0e221dbd17b2 dreme.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/dreme.xml Sat Mar 31 08:53:06 2012 -0400
@@ -0,0 +1,50 @@
+
+ short motif discovery
+ /Users/xuebing/bin/dreme.py -p $input -png -e $ethresh
+ #if $background_select.bg_select == "fromfile":
+ -n "${bgfile}"
+ #end if
+
+ && mv dreme_out/dreme.html ${html_outfile}
+
+ && mv dreme_out/dreme.txt ${txt_outfile}
+
+ && mv dreme_out/dreme.xml ${xml_outfile}
+
+ && rm -rf dreme_out
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+http://meme.sdsc.edu/meme/doc/dreme.html
+
+DREME (Discriminative Regular Expression Motif Elicitation) finds relatively short motifs (up to 8 bases) fast, and can perform discriminative motif discovery if given a negative set, consisting of sequences unlikely to contain a motif of interest that is however likely to be found in the main ("positive") sequence set. If you do not provide a negative set the program shuffles the positive set to provide a background (in the role of the negative set).
+
+The input to DREME is one or two sets of DNA sequences. The program uses a Fisher Exact Test to determine significance of each motif found in the postive set as compared with its representation in the negative set, using a significance threshold that may be set on the command line.
+
+DREME achieves its high speed by restricting its search to regular expressions based on the IUPAC alphabet representing bases and ambiguous characters, and by using a heuristic estimate of generalised motifs' statistical significance.
+
+
+
diff -r d1f0f85ee5bc -r 0e221dbd17b2 fasta-dinucleotide-shuffle.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta-dinucleotide-shuffle.py Sat Mar 31 08:53:06 2012 -0400
@@ -0,0 +1,223 @@
+#!/usr/bin/python
+
+import sys, string, random
+import sequence
+
+#
+# turn on psyco to speed up by 3X
+#
+if __name__=='__main__':
+ try:
+ import psyco
+ #psyco.log()
+ psyco.full()
+ psyco_found = True
+ except ImportError:
+# psyco_found = False
+ pass
+# print >> sys.stderr, "psyco_found", psyco_found
+
+
+# altschulEriksonDinuclShuffle.py
+# P. Clote, Oct 2003
+
+def computeCountAndLists(s):
+
+ #Initialize lists and mono- and dinucleotide dictionaries
+ List = {} #List is a dictionary of lists
+ List['A'] = []; List['C'] = [];
+ List['G'] = []; List['T'] = [];
+ # FIXME: is this ok?
+ List['N'] = []
+ nuclList = ["A","C","G","T","N"]
+ s = s.upper()
+ #s = s.replace("U","T")
+ nuclCnt = {} #empty dictionary
+ dinuclCnt = {} #empty dictionary
+ for x in nuclList:
+ nuclCnt[x]=0
+ dinuclCnt[x]={}
+ for y in nuclList:
+ dinuclCnt[x][y]=0
+
+ #Compute count and lists
+ nuclCnt[s[0]] = 1
+ nuclTotal = 1
+ dinuclTotal = 0
+ for i in range(len(s)-1):
+ x = s[i]; y = s[i+1]
+ List[x].append( y )
+ nuclCnt[y] += 1; nuclTotal += 1
+ dinuclCnt[x][y] += 1; dinuclTotal += 1
+ assert (nuclTotal==len(s))
+ assert (dinuclTotal==len(s)-1)
+ return nuclCnt,dinuclCnt,List
+
+
+def chooseEdge(x,dinuclCnt):
+ z = random.random()
+ denom=dinuclCnt[x]['A']+dinuclCnt[x]['C']+dinuclCnt[x]['G']+dinuclCnt[x]['T']+dinuclCnt[x]['N']
+ numerator = dinuclCnt[x]['A']
+ if z < float(numerator)/float(denom):
+ dinuclCnt[x]['A'] -= 1
+ return 'A'
+ numerator += dinuclCnt[x]['C']
+ if z < float(numerator)/float(denom):
+ dinuclCnt[x]['C'] -= 1
+ return 'C'
+ numerator += dinuclCnt[x]['G']
+ if z < float(numerator)/float(denom):
+ dinuclCnt[x]['G'] -= 1
+ return 'G'
+ numerator += dinuclCnt[x]['T']
+ if z < float(numerator)/float(denom):
+ dinuclCnt[x]['T'] -= 1
+ return 'T'
+ dinuclCnt[x]['N'] -= 1
+ return 'N'
+
+def connectedToLast(edgeList,nuclList,lastCh):
+ D = {}
+ for x in nuclList: D[x]=0
+ for edge in edgeList:
+ a = edge[0]; b = edge[1]
+ if b==lastCh: D[a]=1
+ for i in range(3):
+ for edge in edgeList:
+ a = edge[0]; b = edge[1]
+ if D[b]==1: D[a]=1
+ ok = 0
+ for x in nuclList:
+ if x!=lastCh and D[x]==0: return 0
+ return 1
+
+def eulerian(s):
+ nuclCnt,dinuclCnt,List = computeCountAndLists(s)
+ #compute nucleotides appearing in s
+ nuclList = []
+ for x in ["A","C","G","T","N"]:
+ if x in s: nuclList.append(x)
+ #create dinucleotide shuffle L
+ firstCh = s[0] #start with first letter of s
+ lastCh = s[-1]
+ edgeList = []
+ for x in nuclList:
+ if x!= lastCh: edgeList.append( [x,chooseEdge(x,dinuclCnt)] )
+ ok = connectedToLast(edgeList,nuclList,lastCh)
+ return ok,edgeList,nuclList,lastCh
+
+
+def shuffleEdgeList(L):
+ n = len(L); barrier = n
+ for i in range(n-1):
+ z = int(random.random() * barrier)
+ tmp = L[z]
+ L[z]= L[barrier-1]
+ L[barrier-1] = tmp
+ barrier -= 1
+ return L
+
+def dinuclShuffle(s):
+ ok = 0
+ while not ok:
+ ok,edgeList,nuclList,lastCh = eulerian(s)
+ nuclCnt,dinuclCnt,List = computeCountAndLists(s)
+
+ #remove last edges from each vertex list, shuffle, then add back
+ #the removed edges at end of vertex lists.
+ for [x,y] in edgeList: List[x].remove(y)
+ for x in nuclList: shuffleEdgeList(List[x])
+ for [x,y] in edgeList: List[x].append(y)
+
+ #construct the eulerian path
+ L = [s[0]]; prevCh = s[0]
+ for i in range(len(s)-2):
+ ch = List[prevCh][0]
+ L.append( ch )
+ del List[prevCh][0]
+ prevCh = ch
+ L.append(s[-1])
+ t = string.join(L,"")
+ return t
+
+def main():
+
+ #
+ # defaults
+ #
+ file_name = None
+ seed = 1
+ copies = 1
+
+ #
+ # get command line arguments
+ #
+ usage = """USAGE:
+ %s [options]
+
+ -f file name (required)
+ -t added to shuffled sequence names
+ -s random seed; default: %d
+ -c make shuffled copies of each sequence; default: %d
+ -h print this usage message
+ """ % (sys.argv[0], seed, copies)
+
+ # no arguments: print usage
+ if len(sys.argv) == 1:
+ print >> sys.stderr, usage; sys.exit(1)
+
+ tag = "";
+
+ # parse command line
+ i = 1
+ while i < len(sys.argv):
+ arg = sys.argv[i]
+ if (arg == "-f"):
+ i += 1
+ try: file_name = sys.argv[i]
+ except: print >> sys.stderr, usage; sys.exit(1)
+ elif (arg == "-t"):
+ i += 1
+ try: tag = sys.argv[i]
+ except: print >> sys.stderr, usage; sys.exit(1)
+ elif (arg == "-s"):
+ i += 1
+ try: seed = string.atoi(sys.argv[i])
+ except: print >> sys.stderr, usage; sys.exit(1)
+ elif (arg == "-c"):
+ i += 1
+ try: copies = string.atoi(sys.argv[i])
+ except: print >> sys.stderr, usage; sys.exit(1)
+ elif (arg == "-h"):
+ print >> sys.stderr, usage; sys.exit(1)
+ else:
+ print >> sys.stderr, "Unknown command line argument: " + arg
+ sys.exit(1)
+ i += 1
+
+ # check that required arguments given
+ if (file_name == None):
+ print >> sys.stderr, usage; sys.exit(1)
+
+ random.seed(seed)
+
+ # read sequences
+ seqs = sequence.readFASTA(file_name,'Extended DNA')
+
+ for s in seqs:
+ str = s.getString()
+ #FIXME altschul can't handle ambigs
+ name = s.getName()
+
+ #print >> sys.stderr, ">%s" % name
+
+ for i in range(copies):
+
+ shuffledSeq = dinuclShuffle(str)
+
+ if (copies == 1):
+ print >> sys.stdout, ">%s\n%s" % (name+tag, shuffledSeq)
+ else:
+ print >> sys.stdout, ">%s_%d\n%s" % (name+tag, i, shuffledSeq)
+
+if __name__ == '__main__': main()
diff -r d1f0f85ee5bc -r 0e221dbd17b2 fastamarkov.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fastamarkov.xml Sat Mar 31 08:53:06 2012 -0400
@@ -0,0 +1,21 @@
+
+ of DNA sequence
+ cat $input | fasta-get-markov -m $m $norc > $output 2> err.txt
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool generates a markov model from a fasta sequence file.
+
+
+
diff -r d1f0f85ee5bc -r 0e221dbd17b2 fastashuffle1.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fastashuffle1.xml Sat Mar 31 08:53:06 2012 -0400
@@ -0,0 +1,17 @@
+
+ preserving mono-nucleotide frequency
+ cat $input | fasta-shuffle-letters > $output
+
+
+
+
+
+
+
+
+**Description**
+
+shuffle the position of nucleotides in each sequence, preserving the frequency of nucleotides in that sequence, but do not preserve di-nucleotide frequency.
+
+
+
diff -r d1f0f85ee5bc -r 0e221dbd17b2 fastashuffle2.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fastashuffle2.xml Sat Mar 31 08:53:06 2012 -0400
@@ -0,0 +1,24 @@
+
+ preserving dinucleotide frequency
+ fasta-dinucleotide-shuffle.py -f $input -t $tag -c $n -s $seed > $output
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool shuffles the sequences in the input file but preserves the dinucleotide frequency of each sequence.
+
+The code implements the Altschul-Erikson dinucleotide shuffle algorithm, described in "Significance of nucleotide sequence alignments: A method for random sequence permutation that preserves dinucleotide and codon usage", S.F. Altschul and B.W. Erikson, Mol. Biol. Evol., 2(6):526--538, 1985.
+
+Code adapted from http://bioinformatics.bc.edu/clotelab/RNAdinucleotideShuffle/dinucleotideShuffle.html
+
+
+
diff -r d1f0f85ee5bc -r 0e221dbd17b2 fastqdump.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fastqdump.xml Sat Mar 31 08:53:06 2012 -0400
@@ -0,0 +1,18 @@
+
+ convert SRA to FASTQ
+ /Users/xuebing/tools/sratoolkit.2.1.9-mac32/fastq-dump -A $input -M $minReadLen -Z > $out_file1
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This is a wrapper of the fastq-dump tool from sra-toolkit. See http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software
+
+
+
diff -r d1f0f85ee5bc -r 0e221dbd17b2 fimo2-old.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fimo2-old.xml Sat Mar 31 08:53:06 2012 -0400
@@ -0,0 +1,63 @@
+
+ using FIMO
+ fimo
+ #if $background_select.bg_select == "fromfile":
+ -bgfile $bgfile
+ #end if
+
+ $norc --output-pthresh $pth --verbosity 1 $motif $database
+ && mv fimo_out/fimo.html ${html_outfile}
+
+ && mv fimo_out/fimo.txt ${txt_outfile}
+
+ && rm -rf fimo_out
+
+
+
+
+
+
+
+
+ ##
+ ##
+ ##
+ ##
+ ##
+ ##
+ ##
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool uses FIMO to find matches of a motif in a fasta file. See more details:
+
+http://meme.sdsc.edu/meme/fimo-intro.html
+
+
+
+
diff -r d1f0f85ee5bc -r 0e221dbd17b2 fimo2.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fimo2.xml Sat Mar 31 08:53:06 2012 -0400
@@ -0,0 +1,47 @@
+
+ using FIMO
+ fimo
+ #if $background_select.bg_select == "fromfile":
+ -bgfile $bgfile
+ #end if
+
+ $norc --max-stored-scores 5000000 --output-pthresh $pth --verbosity 1 $motif $database
+ && mv fimo_out/fimo.html ${html_outfile}
+
+ && mv fimo_out/fimo.txt ${txt_outfile}
+
+ && rm -rf fimo_out
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool uses FIMO to find matches of a motif in a fasta file. See more details:
+
+http://meme.sdsc.edu/meme/fimo-intro.html
+
+
+
diff -r d1f0f85ee5bc -r 0e221dbd17b2 fimo2bed.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fimo2bed.py Sat Mar 31 08:53:06 2012 -0400
@@ -0,0 +1,46 @@
+'''
+#pattern name sequence name start stop score p-value q-value matched sequence
+constitutive-donor mm9_chr1_39533592_39535592_- 1815 1823 12.032 4.26e-06 0.397 CAGGTAAGT
+constitutive-donor mm9_chr1_59313750_59315750_+ 1889 1897 12.032 4.26e-06 0.397 CAGGTAAGT
+
+#pattern name sequence name start stop score p-value q-value matched sequence
+constitutive-donor mm9_chr1_172019075_172021075_- 1947 1955 12.032 4.26e-06 0.843 CAGGTAAGT
+constitutive-donor mm9_chr1_15300532_15302532_+ 156 164 12.032 4.26e-06 0.843 CAGGTAAGT
+'''
+
+import sys
+
+def fimo2bed(filename,rc):
+ '''
+ parse fimo output to make a bed file
+ rc: the sequence have been reverse complemented
+ '''
+ f = open(filename)
+ header = f.readline()
+ for line in f:
+ pattern,posi,begin,stop,score,pv,qv,seq = line.strip().split('\t')
+ flds = posi.split('_')
+ start = flds[-3]
+ end = flds[-2]
+ strand = flds[-1]
+ chrom = '_'.join(flds[1:-3]) #'chrX_random'
+ if not rc:
+ if strand == '+':
+ start1 = str(int(start) + int(begin)-1)
+ end1 = str(int(start) + int(stop))
+ print '\t'.join([chrom,start1,end1,seq,score,strand])
+ else:
+ start1 = str(int(end) - int(stop))
+ end1 = str(int(end) - int(begin)+1)
+ print '\t'.join([chrom,start1,end1,seq,score,strand])
+ else:
+ if strand == '-':
+ start1 = str(int(start) + int(begin)-1)
+ end1 = str(int(start) + int(stop))
+ print '\t'.join([chrom,start1,end1,seq,score,'+'])
+ else:
+ start1 = str(int(end) - int(stop))
+ end1 = str(int(end) - int(begin)+1)
+ print '\t'.join([chrom,start1,end1,seq,score,'-'])
+
+fimo2bed(sys.argv[1],sys.argv[2]=='rc')
diff -r d1f0f85ee5bc -r 0e221dbd17b2 fimo2bed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fimo2bed.xml Sat Mar 31 08:53:06 2012 -0400
@@ -0,0 +1,19 @@
+
+ convert FIMO output to BED
+ fimo2bed.py $input $rc > $output
+
+
+
+
+
+
+
+
+
+ Only works if your original FIMO input fasta sequences have ids like::
+
+ mm9_chr15_99358448_99360448_-
+
+
+
+
diff -r d1f0f85ee5bc -r 0e221dbd17b2 iupac2meme.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/iupac2meme.xml Sat Mar 31 08:53:06 2012 -0400
@@ -0,0 +1,59 @@
+
+ from one sequence
+ iupac2meme
+ #if $background_select.bg_select == "fromfile":
+ -bg $bgfile
+ #end if
+ -numseqs $numseqs $logodds $motif > $output
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**Description**
+
+Convert an IUPAC motif into a MEME version 4 formatted file suitable for use with FIMO and other MEME Suite programs.
+
+See additional information:
+
+http://meme.sdsc.edu/meme/doc/iupac2meme.html
+
+**IUPAC code**::
+
+ Nucleotide Code: Base:
+ ---------------- -----
+ A.................Adenine
+ C.................Cytosine
+ G.................Guanine
+ T (or U)..........Thymine (or Uracil)
+ R.................A or G
+ Y.................C or T
+ S.................G or C
+ W.................A or T
+ K.................G or T
+ M.................A or C
+ B.................C or G or T
+ D.................A or G or T
+ H.................A or C or T
+ V.................A or C or G
+ N.................any base
+
+
+
+
+
diff -r d1f0f85ee5bc -r 0e221dbd17b2 match
Binary file match has changed
diff -r d1f0f85ee5bc -r 0e221dbd17b2 match.cpp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/match.cpp Sat Mar 31 08:53:06 2012 -0400
@@ -0,0 +1,413 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include