# HG changeset patch
# User arkarachai-fungtammasan
# Date 1412384070 14400
# Node ID 20ab85af9505e19eb639abc0d7336e24f7d04211
Uploaded
diff -r 000000000000 -r 20ab85af9505 .DS_Store
Binary file .DS_Store has changed
diff -r 000000000000 -r 20ab85af9505 GenotypeTRcorrection.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/GenotypeTRcorrection.py Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,250 @@
+### import libraries ###
+import sys
+import collections, math
+import heapq
+from galaxy import eggs
+
+
+
+
+
+### basic function ###
+def stop_err(msg):  # report a fatal error on stderr and abort the script
+    sys.stderr.write(msg)
+    sys.exit(1)  # non-zero status: a bare sys.exit() reported success to the caller
+
+def averagelist(a,b,expectedlevelofminor):
+    '''Blend two lists element-wise: (1-w)*a[i] + w*b[i], with w=expectedlevelofminor.'''
+    minor_weight=expectedlevelofminor
+    major_weight=1-minor_weight
+    mixed=[major_weight*major+minor_weight*b[pos] for pos,major in enumerate(a)]
+    return mixed
+
+def complement_base(read):
+    '''str--> str
+    Return the base-by-base complement of read (A<->T, C<->G), in upper case.
+    The sequence is NOT reversed. Input is matched case-insensitively and
+    any character other than A/C/G/T is silently dropped from the result.
+    '''
+    pairing={'A':'T','T':'A','C':'G','G':'C'}
+    swapped=[]
+    for base in read:
+        if base.upper() in pairing:
+            swapped.append(pairing[base.upper()])
+    return ''.join(swapped)
+def makeallpossible(read):
+    '''Return every rotation of read, each one followed by its complement_base() form.'''
+    variants=[]
+    for offset,_ in enumerate(read):
+        rotated=read[offset:]+read[:offset]
+        variants.extend([rotated,complement_base(rotated)])
+    return variants
+
+def motifsimplify(base):
+    '''str--> str
+    Map a motif onto the entry of ALLMOTIF (same length) that is one of its
+    rotations or complemented rotations; the lookup raises if nothing matches.
+    '''
+    known=set(ALLMOTIF[len(base)])
+    return list(known.intersection(set(makeallpossible(base))))[0]
+
+def majorallele(seq):
+    '''Return the most frequent value in seq as an int.
+    Ties go to the largest value ("highly mutate mode"): candidates are visited
+    in descending order and only a strictly higher count replaces the leader.
+    An empty seq ends in int('') and therefore raises ValueError.
+    '''
+    leader,leadercount='',0
+    for candidate in sorted(set(seq),reverse=True):
+        occurrences=seq.count(candidate)
+        if occurrences>leadercount:
+            leader,leadercount=candidate,occurrences
+    return int(leader)
+
+### decide global parameter ###
+COORDINATECOLUMN=1
+ALLELECOLUMN=2
+MOTIFCOLUMN=3
+## probability assigned to (major allele, read length) pairs absent from the error profile
+MINIMUMMUTABLE=1.2*(1.0/(10**8)) #http://www.ncbi.nlm.nih.gov/pubmed/22914163 Kong et al 2012
+
+
+## command line arguments
+inputname=sys.argv[1]
+errorprofile=sys.argv[2]
+Genotypingcorrected=sys.argv[3]
+EXPECTEDLEVELOFMINOR=float(sys.argv[4])
+## The minor allele weight has to lie in [0, 0.5]; the chained comparison also
+## rejects negative values and NaN, which a plain ">0.5" test lets through.
+if not 0 <= EXPECTEDLEVELOFMINOR <= 0.5:
+    stop_err("Expected contribution of minor allele must be at least 0 and not more than 0.5")
+
+## Fixed global variable
+ALLREPEATTYPE=[1,2,3,4]
+ALLREPEATTYPENAME=['mono','di','tri','tetra']
+monomotif=['A','C']
+dimotif=['AC','AG','AT','CG']
+trimotif=['AAC','AAG','AAT','ACC','ACG','ACT','AGC','AGG','ATC','CCG']
+tetramotif=['AAAC','AAAG','AAAT','AACC','AACG','AACT','AAGC','AAGG','AAGT','AATC','AATG','AATT',\
+'ACAG','ACAT','ACCC','ACCG','ACCT','ACGC','ACGG','ACGT','ACTC','ACTG','AGAT','AGCC','AGCG','AGCT',\
+'AGGC','AGGG','ATCC','ATCG','ATGC','CCCG','CCGG','AGTC']
+ALLMOTIF={1:monomotif,2:dimotif,3:trimotif,4:tetramotif}
+monorange=range(5,60)
+dirange=range(6,60)
+trirange=range(9,60)
+tetrarange=range(12,80)
+ALLRANGE={1:monorange,2:dirange,3:trirange,4:tetrarange}
+
+#########################################
+######## Prob calculation sector ########
+#########################################
+def multinomial_prob(majorallele,STRlength,motif,probdatabase):
+    '''int,int,str,dict-->number
+    Look up the stored probability of observing a read of length STRlength
+    when the true allele is majorallele, for the given motif. The table is
+    nested as probdatabase[len(motif)][motif][majorallele][STRlength].
+    '''
+    return probdatabase[len(motif)][motif][majorallele][STRlength]
+
+################################################
+######## error model database sector ###########
+################################################
+
+## structure generator
+## errormodeldatabase[motif length][motif][major allele][read length] -> probability
+## sumbymajoralleledatabase[motif length][motif][major allele] -> total read count
+errormodeldatabase={1:{},2:{},3:{},4:{}}
+sumbymajoralleledatabase={1:{},2:{},3:{},4:{}}
+
+for repeattype in ALLREPEATTYPE:
+    for motif in ALLMOTIF[repeattype]:
+        errormodeldatabase[repeattype][motif]={}
+        sumbymajoralleledatabase[repeattype][motif]={}
+        for motifsize1 in ALLRANGE[repeattype]:
+            ## every read length in range starts at MINIMUMMUTABLE
+            errormodeldatabase[repeattype][motif][motifsize1]=dict.fromkeys(ALLRANGE[repeattype],MINIMUMMUTABLE)
+            sumbymajoralleledatabase[repeattype][motif][motifsize1]=0
+
+## read database
+## Tab-separated error profile columns used here:
+##   1 = major allele, 2 = observed read length, 3 = read count, 4 = motif
+## The profile is parsed once and reused by both passes below (it used to be
+## opened and read twice); the handle is closed even when a row is malformed.
+profilerows=[]
+fd=open(errorprofile)
+try:
+    for line in fd:
+        temp=line.strip().split('\t')
+        t_major,t_read,t_count,motif=int(temp[0]),int(temp[1]),int(temp[2]),temp[3]
+        profilerows.append((t_major,t_read,t_count,motif))
+finally:
+    fd.close()
+
+## get read count for each major allele
+## (a motif or major allele that is missing from the tables above raises KeyError)
+for t_major,t_read,t_count,motif in profilerows:
+    sumbymajoralleledatabase[len(motif)][motif][t_major]+=t_count
+
+## get probability
+## P(read length | major allele) = count for the pair / total count for the major allele.
+## Major alleles whose total is zero keep their initial MINIMUMMUTABLE entries.
+## t_read is not range-checked: a read length outside ALLRANGE simply adds a key.
+## NOTE(review): a profile row with count 0 stores probability 0, which math.log()
+## in the genotyping loop cannot take -- confirm profiles never contain zero counts.
+for t_major,t_read,t_count,motif in profilerows:
+    t_total=sumbymajoralleledatabase[len(motif)][motif][t_major]
+    if t_total>0:
+        errormodeldatabase[len(motif)][motif][t_major][t_read]=t_count/(t_total*1.0)
+
+#########################################
+######## input reading sector ###########
+#########################################
+fdout=open(Genotypingcorrected,'w')
+
+fd = open(inputname)
+
+lines=fd.xreadlines()
+for line in lines:
+ i_read=[]
+ i2_read=[]
+ temp=line.strip().split('\t')
+ i_coordinate=temp[COORDINATECOLUMN-1]
+ i_motif=motifsimplify(temp[MOTIFCOLUMN-1])  # motif reduced to its ALLMOTIF representative
+ i_read=temp[ALLELECOLUMN-1].split(',')
+ i_read=map(int,i_read)  # observed repeat length of every read at this locus
+ coverage=len(i_read)
+
+### Candidate 1: the most frequent read length (majorallele() breaks ties toward the larger length) ###
+ i_all_allele=list(set(i_read))  # distinct read lengths
+ i_major_allele=majorallele(i_read)
+ f_majorallele=i_read.count(i_major_allele)  # the f_majorallele* counts are not used further in this loop
+### Candidate 2: the most frequent length once candidate 1 is removed ###
+ if len(i_all_allele)>1:
+ i2_read=filter(lambda a: a != i_major_allele, i_read)
+ i_major2_allele=majorallele(i2_read)
+ f_majorallele2=i_read.count(i_major2_allele)
+ ### Candidate 3: the most frequent length once candidates 1 and 2 are removed ###
+ if len(i_all_allele)>2:
+ i3_read=filter(lambda a: a != i_major2_allele, i2_read)
+ i_major3_allele=majorallele(i3_read)
+ f_majorallele3=i_read.count(i_major3_allele)
+ ### Only two distinct lengths: candidate 3 repeats candidate 2 ###
+ elif len(i_all_allele)==2:
+ i_major3_allele=i_major2_allele
+ ### Only one distinct length: invent candidate 2 one motif longer than candidate 1 ###
+ elif len(i_all_allele)==1:
+ # candidate 2 = observed length + one motif unit; candidate 3 repeats it
+ i_major2_allele=i_major_allele+len(i_motif)
+ i_major3_allele=i_major2_allele
+ # (an earlier variant reported such loci as "homo only" and skipped the
+ # likelihood comparison below; that path is disabled)
+ else:
+ print("no allele is reading")
+ sys.exit()
+
+## scope filter
+
+#########################################
+######## prob calculation option ########
+#########################################
+ homozygous_collector=0
+ heterozygous_collector=0
+
+
+ alist=[multinomial_prob(i_major_allele,x,i_motif,errormodeldatabase)for x in i_read]  # per-read probability if the allele is candidate 1
+ blist=[multinomial_prob(i_major2_allele,x,i_motif,errormodeldatabase)for x in i_read]  # ... candidate 2
+ clist=[multinomial_prob(i_major3_allele,x,i_motif,errormodeldatabase)for x in i_read]  # ... candidate 3
+
+ ablist=averagelist(alist,blist,EXPECTEDLEVELOFMINOR)  # heterozygous mix: first list weighted 1-EXPECTEDLEVELOFMINOR, second EXPECTEDLEVELOFMINOR
+ bclist=averagelist(blist,clist,EXPECTEDLEVELOFMINOR)
+ aclist=averagelist(alist,clist,EXPECTEDLEVELOFMINOR)
+
+ # log10-likelihood of the whole read profile under each homozygous candidate
+ majora=sum([math.log(i,10) for i in alist])
+ majorb=sum([math.log(i,10) for i in blist])
+ majorc=sum([math.log(i,10) for i in clist])
+ homozygous_collector=max(majora,majorb,majorc)
+
+ homomajor1=max([(majora,i_major_allele),(majorb,i_major2_allele),(majorc,i_major3_allele)])[1]  # allele of the best homozygous form; equal scores fall back to the larger allele (tuple comparison)
+ homomajordict={i_major_allele:majora,i_major2_allele:majorb,i_major3_allele:majorc}  # allele -> homozygous log-likelihood
+
+ majorab=sum([math.log(i,10) for i in ablist])  # log10-likelihood under each heterozygous pair
+ majorbc=sum([math.log(i,10) for i in bclist])
+ majorac=sum([math.log(i,10) for i in aclist])
+ heterozygous_collector=max(majorab,majorbc,majorac)
+ bothheteromajor=max([(majorab,(i_major_allele,i_major2_allele)),(majorbc,(i_major2_allele,i_major3_allele)),(majorac,(i_major_allele,i_major3_allele))])[1]  # allele pair of the best heterozygous form
+ ## the pair is reported ordered by homozygous log-likelihood (see below),
+ ## not by allele length
+ pre_heteromajor1=bothheteromajor[0]
+ pre_heteromajor2=bothheteromajor[1]
+ heteromajor1=max((homomajordict[pre_heteromajor1],pre_heteromajor1),(homomajordict[pre_heteromajor2],pre_heteromajor2))[1]  # allele with the higher homozygous log-likelihood
+ heteromajor2=min((homomajordict[pre_heteromajor1],pre_heteromajor1),(homomajordict[pre_heteromajor2],pre_heteromajor2))[1]  # the other allele of the pair
+
+ logratio_homo=homozygous_collector-heterozygous_collector  # log10(P(best homozygous)/P(best heterozygous))
+
+ if logratio_homo>0:
+ fdout.writelines(line.strip()+'\t'+'\t'.join(['homo',str(logratio_homo),str(homomajor1),str(heteromajor1),str(heteromajor2)])+'\n')
+ elif logratio_homo<0:  # a ratio of exactly 0 matches neither branch, so that locus is not written
+ fdout.writelines(line.strip()+'\t'+'\t'.join(['hetero',str(logratio_homo),str(homomajor1),str(heteromajor1),str(heteromajor2)])+'\n')
+fd.close()
+fdout.close()
diff -r 000000000000 -r 20ab85af9505 GenotypingSTR.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/GenotypingSTR.xml Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,72 @@
+
+ during sequencing and library prep
+ GenotypeTRcorrection.py $microsat_raw $microsat_error_profile $microsat_corrected $expectedminorallele
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+- This tool will correct for microsatellite sequencing and library preparation errors using error rates estimated from hemizygous male X chromosome or any rates provided by user. The read profile for each locus will be processed independently.
+- First, this tool will find the three most common read lengths in the input read length profile. If the read profile contains only one TR length, a length one motif longer than the observed length will be used as the second most common read length.
+- Second, it will calculate the probability of each of the three homozygous forms and use the form which gives the highest probability. The same goes for the heterozygous forms.
+- Third, this tool will calculate the log base 10 of (the probability of homozygous/the probability of heterozygous). If this value is more than 0, it will predict this locus to be homozygous. If this value is less than 0, it will predict this locus to be heterozygous. If this value is 0, the read profile at this locus will be discarded.
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**
+
+**Input**
+
+- The input files need to contain at least three columns.
+- Column 1 = location of microsatellite locus.
+- Column 2 = length profile (length of microsatellite in each read that mapped to this location in comma separated format).
+- Column 3 = motif of microsatellite in this locus. The input file can contain more than three column.
+
+**Output**
+
+The output will contain the original three (or more) columns of the input, followed by these additional columns.
+
+- Additional column 1 = homozygous/heterozygous label.
+- Additional column 2 = log based 10 of (the probability of homozygous/the probability of heterozygous)
+- Additional column 3 = Allele for most probable homozygous form.
+- Additional column 4 = Allele 1 for most probable heterozygous form.
+- Additional column 5 = Allele 2 for most probable heterozygous form.
+
+**Example**
+
+- Suppose that we sequence one locus of microsatellite with NGS. This locus has **A** motif and the following length (bp) profile. ::
+
+ chr1_100_106 5, 6, 6, 6, 6, 7, 7, 8, 8 A
+
+- We want to figure out whether this locus is homozygous or heterozygous, and the corresponding allele(s). Therefore, we use this tool to refine the genotype.
+- This tool will calculate the probability that homozygous A6A6, A7A7, and A8A8 generate the observed length profile. Among these, A7A7 has the highest probability. Therefore, we use this form as the representative for homozygous.
+- Then, this tool will calculate the probability that heterozygous A6A7, A7A8, and A6A8 generate the observed length profile. Among these, A6A8 has the highest probability. Therefore, we use this form as the representative for heterozygous.
+- A6A8 has a higher probability than A7A7. Therefore, the program will report that this locus is a heterozygous locus. ::
+
+ chr1 5,6,6,6,6,7,7,8,8 A hetero -14.8744881854 7 6 8
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r 20ab85af9505 PEsortedSAM2readprofile.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/PEsortedSAM2readprofile.py Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+
+import sys
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+import bx.seq.twobit
+
+##output columns: read_name chr prefix_start prefix_end TR_start TR_end suffix_start suffix_end TR_length TR_sequence
+
+samf = open(sys.argv[1],'r') #assumes sam file is sorted by readname
+seq_path = sys.argv[2] #Path to the reference genome in 2bit format
+
+## Pairs are dropped when the inferred TR is longer than maxTRlength, or when the two
+## mates start more than maxoriginalreadlength+maxTRlength bases apart.
+maxTRlength=int(sys.argv[3])
+maxoriginalreadlength=int(sys.argv[4])
+outfile=sys.argv[5]
+fout = open(outfile,'w')
+
+twobitfile = bx.seq.twobit.TwoBitFile( file( seq_path ) )
+
+skipped=0
+while True:
+    read = samf.readline().strip()
+    if not(read): #EOF reached
+        break
+    if read[0] == "@": #SAM header line
+        continue
+    mate = samf.readline().strip()
+    if not(mate): #EOF reached
+        break
+    read_elems = read.split()
+    mate_elems = mate.split()
+    read_name = read_elems[0].strip()
+    mate_name = mate_elems[0].strip()
+    ## Resynchronise on unpaired reads: drop the current read, promote the mate to
+    ## "read" and fetch a new mate until two consecutive lines share a name.
+    ## mate_name must come from the NEWLY read line; it used to be taken from the
+    ## stale mate_elems, which made the names compare equal after a single step and
+    ## paired unrelated reads whenever two unpaired reads followed each other.
+    while read_name != mate_name:
+        skipped += 1
+        read = mate
+        read_elems = mate_elems
+        read_name = mate_name
+        mate = samf.readline().strip()
+        if not(mate): #EOF reached
+            break
+        mate_elems = mate.split()
+        mate_name = mate_elems[0].strip()
+    if not(mate): #input ended while searching for a mate: nothing left to pair
+        break
+    read_chr = read_elems[2]
+    read_start = int(read_elems[3])
+    read_cigar = read_elems[5]
+    read_match = read_cigar.split('M')
+    ## we want perfect matches only..cigar= <n>M ; a non-numeric prefix (e.g. 5S95M)
+    ## is skipped here instead of crashing in int() below
+    if len(read_match) != 2 or not read_match[0].isdigit():
+        continue
+    read_len = int(read_match[0])
+    mate_chr = mate_elems[2]
+    mate_start = int(mate_elems[3])
+    mate_cigar = mate_elems[5]
+    mate_match = mate_cigar.split('M')
+    if len(mate_match) != 2 or not mate_match[0].isdigit():
+        continue
+    mate_len = int(mate_match[0])
+    if read_chr != mate_chr: # check that they were mapped to the same chromosome
+        continue
+    if abs(read_start - mate_start) > (maxoriginalreadlength+maxTRlength):
+        continue
+    ## the leftmost mate is the prefix, the other the suffix; start positions are shifted by -1
+    if read_start < mate_start:
+        pre_s = read_start-1
+        pre_e = read_start-1+read_len
+        tr_s = read_start-1+read_len
+        tr_e = mate_start-1
+        suf_s = mate_start-1
+        suf_e = mate_start-1+mate_len
+    else:
+        pre_s = mate_start-1
+        pre_e = mate_start-1+mate_len
+        tr_s = mate_start-1+mate_len
+        tr_e = read_start-1
+        suf_s = read_start-1
+        suf_e = read_start-1+read_len
+    tr_len = abs(tr_e - tr_s)
+    if tr_len > maxTRlength:
+        continue
+    if pre_e >= suf_s: #overlapping prefix and suffix
+        continue
+    ## reference sequence lying between the two mates
+    tr_ref_seq = twobitfile[read_chr][tr_s:tr_e]
+    fout.write('\t'.join(map(str,[read_name,read_chr,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq]))+'\n')
+
+samf.close()
+fout.close()
+
+print "Skipped %d unpaired reads" %(skipped)
diff -r 000000000000 -r 20ab85af9505 PEsortedSAM2readprofile.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/PEsortedSAM2readprofile.xml Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,62 @@
+
+ from SAM file sorted by readname
+ PEsortedSAM2readprofile.py $flankedbasesSAM $twobitref $maxTRlength $maxoriginalreadlength $output
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+- This tool will take a SAM file sorted by read name, remove unpaired reads, and report the microsatellite sequences in the reference genome that correspond to the space between paired-end reads. The start and stop coordinates of the left and right flanking regions of the microsatellite, and of the microsatellite itself as inferred from the paired-end reads, will also be reported.
+- These reference microsatellites can be used to filter out reads whose microsatellites do not agree with the microsatellites in the reference at the location where the reads mapped.
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**
+
+**Input**
+
+- Sorted SAM files by read name
+
+**Output**
+
+The output combines each pair of input lines (the two mates) into one line. The output format is as follows.
+
+- Column 1 = read name
+- Column 2 = chromosome
+- Column 3 = left flanking region start
+- Column 4 = left flanking region stop
+- Column 5 = microsatellite start
+- Column 6 = microsatellite stop
+- Column 7 = right flanking region start
+- Column 8 = right flanking region stop
+- Column 9 = microsatellite length in reference
+- Column 10= microsatellite sequence in reference
+
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r 20ab85af9505 changespacetounderscore_readname.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/changespacetounderscore_readname.py Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,15 @@
+import sys
+## Replace every space with "_" in one column of a tab-delimited file.
+## argv: input file, output file, 1-based number of the column to change
+columntochange=int(sys.argv[3])-1 # default is 6-1=5
+fd=open(sys.argv[1])
+output=open(sys.argv[2],'w')
+try:
+    for line in fd: # plain iteration; file.xreadlines() is deprecated and gone in Python 3
+        ## empty fields are dropped before indexing, exactly as filter(None, ...) did
+        temp=[field for field in line.strip().split('\t') if field]
+        temp[columntochange]=temp[columntochange].replace(' ','_')
+        output.write('\t'.join(temp)+'\n') # write(): the argument is one string, not a list of lines
+finally:
+    fd.close()
+    output.close()
\ No newline at end of file
diff -r 000000000000 -r 20ab85af9505 combinedprobforallelecombination.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/combinedprobforallelecombination.py Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,41 @@
+import sys
+import collections
+import math
+## 1-based column numbers of the tab-delimited input
+SAMPLINGCOL=11 # probability of the allele pair generating the read profile in any order
+ALLELE1COL=7
+ALLELE2COL=8
+SIGNCOL=4 # homozygous/heterozygous label; declared but never read below
+readprofileCOL=2
+motifCOL=3
+filaname=sys.argv[1]
+
+## (read depth, "allele1_allele2", motif) -> list of probabilities, one per input row
+binomialcombine=collections.defaultdict(list)
+
+fd=open(filaname)
+try:
+    for line in fd:
+        temp=line.strip().split('\t')
+        ## sort the pair so that both orderings of the same two alleles share a key
+        allelelist=sorted([int(temp[ALLELE1COL-1]),int(temp[ALLELE2COL-1])])
+        alleleave=str(allelelist[0])+'_'+str(allelelist[1])
+        ## read depth = number of comma-separated lengths in the read profile
+        totalcov=len(temp[readprofileCOL-1].split(','))
+        motif=temp[motifCOL-1]
+        samplingvalue=float(temp[SAMPLINGCOL-1])
+        binomialcombine[(totalcov,alleleave,motif)].append(samplingvalue)
+finally:
+    fd.close() # the input handle was previously left open
+
+## one output row per key, in sorted key order: the summed probability over all
+## read profiles that share depth, allele pair and motif
+print 'read_depth'+'\t'+'allele'+'\t'+'heterozygous_prob'+'\t'+'motif'
+for key in sorted(binomialcombine):
+    templist=[str(key[0]),key[1],str(sum(binomialcombine[key])),key[2]]
+
+    print '\t'.join(templist)
+#print allkeys#,binomialcombine
+
+
+
diff -r 000000000000 -r 20ab85af9505 combineprobforallelecombination.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/combineprobforallelecombination.xml Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,67 @@
+
+ from the same allele combination
+ combinedprobforallelecombination.py $input > $output
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+- This tool will combine the probabilities that an allele combination generates any of the read profiles in the input. This is the last step in calculating the probability of detecting a heterozygous locus for each allele combination and each depth.
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**
+
+**Input**
+
+The input format is the same as output from **Evaluate the probability of the allele combination to generate read profile** tool.
+
+- Column 1 = location of microsatellite locus.
+- Column 2 = length profile (length of microsatellite in each read that mapped to this location in comma separated format).
+- Column 3 = motif of microsatellite in this locus. The input file can contain more than three column.
+- Column 4 = homozygous/heterozygous label.
+- Column 5 = log based 10 of (the probability of homozygous/the probability of heterozygous)
+- Column 6 = Allele for most probable homozygous form.
+- Column 7 = Allele 1 for most probable heterozygous form.
+- Column 8 = Allele 2 for most probable heterozygous form.
+- Column 9 = Probability of the allele combination to generate given read profile.
+- Column 10 = Number of possible rearrangement of given read profile.
+- Column 11 = Probability of the allele combination to generate read profile with any rearrangement (Product of column 9 and column 10)
+- Column 12 = Read depth
+
+Only columns 2, 3, 7, 8 and 11 are used in the calculation.
+
+**Output**
+
+
+The output will contain the following header and column
+
+- Line 1 header: read_depth allele heterozygous_prob motif
+- Column 1 = read depth
+- Column 2 = allele combination
+- Column 3 = probability to detect heterozygous of that allele combination
+- Column 4 = motif
+
+
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r 20ab85af9505 fetchflank.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fetchflank.xml Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,73 @@
+
+ of microsatellites and output as two fastq files in forward-forward orientation
+ pair_fetch_DNA_ff.py $microsat_in_read $Leftflanking $Rightflanking $qualitycutoff $lengthofbasetocheckquality
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool will fetch flanking regions around microsatellites, screen for quality score at microsatellites and adjacent flanking regions, and output two fastq files containing flanking regions in forward-forward direction.
+
+- This tool assumes that the quality score is Phred+33, such as Sanger fastq.
+- Reads that have either left or right flanking regions shorter than the length of flanking regions that require quality screening will be removed.
+
+**Citation**
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**
+
+**Input**
+
+The input files need to be in the same format as output from **microsatellite detection program**. This format contains **length of repeat**, **length of left flanking region**, **length of right flanking region**, **repeat motif**, **hamming (editing) distance**, **read name**, **read sequence**, **read quality score**
+
+**Output**
+
+The output will be the two fastq files. The first file contains left flank regions. The second file contains right flanking regions.
+
+**Example**
+
+- Suppose we detected the microsatellites from short reads ::
+
+ 6 40 54 G 0 SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1 TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCTggggggTTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG?FFDFGGGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=99(/=5'6=4:CCC*AA
+
+
+- We want to get fastq files of flanking regions around microsatellite with quality score at least 20 on Phred +33
+
+- Then the program will report these two fastq files ::
+
+ @SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+ TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCT
+ +SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+ GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG
+
+
+ @SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+ TTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG
+ +SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+ GGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=99(/=5'6=4:CCC*AA
+
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r 20ab85af9505 heteroprob.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/heteroprob.py Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,199 @@
+### import libraries ###
+import sys
+import collections, math
+import heapq
+import itertools
+
+
+
+### basic function ###
+def permuterepeat(n,rlist):
+    '''Return n!/(r1!*r2!*...) for the group sizes in rlist, as an exact integer.'''
+    nfac=math.factorial(n)
+    for groupsize in rlist:
+        # each step divides evenly when sum(rlist)<=n; "//" keeps the value an int where "/" yields a float on Python 3
+        nfac=nfac//math.factorial(groupsize)
+    return nfac
+
+def nCr(n,r):  # binomial coefficient "n choose r", kept as an exact integer
+    f = math.factorial
+    return f(n) // f(r) // f(n-r)  # "//": plain "/" produces a float on Python 3 and loses exactness
+
+def averagelist(a,b,expectedlevelofminor):
+    '''Blend two lists element-wise: (1-w)*a[i] + w*b[i], with w=expectedlevelofminor.'''
+    minor_weight=expectedlevelofminor
+    major_weight=1-minor_weight
+    mixed=[major_weight*major+minor_weight*b[pos] for pos,major in enumerate(a)]
+    return mixed
+
+def complement_base(read):
+    '''str--> str
+    Return the base-by-base complement of read (A<->T, C<->G), in upper case.
+    The sequence is NOT reversed. Input is matched case-insensitively and
+    any character other than A/C/G/T is silently dropped from the result.
+    '''
+    pairing={'A':'T','T':'A','C':'G','G':'C'}
+    swapped=[]
+    for base in read:
+        if base.upper() in pairing:
+            swapped.append(pairing[base.upper()])
+    return ''.join(swapped)
+def makeallpossible(read):
+    '''Return every rotation of read, each one followed by its complement_base() form.'''
+    variants=[]
+    for offset,_ in enumerate(read):
+        rotated=read[offset:]+read[:offset]
+        variants.extend([rotated,complement_base(rotated)])
+    return variants
+
+def motifsimplify(base):
+    '''str--> str
+    Map a motif onto the entry of ALLMOTIF (same length) that is one of its
+    rotations or complemented rotations; the lookup raises if nothing matches.
+    '''
+    known=set(ALLMOTIF[len(base)])
+    return list(known.intersection(set(makeallpossible(base))))[0]
+
+def majorallele(seq):
+    '''Return the most frequent value in seq as an int.
+    Ties go to the largest value ("highly mutate mode"): candidates are visited
+    in descending order and only a strictly higher count replaces the leader.
+    An empty seq ends in int('') and therefore raises ValueError.
+    '''
+    leader,leadercount='',0
+    for candidate in sorted(set(seq),reverse=True):
+        occurrences=seq.count(candidate)
+        if occurrences>leadercount:
+            leader,leadercount=candidate,occurrences
+    return int(leader)
+
+### decide global parameter ###
+COORDINATECOLUMN=1
+ALLELECOLUMN=2
+MOTIFCOLUMN=3
+inputname=sys.argv[1]
+errorprofile=sys.argv[2]
+EXPECTEDLEVELOFMINOR=float(sys.argv[3])
+## The minor allele weight has to lie in [0, 0.5] (negatives and NaN are rejected too).
+## No stop_err() helper is defined in this script, so calling it raised NameError
+## instead of reporting the problem; write the message and exit non-zero directly.
+if not 0 <= EXPECTEDLEVELOFMINOR <= 0.5:
+    sys.stderr.write("Expected contribution of minor allele must be at least 0 and not more than 0.5")
+    sys.exit(1)
+MINIMUMMUTABLE=0 ###1.2*(1.0/(10**8)) #http://www.ncbi.nlm.nih.gov/pubmed/22914163 Kong et al 2012
+
+
+## Fixed global variable
+ALLREPEATTYPE=[1,2,3,4]
+ALLREPEATTYPENAME=['mono','di','tri','tetra']
+monomotif=['A','C']
+dimotif=['AC','AG','AT','CG']
+trimotif=['AAC','AAG','AAT','ACC','ACG','ACT','AGC','AGG','ATC','CCG']
+tetramotif=['AAAC','AAAG','AAAT','AACC','AACG','AACT','AAGC','AAGG','AAGT','AATC','AATG','AATT',\
+'ACAG','ACAT','ACCC','ACCG','ACCT','ACGC','ACGG','ACGT','ACTC','ACTG','AGAT','AGCC','AGCG','AGCT',\
+'AGGC','AGGG','ATCC','ATCG','ATGC','CCCG','CCGG','AGTC']
+ALLMOTIF={1:monomotif,2:dimotif,3:trimotif,4:tetramotif}
+monorange=range(5,60)
+dirange=range(6,60)
+trirange=range(9,60)
+tetrarange=range(12,80)
+ALLRANGE={1:monorange,2:dirange,3:trirange,4:tetrarange}
+
+#########################################
+######## Prob calculation sector ########
+#########################################
+def multinomial_prob(majorallele,STRlength,motif,probdatabase):
+    '''int,int,str,dict-->number
+    Look up the stored probability of observing a read of length STRlength
+    when the true allele is majorallele, for the given motif. The table is
+    nested as probdatabase[len(motif)][motif][majorallele][STRlength].
+    '''
+    return probdatabase[len(motif)][motif][majorallele][STRlength]
+
+################################################
+######## error model database sector ###########
+################################################
+
+## structure generator
+## errormodeldatabase[motif length][motif][major allele][read length] -> probability
+## sumbymajoralleledatabase[motif length][motif][major allele] -> total read count
+errormodeldatabase={1:{},2:{},3:{},4:{}}
+sumbymajoralleledatabase={1:{},2:{},3:{},4:{}}
+
+for repeattype in ALLREPEATTYPE:
+    for motif in ALLMOTIF[repeattype]:
+        errormodeldatabase[repeattype][motif]={}
+        sumbymajoralleledatabase[repeattype][motif]={}
+        for motifsize1 in ALLRANGE[repeattype]:
+            ## every read length in range starts at MINIMUMMUTABLE
+            errormodeldatabase[repeattype][motif][motifsize1]=dict.fromkeys(ALLRANGE[repeattype],MINIMUMMUTABLE)
+            sumbymajoralleledatabase[repeattype][motif][motifsize1]=0
+
+## read database
+## Tab-separated error profile columns used here:
+##   1 = major allele, 2 = observed read length, 3 = read count, 4 = motif
+## The profile is parsed once and reused by both passes below (it used to be
+## opened and read twice); the handle is closed even when a row is malformed.
+profilerows=[]
+fd=open(errorprofile)
+try:
+    for line in fd:
+        temp=line.strip().split('\t')
+        t_major,t_read,t_count,motif=int(temp[0]),int(temp[1]),int(temp[2]),temp[3]
+        profilerows.append((t_major,t_read,t_count,motif))
+finally:
+    fd.close()
+
+## get read count for each major allele
+## (a motif or major allele that is missing from the tables above raises KeyError)
+for t_major,t_read,t_count,motif in profilerows:
+    sumbymajoralleledatabase[len(motif)][motif][t_major]+=t_count
+
+## get probability
+## P(read length | major allele) = count for the pair / total count for the major allele.
+## Major alleles whose total is zero keep their initial MINIMUMMUTABLE entries.
+## t_read is not range-checked: a read length outside ALLRANGE simply adds a key.
+for t_major,t_read,t_count,motif in profilerows:
+    t_total=sumbymajoralleledatabase[len(motif)][motif][t_major]
+    if t_total>0:
+        errormodeldatabase[len(motif)][motif][t_major][t_read]=t_count/(t_total*1.0)
+#print errormodeldatabase
+#print math.log(100,10)
+#########################################
+######## input reading sector ###########
+#########################################
+
+
+
+## one output row per kept input row: input columns + probability + rearrangements + their product + depth
+fd = open(inputname)
+try:
+    for line in fd:
+        temp=line.strip().split('\t')
+        i_motif=motifsimplify(temp[MOTIFCOLUMN-1])
+        i_read=[int(length) for length in temp[ALLELECOLUMN-1].split(',')]
+        depth=len(i_read)
+        ## columns 7 and 8 carry the two alleles of the heterozygous form under test
+        heteromajor1=int(temp[6])
+        heteromajor2=int(temp[7])
+
+        ### calculate the chance to detect the combination (using error profile)
+        alist=[multinomial_prob(heteromajor1,x,i_motif,errormodeldatabase)for x in i_read]
+        blist=[multinomial_prob(heteromajor2,x,i_motif,errormodeldatabase)for x in i_read]
+        ablist=averagelist(alist,blist,EXPECTEDLEVELOFMINOR)
+        ## a read length that neither allele can produce makes the whole profile
+        ## impossible: skip the row rather than print a zero probability
+        if 0 in ablist:
+            continue
+        heterozygous_collector=reduce(lambda y, z: y*z,ablist )
+
+        ### prob of combination (using multinomial distribution)
+        ## groupby() only merges ADJACENT equal items, so the lengths are sorted first;
+        ## otherwise an unsorted profile such as 6,7,6 is counted as three groups and
+        ## the number of rearrangements is overestimated
+        frequency_distribution=[len(list(group)) for key, group in itertools.groupby(sorted(i_read))]
+        expandbypermutation=permuterepeat(depth,frequency_distribution)
+
+        print line.strip()+'\t'+str(heterozygous_collector)+'\t'+str(expandbypermutation)+'\t'+str(expandbypermutation*heterozygous_collector)+'\t'+str(depth)
+finally:
+    fd.close()
diff -r 000000000000 -r 20ab85af9505 microsatcompat.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/microsatcompat.py Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,36 @@
+import sys
+# remove all read that have unmatch microsat
+# check only one line at a time
+def complement_base(read):
+    '''str--> str
+    Return the base-by-base complement of read (A<->T, C<->G), in upper case.
+    The sequence is NOT reversed. Input is matched case-insensitively and
+    any character other than A/C/G/T is silently dropped from the result.
+    '''
+    pairing={'A':'T','T':'A','C':'G','G':'C'}
+    swapped=[]
+    for base in read:
+        if base.upper() in pairing:
+            swapped.append(pairing[base.upper()])
+    return ''.join(swapped)
+
+def makeallpossible(read):
+    '''Return every rotation of read, each one followed by its complement_base() form.'''
+    variants=[]
+    for offset,_ in enumerate(read):
+        rotated=read[offset:]+read[:offset]
+        variants.extend([rotated,complement_base(rotated)])
+    return variants
+
+
+firstcolumn=int(sys.argv[2])-1 #4
+secondcolumn=int(sys.argv[3])-1 # 10
+fd=open(sys.argv[1])
+try:
+    for line in fd: # plain iteration; file.xreadlines() is deprecated and gone in Python 3
+        temp=[field for field in line.strip().split('\t') if field] # empty columns are dropped before indexing, as before
+        ## keep the row only when the first motif is a rotation, or a complemented rotation, of the second
+        if temp[firstcolumn] in makeallpossible(temp[secondcolumn]):
+            print line.strip()
+finally:
+    fd.close() # the input handle was previously left open
\ No newline at end of file
diff -r 000000000000 -r 20ab85af9505 microsatcompat.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/microsatcompat.xml Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,76 @@
+
+
+ microsatcompat.py $input $column1 $column2 > $output
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool is used to select only the input lines which have compatible microsatellite motifs in two columns. Two motifs are compatible when one is the complement of the other, or when they have the same sequence after changing the starting point of the motif. For example, **A** is the same as **T**. Also, **AGG** is the same as **GAG**.
+
+For TRFM pipeline (profiling microsatellites in short read data), this tool can be used to make sure that the microsatellites in the reads have the same motif as the microsatellites in the reference at the corresponding mapped location.
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**
+
+**Input**
+
+The input files can be any tab delimited file.
+
+If this tool is used in TRFM microsatellite profiling, the input should contain:
+
+- Column 1 = microsatellite location in reference chromosome
+- Column 2 = microsatellite location in reference start
+- Column 3 = microsatellite location in reference stop
+- Column 4 = microsatellite location in reference motif
+- Column 5 = microsatellite location in reference length
+- Column 6 = microsatellite location in reference motif size
+- Column 7 = length of microsatellites (bp)
+- Column 8 = length of left flanking regions (bp)
+- Column 9 = length of right flanking regions (bp)
+- Column 10 = repeat motif (bp)
+- Column 11 = hamming distance
+- Column 12 = read name
+- Column 13 = read sequence with soft masking of microsatellites
+- Column 14 = read quality (the same Phred score scale as input)
+- Column 15 = read name (The same as column 12)
+- Column 16 = chromosome
+- Column 17 = left flanking region start
+- Column 18 = left flanking region stop
+- Column 19 = microsatellite start as infer from pair-end
+- Column 20 = microsatellite stop as infer from pair-end
+- Column 21 = right flanking region start
+- Column 22 = right flanking region stop
+- Column 23 = microsatellite length in reference
+- Column 24 = microsatellite sequence in reference
+
+**Output**
+
+The same as input format.
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r 20ab85af9505 microsatellite.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/microsatellite.py Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,1271 @@
+#!/usr/bin/env python
+"""
+Snoop thru a fasta file looking for microsatellite repeats of given periods
+Output format: length_of_repeat left_flank_length right_flank_length repeat_motif hamming_distance read_name read_sequence read_quality (additional columns)
+
+If the --r option is turned on, the output format will have these additional columns appended:
+read_name read_chr pre_s pre_e tr_s tr_e suf_s suf_e tr_len tr_ref_seq
+
+pre_s where the read start
+pre_e the last position before microsatellite
+tr_s where microsatellite start
+tr_e where microsatellite end
+suf_s first base after microsatellite
+tr_ref_seq reference sequence corresponding to microsatellite
+
+* output positions are 0 based
+
+:Author: Chen Sun (cxs1031@cse.psu.edu); Bob Harris (rsharris@bx.psu.edu)
+
+Modification log:
+
+09/27/2013
+Replaced function dense_intervals with function non_negative_intervals, which does not need to import that module.
+
+10/18/2013
+Modified function find_repeat_element to run faster under the condition that hamming_distance = 0, which means no mutation/indel is allowed.
+
+02/25/2014
+Added functionality to deal with mapped reads,
+with additional output.
+
+02/28/2014
+Modified the 0-based end point: a 0-based interval is half-open [ ),
+so the 0-based end position should always be incremented by 1.
+
+03/05/2014
+Added support for multi-fasta.
+"""
+from sys import argv,stdin,stderr,exit
+from string import maketrans
+from md5 import new as md5_new
+import re
+#from pyfracluster import dense_intervals
+
+def usage(s=None):
+    """Terminate the program, using the command-line help as the exit message.
+
+    When *s* is given it is placed on its own line ahead of the help text.
+    The combined text is handed to exit() (imported from sys).
+    """
+    message = """
+usage: microsat_snoop [fasta_file] [options]
+ Name of file to read sequences from; if absent,
+ sequences are read from stdin
+ --fasta Input file is in fasta format
+ (this is the default)
+ --fastq Input file is in fastq format
+ (default is fasta unless filename is .fastq)
+ --fastq:noquals Input file is in fastq format, but discard quals
+ --sam Input file is SAM file
+ --r Indicate additional output information, if indicated,
+ --ref option is mendatory
+ --ref= Reference file (absolute) path
+ --period= (mandatory,cumulative) repeat length(s) to be
+ searched for
+ is expected to be small, less than 10
+ can also be a comma-separated list, or
+ a range ..
+ --rate= control the candidate repeat interval detector;
+ it will consider intervals with at least
+ of matches when shifted by the period;
+ is between 0 and 1 and can be either a
+ real number or /
+ (default is 6/7)
+ --minlength= minimum length of intervals reported, in bp
+ (default is 20)
+ --progress= how often to report the sequence we're searching
+ (default is no progress report)
+ --allowduplicates process all input sequences
+ (this is the default)
+ --noduplicates ignore any input sequence that's the same as an
+ earlier sequence
+ --nonearduplicates ignore any input sequence that has the same first
+ 100 bp as an earlier sequence
+ --nonearduplicate= ignore any input sequence that has the same first
+ bp as an earlier sequence
+ --hamming= Don't report candidate repeat intervals that have
+ more than mismatches
+ (default is to do no such filtering)
+ --prefix= Don't report candidate repeat intervals that
+ start within of the sequence start
+ (default is to do no such filtering)
+ --suffix= Don't report candidate repeat intervals that
+ end within of the sequence end
+ (default is to do no such filtering)
+ --subsample=/ Process only the th sequence of every group of
+ sequences; ranges from 1 to
+ --multipleruns Consider all candidate intervals in a sequence
+ (default is to consider only the longest)
+ --partialmotifs Consider microatelites with a partial motif
+ (default is to consider only whole motifs)
+ --splitbyvalidity Preprocess sequences, splitting at Ns; this
+ prevents candidates from including Ns
+ (default is not to split)
+ --noflankdisplay Show entire sequence as flanking regions
+ (this is the default)
+ --flankdisplay= Limit length of flanking regions shown
+ --readnamesuffix= Root of suffix to append to read names; e.g. 1
+ for forward, 2 for reverse; this triggers other
+ info to be included in the suffix
+ (default is "1" for fastq; no suffix for fasta)
+ --head= limit the number of sequences processed
+ --markend Write a marker line upon completion
+ (default is not to write a marker)
+ --help=details Describe the process, and quit"""
+
+    # an optional complaint goes first, followed by the full help text
+    report = message if (s is None) else ("%s\n%s" % (s,message))
+    exit (report)
+
+
+# Long-form description of the detection process; main() passes this text to
+# exit() when the "--help=details" option is given.
+detailedDescription = """In broad terms, the process works as follows:
+
+(1) Identify intervals that are highly correlated with the interval shifted by
+ P (the repeat period). These intervals are called "runs" or "candidates".
+ The level of correlation required is controlled by rateThreshold.
+ Depending on whether we want to look for more than one microsat, we either
+ find the longest such run (simple algorithm) or many runs (more complicated
+ algorithm). The following steps are then performed on each run.
+
+(2) Find the most likely repeat motif in the run. This is done by counting
+ all kmers (of length P) and choosing the most frequent. If that kmer is
+ itself covered by a sub-repeat we discard this run. The idea is that we
+ can ignore a 6-mer like ACGACG because we will find it when we are looking
+ for 3-mers.
+
+(3) Once we identify the most likely repeat motif, we then modify the
+ interval, adjusting start and end to find the interval that has the fewest
+ mismatches vs. a sequence of the motif repeated (hamming distance). Only
+ whole copies of the motif are considered.
+
+(4) At this point we have a valid microsat interval (in the eyes of the
+ program). It is subjected to some filtering stages (hamming distance or too
+ close to an end), and if it satisfies those conditions, it's reported to
+ the user."""
+
+def main():
+ """Command-line entry point.
+
+ Parses the options in argv, reads sequences with the reader selected for
+ the input format, searches each sequence for repeat runs of every
+ requested period, and prints one tab-separated line per run that passes
+ the length/hamming/prefix/suffix filters. With --r, coordinates on the
+ reference (derived from the SAM CIGAR string) are appended to each line.
+ """
+ global debug
+
+ #=== parse the command line ===
+
+ inputFilename = None
+ referenceFileName = None #add by Chen Sun on 02/25
+ inputFormat = None
+ repeatPeriods = []
+ rateThreshold = 6 / 7.0
+ lengthThreshold = 20
+ reportProgress = None
+ discardDuplicates = False
+ discardNearDuplicates = False
+ nearDuplicatePrefix = 100
+ # NOTE(review): the default is 0, so the "d > hammingThreshold" filter below
+ # is active by default, whereas the usage text says the default is to do no
+ # hamming filtering -- confirm which is intended
+ hammingThreshold = 0
+ prefixThreshold = None
+ suffixThreshold = None
+ subsampleK = None
+ subsampleN = None
+ reportMultipleRuns = False
+ allowPartialMotifs = False
+ splitByValidity = False
+ flankDisplayLimit = None
+ readNameSuffix = None
+ headLimit = None
+ markEndOfFile = False
+ additionalInfo = False
+ debug = []
+
+ for arg in argv[1:]:
+ if (arg == "--fasta"):
+ inputFormat = "fasta"
+ elif (arg == "--fastq"):
+ inputFormat = "fastq"
+ elif (arg == "--fastq:noquals"):
+ inputFormat = "fastq:noquals"
+ elif (arg == "--sam"):
+ inputFormat = "sam"
+ elif (arg == "--r"):
+ additionalInfo = True
+ elif (arg.startswith("--ref=")):
+ referenceFileName = arg.split("=",1)[1]
+ elif (arg.startswith("--period=")):
+ # accepts a single period, a comma-separated list, and/or low..high
+ # ranges; repeated --period options accumulate
+ val = arg.split("=",1)[1]
+ for period in val.split(","):
+ if (".." in period):
+ (lowPeriod,highPeriod) = period.split("..",1)
+ lowPeriod = int(lowPeriod)
+ highPeriod = int(highPeriod)
+ for period in xrange(lowPeriod,highPeriod+1):
+ repeatPeriods += [period]
+ else:
+ repeatPeriods += [int(period)]
+ elif (arg.startswith("--rate=")):
+ val = arg.split("=",1)[1]
+ rateThreshold = float_or_fraction(val)
+ assert (0.0 < rateThreshold <= 1.0), "%s not a valid rate" % val
+ elif (arg.startswith("--minlength=")):
+ val = arg.split("=",1)[1]
+ lengthThreshold = int(val)
+ assert (lengthThreshold >= 0)
+ elif (arg.startswith("--progress=")):
+ val = arg.split("=",1)[1]
+ reportProgress = int(val)
+ elif (arg == "--allowduplicates"):
+ discardDuplicates = False
+ discardNearDuplicates = False
+ elif (arg == "--noduplicates"):
+ discardDuplicates = True
+ discardNearDuplicates = False
+ elif (arg == "--nonearduplicates"):
+ discardDuplicates = False
+ discardNearDuplicates = True
+ elif (arg.startswith("--nonearduplicate=")):
+ val = arg.split("=",1)[1]
+ discardDuplicates = False
+ discardNearDuplicates = True
+ nearDuplicatePrefix = int(val)
+ assert (nearDuplicatePrefix > 0)
+ elif (arg.startswith("--hamming=")):
+ val = arg.split("=",1)[1]
+ hammingThreshold = int(val)
+ assert (hammingThreshold >= 0)
+ elif (arg.startswith("--prefix=")):
+ val = arg.split("=",1)[1]
+ prefixThreshold = int(val)
+ assert (prefixThreshold >= 0)
+ elif (arg.startswith("--suffix=")):
+ val = arg.split("=",1)[1]
+ suffixThreshold = int(val)
+ assert (suffixThreshold >= 0)
+ elif (arg.startswith("--subsample=")):
+ val = arg.split("=",1)[1]
+ # NOTE(review): maxsplit=2 lets a value like "1/2/3" produce three
+ # parts, which then fails the two-name unpack; 1 was probably intended
+ (k,n) = val.split("/",2)
+ subsampleK = int(k)
+ subsampleN = int(n)
+ assert (0 < subsampleK <= subsampleN)
+ elif (arg == "--multipleruns"):
+ reportMultipleRuns = True
+ elif (arg == "--partialmotifs"):
+ allowPartialMotifs = True
+ elif (arg == "--splitbyvalidity"):
+ splitByValidity = True
+ elif (arg == "--noflankdisplay"):
+ flankDisplayLimit = None
+ elif (arg.startswith("--flankdisplay=")):
+ val = arg.split("=",1)[1]
+ flankDisplayLimit = int(val)
+ assert (flankDisplayLimit >= 0)
+ # NOTE(review): the prefix tested here has no "=", so a bare
+ # "--readnamesuffix" reaches split("=",1)[1] and raises IndexError
+ elif (arg.startswith("--readnamesuffix")):
+ readNameSuffix = arg.split("=",1)[1]
+ elif (arg.startswith("--head=")):
+ headLimit = int_with_unit(arg.split("=",1)[1])
+ elif (arg == "--markend"):
+ markEndOfFile = True
+ elif (arg == "--help=details"):
+ exit (detailedDescription)
+ elif (arg.startswith("--debug=")):
+ debug += (arg.split("=",1)[1]).split(",")
+ elif (arg.startswith("--")):
+ usage("unrecognized option: %s" % arg)
+ elif (inputFilename == None):
+ # the first non-option argument is taken as the input file name
+ inputFilename = arg
+ else:
+ usage("unrecognized option: %s" % arg)
+
+ #=== determine periods of interest ===
+
+ if (repeatPeriods == []):
+ usage("you gotta give me a repeat period")
+
+ if (additionalInfo == True):
+ if (referenceFileName == None):
+ usage("reference file path needed. use --ref= to indicate")
+
+ # the dict keys de-duplicate the requested periods
+ periodSeed = {}
+ for period in repeatPeriods:
+ if (period < 1): usage("period %d is not valid" % period)
+ periodSeed[period] = True
+
+ repeatPeriods = [period for period in periodSeed]
+ repeatPeriods.sort()
+
+ #=== determine input format ===
+
+ # an explicit format option wins; otherwise the file extension decides,
+ # with fasta as the fallback
+ if (inputFormat == "fasta"): sequence_reader = fasta_sequences
+ elif (inputFormat == "fastq"): sequence_reader = fastq_sequences
+ elif (inputFormat == "fastq:noquals"): sequence_reader = fastq_sequences
+ elif (inputFormat == "sam"): sequence_reader = sam_sequences
+ elif (inputFilename == None): sequence_reader = fasta_sequences
+ elif (inputFilename.endswith(".fastq")): sequence_reader = fastq_sequences
+ elif (inputFilename.endswith(".fq")): sequence_reader = fastq_sequences
+ elif (inputFilename.endswith(".sam")): sequence_reader = sam_sequences
+ else: sequence_reader = fasta_sequences
+
+ if (inputFilename != None): inputF = file(inputFilename,"rt")
+ else: inputF = stdin
+
+ if (readNameSuffix == None) \
+ and (sequence_reader == fastq_sequences) \
+ and (inputFormat != "fastq:noquals"):
+ readNameSuffix = "1"
+
+ #=== process the sequences ===
+
+ # with --r, load the whole (multi-)fasta reference into memory as a dict
+ # mapping the full header text after ">" to the concatenated sequence
+ refSequence = {}
+ rightName = ""
+ sequence = ""
+ if additionalInfo:
+ firstFasta = True
+ originalRefF = open(referenceFileName)
+ for line in originalRefF.readlines():
+ line = line.replace('\r','')
+ line = line.replace('\n','')
+ if line.startswith(">"):
+ if firstFasta:
+ firstFasta = False
+ else:
+ refSequence[rightName] = sequence
+ rightName = line[1:]
+ sequence = ""
+ continue
+ sequence += line
+ originalRefF.close()
+ refSequence[rightName] = sequence
+
+ sequenceSeen = {}
+
+ numSequences = 0
+ for seqInfo in sequence_reader(inputF):
+ numSequences += 1
+ if (headLimit != None) and (numSequences > headLimit):
+ print >>stderr, "limit of %d sequences reached" % headLimit
+ break
+
+ if (sequence_reader == sam_sequences):
+ #seqName,"".join(seqNucs).upper().translate(nonDnaMap), refName, pre_s, cigar
+ (name, sequence, refName, pre_s, cigar) = seqInfo
+ quals = None
+ elif (sequence_reader == fastq_sequences):
+ (name,sequence,quals) = seqInfo
+ if (inputFormat == "fastq:noquals"): quals = None
+ else:
+ (name,sequence) = seqInfo
+ quals = None
+
+ if (reportProgress != None) and (numSequences % reportProgress == 0):
+ print >>stderr, "%s %d" % (name,numSequences)
+
+ # if we're subsampling and not interested in this sequence, skip it
+
+ if (subsampleN != None):
+ if ((numSequences-1) % subsampleN != (subsampleK-1)):
+ continue
+
+ # if this sequence is shorter than the length of interest, skip it
+
+ # NOTE(review): "period" here is whatever value an earlier loop left
+ # bound, not a value chosen for this test -- confirm whether the
+ # smallest requested period was intended
+ seqLen = len(sequence)
+ if (seqLen < period) or (seqLen < lengthThreshold): continue
+
+ # if we're not interested in duplicates and this is one, skip it;
+ # note that we assume no hash collisions occur, i.e. that all hash
+ # matches are truly sequence matches
+
+ if (discardDuplicates):
+ h = hash108(sequence)
+ if (h in sequenceSeen): continue
+ sequenceSeen[h] = True
+ elif (discardNearDuplicates):
+ h = hash108(sequence[:nearDuplicatePrefix])
+ if (h in sequenceSeen): continue
+ sequenceSeen[h] = True
+
+ # split the sequence into chunks of valid nucleotides
+
+ if (splitByValidity):
+ chunks = [(start,end) for (start,end) in nucleotide_runs(sequence)]
+ else:
+ chunks = [(0,len(sequence))]
+
+ # evaluate for each period of interest
+
+ for period in repeatPeriods:
+
+ # operate on each chunk
+
+ for (chunkStart,chunkEnd) in chunks:
+ chunkLen = chunkEnd - chunkStart
+ if (chunkLen < period) or (chunkLen < lengthThreshold): continue
+
+ if ("validity" in debug) or ("correlation" in debug) or ("runs" in debug):
+ print >>stderr, ">%s_%d_%d" % (name,chunkStart,chunkEnd)
+
+ # compute correlation sequence
+
+ corr = correlation_sequence(sequence,period,chunkStart,chunkEnd)
+
+ if ("correlation" in debug) or ("runs" in debug):
+ print >>stderr, sequence[chunkStart:chunkEnd]
+ print >>stderr, corr
+
+ # find runs (candidates for being a microsat)
+
+ if (reportMultipleRuns):
+ runs = all_suitable_runs(corr,lengthThreshold-period,rateThreshold, hammingThreshold)
+ else:
+ runs = longest_suitable_run(corr,lengthThreshold,rateThreshold)
+ if (runs == []): continue
+
+
+ if ("runs" in debug):
+ for (start,end) in runs:
+ run = [" "] * seqLen
+ for ix in xrange(start-period,end):
+ run[ix] = "*"
+ print >>stderr, "".join(run)
+
+ if ("candidates" in debug):
+ for (start,end) in runs:
+ print >>stderr, "%s %d %d" % (name,start,end)
+
+ # process runs and report those that pass muster
+
+ runCount = 0
+ for (start,end) in runs:
+ runCount += 1
+
+ # run coordinates index into corr, i.e. are relative to the
+ # chunk; shift them onto the sequence, and back the start up
+ # by one period
+ start = chunkStart + start - period
+ end = chunkStart + end
+
+ (kmer,d,start,end) = find_repeat_element(hammingThreshold, period,sequence,start,end,allowPartials=allowPartialMotifs)
+ if (kmer == None): continue # (no useful repeat kmer was found)
+
+ rptExtent = end - start
+ prefixLen = start
+ suffixLen = seqLen - end
+ if (rptExtent <= period): continue
+ if (hammingThreshold != None) and (d > hammingThreshold): continue
+ if (prefixThreshold != None) and (prefixLen < prefixThreshold): continue
+ if (suffixThreshold != None) and (suffixLen < suffixThreshold): continue
+
+ # the repeat itself is shown in lower case, flanks unchanged
+ if (flankDisplayLimit == None):
+ seq = sequence[:start] \
+ + sequence[start:end].lower() \
+ + sequence[end:]
+ else:
+ seq = sequence[max(chunkStart,start-flankDisplayLimit):start] \
+ + sequence[start:end].lower() \
+ + sequence[end:min(chunkEnd,end+flankDisplayLimit)]
+ reportName = name
+ if (readNameSuffix != None):
+ reportName += "_"+readNameSuffix+"_per"+str(period)+"_"+str(runCount)
+ # NOTE(review): quals is rebound here for every reported run, so
+ # when one read yields several runs the real quality string gets
+ # an extra leading tab each time -- confirm and use a separate
+ # variable for the formatted column
+ if (quals == None or quals == "." or quals == "\t."): quals = "\t."
+ else: quals = "\t" + quals
+ if not additionalInfo:
+ print "%d\t%d\t%d\t%s\t%d\t%s\t%s%s" \
+ % (rptExtent,prefixLen,suffixLen,kmer,d,reportName,seq,quals)
+ else:
+ # NOTE(review): pre_s, cigar and refName are only assigned for
+ # SAM input above; --r with another input format would reach
+ # this point with those names unbound
+ #pre_e = pre_s + prefixLen - 1
+ refPoint = pre_s
+ donorPoint = 0
+
+ # positions of interest in read ("donor") coordinates
+ donorBeforeStart = prefixLen - 1 #pre_e
+ donorMicroStart = prefixLen #tr_s
+ donorMicroEnd = donorMicroStart + rptExtent - 1 #tr_e
+ donorAfterMicro = donorMicroEnd + 1 #suf_s
+ donorEnd = len(seq) - 1 #suf_e
+
+ set_pre_e = False
+ set_tr_s = False
+ set_tr_e = False
+ set_suf_s = False
+ set_suf_e = False
+
+ pre_e = 0
+ tr_s = 0
+ tr_e = 0
+ suf_s = 0
+ suf_e = 0
+
+ # walk the CIGAR operations, advancing the read position and the
+ # reference position in step; each read position of interest is
+ # converted to a reference position the first time the walk
+ # reaches or passes it
+ matchList = re.findall('(\d+)([IDM])', cigar)
+ unCognitiveCigar = False
+ for matchN, matchType in matchList:
+ matchNum = int(matchN)
+ if matchType == "M":
+ donorPoint = donorPoint + matchNum
+ refPoint = refPoint + matchNum
+ elif matchType == "D":
+ refPoint = refPoint + matchNum
+ continue
+ elif matchType == "I":
+ donorPoint = donorPoint + matchNum
+ else:
+ # the pattern above only captures I, D and M, so this branch
+ # is not reached for any tuple findall returns
+ unCognitiveCigar = True
+ break
+
+ if not set_pre_e:
+ if donorPoint >= donorBeforeStart:
+ pre_e = refPoint - (donorPoint - donorBeforeStart)
+ set_pre_e = True
+ else:
+ continue
+
+ if not set_tr_s:
+ if donorPoint >= donorMicroStart:
+ tr_s = refPoint - (donorPoint - donorMicroStart)
+ set_tr_s = True
+ else:
+ continue
+
+ if not set_tr_e:
+ if donorPoint >= donorMicroEnd:
+ tr_e = refPoint - (donorPoint - donorMicroEnd)
+ set_tr_e = True
+ else:
+ continue
+
+ if not set_suf_s:
+ if donorPoint >= donorAfterMicro:
+ suf_s = refPoint - (donorPoint - donorAfterMicro)
+ set_suf_s = True
+ else:
+ continue
+
+ if not set_suf_e:
+ if donorPoint >= donorEnd:
+ suf_e = refPoint - (donorPoint - donorEnd)
+ set_suf_e = True
+ else:
+ continue
+
+ if unCognitiveCigar:
+ break
+ tr_len = tr_e - tr_s + 1
+
+ # "." is reported when the reference sequence is unknown, empty,
+ # or too short to contain the interval
+ if refName not in refSequence:
+ tr_ref_seq = "."
+ else:
+ if refSequence[refName] == "":
+ tr_ref_seq = "."
+ elif len(refSequence[refName]) <= tr_e:
+ tr_ref_seq = "."
+ else:
+ tr_ref_seq = refSequence[refName][tr_s:tr_e+1]
+
+ # end positions were computed as inclusive; add 1 so the reported
+ # 0-based intervals are half-open
+ pre_e += 1
+ tr_e += 1
+ suf_e += 1
+ print "%d\t%d\t%d\t%s\t%d\t%s\t%s%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s" \
+ % (rptExtent,prefixLen,suffixLen,kmer,d,reportName,seq,quals,reportName,refName,pre_s,pre_e,tr_s,tr_e,suf_s,suf_e,tr_len,tr_ref_seq)
+
+ if (markEndOfFile):
+ print "# microsat_snoop end-of-file"
+
+ if (inputF != stdin):
+ inputF.close()
+
+# non_negative_intervals--
+#  Find every maximal run of "+" (containing no "-") in a correlation string
+#  such as  +++++++++---+++++++++  and report each run as a half-open
+#  (start,end) interval.
+
+def non_negative_intervals(seq, minLength=None):
+    """Yield (start,end) for each run of "+" in seq that is free of "-".
+
+    seq       -- string of "+", "-" and other characters; a run is opened by
+                 a "+" and closed by the next "-"; any other character
+                 neither opens nor closes a run
+    minLength -- runs shorter than this are not reported; None means no
+                 minimum
+
+    Intervals are half-open: end is the index just past the run.
+    """
+    start = -1                      # index where the open run began; -1 means no run is open
+    for (ix,ch) in enumerate(seq):
+        if (ch == "+"):
+            if (start < 0):
+                start = ix
+        elif (ch == "-"):
+            if (start >= 0):
+                if (minLength is None) or (ix - start >= minLength):
+                    yield (start,ix)
+            start = -1
+
+    # flush a run that is still open at the end of seq; the test must be >= 0
+    # (not > 0) so that a run beginning at index 0 is not lost
+    if (start >= 0):
+        if (minLength is None) or (len(seq) - start >= minLength):
+            yield (start,len(seq))
+
+
+###################################################################
+# modified by Chen Sun on 7/11/2014
+# We do not want other modules, so parse these functions inside
+#
+###################################################################
+
+# parse_spec--
+#  Parse a string of the form {positives}/{positives_and_neutrals} and
+#  return (positives,neutrals); ValueError is raised for any other form, or
+#  if some positive character is missing from the second group.
+
+def parse_spec(s):
+    (numerText,slash,denomText) = s.partition("/")
+    if (not slash): raise ValueError
+
+    # both halves must be wrapped in braces
+    for part in (numerText,denomText):
+        if not (part.startswith("{") and part.endswith("}")): raise ValueError
+
+    positives = numerText[1:-1]
+    everything = denomText[1:-1]
+
+    if any(ch not in everything for ch in positives): raise ValueError
+
+    # whatever is listed in the second group but is not a positive is neutral
+    neutrals = [ch for ch in everything if (ch not in positives)]
+    return (positives,neutrals)
+
+
+# float_or_fraction--
+#  Convert a string to a float; the string is either a plain number or a
+#  fraction written as numerator/denominator.
+
+def float_or_fraction(s):
+    if ("/" not in s):
+        return float(s)
+
+    # only the first slash separates numerator from denominator
+    pieces = s.split("/",1)
+    return float(pieces[0]) / float(pieces[1])
+
+
+# dense_intervals--
+# Find all non-overlapping runs with a good enough rate (of positives), and
+# which meet our length threshold.
+#
+# The algorithm used is adapted from Zhang, Berman, Miller, "Post-processing
+# long pairwise alignments", Bioinformatics Vol. 15 no. 12 1999.
+#
+# Arguments:
+# seq        the string to scan
+# rate       each positive scores 1-rate, each neutral scores -rate
+# positives  characters counted as positives
+# neutrals   characters counted as neutrals
+# blockers   characters that end all open intervals; None means every
+#            character (codes 1..255) that is neither positive nor neutral;
+#            the default "" blocks nothing
+# minLength  intervals shorter than this are not reported (None = no minimum)
+#
+# Yields half-open (left,right) intervals, trimmed so that both ends are
+# positives. A character that is not a positive, a neutral or a blocker
+# raises ValueError.
+#
+# $$$ we use the denominator as the threshold, but we really should use the
+# $$$ .. numerator, comparing it to minLength*rate
+
+def dense_intervals(seq,rate,positives,neutrals,blockers="",minLength=None):
+
+ if (blockers == None):
+ blockers = "".join([chr(n) for n in range(1,256)
+ if (chr(n) not in positives)
+ and (chr(n) not in neutrals)])
+
+ stackLeft = [None] # stack with each entry containing five
+ stackRight = [None] # .. elements; note that entry zero is not
+ stackLeftScore = [None] # .. used
+ stackRightScore = [None]
+ stackLower = [None]
+ top = 0
+ score = 0
+
+ for ix in range(len(seq)):
+ ch = seq[ix]
+ if (ch in blockers):
+ # emit intervals
+
+ for sp in range(1,top+1):
+ left = stackLeft [sp] + 1
+ right = stackRight[sp]
+
+ # trim non-positive characters from both ends
+ while (left < right) and (seq[left] not in positives): left += 1
+ while (right > left) and (seq[right] not in positives): right -= 1
+
+ right += 1
+ if (minLength == None) or (right - left >= minLength):
+ yield (left,right)
+
+ #empty stack
+
+ stackLeft = [None]
+ stackRight = [None]
+ stackLeftScore = [None]
+ stackRightScore = [None]
+ stackLower = [None]
+ top = 0
+ score = 0
+ continue
+
+ if (ch in positives): weight = 1-rate
+ elif (ch in neutrals): weight = -rate
+ else: raise ValueError
+
+ score += weight
+ #if ("algorithm" in debug):
+ # print >>sys.stderr, "%3d: %c %5.2f" % (ix, ch, score),
+
+ # only positively-weighted sites extend or create intervals
+ if (weight < 0):
+ #if ("algorithm" in debug):
+ # print >>sys.stderr
+ continue
+
+ if (top > 0) and (stackRight[top] == ix-1):
+ # add this site to the interval on top of the stack
+
+ stackRight [top] = ix
+ stackRightScore[top] = score
+
+ #if ("algorithm" in debug):
+ # print >>sys.stderr, \
+ # " extending [%d] %d-%d %4.1f %4.1f" \
+ # % (top,
+ # stackLeft [top], stackRight [top],
+ # stackLeftScore[top], stackRightScore[top]),
+
+ else:
+ # create a one site interval
+
+ top += 1
+ if (top >= len(stackLeft)):
+ stackLeft += [None]
+ stackRight += [None]
+ stackLeftScore += [None]
+ stackRightScore += [None]
+ stackLower += [None]
+
+ stackLeft [top] = ix - 1
+ stackLeftScore [top] = score - weight
+ stackRight [top] = ix
+ stackRightScore[top] = score
+ stackLower [top] = top - 1
+
+ # stackLower[top] is moved down the stack past entries whose left
+ # score is higher than this interval's left score
+ while (stackLower[top] > 0) \
+ and (stackLeftScore[stackLower[top]] > stackLeftScore[top]):
+ stackLower[top] = stackLower[stackLower[top]]
+
+ #if ("algorithm" in debug):
+ # print >>sys.stderr, \
+ # " creating [%d] %d-%d %4.1f %4.1f -> %d" \
+ # % (top,
+ # stackLeft [top], stackRight [top],
+ # stackLeftScore[top], stackRightScore[top],
+ # stackLower [top]),
+
+ # merge intervals; if there is a previous interval with a no-higher
+ # left score and no-higher right score, merge this interval (and all
+ # intervening ones) into that one
+
+ while (top > 1) \
+ and (stackLower[top] > 0) \
+ and (stackRightScore[stackLower[top]] <= stackRightScore[top]):
+ stackRight [stackLower[top]] = stackRight [top]
+ stackRightScore[stackLower[top]] = stackRightScore[top]
+ top = stackLower[top]
+
+ #if ("algorithm" in debug):
+ # print >>sys.stderr, \
+ # "\n%*s merging [%d] %d-%d %4.1f %4.1f" \
+ # % (13, "", top,
+ # stackLeft[top], stackRight [top],
+ # stackLeftScore[top], stackRightScore[top]),
+
+ #if ("algorithm" in debug):
+ # print >>sys.stderr
+
+ # emit intervals
+
+ # (same emission step as at a blocker, for whatever is left on the stack
+ # when the end of seq is reached)
+ for sp in range(1,top+1):
+ left = stackLeft [sp] + 1
+ right = stackRight[sp]
+
+ while (left < right) and (seq[left] not in positives): left += 1
+ while (right > left) and (seq[right] not in positives): right -= 1
+
+ right += 1
+ if (minLength == None) or (right - left >= minLength):
+ yield (left,right)
+
+
+###################################################################
+# modified by Chen Sun on 7/11/2014
+#
+###################################################################
+
+# correlation_sequence--
+#  Build the correlation string of sequence[start:end] for one period P:
+#  position i holds "+" when the base there is one of A, C, G, T and equals
+#  the base P positions earlier, and "-" otherwise. The first P positions
+#  are blank. A position that lies in a single-character run of length P or
+#  more is always "-", unless the period is 1.
+
+def correlation_sequence(sequence,period,start=None,end=None):
+    if (start is None): start = 0
+    if (end is None): end = len(sequence)
+
+    # prime the single-character run counter over the first P bases
+    lastCh = sequence[start]
+    sameCount = 1
+    pos = start + 1
+    while (pos < start+period):
+        here = sequence[pos]
+        sameCount = (sameCount + 1) if (here == lastCh) else 1
+        lastCh = here
+        pos += 1
+
+    marks = [" "] * period
+    pos = start + period
+    while (pos < end):
+        here = sequence[pos]
+        sameCount = (sameCount + 1) if (here == lastCh) else 1
+        lastCh = here
+        isMatch = (here in "ACGT") \
+              and (here == sequence[pos-period]) \
+              and ((period == 1) or (sameCount < period))
+        marks.append("+" if isMatch else "-")
+        pos += 1
+
+    return "".join(marks)
+
+
+# longest_suitable_run--
+# Find longest run with a good enough rate (of positives).
+#
+# We score a "+" as 1-r and anything else as -r. This is based on the fol-
+# lowing derivation (p is the number of "+"s, n is the number of non-"+"s):
+# p/(p+n) >= r
+# ==> p >= rp + rn
+# ==> (1-r)p - rn >= 0
+#
+# We adapt an algorithm from "Programming Pearls", pg. 81 (2000 printing).
+#
+# $$$ we use the denominator as the threshold, but we really should use the
+# $$$ .. numerator, comparing it to minLength*rate
+#
+# $$$ this needs to account for $$$ this situation:
+# $$$ sequence: ACGACGACGACGTTATTATTATTA
+# $$$ matches: +++++++++---+++++++++
+# $$$ this is currently considered to be one interval (if rate <= 6/7), but it
+# $$$ ought to be two; we can't just post-process, though, because some other
+# $$$ interval might be longer than the longest half of this; maybe what we
+# $$$ need to do is consider matches at distances -P and -2P, or if we match
+# $$$ -P but that itself was a mismatch, we should carry the mismatch forward
+
+def longest_suitable_run(seq,minLength,rate):
+    """Return [(start,end)] for the best-scoring run in seq, or [].
+
+    seq       -- correlation string; "+" scores 1-rate, anything else -rate
+    minLength -- a run whose extent (end-start) is shorter is not reported
+    rate      -- required fraction of "+" within the run
+
+    The interval is half-open. An empty list is returned when no run was
+    found or when the best run is shorter than minLength.
+    """
+    maxEndingHere = 0
+    maxSoFar      = 0
+    # index of the most recent position at which the running score was reset;
+    # it starts at -1 so that a run beginning at index 0 (no reset seen yet)
+    # gets start = 0 instead of referencing an unassigned variable
+    block         = -1
+    start = end   = None
+
+    for (ix,ch) in enumerate(seq):
+        s = (1-rate) if (ch == "+") else -rate
+
+        if (maxEndingHere+s < 0):
+            # the run ending here has dropped below the rate; start over
+            maxEndingHere = 0
+            block         = ix
+            continue
+
+        maxEndingHere += s
+        if (maxEndingHere >= maxSoFar):
+            maxSoFar = maxEndingHere
+            start    = block + 1
+            end      = ix + 1
+
+    if (start is None) or (end - start < minLength):
+        return []
+    return [(start,end)]
+
+
+# all_suitable_runs--
+# Find all non-overlapping runs with a good enough rate (of positives), and
+# which meet our length threshold.
+# $$$ this needs to post-process the intervals, splitting them to account for
+# $$$ this situation:
+# $$$ sequence: ACGACGACGACGTTATTATTATTA
+# $$$ matches: +++++++++---+++++++++
+# $$$ this is currently reported as one interval (if rate <= 6/7), but it
+# $$$ ought to be two
+
+def all_suitable_runs(seq,minCorrLength,rate, hammingThreshold):
+    """Return the list of (start,end) runs found in correlation string seq.
+
+    When mismatches are tolerated (hammingThreshold > 0) the rate-based
+    dense_intervals() search is used; when hammingThreshold is 0 only
+    mismatch-free runs from non_negative_intervals() are returned. Any
+    other threshold value gives None.
+    (dispatch on hammingThreshold added by Chen Sun on 07/11/2014)
+    """
+    if (hammingThreshold > 0):
+        finder = dense_intervals(seq,rate,"+","-",blockers=None,minLength=minCorrLength)
+    elif (hammingThreshold == 0):
+        finder = non_negative_intervals(seq, minLength=minCorrLength)
+    else:
+        return None
+
+    return list(finder)
+
+
+# find_repeat_element--
+# Find the most plausible repeat element for a run, and nudge the ends of
+# the run if needed. Note that we will not consider kmers that represent
+# shorter repeats. For example, we won't report ACTACT as a 6-mer since we
+# consider this to have a shorter period than 6.
+#
+# Returns (kmer,hammingDistance,start,end); kmer is None when no usable
+# motif was found. When hammingThreshold > 0 the work is delegated to
+# find_hamming_repeat_element(); otherwise the distance is always reported
+# as 0 and only the end of the interval is adjusted.
+
+def find_repeat_element(hammingThreshold, period,seq,start,end,allowPartials=False):
+
+ if hammingThreshold > 0:
+ (kmer,bestD,bestStart,bestEnd) = find_hamming_repeat_element(period,seq,start,end,allowPartials)
+ return (kmer,bestD,bestStart,bestEnd)
+ # count the number of occurences of each k-mer; note that we can't
+ # reject kmers containing smaller repeats yet, since for a sequence like
+ # ACACACACACAAACACACACACACACACAC we must first discover ACACAC as the best
+ # 6-mer, and THEN reject it; if we reject ACACAC while counting, we'd end
+ # up reporting something like ACACAA as the best motif
+
+ if ("element" in debug):
+ print >>stderr, "find_repeat_element(%d,%d,%d)" % (period,start,end)
+
+ if ("partial" in debug):
+ print period, seq, start, end, allowPartials;
+ print seq[start:end]
+
+ kmerToCount = {}
+ kmerToFirst = {}
+ for ix in xrange(start,end-(period-1)):
+ kmer = seq[ix:ix+period]
+ if ("N" in kmer): continue
+ if (kmer not in kmerToCount):
+ kmerToCount[kmer] = 1
+ kmerToFirst[kmer] = ix
+ else:
+ kmerToCount[kmer] += 1
+ #if ("element" in debug):
+ # print >>stderr, " %d: %s" % (ix,kmer)
+
+ # choose the best k-mer; this is simply the most frequently occurring one,
+ # with ties broken by whichever one came first
+
+ # (negated counts make the ascending sort put the most frequent kmer first)
+ kmers = [(-kmerToCount[kmer],kmerToFirst[kmer],kmer) for kmer in kmerToCount]
+ if (kmers == []): return (None,None,start,end)
+ kmers.sort()
+
+ if ("element" in debug):
+ for (count,first,kmer) in kmers:
+ print >>stderr, " %s: %d" % (kmer,-count)
+
+ (count,first,kmer) = kmers[0]
+ if (contains_repeat(kmer)): return (None,None,start,end)
+
+ # determine the hamming distance between the run and a simple repeat, for
+ # each "plausible" start and end; we compute the distance for each such
+ # interval, and choose the one with the lowest hamming distance; ties are
+ # broken in a deterministic-but-unspecified manner
+
+ bestD = bestStart = bestEnd = None
+ ###################################################################################
+ # modified by Chen Sun(cxs1031@cse.psu.edu) on 10/18/2013
+ # since we do not allow hamming_distance > 0, which means we do not allow mutation,
+ # we do not need this section to produce bestStart and End
+ ###################################################################################
+
+ #for (s,e) in plausible_intervals(start,end,period,len(seq),allowPartials=allowPartials):
+ # d = hamming_distance(seq,s,e,kmer)
+ # if (d == None): continue
+ # if (bestD == None) or (d <= bestD):
+ # (bestD,bestStart,bestEnd) = (d,s,e)
+
+
+
+ # the start of the run is kept as given
+ bestStart = start
+
+ if(allowPartials):
+ bestEnd = end
+ elif(not allowPartials):
+ # whole motifs only: advance bestEnd from start in steps of one period
+ # for as long as bestEnd <= end-period
+ bestEnd = start
+ pattern = seq[start:start+period]
+ if ("partial" in debug):
+ print "kmer:", kmer
+ if(pattern != kmer):
+ print "pattern:", pattern
+
+ while(bestEnd <= end-period):
+ bestEnd += period
+
+ # bestD will always be 0, as we do not allow mutation
+ bestD = 0
+
+ if ("partial" in debug):
+ print bestD, bestStart, bestEnd
+
+ ###################################################################################
+ # modified by Chen Sun(cxs1031@cse.psu.edu) on 10/10
+ #
+ ###################################################################################
+ return (kmer,bestD,bestStart,bestEnd)
+
+
+def find_hamming_repeat_element(period,seq,start,end,allowPartials=False):
+    """Pick the repeat motif for a run and the interval that fits it best.
+
+    Returns (kmer,distance,start,end). kmer is the most frequent N-free
+    word of length period within seq[start:end] (earliest first occurrence
+    wins ties). (None,None,start,end) is returned when there is no such
+    word or when contains_repeat() rejects it. The interval is whichever
+    candidate from plausible_intervals() has the lowest hamming distance to
+    the repeated motif; among equal distances the last candidate wins.
+    """
+
+    # note that we can't reject kmers containing smaller repeats while
+    # counting, since for a sequence like ACACACACACAAACACACACACACACACAC we
+    # must first discover ACACAC as the best 6-mer, and THEN reject it; if
+    # we rejected it while counting, we'd end up reporting something like
+    # ACACAA as the best motif
+
+    if ("element" in debug):
+        print >>stderr, "find_repeat_element(%d,%d,%d)" % (period,start,end)
+
+    tally = {}                      # word -> [occurrences, index of first occurrence]
+    for pos in xrange(start,end-(period-1)):
+        word = seq[pos:pos+period]
+        if ("N" in word): continue
+        if (word in tally): tally[word][0] += 1
+        else:               tally[word] = [1,pos]
+
+    if (not tally): return (None,None,start,end)
+
+    # most frequent first, ties broken by whichever one came first
+    ranked = sorted((-occurrences,firstPos,word)
+                    for (word,(occurrences,firstPos)) in tally.items())
+
+    if ("element" in debug):
+        for (negCount,firstPos,word) in ranked:
+            print >>stderr, " %s: %d" % (word,-negCount)
+
+    kmer = ranked[0][2]
+    if (contains_repeat(kmer)): return (None,None,start,end)
+
+    # score every plausible interval against a simple repeat of the motif and
+    # keep the one with the lowest distance
+    best = (None,None,None)
+    for (s,e) in plausible_intervals(start,end,period,len(seq),allowPartials=allowPartials):
+        d = hamming_distance(seq,s,e,kmer)
+        if (d is None): continue
+        if (best[0] is None) or (d <= best[0]):
+            best = (d,s,e)
+
+    return (kmer,) + best
+
+# plausible_intervals--
+# Yield all plausible intervals intersecting with a run. We generate all
+# starts within P bp of the run's start. For each of these, we either (a) try
+# all ends within P bp of run's end, or (b) trim the new interval to a whole
+# multiple of the period, and report this short interval and the longer
+# interval with one more period appended. Case (a) allows partial motifs,
+# while case (b) only allows whole motifs.
+
+def plausible_intervals(start,end,period,seqLen,allowPartials=False):
+    """Generate candidate (start,end) intervals around the run start..end.
+
+    Candidate starts cover start-(period-1) through start+(period-1);
+    negative starts are never produced.
+
+    With allowPartials, every start is paired with each end in
+    end-(period-1) through end+(period-1) that does not exceed seqLen and
+    that leaves more than one period between start and end.
+
+    Without allowPartials, every start yields the interval trimmed to a whole
+    number of periods (the largest one not extending past `end`), followed by
+    that interval lengthened by one period when it still fits in seqLen.
+    """
+    candStarts = range(max(0,start-(period-1)),start+period)
+
+    # partial copies of the motif are acceptable at the right end
+    if (allowPartials):
+        for candStart in candStarts:
+            for candEnd in range(end-(period-1),end+period):
+                if (candEnd <= seqLen) and (candEnd > candStart+period):
+                    yield (candStart,candEnd)
+        return
+
+    # only whole copies of the motif are acceptable
+    for candStart in candStarts:
+        shortEnd = candStart + ((end-candStart)//period)*period
+        yield (candStart,shortEnd)
+        longEnd = shortEnd + period
+        if (longEnd <= seqLen):
+            yield (candStart,longEnd)
+
+
+# hamming_distance--
+# Determine the hamming distance between the run and a simple repeat.
+# $$$ improve this by allowing gaps, and stopping when we reach a threshold
+
+kmerToDiffs = {}    # motif -> {observed chunk -> mismatch count} (memo-ization)
+
+def hamming_distance(seq,start,end,kmer):
+    """Count mismatches between seq[start:end] and the motif tiled from start.
+
+    The interval is compared against back-to-back copies of kmer; a final
+    partial copy (when the interval length is not a multiple of the motif
+    length) is compared against the matching prefix of kmer.  Returns None
+    when the interval is shorter than one copy of the motif.
+
+    Per-chunk mismatch counts are remembered in the module-level kmerToDiffs
+    table, so repeated chunks are only compared once per motif.
+    """
+    period = len(kmer)
+    if (end < start + period): return None
+
+    memo     = kmerToDiffs.setdefault(kmer,{ kmer:0 })
+    wholeEnd = start + ((end-start)//period)*period
+
+    # whole copies first, then whatever partial copy is left before `end`
+    pieces = [(lo,lo+period) for lo in range(start,wholeEnd,period)]
+    if (end > wholeEnd):
+        pieces.append((wholeEnd,end))
+
+    total = 0
+    for (lo,hi) in pieces:
+        chunk = seq[lo:hi]
+        if (chunk not in memo):
+            # whole copies are compared over the full period; the trailing
+            # partial copy only over the bases it actually has
+            width = period if (hi <= wholeEnd) else len(chunk)
+            memo[chunk] = sum(1 for iy in range(width) if chunk[iy] != kmer[iy])
+        total += memo[chunk]
+
+    return total
+
+
+# fasta_sequences--
+# Read the fasta sequences from a file. Note that we convert to upper case,
+# and convert any letter other than ACGT to N.
+
+nonDnaMap = maketrans("BDEFHIJKLMOPQRSUVWXYZ","NNNNNNNNNNNNNNNNNNNNN")
+
+def fasta_sequences(f):
+    """Yield (name,nucleotides) for every record in a fasta file.
+
+    f is any iterable of lines.  The name is whatever sequence_name() extracts
+    from the ">" header line; the nucleotides are the record's remaining
+    lines joined together, upper-cased and passed through nonDnaMap.
+
+    Every record is normalized the same way.  (Previously only the last
+    record in the file was upper-cased and translated; earlier records were
+    yielded raw.)
+
+    Raises AssertionError if sequence data appears before the first header.
+    """
+    seqName = None
+    seqNucs = None
+
+    for line in f:
+        line = line.strip()
+        if (line.startswith(">")):
+            # a new header closes out the record collected so far
+            if (seqName is not None):
+                yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap))
+            seqName = sequence_name(line)
+            seqNucs = []
+        elif (seqName is None):
+            assert (False), "first sequence has no header"
+        else:
+            seqNucs.append(line)
+
+    # the final record has no following header to trigger its yield
+    if (seqName is not None):
+        yield (seqName,"".join(seqNucs).upper().translate(nonDnaMap))
+
+
+# fastq_sequences--
+# Read the fastq sequences from a file. Note that we convert to upper case,
+# and convert any letter other than ACGT to N.
+
+def fastq_sequences(f):
+    """Yield (name,nucleotides,qualities) for every record in a fastq file.
+
+    f is any iterable of lines, read as consecutive groups of four: "@name",
+    nucleotides, a line starting with "+", and qualities.  Nucleotides are
+    upper-cased and passed through nonDnaMap; the name is the header line
+    without its leading "@".
+
+    Raises AssertionError on a malformed header or separator line, on a
+    nucleotide/quality length mismatch, or when the input ends part-way
+    through a record.
+    """
+    lineNum = 0
+    for (lineNum,line) in enumerate(f,1):
+        line = line.strip()
+        slot = lineNum % 4      # 1=name, 2=nucleotides, 3=separator, 0=qualities
+
+        if (slot == 1):
+            assert (line.startswith("@")), \
+                   "bad read name at line %d" % lineNum
+            seqName = line[1:]
+        elif (slot == 2):
+            seqNucs = line
+        elif (slot == 3):
+            assert (line.startswith("+")), \
+                   "can't understand line %d:\n%s" % (lineNum,line)
+        else:
+            assert (len(line) == len(seqNucs)), \
+                   "length mismatch read vs. qualities at line %d" % lineNum
+            yield (seqName,seqNucs.upper().translate(nonDnaMap),line)
+
+    assert (lineNum % 4 == 0), \
+           "incomplete read at end of file"
+
+def sam_sequences(f):
+    """Yield (name,nucleotides,refName,pre_s,cigar) for each SAM alignment line.
+
+    f is any iterable of lines.  Lines starting with "@" are skipped, and so
+    are blank lines (which previously raised IndexError).  Fields are taken
+    from the tab-separated columns 1, 3, 4, 6 and 10:
+
+      name         column 1
+      refName      column 3
+      pre_s        column 4 as an integer, minus one
+      cigar        column 6
+      nucleotides  column 10, upper-cased and passed through nonDnaMap
+
+    A non-blank alignment line with fewer than ten columns still raises
+    IndexError.
+    """
+    for line in f:
+        line = line.strip()
+
+        if (line == "") or line.startswith("@"):
+            continue
+
+        columns = line.split("\t")
+        seqName = columns[0]
+        refName = columns[2]
+        pre_s   = int(columns[3]) - 1
+        cigar   = columns[5]
+        seqNucs = columns[9]
+
+        yield (seqName,seqNucs.upper().translate(nonDnaMap),refName,pre_s,cigar)
+
+# sequence_name--
+# Extract the sequence name from a fasta header.
+# $$$ this may need to be improved $$$
+
+def sequence_name(s):
+    """Return the first whitespace-delimited word after the header's first
+    character, or "" when nothing but whitespace follows it."""
+    words = s[1:].split()
+    return words[0] if words else ""
+
+
+# nucleotide_runs--
+# Yield (start,end) for all runs of valid nucleotides in a sequence.
+
+def nucleotide_runs(s):
+    """Yield (start,end) for each maximal run of A, C, G or T in s.
+
+    Intervals are half-open, so s[start:end] is the run itself.  Only the
+    upper-case letters ACGT count as valid; any other character ends the
+    current run.
+    """
+    start = None        # index where the current run began, None between runs
+    for (ix,nuc) in enumerate(s):
+        if (nuc in "ACGT"):
+            if (start is None):
+                start = ix
+        elif (start is not None):
+            yield (start,ix)
+            start = None
+
+    # a run that reaches the end of the sequence is closed here
+    if (start is not None): yield (start,len(s))
+
+
+# contains_repeat--
+# Determine whether a short sequence contains a repeated element, such as a
+# 6-mer containing a repeated 2-mer (ACACAC) or 3-mer (ACTACT). The repeat
+# must cover the entire sequence, without mismatches.
+
+def contains_repeat(kmer):
+    """Return True if kmer is two or more exact copies of a shorter unit.
+
+    Every unit length that divides len(kmer) and is at most half of it is
+    tried, e.g. ACACAC (unit AC) and ACTACT (unit ACT) both qualify.
+    """
+    kmerLength = len(kmer)
+    for unitLen in range(1,kmerLength//2 + 1):
+        (copies,leftover) = divmod(kmerLength,unitLen)
+        if (leftover == 0) and (kmer[:unitLen] * copies == kmer):
+            return True
+    return False
+
+
+# hash108--
+# Return a 108-bit hash "value" of a string
+
+def hash108(s):
+ # The result is the first 27 hexadecimal digits of the digest; 27 digits
+ # of 4 bits each give the 108 bits in the function's name.
+ # md5_new is defined outside this excerpt -- presumably an md5 constructor;
+ # confirm against the file's imports.
+ m = md5_new()
+ m.update(s)
+ return m.hexdigest()[:27]
+
+
+# float_or_fraction--
+# Convert a string to a number, allowing fractions
+
+def float_or_fraction(s):
+    """Convert a string to a float; "a/b" is evaluated as float(a)/float(b).
+
+    Only the first "/" separates numerator from denominator.
+    """
+    (numer,slash,denom) = s.partition("/")
+    if (slash == ""): return float(s)
+    return float(numer)/float(denom)
+
+
+# int_with_unit--
+# Parse a string as an integer, allowing unit suffixes
+
+def int_with_unit(s):
+    """Parse a string as an integer, honoring a trailing K, M or G.
+
+    K, M and G multiply by one thousand, one million and one billion.  A
+    non-integer number is accepted too; the scaled value is rounded up.
+    """
+    unitToMultiplier = {"K":1000, "M":1000*1000, "G":1000*1000*1000}
+
+    multiplier = unitToMultiplier.get(s[-1:],1)
+    if (multiplier != 1):
+        s = s[:-1]
+
+    try:
+        return int(s) * multiplier
+    except ValueError:
+        return int(math.ceil(float(s) * multiplier))
+
+
+if __name__ == "__main__": main()
+
diff -r 000000000000 -r 20ab85af9505 microsatellite.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/microsatellite.xml Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,178 @@
+
+ for short read, reference, and mapped data
+ microsatellite.py
+ "${filePath}"
+ #if $inputFileSource.inputFileType == "fasta"
+ --fasta
+ #elif $inputFileSource.inputFileType == "fastq"
+ --fastq
+ #elif $inputFileSource.inputFileType == "fastq_noquals"
+ --fastq:noquals
+ #elif $inputFileSource.inputFileType == "sam"
+ --sam
+ #end if
+
+ #if $inputFileSource.inputFileType == "sam"
+ #if $inputFileSource.referenceFileSource.requireReference
+ --r --ref="${inputFileSource.referenceFileSource.referencePath}"
+ #end if
+ #end if
+
+ --period="${period}"
+
+ #if $partialmotifs == "true"
+ --partialmotifs
+ #end if
+
+ --minlength="${minlength}"
+
+
+ --prefix="${prefix}"
+ --suffix="${surfix}"
+
+ --hamming="${hammingThreshold}"
+
+ #if $multipleruns
+ --multipleruns
+ #end if
+
+ #if $flankSetting.noflankdisplay
+ --noflankdisplay
+ #else
+ --flankdisplay=${flankSetting.flankdisplay}
+ #end if
+ > $stdout
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+We use different algorithms to detect microsatellites depending on the hamming distance parameter.
+If the hamming distance is set to zero, the program will only consider uninterrupted microsatellites. The process works as follows.
+
+1) Scanning reads using sliding windows. For a given repeat period ‘k’ (e.g. k=2 for dinucleotide TRs), we compared consecutive k-mer window size sequences, with a step size of k. If a base at a given position matches one k positions earlier it was marked with a plus, if corresponding sites had different bases it was marked with a minus. The first k position is blank.
+
+2) Since we do not allow mutations in reported TR, consecutive “+” signal sequence means that a k-mer TR is present in this sample.
+
+3) Report k-mer TRs if the length is larger than a threshold provided by the user.
+
+If the hamming distance is set to an integer greater than zero, the program will consider both uninterrupted and interrupted microsatellites. The process works as follows:
+
+(1) Identify intervals that are highly correlated with the interval shifted by ‘k’ (the repeat period). These intervals are called "runs" or "candidates". The allowed level of correlation is 6/7. Depending on whether we want to look for more than one microsat, we either find the longest such run (simple algorithm) or many runs (more complicated algorithm). The following steps are then performed on each run.
+
+(2) Find the most likely repeat motif in the run. This is done by counting all kmers (of length P) and choosing the most frequent. If that kmer is itself covered by a sub-repeat we discard this run. The idea is that we can ignore a 6-mer like ACGACG because we will find it when we are looking for 3-mers.
+
+(3) Once we identify the most likely repeat motif, we then modify the interval, adjusting start and end to find the interval that has the fewest mismatches vs. a sequence of the motif repeated (hamming distance).
+
+(4) At this point we have a valid microsat interval (in the eyes of the program). It is subjected to some filtering stages (hamming distance or too close to an end), and if it satisfies those conditions, it's reported to the user
+
+For more options, the script that runs this program can be downloaded and run with python independently of Galaxy. More options are available in script mode. A help page is built into the script.
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**
+This tool is developed by Chen Sun (cxs1031@cse.psu.edu) and Bob Harris (rsharris@bx.psu.edu)
+
+**Input**
+
+- The input files can be fastq, fasta, fastq without quality score, and SAM format.
+
+**Output**
+
+For fastq, the output will contain the following columns:
+
+- Column 1 = length of microsatellites (bp)
+- Column 2 = length of left flanking regions (bp)
+- Column 3 = length of right flanking regions (bp)
+- Column 4 = repeat motif (bp)
+- Column 5 = hamming distance
+- Column 6 = read name
+- Column 7 = read sequence with soft masking of microsatellites
+- Column 8 = read quality (the same Phred score scale as input)
+
+For fasta, fastq without quality score and sam format, column 8 will be replaced with dot(.).
+
+If the users have a mapped file (SAM) and would like to profile microsatellites from premapped data instead of using the flank-based mapping approach, they can select SAM format input and specify that they want the corresponding microsatellites in the reference for comparison. The output will be as follows:
+
+- Column 1 = length of microsatellites (bp)
+- Column 2 = length of left flanking regions (bp)
+- Column 3 = length of right flanking regions (bp)
+- Column 4 = repeat motif (bp)
+- Column 5 = hamming distance
+- Column 6 = read name
+- Column 7 = read sequence with soft masking of microsatellites
+- Column 8 = read quality (the same Phred score scale as input)
+- Column 9 = read name (The same as column 6)
+- Column 10 = chromosome
+- Column 11 = left flanking region start
+- Column 12 = left flanking region stop
+- Column 13 = microsatellite start as inferred from paired-end
+- Column 14 = microsatellite stop as inferred from paired-end
+- Column 15 = right flanking region start
+- Column 16 = right flanking region stop
+- Column 17 = microsatellite length in reference
+- Column 18 = microsatellite sequence in reference
+
+
+
diff -r 000000000000 -r 20ab85af9505 microsatpurity.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/microsatpurity.py Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,24 @@
+import sys
+# remove all read that have impure microsat
+# check only one line at a time
+
+
+# Command line: microsatpurity.py <input> <period> <column>
+#   <input>   tab-delimited file, processed one line at a time
+#   <period>  motif length used to rebuild the expected pure repeat
+#   <column>  1-based column holding the microsatellite sequence to check
+# Lines whose sequence is an uninterrupted repeat are printed to stdout.
+# (An earlier variant took the motif column instead of the period and wrote
+# to an output file named on the command line.)
+period=int(sys.argv[2])
+tr_ref_seqIx=int(sys.argv[3])-1
+
+# the with-block closes the input even if a malformed line raises
+with open(sys.argv[1]) as fd:
+    for line in fd:
+        # empty fields are dropped before the column is looked up, so the
+        # column number counts non-empty fields only
+        temp=filter(None,line.strip().split('\t'))
+        tr_ref_seq=temp[tr_ref_seqIx]
+
+        # rebuild what the sequence would be if its first <period> bases
+        # repeated perfectly over the same length, partial last copy included
+        cand_motif=tr_ref_seq[:period]
+        len_microsat=len(tr_ref_seq)
+        expand_microsat_cand=cand_motif*(len_microsat//period) + cand_motif[:(len_microsat%period)]
+        if tr_ref_seq == expand_microsat_cand:
+            print line.strip()
\ No newline at end of file
diff -r 000000000000 -r 20ab85af9505 microsatpurity.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/microsatpurity.xml Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,79 @@
+
+ of a specific column
+ microsatpurity.py $input $period $column_n > $output
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool is used to select only the uninterrupted microsatellites. Interrupted microsatellites (e.g. ATATATATAATATAT) or sequences of microsatellites with non-microsatellite parts (e.g. ATATATATATG) will be removed.
+
+For TRFM pipeline (profiling microsatellites in short read data), this tool can be used to avoid the cases that flanking bases were misread as microsatellite. Thus, the read profile will only reflect the variation of TR length from expansion/contraction.
+For example, suppose that the sequence around microsatellite is AGCGACGaaaaaaGCGATCA. If we observe read with sequence AGCGACGaaaaaaaaaaGCGATCA, we can indicate that this is microsatellite expansion. However, if we observe AGCGACGaaaaaaaCGATCA, this is more like a substitution of G to A. These incidents can be removed with this tool.
+You can use the tool **combine mapped flaked bases** to get the microsatellites in reference that correspond to sequence between mapped reads. If the user map these reads around the uninterrupted microsatelites in reference, the corresponding sequences between these pairs should be the uninterrupted microsatellites regardless of expansion/contraction of microsatellites in short read data. However, if the substitution of flanking base or if the fluorescent signal from the previous run make it look like substitution, the corresponding sequences in reference in between the pairs will not be uninterrupted microsatellites. Thus this tool can remove those cases and keep only microsatellite expansion/contraction.
+
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**
+
+**Input**
+
+The input files can be any tab delimited file.
+
+If this tool is used in TRFM microsatellite profiling, it should contain:
+
+- Column 1 = microsatellite location in reference chromosome
+- Column 2 = microsatellite location in reference start
+- Column 3 = microsatellite location in reference stop
+- Column 4 = microsatellite location in reference motif
+- Column 5 = microsatellite location in reference length
+- Column 6 = microsatellite location in reference motif size
+- Column 7 = length of microsatellites (bp)
+- Column 8 = length of left flanking regions (bp)
+- Column 9 = length of right flanking regions (bp)
+- Column 10 = repeat motif (bp)
+- Column 11 = hamming distance
+- Column 12 = read name
+- Column 13 = read sequence with soft masking of microsatellites
+- Column 14 = read quality (the same Phred score scale as input)
+- Column 15 = read name (The same as column 12)
+- Column 16 = chromosome
+- Column 17 = left flanking region start
+- Column 18 = left flanking region stop
+- Column 19 = microsatellite start as inferred from paired-end
+- Column 20 = microsatellite stop as inferred from paired-end
+- Column 21 = right flanking region start
+- Column 22 = right flanking region stop
+- Column 23 = microsatellite length in reference
+- Column 24 = microsatellite sequence in reference
+
+**Output**
+
+The same as input format.
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r 20ab85af9505 pair_fetch_DNA_ff.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pair_fetch_DNA_ff.py Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+# pair_fetch_DNA_ff.py
+# Function: filter microsat and flanking region by quality score;
+# remove read with any base that has lower quality score than "quality_require" within "flanking_base" and convert from snoope to fastq
+# Note that require flanking length need to be screen by Bob snoope script first
+
+# Author: Arkarachai Fungtammasan
+# Version 1.0.0 (15 July 2012)
+# Input format: length_of_repeat[0] left_flank_length[1] right_flank_length[2] repeat_motif[3] hamming_distance[4] read_name[5] read_sequence[6] read_quality[7]
+# Output format: two fastq file. First file contain left flank. Second file contain right flank.
+# Command: python pair_fetch_DNA_ff.py input.txt
+
+import sys
+from galaxy import eggs
+
+def stop_err(msg):
+ # Write msg to stderr exactly as given (no newline is appended) and end
+ # the program.  sys.exit() is called without an argument, so the process
+ # exit status is zero even though this is the error path.
+ sys.stderr.write(msg)
+ sys.exit()
+
+# read file name
+
+
+
+filename=sys.argv[1]
+L_filename=sys.argv[2]
+R_filename=sys.argv[3]
+quality_require=sys.argv[4]
+flanking_base=sys.argv[5]
+try:
+ quality_require=int(quality_require)
+ flanking_base=int(flanking_base)
+except Exception, eee:
+ print eee
+ stop_err("Quality score cutoff and Length of flanking regions that require quality screening must be integer")
+
+fd=open(filename)
+fdd1=open(L_filename,'w')
+fdd2=open(R_filename,'w')
+lines=fd.xreadlines()
+for line in lines:
+ temp=line.strip().split('\t')
+ temp=filter(None,temp)
+ #get index
+ left_flank=(0,int(temp[1]))
+ microsat=(int(temp[1]),int(temp[1])+int(temp[0]))
+ right_flank=(int(temp[1])+int(temp[0]),int(temp[1])+int(temp[0])+int(temp[2]))
+ flag=0
+ #filter length of left and right flank
+ if (right_flank[1]-right_flank[0])
+
+ heteroprob.py $microsat_raw $microsat_error_profile $expectedminorallele > $microsat_corrected
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+- This tool will calculate the probability that the allele combination can generate the given read profile. This tool is part of the pipeline to estimate minimum read depth.
+- The calculation of probability is very similar to the tool **Correct genotype for microsatellite errors**. However, this tool will restrict the calculation to only the allele combination indicated in the input. Also, when it encounters an allele combination that cannot be generated from the error profile, the total probability will be zero instead of using the base substitution rate.
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**
+
+**Input**
+
+The input format is the same as output from **Correct genotype for microsatellite errors** tool.
+
+- Column 1 = location of microsatellite locus.
+- Column 2 = length profile (length of microsatellite in each read that mapped to this location in comma separated format).
+- Column 3 = motif of microsatellite in this locus. The input file can contain more than three columns.
+- Column 4 = homozygous/heterozygous label.
+- Column 5 = log based 10 of (the probability of homozygous/the probability of heterozygous)
+- Column 6 = Allele for most probable homozygous form.
+- Column 7 = Allele 1 for most probable heterozygous form.
+- Column 8 = Allele 2 for most probable heterozygous form.
+
+Only column 2,3,7,8 were used in calculation.
+
+**Output**
+
+
+The output will contain the original eight columns from the input. In addition, it will add the following columns.
+- Column 9 = Probability of the allele combination to generate given read profile.
+- Column 10 = Number of possible rearrangement of given read profile.
+- Column 11 = Probability of the allele combination to generate read profile with any rearrangement (Product of column 9 and column 10)
+- Column 12 = Read depth
+
+
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r 20ab85af9505 profilegenerator.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/profilegenerator.py Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,66 @@
+import collections
+import itertools
+import sys
+
+# Command line: profilegenerator.py <error_profile> <motif> <max_depth> <min_prob>
+# Prints one line per generated length profile: "chr", the comma-separated
+# lengths, and the motif, separated by tabs.
+filename=sys.argv[1]
+MOTIF=sys.argv[2]
+MOTIFSIZE=len(MOTIF)
+MaxDEPTH=int(sys.argv[3])
+MINIMUMPROB=float(sys.argv[4])##1.0/(10**4)
+MININUMCOUNT=1
+# First pass: for every value in column 1, collect the counts from column 3
+# so each row's count can later be turned into a fraction of its group.
+# This pass splits on tabs only.
+fd=open(filename)
+lines=fd.readlines()
+countbymajorallele=collections.defaultdict(list)
+for line in lines:
+ temp=line.strip().split('\t')
+ t_major=int(temp[0])
+ t_count=int(temp[2])
+ countbymajorallele[t_major].append(t_count)
+fd.close()
+sumbymajorallele=collections.defaultdict(int)
+for t_majorallele in countbymajorallele.keys():
+ sumbymajorallele[t_majorallele]=sum(countbymajorallele[t_majorallele])
+
+# Second pass over the same file (this handle is not closed explicitly).
+# This pass splits on any whitespace.  A row's column 2 value is kept under
+# its column 1 value when column 1 is a multiple of the motif length, the
+# row's share of its group's total count is at least MINIMUMPROB, and its
+# count is at least MININUMCOUNT.
+fd=open(filename)
+##fd=open('PCRinclude.mono.A.bymajorallele')
+lines=fd.readlines()
+allmajor=collections.defaultdict(list)
+for line in lines:
+ temp=line.strip().split()
+ if int(temp[0])%MOTIFSIZE==0:
+ if (int(temp[2])/(sumbymajorallele[int(temp[0])]*1.0))>=MINIMUMPROB:
+ if int(temp[2])>=MININUMCOUNT:
+ allmajor[int(temp[0])].append(int(temp[1]))
+##print allmajor
+allkey=allmajor.keys()
+allkey.sort()
+#print allkey
+# Walk neighbouring keys in sorted order and merge the kept values of each
+# pair into one sorted, duplicate-free tuple.  Note that extend() modifies
+# the list stored in allmajor under the first key of the pair in place.
+keycount=0
+combinelist_collection=[]
+for dummycount in range(len(allkey)-1):
+ pair1,pair2=allkey[keycount],allkey[keycount+1]
+ pair1list=allmajor[pair1]
+ pair2list=allmajor[pair2]
+ #print pair1list,pair2list
+ pair1list.extend(pair2list)
+ combinelist=list(set(pair1list))
+ combinelist.sort()
+ ##print combinelist
+ combinelist_collection.append(tuple(combinelist))
+ keycount+=1
+# Drop exact duplicates, then drop every tuple that is a proper subset of
+# another one.  Removal is done on a copy so the lists being iterated are
+# not modified; the order of what remains follows the set-derived list.
+combinelist_collection=list(set(combinelist_collection))
+newcombinelist_collection=combinelist_collection[:]
+#combinelist_collection=set(combinelist_collection)
+for smallset1 in combinelist_collection:
+ for smallset2 in combinelist_collection:
+ if set(smallset1).issubset(set(smallset2)) and smallset1 != smallset2:
+ newcombinelist_collection.remove(smallset1)
+ break
+##print combinelist_collection
+
+# For every depth from 2 to MaxDEPTH and every remaining tuple, print each
+# combination with replacement of that many values.
+for depth in range(2,MaxDEPTH+1):
+ for member_list in newcombinelist_collection:
+ for member in itertools.combinations_with_replacement(member_list,depth):
+ print 'chr'+'\t'+','.join(map(str,member))+'\t'+MOTIF
+
+
diff -r 000000000000 -r 20ab85af9505 profilegenerator.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/profilegenerator.xml Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,110 @@
+
+ of the consecutive allele from given error profile
+ profilegenerator.py $error_profile $MOTIF $Maxdepth $minprob > $output
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool will generate all possible combination of observed read profile of the consecutive alleles from given error profile. The range of observed read length can be filtered to contain only those that are frequently occur using "Minimum error rate to be considered" parameter.
+
+This program will collect the lists of valid (pass "Minimum error rate to be considered" threshold) observed length profiles from combinations of consecutive allele lengths. The lists that are equivalent to or a subset of other lists will be removed. For each depth and each list, length profiles are generated from combinations with replacement, which is compatible with python 2.7. There could be redundant error profiles generated from different lists if more than one combination of alleles is generated due to overlapping ranges of observed microsatellite lengths. The user needs to remove them, which can be done easily using the **sort | uniq** command in unix.
+
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**
+
+**Input**
+
+- The error profile needs to contain these three columns.
+- Column 1 = Correct microsatellite length
+- Column 2 = Observed microsatellite length
+- Column 3 = Number of observation
+
+**Output**
+
+- Column 1 = Place holder for location of microsatellite locus. (just "chr")
+- Column 2 = length profile (length of microsatellite in each read that mapped to this location in comma separated format).
+- Column 3 = motif of microsatellite in this locus.
+
+**Example**
+
+- Suppose that we provide the following read profile ::
+
+ 9 9 100000
+ 10 10 91456
+ 10 9 1259
+ 11 11 39657
+ 11 10 1211
+ 11 12 514
+
+
+- Using default minimum probability to be consider and motif = A, all observed read lengths are valid. The program will generated lists of observed length profiles from consecutive allele length. ::
+
+ 9:10 = [9,10]
+ 10:11 = [9,10,11,12]
+
+- Lists that are subsets of other lists will be removed. Thus, [9,10] will not be considered.
+
+- Then the program will generate all combinations with replacement for each depth from each list. Using **maximum read depth =3**, we will get the following output. ::
+
+
+ chr 9,9 A
+ chr 9,10 A
+ chr 9,11 A
+ chr 9,12 A
+ chr 10,10 A
+ chr 10,11 A
+ chr 10,12 A
+ chr 11,11 A
+ chr 11,12 A
+ chr 12,12 A
+ chr 9,9,9 A
+ chr 9,9,10 A
+ chr 9,9,11 A
+ chr 9,9,12 A
+ chr 9,10,10 A
+ chr 9,10,11 A
+ chr 9,10,12 A
+ chr 9,11,11 A
+ chr 9,11,12 A
+ chr 9,12,12 A
+ chr 10,10,10 A
+ chr 10,10,11 A
+ chr 10,10,12 A
+ chr 10,11,11 A
+ chr 10,11,12 A
+ chr 10,12,12 A
+ chr 11,11,11 A
+ chr 11,11,12 A
+ chr 11,12,12 A
+ chr 12,12,12 A
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r 20ab85af9505 readdepth2sequencingdepth.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/readdepth2sequencingdepth.xml Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,57 @@
+
+ for flank-based mapping of microsatellites
+ sequencingdepthconversion_G.py $repeatlength $flanksize $readlength $infodepth $probprediction > $output
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool is used to convert informative read depth (specified by user) to sequencing depth when the microsatellites is mapped using TRFM pipeline.
+The locus specific sequencing depth is the sequencing depth that will make a certain locus have a certain read depth based on uniform mapping of reads. It is calculated as: ::
+
+ yrequired = ( X * L ) / (L - (2F+r-1))
+
+Where X = read depth, L = read length, F = the number of flanked bases required on each flanking regions, r = the expected repeat length of microsatellite of interest.
+
+The genome wide sequencing depth is the sequencing depth that will make certain percentage of genome (e.g. 90 percent or 95 percent) to have certain locus specific sequencing depth. It's calculated using numerical guessing to find smallest lambda that: ::
+
+ 0.90 (or other proportion specified by user) < = P(Y=0) + P(Y=1) + …+ P(Y=yrequired-1)
+
+ P(Y=y) = (lambda^(y) * e ^(-lambda)) /y!
+
+ y = specific level of sequencing depth. Lambda = genome wide sequencing depth
+
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r 20ab85af9505 sequencingdepthconversion_G.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sequencingdepthconversion_G.py Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,54 @@
+def stop_err(msg):
+ # Write msg to stderr exactly as given (no newline is appended) and end
+ # the program.  sys is imported further down the file, after the function
+ # definitions; that works because the name is only looked up when this
+ # function is called.  sys.exit() is called without an argument, so the
+ # process exit status is zero even though this is the error path.
+ sys.stderr.write(msg)
+ sys.exit()
+
+def info2require(X,L,F,r):
+    '''infodepth,readlength,flanksize,repeatlength
+
+    Return X*L divided by L-(2*F+r-1), rounded up to an integer.
+    '''
+    usable=L-((2*F)+r-1)
+    return int(math.ceil((X*L*1.0)/usable))
+
+def poissondef(meancov,specificcov):
+    '''Return meancov**specificcov * e**(-meancov) / specificcov!
+    '''
+    weight=1.0*(meancov**specificcov)*(math.e**(-1*meancov))
+    return weight/math.factorial(specificcov)
+
+def require2recommend(needprob,mindepth):
+    '''Return the smallest integer mean coverage, starting the search at
+    mindepth, for which the summed poissondef() values for depths
+    0..mindepth-1 are no longer greater than 1-needprob.  When 1-needprob
+    is already at least 1 no mean is tried and mindepth-1 is returned.
+    '''
+    tailbudget=1-needprob
+    if not (1>tailbudget):
+        return mindepth-1
+
+    meancov=mindepth
+    while True:
+        below=sum(poissondef(meancov,j) for j in range(0,mindepth))
+        if not (below>tailbudget):
+            return meancov
+        meancov+=1
+
+import sys,math
+
+# Command line: repeat length, flank size, read length, informative read
+# depth, proportion of the genome that must reach the locus specific depth.
+repeatlength=int(sys.argv[1])
+flanksize=int(sys.argv[2])#20
+readlength=int(sys.argv[3])#100
+infodepth=int(sys.argv[4])#5
+probdetection=float(sys.argv[5])#0.90
+
+# The proportion is checked directly.  (Previously the check was done by
+# provoking an exception from int('probvalue'), which printed that unrelated
+# exception text to stdout, and negative values were let through.)
+if not (0 <= probdetection <= 1):
+    stop_err("Proportion of genome to have certain locus specific must be between 0 and 1")
+
+# info2require() divides by readlength-(2*flanksize+repeatlength-1); stop
+# with a message instead of a division by zero or a negative depth.
+if readlength-((2*flanksize)+repeatlength-1) <= 0:
+    stop_err("Read length must be larger than twice the flank size plus the repeat length minus one")
+
+print 'repeat_length'+'\t'+'read_length'+'\t'+'informative_read_depth''\t'+'=locus_specific_sequencing_depth'+'\t'+'=genome_wide_sequencing_depth'
+t_requiredepth=info2require(infodepth,readlength,flanksize,repeatlength)
+t_recomendseq=require2recommend(probdetection,t_requiredepth)
+preplotlist=[repeatlength,readlength,infodepth,t_requiredepth,t_recomendseq]
+plotlist=map(str,preplotlist)
+print '\t'.join(plotlist)
+
+#print info2require(infodepth,readlength,flanksize,repeatlength)
+#print poissondef(10,3)
+#print require2recommend(0.90,80)
+#informative_read_depth
+#required_seq_depth
+#recommend_seq_depth
\ No newline at end of file
diff -r 000000000000 -r 20ab85af9505 space2underscore_readname.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/space2underscore_readname.xml Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,47 @@
+
+ --change space to underscore of a specific column
+ changespacetounderscore_readname.py $input $output $column_n
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**What it does**
+
+This tool is used to change spaces to underscores. For the TRFM pipeline (profiling microsatellites in short read data), this tool is used to change spaces in read names to underscores, to prevent downstream tools from recognizing an incorrect column number due to spaces in read names. If the input does not have spaces in read names, this step can be skipped.
+
+**Citation**
+
+When you use this tool, please cite **Arkarachai Fungtammasan and Guruprasad Ananda (2014).**
+
+**Input**
+
+The input files can be any tab delimited file.
+
+If this tool is used in TRFM microsatellite profiling, it should be in the same format as output from **microsatellite detection program**. This format contains **length of repeat**, **length of left flanking region**, **length of right flanking region**, **repeat motif**, **hamming (editing) distance**, **read name**, **read sequence**, **read quality score**
+
+**Output**
+
+The same as input format.
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r 20ab85af9505 test-data/.DS_Store
Binary file test-data/.DS_Store has changed
diff -r 000000000000 -r 20ab85af9505 test-data/C_sample_fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/C_sample_fastq Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,8 @@
+@IL2_40_2_1_735_755
+ATTTTCCAGCACCGTCATGTGGTTCCAGAGGTTAAAGTGCTGAAATAACAT
++
+IIIIIIIIIIIIIIIIIIIIIIII4IIIIIIIII5IIDI)'7%*8%%%%5*
+@IL2_40_2_1_919_700
+ATAAGGAAAAAAAAAAAAAAAACCAGGTCTTTTTTTTTTTTTTTTTGTTAT
++
+IIIIIIIIIIIIIIIIIIIIII@IIII2III4-II47I?CII>-%:C-;$&
diff -r 000000000000 -r 20ab85af9505 test-data/C_sample_snoope
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/C_sample_snoope Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,4 @@
+3 33 15 A 0 IL2_40_2_1_735_755_1_per1_2 ATTTTCCAGCACCGTCATGTGGTTCCAGAGGTTaaaGTGCTGAAATAACAT IIIIIIIIIIIIIIIIIIIIIIII4IIIIIIIII5IIDI)'7%*8%%%%5*
+3 42 6 A 0 IL2_40_2_1_735_755_1_per1_3 ATTTTCCAGCACCGTCATGTGGTTCCAGAGGTTAAAGTGCTGaaaTAACAT IIIIIIIIIIIIIIIIIIIIIIII4IIIIIIIII5IIDI)'7%*8%%%%5*
+16 6 29 A 0 IL2_40_2_1_919_700_1_per1_1 ATAAGGaaaaaaaaaaaaaaaaCCAGGTCTTTTTTTTTTTTTTTTTGTTAT IIIIIIIIIIIIIIIIIIIIII@IIII2III4-II47I?CII>-%:C-;$&
+17 29 5 T 0 IL2_40_2_1_919_700_1_per1_2 ATAAGGAAAAAAAAAAAAAAAACCAGGTCtttttttttttttttttGTTAT IIIIIIIIIIIIIIIIIIIIII@IIII2III4-II47I?CII>-%:C-;$&
diff -r 000000000000 -r 20ab85af9505 test-data/PCRinclude.allrate.bymajorallele
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/PCRinclude.allrate.bymajorallele Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,997 @@
+10 10 91456 A
+10 9 1259 A
+10 11 605 A
+10 8 16 A
+10 12 8 A
+10 7 2 A
+11 11 39657 A
+11 10 1211 A
+11 12 514 A
+11 9 54 A
+11 13 9 A
+11 8 3 A
+11 14 1 A
+12 12 18850 A
+12 11 986 A
+12 13 417 A
+12 10 73 A
+12 14 8 A
+12 9 1 A
+12 8 1 A
+13 13 10201 A
+13 12 885 A
+13 14 320 A
+13 11 83 A
+13 15 12 A
+13 10 8 A
+14 14 3649 A
+14 13 409 A
+14 15 151 A
+14 12 62 A
+14 11 6 A
+14 16 5 A
+14 10 1 A
+15 15 847 A
+15 14 140 A
+15 16 60 A
+15 13 20 A
+15 17 4 A
+15 12 3 A
+16 16 182 A
+16 15 60 A
+16 17 14 A
+16 14 12 A
+16 13 1 A
+16 12 1 A
+16 18 1 A
+17 17 11 A
+17 16 5 A
+17 15 2 A
+17 18 1 A
+18 18 4 A
+18 17 2 A
+5 5 10047169 A
+5 6 44 A
+6 6 2808071 A
+6 5 195 A
+6 7 69 A
+7 7 1097174 A
+7 6 313 A
+7 8 83 A
+7 5 6 A
+8 8 369496 A
+8 7 387 A
+8 9 248 A
+8 6 3 A
+8 10 2 A
+9 9 184958 A
+9 8 707 A
+9 10 486 A
+9 7 5 A
+9 11 4 A
+10 10 46 C
+10 9 3 C
+5 5 1354993 C
+5 6 7 C
+6 6 193431 C
+6 5 14 C
+6 7 2 C
+7 7 22171 C
+7 6 4 C
+8 8 2966 C
+8 9 3 C
+8 7 3 C
+9 9 638 C
+9 8 8 C
+9 7 1 C
+10 10 21211 AC
+10 8 3 AC
+10 12 1 AC
+11 11 15048 AC
+11 9 10 AC
+12 12 6043 AC
+12 10 15 AC
+12 14 1 AC
+13 13 5070 AC
+13 11 40 AC
+13 15 1 AC
+14 14 3093 AC
+14 12 44 AC
+14 10 1 AC
+15 15 2848 AC
+15 13 31 AC
+15 17 1 AC
+16 16 1273 AC
+16 14 30 AC
+16 12 2 AC
+17 17 1297 AC
+17 15 27 AC
+18 18 1269 AC
+18 16 43 AC
+18 20 2 AC
+18 14 1 AC
+19 19 679 AC
+19 17 17 AC
+19 21 1 AC
+20 20 645 AC
+20 18 34 AC
+20 22 2 AC
+20 16 1 AC
+21 21 723 AC
+21 19 28 AC
+21 17 1 AC
+21 23 1 AC
+22 22 499 AC
+22 20 29 AC
+22 18 3 AC
+23 23 540 AC
+23 21 30 AC
+23 19 2 AC
+23 25 1 AC
+24 24 385 AC
+24 22 38 AC
+24 26 2 AC
+24 20 1 AC
+25 25 407 AC
+25 23 22 AC
+25 27 2 AC
+25 21 1 AC
+26 26 257 AC
+26 24 30 AC
+26 22 3 AC
+26 28 1 AC
+26 20 1 AC
+27 27 339 AC
+27 25 28 AC
+27 23 3 AC
+27 29 2 AC
+28 28 202 AC
+28 26 17 AC
+28 30 6 AC
+29 29 277 AC
+29 27 29 AC
+29 31 6 AC
+29 25 3 AC
+30 30 117 AC
+30 28 12 AC
+30 32 3 AC
+30 18 1 AC
+31 31 144 AC
+31 29 18 AC
+31 27 4 AC
+31 33 2 AC
+32 32 101 AC
+32 30 23 AC
+32 28 2 AC
+32 34 2 AC
+32 26 1 AC
+33 33 106 AC
+33 31 15 AC
+33 35 3 AC
+33 29 1 AC
+34 34 33 AC
+34 32 7 AC
+35 35 21 AC
+35 33 4 AC
+35 31 1 AC
+36 36 12 AC
+36 34 1 AC
+37 37 10 AC
+37 35 3 AC
+37 31 1 AC
+37 39 1 AC
+38 38 4 AC
+38 36 1 AC
+6 6 1521439 AC
+7 7 513952 AC
+8 8 134603 AC
+8 6 2 AC
+9 9 60741 AC
+9 7 3 AC
+9 11 1 AC
+10 10 21772 AG
+10 8 3 AG
+10 12 1 AG
+11 11 13880 AG
+11 9 10 AG
+11 13 1 AG
+12 12 5628 AG
+12 10 13 AG
+12 14 4 AG
+13 13 4494 AG
+13 11 17 AG
+14 14 1898 AG
+14 12 15 AG
+15 15 2427 AG
+15 13 18 AG
+16 16 1076 AG
+16 14 24 AG
+16 12 1 AG
+17 17 874 AG
+17 15 12 AG
+17 19 1 AG
+17 13 1 AG
+18 18 536 AG
+18 16 20 AG
+18 14 1 AG
+19 19 563 AG
+19 17 25 AG
+20 20 201 AG
+20 18 14 AG
+21 21 260 AG
+21 19 10 AG
+22 22 83 AG
+22 20 5 AG
+23 23 147 AG
+23 21 5 AG
+23 25 1 AG
+24 24 99 AG
+24 22 4 AG
+24 18 1 AG
+25 25 62 AG
+25 23 3 AG
+25 27 1 AG
+26 26 38 AG
+26 24 8 AG
+27 27 24 AG
+27 25 3 AG
+27 23 1 AG
+28 28 14 AG
+28 26 2 AG
+29 29 12 AG
+29 27 5 AG
+29 31 1 AG
+30 30 7 AG
+30 28 2 AG
+31 31 7 AG
+31 27 3 AG
+31 23 1 AG
+32 32 4 AG
+32 28 1 AG
+6 6 1880822 AG
+7 7 684837 AG
+7 9 1 AG
+8 8 183381 AG
+9 9 75547 AG
+9 7 6 AG
+9 11 1 AG
+10 10 18179 AT
+10 8 7 AT
+10 12 4 AT
+11 11 8969 AT
+11 9 5 AT
+11 13 2 AT
+12 12 4888 AT
+12 10 8 AT
+12 14 2 AT
+13 13 2785 AT
+13 11 17 AT
+13 15 1 AT
+14 14 2310 AT
+14 12 40 AT
+14 16 4 AT
+14 10 2 AT
+15 15 1461 AT
+15 13 33 AT
+15 11 1 AT
+15 17 1 AT
+16 16 879 AT
+16 14 42 AT
+16 18 2 AT
+16 12 1 AT
+17 17 599 AT
+17 15 38 AT
+17 19 2 AT
+17 13 1 AT
+18 18 367 AT
+18 16 29 AT
+18 20 7 AT
+18 14 1 AT
+19 19 223 AT
+19 17 34 AT
+19 21 3 AT
+20 20 97 AT
+20 18 14 AT
+20 16 2 AT
+20 22 1 AT
+21 21 60 AT
+21 19 18 AT
+21 17 1 AT
+22 22 53 AT
+22 20 15 AT
+22 24 5 AT
+22 18 3 AT
+23 23 11 AT
+23 21 1 AT
+24 24 7 AT
+24 20 2 AT
+24 22 2 AT
+6 6 1671932 AT
+6 8 1 AT
+7 7 595145 AT
+8 8 195533 AT
+8 10 5 AT
+8 6 2 AT
+9 9 52576 AT
+9 7 3 AT
+10 10 17 CG
+11 11 17 CG
+12 12 6 CG
+6 6 4097 CG
+7 7 678 CG
+8 8 184 CG
+9 9 19 CG
+10 10 19552 AAC
+11 11 19003 AAC
+12 12 6245 AAC
+12 9 1 AAC
+13 13 3406 AAC
+14 14 8448 AAC
+14 11 2 AAC
+15 15 2356 AAC
+15 12 6 AAC
+16 16 1373 AAC
+16 13 4 AAC
+17 17 3140 AAC
+17 14 5 AAC
+18 18 944 AAC
+18 15 2 AAC
+19 19 456 AAC
+19 16 1 AAC
+20 20 1474 AAC
+20 17 3 AAC
+21 21 328 AAC
+21 18 1 AAC
+22 22 178 AAC
+23 23 538 AAC
+23 26 1 AAC
+24 24 112 AAC
+25 25 60 AAC
+26 26 239 AAC
+26 23 1 AAC
+27 27 45 AAC
+28 28 58 AAC
+28 25 2 AAC
+29 29 77 AAC
+30 30 17 AAC
+31 31 38 AAC
+31 28 1 AAC
+32 32 94 AAC
+32 29 3 AAC
+33 33 15 AAC
+35 35 55 AAC
+35 32 1 AAC
+38 38 12 AAC
+41 41 6 AAC
+9 9 57212 AAC
+10 10 31455 AAG
+11 11 11876 AAG
+12 12 3458 AAG
+12 9 6 AAG
+13 13 1141 AAG
+14 14 928 AAG
+15 15 548 AAG
+15 12 4 AAG
+16 16 189 AAG
+17 17 235 AAG
+18 18 63 AAG
+19 19 66 AAG
+20 20 122 AAG
+22 22 11 AAG
+23 23 33 AAG
+9 9 104524 AAG
+10 10 69106 AAT
+11 11 30381 AAT
+12 12 12001 AAT
+12 9 1 AAT
+13 13 7168 AAT
+13 10 2 AAT
+14 14 5470 AAT
+14 11 3 AAT
+15 15 2524 AAT
+15 12 3 AAT
+16 16 1733 AAT
+16 13 1 AAT
+17 17 1324 AAT
+17 14 3 AAT
+18 18 1022 AAT
+18 15 3 AAT
+19 19 502 AAT
+19 16 3 AAT
+20 20 570 AAT
+20 17 2 AAT
+21 21 370 AAT
+21 18 1 AAT
+22 22 98 AAT
+23 23 164 AAT
+23 20 3 AAT
+24 24 143 AAT
+24 21 1 AAT
+25 25 122 AAT
+25 22 1 AAT
+26 26 45 AAT
+26 23 2 AAT
+27 27 32 AAT
+27 24 1 AAT
+28 28 6 AAT
+29 29 64 AAT
+29 26 1 AAT
+30 30 28 AAT
+30 24 1 AAT
+31 31 9 AAT
+32 32 9 AAT
+32 29 1 AAT
+38 38 6 AAT
+9 9 179182 AAT
+9 12 1 AAT
+10 10 14290 ACC
+11 11 5692 ACC
+12 12 1795 ACC
+13 13 1141 ACC
+14 14 545 ACC
+15 15 308 ACC
+16 16 162 ACC
+17 17 107 ACC
+18 18 23 ACC
+19 19 35 ACC
+20 20 44 ACC
+21 21 5 ACC
+22 22 5 ACC
+22 19 1 ACC
+23 23 11 ACC
+25 25 7 ACC
+26 26 7 ACC
+27 27 10 ACC
+28 28 24 ACC
+28 25 1 ACC
+35 35 5 ACC
+9 9 46614 ACC
+10 10 2865 ACG
+11 11 900 ACG
+12 12 325 ACG
+13 13 82 ACG
+14 14 83 ACG
+9 9 9465 ACG
+10 10 6269 ACT
+11 11 2284 ACT
+12 12 634 ACT
+13 13 441 ACT
+14 14 295 ACT
+15 15 118 ACT
+16 16 60 ACT
+17 17 71 ACT
+18 18 58 ACT
+19 19 42 ACT
+20 20 24 ACT
+24 24 5 ACT
+37 37 8 ACT
+41 41 5 ACT
+41 35 1 ACT
+9 9 20025 ACT
+10 10 2897 AGC
+11 11 948 AGC
+12 12 320 AGC
+13 13 97 AGC
+14 14 87 AGC
+15 15 13 AGC
+16 16 9 AGC
+17 17 25 AGC
+17 14 1 AGC
+9 9 9579 AGC
+10 10 21141 AGG
+11 11 8128 AGG
+12 12 2964 AGG
+13 13 1209 AGG
+14 14 860 AGG
+15 15 320 AGG
+16 16 190 AGG
+17 17 225 AGG
+18 18 147 AGG
+20 20 80 AGG
+21 21 9 AGG
+22 22 35 AGG
+23 23 27 AGG
+24 24 8 AGG
+26 26 9 AGG
+9 9 57350 AGG
+10 10 5964 ATC
+11 11 2346 ATC
+12 12 789 ATC
+13 13 386 ATC
+14 14 285 ATC
+15 15 165 ATC
+16 16 93 ATC
+17 17 149 ATC
+18 18 51 ATC
+19 19 6 ATC
+20 20 15 ATC
+21 21 15 ATC
+22 22 29 ATC
+23 23 25 ATC
+24 24 24 ATC
+26 26 34 ATC
+27 27 9 ATC
+28 28 30 ATC
+29 29 8 ATC
+30 30 8 ATC
+31 31 11 ATC
+34 34 11 ATC
+34 31 1 ATC
+36 36 5 ATC
+9 9 19837 ATC
+10 10 11 CCG
+11 11 24 CCG
+14 14 5 CCG
+16 16 5 CCG
+9 9 135 CCG
+12 12 10192 AAAC
+13 13 4917 AAAC
+14 14 4704 AAAC
+15 15 12713 AAAC
+16 16 2415 AAAC
+17 17 1431 AAAC
+18 18 1861 AAAC
+18 14 2 AAAC
+19 19 5254 AAAC
+19 15 2 AAAC
+19 23 1 AAAC
+20 20 913 AAAC
+20 16 1 AAAC
+21 21 615 AAAC
+22 22 509 AAAC
+22 18 2 AAAC
+23 23 2249 AAAC
+23 19 5 AAAC
+23 15 1 AAAC
+24 24 329 AAAC
+24 20 2 AAAC
+25 25 230 AAAC
+25 21 1 AAAC
+26 26 175 AAAC
+27 27 548 AAAC
+27 23 2 AAAC
+28 28 195 AAAC
+28 24 1 AAAC
+29 29 62 AAAC
+30 30 67 AAAC
+31 31 165 AAAC
+31 27 1 AAAC
+32 32 64 AAAC
+33 33 63 AAAC
+34 34 21 AAAC
+35 35 40 AAAC
+36 36 55 AAAC
+37 37 6 AAAC
+38 38 8 AAAC
+39 39 10 AAAC
+40 40 7 AAAC
+45 45 7 AAAC
+12 12 12855 AAAG
+12 16 13 AAAG
+12 20 9 AAAG
+12 18 2 AAAG
+13 13 6727 AAAG
+14 14 3699 AAAG
+14 13 8 AAAG
+15 15 3858 AAAG
+15 17 6 AAAG
+15 13 1 AAAG
+16 16 1244 AAAG
+17 17 750 AAAG
+17 13 1 AAAG
+18 18 380 AAAG
+18 20 5 AAAG
+18 14 1 AAAG
+19 19 1164 AAAG
+19 15 1 AAAG
+20 20 153 AAAG
+21 21 186 AAAG
+22 22 115 AAAG
+23 23 321 AAAG
+23 19 1 AAAG
+24 24 82 AAAG
+25 25 89 AAAG
+26 26 26 AAAG
+26 13 3 AAAG
+27 27 64 AAAG
+28 28 36 AAAG
+29 29 32 AAAG
+31 31 31 AAAG
+33 33 19 AAAG
+35 35 10 AAAG
+36 36 11 AAAG
+38 38 16 AAAG
+41 41 5 AAAG
+12 12 23143 AAAT
+13 13 10045 AAAT
+14 14 6815 AAAT
+15 15 8439 AAAT
+16 16 3102 AAAT
+16 12 2 AAAT
+17 17 2018 AAAT
+17 13 2 AAAT
+18 18 2044 AAAT
+19 19 2955 AAAT
+19 15 1 AAAT
+19 14 1 AAAT
+20 20 909 AAAT
+21 21 711 AAAT
+21 17 2 AAAT
+22 22 500 AAAT
+22 18 2 AAAT
+23 23 993 AAAT
+23 19 3 AAAT
+24 24 382 AAAT
+24 20 3 AAAT
+25 25 190 AAAT
+26 26 185 AAAT
+26 22 1 AAAT
+27 27 281 AAAT
+27 23 2 AAAT
+28 28 165 AAAT
+28 24 2 AAAT
+29 29 48 AAAT
+30 30 46 AAAT
+31 31 101 AAAT
+32 32 28 AAAT
+33 33 19 AAAT
+34 34 24 AAAT
+34 30 1 AAAT
+35 35 41 AAAT
+35 31 2 AAAT
+36 36 16 AAAT
+37 37 6 AAAT
+38 38 5 AAAT
+39 39 20 AAAT
+39 35 1 AAAT
+40 40 5 AAAT
+41 41 10 AAAT
+42 42 6 AAAT
+45 45 6 AAAT
+12 12 1468 AACC
+13 13 590 AACC
+14 14 318 AACC
+15 15 163 AACC
+16 16 102 AACC
+17 17 106 AACC
+18 18 18 AACC
+19 19 34 AACC
+20 20 7 AACC
+22 22 7 AACC
+23 23 13 AACC
+24 24 16 AACC
+25 25 9 AACC
+31 31 9 AACC
+12 12 214 AACG
+13 13 135 AACG
+14 14 39 AACG
+15 15 45 AACG
+12 12 522 AACT
+13 13 142 AACT
+14 14 143 AACT
+15 15 88 AACT
+16 16 16 AACT
+17 17 51 AACT
+18 18 7 AACT
+20 20 21 AACT
+21 21 27 AACT
+23 23 7 AACT
+24 24 11 AACT
+30 30 5 AACT
+12 12 346 AAGC
+13 13 83 AAGC
+14 14 60 AAGC
+15 15 40 AAGC
+16 16 21 AAGC
+18 18 9 AAGC
+19 19 7 AAGC
+12 12 4943 AAGG
+13 13 2714 AAGG
+14 14 1385 AAGG
+14 15 3 AAGG
+15 15 949 AAGG
+16 16 612 AAGG
+16 14 4 AAGG
+17 17 331 AAGG
+18 18 362 AAGG
+19 19 204 AAGG
+20 20 138 AAGG
+21 21 149 AAGG
+22 22 68 AAGG
+23 23 49 AAGG
+24 24 27 AAGG
+25 25 44 AAGG
+26 26 8 AAGG
+27 27 14 AAGG
+28 28 14 AAGG
+29 29 14 AAGG
+30 30 12 AAGG
+31 31 23 AAGG
+34 34 11 AAGG
+43 43 6 AAGG
+12 12 2676 AAGT
+13 13 1438 AAGT
+14 14 940 AAGT
+15 15 649 AAGT
+16 16 305 AAGT
+17 17 291 AAGT
+18 18 181 AAGT
+19 19 55 AAGT
+20 20 73 AAGT
+21 21 8 AAGT
+22 22 43 AAGT
+22 26 1 AAGT
+23 23 32 AAGT
+23 19 1 AAGT
+24 24 18 AAGT
+25 25 19 AAGT
+26 26 8 AAGT
+27 27 12 AAGT
+29 29 18 AAGT
+30 30 12 AAGT
+31 31 12 AAGT
+32 32 11 AAGT
+33 33 35 AAGT
+34 34 9 AAGT
+35 35 6 AAGT
+12 12 594 AATC
+13 13 205 AATC
+14 14 88 AATC
+15 15 112 AATC
+16 16 20 AATC
+17 17 81 AATC
+18 18 23 AATC
+21 21 13 AATC
+22 22 8 AATC
+24 24 19 AATC
+26 26 7 AATC
+28 28 9 AATC
+33 33 6 AATC
+12 12 2293 AATG
+13 13 1226 AATG
+14 14 678 AATG
+15 15 455 AATG
+16 16 222 AATG
+17 17 211 AATG
+18 18 104 AATG
+19 19 79 AATG
+20 20 40 AATG
+21 21 33 AATG
+22 22 73 AATG
+23 23 24 AATG
+24 24 16 AATG
+25 25 18 AATG
+26 26 15 AATG
+27 27 22 AATG
+27 23 1 AATG
+28 28 5 AATG
+32 32 17 AATG
+33 33 16 AATG
+12 12 2633 AATT
+13 13 1086 AATT
+14 14 1052 AATT
+15 15 386 AATT
+16 16 393 AATT
+17 17 98 AATT
+18 18 104 AATT
+19 19 105 AATT
+20 20 34 AATT
+21 21 12 AATT
+22 22 20 AATT
+25 25 18 AATT
+26 26 25 AATT
+27 27 7 AATT
+29 29 7 AATT
+35 35 12 AATT
+12 12 1406 ACAG
+13 13 964 ACAG
+14 14 300 ACAG
+15 15 130 ACAG
+16 16 102 ACAG
+17 17 49 ACAG
+18 18 30 ACAG
+19 19 88 ACAG
+20 20 5 ACAG
+23 23 5 ACAG
+12 12 4868 ACAT
+12 15 4 ACAT
+13 13 3216 ACAT
+14 14 957 ACAT
+15 15 1052 ACAT
+16 16 588 ACAT
+17 17 422 ACAT
+18 18 239 ACAT
+19 19 238 ACAT
+19 15 1 ACAT
+20 20 25 ACAT
+21 21 79 ACAT
+22 22 20 ACAT
+23 23 38 ACAT
+27 27 42 ACAT
+29 29 18 ACAT
+31 31 5 ACAT
+32 32 5 ACAT
+35 35 6 ACAT
+36 36 9 ACAT
+41 41 14 ACAT
+44 44 8 ACAT
+44 40 1 ACAT
+50 50 12 ACAT
+12 12 833 ACCC
+13 13 345 ACCC
+14 14 190 ACCC
+15 15 60 ACCC
+16 16 12 ACCC
+17 17 15 ACCC
+19 19 8 ACCG
+12 12 416 ACCT
+13 13 123 ACCT
+14 14 140 ACCT
+15 15 69 ACCT
+16 16 41 ACCT
+17 17 45 ACCT
+19 19 18 ACCT
+20 20 27 ACCT
+21 21 19 ACCT
+22 22 6 ACCT
+27 27 13 ACCT
+28 28 7 ACCT
+29 29 9 ACCT
+30 30 7 ACCT
+34 34 6 ACCT
+45 45 5 ACCT
+12 12 84 ACGC
+13 13 52 ACGC
+15 15 63 ACGC
+12 12 433 ACGG
+13 13 163 ACGG
+14 14 38 ACGG
+15 15 44 ACGG
+16 16 7 ACGG
+17 17 11 ACGG
+19 19 6 ACGG
+25 25 10 ACGG
+12 12 1119 ACGT
+13 13 509 ACGT
+14 14 338 ACGT
+15 15 16 ACGT
+16 16 66 ACGT
+17 17 7 ACGT
+19 19 27 ACGT
+12 12 2211 ACTC
+13 13 685 ACTC
+14 14 188 ACTC
+15 15 151 ACTC
+16 16 91 ACTC
+18 18 17 ACTC
+19 19 24 ACTC
+20 20 23 ACTC
+21 21 13 ACTC
+23 23 19 ACTC
+45 45 8 ACTC
+12 12 161 ACTG
+13 13 69 ACTG
+14 14 7 ACTG
+15 15 14 ACTG
+16 16 15 ACTG
+12 12 3118 AGAT
+13 13 1216 AGAT
+14 14 1084 AGAT
+15 15 869 AGAT
+16 16 508 AGAT
+17 17 322 AGAT
+18 18 159 AGAT
+19 19 258 AGAT
+20 20 63 AGAT
+21 21 84 AGAT
+22 22 69 AGAT
+22 14 6 AGAT
+23 23 112 AGAT
+24 24 107 AGAT
+25 25 36 AGAT
+26 26 113 AGAT
+27 27 42 AGAT
+28 28 58 AGAT
+29 29 37 AGAT
+30 30 16 AGAT
+31 31 32 AGAT
+32 32 24 AGAT
+33 33 10 AGAT
+34 34 43 AGAT
+35 35 6 AGAT
+36 36 13 AGAT
+36 32 1 AGAT
+37 37 35 AGAT
+38 38 34 AGAT
+39 39 20 AGAT
+39 35 2 AGAT
+40 40 27 AGAT
+41 41 29 AGAT
+42 42 30 AGAT
+43 43 87 AGAT
+44 44 67 AGAT
+45 45 20 AGAT
+46 46 15 AGAT
+47 47 28 AGAT
+48 48 26 AGAT
+49 49 13 AGAT
+50 50 11 AGAT
+52 52 5 AGAT
+54 54 6 AGAT
+12 12 236 AGCC
+13 13 109 AGCC
+14 14 17 AGCC
+15 15 14 AGCC
+16 16 8 AGCC
+18 18 12 AGCC
+21 21 18 AGCC
+23 23 13 AGCC
+12 12 23 AGCG
+13 13 19 AGCG
+18 18 9 AGCG
+12 12 272 AGCT
+13 13 89 AGCT
+14 14 108 AGCT
+15 15 49 AGCT
+16 16 19 AGCT
+17 17 19 AGCT
+18 18 19 AGCT
+19 19 44 AGCT
+22 22 12 AGCT
+27 27 16 AGCT
+12 12 87 AGGC
+13 13 19 AGGC
+14 14 16 AGGC
+18 18 7 AGGC
+12 12 3610 AGGG
+13 13 1980 AGGG
+14 14 1095 AGGG
+15 15 624 AGGG
+16 16 159 AGGG
+17 17 59 AGGG
+18 18 43 AGGG
+19 19 60 AGGG
+20 20 49 AGGG
+21 21 12 AGGG
+23 23 10 AGGG
+12 12 531 ATCC
+13 13 323 ATCC
+14 14 221 ATCC
+15 15 58 ATCC
+16 16 78 ATCC
+17 17 38 ATCC
+18 18 12 ATCC
+19 19 19 ATCC
+20 20 17 ATCC
+21 21 44 ATCC
+22 22 12 ATCC
+23 23 39 ATCC
+24 24 11 ATCC
+25 25 12 ATCC
+27 27 10 ATCC
+32 32 6 ATCC
+39 39 8 ATCC
+40 40 6 ATCC
+48 48 7 ATCC
+12 12 272 ATCG
+13 13 89 ATCG
+14 14 108 ATCG
+15 15 49 ATCG
+16 16 19 ATCG
+17 17 19 ATCG
+18 18 19 ATCG
+19 19 44 ATCG
+22 22 12 ATCG
+27 27 16 ATCG
+12 12 1119 ATGC
+13 13 509 ATGC
+14 14 338 ATGC
+15 15 16 ATGC
+16 16 66 ATGC
+17 17 7 ATGC
+19 19 27 ATGC
+12 12 13 CCCG
+12 12 178 AGTC
+13 13 77 AGTC
+14 14 13 AGTC
+15 15 12 AGTC
diff -r 000000000000 -r 20ab85af9505 test-data/combineprob_out.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/combineprob_out.txt Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,7 @@
+read_depth allele heterozygous_prob motif
+2 10_11 0.485943568663 A
+2 11_12 0.472130683091 A
+2 9_10 0.494635026326 A
+3 10_11 0.71878954705 A
+3 11_12 0.688571908761 A
+3 9_10 0.73801798345 A
diff -r 000000000000 -r 20ab85af9505 test-data/microsatcompat_in.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/microsatcompat_in.txt Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,3 @@
+15 64416346 64416378 AT 32 16 18 22 61 TA 0 ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@? ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 15 64416324 64416346 64416346 64416378 64416378 64416439 32 ATATATATATATATATATATATATATATATAT
+17 52191125 52191133 GA 8 4 8 26 67 AC 0 ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 17 52191099 52191125 52191125 52191133 52191133 52191200 8 ACACACAC
+17 52191125 52191133 AC 8 4 8 26 67 AG 0 ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 17 52191099 52191125 52191125 52191133 52191133 52191200 8 ACACACAC
diff -r 000000000000 -r 20ab85af9505 test-data/microsatcompat_out.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/microsatcompat_out.txt Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,3 @@
+15 64416346 64416378 AT 32 16 18 22 61 TA 0 ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@? ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 15 64416324 64416346 64416346 64416378 64416378 64416439 32 ATATATATATATATATATATATATATATATAT
+17 52191125 52191133 GA 8 4 8 26 67 AC 0 ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 17 52191099 52191125 52191125 52191133 52191133 52191200 8 ACACACAC
+17 52191125 52191133 AC 8 4 8 26 67 AG 0 ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 17 52191099 52191125 52191125 52191133 52191133 52191200 8 ACACACAC
diff -r 000000000000 -r 20ab85af9505 test-data/microsatellite_flanking_L.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/microsatellite_flanking_L.fastq Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,4 @@
+@SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCT
++SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG
diff -r 000000000000 -r 20ab85af9505 test-data/microsatellite_flanking_R.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/microsatellite_flanking_R.fastq Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,4 @@
+@SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+TTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG
++SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1
+GGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=9;(/=5'6=4:?>C*A<
diff -r 000000000000 -r 20ab85af9505 test-data/microsatpurity_in.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/microsatpurity_in.txt Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,3 @@
+15 64416346 64416378 AT 32 16 18 22 61 AT 0 ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@? ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 15 64416324 64416346 64416346 64416378 64416378 64416439 32 ATATATATATATATATATATATATATATATAT
+15 64416346 64416378 AT 32 16 18 22 61 AT 0 ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@? ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 15 64416324 64416346 64416346 64416378 64416378 64416439 32 ATATATATATATATATATTATATATATATAT
+17 52191125 52191133 AC 8 4 8 26 67 AC 0 ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 17 52191099 52191125 52191125 52191133 52191133 52191200 8 ACACACAC
diff -r 000000000000 -r 20ab85af9505 test-data/microsatpurity_out.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/microsatpurity_out.txt Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,2 @@
+15 64416346 64416378 AT 32 16 18 22 61 AT 0 ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 TTCCTTTATAAGAAATCTTTACatatatatatatatatatGACTGTTTTGCTTTGTTTTGAGTTTCATAAAAATAGTATCATGGGGGCCGGTCACGGTGGC CCCFFFFFGHHFFIJIHGHIGIGGEGGIGHEGBHIIIJIFGCHGGIIJJEEIEIADHGICBFIGIGCGIJIIIGIIHIGDHGIIJHF>C888=@DB92<@? ERR194158.789781069_HSQ1008:176:D0UYCACXX:2:1201:4831:11242/1_1_per2_1 15 64416324 64416346 64416346 64416378 64416378 64416439 32 ATATATATATATATATATATATATATATATAT
+17 52191125 52191133 AC 8 4 8 26 67 AC 0 ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 CTTCCAGGGCCCTTCCAATGCCAAAAacacacacCTTTTTCCCCTGACCCTCTGTCAGTCTTCTGAATTTAAAGCTGGGCTCTGGGACTTACCAGTGTGAG CCCFFFFFHHHHHJJJJJJJJJJJJJJIHIIJIJJJJJJJJJJJIGIJJJJJJJHIJJIIJJJHHHHHHHFFFFFCEEDDDDDDDDBDDDDDDDDDCCCDC ERR194158.781426177_HSQ1008:176:D0UYCACXX:2:1109:7175:90983/1_1_per2_1 17 52191099 52191125 52191125 52191133 52191133 52191200 8 ACACACAC
diff -r 000000000000 -r 20ab85af9505 test-data/nice1tab.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/nice1tab.py Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,6 @@
+import sys
+fd=open(sys.argv[1])
+lines=fd.readlines()
+for line in lines:
+ temp=line.strip().split()
+ print '\t'.join(temp)
\ No newline at end of file
diff -r 000000000000 -r 20ab85af9505 test-data/probvalueforhetero_in.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/probvalueforhetero_in.txt Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,9 @@
+chr 9,10 A hetero -1.27220836321 10 10 9
+chr 10,11 A hetero -0.939119957032 11 11 10
+chr 11,12 A hetero -0.720375026792 12 12 11
+chr 9,9,10 A hetero -1.6841441619 9 9 10
+chr 9,10,10 A hetero -0.97233405327 10 10 9
+chr 10,10,11 A hetero -1.29451118958 10 10 11
+chr 10,11,11 A hetero -0.641022011041 11 11 10
+chr 11,11,12 A hetero -1.01921634129 11 11 12
+chr 11,12,12 A hetero -0.425116661902 12 12 11
diff -r 000000000000 -r 20ab85af9505 test-data/probvalueforhetero_out.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/probvalueforhetero_out.txt Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,9 @@
+chr 9,10 A hetero -1.27220836321 10 10 9 0.247317513163 2 0.494635026326 2
+chr 10,11 A hetero -0.939119957032 11 11 10 0.242971784331 2 0.485943568663 2
+chr 11,12 A hetero -0.720375026792 12 12 11 0.236065341545 2 0.472130683091 2
+chr 9,9,10 A hetero -1.6841441619 9 9 10 0.124528157268 3 0.373584471803 3
+chr 9,10,10 A hetero -0.97233405327 10 10 9 0.121477837216 3 0.364433511647 3
+chr 10,10,11 A hetero -1.29451118958 10 10 11 0.122575544751 3 0.367726634253 3
+chr 10,11,11 A hetero -0.641022011041 11 11 10 0.117020970932 3 0.351062912797 3
+chr 11,11,12 A hetero -1.01921634129 11 11 12 0.11865253007 3 0.35595759021 3
+chr 11,12,12 A hetero -0.425116661902 12 12 11 0.110871439517 3 0.332614318551 3
diff -r 000000000000 -r 20ab85af9505 test-data/profilegenerator_in.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/profilegenerator_in.txt Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,6 @@
+9 9 100000
+10 10 91456
+10 9 1259
+11 11 39657
+11 10 1211
+11 12 514
diff -r 000000000000 -r 20ab85af9505 test-data/profilegenerator_out.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/profilegenerator_out.txt Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,30 @@
+chr 9,9 A
+chr 9,10 A
+chr 9,11 A
+chr 9,12 A
+chr 10,10 A
+chr 10,11 A
+chr 10,12 A
+chr 11,11 A
+chr 11,12 A
+chr 12,12 A
+chr 9,9,9 A
+chr 9,9,10 A
+chr 9,9,11 A
+chr 9,9,12 A
+chr 9,10,10 A
+chr 9,10,11 A
+chr 9,10,12 A
+chr 9,11,11 A
+chr 9,11,12 A
+chr 9,12,12 A
+chr 10,10,10 A
+chr 10,10,11 A
+chr 10,10,12 A
+chr 10,11,11 A
+chr 10,11,12 A
+chr 10,12,12 A
+chr 11,11,11 A
+chr 11,11,12 A
+chr 11,12,12 A
+chr 12,12,12 A
diff -r 000000000000 -r 20ab85af9505 test-data/readdepth2seqdepth.out
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/readdepth2seqdepth.out Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,2 @@
+repeat_length read_length informative_read_depth =locus_specific_sequencing_depth =genome_wide_sequencing_depth
+10 100 10 20 26
diff -r 000000000000 -r 20ab85af9505 test-data/samplePESAM_2_profile_C.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samplePESAM_2_profile_C.txt Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,5 @@
+M01368:22:000000000-A4T24:1:1101:10010:3775_1:N:0:2_1_per1_1 shifted 540 713 713 719 719 759 6 GGGGGG
+M01368:22:000000000-A4T24:1:1101:10015:2849_1:N:0:2_1_per1_2 shifted 4007 4082 4082 4088 4088 4258 6 TTTTTT
+M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_1 shifted 1849 1930 1930 1936 1936 2100 6 CCCCCC
+M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_2 shifted 1849 2025 2025 2030 2030 2100 5 GGGGG
+M01368:22:000000000-A4T24:1:1101:10126:5433_1:N:0:2_1_per1_1 shifted 1428 1517 1517 1522 1522 1543 5 AAAAA
diff -r 000000000000 -r 20ab85af9505 test-data/sampleTRgenotypingcorrection
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sampleTRgenotypingcorrection Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,2 @@
+chr1 14,13,13,13 A hetero -0.429451855856 13 13 14
+chr1 5,6,6,6,6,7,7,8,8 A hetero -14.8744881854 7 6 8
diff -r 000000000000 -r 20ab85af9505 test-data/sampleTRprofile_C.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sampleTRprofile_C.txt Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,2 @@
+chr1 14,13,13,13 A
+chr1 5,6,6,6,6,7,7,8,8 A
diff -r 000000000000 -r 20ab85af9505 test-data/samplefq.snoope
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samplefq.snoope Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,1 @@
+6 40 54 G 0 SRR345592.75000006 HS2000-192_107:1:63:5822:176818_1_per1_1 TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCTggggggTTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG?FFDFGGGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=9;(/=5'6=4:?>C*A<
diff -r 000000000000 -r 20ab85af9505 test-data/samplefq.snoope.new
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samplefq.snoope.new Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,1 @@
+6 40 54 G 0 SRR345592.75000006_HS2000-192_107:1:63:5822:176818_1_per1_1 TACCCTCCTGTCTTCCCAGACTGATTTCTGTTCCTGCCCTggggggTTCTTGACTCCTCTGAATGGGTACGGGAGTGTGGACCTCAGGGAGGCCCCCTTG GGGGGGGGGGGGGGGGGFGGGGGGGGGFEGGGGGGGGGGG?FFDFGGGGGG?FFFGGGGGDEGGEFFBEFCEEBD@BACB*?=9;(/=5'6=4:?>C*A<
diff -r 000000000000 -r 20ab85af9505 test-data/sampleprofilegenerator_in
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sampleprofilegenerator_in Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,6 @@
+9 9 100000
+10 10 91456
+10 9 1259
+11 11 39657
+11 10 1211
+11 12 514
diff -r 000000000000 -r 20ab85af9505 test-data/sampleprofilegenerator_out
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sampleprofilegenerator_out Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,30 @@
+chr 9,9 A
+chr 9,10 A
+chr 9,11 A
+chr 9,12 A
+chr 10,10 A
+chr 10,11 A
+chr 10,12 A
+chr 11,11 A
+chr 11,12 A
+chr 12,12 A
+chr 9,9,9 A
+chr 9,9,10 A
+chr 9,9,11 A
+chr 9,9,12 A
+chr 9,10,10 A
+chr 9,10,11 A
+chr 9,10,12 A
+chr 9,11,11 A
+chr 9,11,12 A
+chr 9,12,12 A
+chr 10,10,10 A
+chr 10,10,11 A
+chr 10,10,12 A
+chr 10,11,11 A
+chr 10,11,12 A
+chr 10,12,12 A
+chr 11,11,11 A
+chr 11,11,12 A
+chr 11,12,12 A
+chr 12,12,12 A
diff -r 000000000000 -r 20ab85af9505 test-data/samplesortedPESAM_C.sam
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samplesortedPESAM_C.sam Fri Oct 03 20:54:30 2014 -0400
@@ -0,0 +1,10 @@
+M01368:22:000000000-A4T24:1:1101:10010:3775_1:N:0:2_1_per1_1 113 shifted 720 37 40M = 541 -46 TTCGTGCACACAGCCCAGCTTGGAGCGAACGACCTACACC HHFG@IIHHHHHIHHFHHGFGGGGDBDDEDDDBBB????? XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:40
+M01368:22:000000000-A4T24:1:1101:10010:3775_1:N:0:2_1_per1_1 177 shifted 541 37 173M = 720 46 CTTCTAGTGTAGCCGTAGTTAGGCCACCACTTCAAGAACTCTGTAGCACCGCCTACATACCTCGCTCTGCTAATCCTGTTACCAGTGGCTGCTGCCAGTGGCGATAAGTCGTGTCTTACCGGGTTGGACTCAAGACGATAGTTACCGGATAAGGCGCAGCGGTCGGGCTGAAC ::GECC:*:)DACGGEC:CC?>>2GEGGGGGEEGGGGGGGGGGGGGEEECEGEAGGEEGEB>=GGFGEAGHHHEHHHFHFF?ED;HFIHHIIIIHIIHHHHIHHHHIHHHHHHHHIIIIHIHHHHIHHHHHIIHHIIHHIIHIIIIIGGGGGGDDDDDDDDBBB????< XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:170
+M01368:22:000000000-A4T24:1:1101:10015:2849_1:N:0:2_1_per1_2 177 shifted 4008 37 75M = 4089 176 TGCCATAACCATGAGTGATAACACTGCGGCCAACTTACTTCTGACAACGATCGGAGGACCGAAGGAGCTAACCGC CEGGEEEECC?:EEGECGGGGECGGGGEEGGEEGCCGEGGGGGGGGGGDGGGGGE>EEGGGGGGGGGGGAGGGGE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:75
+M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_1 129 shifted 1937 37 164M = 1850 -87 TCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGT HHHHIHHHHHHHHHHHHHHHHHHHHHGGFGGGGGGGHGGGGGGGGGGGGEGGGGGGAEEGGGEGGGGGGEGEEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGECGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGCEGEGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:138T25
+M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_1 65 shifted 1850 37 81M = 1937 87 CCCTTAACAGTACATAGTACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGA ?????BBBEEDBBDDDGGGGGGIIIIIIIIIIIIIHHHHHIIIIIIIIIIIIIIIIIIIIIIIIIIIHIHHHIIIIIIHGH XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:81
+M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_2 129 shifted 2031 37 70M = 1850 -181 TAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGCCATAAAGCCTAAATAGCCCACACGT GGGGGGGGECGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGEGGGCEGEGG XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:44T25
+M01368:22:000000000-A4T24:1:1101:10070:4955_1:N:0:2_1_per1_2 65 shifted 1850 37 176M = 2031 181 CCCTTAACAGTACATAGTACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCCTCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTT ?????BBBEEDBBDDDGGGGGGIIIIIIIIIIIIIHHHHHIIIIIIIIIIIIIIIIIIIIIIIIIIIHIHHHIIIIIIHGHIIIHHHHHHHIHHHHHHHHHHHHHHHHHHHHHGGFGGGGGGGHGGGGGGGGGGGGEGGGGGGAEEGGGEGGGGGGEGEEGGGGGGGGGGGGGGGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:176
+M01368:22:000000000-A4T24:1:1101:10126:5433_1:N:0:2_1_per1_1 129 shifted 1523 37 21M = 1429 -94 GTCTTTAACTCCACCATTAGC GGGEGGEGGGGGCGGGGGEGG XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:21
+M01368:22:000000000-A4T24:1:1101:10126:5433_1:N:0:2_1_per1_1 65 shifted 1429 37 89M = 1523 94 CTATGCATCCAACGCGTTGGGAGCTCTCCCATATGGTCGACCTGCAGGCGGCCGCGAATTCACTAGTGATTTCCAAGGACAAATCAGAG ?????BBBDDDDDDDDGGGFGGFEHIIIIIIIHIIIHIHHHHHIIHFHHHHHHHHHHHHHHHHHHHHGGGGGGGGGGGGGGGGGGEGEE XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:89
diff -r 000000000000 -r 20ab85af9505 test-data/shifted.2bit
Binary file test-data/shifted.2bit has changed