# HG changeset patch # User davidvanzessen # Date 1493898189 14400 # Node ID 64711f461c8e7fa71cf6d6f9475f472ff4ab391f # Parent cfc9a442e59d72371a2961cce688172e1d0f1a2f Uploaded diff -r cfc9a442e59d -r 64711f461c8e imgt_loader.r --- a/imgt_loader.r Wed Apr 12 04:28:16 2017 -0400 +++ b/imgt_loader.r Thu May 04 07:43:09 2017 -0400 @@ -9,6 +9,22 @@ aa = read.table(aa.file, sep="\t", header=T, quote="", fill=T) junction = read.table(junction.file, sep="\t", header=T, quote="", fill=T) +fix_column_names = function(df){ + if("V.DOMAIN.Functionality" %in% names(df)){ + names(df)[names(df) == "V.DOMAIN.Functionality"] = "Functionality" + print("found V.DOMAIN.Functionality, changed") + } + if("V.DOMAIN.Functionality.comment" %in% names(df)){ + names(df)[names(df) == "V.DOMAIN.Functionality.comment"] = "Functionality.comment" + print("found V.DOMAIN.Functionality.comment, changed") + } + return(df) +} + +summ = fix_column_names(summ) +aa = fix_column_names(aa) +junction = fix_column_names(junction) + old_summary_columns=c('Sequence.ID','JUNCTION.frame','V.GENE.and.allele','D.GENE.and.allele','J.GENE.and.allele','CDR1.IMGT.length','CDR2.IMGT.length','CDR3.IMGT.length','Orientation') old_sequence_columns=c('CDR1.IMGT','CDR2.IMGT','CDR3.IMGT') old_junction_columns=c('JUNCTION') diff -r cfc9a442e59d -r 64711f461c8e merge_and_filter.r --- a/merge_and_filter.r Wed Apr 12 04:28:16 2017 -0400 +++ b/merge_and_filter.r Thu May 04 07:43:09 2017 -0400 @@ -26,6 +26,25 @@ AAs = read.table(aafile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") gene_identification = read.table(gene_identification_file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") +fix_column_names = function(df){ + if("V.DOMAIN.Functionality" %in% names(df)){ + names(df)[names(df) == "V.DOMAIN.Functionality"] = "Functionality" + print("found V.DOMAIN.Functionality, changed") + } + if("V.DOMAIN.Functionality.comment" %in% names(df)){ + names(df)[names(df) == "V.DOMAIN.Functionality.comment"] = "Functionality.comment" + print("found V.DOMAIN.Functionality.comment, changed") + } + return(df) +} + +summ = fix_column_names(summ) +sequences = fix_column_names(sequences) +mutationanalysis = fix_column_names(mutationanalysis) +mutationstats = fix_column_names(mutationstats) +hotspots = fix_column_names(hotspots) +AAs = fix_column_names(AAs) + if(method == "blastn"){ #"qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" gene_identification = gene_identification[!duplicated(gene_identification$qseqid),] @@ -36,8 +55,8 @@ colnames(gene_identification) = c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match") } -print("Summary analysis files columns") -print(names(summ)) +#print("Summary analysis files columns") +#print(names(summ)) @@ -75,6 +94,14 @@ filtering.steps = rbind(filtering.steps, c("After functionality filter", nrow(summ))) +if(FALSE){ #to speed up debugging + set.seed(1) + summ = summ[sample(nrow(summ), floor(nrow(summ) * 0.05)),] + print(paste("Number of sequences after sampling 5%:", nrow(summ))) + + filtering.steps = rbind(filtering.steps, c("Number of sequences after sampling 5%", nrow(summ))) +} + print("mutation analysis files columns") print(names(mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])])) @@ -82,8 +109,8 @@ print(paste("Number of sequences after merging with mutation analysis file:", nrow(result))) -print("mutation stats files columns") -print(names(mutationstats[,!(names(mutationstats) %in% names(result)[-1])])) +#print("mutation stats files columns") +#print(names(mutationstats[,!(names(mutationstats) %in% names(result)[-1])])) result = merge(result, mutationstats[,!(names(mutationstats) %in% names(result)[-1])], by="Sequence.ID") @@ -135,10 +162,10 @@ write.table(x=result, file=gsub("merged.txt$", "before_filters.txt", output), sep="\t",quote=F,row.names=F,col.names=T) -print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == ""))) -print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == ""))) -print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == ""))) -print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == ""))) +print(paste("Number of empty CDR1 sequences:", sum(result$CDR1.IMGT.seq == "", na.rm=T))) +print(paste("Number of empty FR2 sequences:", sum(result$FR2.IMGT.seq == "", na.rm=T))) +print(paste("Number of empty CDR2 sequences:", sum(result$CDR2.IMGT.seq == "", na.rm=T))) +print(paste("Number of empty FR3 sequences:", sum(result$FR3.IMGT.seq == "", na.rm=T))) if(empty.region.filter == "leader"){ result = result[result$FR1.IMGT.seq != "" & result$CDR1.IMGT.seq != "" & result$FR2.IMGT.seq != "" & result$CDR2.IMGT.seq != "" & result$FR3.IMGT.seq != "", ] @@ -219,6 +246,8 @@ # result[i,"past"] = paste(result[i,cls], collapse=":") #} + + result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":")) result.matched = result[!grepl("unmatched", result$best_match),] diff -r cfc9a442e59d -r 64711f461c8e shm_csr.py --- a/shm_csr.py Wed Apr 12 04:28:16 2017 -0400 +++ b/shm_csr.py Thu May 04 07:43:09 2017 -0400 @@ -1,287 +1,436 @@ -from __future__ import division +import argparse +import logging +import sys +import os +import re + from collections import defaultdict -import re -import argparse + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--input", help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation") + parser.add_argument("--genes", help="The genes available in the 'best_match' column") + parser.add_argument("--empty_region_filter", help="Where does the sequence start?", choices=['leader', 'FR1', 'CDR1', 'FR2']) + parser.add_argument("--output", help="Output file") -parser = argparse.ArgumentParser() -parser.add_argument("--input", - help="The '7_V-REGION-mutation-and-AA-change-table' and '10_V-REGION-mutation-hotspots' merged together, with an added 'best_match' annotation") -parser.add_argument("--genes", help="The genes available in the 'best_match' column") -parser.add_argument("--empty_region_filter", help="Where does the sequence start?", choices=['leader', 'FR1', 'CDR1', 'FR2']) -parser.add_argument("--output", help="Output file") + args = parser.parse_args() + + infile = args.input + genes = str(args.genes).split(",") + empty_region_filter = args.empty_region_filter + outfile = args.output -args = parser.parse_args() + genedic = dict() -infile = args.input -genes = str(args.genes).split(",") -empty_region_filter = args.empty_region_filter -outfile = args.output - -genedic = dict() + mutationdic = dict() + mutationMatcher = re.compile("^(.)(\d+).(.),?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?") + NAMatchResult = (None, None, None, None, None, None, '') + geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes} + linecount = 0 -mutationdic = dict() -mutationMatcher = re.compile("^(.)(\d+).(.),?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?") -NAMatchResult = (None, None, None, None, None, None, '') -linecount = 0 + IDIndex = 0 + best_matchIndex = 0 + fr1Index = 0 + cdr1Index = 0 + fr2Index = 0 + cdr2Index = 0 + fr3Index = 0 + first = True + IDlist = [] + mutationList = [] + mutationListByID = {} + cdr1LengthDic = {} + cdr2LengthDic = {} + + fr1LengthDict = {} + fr2LengthDict = {} + fr3LengthDict = {} + + cdr1LengthIndex = 0 + cdr2LengthIndex = 0 -IDIndex = 0 -best_matchIndex = 0 -fr1Index = 0 -cdr1Index = 0 -fr2Index = 0 -cdr2Index = 0 -fr3Index = 0 -first = True -IDlist = [] -mutationList = [] -mutationListByID = {} -cdr1LengthDic = {} -cdr2LengthDic = {} + fr1SeqIndex = 0 + fr2SeqIndex = 0 + fr3SeqIndex = 0 + + tandem_sum_by_class = defaultdict(int) + expected_tandem_sum_by_class = defaultdict(float) -with open(infile, 'ru') as i: - for line in i: - if first: + with open(infile, 'ru') as i: + for line in i: + if first: + linesplt = line.split("\t") + IDIndex = linesplt.index("Sequence.ID") + best_matchIndex = linesplt.index("best_match") + fr1Index = linesplt.index("FR1.IMGT") + cdr1Index = linesplt.index("CDR1.IMGT") + fr2Index = linesplt.index("FR2.IMGT") + cdr2Index = linesplt.index("CDR2.IMGT") + fr3Index = linesplt.index("FR3.IMGT") + cdr1LengthIndex = linesplt.index("CDR1.IMGT.seq") + cdr2LengthIndex = linesplt.index("CDR2.IMGT.seq") + fr1SeqIndex = linesplt.index("FR1.IMGT.seq") + fr2SeqIndex = linesplt.index("FR2.IMGT.seq") + fr3SeqIndex = linesplt.index("FR3.IMGT.seq") + first = False + continue + linecount += 1 linesplt = line.split("\t") - IDIndex = linesplt.index("Sequence.ID") - best_matchIndex = linesplt.index("best_match") - fr1Index = linesplt.index("FR1.IMGT") - cdr1Index = linesplt.index("CDR1.IMGT") - fr2Index = linesplt.index("FR2.IMGT") - cdr2Index = linesplt.index("CDR2.IMGT") - fr3Index = linesplt.index("FR3.IMGT") - cdr1LengthIndex = linesplt.index("CDR1.IMGT.length") - cdr2LengthIndex = linesplt.index("CDR2.IMGT.length") - first = False - continue - linecount += 1 - linesplt = line.split("\t") - ID = linesplt[IDIndex] - genedic[ID] = linesplt[best_matchIndex] - try: - mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if (linesplt[fr1Index] != "NA" and empty_region_filter == "leader") else [] - mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if (linesplt[cdr1Index] != "NA" and empty_region_filter in ["leader", "FR1"]) else [] - mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if (linesplt[fr2Index] != "NA" and empty_region_filter in ["leader", "FR1", "CDR1"]) else [] - mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if (linesplt[cdr2Index] != "NA") else [] - mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] - mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] if linesplt[fr3Index] != "NA" else [] - except Exception as e: - print "Something went wrong while processing this line:" - print linesplt - print linecount - print e - mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] - mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] + ID = linesplt[IDIndex] + genedic[ID] = linesplt[best_matchIndex] + try: + mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if (linesplt[fr1Index] != "NA" and empty_region_filter == "leader") else [] + mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if (linesplt[cdr1Index] != "NA" and empty_region_filter in ["leader", "FR1"]) else [] + mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if (linesplt[fr2Index] != "NA" and empty_region_filter in ["leader", "FR1", "CDR1"]) else [] + mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if (linesplt[cdr2Index] != "NA") else [] + mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] if linesplt[fr3Index] != "NA" else [] + except Exception as e: + print "Something went wrong while processing this line:" + print linesplt + print linecount + print e + mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] + mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] + + cdr1Length = len(linesplt[cdr1LengthIndex]) + cdr2Length = len(linesplt[cdr2LengthIndex]) + + #print linesplt[fr2SeqIndex] + fr1Length = len(linesplt[fr1SeqIndex]) if empty_region_filter == "leader" else 0 + fr2Length = len(linesplt[fr2SeqIndex]) if empty_region_filter in ["leader", "FR1", "CDR1"] else 0 + fr3Length = len(linesplt[fr3SeqIndex]) + + cdr1LengthDic[ID] = cdr1Length + cdr2LengthDic[ID] = cdr2Length + + fr1LengthDict[ID] = fr1Length + fr2LengthDict[ID] = fr2Length + fr3LengthDict[ID] = fr3Length + + IDlist += [ID] + + + #tandem mutation stuff + tandem_frequency = defaultdict(int) + mutation_frequency = defaultdict(int) + + tandem_file = os.path.join(os.path.dirname(outfile), "tandems_by_id.txt") + with open(tandem_file, 'w') as o: + highest_tandem_length = 0 + + o.write("Sequence.ID\tnumber_of_mutations\tnumber_of_tandems\tregion_length\texpected_tandems\tlongest_tandem\ttandems\n") + for ID in IDlist: + mutations = mutationListByID[ID] + if len(mutations) == 0: + continue + last_mut = max(mutations, key=lambda x: int(x[1])) + + last_mut_pos = int(last_mut[1]) + + mut_positions = [False] * (last_mut_pos + 1) + + for mutation in mutations: + frm, where, to, frmAA, whereAA, toAA, thing = mutation + where = int(where) + mut_positions[where] = True + + tandem_muts = [] + tandem_start = -1 + tandem_length = 0 + for i in range(len(mut_positions)): + if mut_positions[i]: + if tandem_start == -1: + tandem_start = i + tandem_length += 1 + #print "".join(["1" if x else "0" for x in mut_positions[:i+1]]) + else: + if tandem_length > 1: + tandem_muts.append((tandem_start, tandem_length)) + #print "{0}{1} {2}:{3}".format(" " * (i - tandem_length), "^" * tandem_length, tandem_start, tandem_length) + tandem_start = -1 + tandem_length = 0 + if tandem_length > 1: # if the sequence ends with a tandem mutation + tandem_muts.append((tandem_start, tandem_length)) + + if len(tandem_muts) > 0: + if highest_tandem_length < len(tandem_muts): + highest_tandem_length = len(tandem_muts) - cdr1Length = linesplt[cdr1LengthIndex] - cdr2Length = linesplt[cdr2LengthIndex] + region_length = fr1LengthDict[ID] + cdr1LengthDic[ID] + fr2LengthDict[ID] + cdr2LengthDic[ID] + fr3LengthDict[ID] + longest_tandem = max(tandem_muts, key=lambda x: x[1]) if len(tandem_muts) else (0, 0) + num_mutations = len(mutations) + f_num_mutations = float(num_mutations) + num_tandem_muts = len(tandem_muts) + expected_tandem_muts = f_num_mutations * (f_num_mutations - 1.0) / float(region_length) + o.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(ID, + str(num_mutations), + str(num_tandem_muts), + str(region_length), + str(round(expected_tandem_muts, 2)), + str(longest_tandem[1]), + str(tandem_muts))) + gene = genedic[ID] + if gene.find("unmatched") == -1: + tandem_sum_by_class[gene] += num_tandem_muts + expected_tandem_sum_by_class[gene] += expected_tandem_muts - cdr1LengthDic[ID] = int(cdr1Length) if cdr1Length != "X" else 0 - cdr2LengthDic[ID] = int(cdr2Length) if cdr2Length != "X" else 0 - - IDlist += [ID] + tandem_sum_by_class["all"] += num_tandem_muts + expected_tandem_sum_by_class["all"] += expected_tandem_muts -#print mutationList, linecount + gene = gene[:3] + if gene in ["IGA", "IGG"]: + tandem_sum_by_class[gene] += num_tandem_muts + expected_tandem_sum_by_class[gene] += expected_tandem_muts + else: + tandem_sum_by_class["unmatched"] += num_tandem_muts + expected_tandem_sum_by_class["unmatched"] += expected_tandem_muts + -AALength = (int(max(mutationList, key=lambda i: int(i[4]) if i[4] else 0)[4]) + 1) # [4] is the position of the AA mutation, None if silent -if AALength < 60: - AALength = 64 + for tandem_mut in tandem_muts: + tandem_frequency[str(tandem_mut[1])] += 1 + #print "\t".join([ID, str(len(tandem_muts)), str(longest_tandem[1]) , str(tandem_muts)]) + + tandem_freq_file = os.path.join(os.path.dirname(outfile), "tandem_frequency.txt") + with open(tandem_freq_file, 'w') as o: + for frq in sorted([int(x) for x in tandem_frequency.keys()]): + o.write("{0}\t{1}\n".format(frq, tandem_frequency[str(frq)])) -AA_mutation = [0] * AALength -AA_mutation_dic = {"IGA": AA_mutation[:], "IGG": AA_mutation[:], "IGM": AA_mutation[:], "IGE": AA_mutation[:], "unm": AA_mutation[:], "all": AA_mutation[:]} -AA_mutation_empty = AA_mutation[:] + tandem_row = [] + print genes + print tandem_sum_by_class + print expected_tandem_sum_by_class + genes_extra = list(genes) + genes_extra.append("all") + for x, y, in zip([tandem_sum_by_class[x] for x in genes_extra], [expected_tandem_sum_by_class[x] for x in genes_extra]): + if y != 0: + tandem_row += [x, round(y, 2), round(x / y, 2)] + else: + tandem_row += [x, round(y, 2), 0] + + """ + print tandem_row + tandem_row += tandem_row[-3:] + print tandem_row + all_expected_tandem = expected_tandem_sum_by_class["all"] + all_tandem = tandem_sum_by_class["all"] + if all_expected_tandem == 0: + tandem_row[-6:-3] = [all_tandem, round(all_expected_tandem, 2), 0] + else: + tandem_row[-6:-3] = [all_tandem, round(all_expected_tandem, 2), round(all_tandem / all_expected_tandem, 2)] + print tandem_row + """ + for i in range(len(genes_extra)): + gene = genes_extra[i] + print gene, tandem_row[i*3:i*3+3] -aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/aa_id_mutations.txt" -with open(aa_mutations_by_id_file, 'w') as o: - o.write("ID\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n") - for ID in mutationListByID.keys(): - AA_mutation_for_ID = AA_mutation_empty[:] - for mutation in mutationListByID[ID]: - if mutation[4]: - AA_mutation_position = int(mutation[4]) - AA_mutation[AA_mutation_position] += 1 - AA_mutation_for_ID[AA_mutation_position] += 1 - clss = genedic[ID][:3] - AA_mutation_dic[clss][AA_mutation_position] += 1 - o.write(ID + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in AA_mutation_for_ID[1:]]) + "\n") + tandem_freq_file = os.path.join(os.path.dirname(outfile), "shm_overview_tandem_row.txt") + with open(tandem_freq_file, 'w') as o: + o.write("Tandems/Expected (ratio),{0}\n".format(",".join([str(x) for x in tandem_row]))) + + #print mutationList, linecount + + AALength = (int(max(mutationList, key=lambda i: int(i[4]) if i[4] else 0)[4]) + 1) # [4] is the position of the AA mutation, None if silent + if AALength < 60: + AALength = 64 + + AA_mutation = [0] * AALength + AA_mutation_dic = {"IGA": AA_mutation[:], "IGG": AA_mutation[:], "IGM": AA_mutation[:], "IGE": AA_mutation[:], "unm": AA_mutation[:], "all": AA_mutation[:]} + AA_mutation_empty = AA_mutation[:] + + aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/aa_id_mutations.txt" + with open(aa_mutations_by_id_file, 'w') as o: + o.write("ID\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n") + for ID in mutationListByID.keys(): + AA_mutation_for_ID = AA_mutation_empty[:] + for mutation in mutationListByID[ID]: + if mutation[4]: + AA_mutation_position = int(mutation[4]) + AA_mutation[AA_mutation_position] += 1 + AA_mutation_for_ID[AA_mutation_position] += 1 + clss = genedic[ID][:3] + AA_mutation_dic[clss][AA_mutation_position] += 1 + o.write(ID + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in AA_mutation_for_ID[1:]]) + "\n") -#absent AA stuff -absentAACDR1Dic = defaultdict(list) -absentAACDR1Dic[5] = range(29,36) -absentAACDR1Dic[6] = range(29,35) -absentAACDR1Dic[7] = range(30,35) -absentAACDR1Dic[8] = range(30,34) -absentAACDR1Dic[9] = range(31,34) -absentAACDR1Dic[10] = range(31,33) -absentAACDR1Dic[11] = [32] + #absent AA stuff + absentAACDR1Dic = defaultdict(list) + absentAACDR1Dic[5] = range(29,36) + absentAACDR1Dic[6] = range(29,35) + absentAACDR1Dic[7] = range(30,35) + absentAACDR1Dic[8] = range(30,34) + absentAACDR1Dic[9] = range(31,34) + absentAACDR1Dic[10] = range(31,33) + absentAACDR1Dic[11] = [32] -absentAACDR2Dic = defaultdict(list) -absentAACDR2Dic[0] = range(55,65) -absentAACDR2Dic[1] = range(56,65) -absentAACDR2Dic[2] = range(56,64) -absentAACDR2Dic[3] = range(57,64) -absentAACDR2Dic[4] = range(57,63) -absentAACDR2Dic[5] = range(58,63) -absentAACDR2Dic[6] = range(58,62) -absentAACDR2Dic[7] = range(59,62) -absentAACDR2Dic[8] = range(59,61) -absentAACDR2Dic[9] = [60] + absentAACDR2Dic = defaultdict(list) + absentAACDR2Dic[0] = range(55,65) + absentAACDR2Dic[1] = range(56,65) + absentAACDR2Dic[2] = range(56,64) + absentAACDR2Dic[3] = range(57,64) + absentAACDR2Dic[4] = range(57,63) + absentAACDR2Dic[5] = range(58,63) + absentAACDR2Dic[6] = range(58,62) + absentAACDR2Dic[7] = range(59,62) + absentAACDR2Dic[8] = range(59,61) + absentAACDR2Dic[9] = [60] -absentAA = [len(IDlist)] * (AALength-1) -for k, cdr1Length in cdr1LengthDic.iteritems(): - for c in absentAACDR1Dic[cdr1Length]: - absentAA[c] -= 1 + absentAA = [len(IDlist)] * (AALength-1) + for k, cdr1Length in cdr1LengthDic.iteritems(): + for c in absentAACDR1Dic[cdr1Length]: + absentAA[c] -= 1 -for k, cdr2Length in cdr2LengthDic.iteritems(): - for c in absentAACDR2Dic[cdr2Length]: - absentAA[c] -= 1 + for k, cdr2Length in cdr2LengthDic.iteritems(): + for c in absentAACDR2Dic[cdr2Length]: + absentAA[c] -= 1 -aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/absent_aa_id.txt" -with open(aa_mutations_by_id_file, 'w') as o: - o.write("ID\tcdr1length\tcdr2length\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n") - for ID in IDlist: - absentAAbyID = [1] * (AALength-1) - cdr1Length = cdr1LengthDic[ID] - for c in absentAACDR1Dic[cdr1Length]: - absentAAbyID[c] -= 1 + aa_mutations_by_id_file = outfile[:outfile.rindex("/")] + "/absent_aa_id.txt" + with open(aa_mutations_by_id_file, 'w') as o: + o.write("ID\tcdr1length\tcdr2length\tbest_match\t" + "\t".join([str(x) for x in range(1,AALength)]) + "\n") + for ID in IDlist: + absentAAbyID = [1] * (AALength-1) + cdr1Length = cdr1LengthDic[ID] + for c in absentAACDR1Dic[cdr1Length]: + absentAAbyID[c] -= 1 - cdr2Length = cdr2LengthDic[ID] - for c in absentAACDR2Dic[cdr2Length]: - absentAAbyID[c] -= 1 - o.write(ID + "\t" + str(cdr1Length) + "\t" + str(cdr2Length) + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in absentAAbyID]) + "\n") + cdr2Length = cdr2LengthDic[ID] + for c in absentAACDR2Dic[cdr2Length]: + absentAAbyID[c] -= 1 + o.write(ID + "\t" + str(cdr1Length) + "\t" + str(cdr2Length) + "\t" + genedic[ID] + "\t" + "\t".join([str(x) for x in absentAAbyID]) + "\n") -if linecount == 0: - print "No data, exiting" - with open(outfile, 'w') as o: - o.write("RGYW (%)," + ("0,0,0\n" * len(genes))) - o.write("WRCY (%)," + ("0,0,0\n" * len(genes))) - o.write("WA (%)," + ("0,0,0\n" * len(genes))) - o.write("TW (%)," + ("0,0,0\n" * len(genes))) - import sys + if linecount == 0: + print "No data, exiting" + with open(outfile, 'w') as o: + o.write("RGYW (%)," + ("0,0,0\n" * len(genes))) + o.write("WRCY (%)," + ("0,0,0\n" * len(genes))) + o.write("WA (%)," + ("0,0,0\n" * len(genes))) + o.write("TW (%)," + ("0,0,0\n" * len(genes))) + import sys - sys.exit() + sys.exit() -hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)") -RGYWCount = {} -WRCYCount = {} -WACount = {} -TWCount = {} + hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)") + RGYWCount = {} + WRCYCount = {} + WACount = {} + TWCount = {} -#IDIndex = 0 -ataIndex = 0 -tatIndex = 0 -aggctatIndex = 0 -atagcctIndex = 0 -first = True -with open(infile, 'ru') as i: - for line in i: - if first: + #IDIndex = 0 + ataIndex = 0 + tatIndex = 0 + aggctatIndex = 0 + atagcctIndex = 0 + first = True + with open(infile, 'ru') as i: + for line in i: + if first: + linesplt = line.split("\t") + ataIndex = linesplt.index("X.a.t.a") + tatIndex = linesplt.index("t.a.t.") + aggctatIndex = linesplt.index("X.a.g.g.c.t..a.t.") + atagcctIndex = linesplt.index("X.a.t..a.g.c.c.t.") + first = False + continue linesplt = line.split("\t") - ataIndex = linesplt.index("X.a.t.a") - tatIndex = linesplt.index("t.a.t.") - aggctatIndex = linesplt.index("X.a.g.g.c.t..a.t.") - atagcctIndex = linesplt.index("X.a.t..a.g.c.c.t.") - first = False - continue - linesplt = line.split("\t") - gene = linesplt[best_matchIndex] - ID = linesplt[IDIndex] - RGYW = [(int(x), int(y), z) for (x, y, z) in - [hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]] - WRCY = [(int(x), int(y), z) for (x, y, z) in - [hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]] - WA = [(int(x), int(y), z) for (x, y, z) in - [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]] - TW = [(int(x), int(y), z) for (x, y, z) in - [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]] - RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0 + gene = linesplt[best_matchIndex] + ID = linesplt[IDIndex] + RGYW = [(int(x), int(y), z) for (x, y, z) in + [hotspotMatcher.match(x).groups() for x in linesplt[aggctatIndex].split("|") if x]] + WRCY = [(int(x), int(y), z) for (x, y, z) in + [hotspotMatcher.match(x).groups() for x in linesplt[atagcctIndex].split("|") if x]] + WA = [(int(x), int(y), z) for (x, y, z) in + [hotspotMatcher.match(x).groups() for x in linesplt[ataIndex].split("|") if x]] + TW = [(int(x), int(y), z) for (x, y, z) in + [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]] + RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0 - mutationList = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] - for mutation in mutationList: - frm, where, to, AAfrm, AAwhere, AAto, junk = mutation - mutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW]) - mutation_in_WRCY = any([(start <= int(where) <= end) for (start, end, region) in WRCY]) - mutation_in_WA = any([(start <= int(where) <= end) for (start, end, region) in WA]) - mutation_in_TW = any([(start <= int(where) <= end) for (start, end, region) in TW]) + mutationList = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] + for mutation in mutationList: + frm, where, to, AAfrm, AAwhere, AAto, junk = mutation + mutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW]) + mutation_in_WRCY = any([(start <= int(where) <= end) for (start, end, region) in WRCY]) + mutation_in_WA = any([(start <= int(where) <= end) for (start, end, region) in WA]) + mutation_in_TW = any([(start <= int(where) <= end) for (start, end, region) in TW]) - in_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW]) + in_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW]) - if in_how_many_motifs > 0: - RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs - WRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs - WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs - TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs + if in_how_many_motifs > 0: + RGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs + WRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs + WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs + TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs -def mean(lst): - return (float(sum(lst)) / len(lst)) if len(lst) > 0 else 0.0 + def mean(lst): + return (float(sum(lst)) / len(lst)) if len(lst) > 0 else 0.0 -def median(lst): - lst = sorted(lst) - l = len(lst) - if l == 0: - return 0 - if l == 1: - return lst[0] + def median(lst): + lst = sorted(lst) + l = len(lst) + if l == 0: + return 0 + if l == 1: + return lst[0] + + l = int(l / 2) - l = int(l / 2) - - if len(lst) % 2 == 0: - return float(lst[l] + lst[(l - 1)]) / 2.0 - else: - return lst[l] - -funcs = {"mean": mean, "median": median, "sum": sum} + if len(lst) % 2 == 0: + return float(lst[l] + lst[(l - 1)]) / 2.0 + else: + return lst[l] -directory = outfile[:outfile.rfind("/") + 1] -value = 0 -valuedic = dict() + funcs = {"mean": mean, "median": median, "sum": sum} -for fname in funcs.keys(): - for gene in genes: - with open(directory + gene + "_" + fname + "_value.txt", 'r') as v: - valuedic[gene + "_" + fname] = float(v.readlines()[0].rstrip()) - with open(directory + "all_" + fname + "_value.txt", 'r') as v: - valuedic["total_" + fname] = float(v.readlines()[0].rstrip()) - + directory = outfile[:outfile.rfind("/") + 1] + value = 0 + valuedic = dict() -def get_xyz(lst, gene, f, fname): - x = round(round(f(lst), 1)) - y = valuedic[gene + "_" + fname] - z = str(round(x / float(y) * 100, 1)) if y != 0 else "0" - return (str(x), str(y), z) + for fname in funcs.keys(): + for gene in genes: + with open(directory + gene + "_" + fname + "_value.txt", 'r') as v: + valuedic[gene + "_" + fname] = float(v.readlines()[0].rstrip()) + with open(directory + "all_" + fname + "_value.txt", 'r') as v: + valuedic["total_" + fname] = float(v.readlines()[0].rstrip()) + -dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount} -arr = ["RGYW", "WRCY", "WA", "TW"] + def get_xyz(lst, gene, f, fname): + x = round(round(f(lst), 1)) + y = valuedic[gene + "_" + fname] + z = str(round(x / float(y) * 100, 1)) if y != 0 else "0" + return (str(x), str(y), z) -geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes} + dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount} + arr = ["RGYW", "WRCY", "WA", "TW"] -for fname in funcs.keys(): - func = funcs[fname] - foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt" - with open(foutfile, 'w') as o: - for typ in arr: - o.write(typ + " (%)") - curr = dic[typ] - for gene in genes: - geneMatcher = geneMatchers[gene] - if valuedic[gene + "_" + fname] is 0: - o.write(",0,0,0") - else: - x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname) - o.write("," + x + "," + y + "," + z) - x, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname) - #x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname) - o.write("," + x + "," + y + "," + z + "\n") + for fname in funcs.keys(): + func = funcs[fname] + foutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt" + with open(foutfile, 'w') as o: + for typ in arr: + o.write(typ + " (%)") + curr = dic[typ] + for gene in genes: + geneMatcher = geneMatchers[gene] + if valuedic[gene + "_" + fname] is 0: + o.write(",0,0,0") + else: + x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname) + o.write("," + x + "," + y + "," + z) + x, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname) + #x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname) + o.write("," + x + "," + y + "," + z + "\n") -# for testing -seq_motif_file = outfile[:outfile.rindex("/")] + "/motif_per_seq.txt" -with open(seq_motif_file, 'w') as o: - o.write("ID\tRGYW\tWRCY\tWA\tTW\n") - for ID in IDlist: - #o.write(ID + "\t" + str(round(RGYWCount[ID], 2)) + "\t" + str(round(WRCYCount[ID], 2)) + "\t" + str(round(WACount[ID], 2)) + "\t" + str(round(TWCount[ID], 2)) + "\n") - o.write(ID + "\t" + str(RGYWCount[ID]) + "\t" + str(WRCYCount[ID]) + "\t" + str(WACount[ID]) + "\t" + str(TWCount[ID]) + "\n") + # for testing + seq_motif_file = outfile[:outfile.rindex("/")] + "/motif_per_seq.txt" + with open(seq_motif_file, 'w') as o: + o.write("ID\tRGYW\tWRCY\tWA\tTW\n") + for ID in IDlist: + #o.write(ID + "\t" + str(round(RGYWCount[ID], 2)) + "\t" + str(round(WRCYCount[ID], 2)) + "\t" + str(round(WACount[ID], 2)) + "\t" + str(round(TWCount[ID], 2)) + "\n") + o.write(ID + "\t" + str(RGYWCount[ID]) + "\t" + str(WRCYCount[ID]) + "\t" + str(WACount[ID]) + "\t" + str(TWCount[ID]) + "\n") + +if __name__ == "__main__": + main() diff -r cfc9a442e59d -r 64711f461c8e shm_csr.xml --- a/shm_csr.xml Wed Apr 12 04:28:16 2017 -0400 +++ b/shm_csr.xml Thu May 04 07:43:09 2017 -0400 @@ -39,6 +39,7 @@ + @@ -183,7 +184,8 @@ *Chunk hit percentage*: The percentage of the chunks that is aligned -*Nt hit percentage*: The percentage of chunks covering the subclass specific nucleotide match with the different subclasses. The most stringent filter for the subclass is 70% ‘nt hit percentage’ which means that 5 out of 7 subclass specific nucleotides for Cα or 6 out of 8 subclass specific nucleotides of Cγ should match with the specific subclass. +*Nt hit percentage*: The percentage of chunks covering the subclass specific nucleotide match with the different subclasses. The most stringent filter for the subclass is 70% ‘nt hit percentage’ which means that 5 out of 7 subclass specific nucleotides for Cα or 6 out of 8 subclass specific nucleotides of Cγ should match with the specific subclass. +The option “>25% class” can be chosen when you only are interested in the class (Cα/Cγ/Cµ/Cɛ) of your sequences and the length of your sequence is not long enough to assign the subclasses. ----- diff -r cfc9a442e59d -r 64711f461c8e wrapper.sh --- a/wrapper.sh Wed Apr 12 04:28:16 2017 -0400 +++ b/wrapper.sh Thu May 04 07:43:09 2017 -0400 @@ -251,7 +251,7 @@ echo "---------------- $func table ----------------" echo "---------------- $func table ----------------
" >> $log - cat $outdir/mutations_${func}.txt $outdir/hotspot_analysis_${func}.txt > $outdir/data_${func}.txt + cat $outdir/mutations_${func}.txt $outdir/shm_overview_tandem_row.txt $outdir/hotspot_analysis_${func}.txt > $outdir/data_${func}.txt echo "---------------- pattern_plots.r ----------------" echo "---------------- pattern_plots.r ----------------
" >> $log @@ -276,7 +276,7 @@ while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz cex cey cez unx uny unz allx ally allz do - if [ "$name" == "FR R/S (ratio)" ] || [ "$name" == "CDR R/S (ratio)" ] ; then #meh + if [ "$name" == "FR R/S (ratio)" ] || [ "$name" == "CDR R/S (ratio)" ] || [ "$name" == "Tandems/Expected (ratio)" ] ; then #meh echo "$name${cax}/${cay} (${caz})${ca1x}/${ca1y} (${ca1z})${ca2x}/${ca2y} (${ca2z})${cgx}/${cgy} (${cgz})${cg1x}/${cg1y} (${cg1z})${cg2x}/${cg2y} (${cg2z})${cg3x}/${cg3y} (${cg3z})${cg4x}/${cg4y} (${cg4z})${cmx}/${cmy} (${cmz})${cex}/${cey} (${cez})${allx}/${ally} (${allz})${unx}/${uny} (${unz})" >> $output elif [ "$name" == "Median of Number of Mutations (%)" ] ; then echo "$name${caz}%${ca1z}%${ca2z}%${cgz}%${cg1z}%${cg2z}%${cg3z}%${cg4z}%${cmz}%${cez}%${allz}%${unz}%" >> $output @@ -665,6 +665,7 @@ echo "The data used to generate the percentage of mutations in AID and pol eta motives plotDownload" >> $output echo "The data used to generate the relative mutation patterns plotDownload" >> $output echo "The data used to generate the absolute mutation patterns plotDownload" >> $output +echo "Data about tandem mutations by IDDownload" >> $output echo "SHM Frequency" >> $output echo "The data generate the frequency scatter plotDownload" >> $output