# HG changeset patch # User davidvanzessen # Date 1512489433 18000 # Node ID aa8d37bd1930590f6ebe57c1b7a18bd34201c922 # Parent 275e759e7985085af2f6a94babb5c5e7565a3adb Uploaded diff -r 275e759e7985 -r aa8d37bd1930 shm_csr.py --- a/shm_csr.py Fri Aug 18 10:43:31 2017 -0400 +++ b/shm_csr.py Tue Dec 05 10:57:13 2017 -0500 @@ -23,7 +23,10 @@ genedic = dict() mutationdic = dict() - mutationMatcher = re.compile("^(.)(\d+).(.),?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?") + mutationMatcher = re.compile("^(.)(\d+).(.),?[ ]?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?") + mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?.?([A-Z])?(.*)?") + mutationMatcher = re.compile("^([actg])(\d+).([actg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?") + mutationMatcher = re.compile("^([nactg])(\d+).([nactg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?") NAMatchResult = (None, None, None, None, None, None, '') geneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes} linecount = 0 @@ -78,17 +81,45 @@ linesplt = line.split("\t") ID = linesplt[IDIndex] genedic[ID] = linesplt[best_matchIndex] + + mutationdic[ID + "_FR1"] = [] + if len(linesplt[fr1Index]) > 5 and empty_region_filter == "leader": + mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] + + mutationdic[ID + "_CDR1"] = [] + if len(linesplt[cdr1Index]) > 5 and empty_region_filter in ["leader", "FR1"]: + mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] + + mutationdic[ID + "_FR2"] = [] + if len(linesplt[fr2Index]) > 5 and empty_region_filter in ["leader", "FR1", "CDR1"]: + mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] + + mutationdic[ID + "_CDR2"] = [] + if len(linesplt[cdr2Index]) > 5: + mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] + + mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + + mutationdic[ID + "_FR3"] = [] + if len(linesplt[fr3Index]) > 5: + mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] + try: - mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x] if (linesplt[fr1Index] != "NA" and empty_region_filter == "leader") else [] - mutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x] if (linesplt[cdr1Index] != "NA" and empty_region_filter in ["leader", "FR1"]) else [] - mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x] if (linesplt[fr2Index] != "NA" and empty_region_filter in ["leader", "FR1", "CDR1"]) else [] - mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x] if (linesplt[cdr2Index] != "NA") else [] - mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] - mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x] if linesplt[fr3Index] != "NA" else [] + pass except Exception as e: print "Something went wrong while processing this line:" + print "line:", linecount + print "fr1 len:", len(linesplt[fr1Index]), "value:", linesplt[fr1Index] + print "cdr1 len:", len(linesplt[cdr1Index]), "value:", linesplt[cdr1Index] + print "fr2 len:", len(linesplt[fr2Index]), "value:", linesplt[fr2Index] + print "cdr2 len:", len(linesplt[cdr2Index]), "value:", linesplt[cdr2Index] + print "fr3 len:", len(linesplt[fr3Index]), "value:", linesplt[fr3Index] + print ID + "_FR1 in mutationdic", ID + "_FR1" in mutationdic + print ID + "_CDR1 in mutationdic", ID + "_CDR1" in mutationdic + print ID + "_FR2 in mutationdic", ID + "_FR2" in mutationdic + print ID + "_CDR2 in mutationdic", ID + "_CDR2" in mutationdic + print ID + "_FR3 in mutationdic", ID + "_FR3" in mutationdic print linesplt - print linecount print e mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] @@ -109,7 +140,12 @@ fr3LengthDict[ID] = fr3Length IDlist += [ID] + print "len(mutationdic) =", len(mutationdic) + with open(os.path.join(os.path.dirname(os.path.abspath(infile)), "mutationdict.txt"), 'w') as out_handle: + for ID, lst in mutationdic.iteritems(): + for mut in lst: + out_handle.write("{0}\t{1}\n".format(ID, "\t".join([str(x) for x in mut]))) #tandem mutation stuff tandem_frequency = defaultdict(int) @@ -222,7 +258,7 @@ #print mutationList, linecount - AALength = (int(max(mutationList, key=lambda i: int(i[4]) if i[4] else 0)[4]) + 1) # [4] is the position of the AA mutation, None if silent + AALength = (int(max(mutationList, key=lambda i: int(i[4]) if i[4] and i[5] != ";" else 0)[4]) + 1) # [4] is the position of the AA mutation, None if silent if AALength < 60: AALength = 64 @@ -338,13 +374,17 @@ [hotspotMatcher.match(x).groups() for x in linesplt[tatIndex].split("|") if x]] RGYWCount[ID], WRCYCount[ID], WACount[ID], TWCount[ID] = 0, 0, 0, 0 + with open(os.path.join(os.path.dirname(os.path.abspath(infile)), "RGYW.txt"), 'a') as out_handle: + for hotspot in RGYW: + out_handle.write("{0}\t{1}\n".format(ID, "\t".join([str(x) for x in hotspot]))) + mutationList = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] for mutation in mutationList: frm, where, to, AAfrm, AAwhere, AAto, junk = mutation - mutation_in_RGYW = any([(start <= int(where) <= end) for (start, end, region) in RGYW]) - mutation_in_WRCY = any([(start <= int(where) <= end) for (start, end, region) in WRCY]) - mutation_in_WA = any([(start <= int(where) <= end) for (start, end, region) in WA]) - mutation_in_TW = any([(start <= int(where) <= end) for (start, end, region) in TW]) + mutation_in_RGYW = any(((start <= int(where) <= end) for (start, end, region) in RGYW)) + mutation_in_WRCY = any(((start <= int(where) <= end) for (start, end, region) in WRCY)) + mutation_in_WA = any(((start <= int(where) <= end) for (start, end, region) in WA)) + mutation_in_TW = any(((start <= int(where) <= end) for (start, end, region) in TW)) in_how_many_motifs = sum([mutation_in_RGYW, mutation_in_WRCY, mutation_in_WA, mutation_in_TW]) diff -r 275e759e7985 -r aa8d37bd1930 shm_csr.r --- a/shm_csr.r Fri Aug 18 10:43:31 2017 -0400 +++ b/shm_csr.r Tue Dec 05 10:57:13 2017 -0500 @@ -10,7 +10,9 @@ empty.region.filter = args[4] setwd(outputdir) -dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F) +#dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F) + +dat = data.frame(fread(input, sep="\t", header=T, stringsAsFactors=F)) #fread because read.table suddenly skips certain rows... if(length(dat$Sequence.ID) == 0){ setwd(outputdir)