changeset 63:8728284105ee draft

Uploaded
author davidvanzessen
date Wed, 06 Dec 2017 08:04:52 -0500
parents aa8d37bd1930
children c6dd3215ebe0
files baseline/filter.r baseline/script_imgt.py baseline/wrapper.sh shm_csr.py
diffstat 4 files changed, 82 insertions(+), 43 deletions(-) [+]
line wrap: on
line diff
--- a/baseline/filter.r	Tue Dec 05 10:57:13 2017 -0500
+++ b/baseline/filter.r	Wed Dec 06 08:04:52 2017 -0500
@@ -9,6 +9,20 @@
 summarydat = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
 gappeddat = read.table(gappedfile, header=T, sep="\t", fill=T, stringsAsFactors=F)
 
+fix_column_names = function(df){
+    if("V.DOMAIN.Functionality" %in% names(df)){
+        names(df)[names(df) == "V.DOMAIN.Functionality"] = "Functionality"
+        print("found V.DOMAIN.Functionality, changed")
+    }
+    if("V.DOMAIN.Functionality.comment" %in% names(df)){
+        names(df)[names(df) == "V.DOMAIN.Functionality.comment"] = "Functionality.comment"
+        print("found V.DOMAIN.Functionality.comment, changed")
+    }
+    return(df)
+}
+
+gappeddat = fix_column_names(gappeddat)
+
 #dat = data.frame(merge(gappeddat, summarydat, by="Sequence.ID", all.x=T))
 
 dat = cbind(gappeddat, summarydat$AA.JUNCTION)
@@ -24,12 +38,18 @@
 dat$JGene = gsub("^Homsap ", "", dat$J.GENE.and.allele)
 dat$JGene = gsub("[*].*", "", dat$JGene)
 
-#print(str(dat))
+print(str(dat))
 
 dat$past = do.call(paste, c(dat[unlist(strsplit(selection, ","))], sep = ":"))
 
 dat = dat[!duplicated(dat$past), ]
 
+print(paste("Sequences remaining after duplicate filter:", nrow(dat)))
+
 dat = dat[dat$Functionality != "No results" & dat$Functionality != "unproductive",]
 
+print(paste("Sequences remaining after functionality filter:", nrow(dat)))
+
+print(paste("Sequences remaining:", nrow(dat)))
+
 write.table(x=dat, file=output, sep="\t",quote=F,row.names=F,col.names=T)
--- a/baseline/script_imgt.py	Tue Dec 05 10:57:13 2017 -0500
+++ b/baseline/script_imgt.py	Wed Dec 06 08:04:52 2017 -0500
@@ -10,12 +10,18 @@
 
 args = parser.parse_args()
 
+print "script_imgt.py"
+print "input:", args.input
+print "ref:", args.ref
+print "output:", args.output
+print "id:", args.id
+
 refdic = dict()
 with open(args.ref, 'rU') as ref:
 	currentSeq = ""
 	currentId = ""
 	for line in ref:
-		if line[0] is ">":
+		if line.startswith(">"):
 			if currentSeq is not "" and currentId is not "":
 				refdic[currentId[1:]] = currentSeq
 			currentId = line.rstrip()
@@ -23,7 +29,8 @@
 		else:
 			currentSeq += line.rstrip()
 	refdic[currentId[1:]] = currentSeq
-	
+
+print "Have", str(len(refdic)), "reference sequences"
 
 vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#,
 #						r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)",
@@ -37,16 +44,13 @@
 vPattern = re.compile("|".join(vPattern))
 
 def filterGene(s, pattern):
-	s1 = s[s.find(" ") + 1:]
-	return s1[:s1.find(" ")]
-	"""
     if type(s) is not str:
         return None
     res = pattern.search(s)
     if res:
         return res.group(0)
     return None
-	"""
+
 
 
 currentSeq = ""
--- a/baseline/wrapper.sh	Tue Dec 05 10:57:13 2017 -0500
+++ b/baseline/wrapper.sh	Wed Dec 06 08:04:52 2017 -0500
@@ -41,7 +41,7 @@
 do
 	f=$(file $current)
 	zipType="Zip archive"
-	if [[ "$f" == *"$zipType"* ]] || [[ "$f" == *"XZ compressed data"* ]]
+	if [[ "$f" == *"Zip archive"* ]] || [[ "$f" == *"XZ compressed data"* ]]
 	then
 		id=${IDs[$count]}
 		echo "id=$id"
@@ -55,19 +55,13 @@
 			mkdir -p "$PWD/$id/files"
 			tar -xJf $current -C "$PWD/$id/files/"
 		fi
-		summaryfile="$PWD/summary_${id}.txt"
-		gappedfile="$PWD/gappednt_${id}.txt"
 		filtered="$PWD/filtered_${id}.txt"
-		filecount=`ls -l $PWD/$id/ | wc -l`
-		if [[ "$filecount" -eq "2" ]]
-		then
-			cat $PWD/$id/*/1_* > $summaryfile
-			cat $PWD/$id/*/2_* > $gappedfile
-		else
-			cat $PWD/$id/1_* > $summaryfile
-			cat $PWD/$id/2_* > $gappedfile
-		fi
-		Rscript $dir/filter.r $summaryfile $gappedfile "$selection" $filtered 2>&1
+		imgt_1_file="`find $PWD/$id -name '1_*.txt'`"
+		imgt_2_file="`find $PWD/$id -name '2_*.txt'`"
+		echo "1_Summary file: ${imgt_1_file}"
+		echo "2_IMGT-gapped file: ${imgt_2_file}"
+		echo "filter.r for $id"
+		Rscript $dir/filter.r ${imgt_1_file} ${imgt_2_file} "$selection" $filtered 2>&1
 		
 		final="$PWD/final_${id}.txt"
 		cat $filtered | cut -f2,4,7 > $final
@@ -77,12 +71,6 @@
 	fi
 	count=$((count+1))
 done
-
-if [[ $(wc -l < $fasta) -eq "1" ]]; then
-	echo "No sequences in the fasta file, exiting"
-	exit 0
-fi
-
 workdir="$PWD"
 cd $dir
 echo "file: ${inputs[0]}"
--- a/shm_csr.py	Tue Dec 05 10:57:13 2017 -0500
+++ b/shm_csr.py	Wed Dec 06 08:04:52 2017 -0500
@@ -104,23 +104,6 @@
 			if len(linesplt[fr3Index]) > 5:
 				mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x]
 				
-			try:
-				pass
-			except Exception as e:
-				print "Something went wrong while processing this line:"
-				print "line:", linecount
-				print "fr1 len:", len(linesplt[fr1Index]), "value:", linesplt[fr1Index]
-				print "cdr1 len:", len(linesplt[cdr1Index]), "value:", linesplt[cdr1Index]
-				print "fr2 len:", len(linesplt[fr2Index]), "value:", linesplt[fr2Index]
-				print "cdr2 len:", len(linesplt[cdr2Index]), "value:", linesplt[cdr2Index]
-				print "fr3 len:", len(linesplt[fr3Index]), "value:", linesplt[fr3Index]
-				print ID + "_FR1 in mutationdic", ID + "_FR1" in mutationdic
-				print ID + "_CDR1 in mutationdic", ID + "_CDR1" in mutationdic
-				print ID + "_FR2 in mutationdic", ID + "_FR2" in mutationdic
-				print ID + "_CDR2 in mutationdic", ID + "_CDR2" in mutationdic
-				print ID + "_FR3 in mutationdic", ID + "_FR3" in mutationdic
-				print linesplt
-				print e
 			mutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
 			mutationListByID[ID] = mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"]
 
@@ -393,6 +376,50 @@
 					WRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs
 					WACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs
 					TWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs
+			
+			mutations_in_motifs_file = os.path.join(os.path.dirname(os.path.abspath(infile)), "mutation_in_motifs.txt")
+			if not os.path.exists(mutation_by_id_file):
+				with open(mutations_in_motifs_file, 'w') as out_handle:
+					out_handle.write("{0}\n".format("\t".join([
+						"Sequence.ID",
+						"mutation_position",
+						"region",
+						"from_nt",
+						"to_nt",
+						"mutation_position_AA",
+						"from_AA",
+						"to_AA",
+						"motif",
+						"motif_start_nt",
+						"motif_end_nt",
+						"rest"
+					])))
+
+			with open(mutations_in_motifs_file, 'a') as out_handle:
+				motif_dic = {"RGYW": RGYW, "WRCY": WRCY, "WA": WA, "TW": TW}
+				for mutation in mutationList:
+					frm, where, to, AAfrm, AAwhere, AAto, junk = mutation
+					for motif in motif_dic.keys():
+							
+						for start, end, region in motif_dic[motif]:
+							if start <= int(where) <= end:
+								out_handle.write("{0}\n".format(
+									"\t".join([
+										ID,
+										where,
+										region,
+										frm,
+										to,
+										str(AAwhere),
+										str(AAfrm),
+										str(AAto),
+										motif,
+										str(start),
+										str(end),
+										str(junk)
+									])
+								))
+
 
 
 	def mean(lst):