annotate nt_overview.r @ 92:cf8ad181628f draft

planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
author rhpvorderman
date Mon, 12 Dec 2022 12:32:44 +0000
parents
children 385dea3c6cb5
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
92
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
1 args <- commandArgs(trailingOnly = TRUE)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
2
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
3 merged.file = args[1]
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
4 outputdir = args[2]
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
5 gene.classes = unlist(strsplit(args[3], ","))
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
6 hotspot.analysis.sum.file = args[4]
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
7 NToverview.file = paste(outputdir, "ntoverview.txt", sep="/")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
8 empty.region.filter = args[5]
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
9
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
10
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
11 setwd(outputdir)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
12
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
13 merged = read.table(merged.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
14 hotspot.analysis.sum = read.table(hotspot.analysis.sum.file, header=F, sep=",", fill=T, stringsAsFactors=F, quote="")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
15
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
16 #ACGT overview
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
17
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
18 NToverview = merged
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
19
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
20 if(empty.region.filter == "leader"){
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
21 NToverview$seq = paste(NToverview$FR1.IMGT.seq, NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
22 } else if(empty.region.filter == "FR1"){
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
23 NToverview$seq = paste(NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
24 } else if(empty.region.filter == "CDR1"){
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
25 NToverview$seq = paste(NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
26 } else if(empty.region.filter == "FR2"){
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
27 NToverview$seq = paste(NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
28 }
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
29
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
30 NToverview$A = nchar(gsub("[^Aa]", "", NToverview$seq))
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
31 NToverview$C = nchar(gsub("[^Cc]", "", NToverview$seq))
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
32 NToverview$G = nchar(gsub("[^Gg]", "", NToverview$seq))
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
33 NToverview$T = nchar(gsub("[^Tt]", "", NToverview$seq))
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
34
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
35 #Nsum = data.frame(Sequence.ID="-", best_match="Sum", seq="-", A = sum(NToverview$A), C = sum(NToverview$C), G = sum(NToverview$G), T = sum(NToverview$T))
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
36
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
37 #NToverview = rbind(NToverview, NTsum)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
38
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
39 NTresult = data.frame(nt=c("A", "C", "T", "G"))
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
40
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
41 for(clazz in gene.classes){
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
42 print(paste("class:", clazz))
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
43 NToverview.sub = NToverview[grepl(paste("^", clazz, sep=""), NToverview$best_match),]
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
44 print(paste("nrow:", nrow(NToverview.sub)))
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
45 new.col.x = c(sum(NToverview.sub$A), sum(NToverview.sub$C), sum(NToverview.sub$T), sum(NToverview.sub$G))
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
46 new.col.y = sum(new.col.x)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
47 new.col.z = round(new.col.x / new.col.y * 100, 2)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
48
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
49 tmp = names(NTresult)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
50 NTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z))
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
51 names(NTresult) = c(tmp, paste(clazz, c("x", "y", "z"), sep=""))
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
52 }
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
53
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
54 NToverview.tmp = NToverview[,c("Sequence.ID", "best_match", "seq", "A", "C", "G", "T")]
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
55
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
56 names(NToverview.tmp) = c("Sequence.ID", "best_match", "Sequence of the analysed region", "A", "C", "G", "T")
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
57
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
58 write.table(NToverview.tmp, NToverview.file, quote=F, sep="\t", row.names=F, col.names=T)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
59
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
60 NToverview = NToverview[!grepl("unmatched", NToverview$best_match),]
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
61
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
62 new.col.x = c(sum(NToverview$A), sum(NToverview$C), sum(NToverview$T), sum(NToverview$G))
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
63 new.col.y = sum(new.col.x)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
64 new.col.z = round(new.col.x / new.col.y * 100, 2)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
65
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
66 tmp = names(NTresult)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
67 NTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z))
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
68 names(NTresult) = c(tmp, paste("all", c("x", "y", "z"), sep=""))
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
69
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
70 names(hotspot.analysis.sum) = names(NTresult)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
71
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
72 hotspot.analysis.sum = rbind(hotspot.analysis.sum, NTresult)
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
73
cf8ad181628f planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
rhpvorderman
parents:
diff changeset
74 write.table(hotspot.analysis.sum, hotspot.analysis.sum.file, quote=F, sep=",", row.names=F, col.names=F, na="0")