comparison nt_overview.r @ 92:cf8ad181628f draft

planemo upload commit 36be3b053802693392f935e6619ba3f2b1704e3c
author rhpvorderman
date Mon, 12 Dec 2022 12:32:44 +0000
parents
children 385dea3c6cb5
comparison
equal deleted inserted replaced
91:f387cc1580c6 92:cf8ad181628f
# Nucleotide (A/C/G/T) overview of the analysed region of every sequence.
#
# Positional command-line arguments:
#   1. merged.file               tab-separated table with a header row; the
#                                columns Sequence.ID, best_match and the
#                                *.IMGT.seq region columns are used.
#   2. outputdir                 becomes the working directory and receives
#                                "ntoverview.txt".
#   3. gene.classes              comma-separated list; each entry is matched
#                                as a regular expression anchored at the
#                                start of best_match.
#   4. hotspot.analysis.sum.file header-less comma-separated table; four
#                                nucleotide rows are appended to it and the
#                                file is rewritten in place.
#   5. empty.region.filter       one of "leader", "FR1", "CDR1" or "FR2";
#                                selects which regions form the analysed
#                                sequence.

args <- commandArgs(trailingOnly = TRUE)

if (length(args) < 5) {
  stop(
    "expected 5 arguments: merged.file outputdir gene.classes ",
    "hotspot.analysis.sum.file empty.region.filter",
    call. = FALSE
  )
}

merged.file <- args[1]
outputdir <- args[2]
gene.classes <- unlist(strsplit(args[3], ","))
hotspot.analysis.sum.file <- args[4]
NToverview.file <- paste(outputdir, "ntoverview.txt", sep = "/")
empty.region.filter <- args[5]

# Region columns that make up the analysed sequence for each filter value.
analysed.regions <- list(
  leader = c("FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq",
             "CDR2.IMGT.seq", "FR3.IMGT.seq"),
  FR1 = c("CDR1.IMGT.seq", "FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq"),
  CDR1 = c("FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq"),
  FR2 = c("CDR2.IMGT.seq", "FR3.IMGT.seq")
)

# Fail early with a clear message: without this check an unknown filter left
# the "seq" column unset and the script died later with an unrelated error.
if (!(empty.region.filter %in% names(analysed.regions))) {
  stop(
    "unsupported empty.region.filter '", empty.region.filter,
    "'; expected one of: ", paste(names(analysed.regions), collapse = ", "),
    call. = FALSE
  )
}

# Return one region column of `table` as a character vector.
# NA values become "" so that paste() does not turn them into the text "NA",
# whose "A" would otherwise be counted as a nucleotide. A column that is
# absent from `table` yields empty strings, which is what paste() does with a
# zero-length argument.
region.sequences <- function(table, column) {
  values <- table[[column]]
  if (is.null(values)) {
    return(rep("", nrow(table)))
  }
  values <- as.character(values)
  values[is.na(values)] <- ""
  values
}

# Count, per element of `sequences`, the occurrences of the nucleotide `nt`
# in either upper or lower case.
count.nucleotide <- function(sequences, nt) {
  nchar(gsub(paste0("[^", nt, tolower(nt), "]"), "", sequences))
}

# Append three columns to `result` summarising the nucleotide counts in
# `rows`: <label>x = count per nucleotide, <label>y = total over the four
# nucleotides, <label>z = percentage of the total (2 decimals).
# The row order is A, C, T, G, matching the "nt" column of `result`.
append.nt.summary <- function(result, rows, label) {
  counts <- c(sum(rows$A), sum(rows$C), sum(rows$T), sum(rows$G))
  total <- sum(counts)
  # 0 / 0 gives NaN when `rows` holds no nucleotides; the final write.table
  # call uses na = "0" for missing values.
  percentages <- round(counts / total * 100, 2)

  previous.names <- names(result)
  result <- cbind(result, data.frame(counts, total, percentages))
  names(result) <- c(previous.names, paste(label, c("x", "y", "z"), sep = ""))
  result
}

setwd(outputdir)

merged <- read.table(merged.file, header = TRUE, sep = "\t", fill = TRUE,
                     stringsAsFactors = FALSE, quote = "")
hotspot.analysis.sum <- read.table(hotspot.analysis.sum.file, header = FALSE,
                                   sep = ",", fill = TRUE,
                                   stringsAsFactors = FALSE, quote = "")

# ACGT overview

NToverview <- merged

# The analysed sequence is the selected regions joined by single spaces.
NToverview$seq <- do.call(
  paste,
  lapply(analysed.regions[[empty.region.filter]], function(column) {
    region.sequences(NToverview, column)
  })
)

for (nt in c("A", "C", "G", "T")) {
  NToverview[[nt]] <- count.nucleotide(NToverview$seq, nt)
}

NTresult <- data.frame(nt = c("A", "C", "T", "G"), stringsAsFactors = FALSE)

# One x/y/z column triple per gene class.
for (clazz in gene.classes) {
  print(paste("class:", clazz))
  in.class <- grepl(paste("^", clazz, sep = ""), NToverview$best_match)
  NToverview.sub <- NToverview[in.class, ]
  print(paste("nrow:", nrow(NToverview.sub)))
  NTresult <- append.nt.summary(NTresult, NToverview.sub, clazz)
}

# Per-sequence overview; written before the "unmatched" rows are removed, so
# it contains every sequence of the merged table.
NToverview.tmp <- NToverview[, c("Sequence.ID", "best_match", "seq",
                                 "A", "C", "G", "T")]
names(NToverview.tmp) <- c("Sequence.ID", "best_match",
                           "Sequence of the analysed region",
                           "A", "C", "G", "T")

write.table(NToverview.tmp, NToverview.file, quote = FALSE, sep = "\t",
            row.names = FALSE, col.names = TRUE)

# The "all" summary excludes sequences whose best_match contains "unmatched".
NToverview <- NToverview[!grepl("unmatched", NToverview$best_match), ]
NTresult <- append.nt.summary(NTresult, NToverview, "all")

# Align the column names so rbind() can stack the nucleotide rows under the
# existing rows; this requires both tables to have the same column count.
names(hotspot.analysis.sum) <- names(NTresult)

hotspot.analysis.sum <- rbind(hotspot.analysis.sum, NTresult)

write.table(hotspot.analysis.sum, hotspot.analysis.sum.file, quote = FALSE,
            sep = ",", row.names = FALSE, col.names = FALSE, na = "0")