# -------- Read data --------
# Pipeline overview:
#   1. read every file in "csv" ('*'-separated) and split its rows by Id,
#   2. keep files with enough rows and ids that occur in every kept file,
#   3. build one feature row per file from the trailing values of each id,
#   4. cluster the rows (Ward hierarchical clustering and k-means) and
#      cross-tabulate the cluster assignments against the success flag.
#
# install.packages("gdata")
# install.packages("cluster")
# install.packages("clue")
library(gdata)
library(cluster)
library(clue)

# NOTE(review): hard-coded, machine-specific working directory. Every relative
# path below ("csv", "machning_success.xlsx") is resolved against it.
setwd("P:\\Daten\\Uni\\Master Wirtschaftsinformatik\\2018_WS\\VU Business Intelligence II\\aufgabe2\\frage2")

# Thresholds that were previously repeated as bare literals.
min_rows_per_file <- 100L  # a file needs at least this many rows to be used
window_size <- 10L         # number of trailing values taken per id and file

files <- list.files("csv")
if (length(files) == 0) {
  stop("no input files found in directory 'csv'", call. = FALSE)
}

# One data frame per input file, in the order returned by list.files().
alldata <- lapply(files, function(file_name) {
  read.csv(
    file.path("csv", file_name),
    header = TRUE,
    sep = "*",
    stringsAsFactors = FALSE
  )
})

# Per file: rows sorted by Id, ServerTimestamp, timestamp and then split into
# one data frame per Id (list names are the Id values as strings).
allsplitdata <- lapply(alldata, function(dataset) {
  ordereddata <- dataset[
    order(dataset$Id, dataset$ServerTimestamp, dataset$timestamp),
  ]
  split(ordereddata, ordereddata$Id)
})

# -------- Select usable files and ids --------
dataitems <- vapply(alldata, nrow, integer(1))
goodfiles <- which(dataitems >= min_rows_per_file)
if (length(goodfiles) == 0) {
  stop(
    "no input file has at least ", min_rows_per_file, " rows",
    call. = FALSE
  )
}

# All ids seen in the good files (one entry per file/id pair), and in how
# many good files each distinct id occurs. `ids` keeps first-seen order.
listofids <- unlist(lapply(allsplitdata[goodfiles], names), use.names = FALSE)
ids <- unique(listofids)
sumofids <- tabulate(match(listofids, ids), nbins = length(ids))

# Keep only ids present in every good file: the code below looks each kept id
# up in each good file, so a partially present id would fail there. The
# threshold was a hard-coded "41" before.
goodids <- ids[sumofids == length(goodfiles)]
if (length(goodids) == 0) {
  stop("no id occurs in every usable file", call. = FALSE)
}

# Row count of every good id in every good file (rows = files, cols = ids).
idrowcounts <- matrix(
  NA_integer_,
  nrow = length(goodfiles),
  ncol = length(goodids),
  dimnames = list(NULL, goodids)
)
for (r in seq_along(goodfiles)) {
  perid <- allsplitdata[[goodfiles[r]]]
  idrowcounts[r, ] <- vapply(
    goodids,
    function(id) nrow(perid[[id]]),
    integer(1)
  )
}
numbers <- data.frame(
  filenumber = goodfiles,
  idrowcounts,
  check.names = FALSE
)

# Smallest row count per id across all good files. This was computed over the
# fixed column range 2:9, which is only right for exactly eight ids.
minimum <- unname(apply(idrowcounts, 2, min))

# An id is usable only if every good file holds a full window of its values.
reallygoodids <- goodids[minimum >= window_size]
if (length(reallygoodids) == 0) {
  stop(
    "no id has at least ", window_size, " rows in every usable file",
    call. = FALSE
  )
}

# -------- Build the feature table --------
# Column layout: for each id in turn, its last `window_size` values, named
# "<id>1" ... "<id>10".
gooddatanames <- paste0(
  rep(reallygoodids, each = window_size),
  rep(seq_len(window_size), times = length(reallygoodids))
)
features <- matrix(
  NA_real_,
  nrow = length(goodfiles),
  ncol = length(gooddatanames)
)
for (r in seq_along(goodfiles)) {
  perid <- allsplitdata[[goodfiles[r]]]
  features[r, ] <- unlist(
    lapply(reallygoodids, function(id) {
      as.numeric(tail(perid[[id]]$value, window_size))
    }),
    use.names = FALSE
  )
}
gooddata <- setNames(as.data.frame(features), gooddatanames)

# NOTE(review): this assumes row i of the spreadsheet describes files[i];
# confirm against the data.
successdata <- read.xls("machning_success.xlsx", stringsAsFactors = FALSE)
success <- successdata$Success[goodfiles]
gooddata <- cbind(gooddata, success)

# -------- Hierarchical clustering --------
# Cluster on the feature columns only (everything except `success`).
clustvar <- gooddata[seq_along(gooddatanames)]
d <- daisy(clustvar)
cl <- hclust(d, method = "ward.D2", members = NULL)
summary(cl)
plot(cl)

# Merge heights in reverse order, with vertical markers at the cluster counts
# examined below.
# NOTE(review): 42 and 41 are hard-coded; cl$height has nrow(clustvar) - 1
# entries, so these only line up for one specific number of files. Left
# unchanged because the intended x-axis alignment cannot be told from here.
plot(1:42, c(cl$height[41:1], 0), type = "l")
abline(v = 7)
abline(v = 18)

clcut7 <- cutree(cl, k = 7)
table(clcut7)
sil_clcut7 <- silhouette(clcut7, d)
plot(sil_clcut7, col = 1:7)

clcut18 <- cutree(cl, k = 18)
table(clcut18)
sil_clcut18 <- silhouette(clcut18, d)
plot(sil_clcut18, col = 1:18)

clcut2 <- cutree(cl, k = 2)
table(clcut2)
sil_clcut2 <- silhouette(clcut2, d)
plot(sil_clcut2, col = 1:2)

# -------- k-means --------
# k-means starts from random centres; fix the seed so reruns give the same
# partitions.
set.seed(1)
kmeans2 <- kmeans(clustvar, centers = 2)
kmeans7 <- kmeans(clustvar, centers = 7)
kmeans18 <- kmeans(clustvar, centers = 18)

table(kmeans2$cluster)
table(kmeans7$cluster)
table(kmeans18$cluster)

sil_kmeans2 <- silhouette(kmeans2$cluster, d)
plot(sil_kmeans2, col = 1:2)
sil_kmeans7 <- silhouette(kmeans7$cluster, d)
plot(sil_kmeans7, col = 1:7)
sil_kmeans18 <- silhouette(kmeans18$cluster, d)
plot(sil_kmeans18, col = 1:18)

# -------- Compare cluster assignments with the success flag --------
results <- cbind(
  gooddata["success"],
  clcut2,
  clcut7,
  clcut18,
  kmeans2$cluster,
  kmeans7$cluster,
  kmeans18$cluster
)

# Normalise the flag to logical. Going through character and lower case means
# both the text "true" and an already-logical TRUE count as success; comparing
# a logical column to "true" directly would give FALSE for every row.
results[[1]] <- tolower(as.character(results[[1]])) == "true"
sum(results[[1]])
sum(!results[[1]])

# Contingency tables: success flag (rows) against each clustering (columns).
table(results[[1]], results[[2]])
table(results[[1]], results[[3]])
table(results[[1]], results[[4]])
table(results[[1]], results[[5]])
table(results[[1]], results[[6]])
table(results[[1]], results[[7]])