# Pipeline overview
#   1. Read every file in "csv/" ('*'-separated) and split each by `Id`,
#      ordered by Id, ServerTimestamp and timestamp.
#   2. Keep files with at least `min_rows_per_file` rows ("good files").
#   3. Keep Ids that occur in every good file ("good ids") and, of those,
#      the Ids with at least `min_rows_per_id` rows in every good file
#      ("really good ids").
#   4. Per good file and really good id, extract the `value` located
#      `min_rows_per_id` rows from the end of the ordered data.
#   5. Cluster the good files on the first three extracted columns
#      (hierarchical Ward.D2 and k-means) and cross-tabulate the cluster
#      assignments against the `Success` column of "machning_success.xlsx".

library(gdata)
library(cluster)
library(clue)

# NOTE(review): machine-specific working directory kept from the original;
# every relative path below depends on it.
setwd("E:\\frage2")

# ---- Constants ----

# Minimum number of rows a file needs to be used at all.
min_rows_per_file <- 100L
# Minimum number of rows an Id needs in *every* good file. It is also the
# distance from the end of the series at which values are sampled, which is
# why an Id with fewer rows cannot be used.
min_rows_per_id <- 10L
# Number of consecutive values sampled per Id, starting `min_rows_per_id`
# rows from the end (the original script sampled exactly one).
values_per_id <- 1L
stopifnot(values_per_id >= 1L, values_per_id <= min_rows_per_id)

# ---- Load and split the raw data ----

files <- list.files("csv")
if (length(files) == 0L) {
  stop("no input files found in directory 'csv'", call. = FALSE)
}

alldata <- lapply(files, function(file_name) {
  read.csv(
    file.path("csv", file_name),
    header = TRUE,
    sep = "*",
    stringsAsFactors = FALSE
  )
})

# One list per file: the file's rows in Id/time order, split by Id.
allsplitdata <- lapply(alldata, function(dataset) {
  row_order <- order(dataset$Id, dataset$ServerTimestamp, dataset$timestamp)
  ordereddata <- dataset[row_order, ]
  split(ordereddata, ordereddata$Id)
})

# ---- Select usable files ----

dataitems <- vapply(alldata, nrow, integer(1))
goodfiles <- which(dataitems >= min_rows_per_file)
if (length(goodfiles) == 0L) {
  stop(
    "no file has at least ", min_rows_per_file, " rows",
    call. = FALSE
  )
}

# ---- Select Ids present in every good file ----

listofids <- unlist(lapply(allsplitdata[goodfiles], names), use.names = FALSE)
ids <- unique(listofids)
# Number of good files in which each Id occurs (each file lists an Id once).
sumofids <- tabulate(match(listofids, ids), nbins = length(ids))
idswithsums <- cbind(ids, sumofids)

# The original compared against the literal "41". The lookups below index
# every good file by every good id, so the script can only complete when
# that literal equals the number of good files; use that number directly.
goodids <- ids[sumofids == length(goodfiles)]
if (length(goodids) == 0L) {
  stop("no Id is present in every good file", call. = FALSE)
}

# ---- Row counts per good file and good id ----

# One row per good file: the file index followed by the row count per id.
numbers <- as.data.frame(do.call(rbind, lapply(goodfiles, function(i) {
  rows_per_id <- vapply(
    allsplitdata[[i]][goodids], nrow, integer(1),
    USE.NAMES = FALSE
  )
  c(i, rows_per_id)
})))
numbers <- setNames(numbers, c("filenumber", goodids))

# Smallest row count of each good id across all good files. The original
# hard-coded columns 2:9 (i.e. exactly eight good ids); every id column is
# used here instead.
minimum <- unname(vapply(numbers[-1], min, numeric(1)))
idswithminimum <- cbind(goodids, minimum)

reallygoodids <- goodids[minimum >= min_rows_per_id]
if (length(reallygoodids) == 0L) {
  stop(
    "no Id has at least ", min_rows_per_id, " rows in every good file",
    call. = FALSE
  )
}

# ---- Extract the clustering features ----

# Offsets from the last row; for values_per_id == 1 this is the single
# offset 9, i.e. the 10th value from the end, as in the original.
tail_offsets <- min_rows_per_id - seq_len(values_per_id)

# One row per good file; `values_per_id` consecutive columns per id.
gooddata <- as.data.frame(do.call(rbind, lapply(goodfiles, function(i) {
  unlist(
    lapply(allsplitdata[[i]][reallygoodids], function(id_rows) {
      as.numeric(id_rows$value[nrow(id_rows) - tail_offsets])
    }),
    use.names = FALSE
  )
})))
gooddatanames <- paste0(
  rep(reallygoodids, each = values_per_id),
  seq_len(values_per_id)
)
gooddata <- setNames(gooddata, gooddatanames)

# ---- Attach the success flag ----

successdata <- read.xls("machning_success.xlsx", stringsAsFactors = FALSE)
# NOTE(review): assumes row i of the spreadsheet belongs to files[i] --
# confirm against how the spreadsheet was built.
success <- successdata$Success[goodfiles]
gooddata <- cbind(gooddata, success)

# ---- Hierarchical clustering ----

# Only the first three feature columns are used for clustering.
clustvar <- gooddata[1:3]
d <- daisy(clustvar)
cl <- hclust(d, method = "ward.D2", members = NULL)
summary(cl)
plot(cl)
# NOTE(review): 41/42 are hard-coded for the original data set; indices
# beyond length(cl$height) yield NA and are simply not drawn.
plot(1:42, c(cl$height[41:1], 0), type = "l")
abline(v = 6)
abline(v = 10)
abline(v = 15)

clcut6 <- cutree(cl, k = 6)
table(clcut6)
sil_clcut6 <- silhouette(clcut6, d)
plot(sil_clcut6, col = 1:6)

clcut10 <- cutree(cl, k = 10)
table(clcut10)
sil_clcut10 <- silhouette(clcut10, d)
plot(sil_clcut10, col = 1:10)

clcut15 <- cutree(cl, k = 15)
table(clcut15)
sil_clcut15 <- silhouette(clcut15, d)
plot(sil_clcut15, col = 1:15)

clcut2 <- cutree(cl, k = 2)
table(clcut2)
sil_clcut2 <- silhouette(clcut2, d)
plot(sil_clcut2, col = 1:2)

# ---- k-means ----

# NOTE(review): kmeans() picks random starting centres and no seed is set,
# so these assignments can differ between runs.
kmeans2 <- kmeans(clustvar, centers = 2)
kmeans6 <- kmeans(clustvar, centers = 6)
kmeans10 <- kmeans(clustvar, centers = 10)
kmeans15 <- kmeans(clustvar, centers = 15)

table(kmeans2$cluster)
table(kmeans6$cluster)
table(kmeans10$cluster)
table(kmeans15$cluster)

sil_kmeans2 <- silhouette(kmeans2$cluster, d)
plot(sil_kmeans2, col = 1:2)
sil_kmeans6 <- silhouette(kmeans6$cluster, d)
plot(sil_kmeans6, col = 1:6)
sil_kmeans10 <- silhouette(kmeans10$cluster, d)
plot(sil_kmeans10, col = 1:10)
sil_kmeans15 <- silhouette(kmeans15$cluster, d)
plot(sil_kmeans15, col = 1:15)

# ---- Compare cluster assignments with the success flag ----

# The success column is selected by name; the original used position 8,
# which is only correct when exactly seven feature columns exist.
results <- cbind(
  gooddata["success"],
  clcut2, clcut6, clcut10, clcut15,
  kmeans2$cluster, kmeans6$cluster, kmeans10$cluster, kmeans15$cluster
)
# NOTE(review): the comparison is against the lower-case string "true";
# confirm that this is how read.xls delivers the Success column.
results[1] <- results[1] == "true"
sum(results[1] == TRUE)
sum(results[1] == FALSE)

# Success (rows) against each clustering (columns), in the column order of
# `results`: hierarchical k = 2, 6, 10, 15, then k-means k = 2, 6, 10, 15.
table(results[[1]], results[[2]])
table(results[[1]], results[[3]])
table(results[[1]], results[[4]])
table(results[[1]], results[[5]])
table(results[[1]], results[[6]])
table(results[[1]], results[[7]])
table(results[[1]], results[[8]])
table(results[[1]], results[[9]])