### April 12th, 2006 # logistic regression, clustering TGDR; # no-scale; # 3-fold cross validation # mainly copied from HIV analysis (logistic TGDR) and CTGDR/Cox (CTGDR) all.y<-read.table("ind-org.dat")$V1; all.x<-read.table("sel-norm-exp.dat"); cluster<-read.table("cluster.dat"); samplesize<-length(all.x); aug.x<-cbind(rep(1, samplesize), t(all.x)); aug.cov<-ncol(aug.x); num.cov<-ncol(all.x); ### V<-3; tau1<-1.0; tau2<-1.0; step<-1000; increase<-0.01; ### cv.score<-rep(0, step); count<-rep(0, step); delete.size<-as.integer(samplesize/V); complete.data<-data.frame(cbind(all.y, aug.x)); # --- cross validation --- for(v in 1:V){ tstart <- Sys.time() cat("data preparation start time: ") print(tstart) ### --- cut data --- cross.data<- complete.data[-((delete.size*(v-1)+1):(delete.size*v)), ]; work.size<-nrow(cross.data); cross.y<-cross.data[ ,1]; cross.x<-cross.data[ ,2:ncol(cross.data)]; cross.xt<-matrix(0, nrow=nrow(cross.x), ncol=ncol(cross.x)); for(i in 1:ncol(cross.x)) cross.xt[ ,i]<-cross.x[[i]]; cross.size<-nrow(cross.x); evaluate.data<- complete.data[((delete.size*(v-1)+1):(delete.size*v)), ]; evaluate.size<-nrow(evaluate.data); evaluate.y<-evaluate.data[ ,1]; evaluate.x<-evaluate.data[ ,2:ncol(evaluate.data)]; evaluate.size<-nrow(evaluate.x); evaluate.xt<-matrix(0, nrow=evaluate.size, ncol=aug.cov); for(ii in 1:ncol(evaluate.x)) evaluate.xt[ ,ii]<-evaluate.x[[ii]]; tstart <- Sys.time() cat("data preparation end time: ") print(tstart) # --- now compute the gradient # --- the difference with CTGDR is the threshold step --- beta.est<-rep(0, aug.cov); h.v<-rep(0, aug.cov); f.v<-rep(0, aug.cov); f.v2<-rep(0, aug.cov); g.v<-rep(0, aug.cov); g.v2<-rep(0, aug.cov); cri.2<-rep(0, aug.cov); # --- for each step --- for(s in 1:step){ Sys.time(); gradient<-rep(0, aug.cov); # --- "new" and hopefully faster gradient --- beta.z<-cross.xt%*%beta.est; beta.p<-1/(1+exp(-beta.z)); term.1<-cross.y/beta.p-(1-cross.y)/(1-beta.p); term.2<-exp(beta.z)/((1+exp(beta.z))^2); gradient<-(t(term.1*term.2))%*%cross.xt; Sys.time(); max.gradient<-max(abs(gradient)); write.table(cbind(v, s, max(gradient)),file="check.txt", append=T, quote=F, row.name=F, col.name=F); grad<-gradient/max.gradient; gradient<-grad[1, ]; # --- we work with the covariates only # --- when computing the threshold, the intercept is not considered work.gradient.dat<- data.frame(cbind(cluster[ ,2], abs(gradient[2:length(gradient)]))); num.cluster<-length(unique(cluster[ ,2])); cri.cluster<-rep(0, num.cluster); max.grad.cluster<-rep(0, num.cluster); # --- get the cluster gradient; and the max within each cluster for(clus in 1:num.cluster){ work.grad.clus<- work.gradient.dat[work.gradient.dat[ ,1]==clus,]; cri.cluster[clus]<-sum(work.grad.clus[ ,2]); max.grad.cluster[clus]<-max(abs(work.grad.clus[ ,2])); } # --- for each gene, get the cluster criteria (pseudi.gv) # and within cluster criteria (cri.2) # Note: length differs by 1. pseudo.gv<-rep(0, aug.cov); for(i in 2:aug.cov){ pseudo.gv[i]<-cri.cluster[cluster$V2[i-1]]; cri.2[i]<-max.grad.cluster[cluster$V2[i-1]]*tau2; } for(i in 1:length(g.v)) g.v[i]<-gradient[i]; # --- get within cluster threshold indicator f.v2<-ifelse(abs(g.v)>=cri.2, 1, 0); f.v2[1]<-1; #-- for the intercept # --- get the cluster threshold indicator for each covariate cri.t<- tau1*max(pseudo.gv); f.v<-ifelse(pseudo.gv>=cri.t, 1, 0); f.v[1]<-1; #-- for the intercept h.v<-f.v*f.v2*g.v; beta.est<-beta.est+increase*h.v; # --- now evaluate --- likelihood.eva<-0; eva.score<-evaluate.xt%*%beta.est; eva.temp<-exp(eva.score); eva.p<-eva.temp/(1+eva.temp); likelihood.eva<-sum(log(eva.p)*evaluate.y)+ sum(log(1-eva.p)*(1-evaluate.y)); cv.score[s]<-cv.score[s]+likelihood.eva; } } se<-seq(from=1, to=step, by=1); write.table(cbind(se, cv.score), file="cv.txt", quote=F, append=F, row.name=F, col.name=F);