diff --git a/GlnLM-EffLocDisp-DT-fctNM-01_2021.R b/GlnLM-EffLocDisp-DT-fctNM-01_2021.R new file mode 100644 index 0000000000000000000000000000000000000000..141b9c601af98959c5fe9a60e425984f353bfd99 --- /dev/null +++ b/GlnLM-EffLocDisp-DT-fctNM-01_2021.R @@ -0,0 +1,345 @@ +###############################e##################################################################### +## Modele ln-lineaire generalise ## +## Gestion donnees en DataTable - Optimisation Nelder-Mead en fonction ## +## 2020-01-20 ## +## source("ANC/GlnLM-EffLocDisp-DT-fctNM.R") ## +##################################################################################################### + +options(width=100); +ok <-1; +if (ok == 1) { + rm(list = ls()); +} + +library(data.table); +library(numDeriv); + +## Repertoire de travail ## +chem <- "/Users/eva.falipou/Documents/Eva/ANC/ln_linéaire/"; + +##################################################################################################### +## Lecture donnees - Exemple : analyse NTK avec loq=1 ## +##################################################################################################### +ok <-1; +if (ok == 1) { + fin <- paste(chem,"BDD_exemple_ln-lineaire.csv",sep=""); + don <- fread(fin,dec=",",sep=";"); + don[,y:=NTK]; # choix de la variable dependante + ## Libelle variable dependante pour documenter affichages ## + labvdep <- "NTK"; + ## Left-censoring ## + loq <- 1.0; # LQ correspondant a la variable dependante + don[,lb:=y]; + don[y<=loq,lb:=0]; + don[,ub:=y]; + don[y<=loq,ub:=loq]; + ## Variables explicatives ## + don[,D0:=ifelse(Dispositif=="D0",1,0)]; + don[,D2:=ifelse(Dispositif=="D2",1,0)]; + don[,inf2ans:=ifelse(classe_age=="< 2 ans",1,0)]; + don[,inf70pct:=ifelse(classe_charge=="< 70%",1,0)]; + ## Nettoyage ## + don <- don[,.(y,lb,ub,D0,D2,inf2ans,inf70pct)]; + don <- na.omit(don); + ## Effects on location (median) ## + zm <- c("D0"); + #zm <- c();## No effect on location ## + nbzm <- length(zm); + ## Effects on dispersion ## + zd <- c(); + #zd <- c("D1","D2","inf2ans","inf70pct");## No effect on dispersion ## + nbzd <- length(zd); +} + +##################################################################################################### +## Log-Likelihood calculation - Based on log-normal distribution ## +##################################################################################################### +funk <- function(theta) {## theta : parameter vector (beta0,beta1,gamma0,gamma1) ## + n <- don[,.N]; + ## Regression - Effects on median ## + if (nbzm > 0) { + Zm <- cbind(array(1, dim=c(n,1)), as.matrix(don[,zm,with=F])); + } else { + Zm <- array(1, dim=c(n,1)); + } + zbm <- Zm%*%theta[1:(1+nbzm)]; + ## Regression - Effects on dispersion ## + if (nbzd > 0) { + Zd <- cbind(array(1, dim=c(n,1)), as.matrix(don[,zd,with=F])); + } else { + Zd <- array(1, dim=c(n,1)); + } + zbd <- Zd%*%theta[(2+nbzm):(2+nbzm+nbzd)]; + s <- exp(zbd); + ## Log-Likelihood calculation ## + ll <- 0; + li <- don[, which(lb == 0)]; + if (length(li) > 0) { + ll <- sum(log(plnorm(q=don[li, ub],meanlog=zbm[li],sdlog=s[li]))); + } + li <- don[, which(lb > 0 & ub > lb)]; + if (length(li) > 0) { + Fu <- plnorm(don[li, ub],meanlog=zbm[li],sdlog=s[li]); + Fl <- plnorm(don[li, lb],meanlog=zbm[li],sdlog=s[li]); + ll <- ll + sum(log(Fu - Fl)); + } + li <- don[, which(lb > 0 & ub == lb)]; + if (length(li) > 0) { + f <- dlnorm(don[li, ub],meanlog=zbm[li],sdlog=s[li]); + ll <- ll + sum(log(f)); + } + if(is.na(ll) | is.infinite(ll)) { + ll <- -1.0e200; + ## Checking calculation in case of data problem ## + #cat("theta = ", theta, " - ll = ", -ll, "\n"); + } + -ll; ## minimize -ll <=> maximize ll ## +} + +## Amotry Function ########################################################################### +amotry <- function(p, y, psum, ndim, ihi, fac) { + ptry <- numeric(); + fac1 <- (1 - fac) / ndim; + fac2 <- fac1 - fac; + for(j in 1:ndim) ptry[j] <- psum[j] * fac1 - p[ihi,j] * fac2; + ptry <- pmin(pmax(ptry, lowb), uppb); ## Bounds + ytry <- funk(ptry); + if(ytry < y[ihi]) { + y[ihi] <- ytry; + for(j in 1:ndim) { + psum[j] <- psum[j] + ptry[j] - p[ihi,j]; + p[ihi,j] <- ptry[j]; + } + } + list(ytry, p, y, psum); +} + +## Nelder-Mead calibration ## +NMcal <- function(NMpar) { + ## Libelle variable dependante ## + labvdep <- NMpar[[1]]; + ## Libelle parametres ## + labtheta <- NMpar[[2]]; + ## Nb parametres a estimer ## + nbp <- length(labtheta); + ## Valeurs de reference des parametres pour test H0 ## + thref <- NMpar[[3]]; + ## Valeurs initiales des parametres et bornes ## + theta <- NMpar[[4]];print(theta); + lowb <- NMpar[[5]]; + uppb <- NMpar[[6]]; + ## Pas de recherche initial ## + lbd <- NMpar[[7]]; + ## Initialisation simplex #################################################################### + ndim <- length(theta); + mpts <- ndim + 1; + p <- array(0, c(mpts, ndim)); + p[1,] <- theta; + for(i in 2:mpts) { + p[i,] <- p[1,]; + p[i,i-1] <- p[i,i-1] + lbd[i-1]; + } + #print(p); + psum <- numeric(); + y <- numeric(); + yr <- numeric(); + for(i in 1:mpts) y[i] <- funk(p[i,]); + ftol <- 1e-8; + maxiter <- 5000; + #maxiter <- 1; + maxcycle <- 0; + tiny <- 1e-10; + ############################################################################################## + iter <- 0; + cycle <- 0; + for(j in 1:ndim) psum[j] <- sum(p[,j]); + swap <- numeric(); + repeat { + iter <- iter + 1; + if (iter == 1 | round(iter / 100) == iter / 100) { + cat("Iteration", iter, "\n"); + print(cbind(p, y)); + } + ## Rank points of simplex ################################################################## + yr <- rank(y, ties.method="first"); + for(i in 1:mpts) { + if(yr[i] == 1) ilo <- i; + if(yr[i] == ndim) inhi <- i; + if(yr[i] == mpts) {ihi <- i;yhi <- y[i];} + } + ## Test whether convergence achieved ####################################################### + rtol <- 2 * abs(y[ihi] - y[ilo]) / (abs(y[ihi]) + abs(y[ilo]) + tiny); + if(rtol < ftol){ + print("precision reached"); + converge <- 1; + } + if(iter >= maxiter){ + print("maxiter reached"); + converge <- 0; + } + if(rtol < ftol || iter >= maxiter) { + swap[1] <- y[1];y[1] <- y[ilo];y[ilo] <- swap[1]; + swap <- p[1,];p[1,] <- p[ilo,];p[ilo,] <- swap; + print("Iteration", quote=FALSE);print(iter, quote=FALSE); + print(cbind(p, y)); + print("", quote=FALSE); + ## Display Calibration Results ######################################################## + print("Final", quote=FALSE);print(c(p[1,], y[1])); + ## Covariance matrix ## + covmat <- tryCatch(expr={solve(hessian(funk,p[1,]))},error=function(e){hinv <- 0;return(array(NA,dim=c(nbp,nbp)))}); + hinv <- ifelse(sum(is.na(covmat))>0,0,1); + cat("Covariance matrix :\n"); + print(covmat); + cat("\n"); + cat("Dependent variable : ",labvdep,"\n"); + ## Table of estimates ## + cat(format("Label", width=15)); + cat(format("Estimate", width=15, justify="right")); + cat(format("Std. Dev.", width=15, justify="right")); + cat(format("Ref", width=7, justify="right")); + cat(format("Chi2", width=15, justify="right")); + cat(format("DF", width=4, justify="right")); + cat(format("Pval", width=11, justify="right"), "\n"); + for (j in 1:length(theta)) { + thtest <- p[1,]; + thtest[j] <- thref[j]; + #chi2 <- 2 * abs(y[1] - funk(thtest)); + #stdev <- abs(p[1,j] - thref[j]) / sqrt(chi2); + stdev <- sqrt(covmat[j,j]); + chi2 <- ((p[1,j] - thref[j])^2/covmat[j,j]); + df <- 1; + pval <- pchisq(chi2, df, lower.tail=FALSE); + cat(format(labtheta[j], width=15)); + cat(formatC(p[1,j], digits=4, format="e", width=15)); + cat(formatC(stdev, digits=4, format="e", width=15)); + cat(format(thref[j], digits=2, width=7)); + cat(formatC(chi2, digits=4, format="e", width=15)); + cat(format(df, digits=1, width=4)); + cat(formatC(pval, digits=4, format="f", width=11), "\n"); + } + if (cycle < maxcycle) { + for(i in 2:mpts) { + p[i,] <- p[1,]; + p[i,i-1] <- p[i,i-1] + lbd[i-1]; + } + for(i in 1:mpts) y[i] <- funk(p[i,]); + iter <- 0; + cycle <- cycle + 1; + } + else break; + } + ## New iteration ########################################################################### + lamo <- amotry(p, y, psum, ndim, ihi, -1); + ytry <- lamo[[1]];p <- lamo[[2]];y <- lamo[[3]];psum <- lamo[[4]]; + ##print("Factor -1"); + ##print(cbind(p, y)); + if(ytry <= y[ilo]) { + lamo <- amotry(p, y, psum, ndim, ihi, 2); + ytry <- lamo[[1]];p <- lamo[[2]];y <- lamo[[3]];psum <- lamo[[4]]; + ##print("Factor 2"); + ##print(cbind(p, y)); + } + else if(ytry >= y[inhi]) { + ysave <- y[ihi]; + lamo <- amotry(p, y, psum, ndim, ihi, .5); + ytry <- lamo[[1]];p <- lamo[[2]];y <- lamo[[3]];psum <- lamo[[4]]; + ##print("Factor .5"); + ##print(cbind(p, y)); + if(ytry >= ysave) { + for(i in 1:mpts) { + if(i != ilo) { + for(j in 1:ndim) p[i,j] <- 0.5 * (p[i,j] + p[ilo,j]); + y[i] <- funk(p[i,]); + } + } + for(j in 1:ndim) psum[j] <- sum(p[,j]); + } + } + } + list(p[1,],converge,hinv,covmat); +} + +##################################################################################################### +## Variance of theoretical percent-quantile ## +##################################################################################################### +delt <- function(tab,percent,covm) { + n <- tab[,.N]; + if (nbzm > 0) { + Zm <- cbind(array(1, dim=c(n,1)), as.matrix(tab[,zm,with=F])); + } else { + Zm <- array(1, dim=c(n,1)); + } + if (nbzd > 0) { + Zd <- cbind(array(1, dim=c(n,1)), as.matrix(tab[,zd,with=F])); + } else { + Zd <- array(1, dim=c(n,1)); + } + zbd <- Zd%*%theta[(2+nbzm):(2+nbzm+nbzd)]; + Zmd <- Zm; + for(j in 1:(nbzd+1)) Zmd <- cbind(Zmd, Zd[,j] * exp(zbd) * qnorm(percent)); + diag(Zmd%*%covm%*%t(Zmd)); +} + +################################################################################################### +## Model parameter estimation ## +################################################################################################### +ok <- 1; +if (ok == 1) { + ## Libelle parametres ## + labtheta <- c("beta0",zm,"gamma0",zd); + ## Nb parametres a estimer ## + nbp <- length(labtheta); + ## Valeurs de reference des parametres pour test H0 ## + thref <- rep(0,nbp); + ## Valeurs initiales des parametres et bornes ## + theta <- rep(0,nbp); + lowb <- rep(-Inf,nbp); + uppb <- rep(+Inf,nbp); + ## Pas de recherche initial ## + lbd <- rep(0.5,nbp); + NMpar <- list(labvdep,labtheta,theta,thref,lowb,uppb,lbd); + calage <- NMcal(NMpar); + thopt <- calage[[1]]; + converge <- calage[[2]]; + hinv <- calage[[3]]; + covmat <- calage[[4]]; + ##################################################################################################### + ## Quantile estimation ## + ##################################################################################################### + percent <- 0.5;## Quantile probability ## + ## Quantiles are estimated by fac variables ## + fac <- unique(c(zm,zd)); + ## Empirical (observed) quantiles ## + qua <- don[,.(qobs=quantile(ub,probs=percent,na.rm=T),nbobs=.N),by=fac]; + n <- qua[,.N]; + ## Regression - Effects on median ## + if (nbzm > 0) { + Zm <- cbind(array(1,dim=c(n,1)),as.matrix(qua[,zm,with=F])); + } else { + Zm <- array(1,dim=c(n,1)); + } + zbm <- Zm%*%thopt[1:(1+nbzm)]; + ## Regression - Effects on dispersion ## + if (nbzd > 0) { + Zd <- cbind(array(1,dim=c(n,1)),as.matrix(qua[,zd,with=F])); + } else { + Zd <- array(1,dim=c(n,1)); + } + zbd <- Zd%*%thopt[(2+nbzm):(2+nbzm+nbzd)]; + #s <- exp(zbd); + qua[,qcal:=exp(zbm+exp(zbd)*qnorm(percent))]; + ## Left-censored counts ## + lce <- don[lb<ub,.(nblce=.N),by=fac]; + qua <- merge(qua,lce,by=fac,all=TRUE); + qua[is.na(nblce),nblce:=0]; + ## Variance of theoretical quantile ## + qua[,vqc:=delt(qua,percent,covmat)]; + ## Lower and upper bounds of theoretical quantile ## + qua[,lbqc:=qcal*exp(-1.96*vqc^0.5)]; + qua[,ubqc:=qcal*exp(+1.96*vqc^0.5)]; + ## Displaying observed and theoretical quantile ## + cat("\n"); + cat("Dependent variable : ",labvdep," - Table of observed and theoretical ",percent*100,"%-quantiles:\n",sep=""); + #print(qua[, .(Z1,Z2,nbobs,nblce,qobs,qcal,vqc,lbqc,ubqc)]); + print(qua[,c(fac,"nbobs","nblce","qobs","qcal","vqc","lbqc","ubqc"),with=F]); +}