Antonietta di Salvatore
2006-May-24 08:26 UTC
[Rd] the computation of exact p-value for the nonparametric cor-test with ties
Hello, I wuold like to propose my modifications of the original cor.test to you : I tried to calcolate the correct p-value for Spearman and Kendall's test with ties. Let me know what you think. Thanks you for your time. Antonietta di Salvatore test <- function(x, ...) UseMethod("test") test.default <- function(x, y, alternative = c("two.sided", "less", "greater"), method = c("pearson", "newkendall", "newspearman"), exact = NULL, conf.level = 0.95, ...) { alternative <- match.arg(alternative) method <- match.arg(method) DNAME <- paste(deparse(substitute(x)), "and", deparse(substitute(y))) if(length(x) != length(y)) stop("x and y must have the same length") OK <- complete.cases(x, y) x <- x[OK] y <- y[OK] n <- length(x) PVAL <- NULL NVAL <- 0 conf.int <- FALSE if(method == "pearson") { if(n < 3) stop("not enough finite observations") method <- "Pearson's product-moment correlation" names(NVAL) <- "correlation" r <- cor(x, y) df <- n - 2 ESTIMATE <- c(cor = r) PARAMETER <- c(df = df) STATISTIC <- c(t = sqrt(df) * r / sqrt(1 - r^2)) p <- pt(STATISTIC, df) if(n > 3) { ## confidence int. if(!missing(conf.level) && (length(conf.level) != 1 || !is.finite(conf.level) || conf.level < 0 || conf.level > 1)) stop(paste("conf.level must be a single number", "between 0 and 1")) conf.int <- TRUE z <- atanh(r) sigma <- 1 / sqrt(n - 3) cint <- switch(alternative, less = c(-Inf, z + sigma * qnorm(conf.level)), greater = c(z - sigma * qnorm(conf.level), Inf), two.sided = z + c(-1, 1) * sigma * qnorm((1 + conf.level) / 2)) cint <- tanh(cint) attr(cint, "conf.level") <- conf.level } } else { if(n < 2) stop("not enough finite observations") PARAMETER <- NULL if(method == "newkendall") { method <- "Kendall's rank correlation tau" names(NVAL) <- "tau" r <- cor(x,y, method = "kendall") ESTIMATE <- c(tau = r) if(!is.finite(ESTIMATE)) { # all x or all y the same ESTIMATE[] <- NA STATISTIC <- c(T = NA) PVAL <- NA } else { if(is.null(exact)) exact <- (n < 50) if(exact) { xr<-rank(x) yr<-rank(y) Gx<-table(xr) Gx<-Gx[Gx>1] Gx<-sum(Gx^2-Gx) #Gx is null when x variable is without ties Gy<-table(yr) Gy<-Gy[Gy>1] Gy<-sum(Gy^2-Gy) Gy is null when y variable is without ties q <- round((r + 1)*sqrt(n^2-n-Gx)*sqrt(n^2-n-Gy)/4) pkendall <- function(q, n) { .C("pkendall", length(q), p = as.double(q), as.integer(n), PACKAGE = "stats")$p } PVAL <- switch(alternative, "two.sided" = { if(q > sqrt(n^2-n-Gx)*sqrt(n^2-n-Gy)/4) p <- 1 - pkendall(q - 1, n) else p <- pkendall(q, n) min(2 * p, 1) }, "greater" = 1 - pkendall(q - 1, n), "less" = pkendall(q, n)) STATISTIC <- c(T = q) } else { STATISTIC <- c(z = r / sqrt((4 * n + 10) / (9 * n*(n-1)))) p <- pnorm(STATISTIC) } } } else { method<- "Spearman's rank correlation rho" names(NVAL) <- "rho" r <- cor(x,y,method="spearman") ESTIMATE <- c(rho = r) if(!is.finite(ESTIMATE)) { # all x or all y the same ESTIMATE[] <- NA STATISTIC <- c(S = NA) PVAL <- NA } else { ## Use the test statistic S = sum(rank(x) - rank(y))^2 ## and AS 89 for obtaining better p-values than via the ## simple normal approximation. ## In the case of no ties, S = (1-rho) * (n^3-n)/6. pspearman <- function(q, n, lower.tail = TRUE) { if(n <= 1290) # n*(n^2 - 1) does not overflow .C("prho", as.integer(n), as.double(q + 1), p = double(1), integer(1), as.logical(lower.tail), PACKAGE = "stats")$p else { # for large n: aymptotic t_{n-2} r <- 1 - 6 * q / (n*(n*n - 1)) pt(r / sqrt((1 - r^2)/(n-2)), df = n-2, lower.tail= !lower.tail) } } xr<-rank(x) yr<-rank(y) Gx<-table(xr) Gx<-Gx[Gx>1] Gx<-sum(Gx^3-Gx) Gy<-table(yr) Gy<-Gy[Gy>1] Gy<-sum(Gy^3-Gy) q <-round(1/6*((n^3-n)-(Gx+Gy)/2-r*sqrt((n^3-n)^2-(Gx+Gy)*(n^3-n)+Gx*Gy))) STATISTIC <- c(S = q) PVAL <- switch(alternative, "two.sided" = { p <- if(q > (n^3 - n) / 6) pspearman(q - 1, n, lower.tail = FALSE) else pspearman(q, n, lower.tail = TRUE) min(2 * p, 1) }, "greater" = pspearman(q, n, lower.tail = TRUE), "less" = pspearman(q - 1, n, lower.tail = FALSE)) } } } if(is.null(PVAL)) # for "pearson" only, currently PVAL <- switch(alternative, "less" = p, "greater" = 1 - p, "two.sided" = 2 * min(p, 1 - p)) RVAL <- list(statistic = STATISTIC, parameter = PARAMETER, p.value = as.numeric(PVAL), estimate = ESTIMATE, null.value = NVAL, alternative = alternative, method = method, data.name = DNAME) if(conf.int) RVAL <- c(RVAL, list(conf.int = cint)) class(RVAL) <- "htest" RVAL } test.formula <- function(formula, data, subset, na.action, ...) { if(missing(formula) || !inherits(formula, "formula") || length(formula) != 2) stop("formula missing or invalid") m <- match.call(expand.dots = FALSE) if(is.matrix(eval(m$data, parent.frame()))) m$data <- as.data.frame(data) m[[1]] <- as.name("model.frame") m$... <- NULL mf <- eval(m, environment(formula)) if(length(mf) != 2) stop("invalid formula") DNAME <- paste(names(mf), collapse = " and ") names(mf) <- c("x", "y") y <- do.call("test", c(mf, list(...))) y$data.name <- DNAME y }