Hello,

I'm sorry, but apparently I missed the point of your problem. Please do not take my previous answer seriously.

But you can use ks.test, just in a different way from what I wrote previously. Corrected code:

#simulation
for (i in 1:1000) {
  #sample from the reference distribution
  m_2 <- m_1[sample(nrow(m_1), size = i, prob = p_1, replace = FALSE), ]
  m_2 <- m_2[order(m_2$d_1), ]
  d_2 <- m_2$d_1
  p_2 <- m_2$p_1

  #weighted ecdf for the reference distribution and the sample
  f_d_1 <- ewcdf(d_1, normalise = FALSE)
  f_d_2 <- ewcdf(d_2, 1/p_2, normalise = FALSE, adjust = 1/length(d_2))

  #kolmogorov-smirnov distance
  x <- f_d_1(d_2)
  y <- f_d_2(d_2)
  ht <- ks.test(x, y)
  d_stat[i, 2] <- ht$statistic
  d_stat[i, 3] <- ht$p.value
}

Hope this helps,

Rui Barradas

Às 19:29 de 05/09/19, Rui Barradas escreveu:
> Hello,
>
> I don't have the algorithms at hand, but the KS statistic calculation is
> more complicated than your max/abs difference.
>
> Anyway, why not use ks.test? It's not that difficult:
>
> set.seed(1234)
> #reference distribution
> d_1 <- sort(rpois(1000, 500))
> p_1 <- d_1/sum(d_1)
> m_1 <- data.frame(d_1, p_1)
>
> #data frame to store the values of the simulation
> d_stat <- data.frame(1:1000, NA, NA)
> names(d_stat) <- c("sample_size", "ks_distance", "p_value")
>
> #simulation
> for (i in 1:1000) {
>   #sample from the reference distribution
>   m_2 <- m_1[sample(nrow(m_1), size = i, prob = p_1, replace = FALSE), ]
>   d_2 <- m_2$d_1
>
>   ht <- ks.test(d_1, d_2)
>   #kolmogorov-smirnov distance
>   d_stat[i, 2] <- ht$statistic
>   d_stat[i, 3] <- ht$p.value
> }
>
> hist(d_stat[, 2])
> hist(d_stat[, 3])
>
> Note that d_2 is not sorted, but the results are equal in the sense of
> function identical(), meaning they are *exactly* the same. Why shouldn't
> they be?
>
> Hope this helps,
>
> Rui Barradas
>
> Às 17:06 de 05/09/19, Boo G. escreveu:
>> Hello,
>>
>> I am trying to perform a Kolmogorov–Smirnov test to assess the
>> difference between a reference distribution and samples of different
>> sizes drawn with probability proportional to size. I managed to
>> compute the Kolmogorov–Smirnov distance, but I am lost with the
>> p-value. I have looked into the ks.test function unsuccessfully. Can
>> anyone help me with computing p-values for a two-tailed test?
>>
>> Below is a simplified version of my code.
>>
>> Thanks in advance.
>> Gianluca
>>
>> library(spatstat)
>>
>> #reference distribution
>> d_1 <- sort(rpois(1000, 500))
>> p_1 <- d_1/sum(d_1)
>> m_1 <- data.frame(d_1, p_1)
>>
>> #data frame to store the values of the simulation
>> d_stat <- data.frame(1:1000, NA, NA)
>> names(d_stat) <- c("sample_size", "ks_distance", "p_value")
>>
>> #simulation
>> for (i in 1:1000) {
>>   #sample from the reference distribution
>>   m_2 <- m_1[sample(nrow(m_1), size = i, prob = p_1, replace = FALSE), ]
>>   m_2 <- m_2[order(m_2$d_1), ]
>>   d_2 <- m_2$d_1
>>   p_2 <- m_2$p_1
>>
>>   #weighted ecdf for the reference distribution and the sample
>>   f_d_1 <- ewcdf(d_1, normalise = FALSE)
>>   f_d_2 <- ewcdf(d_2, 1/p_2, normalise = FALSE, adjust = 1/length(d_2))
>>
>>   #kolmogorov-smirnov distance
>>   d_stat[i,2] <- max(abs(f_d_1(d_2) - f_d_2(d_2)))
>> }
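A side note on the ewcdf() calls above, for readers who have not used them: ewcdf() in spatstat generalises ecdf() by letting each observation carry a weight, which is presumably why the sample is weighted by 1/p_2 after being drawn with probability p_1. A minimal sketch with made-up data, not taken from the thread:

library(spatstat)

set.seed(1)
x <- rnorm(10)

## with default (equal) weights, ewcdf() agrees with the usual ecdf()
F1 <- ecdf(x)
F2 <- ewcdf(x)
all.equal(F1(x), F2(x))    # TRUE

## with weights, each observation contributes in proportion to its
## weight; here the first observation counts five times as much
w <- c(5, rep(1, 9))
F3 <- ewcdf(x, w)          # normalise = TRUE by default, so F3 ends at 1
cbind(unweighted = F2(sort(x)), weighted = F3(sort(x)))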
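And since the corrected loop refers to objects (m_1, p_1, d_1, d_stat) defined only in the quoted messages, here is the whole thing stitched together into one self-contained script. Nothing new, just the setup from the earlier answer plus the corrected loop, assuming the spatstat package is installed:

library(spatstat)   # for ewcdf()

set.seed(1234)
#reference distribution
d_1 <- sort(rpois(1000, 500))
p_1 <- d_1/sum(d_1)
m_1 <- data.frame(d_1, p_1)

#data frame to store the values of the simulation
d_stat <- data.frame(1:1000, NA, NA)
names(d_stat) <- c("sample_size", "ks_distance", "p_value")

#simulation
for (i in 1:1000) {
  #sample from the reference distribution, proportionally to size
  m_2 <- m_1[sample(nrow(m_1), size = i, prob = p_1, replace = FALSE), ]
  m_2 <- m_2[order(m_2$d_1), ]
  d_2 <- m_2$d_1
  p_2 <- m_2$p_1

  #weighted ecdf for the reference distribution and the sample
  f_d_1 <- ewcdf(d_1, normalise = FALSE)
  f_d_2 <- ewcdf(d_2, 1/p_2, normalise = FALSE, adjust = 1/length(d_2))

  #kolmogorov-smirnov test on the two sets of ecdf values; ks.test may
  #warn about ties here, in which case its p-values are approximate
  ht <- ks.test(f_d_1(d_2), f_d_2(d_2))
  d_stat[i, 2] <- ht$statistic
  d_stat[i, 3] <- ht$p.value
}

hist(d_stat[, 2])
hist(d_stat[, 3])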
Hello again.

I have tried this before, but I see two problems:

1) According to the documentation I could read (including the ks.test
code), the KS statistic should be max(abs(x - y)), and if you plot this
for very low sample sizes you can actually see that it makes sense.
ks.test(x, y) yields very different values.

2) Also in this case the p-values don't make much sense, according to
my previous interpretation.

Again, I could be wrong in my interpretation.

On 5 Sep 2019, at 20:46, Rui Barradas <ruipbarradas at sapo.pt> wrote:

>   #kolmogorov-smirnov distance
>   x <- f_d_1(d_2)
>   y <- f_d_2(d_2)
>   ht <- ks.test(x, y)
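To make point 1 above concrete: ks.test(x, y) runs a two-sample test between the empirical distributions of x and y, so its statistic is not the pointwise difference max(abs(x - y)). A minimal sketch with made-up numbers, not taken from the thread:

x <- c(0.1, 0.4, 0.9)
y <- c(0.2, 0.5, 0.7)

## pointwise maximum difference between the paired values
max(abs(x - y))                    # 0.2

## two-sample KS distance between the ECDFs of x and y
unname(ks.test(x, y)$statistic)    # 0.333...

The two numbers answer different questions, which is consistent with the mismatch reported above.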
Hello,

Inline.

Às 20:09 de 05/09/19, Boo G. escreveu:
> Hello again.
>
> I have tried this before, but I see two problems:
>
> 1) According to the documentation I could read (including the ks.test
> code), the KS statistic should be max(abs(x - y)), and if you plot this
> for very low sample sizes you can actually see that it makes sense.
> ks.test(x, y) yields very different values.

The problem is that the distribution of Dn is very difficult to compute.
From the reference [1] in the help page ?ks.test:

   Kolmogorov's goodness-of-fit measure, Dn, for a sample CDF has
   consistently been set aside for methods such as the D+n or D-n of
   Smirnov, primarily, it seems, because of the difficulty of computing
   the distribution of Dn. As far as we know, no easy way to compute
   that distribution has ever been provided in the 70+ years since
   Kolmogorov's fundamental paper. We provide one here, a C procedure
   that provides Pr(Dn < d) with 13-15 digit accuracy for n ranging
   from 2 to at least 16000.

That is why I used ks.test and its Dn and p-values. Note that n >= 2, so
size = 1 is not covered (its p-value comes out as 1).

Also, the distribution of the p-values seems to get closer to a uniform
with increasing sample sizes. Try

hist(d_stat[801:1000, 3])

[1] George Marsaglia, Wai Wan Tsang, Jingbo Wang (2003). Evaluating
Kolmogorov's Distribution. Journal of Statistical Software, 8(18).
https://www.jstatsoft.org/article/view/v008i18

Hope this helps,

Rui Barradas

> 2) Also in this case the p-values don't make much sense, according to
> my previous interpretation.
>
> Again, I could be wrong in my interpretation.
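A possible follow-up to the last remark above: besides eyeballing the histogram, the near-uniformity of the p-values for the larger sample sizes can be checked with a one-sample KS test against the uniform distribution. A minimal sketch, assuming d_stat has been filled in by the simulation earlier in the thread:

## formal companion to hist(d_stat[801:1000, 3]): compare the p-values
## for sample sizes 801-1000 with U(0, 1); ks.test may warn about ties,
## in which case its p-value is only approximate
ks.test(d_stat[801:1000, 3], "punif")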