Luigi Marongiu
2016-Feb-15 00:13 UTC
[R] cut-off point to separate overlapping distributions
dear all, I have a set of data that can be defined as bimodal; would be possible to define a cut-off to split it? my mathematical/statistical knowledge is limited, my apologies, thus i am not sure what kind of area this problem belongs to; at the moment I can fit -- thanks to previous help from the community -- the modes of the distribution, but the problem i am facing is to split the distribution in two. based on the example below, i recknon the cut-off would be where i placed the green arrow -- the local minimum -- but would be even better if the cut-off could be be placed as close as possible to the first peak (blue arrow). thank you luigi>>>xval <- c(42.39, 44.53, 5.05, 6.9, 45, 2.35, 20.73, 3.31, 45, 4.76, 2.47, 2.37, 2.8, 3.26, 3.21, 45, 6.41, 4.77, 4.72, 4.89, 44.71, 2.8, 4.08, 4.07, 4.81, 2.93, 2.73, 44.75, 2.61, 2.56, 2.75, 2.75, 44.82, 36.59, 2.8, 43.82, 2.53, 2.75, 2.73, 3.05, 2.66, 5.61, 2.28, 4.83, 38.63, 44.23, 2.35, 2.47, 44.03, 6.33, 2.7, 2.96, 42.85, 2.47, 2, 12.76, 2.99, 2, 35.11, 2.63, 44.69, 2.96, 45, 42.13, 41.04, 3.22, 45, 45, 2.55, 4.58, 3.09, 39.98, 2, 2.97, 2.87, 2, 44.82, 45, 2.95, 45, 2, 2.82, 2.47, 2.98, 4.81, 44.53, 44.38, 2.87, 44.45, 2.9, 2.48, 44.14, 3.05, 2.76, 45, 45, 44.54, 42.85, 3.17, 2.46, 39.95, 36.96, 2.59, 2.75, 5.38, 2.8, 44.53, 45, 38.84, 4.64, 3.04, 2.59, 2.64, 45, 2.66, 44.37, 45, 26.32, 3.29, 40.44, 2, 41.51, 2, 45, 2, 5, 2.78, 2.11, 3.31, 2.61, 2.83, 2.6, 2.66, 2.95, 2.46, 2.58, 2.94, 45, 45, 2.71, 2.63, 2.81, 2, 3.29, 5.48, 45, 3.02, 2.82, 3.07, 2.65, 2.61, 2.67, 36.6, 2.08, 40.2, 45, 2.5, 45, 41.46, 45, 2.62, 2.77, 4.14, 2.63, 3.21, 4.79, 42.63, 2.66, 45, 4.69, 3.05, 45, 45, 2.97, 42.07, 2.73, 3.26, 5.17, 2.47, 44.66, 2.42, 5.14, 5.03, 2.65, 2.88, 2.69, 44.1, 3.15, 4.92, 42.02, 6.97, 2.46, 35.98, 2.95, 32.98, 2.79, 44.82, 2.84, 2.15, 44.42, 2.96, 45, 2.42, 2.75, 2.44, 4.58, 2, 45, 41.04, 4.04, 3.08, 2.46, 44.54, 3.21, 39.16, 2, 35.36, 3.08, 5.77, 2.71, 4.41, 2.46, 44.43, 2.62, 45, 2.7, 45, 41.43, 4.65, 3.05, 4.76, 40.66, 32.88, 45, 44.94, 44.67, 3.07, 2.92, 2.75, 2.63, 2.68, 34.15, 3.27, 2.47, 2, 2.63, 45, 3.06, 42.53, 35.25, 2.82, 42.62, 5.83, 4.69, 38.04, 2.47, 38.14, 3.73, 10, 4.93, 4.93, 4.65, 40.8, 2.32, 5.53, 3.01, 41.13, 4.5, 2.65, 44.85, 5.02, 2, 39.99, 2.89, 3.09, 2, 43.77, 44.53, 4.09, 6.22, 3.31, 44.64, 4.65, 45, 6.68, 39.93, 45, 2.77, 2.51, 2, 45, 4.08, 4.61, 6.11, 3.02, 44.8, 45, 44.54, 2.95, 2.77) yval <- c(-0.002, 0.001, 0.002, 0.001, -0.001, 0.003, 0.003, 0.005, 0, 0.011, 0.003, 0.011, 0.004, 0.012, 0.004, 0.005, 0.001, 0.007, 0.006, 0.007, -0.001, 0.011, 0.005, 0.002, 0.007, 0.028, 0.01, 0.002, 0.003, 0.007, 0.033, 0.006, 0.003, 0, 0.01, 0.018, 0.01, 0.008, 0.002, 0.022, 0.02, 0.002, 0.006, 0.008, 0, -0.002, -0.001, 0.001, 0.001, 0.007, 0.005, 0.011, 0.004, 0.001, 0.005, 0.001, 0.019, 0.002, 0.001, 0.002, -0.001, 0.003, 0.001, 0, -0.001, 0.002, 0.005, 0.001, 0, 0.007, 0.011, -0.001, 0.002, 0.01, 0.004, 0.003, 0.001, 0, 0.015, 0.004, 0.001, 0.003, 0.003, 0.027, 0.005, 0, 0.003, 0.003, 0, 0.017, 0.004, -0.001, 0.043, 0.003, -0.001, 0.001, 0, 0, 0.019, 0.003, -0.001, 0.001, 0.009, 0.013, 0.001, 0.021, 0.001, -0.001, -0.001, 0.002, 0.008, 0.004, 0.007, 0.001, 0.007, 0.001, 0, 0.001, 0.004, -0.001, 0.001, 0, 0.007, 0.003, 0.002, 0.001, 0.028, 0.002, 0.005, 0.013, 0.017, 0.013, 0.009, 0.021, 0.01, 0.007, 0.015, 0, 0.002, 0.002, 0.013, 0.012, 0, 0.034, 0.005, 0, 0.041, 0.02, 0.036, 0.004, 0.002, 0.004, 0.001, 0.001, 0.001, 0.003, 0.016, 0.002, 0, 0, 0.002, 0.009, 0.006, 0.022, 0.002, 0.01, 0, 0.012, 0.006, 0.01, 0.005, 0.001, 0.001, 0.027, 0.003, 0.002, 0.02, 0.014, 0.003, 0.003, -0.001, 0.002, 0.008, 0.011, 0.014, 0.017, 0, 0.015, 0.008, 0.001, 0.005, 0.002, 0.001, 0.012, 0.002, 0.004, 0, 0.012, 0, 0.001, 0.005, 0.001, 0.008, 0.011, 0.006, 0.007, 0.005, 0, 0, 0.003, 0.004, 0.002, 0, 0.015, -0.001, 0.002, -0.001, 0.006, 0.005, 0.006, 0.003, 0.002, 0.001, 0.001, 0, 0.007, 0.002, -0.001, 0.033, 0.01, 0.007, 0.003, 0, 0.001, -0.001, 0.011, 0.006, 0.015, 0.013, 0.01, 0.015, 0, 0.014, 0.001, 0.001, 0.003, 0, 0.012, -0.001, 0.002, 0.027, -0.001, 0.009, 0.021, 0.001, 0.001, -0.001, 0.008, 0.001, 0.017, 0.002, 0.027, 0.003, -0.001, 0.018, 0.026, -0.001, 0.004, 0.003, 0.001, 0.019, 0.001, 0.001, 0.019, 0.005, 0.001, -0.001, 0.002, 0.001, 0.001, 0.004, 0.002, 0.007, 0.003, 0.01, 0.002, 0.003, 0.002, 0.005, 0.003, 0, 0.004, 0.032, 0.005, 0.018, 0.002, 0.001, 0.002, 0.003, 0.005) df <- data.frame(xval,yval) d.rv = density(df$xval) plot(df$yval ~ df$xval, ylim=c(0,0.05)) lines(d.rv) d.x = d.rv$x d.y = d.rv$y runs <- rle(sign(diff(d.rv$y))) length(runs$lengths) mode1 <- runs$lengths[1]+1 mode2 <- length(d.rv$x)- runs$lengths[4] Y1 <- d.rv$y[mode1] X1 <- d.rv$x[d.rv$y == Y1] abline(v=X1, col="red") Y2 <- d.rv$y[mode2] X2 <- d.rv$x[d.rv$y == Y2] abline(v=X2, col="red") arrows(21, 0.03, 21, 0.01, col="green") arrows(10, 0.03, 10, 0.01, col="blue")