Dear Jen,
Vectorisation is the keyword here. A sample of 250k takes only 2.5 seconds on
my machine; 2.5 million takes 29 seconds.
# Generate n random Moroccan mobile phone numbers of the form
# +212 <3-digit prefix> <6 random digits>, using vectorised calls only.

# Number of phone numbers to generate.
n <- 250e3

# country code
cc <- "+212"

# prefixes, one vector per operator
IAM <- c(
  610, 611, 613, 615, 616,
  618, 641, 642, 648, 650, 651, 652, 653,
  654, 655, 658, 659, 661, 662, 666, 667,
  668, 670, 671, 672, 673,
  676, 677, 678
)
Medi <- c(
  612, 614, 617, 619, 644,
  645, 649, 656, 657, 660, 663, 664, 665,
  669, 674, 675, 679
)
# Written without the leading zero: R parses 0636 as the number 636, so the
# zero never reached the generated strings anyway.
MOROC <- c(636, 637)

# One row per prefix, labelled with the operator it belongs to.
prefix <- rbind(
  data.frame(
    region = "IAM",
    prefix = IAM
  ),
  data.frame(
    region = "Medi",
    prefix = Medi
  ),
  data.frame(
    region = "MOROC",
    prefix = MOROC
  )
)

# Add a Freq column: the number of prefixes listed for each operator.
prefix <- merge(
  prefix,
  as.data.frame(table(region = prefix$region))
)

system.time({
  # Draw all n prefixes in a single call.
  # NOTE(review): prob = Freq weights every prefix by the size of its
  # operator's list, so operators with more prefixes are drawn more often.
  # The loop in the quoted message appears to pick the operator with equal
  # probability first, which would correspond to prob = 1 / prefix$Freq.
  # Confirm which weighting is intended.
  prefix.sample <- sample(
    prefix$prefix,
    n,
    prob = prefix$Freq,
    replace = TRUE
  )

  # Draw 6 * n digits at once, arrange them as n rows of 6 digits and
  # collapse every row into one 6-character string.
  nums <- apply(
    matrix(
      sample(0:9, 6 * n, replace = TRUE),
      ncol = 6
    ),
    1,
    paste,
    collapse = ""
  )

  # country code + prefix + subscriber digits
  phonenumbers <- paste0(cc, prefix.sample, nums)
})
ir. Thierry Onkelinx
Instituut voor natuur- en bosonderzoek / Research Institute for Nature and
Forest
team Biometrie & Kwaliteitszorg / team Biometrics & Quality Assurance
Kliniekstraat 25
1070 Anderlecht
Belgium
To call in the statistician after the experiment is done may be no more
than asking him to perform a post-mortem examination: he may be able to say
what the experiment died of. ~ Sir Ronald Aylmer Fisher
The plural of anecdote is not data. ~ Roger Brinner
The combination of some data and an aching desire for an answer does not
ensure that a reasonable answer can be extracted from a given body of data.
~ John Tukey
2017-02-28 17:22 GMT+01:00 Jen <plessthanpointohfive at gmail.com>:
> Hi, I'm trying to generate 2.5 million phone numbers. The code below
> generates a random sample of 250K MPNS for Morocco. It takes about 10
> minutes.
>
> I need to generate 2.5 million. I've run it through once and it took about
> 45 hours.
>
> Is there a way to speed this up?
>
> Thanks,
>
> Jen
>
> # generate random sample of mobile phone numbers (MPNs) - Morocco
>
> # Mobile phone number format: +212-6xx-xxxxxx
>
> library(data.table)
>
> # country code
>
> cc <- "+212"
>
> # prefixes
>
> IAM <- data.table(matrix(c(610, 611, 613, 615, 616,
> 618, 641, 642, 648, 650, 651, 652, 653,
> 654, 655, 658, 659, 661, 662, 666, 667,
> 668, 670, 671, 672, 673,
> 676, 677, 678), dimnames=list(NULL, "IAM")))
>
>
>
> Medi <- data.table(matrix(c(612, 614, 617, 619, 644,
> 645, 649, 656, 657, 660, 663, 664, 665,
> 669, 674, 675, 679), dimnames=list(NULL,
"Medi")))
>
> MOROC <- data.table(matrix(c(0636, 0637), dimnames=list(NULL,
"MOROC")))
>
> # combine
>
> mno <- c(IAM, Medi, MOROC)
>
> # generate MPNs
> MPN <- NULL
>
> system.time(for (i in 1:250000){
> # randomly select number from list
>
> prefix <- sapply(mno[floor(runif(1, 1, length(mno)+1))], function(x)
> sample(x, 1))
>
> MNO <- names(prefix)
>
> # randomly generate 6 numbers between 0 and 9, inclusive
>
> nums <- floor(runif(6, 0, 9))
>
> # concatenate
>
> tmp <- c(paste(c(cc,prefix,t(nums)), sep="",
collapse=""), MNO)
>
> MPN[[i]] <- tmp
>
> i <- i+1
>
>
> })
>
> # unlist
>
> df <- data.table(matrix(unlist(MPN), nrow=length(MPN), ncol=2, byrow=T,
> dimnames = list(seq(1, length(MPN),1), c("MPN", "MNO"))
))
>
> [[alternative HTML version deleted]]
>
> ______________________________________________
> R-help at r-project.org mailing list -- To UNSUBSCRIBE and more, see
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide http://www.R-project.org/
> posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.
>
[[alternative HTML version deleted]]