Hi,
I am new to R.
and even though I've made my code to run and do what it needs to .
It is taking forever and I can't use it like this.
I was wondering if you could help me find ways to fix the code to run
faster.
Here are my codes..
the data set is a bunch of 0s and 1s in a data.frame.
What I am doing is this.
I pick a column and make up a new column Y with values associated with that
column I picked.
then I remove the column.
and see if any other columns in the data have a significant association with
the Y column I've generated.
If you see anything obvious that takes a long time, any suggestions would be
appreciated.
thanks a lot.
Mira
----------------------------------------------------------------------------------------------
#sub function for finding index
rfind <- function(x)seq(along=x)[x*(1-x)>MAF*(1-MAF)]
#sub function for permutation test
perm.F = function(y,x,nperms,sites)
{
maxF = c();
for (i in 1:nperms)
{
F=numeric(S) #create an empty vector to store the F-values
newY=sample(y,length(y)) #permute the cancer types
newX = cbind(x, newY);
# anova for all sites
for ( i in sites )
{
a <- anova(lm(newY~factor(newX[,i])));
F[i] <- a$`F value`[1];
}
MSSid <- which (F == max(F)); # index of MSS (Most Significant Site)
maxF = cbind(maxF,max(F));
}
maxF;
}
# set the output file
sink("/tmp/R.out.3932.100")
# load the dataset
snp = read.table(file("/tmp/msoutput.3932.100"))
#print (snp);
# pi: desired proportion of variation due to QTN
pi = 0.05;
print (paste("pi:", pi));
MAF = 0.05;
print (paste("MAF:", MAF));
# S: number of segregating sites
S = length(snp[1,]);
# N: number of samples
N = length(snp[,1]);
Dips = sample(1:N,N)
DipA = Dips[1:50]
DipB = Dips[51:100]
disnp = snp[DipA,]+snp[DipB,]
snp = as.data.frame(disnp, row.names=NULL);
N = length(snp[,1]);
# get allele freq for all SNPs
allele_f <- mean(snp[,1:S])/2;
print (allele_f);
sites = rfind(allele_f);
print(sites);
# collapse sites that have perfect correlation
newsites <- sites;
for (i in 1:(length(sites)-1))
{
for (j in (i+1):length(sites))
{
test = (snp[sites[i]] == snp[sites[j]])
if ( all(test) || all(!test) )
{
print (paste("perfect correlation with", sites[i]));
print (paste("removing alleles", sites[j]));
newsites <- newsites[newsites!=sites[j]];
}
}
}
sites <- newsites;
print(sites);
# QTN: the site nearest right to S/4
sitesid = floor(length(sites)/4);
QTNid = sites[sitesid];
QTN = snp[,QTNid];
print (paste("QTN:", names(snp[QTNid])));
print (QTN);
# remove QTN from sites
sites <- sites [ sites != QTNid ];
print(sites);
print (paste("Number of usable SNPs:", length(sites)));
# p: allele frequency of QTN
p0 = allele_f[QTNid];
p = min(p0, 1-p0);
print (paste("QTN_freq:", p));
# z: random normal deviate
z = rnorm(N, mean = 0, sd = 1);
# foreach sample give quantitative phenotype
# each row is a sample
# phenotype value depends on QTN genotype, pi, p, and z
Y <- sqrt(10-(10*pi))*z + QTN*sqrt((10*pi)/(2*p*(1-p)));
snp = data.frame(cbind(snp, Y));
# anova for QTN
df=data.frame(Y=Y, QTN=factor(QTN));
QTN_a <- anova(lm(Y~QTN, data=df));
print (QTN_a);
SSB <- QTN_a$`Sum Sq`[1];
SSW <- QTN_a$`Sum Sq`[2];
QTN_PRE <- SSB / (SSB + SSW);
print (paste("var_QTN/var_tot:", QTN_PRE));
# anova for all sites
F=numeric(S) #create an empty vector to store the F-values
Pval=rep(1,S) #create an empty vector to store the Pval
PRE=numeric(S) #create an empty vector to store the PRE
for ( i in sites )
{
a <- anova(lm(Y~factor(snp[,i])));
print (a);
F[i] <- a$`F value`[1];
Pval[i] <- a$`Pr`[1];
SSB <- a$`Sum Sq`[1];
SSW <- a$`Sum Sq`[2];
PRE[i] <- SSB / (SSB + SSW);
}
print (paste("Max F:", max(F)));
MSSid <- which (F == max(F)); # index of MSS (Most Significant Site)
MSS = snp[,MSSid];
print (paste("MSS(Most Significant Site):", MSSid));
p0 = length(MSS[MSS==0])/N;
p = min(p0, 1-p0);
print (paste("assoc_freq:", p));
print (paste("assoc_var:", PRE[MSSid]));
#lets do a permutation test
Fdist <- perm.F(Y, snp[,1:S], 1000, sites);
print ("permutation test maxF dist");
print (Fdist);
pvalue <- mean(Fdist>F[MSSid]);
print (paste("assoc_prob:", pvalue));
# close the output file
sink()
--------------------------------------------------------------------------------------------------
[[alternative HTML version deleted]]
Here is the dataset you need to run this code.. http://mypage.iu.edu/~mirhan/msoutput.3932.100 thank you. On 1/21/07, miraceti <miraceti@gmail.com> wrote:> > Hi, > I am new to R. > and even though I've made my code to run and do what it needs to . > It is taking forever and I can't use it like this. > I was wondering if you could help me find ways to fix the code to run > faster. > Here are my codes.. > the data set is a bunch of 0s and 1s in a data.frame. > What I am doing is this. > I pick a column and make up a new column Y with values associated with > that column I picked. > then I remove the column. > and see if any other columns in the data have a significant association > with the Y column I've generated. > If you see anything obvious that takes a long time, any suggestions would > be appreciated. > thanks a lot. > > Mira > > > ---------------------------------------------------------------------------------------------- > #sub function for finding index > rfind <- function(x)seq(along=x)[x*(1-x)>MAF*(1-MAF)] > > #sub function for permutation test > perm.F = function(y,x,nperms,sites) > { > maxF = c(); > for (i in 1:nperms) > { > F=numeric(S) #create an empty vector to store the F-values > newY=sample(y,length(y)) #permute the cancer types > newX = cbind(x, newY); > # anova for all sites > for ( i in sites ) > { > a <- anova(lm(newY~factor(newX[,i]))); > F[i] <- a$`F value`[1]; > } > MSSid <- which (F == max(F)); # index of MSS (Most Significant Site) > maxF = cbind(maxF,max(F)); > } > maxF; > } > > > # set the output file > sink("/tmp/R.out.3932.100") > # load the dataset > snp = read.table(file("/tmp/msoutput.3932.100")) > #print (snp); > > # pi: desired proportion of variation due to QTN > pi = 0.05; > print (paste("pi:", pi)); > MAF = 0.05; > print (paste("MAF:", MAF)); > # S: number of segregating sites > S = length(snp[1,]); > # N: number of samples > N = length(snp[,1]); > Dips = sample(1:N,N) > DipA = Dips[1:50] > DipB = Dips[51:100] > disnp = snp[DipA,]+snp[DipB,] > snp = as.data.frame(disnp, row.names=NULL); > N = length(snp[,1]); > > # get allele freq for all SNPs > allele_f <- mean(snp[,1:S])/2; > print (allele_f); > sites = rfind(allele_f); > print(sites); > > # collapse sites that have perfect correlation > newsites <- sites; > for (i in 1:(length(sites)-1)) > { > for (j in (i+1):length(sites)) > { > test = (snp[sites[i]] == snp[sites[j]]) > if ( all(test) || all(!test) ) > { > print (paste("perfect correlation with", sites[i])); > print (paste("removing alleles", sites[j])); > newsites <- newsites[newsites!=sites[j]]; > } > } > } > sites <- newsites; > print(sites); > > # QTN: the site nearest right to S/4 > sitesid = floor(length(sites)/4); > QTNid = sites[sitesid]; > QTN = snp[,QTNid]; > > print (paste("QTN:", names(snp[QTNid]))); > print (QTN); > > # remove QTN from sites > sites <- sites [ sites != QTNid ]; > print(sites); > print (paste("Number of usable SNPs:", length(sites))); > > # p: allele frequency of QTN > p0 = allele_f[QTNid]; > p = min(p0, 1-p0); > print (paste("QTN_freq:", p)); > > # z: random normal deviate > z = rnorm(N, mean = 0, sd = 1); > # foreach sample give quantitative phenotype > # each row is a sample > # phenotype value depends on QTN genotype, pi, p, and z > Y <- sqrt(10-(10*pi))*z + QTN*sqrt((10*pi)/(2*p*(1-p))); > snp = data.frame(cbind(snp, Y)); > # anova for QTN > df=data.frame(Y=Y, QTN=factor(QTN)); > QTN_a <- anova(lm(Y~QTN, data=df)); > print (QTN_a); > SSB <- QTN_a$`Sum Sq`[1]; > SSW <- QTN_a$`Sum Sq`[2]; > QTN_PRE <- SSB / (SSB + SSW); > print (paste("var_QTN/var_tot:", QTN_PRE)); > > # anova for all sites > F=numeric(S) #create an empty vector to store the F-values > Pval=rep(1,S) #create an empty vector to store the Pval > PRE=numeric(S) #create an empty vector to store the PRE > > for ( i in sites ) > { > a <- anova(lm(Y~factor(snp[,i]))); > print (a); > F[i] <- a$`F value`[1]; > Pval[i] <- a$`Pr`[1]; > SSB <- a$`Sum Sq`[1]; > SSW <- a$`Sum Sq`[2]; > PRE[i] <- SSB / (SSB + SSW); > > } > print (paste("Max F:", max(F))); > MSSid <- which (F == max(F)); # index of MSS (Most Significant Site) > MSS = snp[,MSSid]; > print (paste("MSS(Most Significant Site):", MSSid)); > p0 = length(MSS[MSS==0])/N; > p = min(p0, 1-p0); > print (paste("assoc_freq:", p)); > print (paste("assoc_var:", PRE[MSSid])); > #lets do a permutation test > Fdist <- perm.F(Y, snp[,1:S], 1000, sites); > print ("permutation test maxF dist"); > print (Fdist); > pvalue <- mean(Fdist>F[MSSid]); > print (paste("assoc_prob:", pvalue)); > > # close the output file > sink() > > -------------------------------------------------------------------------------------------------- > >[[alternative HTML version deleted]]
Dear Mira, I didn't work through your code in detail, but I did notice that you're doing something that's very inefficient in R -- building up objects element-by-element, e.g., by indexing beyond their current length. Instead, you can preallocate the objects and simply replace elements. For example, create the vector F in your perm.F() as F <- numeric(nperms) rather than as an empty vector. (BTW, I'd probably not name a variable "F" since this is usually a synonym for the logical value FALSE.) There's a similar problem in your handling of maxF, which you build up column-by-column via cbind(). (BTW, is maxF really a matrix?) You also needlessly recompute max(F), never appear to use MSSid, and end lines with unnecessary semicolons. I hope that this helps, John -------------------------------- John Fox Department of Sociology McMaster University Hamilton, Ontario Canada L8S 4M4 905-525-9140x23604 http://socserv.mcmaster.ca/jfox --------------------------------> -----Original Message----- > From: r-help-bounces at stat.math.ethz.ch > [mailto:r-help-bounces at stat.math.ethz.ch] On Behalf Of miraceti > Sent: Sunday, January 21, 2007 12:38 PM > To: r-help at stat.math.ethz.ch > Subject: [R] efficient code. how to reduce running time? > > Hi, > I am new to R. > and even though I've made my code to run and do what it needs to . > It is taking forever and I can't use it like this. > I was wondering if you could help me find ways to fix the > code to run faster. > Here are my codes.. > the data set is a bunch of 0s and 1s in a data.frame. > What I am doing is this. > I pick a column and make up a new column Y with values > associated with that column I picked. > then I remove the column. > and see if any other columns in the data have a significant > association with the Y column I've generated. > If you see anything obvious that takes a long time, any > suggestions would be appreciated. > thanks a lot. > > Mira > > -------------------------------------------------------------- > -------------------------------- > #sub function for finding index > rfind <- function(x)seq(along=x)[x*(1-x)>MAF*(1-MAF)] > > #sub function for permutation test > perm.F = function(y,x,nperms,sites) > { > maxF = c(); > for (i in 1:nperms) > { > F=numeric(S) #create an empty vector to store the F-values > newY=sample(y,length(y)) #permute the cancer types > newX = cbind(x, newY); > # anova for all sites > for ( i in sites ) > { > a <- anova(lm(newY~factor(newX[,i]))); > F[i] <- a$`F value`[1]; > } > MSSid <- which (F == max(F)); # index of MSS (Most > Significant Site) > maxF = cbind(maxF,max(F)); > } > maxF; > } > > > # set the output file > sink("/tmp/R.out.3932.100") > # load the dataset > snp = read.table(file("/tmp/msoutput.3932.100")) > #print (snp); > > # pi: desired proportion of variation due to QTN pi = 0.05; > print (paste("pi:", pi)); MAF = 0.05; print (paste("MAF:", > MAF)); # S: number of segregating sites S = length(snp[1,]); > # N: number of samples N = length(snp[,1]); Dips = > sample(1:N,N) DipA = Dips[1:50] DipB = Dips[51:100] disnp = > snp[DipA,]+snp[DipB,] snp = as.data.frame(disnp, > row.names=NULL); N = length(snp[,1]); > > # get allele freq for all SNPs > allele_f <- mean(snp[,1:S])/2; > print (allele_f); > sites = rfind(allele_f); > print(sites); > > # collapse sites that have perfect correlation newsites <- > sites; for (i in 1:(length(sites)-1)) { > for (j in (i+1):length(sites)) > { > test = (snp[sites[i]] == snp[sites[j]]) > if ( all(test) || all(!test) ) > { > print (paste("perfect correlation with", sites[i])); > print (paste("removing alleles", sites[j])); > newsites <- newsites[newsites!=sites[j]]; > } > } > } > sites <- newsites; > print(sites); > > # QTN: the site nearest right to S/4 > sitesid = floor(length(sites)/4); > QTNid = sites[sitesid]; > QTN = snp[,QTNid]; > > print (paste("QTN:", names(snp[QTNid]))); print (QTN); > > # remove QTN from sites > sites <- sites [ sites != QTNid ]; > print(sites); > print (paste("Number of usable SNPs:", length(sites))); > > # p: allele frequency of QTN > p0 = allele_f[QTNid]; > p = min(p0, 1-p0); > print (paste("QTN_freq:", p)); > > # z: random normal deviate > z = rnorm(N, mean = 0, sd = 1); > # foreach sample give quantitative phenotype # each row is a > sample # phenotype value depends on QTN genotype, pi, p, and > z Y <- sqrt(10-(10*pi))*z + QTN*sqrt((10*pi)/(2*p*(1-p))); > snp = data.frame(cbind(snp, Y)); # anova for QTN > df=data.frame(Y=Y, QTN=factor(QTN)); QTN_a <- anova(lm(Y~QTN, > data=df)); print (QTN_a); SSB <- QTN_a$`Sum Sq`[1]; SSW <- > QTN_a$`Sum Sq`[2]; QTN_PRE <- SSB / (SSB + SSW); print > (paste("var_QTN/var_tot:", QTN_PRE)); > > # anova for all sites > F=numeric(S) #create an empty vector to store the F-values > Pval=rep(1,S) #create an empty vector to store the Pval > PRE=numeric(S) #create an empty vector to store the PRE > > for ( i in sites ) > { > a <- anova(lm(Y~factor(snp[,i]))); > print (a); > F[i] <- a$`F value`[1]; > Pval[i] <- a$`Pr`[1]; > SSB <- a$`Sum Sq`[1]; > SSW <- a$`Sum Sq`[2]; > PRE[i] <- SSB / (SSB + SSW); > > } > print (paste("Max F:", max(F))); > MSSid <- which (F == max(F)); # index of MSS (Most > Significant Site) MSS = snp[,MSSid]; print (paste("MSS(Most > Significant Site):", MSSid)); p0 = length(MSS[MSS==0])/N; p = > min(p0, 1-p0); print (paste("assoc_freq:", p)); print > (paste("assoc_var:", PRE[MSSid])); #lets do a permutation > test Fdist <- perm.F(Y, snp[,1:S], 1000, sites); print > ("permutation test maxF dist"); print (Fdist); pvalue <- > mean(Fdist>F[MSSid]); print (paste("assoc_prob:", pvalue)); > > # close the output file > sink() > -------------------------------------------------------------- > ------------------------------------ > > [[alternative HTML version deleted]] > > ______________________________________________ > R-help at stat.math.ethz.ch mailing list > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide > http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, reproducible code.