HI Satish,
colnames(Output)[4]<- colnames(dat2)[i]; #guess this line should be:
colnames(x1)[4]<- colnames(dat2)[i]
Regarding the warning, I used
read.table(..., stringsAsFactors=FALSE).? In your case, you might need to either
use that option while reading the data or convert the factor variables to
character class.
Check:
str(Output)
I forgot about sorting the data.? You can use either ?sort() or ?order
?dat1New<-dat1[order(dat1$CustID,as.Date(dat1$TripDate,"%d-%b-%y"),dat1$Store),]?
#in the example data, it didn't change anything
dat2<- dat1New[,-c(1:3)]
str(dat1New)
'data.frame':??? 7 obs. of? 7 variables:
?$ CustID? : int? 1 1 1 1 2 2 2
?$ TripDate: chr? "2-Jan-12" "6-Jan-12" "9-Jan-12"
"31-Mar-13" ... ##should be factor in your original dataset
?$ Store?? : chr? "a" "c" "a" "a" ...?
#####
?$ Bread?? : int? 2 0 3 3 0 3 3
?$ Butter? : int? 0 3 3 0 3 3 0
?$ Milk??? : int? 2 3 0 0 3 0 0
?$ Eggs??? : int? 1 0 0 0 0 0 0
Suppose, I read the data with stringsAsFactors=TRUE (default is this option)
dat1<- read.table(text="
CustID TripDate Store Bread Butter Milk Eggs
1 2-Jan-12 a 2 0 2 1
1 6-Jan-12 c 0 3 3 0
1 9-Jan-12 a 3 3 0 0
1 31-Mar-13 a 3 0 0 0
2 31-Aug-12 a 0 3 3 0
2 24-Sep-12 a 3 3 0 0
2 25-Sep-12 b 3 0 0 0
",sep="",header=TRUE)
?str(dat1)
'data.frame':??? 7 obs. of? 7 variables:
?$ CustID? : int? 1 1 1 1 2 2 2
?$ TripDate: Factor w/ 7 levels "24-Sep-12","25-Sep-12",..:
3 6 7 5 4 1 2
?$ Store?? : Factor w/ 3 levels "a","b","c": 1 3 1
1 1 1 2
?$ Bread?? : int? 2 0 3 3 0 3 3
?$ Butter? : int? 0 3 3 0 3 3 0
?$ Milk??? : int? 2 3 0 0 3 0 0
?$ Eggs??? : int? 1 0 0 0 0 0 0
dat2<- dat1[,-c(1:3)]
?
?res<- lapply(seq_len(ncol(dat2)),function(i)
{x1<-cbind(dat1[,c(1:3)],dat2[,i]);colnames(x1)[4]<-
colnames(dat2)[i];x2<-x1[x1[,4]!=0,];within(x2,
{daysbetweentrips<-unlist(tapply(as.Date(x2$TripDate,"%d-%b-%y"),list(x2$CustID),function(x)
c(NA,as.numeric(diff(x)))));previoustripstore<-ave(x2$Store,x2$CustID,FUN=function(x)
c(NA,x[-length(x)]));Nexttripstore<- ave(x2$Store,x2$CustID,FUN=function(x)
c(x[-1],NA))})})
Warning messages:
1: In `[<-.factor`(`*tmp*`, i, value = c(NA, 1L, 1L)) :
? invalid factor level, NA generated
2: In `[<-.factor`(`*tmp*`, i, value = c(NA, 1L)) :
? invalid factor level, NA generated
3: In `[<-.factor`(`*tmp*`, i, value = c(1L, 1L, NA)) :
? invalid factor level, NA generated
---------------------------------------------------
?
To convert to character class after reading the data:
dat1[]<-lapply(dat1,function(x) if(is.factor(x)) as.character(x) else x)
?str(dat1)
#'data.frame':??? 7 obs. of? 7 variables:
# $ CustID? : int? 1 1 1 1 2 2 2
# $ TripDate: chr? "2-Jan-12" "6-Jan-12"
"9-Jan-12" "31-Mar-13" ...
# $ Store?? : chr? "a" "c" "a" "a" ...
# $ Bread?? : int? 2 0 3 3 0 3 3
# $ Butter? : int? 0 3 3 0 3 3 0
# $ Milk??? : int? 2 3 0 0 3 0 0
# $ Eggs??? : int? 1 0 0 0 0 0 0
?dat2<- dat1[,-c(1:3)]
?
? res<- lapply(seq_len(ncol(dat2)),function(i)
{x1<-cbind(dat1[,c(1:3)],dat2[,i]);colnames(x1)[4]<-
colnames(dat2)[i];x2<-x1[x1[,4]!=0,];within(x2,
{daysbetweentrips<-unlist(tapply(as.Date(x2$TripDate,"%d-%b-%y"),list(x2$CustID),function(x)
c(NA,as.numeric(diff(x)))));previoustripstore<-ave(x2$Store,x2$CustID,FUN=function(x)
c(NA,x[-length(x)]));Nexttripstore<- ave(x2$Store,x2$CustID,FUN=function(x)
c(x[-1],NA))})}) #works
A.K.
?
Hi Arun-
?
Thanks for this...
?
I ran this code. without the days between trips... Can you please
confirm the paranthesis and code looks?right.?. they do to me....
?
res<- lapply(seq_len(ncol(dat2)),function(i)
{
x1<-cbind(Output[,c(1:3)],dat2[,i]);
colnames(Output)[4]<- colnames(dat2)[i];
x2<-x1[x1[,4]!=0,];
previoustripstore<-ave(x2$store,x2$CUSTID,FUN=function(x)
c(NA,x[-length(x)]));
Nexttripstore<- ave(x2$store,x2$CUSTID,FUN=function(x) c(x[-1],NA))
}
)
?
But i get an warning like this:In `[<-.factor`(`*tmp*`, i, value = c(NA, 3L,
3L, 3L,? ... :
? invalid factor level, NA generated
?
Wat might be wrong? Please help
?
Thanks,
Satish
----- Original Message -----
From: arun <smartpink111 at yahoo.com>
To: R help <r-help at r-project.org>
Cc:
Sent: Monday, September 2, 2013 5:01 PM
Subject: Re: R dataframe and looping help
HI,
You may try this:
dat1<- read.table(text="
CustID TripDate Store Bread Butter Milk Eggs
1 2-Jan-12 a 2 0 2 1
1 6-Jan-12 c 0 3 3 0
1 9-Jan-12 a 3 3 0 0
1 31-Mar-13 a 3 0 0 0
2 31-Aug-12 a 0 3 3 0
2 24-Sep-12 a 3 3 0 0
2 25-Sep-12 b 3 0 0 0
",sep="",header=TRUE,stringsAsFactors=FALSE)
dat2<- dat1[,-c(1:3)]
res<- lapply(seq_len(ncol(dat2)),function(i)
{x1<-cbind(dat1[,c(1:3)],dat2[,i]);colnames(x1)[4]<-
colnames(dat2)[i];x2<-x1[x1[,4]!=0,];within(x2,
{daysbetweentrips<-unlist(tapply(as.Date(x2$TripDate,"%d-%b-%y"),list(x2$CustID),function(x)
c(NA,as.numeric(diff(x)))));previoustripstore<-ave(x2$Store,x2$CustID,FUN=function(x)
c(NA,x[-length(x)]));Nexttripstore<- ave(x2$Store,x2$CustID,FUN=function(x)
c(x[-1],NA))})})
?res
#[[1]]
?# CustID? TripDate Store Bread Nexttripstore previoustripstore daysbetweentrips
#1????? 1? 2-Jan-12???? a???? 2???????????? a?????????????
<NA>?????????????? NA
#3????? 1? 9-Jan-12???? a???? 3???????????? a???????????????? a??????????????? 7
#4????? 1 31-Mar-13???? a???? 3????????? <NA>????????????????
a????????????? 447
#6????? 2 24-Sep-12???? a???? 3???????????? b?????????????
<NA>?????????????? NA
#7????? 2 25-Sep-12???? b???? 3????????? <NA>????????????????
a??????????????? 1
#[[2]]
?# CustID? TripDate Store Butter Nexttripstore previoustripstore
#2????? 1? 6-Jan-12???? c????? 3???????????? a????????????? <NA>
#3????? 1? 9-Jan-12???? a????? 3????????? <NA>???????????????? c
#5????? 2 31-Aug-12???? a????? 3???????????? a????????????? <NA>
#6????? 2 24-Sep-12???? a????? 3????????? <NA>???????????????? a
?# daysbetweentrips
#2?????????????? NA
#3??????????????? 3
#5?????????????? NA
#6?????????????? 24
#[[3]]
?# CustID? TripDate Store Milk Nexttripstore previoustripstore daysbetweentrips
#1????? 1? 2-Jan-12???? a??? 2???????????? c?????????????
<NA>?????????????? NA
#2????? 1? 6-Jan-12???? c??? 3????????? <NA>????????????????
a??????????????? 4
#5????? 2 31-Aug-12???? a??? 3????????? <NA>?????????????
<NA>?????????????? NA
#[[4]]
?# CustID TripDate Store Eggs Nexttripstore previoustripstore daysbetweentrips
#1????? 1 2-Jan-12???? a??? 1????????? <NA>?????????????
<NA>?????????????? NA
A.K.
Hi, I have a very quick question.. I have a data which has sales per
category per trip of each customer at different store locations, like
below..(dataset1 frome xcel attachment) CustID??? TripDate??? Store??? Bread???
Butter??? Milk??? Eggs
1??? 2-Jan-12??? ? a??? 2??? 0??? 2??? 1
1??? 6-Jan-12??? ? c??? 0??? 3??? 3??? 0
1??? 9-Jan-12??? ? a??? 3??? 3??? 0??? 0
1??? 31-Mar-13 a??? 3??? 0??? 0??? 0
2??? 31-Aug-12 a??? 0??? 3??? 3??? 0
2??? 24-Sep-12 a??? 3??? 3??? 0??? 0
2??? 25-Sep-12 b??? 3??? 0??? 0??? 0 Here i have shown 4 items and their sales
per customer per trip at each
store... However, my data contains around 100 columns with item names..
All i need to do is following: 1. Create a separate dataframe for each item.
That is, create 100
dataframs one for each item.. Within the dataframe for Butter, for
example, will be contained columns 1-3 and Butter column, specifically
filtered for rows where butter>0 in sales..(so rows 1,4,7 will be
dropped from this dataframe)..Likewise for all items...(sample output
for butter is: (dataset2) CustID??? TripDate??? Store??? Butter
1??? 6-Jan-12??? ? c??? 3
1??? 9-Jan-12??? ? a??? 3
2??? 31-Aug-12? a??? 3
2??? 24-Sep-12? a??? 3 2. In same loop, create new derived variables within each
dataframe for
each item... like create a lag variable for TripDate, create lag
variable for storename in next trip, storename in previous trip etc...
and also # days between trips to each store for each customer...(an
example for Butter dataframe with new derived variables would be...)
Dataset needs to be sorted by CustID, TripDate, Store before creating
derived variables (dataset3)Book1.xlsx CustID??? TripDate??? Store??? Butter???
NextTripstore previoustripstore
daysbetweentrips
1??? 6-Jan-12??? ? c??? 3??? a??? ? ? ? ? ? ? ? -??? ? ? ? -
1??? 9-Jan-12??? ? a??? 3??? -??? ? ? ? ? ? ? ? c??? ? ? ? -
2??? 31-Aug-12? a??? 3??? a??? ? ? ? ? ? ? ? -??? ? ? ? -
2??? 24-Sep-12? a??? 3??? -??? ? ? ? ? ? ? ? a??? ? ? 24 Point of creating
multiple item level dataframes is, i will use them
iteratively as i will perform some regression on these datasets, using
same set of variables each time