thr3ads.net - R help - [R] Long to wide format without time variable [Jun 2009]

If this information is useful, please help other people find it:
Share via:

Alan Cohen

2009-Jun-23 21:52 UTC

[R] Long to wide format without time variable

Hi all,

I am trying to convert a data set of physician death codings (each
individual's cause of death is coded by multiple physicians) from long to
wide format, but the "reshape" function doesn't seem to work
because it requires a "time" variable to identify the sequence among
the repeated observations within individuals.  My data set has no order, and
different numbers of physicians code each death, up to 23.  It is also quite
large, so for-loops are very slow, and I'll need to repeat the procedure
multiple times.  So I'm looking for a processor-efficient way to replicate
"reshape" without a time variable.

Thanks in advance for any help you can provide.  A worked example and some code
I've tried are below.  I'm working with R v2.8.1 on Windows XP
Professional.

Cheers,
Alan Cohen

Here's what my data look like now:
> id <- rep(1:5,2)
> COD <-
c("A01","A02","A03","A04","A05","B01","A02","B03","B04","A05")
> MDid <- c(1:6,3,5,7,2)
> data <- as.data.frame(cbind(id,COD,MDid))
> data
   id COD MDid
1   1 A01    1
2   2 A02    2
3   3 A03    3
4   4 A04    4
5   5 A05    5
6   1 B01    6
7   2 A02    3
8   3 B03    5
9   4 B04    7
10  5 A05    2

And here's what I'd like them to look like:
> id2 <- 1:5
> COD.1 <-
c("A01","A02","A03","A04","A05")
> COD.2 <-
c("B01","A02","B03","B04","A05")
> MDid.1 <- 1:5
> MDid.2 <-c(6,3,5,7,2)
> data.wide <- as.data.frame(cbind(id2,COD.1,COD.2,MDid.1,MDid.2))
> data.wide
  id2 COD.1 COD.2 MDid.1 MDid.2
1   1   A01   B01      1      6
2   2   A02   A02      2      3
3   3   A03   B03      3      5
4   4   A04   B04      4      7
5   5   A05   A05      5      2

Here's the for-loop that's very slow (with or without the if-clauses
activated):

# Slow reference implementation: builds a wide matrix `codes` with one row per
# id by subsetting `data` once per id inside a for-loop.
# NOTE(review): the loop reads rpt$ICDCode and rpt$T_Physician_ID, which are not
# columns of the example `data` shown above (id, COD, MDid) - presumably these
# are the column names of the poster's real data set; confirm before running.
ids<-unique(data$id)
ct<-length(ids)
# 11 columns: ID plus an (ICD, Coder) pair for each of at most 5 coders.
codes<-matrix(0,ct,11)
colnames(codes)<-c("ID","ICD1","Coder1","ICD2","Coder2","ICD3","Coder3","ICD4","Coder4","ICD5","Coder5")
# j records the largest number of rows found for any single id.
j<-0
for (i in 1:ct){
  kkk <- ids[i] 
  # All rows of `data` that belong to the current id.
  rpt<-data[data$id==kkk,]
  j<-max(j,nrow(rpt))
  codes[i,1]<-kkk
  # Copy the k-th code and coder of this id into columns 2k and 2k+1.
  codes[i,2]<-rpt$ICDCode[1]
  codes[i,3]<-rpt$T_Physician_ID[1]
  # The commented-out if-clauses would skip the k-th pair when the id has
  # fewer than k rows; they are disabled here.
  #if (nrow(rpt)>=2){
   codes[i,4]<-rpt$ICDCode[2]
   codes[i,5]<-rpt$T_Physician_ID[2] 
    #if (nrow(rpt)>=3) {
     codes[i,6]<-rpt$ICDCode[3]
     codes[i,7]<-rpt$T_Physician_ID[3]
      #if (nrow(rpt)>=4) {
       codes[i,8]<-rpt$ICDCode[4]
       codes[i,9]<-rpt$T_Physician_ID[4]
          #if (nrow(rpt)>=5) {
           codes[i,10]<-rpt$ICDCode[5]
           codes[i,11]<-rpt$T_Physician_ID[5]
#}}}}
}

jim holtman

2009-Jun-23 22:18 UTC

head link

[R] Long to wide format without time variable

This should do it:
> x
   id COD MDid
1   1 A01    1
2   2 A02    2
3   3 A03    3
4   4 A04    4
5   5 A05    5
6   1 B01    6
7   2 A02    3
8   3 B03    5
9   4 B04    7
10  5 A05    2
> # max number of columns
> maxCol <- max(table(x$id))
> # now create the output
> y <- lapply(split(x, x$id), function(.id){
+     .cod <- as.character(.id$COD)
+     .mdid <- .id$MDid
+     length(.cod) <- maxCol
+     length(.mdid) <- maxCol
+     c(.id$id[1], .cod, .mdid)
+ })
> # create the dataframe
> y <- as.data.frame(do.call(rbind, y))
> # add column names
> names(y) <- c('id', paste("COD", seq(maxCol), sep='.'),
+     paste("MDid", seq(maxCol), sep='.'))
> y
  id COD.1 COD.2 MDid.1 MDid.2
1  1   A01   B01      1      6
2  2   A02   A02      2      3
3  3   A03   B03      3      5
4  4   A04   B04      4      7
5  5   A05   A05      5      2
>
>

On Tue, Jun 23, 2009 at 5:52 PM, Alan Cohen
<CohenA@smh.toronto.on.ca>wrote:
> Hi all,
>
> I am trying to convert a data set of physician death codings (each
> individual's cause of death is coded by multiple physicians) from long
to
> wide format, but the "reshape" function doesn't seem to work
because it
> requires a "time" variable to identify the sequence among the
repeated
> observations within individuals.  My data set has no order, and different
> numbers of physicians code each death, up to 23.  It is also quite large,
so
> for-loops are very slow, and I'll need to repeat the procedure multiple
> times.  So I'm looking for a processor-efficient way to replicate
"reshape"
> without a time variable.
>
> Thanks in advance for any help you can provide.  A worked example and some
> code I've tried are below.  I'm working with R v2.8.1 on Windows XP
> Professional.
>
> Cheers,
> Alan Cohen
>
> Here's what my data look like now:
>
> > id <- rep(1:5,2)
> > COD <-
c("A01","A02","A03","A04","A05","B01","A02","B03","B04","A05")
> > MDid <- c(1:6,3,5,7,2)
> > data <- as.data.frame(cbind(id,COD,MDid))
> > data
>   id COD MDid
> 1   1 A01    1
> 2   2 A02    2
> 3   3 A03    3
> 4   4 A04    4
> 5   5 A05    5
> 6   1 B01    6
> 7   2 A02    3
> 8   3 B03    5
> 9   4 B04    7
> 10  5 A05    2
>
> And here's what I'd like them to look like:
>
> > id2 <- 1:5
> > COD.1 <-
c("A01","A02","A03","A04","A05")
> > COD.2 <-
c("B01","A02","B03","B04","A05")
> > MDid.1 <- 1:5
> > MDid.2 <-c(6,3,5,7,2)
> > data.wide <- as.data.frame(cbind(id2,COD.1,COD.2,MDid.1,MDid.2))
> > data.wide
>  id2 COD.1 COD.2 MDid.1 MDid.2
> 1   1   A01   B01      1      6
> 2   2   A02   A02      2      3
> 3   3   A03   B03      3      5
> 4   4   A04   B04      4      7
> 5   5   A05   A05      5      2
>
> Here's the for-loop that's very slow (with or without the
if-clauses
> activated):
>
> ids<-unique(data$id)
> ct<-length(ids)
> codes<-matrix(0,ct,11)
>
>
colnames(codes)<-c("ID","ICD1","Coder1","ICD2","Coder2","ICD3","Coder3","ICD4","Coder4","ICD5","Coder5")
> j<-0
> for (i in 1:ct){
>  kkk <- ids[i]
>  rpt<-data[data$id==kkk,]
>  j<-max(j,nrow(rpt))
>  codes[i,1]<-kkk
>  codes[i,2]<-rpt$ICDCode[1]
>  codes[i,3]<-rpt$T_Physician_ID[1]
>  #if (nrow(rpt)>=2){
>   codes[i,4]<-rpt$ICDCode[2]
>   codes[i,5]<-rpt$T_Physician_ID[2]
>    #if (nrow(rpt)>=3) {
>     codes[i,6]<-rpt$ICDCode[3]
>     codes[i,7]<-rpt$T_Physician_ID[3]
>      #if (nrow(rpt)>=4) {
>       codes[i,8]<-rpt$ICDCode[4]
>       codes[i,9]<-rpt$T_Physician_ID[4]
>          #if (nrow(rpt)>=5) {
>           codes[i,10]<-rpt$ICDCode[5]
>           codes[i,11]<-rpt$T_Physician_ID[5]
> #}}}}
> }
>
> ______________________________________________
> R-help@r-project.org mailing list
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide
>
http://www.R-project.org/posting-guide.html<http://www.r-project.org/posting-guide.html>
> and provide commented, minimal, self-contained, reproducible code.
>


-- 
Jim Holtman
Cincinnati, OH
+1 513 646 9390

What is the problem that you are trying to solve?

	[[alternative HTML version deleted]]

Rolf Turner

2009-Jun-24 02:02 UTC

head link

[R] Long to wide format without time variable

On 24/06/2009, at 9:52 AM, Alan Cohen wrote:
> Hi all,
>
> I am trying to convert a data set of physician death codings (each  
> individual's cause of death is coded by multiple physicians) from  
> long to wide format, but the "reshape" function doesn't seem
to
> work because it requires a "time" variable to identify the
sequence
> among the repeated observations within individuals.  My data set  
> has no order, and different numbers of physicians code each death,  
> up to 23.  It is also quite large, so for-loops are very slow, and  
> I'll need to repeat the procedure multiple times.  So I'm looking  
> for a processor-efficient way to replicate "reshape" without a
time
> variable.
	Basically your data ***should*** have a ``time variable''.  To me
	it looks perilous not to have one.  Since you haven't got one, create
	one:

	# make.time: number the repeated observations within each group of `a`,
	# giving a sequence variable usable as reshape()'s `timevar`.
	#
	# Args:
	#   a: vector of group identifiers (e.g. an id column), one element per row.
	# Returns:
	#   A numeric vector of length(a). Element i is the running count (1, 2, ...)
	#   of a[i] among equal values, in order of appearance. Positions where `a`
	#   is NA are left at 0, because split() drops NA groups. Empty input gives
	#   numeric(0) (the previous 1:length(a) produced c(1, 0) and failed).
	make.time <- function(a) {
		z <- numeric(length(a))
		# Row positions of each group, in order of appearance within the group.
		pos <- split(seq_along(a), a)
		idx <- unlist(pos, use.names = FALSE)
		if (length(idx) > 0L) {
			# Write 1..n_g at the positions of group g; both unlist() calls walk
			# the groups in the same order, so positions and counts line up.
			z[idx] <- unlist(lapply(pos, seq_along), use.names = FALSE)
		}
		z
	}

	Now try the following:

	# Rebuild the example data from the original post (two codings per id).
	id <- rep(1:5, times = 2)
	COD <- c("A01", "A02", "A03", "A04", "A05", "B01", "A02", "B03", "B04", "A05")
	MDid <- c(1:6, 3, 5, 7, 2)
	data <- as.data.frame(cbind(id, COD, MDid))
	# Add the within-id sequence number, then let reshape() use it as the
	# time variable.
	data$time <- make.time(data$id)
	wide <- reshape(
		data,
		timevar = "time",
		v.names = c("COD", "MDid"),
		direction = "wide"
	)

	Except for the order of the columns (which you can easily rearrange  
if it matters,
	which it doesn't) the result appears to be what you want.

		cheers,

			Rolf Turner
> Thanks in advance for any help you can provide.  A worked example  
> and some code I've tried are below.  I'm working with R v2.8.1 on  
> Windows XP Professional.
>
> Cheers,
> Alan Cohen
>
> Here's what my data look like now:
>
>> id <- rep(1:5,2)
>> COD <-
c("A01","A02","A03","A04","A05","B01","A02","B03","B04","A05")
>> MDid <- c(1:6,3,5,7,2)
>> data <- as.data.frame(cbind(id,COD,MDid))
>> data
>    id COD MDid
> 1   1 A01    1
> 2   2 A02    2
> 3   3 A03    3
> 4   4 A04    4
> 5   5 A05    5
> 6   1 B01    6
> 7   2 A02    3
> 8   3 B03    5
> 9   4 B04    7
> 10  5 A05    2
>
> And here's what I'd like them to look like:
>
>> id2 <- 1:5
>> COD.1 <-
c("A01","A02","A03","A04","A05")
>> COD.2 <-
c("B01","A02","B03","B04","A05")
>> MDid.1 <- 1:5
>> MDid.2 <-c(6,3,5,7,2)
>> data.wide <- as.data.frame(cbind(id2,COD.1,COD.2,MDid.1,MDid.2))
>> data.wide
>   id2 COD.1 COD.2 MDid.1 MDid.2
> 1   1   A01   B01      1      6
> 2   2   A02   A02      2      3
> 3   3   A03   B03      3      5
> 4   4   A04   B04      4      7
> 5   5   A05   A05      5      2
>
> Here's the for-loop that's very slow (with or without the if- 
> clauses activated):
>
> ids<-unique(data$id)
> ct<-length(ids)
> codes<-matrix(0,ct,11)
> colnames(codes)<-c 
>
("ID","ICD1","Coder1","ICD2","Coder2","ICD3","Coder3","ICD4","Coder4",
> "ICD5","Coder5")
> j<-0
> for (i in 1:ct){
>   kkk <- ids[i]
>   rpt<-data[data$id==kkk,]
>   j<-max(j,nrow(rpt))
>   codes[i,1]<-kkk
>   codes[i,2]<-rpt$ICDCode[1]
>   codes[i,3]<-rpt$T_Physician_ID[1]
>   #if (nrow(rpt)>=2){
>    codes[i,4]<-rpt$ICDCode[2]
>    codes[i,5]<-rpt$T_Physician_ID[2]
>     #if (nrow(rpt)>=3) {
>      codes[i,6]<-rpt$ICDCode[3]
>      codes[i,7]<-rpt$T_Physician_ID[3]
>       #if (nrow(rpt)>=4) {
>        codes[i,8]<-rpt$ICDCode[4]
>        codes[i,9]<-rpt$T_Physician_ID[4]
>           #if (nrow(rpt)>=5) {
>            codes[i,10]<-rpt$ICDCode[5]
>            codes[i,11]<-rpt$T_Physician_ID[5]
> #}}}}
> }
>
> ______________________________________________
> R-help at r-project.org mailing list
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide http://www.R-project.org/posting- 
> guide.html
> and provide commented, minimal, self-contained, reproducible code.

######################################################################
Attention:\ This e-mail message is privileged and confid...{{dropped:9}}

Apparently Analogous Threads

Search for more seemingly similar threads

R help - Jun 2009 - Long to wide format without time variable

[R] Long to wide format without time variable

[R] Long to wide format without time variable

[R] Long to wide format without time variable

Apparently Analogous Threads