thr3ads.net - R help - [R] Can I index a dataframe with a reference from/to a second dataframe? [Feb 2008]

If this information is useful, please help other people find it:
Share via:

Thompson, David (MNR)

2008-Feb-08 20:17 UTC

[R] Can I index a dataframe with a reference from/to a second dataframe?

Hello,

I am unable to figure out how to code a new column in a data frame based
on an existing column that matches a column in a reference data frame,
in a relational-db fashion. I would like this to maintain a minimum set
of reference tables that may be reused over several similar datasets.

Specifically, I have two data frames as listed below, 'Bos' and
'tree.'
For each case in 'Bos' I want to look up the matching 'spp' code
in
'tree' and insert the associated 'type' code into a new
'type' column in
'Bos' as in:

    # add type and keep factors from reference list
    Bos$type <- tree[as.character(tree$spp)==as.character(Bos$spp),
'type']
    Bos$keep <- tree[tree$spp==Bos$spp, 'keep']

And I know I have seen this before but, can't remember where. I have
filtered through many of the threads referencing
'as.numeric(levels(Bos$spp))[as.integer(Bos$spp)]' handling of factors,
any kind of sql reference I could think of, anything 'ODBC'-ish but, I
think this may be an indexing issue. I am trying to compare elements of
two different sized (list) objects (different type objects even?) and
not cycling through Bos$spp to find matches in tree$spp as expected, . .
. , has this an apply solution?

My data frames:> dput(head(Bos, 30))structure(list(oplt = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), rplt = c(3, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0), tree = c(32, 101, 102, 103, 104, 105, 
106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 
119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129), spp structure(c(10L, 
10L, 12L, 14L, 10L, 10L, 14L, 10L, 15L, 10L, 9L, 3L, 10L, 10L, 
12L, 10L, 13L, 12L, 12L, 10L, 12L, 10L, 10L, 8L, 5L, 2L, 10L, 
2L, 12L, 10L), .Label = c("AW", "BD", "BE",
"BF", "BW", "BY",
"CB", "HE", "IW", "MH", "MR",
"OR", "PO", "SW", "SA"), class
"factor"),
    dbh = c(12.1, 10.1, 63.3, 9, 7.1, 12.1, 13.9, 6.3, 6.1, 7.9, 
    5.1, 9.8, 7.1, 18.7, 44.2, 28.7, 19.8, 28, 46.6, 9, 61.6, 
    3.3, 9.1, 8.7, 5.8, 3.1, 11.1, 12.3, 28, 8.6), cc = structure(c(2L, 
    2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 
    2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L), .Label
c("dom",
    "sup"), class = "factor"), ba = c(114.990145103020,
80.1184666481737, 
    3147.0040469356, 63.6172512351933, 39.5919214168654,
114.990145103020, 
    151.746779150021, 31.1724531052447, 29.2246656600190,
49.0166993776348, 
    20.4282062299676, 75.429639612691, 39.5919214168654,
274.645883758454, 
    1534.38526793979, 646.924613208844, 307.907495978336,
615.7521601036, 
    1705.53923570736, 63.6172512351933, 2980.24045490142,
8.55298599939821, 
    65.0388219109427, 59.4467869875528, 26.4207942166902,
7.54767635024948, 
    96.7689077121996, 118.8228881404, 615.7521601036, 58.0880481648753
    )), .Names = c("oplt", "rplt", "tree",
"spp", "dbh", "cc",
"ba"), row.names = c("1", "2", "3",
"4", "5", "6", "7", "8",
"9", "10", "11", "12", "13",
"14", "15", "16", "17", "18",
"19",
"20", "21", "22", "23", "24",
"25", "26", "27", "28", "29",
"30"
), class = "data.frame")
> dput(tree)structure(list(spp = structure(1:33, .Label = c("AB", "AS",
"AW",
"BD", "BE", "BF", "BW", "BY",
"CA", "CB", "CC", "CE", "DL",
"DP",
"EA", "HE", "IW", "LC", "MH",
"MM", "MR", "OR", "PO", "PR",
"PV",
"PW", "RS", "SA", "SB", "SM",
"SW", "VC", "VL"), class = "factor"),
    spp.orig = structure(c(1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 12L, 
    10L, 13L, 11L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 
    23L, 24L, 25L, 26L, 27L, 28L, 31L, 29L, 30L, 32L, 33L, 34L
    ), .Label = c("AB", "AMEHUM", "AMESPP",
"AW", "BD", "BE",
    "BF", "BW", "BY", "CB",
"CE", "CORALT", "CORCOR", "DIELON",
    "DIRPAL", "EA", "HE", "IW",
"LONCAN", "MH", "MM", "MR",
"OR",
    "PO", "PR", "PRUVIR", "PW",
"RIBSPP", "SB", "SM", "SORAME",
    "SW", "VIBACE", "VIBALN"), class =
"factor"), OPL = structure(c(15L,

    7L, 14L, 29L, 13L, 2L, 9L, 8L, 10L, 23L, 1L, 28L, 11L, 12L, 
    31L, 30L, 17L, 16L, 5L, 6L, 4L, 25L, 22L, 20L, 24L, 21L, 
    26L, 27L, 19L, 3L, 18L, 32L, 33L), .Label = c("HCORCAN", 
    "WABIBAL", "WACEPEN", "WACERUB",
"WACESAS", "WACESPI", "WAMESPP",
    "WBETALL", "WBETPAP", "WCORALT",
"WDIELON", "WDIRPAL", "WFAGGRA",
    "WFRAAME", "WFRANIG", "WLONCAN",
"WOSTVIR", "WPICGLA", "WPICMAR",
    "WPINRES", "WPINSTR", "WPOPTRE",
"WPRUSER", "WPRUVIV", "WQUERUB",
    "WRIBAME", "WSORAME", "WTHUOCC",
"WTILAME", "WTSUCAN", "WULMAME",
    "WVIBACE", "WVIBLAO"), class = "factor"), form
= c(1.1, 1.2,
    1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.2, 1.1, 2, 1.1, 1.2, 1.2, 
    1.1, 1.1, 1.1, 1.2, 1.1, 1.2, 1.1, 1.1, 1.1, 1.1, 1.2, 1.1, 
    1.2, 1.1, 1.1, 1.2, 1.1, 1.2, 1.2), Type = structure(c(2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L), .Label = c("H", "W"), class =
"factor"), keep structure(c(1L,
    1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 
    2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
    1L, 1L), .Label = c("no", "yes"), class =
"factor"), Sname structure(c(15L,
    6L, 14L, 29L, 13L, 1L, 8L, 7L, 9L, 23L, 10L, 28L, 11L, 12L, 
    31L, 30L, 17L, 16L, 4L, 5L, 3L, 25L, 22L, 20L, 24L, 21L, 
    26L, 27L, 19L, 2L, 18L, 32L, 33L), .Label = c("Abies balsamea", 
    "Acer pensylvanicum", "Acer rubrum", "Acer
saccharum", "Acer
spicatum", 
    "Amelanchier", "Betula alleghaniensis", "Betula
papyrifera",
    "Cornus alternifolia", "Cornus canadensis",
"Diervilla lonicera",
    "Dirca palustris", "Fagus grandifolia", "Fraxinus
americana",
    "Fraxinus nigra", "Lonicera canadensis", "Ostrya
virginiana",
    "Picea glauca", "Picea mariana", "Pinus
resinosa", "Pinus strobus",
    "Populus tremuloides", "Prunus serotina", "Prunus
virginiana",
    "Quercus rubra", "Ribes ", "Sorbus americana",
"Thuja occidentalis",

    "Tilia americana", "Tsuga canadensis", "Ulmus
americana",
    "Viburnum acerifolium", "Viburnum lantanoides"), class =
"factor"),
    Cname = structure(c(7L, 27L, 30L, 2L, 3L, 6L, 31L, 33L, 1L, 
    8L, 10L, 15L, 11L, 21L, 4L, 14L, 20L, 17L, 18L, 23L, 25L, 
    24L, 29L, 26L, 12L, 16L, 13L, 5L, 9L, 28L, 32L, 22L, 19L), .Label
c("Alternate-leaved Dogwood",
    "American Basswood", "American Beech", "American
Elm", "American
Mountain-ash", 
    "Balsam Fir", "Black Ash", "Black Cherry",
"Black Spruce",
    "Bunchberry", "Bush Honeysuckle", "Choke
Cherry", "Currant",
    "Eastern Hemlock", "Eastern White Cedar", "Eastern
White Pine",
    "Fly Honeysuckle", "Hard Maple", "Hobblebush",
"Ironwood",
    "Leatherwood", "Maple-leaved Viburnum", "Mountain
Maple",
    "Northern Red Oak", "Red Maple", "Red Pine",
"Serviceberry",
    "Striped Maple", "Trembling Aspen", "White
Ash", "White Birch",
    "White Spruce", "Yellow Birch"), class =
"factor")), .Names c("spp",
"spp.orig", "OPL", "form", "Type",
"keep", "Sname", "Cname"), row.names
= c("1", 
"3", "4", "5", "6", "7",
"8", "9", "10", "11", "12",
"13", "14",
"15", "16", "17", "18", "19",
"20", "21", "22", "23", "24",
"25",
"26", "27", "28", "29", "30",
"31", "32", "33", "34"), class
"data.frame")

Thanks, DaveT.
*************************************
Silviculture Data Analyst
Ontario Forest Research Institute
Ontario Ministry of Natural Resources
david.john.thompson at ontario.ca
http://ofri.mnr.gov.on.ca

jim holtman

2008-Feb-08 23:48 UTC

head link

[R] Can I index a dataframe with a reference from/to a second dataframe?

try this:

Bos$type <- tree$Type[match(Bos$spp, tree$spp)]



On Feb 8, 2008 3:17 PM, Thompson, David (MNR)
<David.John.Thompson at ontario.ca> wrote:> Hello,
>
> I am unable to figure out how to code a new column in a data frame based
> on an existing column that matches a column in a reference data frame,
> in a relational-db fashion. I would like this to maintain a minimum set
> of reference tables that may be reused over several similar datasets.
>
> Specifically, I have two data frames as listed below, 'Bos' and
'tree.'
> For each case in 'Bos' I want to look up the matching 'spp'
code in
> 'tree' and insert the associated 'type' code into a new
'type' column in
> 'Bos' as in:
>
>    # add type and keep factors from reference list
>    Bos$type <- tree[as.character(tree$spp)==as.character(Bos$spp),
> 'type']
>    Bos$keep <- tree[tree$spp==Bos$spp, 'keep']
>
> And I know I have seen this before but, can't remember where. I have
> filtered through many of the threads referencing
> 'as.numeric(levels(Bos$spp))[as.integer(Bos$spp)]' handling of
factors,
> any kind of sql reference I could think of, anything 'ODBC'-ish
but, I
> think this may be an indexing issue. I am trying to compare elements of
> two different sized (list) objects (different type objects even?) and
> not cycling through Bos$spp to find matches in tree$spp as expected, . .
> . , has this an apply solution?
>
> My data frames:
> > dput(head(Bos, 30))
> structure(list(oplt = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), rplt = c(3,
> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> 0, 0, 0, 0, 0, 0, 0, 0), tree = c(32, 101, 102, 103, 104, 105,
> 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
> 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129), spp >
structure(c(10L,
> 10L, 12L, 14L, 10L, 10L, 14L, 10L, 15L, 10L, 9L, 3L, 10L, 10L,
> 12L, 10L, 13L, 12L, 12L, 10L, 12L, 10L, 10L, 8L, 5L, 2L, 10L,
> 2L, 12L, 10L), .Label = c("AW", "BD", "BE",
"BF", "BW", "BY",
> "CB", "HE", "IW", "MH",
"MR", "OR", "PO", "SW", "SA"),
class > "factor"),
>    dbh = c(12.1, 10.1, 63.3, 9, 7.1, 12.1, 13.9, 6.3, 6.1, 7.9,
>    5.1, 9.8, 7.1, 18.7, 44.2, 28.7, 19.8, 28, 46.6, 9, 61.6,
>    3.3, 9.1, 8.7, 5.8, 3.1, 11.1, 12.3, 28, 8.6), cc = structure(c(2L,
>    2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
>    2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L), .Label >
c("dom",
>    "sup"), class = "factor"), ba = c(114.990145103020,
> 80.1184666481737,
>    3147.0040469356, 63.6172512351933, 39.5919214168654,
> 114.990145103020,
>    151.746779150021, 31.1724531052447, 29.2246656600190,
> 49.0166993776348,
>    20.4282062299676, 75.429639612691, 39.5919214168654,
> 274.645883758454,
>    1534.38526793979, 646.924613208844, 307.907495978336,
> 615.7521601036,
>    1705.53923570736, 63.6172512351933, 2980.24045490142,
> 8.55298599939821,
>    65.0388219109427, 59.4467869875528, 26.4207942166902,
> 7.54767635024948,
>    96.7689077121996, 118.8228881404, 615.7521601036, 58.0880481648753
>    )), .Names = c("oplt", "rplt", "tree",
"spp", "dbh", "cc",
> "ba"), row.names = c("1", "2", "3",
"4", "5", "6", "7", "8",
> "9", "10", "11", "12",
"13", "14", "15", "16", "17",
"18", "19",
> "20", "21", "22", "23",
"24", "25", "26", "27", "28",
"29", "30"
> ), class = "data.frame")
>
> > dput(tree)
> structure(list(spp = structure(1:33, .Label = c("AB",
"AS", "AW",
> "BD", "BE", "BF", "BW",
"BY", "CA", "CB", "CC", "CE",
"DL", "DP",
> "EA", "HE", "IW", "LC",
"MH", "MM", "MR", "OR", "PO",
"PR", "PV",
> "PW", "RS", "SA", "SB",
"SM", "SW", "VC", "VL"), class =
"factor"),
>    spp.orig = structure(c(1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 12L,
>    10L, 13L, 11L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L,
>    23L, 24L, 25L, 26L, 27L, 28L, 31L, 29L, 30L, 32L, 33L, 34L
>    ), .Label = c("AB", "AMEHUM", "AMESPP",
"AW", "BD", "BE",
>    "BF", "BW", "BY", "CB",
"CE", "CORALT", "CORCOR", "DIELON",
>    "DIRPAL", "EA", "HE", "IW",
"LONCAN", "MH", "MM", "MR",
"OR",
>    "PO", "PR", "PRUVIR", "PW",
"RIBSPP", "SB", "SM", "SORAME",
>    "SW", "VIBACE", "VIBALN"), class =
"factor"), OPL = structure(c(15L,
>
>    7L, 14L, 29L, 13L, 2L, 9L, 8L, 10L, 23L, 1L, 28L, 11L, 12L,
>    31L, 30L, 17L, 16L, 5L, 6L, 4L, 25L, 22L, 20L, 24L, 21L,
>    26L, 27L, 19L, 3L, 18L, 32L, 33L), .Label = c("HCORCAN",
>    "WABIBAL", "WACEPEN", "WACERUB",
"WACESAS", "WACESPI", "WAMESPP",
>    "WBETALL", "WBETPAP", "WCORALT",
"WDIELON", "WDIRPAL", "WFAGGRA",
>    "WFRAAME", "WFRANIG", "WLONCAN",
"WOSTVIR", "WPICGLA", "WPICMAR",
>    "WPINRES", "WPINSTR", "WPOPTRE",
"WPRUSER", "WPRUVIV", "WQUERUB",
>    "WRIBAME", "WSORAME", "WTHUOCC",
"WTILAME", "WTSUCAN", "WULMAME",
>    "WVIBACE", "WVIBLAO"), class = "factor"),
form = c(1.1, 1.2,
>    1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.2, 1.1, 2, 1.1, 1.2, 1.2,
>    1.1, 1.1, 1.1, 1.2, 1.1, 1.2, 1.1, 1.1, 1.1, 1.1, 1.2, 1.1,
>    1.2, 1.1, 1.1, 1.2, 1.1, 1.2, 1.2), Type = structure(c(2L,
>    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L,
>    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
>    2L, 2L), .Label = c("H", "W"), class =
"factor"), keep > structure(c(1L,
>    1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L,
>    2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
>    1L, 1L), .Label = c("no", "yes"), class =
"factor"), Sname > structure(c(15L,
>    6L, 14L, 29L, 13L, 1L, 8L, 7L, 9L, 23L, 10L, 28L, 11L, 12L,
>    31L, 30L, 17L, 16L, 4L, 5L, 3L, 25L, 22L, 20L, 24L, 21L,
>    26L, 27L, 19L, 2L, 18L, 32L, 33L), .Label = c("Abies
balsamea",
>    "Acer pensylvanicum", "Acer rubrum", "Acer
saccharum", "Acer
> spicatum",
>    "Amelanchier", "Betula alleghaniensis", "Betula
papyrifera",
>    "Cornus alternifolia", "Cornus canadensis",
"Diervilla lonicera",
>    "Dirca palustris", "Fagus grandifolia",
"Fraxinus americana",
>    "Fraxinus nigra", "Lonicera canadensis",
"Ostrya virginiana",
>    "Picea glauca", "Picea mariana", "Pinus
resinosa", "Pinus strobus",
>    "Populus tremuloides", "Prunus serotina",
"Prunus virginiana",
>    "Quercus rubra", "Ribes ", "Sorbus
americana", "Thuja occidentalis",
>
>    "Tilia americana", "Tsuga canadensis", "Ulmus
americana",
>    "Viburnum acerifolium", "Viburnum lantanoides"),
class = "factor"),
>    Cname = structure(c(7L, 27L, 30L, 2L, 3L, 6L, 31L, 33L, 1L,
>    8L, 10L, 15L, 11L, 21L, 4L, 14L, 20L, 17L, 18L, 23L, 25L,
>    24L, 29L, 26L, 12L, 16L, 13L, 5L, 9L, 28L, 32L, 22L, 19L), .Label >
c("Alternate-leaved Dogwood",
>    "American Basswood", "American Beech",
"American Elm", "American
> Mountain-ash",
>    "Balsam Fir", "Black Ash", "Black Cherry",
"Black Spruce",
>    "Bunchberry", "Bush Honeysuckle", "Choke
Cherry", "Currant",
>    "Eastern Hemlock", "Eastern White Cedar",
"Eastern White Pine",
>    "Fly Honeysuckle", "Hard Maple",
"Hobblebush", "Ironwood",
>    "Leatherwood", "Maple-leaved Viburnum",
"Mountain Maple",
>    "Northern Red Oak", "Red Maple", "Red
Pine", "Serviceberry",
>    "Striped Maple", "Trembling Aspen", "White
Ash", "White Birch",
>    "White Spruce", "Yellow Birch"), class =
"factor")), .Names > c("spp",
> "spp.orig", "OPL", "form", "Type",
"keep", "Sname", "Cname"), row.names
> = c("1",
> "3", "4", "5", "6", "7",
"8", "9", "10", "11", "12",
"13", "14",
> "15", "16", "17", "18",
"19", "20", "21", "22", "23",
"24", "25",
> "26", "27", "28", "29",
"30", "31", "32", "33", "34"),
class > "data.frame")
>
> Thanks, DaveT.
> *************************************
> Silviculture Data Analyst
> Ontario Forest Research Institute
> Ontario Ministry of Natural Resources
> david.john.thompson at ontario.ca
> http://ofri.mnr.gov.on.ca
>
> ______________________________________________
> R-help at r-project.org mailing list
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide
http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.
>


-- 
Jim Holtman
Cincinnati, OH
+1 513 646 9390

What is the problem you are trying to solve?

Apparently Analagous Threads

Search for more possibly parallel threads

R help - Feb 2008 - Can I index a dataframe with a reference from/to a second dataframe?

[R] Can I index a dataframe with a reference from/to a second dataframe?

[R] Can I index a dataframe with a reference from/to a second dataframe?

Apparently Analagous Threads