Thompson, David (MNR)
2008-Feb-08 20:17 UTC
[R] Can I index a dataframe with a reference from/to a second dataframe?
Hello, I am unable to figure out how to code a new column in a data frame based on an existing column that matches a column in a reference data frame, in a relational-db fashion. I would like this to maintain a minimum set of reference tables that may be reused over several similar datasets. Specifically, I have two data frames as listed below, 'Bos' and 'tree.' For each case in 'Bos' I want to look up the matching 'spp' code in 'tree' and insert the associated 'type' code into a new 'type' column in 'Bos' as in: # add type and keep factors from reference list Bos$type <- tree[as.character(tree$spp)==as.character(Bos$spp), 'type'] Bos$keep <- tree[tree$spp==Bos$spp, 'keep'] And I know I have seen this before but, can't remember where. I have filtered through many of the threads referencing 'as.numeric(levels(Bos$spp))[as.integer(Bos$spp)]' handling of factors, any kind of sql reference I could think of, anything 'ODBC'-ish but, I think this may be an indexing issue. I am trying to compare elements of two different sized (list) objects (different type objects even?) and not cycling through Bos$spp to find matches in tree$spp as expected, . . . , has this an apply solution? My data frames:> dput(head(Bos, 30))structure(list(oplt = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), rplt = c(3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), tree = c(32, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129), spp structure(c(10L, 10L, 12L, 14L, 10L, 10L, 14L, 10L, 15L, 10L, 9L, 3L, 10L, 10L, 12L, 10L, 13L, 12L, 12L, 10L, 12L, 10L, 10L, 8L, 5L, 2L, 10L, 2L, 12L, 10L), .Label = c("AW", "BD", "BE", "BF", "BW", "BY", "CB", "HE", "IW", "MH", "MR", "OR", "PO", "SW", "SA"), class "factor"), dbh = c(12.1, 10.1, 63.3, 9, 7.1, 12.1, 13.9, 6.3, 6.1, 7.9, 5.1, 9.8, 7.1, 18.7, 44.2, 28.7, 19.8, 28, 46.6, 9, 61.6, 3.3, 9.1, 8.7, 5.8, 3.1, 11.1, 12.3, 28, 8.6), cc = structure(c(2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L), .Label c("dom", "sup"), class = "factor"), ba = c(114.990145103020, 80.1184666481737, 3147.0040469356, 63.6172512351933, 39.5919214168654, 114.990145103020, 151.746779150021, 31.1724531052447, 29.2246656600190, 49.0166993776348, 20.4282062299676, 75.429639612691, 39.5919214168654, 274.645883758454, 1534.38526793979, 646.924613208844, 307.907495978336, 615.7521601036, 1705.53923570736, 63.6172512351933, 2980.24045490142, 8.55298599939821, 65.0388219109427, 59.4467869875528, 26.4207942166902, 7.54767635024948, 96.7689077121996, 118.8228881404, 615.7521601036, 58.0880481648753 )), .Names = c("oplt", "rplt", "tree", "spp", "dbh", "cc", "ba"), row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30" ), class = "data.frame")> dput(tree)structure(list(spp = structure(1:33, .Label = c("AB", "AS", "AW", "BD", "BE", "BF", "BW", "BY", "CA", "CB", "CC", "CE", "DL", "DP", "EA", "HE", "IW", "LC", "MH", "MM", "MR", "OR", "PO", "PR", "PV", "PW", "RS", "SA", "SB", "SM", "SW", "VC", "VL"), class = "factor"), spp.orig = structure(c(1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 12L, 10L, 13L, 11L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 31L, 29L, 30L, 32L, 33L, 34L ), .Label = c("AB", "AMEHUM", "AMESPP", "AW", "BD", "BE", "BF", "BW", "BY", "CB", "CE", "CORALT", "CORCOR", "DIELON", "DIRPAL", "EA", "HE", "IW", "LONCAN", "MH", "MM", "MR", "OR", "PO", "PR", "PRUVIR", "PW", "RIBSPP", "SB", "SM", "SORAME", "SW", "VIBACE", "VIBALN"), class = "factor"), OPL = structure(c(15L, 7L, 14L, 29L, 13L, 2L, 9L, 8L, 10L, 23L, 1L, 28L, 11L, 12L, 31L, 30L, 17L, 16L, 5L, 6L, 4L, 25L, 22L, 20L, 24L, 21L, 26L, 27L, 19L, 3L, 18L, 32L, 33L), .Label = c("HCORCAN", "WABIBAL", "WACEPEN", "WACERUB", "WACESAS", "WACESPI", "WAMESPP", "WBETALL", "WBETPAP", "WCORALT", "WDIELON", "WDIRPAL", "WFAGGRA", "WFRAAME", "WFRANIG", "WLONCAN", "WOSTVIR", "WPICGLA", "WPICMAR", "WPINRES", "WPINSTR", "WPOPTRE", "WPRUSER", "WPRUVIV", "WQUERUB", "WRIBAME", "WSORAME", "WTHUOCC", "WTILAME", "WTSUCAN", "WULMAME", "WVIBACE", "WVIBLAO"), class = "factor"), form = c(1.1, 1.2, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.2, 1.1, 2, 1.1, 1.2, 1.2, 1.1, 1.1, 1.1, 1.2, 1.1, 1.2, 1.1, 1.1, 1.1, 1.1, 1.2, 1.1, 1.2, 1.1, 1.1, 1.2, 1.1, 1.2, 1.2), Type = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("H", "W"), class = "factor"), keep structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L), .Label = c("no", "yes"), class = "factor"), Sname structure(c(15L, 6L, 14L, 29L, 13L, 1L, 8L, 7L, 9L, 23L, 10L, 28L, 11L, 12L, 31L, 30L, 17L, 16L, 4L, 5L, 3L, 25L, 22L, 20L, 24L, 21L, 26L, 27L, 19L, 2L, 18L, 32L, 33L), .Label = c("Abies balsamea", "Acer pensylvanicum", "Acer rubrum", "Acer saccharum", "Acer spicatum", "Amelanchier", "Betula alleghaniensis", "Betula papyrifera", "Cornus alternifolia", "Cornus canadensis", "Diervilla lonicera", "Dirca palustris", "Fagus grandifolia", "Fraxinus americana", "Fraxinus nigra", "Lonicera canadensis", "Ostrya virginiana", "Picea glauca", "Picea mariana", "Pinus resinosa", "Pinus strobus", "Populus tremuloides", "Prunus serotina", "Prunus virginiana", "Quercus rubra", "Ribes ", "Sorbus americana", "Thuja occidentalis", "Tilia americana", "Tsuga canadensis", "Ulmus americana", "Viburnum acerifolium", "Viburnum lantanoides"), class = "factor"), Cname = structure(c(7L, 27L, 30L, 2L, 3L, 6L, 31L, 33L, 1L, 8L, 10L, 15L, 11L, 21L, 4L, 14L, 20L, 17L, 18L, 23L, 25L, 24L, 29L, 26L, 12L, 16L, 13L, 5L, 9L, 28L, 32L, 22L, 19L), .Label c("Alternate-leaved Dogwood", "American Basswood", "American Beech", "American Elm", "American Mountain-ash", "Balsam Fir", "Black Ash", "Black Cherry", "Black Spruce", "Bunchberry", "Bush Honeysuckle", "Choke Cherry", "Currant", "Eastern Hemlock", "Eastern White Cedar", "Eastern White Pine", "Fly Honeysuckle", "Hard Maple", "Hobblebush", "Ironwood", "Leatherwood", "Maple-leaved Viburnum", "Mountain Maple", "Northern Red Oak", "Red Maple", "Red Pine", "Serviceberry", "Striped Maple", "Trembling Aspen", "White Ash", "White Birch", "White Spruce", "Yellow Birch"), class = "factor")), .Names c("spp", "spp.orig", "OPL", "form", "Type", "keep", "Sname", "Cname"), row.names = c("1", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34"), class "data.frame") Thanks, DaveT. ************************************* Silviculture Data Analyst Ontario Forest Research Institute Ontario Ministry of Natural Resources david.john.thompson at ontario.ca http://ofri.mnr.gov.on.ca
jim holtman
2008-Feb-08 23:48 UTC
[R] Can I index a dataframe with a reference from/to a second dataframe?
try this: Bos$type <- tree$Type[match(Bos$spp, tree$spp)] On Feb 8, 2008 3:17 PM, Thompson, David (MNR) <David.John.Thompson at ontario.ca> wrote:> Hello, > > I am unable to figure out how to code a new column in a data frame based > on an existing column that matches a column in a reference data frame, > in a relational-db fashion. I would like this to maintain a minimum set > of reference tables that may be reused over several similar datasets. > > Specifically, I have two data frames as listed below, 'Bos' and 'tree.' > For each case in 'Bos' I want to look up the matching 'spp' code in > 'tree' and insert the associated 'type' code into a new 'type' column in > 'Bos' as in: > > # add type and keep factors from reference list > Bos$type <- tree[as.character(tree$spp)==as.character(Bos$spp), > 'type'] > Bos$keep <- tree[tree$spp==Bos$spp, 'keep'] > > And I know I have seen this before but, can't remember where. I have > filtered through many of the threads referencing > 'as.numeric(levels(Bos$spp))[as.integer(Bos$spp)]' handling of factors, > any kind of sql reference I could think of, anything 'ODBC'-ish but, I > think this may be an indexing issue. I am trying to compare elements of > two different sized (list) objects (different type objects even?) and > not cycling through Bos$spp to find matches in tree$spp as expected, . . > . , has this an apply solution? > > My data frames: > > dput(head(Bos, 30)) > structure(list(oplt = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, > 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), rplt = c(3, > 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > 0, 0, 0, 0, 0, 0, 0, 0), tree = c(32, 101, 102, 103, 104, 105, > 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, > 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129), spp > structure(c(10L, > 10L, 12L, 14L, 10L, 10L, 14L, 10L, 15L, 10L, 9L, 3L, 10L, 10L, > 12L, 10L, 13L, 12L, 12L, 10L, 12L, 10L, 10L, 8L, 5L, 2L, 10L, > 2L, 12L, 10L), .Label = c("AW", "BD", "BE", "BF", "BW", "BY", > "CB", "HE", "IW", "MH", "MR", "OR", "PO", "SW", "SA"), class > "factor"), > dbh = c(12.1, 10.1, 63.3, 9, 7.1, 12.1, 13.9, 6.3, 6.1, 7.9, > 5.1, 9.8, 7.1, 18.7, 44.2, 28.7, 19.8, 28, 46.6, 9, 61.6, > 3.3, 9.1, 8.7, 5.8, 3.1, 11.1, 12.3, 28, 8.6), cc = structure(c(2L, > 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, > 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L), .Label > c("dom", > "sup"), class = "factor"), ba = c(114.990145103020, > 80.1184666481737, > 3147.0040469356, 63.6172512351933, 39.5919214168654, > 114.990145103020, > 151.746779150021, 31.1724531052447, 29.2246656600190, > 49.0166993776348, > 20.4282062299676, 75.429639612691, 39.5919214168654, > 274.645883758454, > 1534.38526793979, 646.924613208844, 307.907495978336, > 615.7521601036, > 1705.53923570736, 63.6172512351933, 2980.24045490142, > 8.55298599939821, > 65.0388219109427, 59.4467869875528, 26.4207942166902, > 7.54767635024948, > 96.7689077121996, 118.8228881404, 615.7521601036, 58.0880481648753 > )), .Names = c("oplt", "rplt", "tree", "spp", "dbh", "cc", > "ba"), row.names = c("1", "2", "3", "4", "5", "6", "7", "8", > "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", > "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30" > ), class = "data.frame") > > > dput(tree) > structure(list(spp = structure(1:33, .Label = c("AB", "AS", "AW", > "BD", "BE", "BF", "BW", "BY", "CA", "CB", "CC", "CE", "DL", "DP", > "EA", "HE", "IW", "LC", "MH", "MM", "MR", "OR", "PO", "PR", "PV", > "PW", "RS", "SA", "SB", "SM", "SW", "VC", "VL"), class = "factor"), > spp.orig = structure(c(1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 12L, > 10L, 13L, 11L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, > 23L, 24L, 25L, 26L, 27L, 28L, 31L, 29L, 30L, 32L, 33L, 34L > ), .Label = c("AB", "AMEHUM", "AMESPP", "AW", "BD", "BE", > "BF", "BW", "BY", "CB", "CE", "CORALT", "CORCOR", "DIELON", > "DIRPAL", "EA", "HE", "IW", "LONCAN", "MH", "MM", "MR", "OR", > "PO", "PR", "PRUVIR", "PW", "RIBSPP", "SB", "SM", "SORAME", > "SW", "VIBACE", "VIBALN"), class = "factor"), OPL = structure(c(15L, > > 7L, 14L, 29L, 13L, 2L, 9L, 8L, 10L, 23L, 1L, 28L, 11L, 12L, > 31L, 30L, 17L, 16L, 5L, 6L, 4L, 25L, 22L, 20L, 24L, 21L, > 26L, 27L, 19L, 3L, 18L, 32L, 33L), .Label = c("HCORCAN", > "WABIBAL", "WACEPEN", "WACERUB", "WACESAS", "WACESPI", "WAMESPP", > "WBETALL", "WBETPAP", "WCORALT", "WDIELON", "WDIRPAL", "WFAGGRA", > "WFRAAME", "WFRANIG", "WLONCAN", "WOSTVIR", "WPICGLA", "WPICMAR", > "WPINRES", "WPINSTR", "WPOPTRE", "WPRUSER", "WPRUVIV", "WQUERUB", > "WRIBAME", "WSORAME", "WTHUOCC", "WTILAME", "WTSUCAN", "WULMAME", > "WVIBACE", "WVIBLAO"), class = "factor"), form = c(1.1, 1.2, > 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.2, 1.1, 2, 1.1, 1.2, 1.2, > 1.1, 1.1, 1.1, 1.2, 1.1, 1.2, 1.1, 1.1, 1.1, 1.1, 1.2, 1.1, > 1.2, 1.1, 1.1, 1.2, 1.1, 1.2, 1.2), Type = structure(c(2L, > 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, > 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, > 2L, 2L), .Label = c("H", "W"), class = "factor"), keep > structure(c(1L, > 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, > 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, > 1L, 1L), .Label = c("no", "yes"), class = "factor"), Sname > structure(c(15L, > 6L, 14L, 29L, 13L, 1L, 8L, 7L, 9L, 23L, 10L, 28L, 11L, 12L, > 31L, 30L, 17L, 16L, 4L, 5L, 3L, 25L, 22L, 20L, 24L, 21L, > 26L, 27L, 19L, 2L, 18L, 32L, 33L), .Label = c("Abies balsamea", > "Acer pensylvanicum", "Acer rubrum", "Acer saccharum", "Acer > spicatum", > "Amelanchier", "Betula alleghaniensis", "Betula papyrifera", > "Cornus alternifolia", "Cornus canadensis", "Diervilla lonicera", > "Dirca palustris", "Fagus grandifolia", "Fraxinus americana", > "Fraxinus nigra", "Lonicera canadensis", "Ostrya virginiana", > "Picea glauca", "Picea mariana", "Pinus resinosa", "Pinus strobus", > "Populus tremuloides", "Prunus serotina", "Prunus virginiana", > "Quercus rubra", "Ribes ", "Sorbus americana", "Thuja occidentalis", > > "Tilia americana", "Tsuga canadensis", "Ulmus americana", > "Viburnum acerifolium", "Viburnum lantanoides"), class = "factor"), > Cname = structure(c(7L, 27L, 30L, 2L, 3L, 6L, 31L, 33L, 1L, > 8L, 10L, 15L, 11L, 21L, 4L, 14L, 20L, 17L, 18L, 23L, 25L, > 24L, 29L, 26L, 12L, 16L, 13L, 5L, 9L, 28L, 32L, 22L, 19L), .Label > c("Alternate-leaved Dogwood", > "American Basswood", "American Beech", "American Elm", "American > Mountain-ash", > "Balsam Fir", "Black Ash", "Black Cherry", "Black Spruce", > "Bunchberry", "Bush Honeysuckle", "Choke Cherry", "Currant", > "Eastern Hemlock", "Eastern White Cedar", "Eastern White Pine", > "Fly Honeysuckle", "Hard Maple", "Hobblebush", "Ironwood", > "Leatherwood", "Maple-leaved Viburnum", "Mountain Maple", > "Northern Red Oak", "Red Maple", "Red Pine", "Serviceberry", > "Striped Maple", "Trembling Aspen", "White Ash", "White Birch", > "White Spruce", "Yellow Birch"), class = "factor")), .Names > c("spp", > "spp.orig", "OPL", "form", "Type", "keep", "Sname", "Cname"), row.names > = c("1", > "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", > "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", > "26", "27", "28", "29", "30", "31", "32", "33", "34"), class > "data.frame") > > Thanks, DaveT. > ************************************* > Silviculture Data Analyst > Ontario Forest Research Institute > Ontario Ministry of Natural Resources > david.john.thompson at ontario.ca > http://ofri.mnr.gov.on.ca > > ______________________________________________ > R-help at r-project.org mailing list > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, reproducible code. >-- Jim Holtman Cincinnati, OH +1 513 646 9390 What is the problem you are trying to solve?
Seemingly Similar Threads
- vector("integer", length) : vector size specified is too large
- Degrees of Freedom Not Allocated to Residuals in Reduced Model
- assign()ing within apply
- Warning messages in Splancs package :: no non-missing arguments to min; returning Inf
- "/" operator in model formula