Hello list,
I am not sure if the terminology that I am using here is widely used,
however, I provide an example in the hopes that my problem will become
clear. My basic problem is that I am unsure of how to 'constrain' my
model estimates to reproduce the aggregate (by factor levels) observed
dependent variable for a negative.binomial model. I realize this
sounds rather vague, so I provide an example to illustrate:
When working with migration data, it is possible to model migration
flows via a simple Poisson GLM like the following:
> model1 <- glm(flows ~ destination.pop+distance+origin-0, data=migration,
family=poisson())
> model2 <- glm(flows ~ destination.pop+distance+origin-0, data=migration,
family=quasipoisson())
where origin is a factor of origins (1 - n). This has the effect of
'constraining' the predicted flows to reproduce the observed outgoing
flows from each origin:
> sums <- aggregate(migration$flows, by=list(migration$origin), sum)
> sums1 <- aggregate(predict(model1, type='response'),
by=list(migration$origin), sum)
> sums2 <- aggregate(predict(model2, type='response'),
by=list(migration$origin), sum)
which works fine for both poisson and quasipoisson models. However, I
would also like to fit a negative.binomial model to this data to
(again) account for over-dispersion (which may still exist even after
the constraint is imposed), however, the effect of the 'constraint'
factor does not seem to work, due likely to the additional theta
parameter.
> model3 <- glm.nb(flows ~ destination.pop+distance+origin-0,
data=migration)
> sums3 <- aggregate(predict(model3,type='response'),
by=list(migration$origin), sum)
> data.frame(origin=unique(migration$origin), observed=sums$x,
poisson=sums1$x,quasi=sums2$x,negbin=sums3$x)
origin observed poisson quasi negbin
1 1 676308 676308 676308 651113.7
2 2 1155811 1155811 1155811 1141729.4
3 3 1789112 1789112 1789112 1845908.1
4 4 942162 942162 942162 978599.6
5 5 2484387 2484387 2484387 2486435.1
6 6 819222 819222 819222 747022.1
7 7 1237079 1237079 1237079 1405065.0
8 8 1067069 1067069 1067069 963339.5
9 9 2143172 2143172 2143172 2139171.5
Can anyone tell me how I might go about doing this for the
negative.binomial model? Or perhaps better still, an alternative way
of enforcing this 'constraint'?
The following dataset is what I used for the above example (this data
comes from Fotheringham and O?Kelly, 1989. Spatial interaction models:
Formulations and applications. Dordrecht: Kluwer Academic Publishers):
> dput(migration)
structure(list(flows = c(283049L, 87267L, 29877L, 130830L, 21434L,
30287L, 21450L, 72114L, 180048L, 237229L, 60681L, 382565L, 53772L,
64645L, 43749L, 133122L, 79223L, 300345L, 286580L, 346407L, 287340L,
161645L, 97808L, 229764L, 26887L, 67280L, 281791L, 92308L, 49828L,
144980L, 113683L, 165405L, 198144L, 718673L, 551483L, 143860L,
316650L, 199466L, 89806L, 266305L, 17995L, 55094L, 230788L, 49892L,
252189L, 121366L, 25574L, 66324L, 35563L, 93434L, 178517L, 185618L,
192223L, 141679L, 158006L, 252039L, 30528L, 87987L, 172711L,
181868L, 89389L, 27409L, 134229L, 342948L, 110792L, 268458L,
394481L, 274629L, 279739L, 87938L, 289880L, 437255L), distance = c(219L,
1009L, 1514L, 974L, 1268L, 1795L, 2420L, 3174L, 219L, 831L, 1336L,
755L, 1049L, 1576L, 2242L, 2996L, 1009L, 831L, 505L, 1019L, 662L,
933L, 1451L, 2205L, 1514L, 1336L, 505L, 1370L, 888L, 654L, 946L,
1700L, 974L, 755L, 1019L, 1370L, 482L, 1144L, 2278L, 2862L, 1268L,
1049L, 662L, 888L, 484L, 662L, 1795L, 2380L, 1795L, 1576L, 933L,
654L, 1144L, 662L, 1278L, 1779L, 2420L, 2242L, 1451L, 946L, 2278L,
1795L, 1278L, 754L, 3174L, 2996L, 2205L, 1700L, 2862L, 2380L,
1779L, 754L), origin.pop = c(16.2876696349298, 16.2876696349298,
16.2876696349298, 16.2876696349298, 16.2876696349298, 16.2876696349298,
16.2876696349298, 16.2876696349298, 17.4279408399148, 17.4279408399148,
17.4279408399148, 17.4279408399148, 17.4279408399148, 17.4279408399148,
17.4279408399148, 17.4279408399148, 17.5110179983684, 17.5110179983684,
17.5110179983684, 17.5110179983684, 17.5110179983684, 17.5110179983684,
17.5110179983684, 17.5110179983684, 16.6083307371083, 16.6083307371083,
16.6083307371083, 16.6083307371083, 16.6083307371083, 16.6083307371083,
16.6083307371083, 16.6083307371083, 17.2140377110705, 17.2140377110705,
17.2140377110705, 17.2140377110705, 17.2140377110705, 17.2140377110705,
17.2140377110705, 17.2140377110705, 16.3878173980331, 16.3878173980331,
16.3878173980331, 16.3878173980331, 16.3878173980331, 16.3878173980331,
16.3878173980331, 16.3878173980331, 16.761264461712, 16.761264461712,
16.761264461712, 16.761264461712, 16.761264461712, 16.761264461712,
16.761264461712, 16.761264461712, 15.9304398925737, 15.9304398925737,
15.9304398925737, 15.9304398925737, 15.9304398925737, 15.9304398925737,
15.9304398925737, 15.9304398925737, 17.0532473904734, 17.0532473904734,
17.0532473904734, 17.0532473904734, 17.0532473904734, 17.0532473904734,
17.0532473904734, 17.0532473904734), destination.pop = c(17.4279408399148,
17.5110179983684, 16.6083307371083, 17.2140377110705, 16.3878173980331,
16.761264461712, 15.9304398925737, 17.0532473904734, 16.2876696349298,
17.5110179983684, 16.6083307371083, 17.2140377110705, 16.3878173980331,
16.761264461712, 15.9304398925737, 17.0532473904734, 16.2876696349298,
17.4279408399148, 16.6083307371083, 17.2140377110705, 16.3878173980331,
16.761264461712, 15.9304398925737, 17.0532473904734, 16.2876696349298,
17.4279408399148, 17.5110179983684, 17.2140377110705, 16.3878173980331,
16.761264461712, 15.9304398925737, 17.0532473904734, 16.2876696349298,
17.4279408399148, 17.5110179983684, 16.6083307371083, 16.3878173980331,
16.761264461712, 15.9304398925737, 17.0532473904734, 16.2876696349298,
17.4279408399148, 17.5110179983684, 16.6083307371083, 17.2140377110705,
16.761264461712, 15.9304398925737, 17.0532473904734, 16.2876696349298,
17.4279408399148, 17.5110179983684, 16.6083307371083, 17.2140377110705,
16.3878173980331, 15.9304398925737, 17.0532473904734, 16.2876696349298,
17.4279408399148, 17.5110179983684, 16.6083307371083, 17.2140377110705,
16.3878173980331, 16.761264461712, 17.0532473904734, 16.2876696349298,
17.4279408399148, 17.5110179983684, 16.6083307371083, 17.2140377110705,
16.3878173980331, 16.761264461712, 15.9304398925737), destination
structure(c(2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L,
2L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 5L, 6L, 7L, 8L, 9L, 1L,
2L, 3L, 4L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 7L, 8L, 9L, 1L,
2L, 3L, 4L, 5L, 6L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 9L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L), .Label = c("1", "2",
"3", "4", "5",
"6", "7", "8", "9"), class =
"factor"), origin = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L), .Label = c("1", "2",
"3", "4", "5",
"6", "7", "8", "9"), class =
"factor")), .Names = c("flows",
"distance", "origin.pop", "destination.pop",
"destination", "origin"
), row.names = c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 12L, 13L,
14L, 15L, 16L, 17L, 18L, 19L, 20L, 22L, 23L, 24L, 25L, 26L, 27L,
28L, 29L, 30L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 42L,
43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 52L, 53L, 54L, 55L, 56L,
57L, 58L, 59L, 60L, 62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L,
72L, 73L, 74L, 75L, 76L, 77L, 78L, 79L, 80L), class = "data.frame")
Regards,
Carson
--
Carson J. Q. Farmer
ISSP Doctoral Fellow
National Centre for Geocomputation
National University of Ireland, Maynooth,
http://www.carsonfarmer.com/