Operate on grouped data in data frames and data sets¶
Description¶
Groups
creates a grouped variant of an object of class “data.frame” or of class
“data.set”, for which methods for with
and within
are defined, so that these
well-known functions can be applied “groupwise”.
Usage¶
# Create an object of class "grouped.data" from a
# data frame or a data set.
Groups(data,by,...)
## S4 method for signature 'data.frame'
Groups(data,by,...)
## S4 method for signature 'data.set'
Groups(data,by,...)
## S4 method for signature 'grouped.data'
Groups(data,by,...)
# Recombine grouped data into a data frame or a data set
recombine(x,...)
## S4 method for signature 'grouped.data.frame'
recombine(x,...)
## S4 method for signature 'grouped.data.set'
recombine(x,...)
# Recombine grouped data and coerce the result appropriately:
## S4 method for signature 'grouped.data'
as.data.frame(x,...)
## S4 method for signature 'grouped.data.frame'
as.data.set(x,row.names=NULL,...)
## S4 method for signature 'grouped.data.set'
as.data.set(x,row.names=NULL,...)
# Methods of the generics "with" and "within" for grouped data
## S4 method for signature 'grouped.data'
with(data,expr,...)
## S4 method for signature 'grouped.data'
within(data,expr,recombine=FALSE,...)
# This is equivalent to with(Groups(data,by),expr,...)
withGroups(data,by,expr,...)
# This is equivalent to within(Groups(data,by),expr,recombine,...)
withinGroups(data,by,expr,recombine=TRUE,...)
Arguments¶
data
-
an object of class “data.frame” or “data.set” if it is an argument to
Groups
, withGroups
, or withinGroups
.
by
-
a formula with the factors the levels of which define the groups.
expr
-
an expression, or several expressions enclosed in curly braces.
recombine
-
a logical vector; should the resulting grouped data be recombined?
x
-
an object of class “grouped.data”.
row.names
-
an optional character vector with row names.
...
-
other arguments, ignored.
Details¶
When applied to a data frame Groups
returns an object with class attributes
“grouped.data.frame”, “grouped.data”, and “data.frame”; when applied to an object with
class “data.set”, it returns an object with class attributes “grouped.data.set”,
“grouped.data”, and “data.set”.
When applied to objects with class attribute “grouped.data”, both the functions
with()
and within()
evaluate expr
separately for each group defined by
Groups
. with()
returns an array composed of the results of expr
, while
within()
returns a modified copy of its data
argument, which will be a
“grouped.data” object (“grouped.data.frame” or “grouped.data.set”), unless the argument
recombine=TRUE
is set.
The expression expr
may contain references to the variables n_
, N_
, and
i_
. n_
is equal to the size of the respective group (the number of rows belonging
to it), while N_
is equal to the total number of observations in all groups. The
variable i_
equals the indices of the rows belonging to the respective group of
observations.
Examples¶
some.data <- data.frame(x=rnorm(n=100))
some.data <- within(some.data,{
f <- factor(rep(1:4,each=25),labels=letters[1:4])
g <- factor(rep(1:5,each=4,5),labels=LETTERS[1:5])
y <- x + rep(1:4,each=25) + 0.75*rep(1:5,each=4,5)
})
# For demonstration purposes, we create an
# 'empty' group:
some.data <- subset(some.data,
f!="a" | g!="C")
some.grouped.data <- Groups(some.data,
~f+g)
# Computing the means of y for each combination of f and g
group.means <- with(some.grouped.data,
mean(y))
group.means
g
f A B C D E
a 1.967418 3.052784 NA 4.486448 4.435203
b 3.459411 3.911713 4.400992 4.978906 6.433718
c 3.701949 4.022867 4.259162 5.696182 6.509524
d 4.573041 5.050737 5.769607 6.776469 7.496301
# Obtaining a groupwise centered variant of y
some.grouped.data <- within(some.grouped.data,{
y.cent <- y - mean(y)
},recombine=FALSE)
# The groupwise centered variable should have zero mean
# within each group
group.means <- with(some.grouped.data,
round(mean(y.cent),15))
group.means
g
f A B C D E
a 0 0 NA 0 0
b 0 0 0 0 0
c 0 0 0 0 0
d 0 0 0 0 0
# The following demonstrates the use of n_, N_, and i_
# An external copy of y
y1 <- some.data$y
group.means.n <- with(some.grouped.data,
c(mean(y), # Group means for y
n_, # Group sizes
sum(y)/n_,# Group means for y
n_/N_, # Relative group sizes
sum(y1)/N_,# NOT the grand mean
sum(y1[i_])/n_)) # Group mean for y1
group.means.n
, , g = A
f
a b c d
mean(y) 1.96741840 3.45941056 3.70194898 4.57304119
n_ 8.00000000 4.00000000 4.00000000 4.00000000
sum(y)/n_ 1.96741840 3.45941056 3.70194898 4.57304119
n_/N_ 0.08333333 0.04166667 0.04166667 0.04166667
sum(y1)/N_ 4.76831099 4.76831099 4.76831099 4.76831099
sum(y1[i_])/n_ 1.96741840 3.45941056 3.70194898 4.57304119
, , g = B
f
a b c d
mean(y) 3.05278434 3.91171267 4.02286727 5.05073728
n_ 5.00000000 7.00000000 4.00000000 4.00000000
sum(y)/n_ 3.05278434 3.91171267 4.02286727 5.05073728
n_/N_ 0.05208333 0.07291667 0.04166667 0.04166667
sum(y1)/N_ 4.76831099 4.76831099 4.76831099 4.76831099
sum(y1[i_])/n_ 3.05278434 3.91171267 4.02286727 5.05073728
, , g = C
f
a b c d
mean(y) NA 4.400992 4.259162 5.76960684
n_ NA 6.000000 6.000000 4.00000000
sum(y)/n_ NA 4.400992 4.259162 5.76960684
n_/N_ NA 0.062500 0.062500 0.04166667
sum(y1)/N_ NA 4.768311 4.768311 4.76831099
sum(y1[i_])/n_ NA 4.400992 4.259162 5.76960684
, , g = D
f
a b c d
mean(y) 4.48644766 4.97890616 5.69618230 6.77646924
n_ 4.00000000 4.00000000 7.00000000 5.00000000
sum(y)/n_ 4.48644766 4.97890616 5.69618230 6.77646924
n_/N_ 0.04166667 0.04166667 0.07291667 0.05208333
sum(y1)/N_ 4.76831099 4.76831099 4.76831099 4.76831099
sum(y1[i_])/n_ 4.48644766 4.97890616 5.69618230 6.77646924
, , g = E
f
a b c d
mean(y) 4.43520348 6.43371812 6.50952368 7.49630102
n_ 4.00000000 4.00000000 4.00000000 8.00000000
sum(y)/n_ 4.43520348 6.43371812 6.50952368 7.49630102
n_/N_ 0.04166667 0.04166667 0.04166667 0.08333333
sum(y1)/N_ 4.76831099 4.76831099 4.76831099 4.76831099
sum(y1[i_])/n_ 4.43520348 6.43371812 6.50952368 7.49630102
# Names can be attached to the groupwise results
with(some.grouped.data,
c(Centered=round(mean(y.cent),15),
Uncentered=mean(y)))
, , g = A
f
a b c d
Centered 0.000000 0.000000 0.000000 0.000000
Uncentered 1.967418 3.459411 3.701949 4.573041
, , g = B
f
a b c d
Centered 0.000000 0.000000 0.000000 0.000000
Uncentered 3.052784 3.911713 4.022867 5.050737
, , g = C
f
a b c d
Centered NA 0.000000 0.000000 0.000000
Uncentered NA 4.400992 4.259162 5.769607
, , g = D
f
a b c d
Centered 0.000000 0.000000 0.000000 0.000000
Uncentered 4.486448 4.978906 5.696182 6.776469
, , g = E
f
a b c d
Centered 0.000000 0.000000 0.000000 0.000000
Uncentered 4.435203 6.433718 6.509524 7.496301
some.data.ungrouped <- recombine(some.grouped.data)
str(some.data.ungrouped)
'data.frame': 96 obs. of 5 variables:
$ x : num 0.5168 -0.0481 -1.2802 -0.4897 1.2877 ...
$ y : num 2.27 1.7 0.47 1.26 3.79 ...
$ g : Factor w/ 5 levels "A","B","C","D",..: 1 1 1 1 2 2 2 2 4 4 ...
$ f : Factor w/ 4 levels "a","b","c","d": 1 1 1 1 1 1 1 1 1 1 ...
$ y.cent: num 0.299 -0.266 -1.498 -0.707 0.735 ...
# It all works with "data.set" objects
some.dataset <- as.data.set(some.data)
some.grouped.dataset <- Groups(some.dataset,~f+g)
with(some.grouped.dataset,
c(Mean=mean(y),
Variance=var(y)))
, , g = A
f
a b c d
Mean 1.9674184 3.4594106 3.701949 4.5730412
Variance 0.8078139 0.4090148 3.456480 0.6756695
, , g = B
f
a b c d
Mean 3.0527843 3.9117127 4.0228673 5.050737
Variance 0.3902132 0.4758577 0.9795298 3.503273
, , g = C
f
a b c d
Mean NA 4.4009918 4.2591618 5.7696068
Variance NA 0.7874412 0.4159708 0.6207613
, , g = D
f
a b c d
Mean 4.486448 4.9789062 5.696182 6.776469
Variance 1.253727 0.2293157 2.132682 1.992310
, , g = E
f
a b c d
Mean 4.435203 6.4337181 6.5095237 7.4963010
Variance 1.053002 0.6742578 0.5767678 0.8263415
# The following two expressions are equivalent:
with(Groups(some.data,~f+g),mean(y))
g
f A B C D E
a 1.967418 3.052784 NA 4.486448 4.435203
b 3.459411 3.911713 4.400992 4.978906 6.433718
c 3.701949 4.022867 4.259162 5.696182 6.509524
d 4.573041 5.050737 5.769607 6.776469 7.496301
withGroups(some.data,~f+g,mean(y))
g
f A B C D E
a 1.967418 3.052784 NA 4.486448 4.435203
b 3.459411 3.911713 4.400992 4.978906 6.433718
c 3.701949 4.022867 4.259162 5.696182 6.509524
d 4.573041 5.050737 5.769607 6.776469 7.496301
# The following two expressions are equivalent:
some.data <- within(Groups(some.data,~f+g),{
y.cent <- y - mean(y)
y.cent.1 <- y - sum(y)/n_
})
some.data <- withinGroups(some.data,~f+g,{
y.cent <- y - mean(y)
y.cent.1 <- y - sum(y)/n_
})
# Both variants of groupwise centered variables should
# have zero groupwise means:
withGroups(some.data,~f+g,{
c(round(mean(y.cent),15),
round(mean(y.cent.1),15))
})
, , g = A
f
a b c d
1 0 0 0 0
2 0 0 0 0
, , g = B
f
a b c d
1 0 0 0 0
2 0 0 0 0
, , g = C
f
a b c d
1 NA 0 0 0
2 NA 0 0 0
, , g = D
f
a b c d
1 0 0 0 0
2 0 0 0 0
, , g = E
f
a b c d
1 0 0 0 0
2 0 0 0 0