#!/bin/sh
# This is a shell archive (produced by shar 3.49)
# To extract the files from this archive, save it to a file, remove
# everything above the "!/bin/sh" line above, and type "sh file_name".
#
# made 02/17/1995 15:16 UTC by feh@biostat
# Source directory /usr/local/slibrary/local
#
# existing files will NOT be overwritten unless -c is specified
#
# This shar contains:
# length mode name
# ------ ---------- ------------------------------------------
# 3975 -rw-r--r-- impute.README
# 3342 -rw-r--r-- impute.s
# 382 -rw-r--r-- in.operator.s
# 3979 -rw-r--r-- .Data/.Help/impute
# 1087 -rw-r--r-- .Data/.Help/%in%
#
# ============= impute.README ==============
if test -f 'impute.README' -a X"$1" != X"-c"; then
echo 'x - skipping impute.README (File already exists)'
else
echo 'x - extracting impute.README (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'impute.README' &&
impute Class and methods for imputation of missing values
X Works with the describe function (counts no. of imputed values)
X and the transcan function (develops imputation models predicting
X each predictor variable from all other predictors).
X
The S and S-Plus na.action mechanism in the statistical modeling language
provide a general framework for handling missing data while modeling the
response variable. If the analyst wants to impute NAs to avoid certain biases
as well as to avoid inflated variances, both caused by deleting observations
containing NAs, there are some drawbacks to this approach: (1) imputation
rules sometimes use variables that are not used in predicting the response
variable, (2) the way in which the various predictors need to have NAs
imputed may need to be customized differently for different predictors, and
even more importantly (3) development of the imputation models may require
computationally intensive steps which the analyst may not want to repeat for
each tentative response model. As an example of point (2), one may want to
impute a polytomous factor using the most frequent category, one continuous
laboratory measurement using a pre-specified "normal" value, another
continuous variable using the median of non-NAs, and a set of other
predictors imputed using customized regression models for each (as are
fitted automatically by transcan).
X
The following examples demonstrate simple imputation using impute()
and the use of customized imputation models using transcan() with impute().
X
> age <- c(1,2,NA,4)
> age.i <- impute(age) #Default imputed values = median
# Could have used impute(age,2.5), impute(age,mean), impute(age,"random")
X
> age.i
X 1 2 3 4
X 1 2 2* 4
>summary(age.i)
X 1 values imputed to 2
X
X Min. 1st Qu. Median Mean 3rd Qu. Max.
X 1 1.75 2 2.25 2.5 4
> describe(age.i)
X age.i
X n missing imputed unique Mean
X 4 0 1 3 2.25
X
1 (1, 25%), 2 (2, 50%), 4 (1, 25%)
> is.imputed(age.i)
[1] F F T F
X
#Use transcan to develop 4 regression models for customized
#imputation. Fit possibly non-monotonic transformations for
#age and blood.pressure (for both imputing other variables and
#for being imputed themselves).
X
x <- cbind(age, disease, blood.pressure, pH)
#cbind will convert factor object `disease' to integer
par(mfrow=c(2,2)) #transcan will plot 4 transformations
x.trans <- transcan(x, categorical="disease", asis="pH", imputed=T)
#Now replace NAs in original variables with imputed values, if not
#using transcan's transformations of predictors
age <- impute(x.trans, age)
disease <- impute(x.trans, disease)
blood.pressure <- impute(x.trans, blood.pressure)
pH <- impute(x.trans, pH)
summary(pH) #uses summary.impute to tell about imputations
X #and summary.default to tell about pH overall
#The following code shows how to apply old imputation rules to new data
xnew <- cbind(....)
w <- predict(x.trans, xnew, type="original")
age <- w[,"age"] #inserts imputed values
blood.pressure <- w[,"blood.pressure"]
X
X
X
To install the program and help file, place this file in your local library
directory $SHOME/library/local. (Type !echo $SHOME while under S if you
need to find out what $SHOME is). Then type sh fff, where fff is the name
you stored this shar file under. This will create these files:
impute.README, impute.s, in.operator.s, .Data/.Help/impute, .Data/.Help/%in%.
While under S or S-Plus in the /local area, source("impute.s"). This will
install the following functions: impute (generic method), impute.default,
print.impute, summary.impute, [.impute, is.imputed, and %in%.
X
COPYRIGHT NOTICE
X
You may distribute these functions freely as long as you do so without
financial gain, that you acknowledge the source, and that you
communicate improvements to the author.
X
The author is willing to help with problems. Send E-mail to
feh@biostat.mc.duke.edu.
X
SHAR_EOF
chmod 0644 impute.README ||
echo 'restore of impute.README failed'
Wc_c="`wc -c < 'impute.README'`"
test 3975 -eq "$Wc_c" ||
echo 'impute.README: original size 3975, current size' "$Wc_c"
fi
# ============= impute.s ==============
if test -f 'impute.s' -a X"$1" != X"-c"; then
echo 'x - skipping impute.s (File already exists)'
else
echo 'x - extracting impute.s (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'impute.s' &&
impute <- function(object, ...) UseMethod("impute")
X
X
impute.default <- function(x, fun=median)
{
X m <- is.na(x)
X k <- sum(m)
X if(k==0) return(x)
X
X nam <- names(x)
X if(is.null(nam)) {nam <- as.character(1:length(x)); names(x) <- nam}
X
X if(!is.function(fun))
X {
X fill <- fun
X if(is.character(fill) && fill=="random")
X fill <- sample(x[!is.na(x)], sum(is.na(x)), replace=T)
X }
X
X else {
X
X if(is.factor(x))
X {
X freq <- table(x)
X fill <- names(freq)[freq==max(freq)][1] #take first if not unique
X }
X else fill <- fun(x[!m]) }
X
X if(length(fill)>1 && length(fill)!=k)
X stop("length of vector of imputed values != no. NAs in x")
X
# lab <- label(x)
# if(is.null(lab) || lab=="") lab <- name
# lab <- paste(lab,"with",sum(m),"NAs imputed to",format(fill))
# attr(x, "label") <- lab
X if(is.factor(x))
X {
X newlev <- sort(unique(fill))
X if(any(!(z <- newlev %in% levels(x))))
X {
X xc <- as.character(x)
X xc[m] <- fill
X x <- factor(xc, c(levels(x), newlev[!z]))
X }
X else x[m] <- fill
X }
X else x[m] <- fill
X attr(x, "imputed") <- (1:length(x))[m]
X class(x) <- c("impute", class(x))
X
X x
}
X
print.impute <- function(object, ...)
{
X i <- attr(object,"imputed")
X if(is.null(i)) {print.default(object); return(invisible())}
X if(is.factor(object)) w <- as.character(object)
X else w <- format(object)
X names(w) <- names(object)
X w[i] <- paste(w[i], "*", sep="")
X attr(w, "label") <- attr(w,"imputed") <- attr(w, "class") <- NULL
X print.default(w, quote=F)
X invisible()
}
X
summary.impute <- function(object, ...)
{
X i <- attr(object, "imputed")
X oi <- object
X class(oi) <- class(oi)[class(oi)!="impute"]
X oi <- oi[i]
X if(all(oi==oi[1])) cat("\n",length(i),"values imputed to",
X if(is.numeric(oi)) format(oi[1]) else as.character(oi[1]),"\n\n")
X else
X {
X cat("\nImputed Values:\n\n")
X if(length(i)<20) print(oi)
X else describe(oi, descript=as.character(substitute(object)))
X cat("\n")
X }
NextMethod("summary")
}
X
"[.impute" <- function(x, ..., drop=F)
{
X ats <- attributes(x)
X ats$dimnames <- NULL
X ats$dim <- NULL
X ats$names <- NULL
X class(x) <- NULL
X y <- x[..., drop = drop]
X if(length(y)==0) return(y)
X k <- 1:length(x); names(k) <- names(x)
X k <- k[...]
X attributes(y) <- c(attributes(y), ats)
X imp <- attr(y, "imputed")
X attr(y, "imputed") <- j <- (1:length(k))[k %in% imp]
X if(length(j)==0)
X {
X attr(y, "imputed") <- NULL
X class(y) <- class(y)[class(y)!="impute"]
X if(length(class(y))==0) class(y) <- NULL
X }
X y
}
X
is.imputed <- function(x)
{
X w <- rep(F, length(x))
X if(!is.null(z <- attr(x,"imputed"))) w[z] <- T
X w
}
X
as.data.frame.impute <- function(x, row.names = NULL, optional = F)
{
X nrows <- length(x)
X if(is.null(row.names)) {
# the next line is not needed for the 1993 version of data.class and is
# included for compatibility with 1992 version
X if(length(row.names <- names(x)) == nrows && !any(duplicated(
X row.names))) {
X }
X else if(optional)
X row.names <- character(nrows)
X else row.names <- as.character(1:nrows)
X }
X value <- list(x)
X if(!optional)
X names(value) <- deparse(substitute(x))[[1]]
X attr(value, "row.names") <- row.names
X class(value) <- "data.frame"
X value
}
SHAR_EOF
chmod 0644 impute.s ||
echo 'restore of impute.s failed'
Wc_c="`wc -c < 'impute.s'`"
test 3342 -eq "$Wc_c" ||
echo 'impute.s: original size 3342, current size' "$Wc_c"
fi
# ============= in.operator.s ==============
if test -f 'in.operator.s' -a X"$1" != X"-c"; then
echo 'x - skipping in.operator.s (File already exists)'
else
echo 'x - extracting in.operator.s (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'in.operator.s' &&
"%in%" <- function(a,b) {
X
if(is.factor(a) & is.numeric(b)) {
X warning("a is factor, b is numeric. Assuming b is coded factor values")
X a <- unclass(a) }
X
else if(is.numeric(a) && is.factor(b)) {
X warning("a is numeric, b is factor. Assuming a is coded factor values")
X b <- unclass(b) }
X
match(a, b, nomatch=0) > 0
X }
X
X
"%nin%" <- function(a, b) ! (a %in% b)
SHAR_EOF
chmod 0644 in.operator.s ||
echo 'restore of in.operator.s failed'
Wc_c="`wc -c < 'in.operator.s'`"
test 382 -eq "$Wc_c" ||
echo 'in.operator.s: original size 382, current size' "$Wc_c"
fi
# ============= .Data/.Help/impute ==============
if test ! -d '.Data'; then
echo 'x - creating directory .Data'
mkdir '.Data'
fi
if test ! -d '.Data/.Help'; then
echo 'x - creating directory .Data/.Help'
mkdir '.Data/.Help'
fi
if test -f '.Data/.Help/impute' -a X"$1" != X"-c"; then
echo 'x - skipping .Data/.Help/impute (File already exists)'
else
echo 'x - extracting .Data/.Help/impute (Text)'
sed 's/^X//' << 'SHAR_EOF' > '.Data/.Help/impute' &&
.tr @@
.BG
.FN impute
.FN print.impute
.FN summary.impute
.FN [.impute
.FN is.imputed
.TL
Generic Functions and Methods for Imputation
.DN
These functions do simple and `transcan'
imputation and print, summarize, and subscript
variables that have NAs filled-in with imputed values. The simple
imputation method involves filling in NAs with constants,
with a specified single-valued function of the non-NAs, or from
a sample (with replacement) from the non-NA values (this is useful
in multiple imputation).
More complex imputations can be done
with the `transcan' function, which also works with the generic methods
shown here, i.e., `impute' can take a `transcan' object and use the
imputed values created by `transcan' (with `imputed=T') to fill-in NAs.
The `print' method places * after variable values that were imputed.
The `summary' method summarizes all imputed values and then uses
the next `summary' method available for the variable.
The subscript method preserves attributes of the variable and subsets
the list of imputed values corresponding with how the variable was
subsetted. The `is.imputed' function is for checking if observations
are imputed.
.CS
impute(object, fun=median) #default method
impute(object, impute.value) #alternative default method
impute(object, "random") #use random sample of non-NAs
impute(transobj, var, name) #for transcan object
print(object)
summary(object)
object[...]
is.imputed(object)
.RA
.AG object
a vector or an object created by `transcan'
.OA
.AG ...
For the default (simple) imputation method `impute.default', the first
argument is a vector needing some imputation. If there are no NAs,
the original vector is returned unchanged. For `impute.default', a
second argument `fun' and third argument `name' may be provided.
`fun' is the name of a function to use in computing the (single)
imputed value from the non-NAs. The default is `median'.
If instead of specifying a function as `fun', a single value or vector
(numeric, or character if `object' is a factor) is specified,
those values are used for insertion. `fun' can also be the character
string `"random"' to draw random values for imputation, with the random
values not forced to be the same if there are multiple NAs.
For a vector of constants, the vector must be of length one
(indicating the same value replaces all NAs) or must be as long as
the number of NAs, in which case the values correspond to consecutive NAs
to replace. For a factor `object', constants for imputation may include
character values not in the current levels of `object'. In that
case new levels are added.
When `object' is created by `transcan', the second argument, `var', is the
vector in which to insert imputed values, and the third argument, `name' is
the (optional) character string containing the corresponding name
of the variable as it was used by `transcan'. The default name
is the character rendition of the name of the second argument.
If `object' is of class `"factor"', `fun' is ignored and the
most frequent category is used for imputation.
.RT
a vector with class `"impute"' placed in front of existing classes.
For `is.imputed', a vector of logical values is returned (all `T' if
`object' is not of class `impute').
.SH AUTHOR
Frank Harrell
.sp 0
Division of Biometry
.sp 0
Duke University Medical Center
.sp 0
feh@biostat.mc.duke.edu
.SA
`transcan', `impute.transcan', `describe', `na.include', `sample'
.EX
> age <- c(1,2,NA,4)
> age.i <- impute(age)
# Could have used impute(age,2.5), impute(age,mean), impute(age,"random")
> age.i
X 1 2 3 4
X 1 2 2* 4
>summary(age.i)
X 1 values imputed to 2
X
X Min. 1st Qu. Median Mean 3rd Qu. Max.
X 1 1.75 2 2.25 2.5 4
> describe(age.i)
X age.i
X n missing imputed unique Mean
X 4 0 1 3 2.25
X
1 (1, 25%), 2 (2, 50%), 4 (1, 25%)
> is.imputed(age.i)
[1] F F T F
.KW Methods and Generic Functions
.KW Mathematical Operations
.KW Statistical Inference
.KW Statistical Models
.WR
SHAR_EOF
chmod 0644 .Data/.Help/impute ||
echo 'restore of .Data/.Help/impute failed'
Wc_c="`wc -c < '.Data/.Help/impute'`"
test 3979 -eq "$Wc_c" ||
echo '.Data/.Help/impute: original size 3979, current size' "$Wc_c"
fi
# ============= .Data/.Help/%in% ==============
if test -f '.Data/.Help/%in%' -a X"$1" != X"-c"; then
echo 'x - skipping .Data/.Help/%in% (File already exists)'
else
echo 'x - extracting .Data/.Help/%in% (Text)'
sed 's/^X//' << 'SHAR_EOF' > '.Data/.Help/%in%' &&
.tr @@
.BG
.FN %in%
.FN %nin%
.TL
Find Matching (or Non-Matching) Elements
.DN
For two vectors or scalars `a' and `b',
`a %in% b' returns a vector of logical values corresponding to
the elements in `a'. A value is `T' if that element of `a' is
found somewhere in `b', `F' otherwise. If `a' is a factor object
and `b' is numeric, converts `a' to its integer codes before
the comparison. If `a' is numeric and `b' is `factor', converts
`b' to codes. `%nin%' returns `T' if the element in `a' is not in `b'.
.CS
a %in% b
a %nin% b
.RA
.AG a
a vector (numeric, character, factor)
.AG b
a vector (numeric, character, factor), matching the mode of a
.RT
vector of logical values with length equal to length of `a'.
.SA
`match'
.EX
X w <- factor(c("a","b","c"))
X w %in% c("b","c")
X w %in% c(2,3) #same as previous, but with warning
X
#Suppose that a variable x has levels "a", "b1", "b2" and you
#want to classify levels "b1" and "b2" as just "b". You can use
#the %in% operator and do the following:
X
X levels(x)[levels(x) %in% c("b1","b2")] <- "b"
.KW manip
.KW character
.WR
SHAR_EOF
chmod 0644 .Data/.Help/%in% ||
echo 'restore of .Data/.Help/%in% failed'
Wc_c="`wc -c < '.Data/.Help/%in%'`"
test 1087 -eq "$Wc_c" ||
echo '.Data/.Help/%in%: original size 1087, current size' "$Wc_c"
fi
exit 0