##--------------------------------------------------------------## ## Script for Lecture 6: ## ## Data and Data Management ## ## John Fox ## ## Introduction to the R Statistical Computing Environment ## ## ICPSR ## ## 2021 ## ##--------------------------------------------------------------## # Data in R packages library("carData") head(Duncan) # from carData, via "lazy data" objects() # Duncan not in workspace data("Animals", package="MASS") search() # MASS package not loaded objects() # Animals in workspace head(Animals) help("Animals", package="MASS") # reading data # entering data at the keyboard (x <- c(1, 2, 3, 4)) # numeric data (names <- c("John", "Sandy", 'Mary')) # character data (v <- c(TRUE, FALSE)) # logical data cooperation <- scan() 49 64 37 52 68 54 61 79 64 29 27 58 52 41 30 40 39 44 34 44 cooperation # must enter empty line to end scan() # alternative using text argument: cooperation <- scan(text=" 49 64 37 52 68 54 61 79 64 29 27 58 52 41 30 40 39 44 34 44 ") cooperation # patterned data rep(5, 3) rep(c(1, 2, 3), 2) rep(1:3, 3:1) rep(c(1, 2, 3), each=2) (condition <- rep(c("public", "anonymous"), c(10, 10))) (condition <- rep(c("public", "anonymous"), each=10)) # equivalent (sex <- rep(rep(c("male", "female"), each=5), 2)) # defining a data frame (data set) (Guyer <- data.frame(cooperation, condition, sex)) Guyer <- data.frame( # equivalent cooperation = c(49, 64, 37, 52, 68, 54, 61, 79, 64, 29, 27, 58, 52, 41, 30, 40, 39, 44, 34, 44), condition = rep(c("public", "anonymous"), c(10, 10)), sex = rep(rep(c("male", "female"), each=5), 2) ) # reading data from a file into a data frame Duncan <- read.table("data/Duncan.txt", header=TRUE) # adjust file location if necessary View(Duncan) str(Duncan) Duncan <- read.table("data/Duncan.txt", header=TRUE, stringsAsFactors=TRUE) str(Duncan) Duncan <- read.table("data/Duncan.txt", header=TRUE) # adjust file location if necessary View(Duncan) file.choose() # returns character string with path to file Duncan <- read.table(file.choose(), header=TRUE) # alternative head(Duncan) # reading data via the clipboard (e.g., from Excel) Duncan <- read.table(pipe("pbpaste"), header=TRUE) # in Windows: Duncan <- read.table("clipboard", header=TRUE) View(Duncan) # reading data from a URL Duncan <- read.table( "http://socserv.mcmaster.ca/jfox/Courses/R/ICPSR/Duncan.txt", header=TRUE, stringsAsFactors=TRUE) head(Duncan) # comma-separated values: Duncan.csv Duncan <- read.csv("data/Duncan.csv") str(Duncan) rownames(Duncan) <- Duncan$occupation Duncan$occupation <- NULL # remove variable head(Duncan) # Reading data from a spreadsheet library("car") Duncan <- Import("data/Duncan.xlsx") str(Duncan) head(Duncan) # Import() also supports importing data from SPSS, SAS, Stata, etc. # see help("Import) # writing data to a file write.table(Duncan, "~/Documents/temp/Duncan.txt") # adjust directory for your system write.table(Duncan, "~/Documents/temp/Duncan.txt", quote=FALSE, sep="\t") # saving an object in "external" format save(Duncan, file="~/Documents/temp/Duncan.RData") # adjust directory for your system remove(Duncan) objects() # no copy of Duncan in workspace load("~/Documents/temp/Duncan.RData") # restores Duncan as side effect objects() head(Duncan) remove(Duncan) saveRDS(Duncan, file="~/Documents/temp/Duncan.RData") Otis <- readRDS("~/Documents/temp/Duncan.RData") # returns data frame objects() head(Otis) remove(Otis) # working with data frames # The search path (on your own time) search() prestige # not accessible on path Duncan[, "prestige"] # but exists in Duncan attach(Duncan) prestige # now accessible search() attach(Prestige) search() # Prestige before Duncan prestige # prestige in Prestige shadows prestige in Duncan Duncan[, "prestige"] # still there! detach(Prestige) search() prestige # now from Duncan -- confusing? Duncan$log.prestige <- log(prestige) head(Duncan) log.prestige # not in attached version of data frame prestige # there -- confusing? Duncan$log.prestige mean(prestige) mean(prestige, trim=0.1) mean <- function(x){ warning("the mean function in the base package is shadowed") sum(x)/length(x) } mean(prestige) # our mean shadows mean in base package mean(prestige, trim=0.1) # fails! remove(mean) mean(prestige, trim=.1) # finds mean in base mean <- mean(prestige) # variable named "mean" -- no problem! mean mean(prestige) remove(mean) detach(Duncan) # avoiding attach() mean(Duncan$prestige) # explicit indexing mean(Duncan[ , "prestige"]) # equivalent, column by name mean(Duncan[ , 4]) # equivalent, column by number (lm(prestige ~ income + education, data=Duncan)) # using the data argument with(Duncan, mean(prestige)) # using with() with(Duncan, lm(prestige ~ income + education)) # better to use data argument Duncan2 <- within(Duncan, { # using within(); note compound statement with {} log.income <- log(income) log.education <- log(education) }) head(Duncan2) head(Duncan) # missing data head(Freedman, 10) # first 10 rows tail(Freedman) # last 6 rows some(Freedman) # 10 randomly sampled rows Freedman$density median(Freedman$density) median(Freedman$density, na.rm=TRUE) # other similar functions: mean, var, sd, etc. with(Freedman, { plot(density, crime) showLabels(density, crime, labels=row.names(Freedman), n=5, method="x") # from car package }) log(c(1, 10, NA, 100), base=10) # NAs propogated with(Freedman, plot(log(density, base=10), crime)) lm(crime ~ log(density, base=10), data=Freedman) # NAs handled by na.action getOption("na.action") # default NA action abline(lm(crime ~ log(density, base=10), data=Freedman), lty="dashed") # dealing directly with NAs (on your own time) good <- with(Freedman, complete.cases(crime, density)) head(good, 20) # first 20 values with(Freedman, # NAs handled by indexing: lines(lowess(log(density[good], base=10), crime[good], f=1.0))) # filtering NAs Freedman.good <- na.omit(Freedman) head(Freedman.good) # first 6 rows dim(Freedman.good) # number of rows and columns dim(Freedman) # testing for NAs NA == c(1, 2, NA, 4) # wrong! is.na(c(1, 2, NA, 4)) sum(is.na(Freedman)) # count of NAs sum(!is.na(Freedman)) # count of valid values objects() remove(good, Freedman.good, Duncan, Duncan2) # numeric variables, character variables, and factors condition is.character(condition) condition <- as.factor(condition) condition remove(cooperation, condition, sex) Guyer$condition is.character(Guyer$condition) summary(Guyer) S(Guyer) # S() from car package Guyer$condition <- as.factor(Guyer$condition) Guyer$sex <- as.factor(Guyer$sex) summary(Guyer) # modifying data (as time permits) Guyer$perc.coop <- 100*Guyer$cooperation/120 # assign to data frame Guyer <- within(Guyer, perc.coop <- 100*cooperation/120) # equivalent head(Guyer) # first 6 rows Guyer$cooperation <- with(Guyer, log(perc.coop/(100 - perc.coop))) # replace Guyer <- within(Guyer, cooperation <- log(perc.coop/(100 - perc.coop))) # equivalent head(Guyer) # matrices, arrays, and lists (on your own time) (A <- matrix(1:12, 3, 4)) # filled column-wise (B <- matrix(c("a","b","c"), 4, 3, byrow=TRUE)) dim(A) dim(B) attributes(A) str(A) # structure of an object (very useful!) str(B) (v <- sample(10, 10)) # permutation of 1:10 dim(v) str(v) vv <- v # make a copy dim(vv) <- c(5, 2) # reshape into a matrix vv is.matrix(v) is.matrix(vv) (array.3 <- array(1:24, c(4,3,2))) # 3D array dim(array.3) (list.1 <- list(mat.1=A, mat.2=B, vec=v)) # a list # indexing (as time permits) # vectors v v[2] # one element v[c(4, 2, 6)] # several elements v[c(4, 2, 4)] # elements may be repeated v[-c(2, 4, 6, 8, 10)] # omitting elements names(v) <- letters[1:10] v names(v) v[c("f", "i", "g")] # indexing by names v < 6 v[v < 6] # logical indexing (vv <- v) # make a copy vv[c(1, 3, 5)] <- c(1, 2, 3) # replacing elements vv vv[c("b", "d", "f", "h", "j")] <- 0 vv remove(vv) # matrices A A[2, 3] A[c(1, 2), 2] A[c(1, 2), c(2, 3)] A[c(1, 2), ] A[c(1, 2), 2, drop=FALSE] # retain column dimension A[ , -c(1, 3)] # delete columns 1 and 3 A[-1, -2] # delete row 1 and column 2 rownames(A) <- c("one", "two", "three") colnames(A) <- c("w", "x", "y", "z") A A[c("one", "two"), c("x", "y")] A[c(TRUE, FALSE, TRUE), ] (AA <- A) AA[1, ] <- 0 AA remove(AA) # lists list.1 list.1[c(2, 3)] list.1[2] # a one-element list str(list.1[2]) list.1[[2]] # a list element str(list.1[[2]]) list.1["mat.1"] # one-element list list.1[["mat.1"]] # list element list.1$mat.1 # list element list.1$mat.1 <- matrix(1, 2, 2) # replacing a list element list.1$title <- "an arbitrary list" # adding an element (causes list to be copied) list.1$mat.2 <- NULL # removing an element list.1 # data frames head(Guyer) Guyer[, 1] # first column Guyer[, "cooperation"] # by name Guyer[c(1, 2), ] # first two rows Guyer[c("1", "2"), "cooperation"] Guyer[-(6:20), ] # remove rows Guyer$sex=="female" & Guyer$condition=="public" # note: & is vectorized Guyer[Guyer$sex=="female" & Guyer$condition=="public", ] # selected rows Guyer$cooperation # a vector (column from data frame) Guyer[["cooperation"]] # equivalent Guyer["cooperation"] # a one-column data frame # large data sets # LS regression set.seed(123456789) # for reproducibility X <- rnorm(100000*100) # 10,000,000 values X <- matrix(X, 100000, 100) # 100,000 rows, 100 columns y <- 10 + as.vector(X %*% rep(1, 100) + rnorm(100000, sd=10)) object.size(X) # ~ 80 MB object.size(y) # < 1 MB # memory.size() # memory used in Mb, Windows only # memory.limit() # Windows only system.time(m <- lm(y ~ X)) # linear least-squares regression head(coef(m)) # first 6 coefficients # logistic regression p <- as.vector(1/(1 + exp(-X %*% rep(0.25, 100)))) summary(p) yy <- rbinom(100000, 1, prob=p) table(yy) system.time(m <- glm(yy ~ X, family=binomial)) # logistic regression head(coef(m)) objects() remove(list=objects()) objects() # try this on your own time (1 million cases): set.seed(123456789) # for reproducibility X <- rnorm(1e6*100) # 100,000,000 values X <- matrix(X, 1e6, 100) # 1,000,000 rows, 100 columns y <- 10 + as.vector(X %*% rep(1, 100) + rnorm(1e6, sd=10)) object.size(X) object.size(y) # memory.size() # memory used in Mb, Windows only # memory.limit() # Windows only system.time(m <- lm(y ~ X)) # linear least-squares regression # < 15 sec. on my iMac Pro head(coef(m)) # first 6 coefficients p <- as.vector(1/(1 + exp(-X %*% rep(0.25, 100)))) summary(p) yy <- rbinom(1e6, 1, prob=p) table(yy) system.time(m <- glm(yy ~ X, family=binomial)) # logistic regression # ~ 1 min. on my iMac Pro head(coef(m)) objects() remove(list=objects()) # alternative: just restart R objects() # Tidyverse examples, with standard R and data.table alternatives library("car") # for brief(), if in a new session library("tidyverse") library("data.table") # a moderately large data set # (originally from # https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv) system.time(FlightsDF <- read.csv("flights14.csv")) # standard R, # assumes that file is in current directory, adjust if necessary brief(FlightsDF) system.time(FlightsTB <- read_csv("flights14.csv")) # tidyverse FlightsTB system.time(FlightsDT <- fread("flights14.csv")) # data.table FlightsDT # selecting cases system.time(DeltaDF <- subset(FlightsDF, carrier == "DL" & distance > 2000)) brief(DeltaDF) system.time(DeltaDF <- FlightsDF[FlightsDF$carrier == "DL" # alternative & FlightsDF$distance > 2000, ]) brief(DeltaDF) system.time(DeltaTB <- filter(FlightsTB, carrier == "DL", distance > 2000)) DeltaTB system.time(DeltaDT <- FlightsDT[carrier == "DL" & distance > 2000]) DeltaDT # selecting variables brief(TimesDF <- subset(FlightsDF, select=c("dep_delay", "arr_delay", "air_time", "carrier"))) brief(TimesDF <- FlightsDF[, c("dep_delay", "arr_delay", "air_time", "carrier")]) # alternative (TimesTB <- select(FlightsTB, dep_delay, arr_delay, air_time, carrier)) (timesDT <- FlightsDT[, .(dep_delay, arr_delay, air_time, carrier)]) # transforming variables brief(DelayDF <- within(FlightsDF, { percent_delay <- round(100*(dep_delay + arr_delay)/air_time) delayed <- ifelse (percent_delay > 0, "yes", "no") })) (DelayTB <- mutate(FlightsTB, percent_delay=round(100*(dep_delay + arr_delay)/air_time), delayed=ifelse(percent_delay > 0, "yes", "no"))) (DelayDT <- within(FlightsDT, { percent_delay <- round(100*(dep_delay + arr_delay)/air_time) delayed <- ifelse (percent_delay > 0, "yes", "no") })) # creating summaries aggregate(percent_delay ~ carrier*origin, data=DelayDF, mean) summarize(group_by(DelayTB, carrier, origin), percent_delay=mean(percent_delay)) DelayDT[, .(percent_delay=mean(percent_delay)), by=.(carrier, origin)] # is a rectangular data set the best data structure for summaries? Tapply(percent_delay ~ carrier*origin, data=DelayDF, mean) # also works with DelayTB & DelayDT # using "pipes" # These examples use the %>% pipe operator from the # Tidyverse magrittr package. There is also a new # |> pipe operator in base R. See ?`%>%` and ?`|>` # (Rene Magritte: "Ceci n'est pas une pipe", # https://en.wikipedia.org/wiki/The_Treachery_of_Images) # how pipes work x <- rnorm(10) # direct function call sort(x) sort(x, decreasing=TRUE) # piping into a function x %>% sort() x %>% sort(decreasing=TRUE) # piping into an argument other than the first TRUE %>% sort(x, decreasing=.) TRUE %>% sort(x, .) # equivalent # application to data management filter(FlightsTB, distance > 1000) %>% select(distance, air_time, carrier) %>% mutate(log_distance=log10(distance), log_airtime=log10(air_time)) %>% group_by(carrier) %>% summarize(log_distance=mean(log_distance), log_artime=mean(log_airtime)) subset(FlightsDF, distance > 1000, select=c("distance", "air_time", "carrier")) %>% within({ log_distance <- log10(distance) log_airtime <- log10(air_time) }) %>% aggregate(cbind(log_distance, log_airtime) ~ carrier, data=., mean) # NB: use of . # solutions without pipes aggregate(cbind(log_distance=log10(distance), log_airtime=log10(air_time)) ~ carrier, data=subset(FlightsDF, distance > 1000), mean) (D_da <- FlightsDT[distance > 1000, .(log_distance=mean(log10(distance)), log_airtime=mean(log10(air_time))), by=carrier]) plot(log_airtime ~ log_distance, data=D_da) summary(mod <- lm(log_airtime ~ log_distance, data=D_da)) abline(mod)