[R] First Section Revision

Test whether you understand all of the code below:

# Set, then report, the working directory.
# NOTE(review): setwd() needs a path argument, e.g. setwd("path/to/project");
# called with no argument it stops with an error. Left as a placeholder.
setwd()

getwd()

# Build a character vector with c(). The name, the assignment operator and
# the value must form one expression: a line break right after the name ends
# the expression early and leaves the next line starting with an operator,
# which is a syntax error.
diabetes <- c(
  "Type1", "Type2", "Type1", "Type2", "Type2",
  "Type2", "Type1", "Type1", "Type2", "Type1"
)

# Package names are passed to install.packages() as quoted strings.
install.packages("tidyverse")

library(dplyr)

# Remove 'stats' from the search path (demonstrates detach()).
detach("package:stats", unload = TRUE)

# Attach it again: sd() and shapiro.test(), used further down, are provided
# by 'stats' and are not found while the package is detached.
library(stats)

# Load the built-in USArrests data frame and inspect it.
data("USArrests")

head(USArrests)

summary(USArrests)

sd(USArrests$Murder)

View(USArrests)

# NOTE(review): UCBAdmissions is already a cross-tabulated table object, so
# table() here tabulates its cell values rather than raw observations;
# confirm this is the intended demonstration.
table(UCBAdmissions)

str(UCBAdmissions)

# gss_cat is a dataset shipped with the forcats package. Only dplyr has been
# attached with library(), so the dataset is referenced through its namespace.
levels(forcats::gss_cat$rincome)

# Shapiro-Wilk normality test on the Murder column.
shapiro.test(USArrests$Murder)

# NOTE(review): load() restores R objects written by save(); the file named
# here is a Stata .dta file, which load() is not expected to read. The
# read_dta() call below is the matching reader. Confirm whether this line
# should stay in the exercise.
load("E:/RGEB/2015 Millennium Cohort Study/mcs.dta")

library(haven)

# Read the Stata file into a data frame (path is relative to the working
# directory).
mcs <- read_dta("2015 Millennium Cohort Study/mcs.dta")

# Turn the 'mths' column into a factor stored as 'math'.
mcs$math <- as.factor(mcs$mths)

# recode() takes the vector to recode as its first argument; the original
# call omitted it and passed only the old = new pairs.
mcs$math <- recode(
  mcs$math,
  "1" = "Strongly Disagree",
  "2" = "Disagree",
  "3" = "Agree",
  "4" = "Strongly Agree"
)

# Same recoding written to a second column.
mcs$science <- recode(
  mcs$math,
  "1" = "Strongly Disagree",
  "2" = "Disagree",
  "3" = "Agree",
  "4" = "Strongly Agree"
)

# Derive birth year as 2014 minus the 'age' column.
gfk_cleaned_eul$birthyear <- 2014 - gfk_cleaned_eul$age

# Bin birth year into five intervals. The label strings are arbitrary names
# for the intervals; they are matched verbatim by the recode() call below.
gfk_cleaned_eul$birthyear_cat <- cut(
  gfk_cleaned_eul$birthyear,
  breaks = c(-Inf, 1945, 1965, 1985, 1997, Inf),
  labels = c(
    "(-Inf,1945]", "(1945,1964]", "(1965,1984]", "(1985,1996]", "(1997,Inf]"
  )
)

# Replace the interval labels with generation names.
gfk_cleaned_eul$birthyear_cat <- recode(
  gfk_cleaned_eul$birthyear_cat,
  "(-Inf,1945]" = "born in 1945 or before",
  "(1945,1964]" = "Boomers",
  "(1965,1984]" = "GenX",
  "(1985,1996]" = "Millenium",
  "(1997,Inf]" = "GenZ"
)

# read_excel() is provided by readxl, which no earlier line attaches.
library(readxl)

gfk_excel_version <- read_excel("gfk_excel_version.xls")

# Treat the answer "Refused" as missing.
gfk_excel_version$hhincome <- na_if(gfk_excel_version$hhincome, "Refused")

# Convert to a factor with an explicit level order (highest bracket first).
# Values that do not appear in 'levels' become NA.
# NOTE(review): "25 000 - 29 999" uses spaces where every other bracket uses
# commas; kept as written, confirm it matches the spelling in the data.
gfk_excel_version$hhincome <- factor(
  gfk_excel_version$hhincome,
  levels = c(
    "Over 200,000", "150,000 - 199,999", "100,000 - 149,999",
    "95,000 - 99,999", "90,000 - 94,999", "85,000 - 89,999",
    "80,000 - 84,999", "75,000 - 79,999", "70,000 - 74,999",
    "65,000 - 69,999", "60,000 - 64,999", "55,000 - 59,999",
    "50,000 - 54,999", "45,000 - 49,999", "40,000 - 44,999",
    "35,000 - 39,999", "30,000 - 34,999", "25 000 - 29 999",
    "20,000 - 24,999", "15,000 - 19,999", "10,000 - 14,999",
    "5,000 - 10,000", "Under 5,000"
  )
)

# Number of levels of the factor.
nlevels(gfk_excel_version$hhincome)

# Smallest birth year in the data, stored and then printed.
min_birthyear <- min(gfk_cleaned_eul$birthyear)

print(min_birthyear)

View(SSE_students_data)

# Stack the two student data frames row-wise, then average the BMI column.
cuhksz_students_h <- rbind(SME_students_data, SSE_students_data)

mean(cuhksz_students_h$BMI)

# Drop the 7th column.
Survey_GE_class_choice_2 <- Survey_GE_class_choice_2[, -7]

# Join the two survey tables on their shared "ID" column.
CUHK_employement_1 <- merge(
  CUHKSZ_employment_survey_1,
  CUHKSZ_employment_survey_1b,
  by = "ID"
)

# Copy the second survey under a new name via assign(). The name string must
# be the NEW name: the original used "CUHKSZ_employment_survey_2", which
# assigned the object to itself and left CUHK_employement_2 (needed on the
# next line) undefined.
new_name <- "CUHK_employement_2"

assign(new_name, CUHKSZ_employment_survey_2)

# Drop the last column.
CUHK_employement_2 <- CUHK_employement_2[, -ncol(CUHK_employement_2)]

# Rename one entry of a vector of column names.
# NOTE(review): column_names and new_column_name are not defined in this
# listing; presumably created beforehand, e.g. with names() - confirm.
column_index_to_change <- which(column_names == "Month_salary_22.x")

column_names[column_index_to_change] <- new_column_name

help(recode)

# Bin weekly coffee counts. include.lowest = TRUE closes the first interval
# on the left so that a value of exactly 0 falls into "0_2" instead of
# becoming NA. Values above 11 are still NA.
coffeenew$newcofnumcat <- cut(
  coffeenew$nrb_coffee_week,
  breaks = c(0, 2, 5, 7, 11),
  labels = c("0_2", "2_5", "5_7", "more_than_7"),
  include.lowest = TRUE
)

Answer:

# Setting and getting working directory
setwd()
# function_name: "setwd"; function: "set the working directory"; format: "setwd(...)"; Note: a path argument is required, calling it with no argument stops with an error.
getwd()
# function_name: "getwd"; function: "return the current working directory"; format: "getwd()"

# Creating a vector 'diabetes'
diabetes <- c(
  "Type1", "Type2", "Type1", "Type2", "Type2",
  "Type2", "Type1", "Type1", "Type2", "Type1"
)
# function_name: "c"; function: "combine elements into a vector"; format: "new_vector <- c(...)"

# Installing and loading necessary packages
install.packages("tidyverse")
# function_name: "install.packages"; function: "install specified packages"; format: "install.packages(...)"; Note: the package name is a quoted string.
library(dplyr)
# function_name: "library"; function: "load specified library"; format: "library(...)"
detach("package:stats", unload = TRUE)
# function_name: "detach"; function: "detach specified package"; format: "detach(..., unload = TRUE)"
library(stats)
# function_name: "library"; function: "load specified library"; format: "library(...)"; Note: 'stats' is attached again because sd() and shapiro.test(), used below, come from it.

# Loading the 'USArrests' dataset and performing basic operations
data("USArrests")
# function_name: "data"; function: "load specified dataset"; format: "data(...)"
head(USArrests)
# function_name: "head"; function: "display the first few rows of a dataset"; format: "head(...)"
summary(USArrests)
# function_name: "summary"; function: "display summary statistics of a dataset"; format: "summary(...)"
sd(USArrests$Murder)
# function_name: "sd"; function: "calculate standard deviation"; format: "new_variable <- sd(...)"
View(USArrests)
# function_name: "View"; function: "open a viewer for a dataset"; format: "View(...)"

# Exploring data in 'UCBAdmissions'
table(UCBAdmissions)
# function_name: "table"; function: "create a table of counts"; format: "table(...)"
str(UCBAdmissions)
# function_name: "str"; function: "display the internal structure of an object"; format: "str(...)"

# Checking levels in a categorical variable in 'gss_cat'
levels(forcats::gss_cat$rincome)
# function_name: "levels"; function: "get the levels of a factor variable"; format: "levels(...)"; Note: gss_cat ships with the forcats package, which is not attached above, so it is reached with "forcats::".

# Conducting a Shapiro-Wilk test on 'Murder' column in 'USArrests'
shapiro.test(USArrests$Murder)
# function_name: "shapiro.test"; function: "conduct the Shapiro-Wilk test"; format: "shapiro.test(...)"

# Loading data from the '2015 Millennium Cohort Study'
load("E:/RGEB/2015 Millennium Cohort Study/mcs.dta")  
# function_name: "load"; function: "restore R objects from a file written by save()"; format: "load(...)"
# NOTE(review): the file named here is a Stata .dta file, which load() is not expected to read; read_dta() below is the matching reader. Confirm whether this call should stay.
library(haven)  
# function_name: "library"; function: "load specified library"; format: "library(...)"
mcs <- read_dta("2015 Millennium Cohort Study/mcs.dta")  
# function_name: "read_dta"; function: "read data from a Stata file"; format: "new_dataset <- read_dta(...)"

# Creating a new variable 'birthyear' and categorizing it
gfk_cleaned_eul$birthyear <- 2014 - gfk_cleaned_eul$age
# operator: "-"; function: "subtract one value or variable from another"; format: "new_variable <- ... - ..."
gfk_cleaned_eul$birthyear_cat <- cut(
  gfk_cleaned_eul$birthyear,
  breaks = c(-Inf, 1945, 1965, 1985, 1997, Inf),
  labels = c(
    "(-Inf,1945]", "(1945,1964]", "(1965,1984]", "(1985,1996]", "(1997,Inf]"
  )
)
# function_name: "cut"; function: "create categorical variable by cutting a numeric variable"; format: "new_variable <- cut(...)"
gfk_cleaned_eul$birthyear_cat <- recode(
  gfk_cleaned_eul$birthyear_cat,
  "(-Inf,1945]" = "born in 1945 or before",
  "(1945,1964]" = "Boomers",
  "(1965,1984]" = "GenX",
  "(1985,1996]" = "Millenium",
  "(1997,Inf]" = "GenZ"
)
# function_name: "recode"; function: "recode levels of a factor variable"; format: "new_variable <- recode(...)"

# Reading an Excel file and processing 'hhincome' variable
library(readxl)
# function_name: "library"; function: "load specified library"; format: "library(...)"; Note: read_excel() comes from readxl, which is not attached above.
gfk_excel_version <- read_excel("gfk_excel_version.xls")
# function_name: "read_excel"; function: "read data from an Excel file"; format: "new_dataset <- read_excel(...)"
gfk_excel_version$hhincome <- na_if(gfk_excel_version$hhincome, "Refused")
# function_name: "na_if"; function: "replace specific values with NA"; format: "new_variable <- na_if(...)"
gfk_excel_version$hhincome <- factor(
  gfk_excel_version$hhincome,
  levels = c(
    "Over 200,000", "150,000 - 199,999", "100,000 - 149,999",
    "95,000 - 99,999", "90,000 - 94,999", "85,000 - 89,999",
    "80,000 - 84,999", "75,000 - 79,999", "70,000 - 74,999",
    "65,000 - 69,999", "60,000 - 64,999", "55,000 - 59,999",
    "50,000 - 54,999", "45,000 - 49,999", "40,000 - 44,999",
    "35,000 - 39,999", "30,000 - 34,999", "25 000 - 29 999",
    "20,000 - 24,999", "15,000 - 19,999", "10,000 - 14,999",
    "5,000 - 10,000", "Under 5,000"
  )
)
# function_name: "factor"; function: "convert a variable to a factor with specified levels"; format: "new_variable <- factor(...)"; Note: the full list of levels is written out, because a literal "..." inside c() is an error when run at top level.

# Merging and cleaning datasets
CUHK_employement_1 <- merge(CUHKSZ_employment_survey_1, CUHKSZ_employment_survey_1b, by = "ID")
# function_name: "merge"; function: "merge datasets by a common variable"; format: "new_dataset <- merge(...)"
new_name <- "CUHK_employement_2"
# operator: "<-"; function: "store the NEW object name as a character string"; format: "name_string <- \"...\""; Note: the string must differ from the source object's name, otherwise assign() below copies the object onto itself and CUHK_employement_2 is never created.
assign(new_name, CUHKSZ_employment_survey_2)
# function_name: "assign"; function: "assign a value to the variable whose name is given as a string"; format: "assign(..., ...)"
CUHK_employement_2 <- CUHK_employement_2[, -ncol(CUHK_employement_2)]
# operator: "[" with a negative index, plus function_name: "ncol"; function: "remove the last column"; format: "new_dataset <- old_dataset[, -ncol(old_dataset)]"

# Renaming an entry in a vector of column names, getting help, and binning a variable in 'coffeenew'
column_index_to_change <- which(column_names == "Month_salary_22.x")
# function_name: "which"; function: "get the index of elements that satisfy a condition"; format: "new_index <- which(...)"
column_names[column_index_to_change] <- new_column_name
# operator: "[<-"; function: "replace the elements at the given index"; format: "vector[index] <- new_value"; Note: column_names and new_column_name are not defined in this listing and are presumably created beforehand - confirm.
help(recode)
# function_name: "help"; function: "display help documentation"; format: "help(...)"
coffeenew$newcofnumcat <- cut(
  coffeenew$nrb_coffee_week,
  breaks = c(0, 2, 5, 7, 11),
  labels = c("0_2", "2_5", "5_7", "more_than_7"),
  include.lowest = TRUE
)
# function_name: "cut"; function: "create categorical variable by cutting a numeric variable"; format: "new_variable <- cut(...)"; Note: include.lowest = TRUE puts a value of exactly 0 into "0_2" instead of turning it into NA.

Quiz:

Question: What R function is used to set the working directory?
- a) setdir()
- b) setwd()
- c) setworking()
- d) workdir()
Question: Which function installs specified R packages?
- a) load.packages()
- b) install.library()
- c) install.packages()
- d) library.install()
Question: What function is used to load a specified library in R?
- a) load()
- b) library()
- c) load.library()
- d) import.library()
Question: In R, which function is used to calculate the standard deviation of a numeric variable?
- a) calculate_sd()
- b) std_dev()
- c) sd()
- d) variance()
Question: What function opens a viewer for a dataset in R?
- a) explore()
- b) browse()
- c) view()
- d) View()
Question: In R, what function creates a table of counts for categorical data?
- a) tabulate()
- b) table()
- c) count()
- d) crosstab()
Question: Which function is used to get the levels of a factor variable in R?
- a) getlevels()
- b) factorlevels()
- c) levels()
- d) factor_levels()
Question: What R function is used to conduct the Shapiro-Wilk test?
- a) shapiro()
- b) wilks.test()
- c) shapiro.test()
- d) test.shapiro()
Question: Which function reads data from a Stata file in R?
- a) read_spss()
- b) read_stata()
- c) read_sas()
- d) read_dta()
Question: In R, what function is used to create a categorical variable by cutting a numeric variable into bins?

a) bin()
b) create_cat()
c) cut()
d) category()

11. Question: Which function is used to merge datasets by a common variable in R?

a) combine()
b) merge()
c) join()
d) concat()

12. Question: What function is used to replace specific values with NA in R?

a) replace_na()
b) na_replace()
c) na_if()
d) replace_with_na()

13. Question: In R, which function is used to convert a variable to a factor with specified levels?

a) convert_factor()
b) to_factor()
c) factorize()
d) factor()

14. Question: What R function is used to remove specified columns from a dataset?

a) remove_cols()
b) subset()
c) drop_cols()
d) exclude()

15. Question: Which function in R is used to get the index of elements that satisfy a condition?

a) find()
b) locate()
c) index()
d) which()

16. Question: In R, what function is used to create a categorical variable by cutting a numeric variable into bins with labels?

a) categorize()
b) label_cut()
c) create_category()
d) cut()

17. Question: Which function in R displays help documentation for a specified function?

a) help()
b) info()
c) documentation()
d) assist()

18. Question: What function is used to replace specific values with new values in R?

a) replace_values()
b) change()
c) recode()
d) modify()

19. Question: In R, what function is used to create a new categorical variable based on the values of a numeric variable?

a) category_from_numeric()
b) create_categorical()
c) label_numeric()
d) cut()

20. Question: Which base R function replaces the values at specified positions (for example, where a condition is TRUE) in a vector?

a) replace()
b) modify()
c) recalculate()
d) recode()

`Answer for quiz`

Answer: b) setwd()
- Example: setwd("/path/to/your/directory")
Answer: c) install.packages()
- Example: install.packages("tidyverse")
Answer: b) library()
- Example: library(dplyr)
Answer: c) sd()
- Example: standard_deviation <- sd(data$variable)
Answer: d) View()
- Example: View(data)
Answer: b) table()
- Example: table(factor_data)
Answer: c) levels()
- Example: factor_levels <- levels(factor_data)
Answer: c) shapiro.test()
- Example: shapiro.test(data$numeric_variable)
Answer: d) read_dta()
- Example: dataset <- read_dta("file.dta")
Answer: c) cut()
- Example: cut_variable <- cut(data$numeric_variable, breaks = c(0, 25, 50, 75, 100))
Answer: b) merge()
- Example: merged_data <- merge(data1, data2, by="common_variable")
Answer: c) na_if()
- Example: data$variable <- na_if(data$variable, "specific_value")
Answer: d) factor()
- Example: data$variable <- factor(data$variable, levels = c("level1", "level2", "level3"))
Answer: b) subset()
- Example: new_data <- subset(data, select = -c(column_to_remove))
Answer: d) which()
- Example: index <- which(data$condition == TRUE)
Answer: d) cut()
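- Example: category_variable <- cut(data$numeric_variable, breaks = c(0, 25, 50, 75, 100), labels = c("Low", "Medium", "High", "Very High")) (Note: five break points define four intervals, so `labels` must contain exactly four entries. With only three labels, cut() stops with an error instead of leaving the last interval (75, 100] unlabeled.)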
Answer: a) help()
- Example: help(function_name)
Answer: c) recode()
- Example: data$variable <- recode(data$variable, "old_value" = "new_value")
Answer: d) cut()
- Example: category_variable <- cut(data$numeric_variable, breaks = c(0, 25, 50, 75, 100), labels = c("Low", "Medium", "High", "Very High"))
Answer: a) replace()
- Example: data$variable <- replace(data$variable, data$condition, new_value)

EricWang1358