tidyverse Flashcards
count
rawmorg05 %>% count(month, sort = TRUE)
# A tibble: 12 × 2
month n
1 1 [January] 27346
2 8 [August] 27217
add_count adds a column with the counts by group
summarize
With grouping:
mtcars %>%
group_by(cyl) %>%
summarise(mean = mean(disp), n = n())
#> # A tibble: 3 × 3
#> cyl mean n
#> <dbl> <dbl> <int>
#> 1 4 105. 11
#> 2 6 183. 7
#> 3 8 353. 14
summarize with group_by
group_by() tells R that we want to look at the dataset in terms of different groups,
instead of just a single block. If we use group_by() beforehand, we are subtly splitting
up the data into different groups. Then, summarize() will work on each group of data. Below, we use group_by() to tell R that we want to calculate the average for each student’s tests.
avg_score_by_student <- student_scores %>%
group_by(names) %>%
summarize(
avg_writing = mean(new_writing_score)
)
across
graphtrain %>%
summarize(across(where(is.numeric), mean))
remember to give the function without parentheses
We can find that out by supplying two functions to across(): one to compute the median and the other to count the missing values. You supply multiple functions by using a named list to .fns:
df_miss |>
summarize(
across(a:d, list(
median = function(x) median(x, na.rm = TRUE),
n_miss = function(x) sum(is.na(x))
)),
n = n()
)
#> # A tibble: 1 × 9
#> a_median a_n_miss b_median b_n_miss c_median c_n_miss d_median d_n_miss
#> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int>
#> 1 0.139 1 -1.11 1 -0.387 2 1.15 0
#> # ℹ 1 more variable: n <int>
c_across
c_across() is for row-wise operations, e.g. creating a column that is the sum of a few columns
df <- tibble(id = 1:4, w = runif(4), x = runif(4), y = runif(4), z = runif(4))
df %>%
rowwise() %>%
mutate(
sum = sum(c_across(w:z)),
sd = sd(c_across(w:z))
)
#> # A tibble: 4 × 7
#> # Rowwise:
#> id w x y z sum sd
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.790 0.588 0.142 0.225 1.74 0.305
#> 2 2 0.892 0.514 0.781 0.207 2.39 0.305
#> 3 3 0.327 0.317 0.456 0.659 1.76 0.159
#> 4 4 0.351 0.408 0.234 0.715 1.71 0.205
case_when
# Vectorised multi-branch if/else. The most specific test (divisible by 35)
# is listed first; .default supplies the value when no condition matches.
# String literals use straight double quotes: typographic quotes do not parse.
case_when(
  x %% 35 == 0 ~ "fizz buzz",
  x %% 5 == 0 ~ "fizz",
  x %% 7 == 0 ~ "buzz",
  .default = as.character(x)
)
contains()
# Convert every column whose name contains "SAT" with as.numeric.
# (Straight quotes: the typographic quotes in the original do not parse.)
sat_results <- sat_results %>%
  mutate(across(contains("SAT"), as.numeric))
contains with multiple choices, acts like an OR
# Select columns whose name contains "m" OR "ar".
# Fixes: straight quotes, and the closing parenthesis of select() was missing.
mtcars %>%
  select(contains(c("m", "ar")))
This does the same thing as the following:
# Regex alternative to contains(c("m", "ar")): "m|ar" matches either pattern.
mtcars %>%
  select(matches("m|ar")) %>%
  head(2)
# mpg am gear carb
#Mazda RX4 21 1 4 4
#Mazda RX4 Wag 21 1 4 4
distinct
distinct: set .keep_all = TRUE to keep all the columns; the default is FALSE (surprisingly to me)
mtcars |> distinct(gear, .keep_all = TRUE)
filter something containing a certain character string
# Copy the row names into a real column so they can be filtered on, then keep
# the rows whose type contains "Toyota" or "Mazda" (regex alternation).
mtcars$type <- rownames(mtcars)
mtcars %>%
  filter(str_detect(type, "Toyota|Mazda"))
joins
inner_join(), right_join(), full_join() have the same interface as left_join(). The
difference is which rows they keep: left join keeps all the rows in x, the right join keeps all rows in y, the full join keeps all rows in either x or y, and the inner join only keeps rows that occur in both x and y.
matches
You want to return every column in your data whose name contains a specific string or regular expression.
Solution
# Keep columns whose name matches the regex: an "o" followed, anywhere later,
# by a "u".
table1 %>%
  select(matches("o.*u"))
Variable name in character vector, but no function call issue
for (var in names(mtcars)) {
mtcars %>% count(.data[[var]]) %>% print()
}
Passing name of variable in var, and then putting in tidyverse function
# Summarise one column of `data`: the row count plus that column's minimum and
# maximum. `var` is given unquoted and forwarded with the embrace operator
# {{ }}, so the function also works per group on grouped input.
var_summary <- function(data, var) {
  summarise(
    data,
    n = n(),
    min = min({{ var }}),
    max = max({{ var }})
  )
}
mtcars %>% group_by(cyl) %>% var_summary(mpg)
Generating a variable name programmatically
# Glue-style "{name}" on the left of := builds the column name at run time;
# here it creates a column called susan.
name <- "susan"
tibble("{name}" := 2)
Select variables from names in a character vector
# Column names held in a character vector.
vars <- c("mpg", "vs")
# all_of(): select exactly these columns.
mtcars %>% select(all_of(vars))
# !all_of(): select everything except these columns.
mtcars %>% select(!all_of(vars))
select(any_of(vars)) would select whichever of them are present, but not give an error if some don't exist
Imagine you have this simple tibble and you want to count the number of observations and compute the median of every column:
df <- tibble(
a = rnorm(10),
b = rnorm(10),
c = rnorm(10),
d = rnorm(10)
)
df |> summarize(
n = n(),
across(a:d, median),
)
#> # A tibble: 1 × 5
#> n a b c d
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 10 -0.246 -0.287 -0.0567 0.144
Use two functions with across
df_miss |>
summarize(
across(a:d, list(
median = function(x) {median(x, na.rm = TRUE)},
n_miss = function(x) {sum(is.na(x))}
)),
n = n()
)
#> # A tibble: 1 × 9
#> a_median a_n_miss b_median b_n_miss c_median c_n_miss d_median d_n_miss
#> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int>
#> 1 0.139 1 -1.11 1 -0.387 2 1.15 0
#> # ℹ 1 more variable: n <int>
filter to just rows with at least one of columns a to d being NA
same as df_miss |> filter(is.na(a) | is.na(b) | is.na(c) | is.na(d))
df_miss |> filter(if_any(a:d, is.na))
#> # A tibble: 4 × 4
#> a b c d
#> <dbl> <dbl> <dbl> <dbl>
#> 1 0.434 -1.25 NA 1.60
#> 2 NA -1.43 -0.297 0.776
#> 3 -0.156 -0.980 NA 1.15
#> 4 1.11 NA -0.387 0.704
df_miss |> filter(if_all(a:d, is.na))
#> # A tibble: 0 × 4
#> # ℹ 4 variables: a <dbl>, b <dbl>, c <dbl>, d <dbl>
Function to summarize over all of a set of variables
# Summarise a set of columns by their mean (NAs ignored) and add a row count.
#
# df            A data frame; on grouped input the summary is computed per
#               group.
# summary_vars  Tidy-select specification of the columns to average, passed
#               unquoted and forwarded with {{ }}. Defaults to every numeric
#               column.
#
# Returns one row per group (or a single row), with the means and a count `n`.
# .groups = "drop" returns an ungrouped result; the straight quotes matter,
# because the typographic quotes in the original do not parse.
summarize_means <- function(df, summary_vars = where(is.numeric)) {
  df |>
    summarize(
      across({{ summary_vars }}, function(x) {
        mean(x, na.rm = TRUE)
      }),
      n = n(),
      .groups = "drop"
    )
}
diamonds |>
group_by(cut) |>
summarize_means()
#> # A tibble: 5 × 9
#> cut carat depth table price x y z n
#> <ord> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 Fair 1.05 64.0 59.1 4359. 6.25 6.18 3.98 1610
#> 2 Good 0.849 62.4 58.7 3929. 5.84 5.85 3.64 4906
#> 3 Very Good 0.806 61.8 58.0 3982. 5.74 5.77 3.56 12082
#> 4 Premium 0.892 61.3 58.7 4584. 5.97 5.94 3.65 13791
#> 5 Ideal 0.703 61.7 56.0 3458. 5.51 5.52 3.40 21551
diamonds |>
group_by(cut) |>
summarize_means(c(carat, x:z))
#> # A tibble: 5 × 6
#> cut carat x y z n
#> <ord> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 Fair 1.05 6.25 6.18 3.98 1610
#> 2 Good 0.849 5.84 5.85 3.64 4906
#> 3 Very Good 0.806 5.74 5.77 3.56 12082
#> 4 Premium 0.892 5.97 5.94 3.65 13791
#> 5 Ideal 0.703 5.51 5.52 3.40 21551
Compute length of each column using map (just as an example of using map, obviously it’s the same for all of them)
df %>%
map(length)
get type of a vector
typeof(vec)
pivot_longer
songs
track wk1 wk2 wk3
Song A 1 5 9
Song B 4 3 3
# Reshape the wk1..wk3 columns into one row per (track, week).
songs |>
  pivot_longer(
    cols = starts_with("wk"),
    names_to = "week",
    # Strip the "wk" prefix so week reads 1, 2, 3 as in the output shown
    # below; without it the values would be "wk1", "wk2", "wk3".
    names_prefix = "wk",
    values_to = "rank"
  )
track week rank
Song A 1 1
Song A 2 5
Song A 3 9
Song B 1 4
pivot_wider
Obs CaseType Accepted
A Single Yes
A Family No
B Single No
B Family Yes
pivot_wider(names_from = "CaseType", values_from = "Accepted")
Obs Single Family
A Yes No
B No Yes
S4 class in R
create a class “Student_Info” with three member variables
# Define an S4 class with three typed slots, then build and print an instance.
# String literals use straight quotes: typographic quotes do not parse.
setClass(
  "Student_Info",
  slots = list(name = "character", age = "numeric", GPA = "numeric")
)
student1 <- new("Student_Info", name = "John", age = 21, GPA = 3.5)
student1
give col sums of a data frame as a vector, as example of map, of course there is an easier way to do this
df |> map_dbl(sum)
pass variables in function to group_by, for example
group_by() https://dplyr.tidyverse.org/reference/group_by.html uses
data-masking, not tidy-selection. We can work around that problem by using
the handy pick() https://dplyr.tidyverse.org/reference/pick.html
function, which allows you to use tidy-selection inside data-masking
functions:
df %>% count(pick(starts_with("z"))) #> # A tibble: 3 × 3
# Count the missing values of one column within groups.
#
# df          A data frame.
# group_vars  Tidy-select specification of the grouping columns. It is wrapped
#             in pick() because group_by() uses data-masking, not
#             tidy-selection.
# x_var       Unquoted column whose NA values are counted.
#
# Returns one row per group with the count in n_miss. .groups = "drop" returns
# an ungrouped result; the straight quotes matter, because the typographic
# quotes in the original do not parse.
count_missing <- function(df, group_vars, x_var) {
  df |>
    group_by(pick({{ group_vars }})) |>
    summarize(
      n_miss = sum(is.na({{ x_var }})),
      .groups = "drop"
    )
}
search for function you can’t remember
apropos("replace")