This lecture note is based on Dr. Hua Zhou’s 2018 Winter Statistical Computing course notes available at http://hua-zhou.github.io/teaching/biostatm280-2018winter/index.html.
A typical data science project:
Available from the nycflights13 package.
336,776 flights that departed from New York City in 2013:
library("tidyverse")
## ── Attaching packages ───────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.6
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## Warning: package 'dplyr' was built under R version 3.5.1
## ── Conflicts ──────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library("nycflights13")
head(flights)
The Split-Apply-Combine Strategy for Data Analysis
Pick observations by their values: filter().
Reorder the rows: arrange().
Pick variables by their names: select().
Create new variables with functions of existing variables: mutate().
Collapse many values down to a single summary: summarise().
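A minimal sketch of the strategy, combining the verbs above on flights (the choice of grouping by carrier is purely illustrative):
# split by carrier, apply a summary, combine into a single table
by_carrier  <- group_by(flights, carrier)
carrier_tbl <- summarise(by_carrier, mean_delay = mean(arr_delay, na.rm = TRUE))
arrange(carrier_tbl, desc(mean_delay))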
filter()
Flights on Jan 1st:
# same as filter(flights, month == 1 & day == 1)
filter(flights, month == 1, day == 1)
Flights in Nov or Dec:
filter(flights, month == 11 | month == 12)
One row from each month:
distinct(flights, month, .keep_all = TRUE)
Randomly select n rows:
sample_n(flights, 10, replace = TRUE)
Randomly select fraction of rows:
sample_frac(flights, 0.1, replace = TRUE)
Select rows by position:
slice(flights, 1:5)
Top n rows:
top_n(flights, 5)
## Selecting by time_hour
arrange()
Sort in ascending order:
arrange(flights, year, month, day)
Sort in descending order:
arrange(flights, desc(arr_delay))
select()
Select columns by variable name:
select(flights, year, month, day)
Select columns between two variables:
select(flights, year:day)
Select columns except those between two variables:
select(flights, -(year:day))
Select columns by positions:
select(flights, seq(1, 10, by = 2))
Move variables to the start of data frame:
select(flights, time_hour, air_time, everything())
Helper functions:
starts_with("abc"): matches names that begin with “abc”.
ends_with("xyz"): matches names that end with “xyz”.
contains("ijk"): matches names that contain “ijk”.
matches("(.)\\1"): selects variables that match a regular expression.
num_range("x", 1:3): matches x1, x2 and x3.
one_of(): matches variable names in a character vector.
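A short sketch of these helpers on flights; the columns noted in the comments are the ones they match in nycflights13:
select(flights, starts_with("dep"))                 # dep_time, dep_delay
select(flights, contains("time"))                   # dep_time, sched_dep_time, arr_time, ...
select(flights, one_of(c("year", "month", "day")))  # same as select(flights, year, month, day)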
mutate()
Add variables gain and speed:
flights_sml <-
select(flights, year:day, ends_with("delay"), distance, air_time)
mutate(flights_sml,
gain = arr_delay - dep_delay,
speed = distance / air_time * 60
)
Refer to columns that you’ve just created:
mutate(flights_sml,
gain = arr_delay - dep_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
Only keep the new variables by transmute():
transmute(flights,
gain = arr_delay - dep_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
mutate_all(): apply funs to all columns.
mutate_all(data, funs(log(.), log2(.)))
mutate_at(): apply funs to specific columns.
mutate_at(data, vars(-Species), funs(log(.)))
mutate_if(): apply funs to columns of one type, e.g., numeric columns.
mutate_if(data, is.numeric, funs(log(.)))
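The snippets above use a placeholder data frame named data; a runnable sketch on the built-in iris data (Species is its only non-numeric column) could look like:
mutate_all(select(iris, -Species), funs(log(.), log2(.)))  # log and log2 of every numeric column
mutate_at(iris, vars(-Species), funs(log(.)))              # log of every column except Species
mutate_if(iris, is.numeric, funs(log(.)))                  # log of the numeric columns only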
summarise()
Mean of a variable:
summarise(flights, delay = mean(dep_delay, na.rm = TRUE))
Convert a tibble into a grouped tibble:
(by_day <- group_by(flights, year, month, day))
Grouped summaries:
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
Consider the following analysis:
by_dest <- group_by(flights, dest)
delay <- summarise(by_dest, count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
)
(delay <- filter(delay, count > 20, dest != "HNL"))
Cleaner code using the pipe %>%:
(delays <- flights %>%
group_by(dest) %>%
summarise(
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter(count > 20, dest != "HNL"))
Unfortunately, ggplot2 does not use the pipe; layers are chained with + instead.
ggplot(data = delays, mapping = aes(x = dist, y = delay)) +
geom_point(aes(size = count), alpha = 1/3) + geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Location: mean(x), median(x).
not_cancelled <- flights %>% filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
avg_delay1 = mean(arr_delay),
avg_delay2 = mean(arr_delay[arr_delay > 0]) # the average positive delay
)
Spread: sd(x), IQR(x), mad(x).
not_cancelled %>%
group_by(dest) %>%
summarise(distance_sd = sd(distance)) %>%
arrange(desc(distance_sd))
Rank: min(x), quantile(x, 0.25), max(x).
# When do the first and last flights leave each day?
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first = min(dep_time),
last = max(dep_time)
)
Position: first(x), nth(x, 2), last(x).
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first_dep = first(dep_time),
last_dep = last(dep_time)
)
Count: n(), sum(!is.na(x)), n_distinct(x).
# Which destinations have the most carriers?
not_cancelled %>%
group_by(dest) %>%
summarise(carriers = n_distinct(carrier)) %>%
arrange(desc(carriers))
not_cancelled %>%
count(dest)
not_cancelled %>%
count(tailnum, wt = distance)
# How many flights left before 5am? (these usually indicate delayed
# flights from the previous day)
not_cancelled %>%
group_by(year, month, day) %>%
summarise(n_early = sum(dep_time < 500))
# What proportion of flights are delayed by more than an hour?
not_cancelled %>%
group_by(year, month, day) %>%
summarise(hour_perc = mean(arr_delay > 60))
Find the worst members of each group:
flights_sml %>%
group_by(year, month, day) %>%
filter(rank(desc(arr_delay)) < 10)
Find all groups bigger than a threshold:
(popular_dests <- flights %>%
group_by(dest) %>%
filter(n() > 365))
Standardise to compute per group metrics:
popular_dests %>%
filter(arr_delay > 0) %>%
mutate(prop_delay = arr_delay / sum(arr_delay)) %>%
select(year:day, dest, arr_delay, prop_delay)
The nycflights13 package has more than one table:
We already know a lot about flights:
flights
airlines:
airlines
airports:
airports
planes:
planes
weather:
weather
A primary key uniquely identifies an observation in its own table.
A foreign key uniquely identifies an observation in another table.
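One quick way to check that a candidate key really is a primary key is to count it and keep any duplicated values; zero rows back means the key is unique:
planes %>%
  count(tailnum) %>%
  filter(n > 1)    # zero rows: tailnum is a primary key of planes
flights %>%
  count(year, month, day, flight) %>%
  filter(n > 1)    # some rows remain: this combination is not a primary key of flights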
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
3, "x3"
)
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2",
4, "y3"
)
inner_join(x, y, by = "key")
Same as
x %>% inner_join(y, by = "key")
An outer join keeps observations that appear in at least one of the tables.
Three types of outer joins:
A left join keeps all observations in x.
A right join keeps all observations in y.
A full join keeps all observations in x or y.
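A brief sketch using the x and y tables defined above (keys 1 and 2 appear in both tables, key 3 only in x, key 4 only in y):
left_join(x, y, by = "key")    # keys 1, 2, 3; val_y is NA for key 3
right_join(x, y, by = "key")   # keys 1, 2, 4; val_x is NA for key 4
full_join(x, y, by = "key")    # keys 1, 2, 3, 4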
One table has duplicate keys.
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
2, "x3",
1, "x4"
)
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2"
)
left_join(x, y, by = "key")
Both tables have duplicate keys. You get all possible combinations, the Cartesian product:
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
2, "x3",
3, "x4"
)
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2",
2, "y3",
3, "y4"
)
left_join(x, y, by = "key")
Let’s create a narrower table from the flights data:
flights2 <- flights %>%
select(year:day, hour, origin, dest, tailnum, carrier)
flights2
by = NULL (default): use all variables that appear in both tables:
# same as: flights2 %>% left_join(weather)
left_join(flights2, weather)
## Joining, by = c("year", "month", "day", "hour", "origin")
by = "x": use the common variable x:
# same as: flights2 %>% left_join(planes, by = "tailnum")
left_join(flights2, planes, by = "tailnum")
by = c("a" = "b"): match variable a in table x to variable b in table y.
# same as: flights2 %>% left_join(airports, by = c("dest" = "faa"))
left_join(flights2, airports, by = c("dest" = "faa"))
Top 10 most popular destinations:
top_dest <- flights %>%
count(dest, sort = TRUE) %>%
head(10)
top_dest
How to filter the cases that fly to these destinations?
semi_join(x, y) keeps the rows in x that have a match in y.
Useful to see what will be joined.
semi_join(flights, top_dest)
## Joining, by = "dest"
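An alternative sketch using filter() and %in% returns the same rows here, although semi_join() extends more naturally to multi-variable keys:
flights %>%
  filter(dest %in% top_dest$dest)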
anti_join(x, y) keeps the rows in x that don’t have a match in y.
Useful to see what will not be joined.
flights %>%
anti_join(planes, by = "tailnum") %>%
count(tailnum, sort = TRUE)
Generate two tables:
df1 <- tribble(
~x, ~y,
1, 1,
2, 1
)
df2 <- tribble(
~x, ~y,
1, 1,
1, 2
)
bind_rows(x, y) stacks table x on top of y.
bind_rows(df1, df2)
intersect(x, y) returns rows that appear in both x and y.
intersect(df1, df2)
union(x, y) returns unique observations in x and y.
union(df1, df2)
setdiff(x, y) returns rows that appear in x but not in y.
setdiff(df1, df2)
setdiff(df2, df1)