library(nycflights13)
We already know a lot about flights:
flights
nycflights13 has >1 tables:
airlines
airports
planes
weather
Multiple tables of data are called relational data.
A key is a variable (or set of variables) that uniquely identifies an observation.
A primary key uniquely identifies an observation in its own table, e.g., planes$tailnum
A foreign key uniquely identifies an observation in another table, e.g., flights$tailnum
Combine variables (colums) from two tables.
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
3, "x3"
)
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2",
4, "y3"
)
inner_join(x, y, by = "key")
Same as
x %>% inner_join(y, by = "key")
An outer join keeps observations that appear in at least one of the tables.
Three types of outer joins:
A left join keeps all observations in x
.
A right join keeps all observations in y
.
A full join keeps all observations in x
or y
.
What if the keys are not unique?
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
2, "x3",
1, "x4"
)
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2"
)
left_join(x, y, by = "key")
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
2, "x3",
3, "x4"
)
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2",
2, "y3",
3, "y4"
)
left_join(x, y, by = "key")
flights2 <- flights %>%
select(year:day, hour, origin, dest, tailnum, carrier)
flights2
by = NULL
(natural join): use all variables that appear in both tables: # same as: flights2 %>% left_join(weather)
left_join(flights2, weather)
## Joining, by = c("year", "month", "day", "hour", "origin")
by = "x"
: use the common variable x
: # same as left_join(flights2, planes, by = "tailnum")
flights2 %>% left_join(planes, by = "tailnum")
by = c("a" = "b")
: match variable a
in table x
to the variable b
in table y
. flights2 %>% left_join(airports, c("dest" = "faa")) # match destination, not origin
flights2 %>% left_join(airports, c("origin" = "faa"))
Combine observations (rows) from two tables.
top_dest <- flights %>%
count(dest, sort = TRUE) %>%
head(10)
top_dest
semi_join(x, y)
only keeps the rows in x
that have a match in y
. semi_join(flights, top_dest)
## Joining, by = "dest"
anti_join(x, y)
keeps the rows that donโt have a match. flights %>%
anti_join(planes, by = "tailnum") %>%
count(tailnum, sort = TRUE)
df1 <- tribble(
~x, ~y,
1, 1,
2, 1
)
df2 <- tribble(
~x, ~y,
1, 1,
1, 2
)
intersect(x, y)
returns rows that appear in both x
and y
. intersect(df1, df2)
union(x, y)
returns unique observations in x
and y
. union(df1, df2)
setdiff(x, y)
returns rows that appear in x
but not in y
. setdiff(df1, df2)
setdiff(df2, df1)