library(nycflights13)
We already know a lot about flights:
flights
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
nycflights13 has >1 tables:
airlines
## # A tibble: 16 x 2
## carrier name
## <chr> <chr>
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
## 7 F9 Frontier Airlines Inc.
## 8 FL AirTran Airways Corporation
## 9 HA Hawaiian Airlines Inc.
## 10 MQ Envoy Air
## 11 OO SkyWest Airlines Inc.
## 12 UA United Air Lines Inc.
## 13 US US Airways Inc.
## 14 VX Virgin America
## 15 WN Southwest Airlines Co.
## 16 YV Mesa Airlines Inc.
airports
## # A tibble: 1,458 x 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <int> <dbl> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/New_…
## 2 06A Moton Field Municip… 32.5 -85.7 264 -6 A America/Chic…
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/Chic…
## 4 06N Randall Airport 41.4 -74.4 523 -5 A America/New_…
## 5 09J Jekyll Island Airpo… 31.1 -81.4 11 -5 A America/New_…
## 6 0A9 Elizabethton Munici… 36.4 -82.2 1593 -5 A America/New_…
## 7 0G6 Williams County Air… 41.5 -84.5 730 -5 A America/New_…
## 8 0G7 Finger Lakes Region… 42.9 -76.8 492 -5 A America/New_…
## 9 0P2 Shoestring Aviation… 39.8 -76.6 1000 -5 U America/New_…
## 10 0S9 Jefferson County In… 48.1 -123. 108 -8 A America/Los_…
## # ... with 1,448 more rows
planes
## # A tibble: 3,322 x 9
## tailnum year type manufacturer model engines seats speed engine
## <chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr>
## 1 N10156 2004 Fixed wi… EMBRAER EMB-1… 2 55 NA Turbo…
## 2 N102UW 1998 Fixed wi… AIRBUS INDUS… A320-… 2 182 NA Turbo…
## 3 N103US 1999 Fixed wi… AIRBUS INDUS… A320-… 2 182 NA Turbo…
## 4 N104UW 1999 Fixed wi… AIRBUS INDUS… A320-… 2 182 NA Turbo…
## 5 N10575 2002 Fixed wi… EMBRAER EMB-1… 2 55 NA Turbo…
## 6 N105UW 1999 Fixed wi… AIRBUS INDUS… A320-… 2 182 NA Turbo…
## 7 N107US 1999 Fixed wi… AIRBUS INDUS… A320-… 2 182 NA Turbo…
## 8 N108UW 1999 Fixed wi… AIRBUS INDUS… A320-… 2 182 NA Turbo…
## 9 N109UW 1999 Fixed wi… AIRBUS INDUS… A320-… 2 182 NA Turbo…
## 10 N110UW 1999 Fixed wi… AIRBUS INDUS… A320-… 2 182 NA Turbo…
## # ... with 3,312 more rows
weather
## # A tibble: 26,115 x 15
## origin year month day hour temp dewp humid wind_dir wind_speed
## <chr> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## 7 EWR 2013 1 1 7 39.0 28.0 64.4 240 15.0
## 8 EWR 2013 1 1 8 39.9 28.0 62.2 250 10.4
## 9 EWR 2013 1 1 9 39.9 28.0 62.2 260 15.0
## 10 EWR 2013 1 1 10 41 28.0 59.6 260 13.8
## # ... with 26,105 more rows, and 5 more variables: wind_gust <dbl>,
## # precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>
Multiple tables of data are called relational data.
A key is a variable (or set of variables) that uniquely identifies an observation.
A primary key uniquely identifies an observation in its own table, e.g., planes$tailnum
A foreign key uniquely identifies an observation in another table, e.g., flights$tailnum
Combine variables (colums) from two tables.
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
3, "x3"
)
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2",
4, "y3"
)inner_join(x, y, by = "key")
## # A tibble: 2 x 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
Same as
x %>% inner_join(y, by = "key")An outer join keeps observations that appear in at least one of the tables.
Three types of outer joins:
A left join keeps all observations in x.
A right join keeps all observations in y.
A full join keeps all observations in x or y.
What if the keys are not unique?
One table has duplicate keys.
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
2, "x3",
1, "x4"
)
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2"
)
left_join(x, y, by = "key")
## # A tibble: 4 x 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 2 x3 y2
## 4 1 x4 y1Both tables have duplicate keys. You get all possible combinations, the Cartesian product:
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
2, "x3",
3, "x4"
)
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2",
2, "y3",
3, "y4"
)
left_join(x, y, by = "key")
## # A tibble: 6 x 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 2 x2 y3
## 4 2 x3 y2
## 5 2 x3 y3
## 6 3 x4 y4Let’s create a narrower table from the flights data:
flights2 <- flights %>%
select(year:day, hour, origin, dest, tailnum, carrier)
flights2
## # A tibble: 336,776 x 8
## year month day hour origin dest tailnum carrier
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA
## 2 2013 1 1 5 LGA IAH N24211 UA
## 3 2013 1 1 5 JFK MIA N619AA AA
## 4 2013 1 1 5 JFK BQN N804JB B6
## 5 2013 1 1 6 LGA ATL N668DN DL
## 6 2013 1 1 5 EWR ORD N39463 UA
## 7 2013 1 1 6 EWR FLL N516JB B6
## 8 2013 1 1 6 LGA IAD N829AS EV
## 9 2013 1 1 6 JFK MCO N593JB B6
## 10 2013 1 1 6 LGA ORD N3ALAA AA
## # ... with 336,766 more rowsby = NULL (natural join): use all variables that appear in both tables:
# same as: flights2 %>% left_join(weather)
left_join(flights2, weather)
## Joining, by = c("year", "month", "day", "hour", "origin")
## # A tibble: 336,776 x 18
## year month day hour origin dest tailnum carrier temp dewp humid
## <dbl> <dbl> <int> <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA 39.0 28.0 64.4
## 2 2013 1 1 5 LGA IAH N24211 UA 39.9 25.0 54.8
## 3 2013 1 1 5 JFK MIA N619AA AA 39.0 27.0 61.6
## 4 2013 1 1 5 JFK BQN N804JB B6 39.0 27.0 61.6
## 5 2013 1 1 6 LGA ATL N668DN DL 39.9 25.0 54.8
## 6 2013 1 1 5 EWR ORD N39463 UA 39.0 28.0 64.4
## 7 2013 1 1 6 EWR FLL N516JB B6 37.9 28.0 67.2
## 8 2013 1 1 6 LGA IAD N829AS EV 39.9 25.0 54.8
## 9 2013 1 1 6 JFK MCO N593JB B6 37.9 27.0 64.3
## 10 2013 1 1 6 LGA ORD N3ALAA AA 39.9 25.0 54.8
## # ... with 336,766 more rows, and 7 more variables: wind_dir <dbl>,
## # wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## # visib <dbl>, time_hour <dttm>by = "x": use the common variable x:
# same as left_join(flights2, planes, by = "tailnum")
flights2 %>% left_join(planes, by = "tailnum")
## # A tibble: 336,776 x 16
## year.x month day hour origin dest tailnum carrier year.y type
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <int> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA 1999 Fixe…
## 2 2013 1 1 5 LGA IAH N24211 UA 1998 Fixe…
## 3 2013 1 1 5 JFK MIA N619AA AA 1990 Fixe…
## 4 2013 1 1 5 JFK BQN N804JB B6 2012 Fixe…
## 5 2013 1 1 6 LGA ATL N668DN DL 1991 Fixe…
## 6 2013 1 1 5 EWR ORD N39463 UA 2012 Fixe…
## 7 2013 1 1 6 EWR FLL N516JB B6 2000 Fixe…
## 8 2013 1 1 6 LGA IAD N829AS EV 1998 Fixe…
## 9 2013 1 1 6 JFK MCO N593JB B6 2004 Fixe…
## 10 2013 1 1 6 LGA ORD N3ALAA AA NA <NA>
## # ... with 336,766 more rows, and 6 more variables: manufacturer <chr>,
## # model <chr>, engines <int>, seats <int>, speed <int>, engine <chr>by = c("a" = "b"): match variable a in table x to the variable b in table y.
flights2 %>% left_join(airports, c("dest" = "faa")) # match destination, not origin
## # A tibble: 336,776 x 15
## year month day hour origin dest tailnum carrier name lat lon
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA Geor… 30.0 -95.3
## 2 2013 1 1 5 LGA IAH N24211 UA Geor… 30.0 -95.3
## 3 2013 1 1 5 JFK MIA N619AA AA Miam… 25.8 -80.3
## 4 2013 1 1 5 JFK BQN N804JB B6 <NA> NA NA
## 5 2013 1 1 6 LGA ATL N668DN DL Hart… 33.6 -84.4
## 6 2013 1 1 5 EWR ORD N39463 UA Chic… 42.0 -87.9
## 7 2013 1 1 6 EWR FLL N516JB B6 Fort… 26.1 -80.2
## 8 2013 1 1 6 LGA IAD N829AS EV Wash… 38.9 -77.5
## 9 2013 1 1 6 JFK MCO N593JB B6 Orla… 28.4 -81.3
## 10 2013 1 1 6 LGA ORD N3ALAA AA Chic… 42.0 -87.9
## # ... with 336,766 more rows, and 4 more variables: alt <int>, tz <dbl>,
## # dst <chr>, tzone <chr>
flights2 %>% left_join(airports, c("origin" = "faa"))
## # A tibble: 336,776 x 15
## year month day hour origin dest tailnum carrier name lat lon
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA Newa… 40.7 -74.2
## 2 2013 1 1 5 LGA IAH N24211 UA La G… 40.8 -73.9
## 3 2013 1 1 5 JFK MIA N619AA AA John… 40.6 -73.8
## 4 2013 1 1 5 JFK BQN N804JB B6 John… 40.6 -73.8
## 5 2013 1 1 6 LGA ATL N668DN DL La G… 40.8 -73.9
## 6 2013 1 1 5 EWR ORD N39463 UA Newa… 40.7 -74.2
## 7 2013 1 1 6 EWR FLL N516JB B6 Newa… 40.7 -74.2
## 8 2013 1 1 6 LGA IAD N829AS EV La G… 40.8 -73.9
## 9 2013 1 1 6 JFK MCO N593JB B6 John… 40.6 -73.8
## 10 2013 1 1 6 LGA ORD N3ALAA AA La G… 40.8 -73.9
## # ... with 336,766 more rows, and 4 more variables: alt <int>, tz <dbl>,
## # dst <chr>, tzone <chr>Combine observations (rows) from two tables.
Top 10 most popular destinations:
top_dest <- flights %>%
count(dest, sort = TRUE) %>%
head(10)
top_dest
## # A tibble: 10 x 2
## dest n
## <chr> <int>
## 1 ORD 17283
## 2 ATL 17215
## 3 LAX 16174
## 4 BOS 15508
## 5 MCO 14082
## 6 CLT 14064
## 7 SFO 13331
## 8 FLL 12055
## 9 MIA 11728
## 10 DCA 9705How to filter the flights to these destinations?
semi_join(x, y) only keeps the rows in x that have a match in y.
Useful to see what will be joined.
semi_join(flights, top_dest)
## Joining, by = "dest"
## # A tibble: 141,145 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 542 540 2 923
## 2 2013 1 1 554 600 -6 812
## 3 2013 1 1 554 558 -4 740
## 4 2013 1 1 555 600 -5 913
## 5 2013 1 1 557 600 -3 838
## 6 2013 1 1 558 600 -2 753
## 7 2013 1 1 558 600 -2 924
## 8 2013 1 1 558 600 -2 923
## 9 2013 1 1 559 559 0 702
## 10 2013 1 1 600 600 0 851
## # ... with 141,135 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>anti_join(x, y) keeps the rows that don’t have a match.
Useful for diagnosing join mismatches.
flights %>%
anti_join(planes, by = "tailnum") %>%
count(tailnum, sort = TRUE)
## # A tibble: 722 x 2
## tailnum n
## <chr> <int>
## 1 <NA> 2512
## 2 N725MQ 575
## 3 N722MQ 513
## 4 N723MQ 507
## 5 N713MQ 483
## 6 N735MQ 396
## 7 N0EGMQ 371
## 8 N534MQ 364
## 9 N542MQ 363
## 10 N531MQ 349
## # ... with 712 more rowsGenerate two tables:
df1 <- tribble(
~x, ~y,
1, 1,
2, 1
)
df2 <- tribble(
~x, ~y,
1, 1,
1, 2
)intersect(x, y) returns rows that appear in both x and y.
intersect(df1, df2)
## # A tibble: 1 x 2
## x y
## <dbl> <dbl>
## 1 1 1union(x, y) returns unique observations in x and y.
union(df1, df2)
## # A tibble: 3 x 2
## x y
## <dbl> <dbl>
## 1 1 2
## 2 2 1
## 3 1 1setdiff(x, y) returns rows that appear in x but not in y.
setdiff(df1, df2)
## # A tibble: 1 x 2
## x y
## <dbl> <dbl>
## 1 2 1
setdiff(df2, df1)
## # A tibble: 1 x 2
## x y
## <dbl> <dbl>
## 1 1 2