Visualizing Chicago Public Transit Ridership

In this post, I used Open data from Chicago Transit Authority (CTA) to segment routes based on ridership, and developed a visualization dashboard to identify stops with unbalanced boardings and alightings and stops with the most ridership. Get the raw data here.

load package

library(knitr)
library(tidyverse)
library(stringr)
library(sf)
library(kableExtra)
library(RColorBrewer)
library(viridis)
library(leaflet)
library(ggmap)
library(widgetframe)
raw_df<- read_csv("CTA_-_Ridership_-_Avg._Weekday_Bus_Stop_Boardings_in_October_2012.csv")
#clean raw_df 
raw_df<- 
  raw_df %>%
  mutate( location= str_replace_all(location, "[()]", "")) %>%
  separate(location, c("lat", "lon"), sep = ",") %>%
  mutate( lat =as.numeric(lat)) %>%
  mutate( lon= as.numeric(lon))
Table 1 Routes with most stops
#What is the route with the most stops?
routes <- 
  raw_df %>%
  separate_rows(routes) %>%
  group_by(routes) %>%
  summarise(Number_of_stops=n_distinct(stop_id)) %>%
  arrange(desc(Number_of_stops))

kable(routes[1:10, ],  booktabs = T) %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
routes Number_of_stops
9 273
49 242
151 221
8 220
3 213
82 209
62 206
79 206
N5 206
4 202
Table 2 Stops with the most routes
#What is the stop with the most routes?
stops <- 
  raw_df %>%
  mutate(number_of_routes = str_count(routes, "\\d+")) %>%
  arrange(desc(number_of_routes)) %>%
  select(stop_id,on_street, cross_street, routes, number_of_routes)

kable(stops[1:10, ], booktabs = T) %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
stop_id on_street cross_street routes number_of_routes
1106 MICHIGAN WASHINGTON 3,4,19,20,26,60,N66,124,143,145,147,148,151,157 14
73 MICHIGAN betw. VAN BUREN & CONGRES 1,3,4,7,J14,26,X28,126,129,132,143,147,148 13
1103 MICHIGAN HUBBARD (WRIGLEY BLDG.) 2,3,10,26,N66,143,144,145,146,147,148,151,157 13
1120 MICHIGAN SOUTH WATER 3,6,20,26,N66,143,144,145,146,147,148,151,157 13
1122 MICHIGAN HUBBARD (TRIBUNE BLDG.) 2,3,10,26,N66,143,144,145,146,147,148,151,157 13
1100 MICHIGAN ERIE 3,10,26,33,125,143,144,145,146,147,148,151 12
1101 MICHIGAN OHIO 2,3,10,26,143,144,145,146,147,148,151,157 12
1102 MICHIGAN ILLINOIS 2,3,10,26,143,144,145,146,147,148,151,157 12
1105 MICHIGAN betw. LAKE & RANDOLPH 3,4,19,26,N66,124,143,145,147,148,151,157 12
1123 MICHIGAN GRAND 2,3,10,26,143,144,145,146,147,148,151,157 12
Figure 1 Stop Density Map
#get chicago basemap 
chi_bb <- c(left = -87.88430,
            bottom = 41.64416,
            right = -87.52570,
            top = 42.06470)

chicago_stamen <- get_stamenmap(bbox = chi_bb, zoom = 11, maptype = 'toner-lite')

# calculate stop density
df_sp <- 
  st_as_sf(raw_df, coords = c("lon", "lat"), crs = 4326) %>%
  select(-c('month_beginning','daytype'))

#plot them together
stops_density_map <- 
  ggmap(chicago_stamen)+ 
  stat_density_2d(data = raw_df,
                  aes(x = lon, y = lat, fill = stat(level)),
                  alpha = .4,
                  bins = 5,
                  geom = "polygon") +
  scale_fill_gradientn('number of stops',colors = brewer.pal(5, "YlOrRd")) +
  geom_sf(data = df_sp, inherit.aes = FALSE,color='black',
          size = 0.2, alpha=0.1,show.legend = 'point') 

stops_density_map

Figure 2 Map of stops with no ridership
#tranform to long dataframe
stops_route <- 
  raw_df %>%
  mutate(number_of_routes = str_count(routes, "\\d+")) %>%
  separate_rows(routes) %>%
  mutate( boardings = boardings/number_of_routes) %>%
  mutate(alightings =  alightings/ number_of_routes)  %>%
  select( -c(number_of_routes,daytype,month_beginning)) %>%
  gather("boardings", "alightings", key = type, value = ridership)

#make it sf object 
stops_route_sp <- 
  st_as_sf(stops_route, coords = c("lon", "lat"), crs = 4326) 

#stops with no boardings and alightings
#transform stops_no_ridership to spatial dataframe

stops_no_ridership_sp <-
  stops_route_sp %>%
  filter (ridership==0) %>%
  group_by(stop_id, routes) %>%
  filter(n()>1)

#n_distinct(stops_no_ridership_sp$stop_id)
#n_distinct(stops_no_ridership_sp$routes)

#set color scheme
pal <- colorFactor(palette = 'Spectral', domain =stops_no_ridership_sp$routes)

#plot
p1<-leaflet(stops_no_ridership_sp) %>%
  setView(lng=-87.705, lat=41.85443, zoom = 10) %>% 
  addProviderTiles(providers$Stamen.Toner,group="Stamen Toner") %>%
  addCircleMarkers( color = ~pal(routes),
                        radius = ~ ifelse(ridership <=100, 6,
                                           ifelse((ridership <=1000),10,15)),
                        stroke = FALSE, 
                        fillOpacity = 0.8,
                        popup = ~paste("Stop_id:", stop_id, 
                          "<br>", "Route:", routes, 
                          "<br>", "Ridership:", ridership)) %>%
      addLegend("topright", pal = pal, opacity = 1,
                values = ~routes, 
                title = "Routes")
frameWidget(p1, width='100%')
Table 3 Top 10 most trafficked routes
#routes by boarding and alighting ridership 
ridership_by_routes<- 
  stops_route %>%
  filter(!is.na(routes)) %>%
  group_by(routes, type) %>%
  summarise(n=sum(ridership)) %>%
  spread(type, n) %>%
  mutate(total =boardings+ alightings) %>%
  arrange(desc(total))

kable(ridership_by_routes[1:10, ], booktabs = T) %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
routes alightings boardings total
9 33119.68 33704.75 66824.43
79 31520.93 31741.53 63262.47
49 29305.87 29500.68 58806.55
66 26494.62 26566.83 53061.45
77 24193.93 22996.52 47190.45
8 23579.65 23555.37 47135.02
53 23540.77 23586.90 47127.67
4 23855.90 22107.08 45962.98
82 22379.65 22005.20 44384.85
63 21785.53 21732.61 43518.14
Figure 3 Stops segmentation based on Ridership
#reshape df with only lat, lon, boardings, and alights columns
#scale the dataset
km_df_scale<- 
  raw_df%>%
  select(stop_id,boardings,alightings) %>%
  column_to_rownames(var="stop_id") %>%
  scale()
library(factoextra)
set.seed(123)
# function to compute total within-cluster sum of square 
wss <- function(k) {
  kmeans(km_df_scale, k,iter.max = 100, nstart = 25 )$tot.withinss
}

# Compute and plot wss for k = 1 to k = 15
k.values <- 1:15

# extract wss for 2-15 clusters
wss_values <- map_dbl(k.values, wss)

#plot optimal K
plot(k.values, wss_values,
       type="b", pch = 19, frame = FALSE, 
       xlab="Number of clusters K",
       ylab="Total within-clusters sum of squares")

set.seed(123)

#kmeans with 3 clusters
km.res <- kmeans(km_df_scale, 3, iter.max = 100, nstart = 25)

#join with raw df
dd <- cbind(raw_df, cluster = km.res$cluster)

#plot
dd %>%
  ggplot(aes(x=boardings, y= alightings, color= factor(cluster))) +
  geom_point() +
  geom_abline(aes(intercept = 0 , slope= 1 ), linetype= "dashed") + 
  scale_x_continuous(trans = 'log') +
  scale_y_continuous(trans = 'log') +
  scale_size(range = c(0.5, 5))  +
  scale_color_brewer("", palette = "Set1", 
                     labels = c("Stops with little ridership", 
                                "Stops with most ridership", 
                                "Stops with medium ridership")) +
  theme_bw() +
  xlab("Boardings") +
  ylab("Alightings") 

#visualize on a map
m = leaflet() %>%
    setView(lng=-87.705, lat=41.85443, zoom = 10) %>% 
    addProviderTiles(providers$Stamen.Toner,group="Stamen Toner") 

# subset into clusters 
data_filterlist = list(cluster_1 = subset(dd, cluster == 1),
                cluster_2 = subset(dd, cluster == 2),
                cluster_3 = subset(dd, cluster == 3))

# Remember we also had these groups associated with each variable? Let's put them in a list too:
layerlist = c("Stops with little ridership", "Stops with most ridership", "Stops with medium ridership")

# We can keep that same color variable:
colorFactors = colorFactor(c('red', 'blue', 'seagreen'), domain=dd$cluster)

# Now we have our loop - each time through the loop, it is adding our markers to the map object:

for (i in 1:length(data_filterlist)){
  m = addCircleMarkers(m, 
                       lng=data_filterlist[[i]]$lon, 
                       lat=data_filterlist[[i]]$lat, 
                       radius=1,
                       popup= paste( "Stop_id:",data_filterlist[[i]]$stop_id,  
                                     "<br>", "Routes:", data_filterlist[[i]]$routes,
                                     "<br>", "Boardings:",data_filterlist[[i]]$boardings,
                                     "<br>", "Alightings:",data_filterlist[[i]]$alightings),
                       stroke = FALSE, fillOpacity = 0.75,
                       color = colorFactors(data_filterlist[[i]]$cluster),
                       group = layerlist[i]
  )

}


p2<- addLayersControl(m, overlayGroups = layerlist, options = layersControlOptions(collapsed = FALSE))

frameWidget(p2, width='100%')
Jane Li
Jane Li
Senior Data Scientist at Paramount Pictures