Visualizing Chicago Public Transit Ridership

Nov 23, 2019 5 min read Data Visualization, Spatial Analysis

In this post, I used Open data from Chicago Transit Authority (CTA) to segment routes based on ridership, and developed a visualization dashboard to identify stops with unbalanced boardings and alightings and stops with the most ridership. Get the raw data here.

load package

library(knitr)
library(tidyverse)
library(stringr)
library(sf)
library(kableExtra)
library(RColorBrewer)
library(viridis)
library(leaflet)
library(ggmap)
library(widgetframe)

raw_df<- read_csv("CTA_-_Ridership_-_Avg._Weekday_Bus_Stop_Boardings_in_October_2012.csv")

#clean raw_df 
raw_df<- 
  raw_df %>%
  mutate( location= str_replace_all(location, "[()]", "")) %>%
  separate(location, c("lat", "lon"), sep = ",") %>%
  mutate( lat =as.numeric(lat)) %>%
  mutate( lon= as.numeric(lon))

Table 1 Routes with most stops

#What is the route with the most stops?
routes <- 
  raw_df %>%
  separate_rows(routes) %>%
  group_by(routes) %>%
  summarise(Number_of_stops=n_distinct(stop_id)) %>%
  arrange(desc(Number_of_stops))

kable(routes[1:10, ],  booktabs = T) %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")

routes	Number_of_stops
9	273
49	242
151	221
8	220
3	213
82	209
62	206
79	206
N5	206
4	202

Table 2 Stops with the most routes

#What is the stop with the most routes?
stops <- 
  raw_df %>%
  mutate(number_of_routes = str_count(routes, "\\d+")) %>%
  arrange(desc(number_of_routes)) %>%
  select(stop_id,on_street, cross_street, routes, number_of_routes)

kable(stops[1:10, ], booktabs = T) %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")

stop_id	on_street	cross_street	routes	number_of_routes
1106	MICHIGAN	WASHINGTON	3,4,19,20,26,60,N66,124,143,145,147,148,151,157	14
73	MICHIGAN	betw. VAN BUREN & CONGRES	1,3,4,7,J14,26,X28,126,129,132,143,147,148	13
1103	MICHIGAN	HUBBARD (WRIGLEY BLDG.)	2,3,10,26,N66,143,144,145,146,147,148,151,157	13
1120	MICHIGAN	SOUTH WATER	3,6,20,26,N66,143,144,145,146,147,148,151,157	13
1122	MICHIGAN	HUBBARD (TRIBUNE BLDG.)	2,3,10,26,N66,143,144,145,146,147,148,151,157	13
1100	MICHIGAN	ERIE	3,10,26,33,125,143,144,145,146,147,148,151	12
1101	MICHIGAN	OHIO	2,3,10,26,143,144,145,146,147,148,151,157	12
1102	MICHIGAN	ILLINOIS	2,3,10,26,143,144,145,146,147,148,151,157	12
1105	MICHIGAN	betw. LAKE & RANDOLPH	3,4,19,26,N66,124,143,145,147,148,151,157	12
1123	MICHIGAN	GRAND	2,3,10,26,143,144,145,146,147,148,151,157	12

Figure 1 Stop Density Map

#get chicago basemap 
chi_bb <- c(left = -87.88430,
            bottom = 41.64416,
            right = -87.52570,
            top = 42.06470)

chicago_stamen <- get_stamenmap(bbox = chi_bb, zoom = 11, maptype = 'toner-lite')

# calculate stop density
df_sp <- 
  st_as_sf(raw_df, coords = c("lon", "lat"), crs = 4326) %>%
  select(-c('month_beginning','daytype'))

#plot them together
stops_density_map <- 
  ggmap(chicago_stamen)+ 
  stat_density_2d(data = raw_df,
                  aes(x = lon, y = lat, fill = stat(level)),
                  alpha = .4,
                  bins = 5,
                  geom = "polygon") +
  scale_fill_gradientn('number of stops',colors = brewer.pal(5, "YlOrRd")) +
  geom_sf(data = df_sp, inherit.aes = FALSE,color='black',
          size = 0.2, alpha=0.1,show.legend = 'point') 

stops_density_map

Figure 2 Map of stops with no ridership

#tranform to long dataframe
stops_route <- 
  raw_df %>%
  mutate(number_of_routes = str_count(routes, "\\d+")) %>%
  separate_rows(routes) %>%
  mutate( boardings = boardings/number_of_routes) %>%
  mutate(alightings =  alightings/ number_of_routes)  %>%
  select( -c(number_of_routes,daytype,month_beginning)) %>%
  gather("boardings", "alightings", key = type, value = ridership)

#make it sf object 
stops_route_sp <- 
  st_as_sf(stops_route, coords = c("lon", "lat"), crs = 4326) 

#stops with no boardings and alightings
#transform stops_no_ridership to spatial dataframe

stops_no_ridership_sp <-
  stops_route_sp %>%
  filter (ridership==0) %>%
  group_by(stop_id, routes) %>%
  filter(n()>1)

#n_distinct(stops_no_ridership_sp$stop_id)
#n_distinct(stops_no_ridership_sp$routes)

#set color scheme
pal <- colorFactor(palette = 'Spectral', domain =stops_no_ridership_sp$routes)

#plot
p1<-leaflet(stops_no_ridership_sp) %>%
  setView(lng=-87.705, lat=41.85443, zoom = 10) %>% 
  addProviderTiles(providers$Stamen.Toner,group="Stamen Toner") %>%
  addCircleMarkers( color = ~pal(routes),
                        radius = ~ ifelse(ridership <=100, 6,
                                           ifelse((ridership <=1000),10,15)),
                        stroke = FALSE, 
                        fillOpacity = 0.8,
                        popup = ~paste("Stop_id:", stop_id, 
                          "<br>", "Route:", routes, 
                          "<br>", "Ridership:", ridership)) %>%
      addLegend("topright", pal = pal, opacity = 1,
                values = ~routes, 
                title = "Routes")
frameWidget(p1, width='100%')

Table 3 Top 10 most trafficked routes

#routes by boarding and alighting ridership 
ridership_by_routes<- 
  stops_route %>%
  filter(!is.na(routes)) %>%
  group_by(routes, type) %>%
  summarise(n=sum(ridership)) %>%
  spread(type, n) %>%
  mutate(total =boardings+ alightings) %>%
  arrange(desc(total))

kable(ridership_by_routes[1:10, ], booktabs = T) %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "left")

routes	alightings	boardings	total
9	33119.68	33704.75	66824.43
79	31520.93	31741.53	63262.47
49	29305.87	29500.68	58806.55
66	26494.62	26566.83	53061.45
77	24193.93	22996.52	47190.45
8	23579.65	23555.37	47135.02
53	23540.77	23586.90	47127.67
4	23855.90	22107.08	45962.98
82	22379.65	22005.20	44384.85
63	21785.53	21732.61	43518.14

Figure 3 Stops segmentation based on Ridership

#reshape df with only lat, lon, boardings, and alights columns
#scale the dataset
km_df_scale<- 
  raw_df%>%
  select(stop_id,boardings,alightings) %>%
  column_to_rownames(var="stop_id") %>%
  scale()

library(factoextra)
set.seed(123)
# function to compute total within-cluster sum of square 
wss <- function(k) {
  kmeans(km_df_scale, k,iter.max = 100, nstart = 25 )$tot.withinss
}

# Compute and plot wss for k = 1 to k = 15
k.values <- 1:15

# extract wss for 2-15 clusters
wss_values <- map_dbl(k.values, wss)

#plot optimal K
plot(k.values, wss_values,
       type="b", pch = 19, frame = FALSE, 
       xlab="Number of clusters K",
       ylab="Total within-clusters sum of squares")

set.seed(123)

#kmeans with 3 clusters
km.res <- kmeans(km_df_scale, 3, iter.max = 100, nstart = 25)

#join with raw df
dd <- cbind(raw_df, cluster = km.res$cluster)

#plot
dd %>%
  ggplot(aes(x=boardings, y= alightings, color= factor(cluster))) +
  geom_point() +
  geom_abline(aes(intercept = 0 , slope= 1 ), linetype= "dashed") + 
  scale_x_continuous(trans = 'log') +
  scale_y_continuous(trans = 'log') +
  scale_size(range = c(0.5, 5))  +
  scale_color_brewer("", palette = "Set1", 
                     labels = c("Stops with little ridership", 
                                "Stops with most ridership", 
                                "Stops with medium ridership")) +
  theme_bw() +
  xlab("Boardings") +
  ylab("Alightings")

#visualize on a map
m = leaflet() %>%
    setView(lng=-87.705, lat=41.85443, zoom = 10) %>% 
    addProviderTiles(providers$Stamen.Toner,group="Stamen Toner") 

# subset into clusters 
data_filterlist = list(cluster_1 = subset(dd, cluster == 1),
                cluster_2 = subset(dd, cluster == 2),
                cluster_3 = subset(dd, cluster == 3))

# Remember we also had these groups associated with each variable? Let's put them in a list too:
layerlist = c("Stops with little ridership", "Stops with most ridership", "Stops with medium ridership")

# We can keep that same color variable:
colorFactors = colorFactor(c('red', 'blue', 'seagreen'), domain=dd$cluster)

# Now we have our loop - each time through the loop, it is adding our markers to the map object:

for (i in 1:length(data_filterlist)){
  m = addCircleMarkers(m, 
                       lng=data_filterlist[[i]]$lon, 
                       lat=data_filterlist[[i]]$lat, 
                       radius=1,
                       popup= paste( "Stop_id:",data_filterlist[[i]]$stop_id,  
                                     "<br>", "Routes:", data_filterlist[[i]]$routes,
                                     "<br>", "Boardings:",data_filterlist[[i]]$boardings,
                                     "<br>", "Alightings:",data_filterlist[[i]]$alightings),
                       stroke = FALSE, fillOpacity = 0.75,
                       color = colorFactors(data_filterlist[[i]]$cluster),
                       group = layerlist[i]
  )

}


p2<- addLayersControl(m, overlayGroups = layerlist, options = layersControlOptions(collapsed = FALSE))

frameWidget(p2, width='100%')

R Markdown spatial maps segmentation