Scenario:

Imagine being a junior data analyst in the marketing team at Divvy bike share company in Chicago and working on a project to optimize the company’s profit by increasing the numbers of riders with annual membership. Your team must know the differences between casual riders and annual members to put forward the marketing strategies for the executives’ approval.

Clear Statement of Business Task (ASK)

The purpose of this project is to understand the differences between Divvy casual riders and annual members to help design an effective marketing program targeting and converting casual riders into members.

A description of all the data sources used (PREPARE)

Rides data is collected and publicly published monthly by Divvy themselves to let people analyze and make the data speak meaningful insights.

Documentation of cleaning and manipulation of data (PROCESS)

Collect data

Load R packages

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.1
## Warning: package 'tibble' was built under R version 4.1.2
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'readr' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readxl)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(skimr)
## Warning: package 'skimr' was built under R version 4.1.1

Import monthly data into R

trip_202108 <- read.csv("202108_tripdata.csv")
trip_202107 <- read.csv("202107_tripdata.csv")
trip_202106 <- read_csv("202106_tripdata.csv")
## Rows: 729595 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (9): ride_id, rideable_type, started_at, ended_at, start_station_name, s...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
trip_202105 <- read_csv("202105_tripdata.csv")
## Rows: 531633 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (9): ride_id, rideable_type, started_at, ended_at, start_station_name, s...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
trip_202104 <- read_csv("202104_tripdata.csv")
## Rows: 337230 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (9): ride_id, rideable_type, started_at, ended_at, start_station_name, s...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
trip_202103 <- read_csv("202103_tripdata.csv")
## Rows: 228496 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (9): ride_id, rideable_type, started_at, ended_at, start_station_name, s...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
trip_202102 <- read_csv("202102_tripdata.csv")
## Rows: 49622 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (9): ride_id, rideable_type, started_at, ended_at, start_station_name, s...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
trip_202101 <- read_csv("202101_tripdata.csv")
## Rows: 96834 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (9): ride_id, rideable_type, started_at, ended_at, start_station_name, s...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
trip_202109 <- read_csv("202109_tripdata.csv")
## Rows: 756147 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (9): ride_id, rideable_type, started_at, ended_at, start_station_name, s...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
trip_202110 <- read_csv("202110_tripdata.csv")
## Rows: 631226 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (9): ride_id, rideable_type, started_at, ended_at, start_station_name, s...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
trip_202111 <- read_csv("202111_tripdata.csv")
## Rows: 359978 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (9): ride_id, rideable_type, started_at, ended_at, start_station_name, s...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
trip_202112 <- read_csv("202112_tripdata.csv")
## Rows: 247540 Columns: 13
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (9): ride_id, rideable_type, started_at, ended_at, start_station_name, s...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Wrangling and Combine Data

Make sure column names, and data type of each column match perfectly before files are combined

str(trip_202101)
## spec_tbl_df [96,834 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:96834] "E19E6F1B8D4C42ED" "DC88F20C2C55F27F" "EC45C94683FE3F27" "4FA453A75AE377DB" ...
##  $ rideable_type     : chr [1:96834] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : chr [1:96834] "2021-01-23 16:14:19" "2021-01-27 18:43:08" "2021-01-21 22:35:54" "2021-01-07 13:31:13" ...
##  $ ended_at          : chr [1:96834] "2021-01-23 16:24:44" "2021-01-27 18:47:12" "2021-01-21 22:37:14" "2021-01-07 13:42:55" ...
##  $ start_station_name: chr [1:96834] "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" ...
##  $ start_station_id  : chr [1:96834] "17660" "17660" "17660" "17660" ...
##  $ end_station_name  : chr [1:96834] NA NA NA NA ...
##  $ end_station_id    : chr [1:96834] NA NA NA NA ...
##  $ start_lat         : num [1:96834] 41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num [1:96834] -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ end_lat           : num [1:96834] 41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num [1:96834] -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ member_casual     : chr [1:96834] "member" "member" "member" "member" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ride_id = col_character(),
##   ..   rideable_type = col_character(),
##   ..   started_at = col_character(),
##   ..   ended_at = col_character(),
##   ..   start_station_name = col_character(),
##   ..   start_station_id = col_character(),
##   ..   end_station_name = col_character(),
##   ..   end_station_id = col_character(),
##   ..   start_lat = col_double(),
##   ..   start_lng = col_double(),
##   ..   end_lat = col_double(),
##   ..   end_lng = col_double(),
##   ..   member_casual = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

Convert “started_at”, “ended_at” to date/time so they can bind correctly

trip_202101$started_at <- ymd_hms(trip_202101$started_at)
trip_202101$ended_at <- ymd_hms(trip_202101$ended_at)

trip_202102$started_at <- ymd_hms(trip_202102$started_at)
trip_202102$ended_at <- ymd_hms(trip_202102$ended_at)

trip_202103$started_at <- ymd_hms(trip_202103$started_at)
trip_202103$ended_at <- ymd_hms(trip_202103$ended_at)

trip_202104$started_at <- ymd_hms(trip_202104$started_at)
trip_202104$ended_at <- ymd_hms(trip_202104$ended_at)

trip_202105$started_at <- ymd_hms(trip_202105$started_at)
trip_202105$ended_at <- ymd_hms(trip_202105$ended_at)

trip_202106$started_at <- ymd_hms(trip_202106$started_at)
trip_202106$ended_at <- ymd_hms(trip_202106$ended_at)

trip_202107$started_at <- ymd_hms(trip_202107$started_at)
trip_202107$ended_at <- ymd_hms(trip_202107$ended_at)

trip_202108$started_at <- ymd_hms(trip_202108$started_at)
trip_202108$ended_at <- ymd_hms(trip_202108$ended_at)

trip_202109$started_at <- ymd_hms(trip_202109$started_at)
trip_202109$ended_at <- ymd_hms(trip_202109$ended_at)

trip_202110$started_at <- ymd_hms(trip_202110$started_at)
trip_202110$ended_at <- ymd_hms(trip_202110$ended_at)

trip_202111$started_at <- ymd_hms(trip_202111$started_at)
trip_202111$ended_at <- ymd_hms(trip_202111$ended_at)

trip_202112$started_at <- ymd_hms(trip_202112$started_at)
trip_202112$ended_at <- ymd_hms(trip_202112$ended_at)

We perform data transformation on “start_station_id” to proper ‘character’/string

trip_202101 %>%
  mutate(rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id))
trip_202102 %>%
  mutate(rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id))
trip_202103 %>%
  mutate(rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id))
trip_202104 %>%
  mutate(rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id))
trip_202105 %>%
  mutate(rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id))
trip_202106 %>%
  mutate(rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id))
trip_202107 %>%
  mutate(rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id))
trip_202108 %>%
  mutate(rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id))
trip_202109 %>%
  mutate(rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id))
trip_202110 %>%
  mutate(rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id))
trip_202111 %>%
  mutate(rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id))
trip_202112 %>%
  mutate(rideable_type = as.character(rideable_type), start_station_id = as.character(start_station_id))

Combine them into a single data frame

all_trip <- rbind(trip_202108, trip_202107, trip_202106, trip_202105, 
                  trip_202104, trip_202103, trip_202102,trip_202101, 
                  trip_202112, trip_202111, trip_202110, trip_202109)

Rename some of the columns for clarity:

all_trip <- rename(all_trip, bike_type = rideable_type, id  = ride_id, 
                     member_type = member_casual, start_at = started_at, 
                     end_at = ended_at)

Remove columns having little relevance to our analysis such as coordinates. We would keep the “station_name” column in lieu of “station_id” to keep the data frame clean and reduce redundancy.

all_trip <- all_trip %>%
  select(-c(start_lat, start_lng, end_lat, end_lng, start_station_id, 
            end_station_id))

Data cleaning & Manipulation

View the statistical summary of the data frame

summary(all_trip)
##       id             bike_type            start_at                  
##  Length:5595063     Length:5595063     Min.   :2021-01-01 00:02:05  
##  Class :character   Class :character   1st Qu.:2021-06-06 23:52:40  
##  Mode  :character   Mode  :character   Median :2021-08-01 01:52:11  
##                                        Mean   :2021-07-29 07:41:02  
##                                        3rd Qu.:2021-09-24 16:36:16  
##                                        Max.   :2021-12-31 23:59:48  
##      end_at                    start_station_name end_station_name  
##  Min.   :2021-01-01 00:08:39   Length:5595063     Length:5595063    
##  1st Qu.:2021-06-07 00:44:21   Class :character   Class :character  
##  Median :2021-08-01 02:21:55   Mode  :character   Mode  :character  
##  Mean   :2021-07-29 08:02:58                                        
##  3rd Qu.:2021-09-24 16:54:05                                        
##  Max.   :2022-01-03 17:32:18                                        
##  member_type       
##  Length:5595063    
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

Check the data completeness, unique values, and any N/As

skim(all_trip)
Data summary
Name all_trip
Number of rows 5595063
Number of columns 7
_______________________
Column type frequency:
character 5
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
id 0 1.00 8 16 0 5595048 0
bike_type 0 1.00 11 13 0 3 0
start_station_name 515088 0.91 0 53 175721 848 0
end_station_name 551897 0.90 0 53 187273 845 0
member_type 0 1.00 6 6 0 2 0

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
start_at 0 1 2021-01-01 00:02:05 2021-12-31 23:59:48 2021-08-01 01:52:11 4677998
end_at 0 1 2021-01-01 00:08:39 2022-01-03 17:32:18 2021-08-01 02:21:55 4671372

There are a few observations/problems we would address:

  • As of December 2020, Divvy introduced “classic bike” to their current fleet including docked bike and e-bike, resulting in classic bike data is not be available before December 2020.

  • Date columns (started_at, ended_at) are too general so we have to add granularity to the data for more opportunities for analyzing. We then extract these attributes below:

    • trip_duration
    • month
    • date
    • start_hour
    • day_of_week
all_trip <- all_trip %>%
  mutate(trip_duration = round(difftime(end_at, start_at, units = "mins"),2)) %>%
  mutate(month = month(start_at, label = TRUE, abbr = TRUE)) %>%
  mutate(date = date(start_at)) %>%
  mutate(start_hour = hour(start_at)) %>%
  mutate(weekday = wday(start_at, label = TRUE, week_start = 1))

Divvy claims that there are several hundreds data points having trip duration less than 60 seconds or negative due to their system testing. Also, trips whose duration is more than 14400 mins (864000 seconds) are deemed to have the bike stolen. These observations will be filtered out to ensure data integrity.

all_trip <- all_trip %>%
  filter(trip_duration >= 1, trip_duration <14400)

Divvy system errors resulted in several data points with missing values in “start_station_name”, “end_station_name”, “start_station_id”, “end_station_id”, which we must remove.

Trips started or ended in either of the stations below would also be dropped as they are rides to deploy or collect bikes from administration stations:

  • DIVVY CASSETTE REPAIR MOBILE STATION
  • HUBBARD ST BIKE CHECKING
  • WATSON TESTING DIVVY
all_trip <- all_trip %>%
  filter(start_station_name != "DIVVY CASSETTE REPAIR MOBILE STATION", 
         start_station_name != "HUBBARD ST BIKE CHECKING", 
         start_station_name != "WATSON TESTING DIVVY",
         start_station_name != "",
         end_station_name != "DIVVY CASSETTE REPAIR MOBILE STATION", 
         end_station_name != "HUBBARD ST BIKE CHECKING", 
         end_station_name != "WATSON TESTING DIVVY",
         end_station_name != "")

Again, checking for data integrity before moving on to mining the data

skim(all_trip)
Data summary
Name all_trip
Number of rows 4528752
Number of columns 12
_______________________
Column type frequency:
character 5
Date 1
difftime 1
factor 2
numeric 1
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
id 0 1 8 16 0 4528744 0
bike_type 0 1 11 13 0 3 0
start_station_name 0 1 3 53 0 840 0
end_station_name 0 1 10 53 0 837 0
member_type 0 1 6 6 0 2 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
date 0 1 2021-01-01 2021-12-31 2021-07-28 365

Variable type: difftime

skim_variable n_missing complete_rate min max median n_unique
trip_duration 0 1 1 mins 14229.63 mins 12.38 mins 23361

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
month 0 1 TRUE 12 Jul: 683153, Aug: 666081, Sep: 613354, Jun: 600456
weekday 0 1 TRUE 7 Sat: 814807, Sun: 705401, Fri: 647393, Wed: 607973

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
start_hour 0 1 14.25 5 0 11 15 18 23 ▁▃▅▇▃

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
start_at 0 1 2021-01-01 00:02:24 2021-12-31 23:59:48 2021-07-28 17:31:51 3889471
end_at 0 1 2021-01-01 00:08:39 2022-01-03 17:32:18 2021-07-28 17:49:14 3882021

Descriptive Analysis

Average trip time for each member type

aggregate(all_trip$trip_duration ~ all_trip$member_type, FUN = mean)
##   all_trip$member_type all_trip$trip_duration
## 1               casual          30.71387 mins
## 2               member          13.38241 mins
aggregate(all_trip$trip_duration ~ all_trip$member_type, FUN = median)
##   all_trip$member_type all_trip$trip_duration
## 1               casual             16.83 mins
## 2               member              9.87 mins
aggregate(all_trip$trip_duration ~ all_trip$member_type, FUN = max)
##   all_trip$member_type all_trip$trip_duration
## 1               casual          14229.63 mins
## 2               member           1495.63 mins
aggregate(all_trip$trip_duration ~ all_trip$member_type, FUN = min)
##   all_trip$member_type all_trip$trip_duration
## 1               casual                 1 mins
## 2               member                 1 mins

Average ride duration in weekdays for members vs casual users:

aggregate(all_trip$trip_duration ~ all_trip$member_type + all_trip$weekday, 
          FUN = mean)
##    all_trip$member_type all_trip$weekday all_trip$trip_duration
## 1                casual              Mon          31.51065 mins
## 2                member              Mon          12.91754 mins
## 3                casual              Tue          27.95459 mins
## 4                member              Tue          12.56382 mins
## 5                casual              Wed          26.63141 mins
## 6                member              Wed          12.63290 mins
## 7                casual              Thu          26.23866 mins
## 8                member              Thu          12.52900 mins
## 9                casual              Fri          28.46973 mins
## 10               member              Fri          12.98368 mins
## 11               casual              Sat          33.03606 mins
## 12               member              Sat          15.06266 mins
## 13               casual              Sun          35.34378 mins
## 14               member              Sun          15.45264 mins

Number of rides by member types in weekdays plus trip duration average

all_trip %>%
  group_by(member_type, weekday) %>%
  summarise(number_of_rides = n(), average_duration = mean(trip_duration)) %>%
  arrange(member_type, weekday)
## `summarise()` has grouped output by 'member_type'. You can override using the `.groups` argument.
## # A tibble: 14 x 4
## # Groups:   member_type [2]
##    member_type weekday number_of_rides average_duration
##    <chr>       <ord>             <int> <drtn>          
##  1 casual      Mon              226601 31.51065 mins   
##  2 casual      Tue              212801 27.95459 mins   
##  3 casual      Wed              215970 26.63141 mins   
##  4 casual      Thu              222039 26.23866 mins   
##  5 casual      Fri              287170 28.46973 mins   
##  6 casual      Sat              463651 33.03606 mins   
##  7 casual      Sun              399524 35.34378 mins   
##  8 member      Mon              341154 12.91754 mins   
##  9 member      Tue              382509 12.56382 mins   
## 10 member      Wed              392003 12.63290 mins   
## 11 member      Thu              368074 12.52900 mins   
## 12 member      Fri              360223 12.98368 mins   
## 13 member      Sat              351156 15.06266 mins   
## 14 member      Sun              305877 15.45264 mins

Identify top 5 stations to begin bike rides (Members and Casuals)

all_trip %>%
  filter(member_type == "member") %>%
  select(start_station_name, member_type) %>%
  count(start_station_name, sort = TRUE) %>%
  top_n(5)
## Selecting by n
##         start_station_name     n
## 1        Clark St & Elm St 23571
## 2    Wells St & Concord Ln 22474
## 3 Kingsbury St & Kinzie St 22359
## 4        Wells St & Elm St 19978
## 5    Dearborn St & Erie St 18365
all_trip %>%
  filter(member_type == 'casual') %>%
  group_by(start_station_name) %>%
  summarise(total_ride = n()) %>%
  arrange(desc(total_ride)) %>%
  top_n(5)
## Selecting by total_ride
## # A tibble: 5 x 2
##   start_station_name      total_ride
##   <chr>                        <int>
## 1 Streeter Dr & Grand Ave      63720
## 2 Millennium Park              31832
## 3 Michigan Ave & Oak St        28343
## 4 Shedd Aquarium               22291
## 5 Theater on the Lake          20389

Visualization

Total weekdays trips among members and casuals

all_trip %>%
  group_by(member_type, weekday) %>%
  summarise(total_trip = n()) %>%
  arrange(member_type, weekday) %>%
  ggplot(aes(x = weekday, y = total_trip, fill = member_type)) + 
  geom_col(position = "dodge")
## `summarise()` has grouped output by 'member_type'. You can override using the `.groups` argument.

Weekdays trip duration by members and casuals

all_trip %>%
  group_by(member_type, weekday) %>%
  summarise(average_duration = mean(trip_duration)) %>%
  arrange(member_type, weekday) %>%
  ggplot(aes(x = weekday, y = average_duration, fill = member_type)) + geom_col(position = "dodge")
## `summarise()` has grouped output by 'member_type'. You can override using the `.groups` argument.
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

Rides in 2021 to identify seasonality or any trends.

all_trip %>%
  group_by(date, member_type) %>%
  summarise(total_trip = n())%>%
  ggplot(aes(x = date, y = total_trip, fill = member_type)) + 
  geom_area()
## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.

Display trip duration average difference between members and casuals:

ggplot(data = all_trip, aes(x = member_type, 
                              y = as.numeric(trip_duration),
                              fill = member_type)) +
  geom_boxplot(outlier.shape = NA) +
  coord_cartesian(ylim =  c(0, 80)) +
  stat_summary(fun=mean, geom="point", color="blue") +
  annotate(geom = "text", x = 1.25, y = 29, label = "Ave = 32.68 mins") +
  annotate(geom = "text", x = 2.30, y = 15, label = "Ave = 14.32 mins")

Rides in times of day

all_trip %>%
  group_by(member_type, start_hour) %>%
  summarise(total_rides = n()) %>%
  ggplot(aes(start_hour, total_rides, fill = member_type)) + 
  geom_col(position = "dodge")
## `summarise()` has grouped output by 'member_type'. You can override using the `.groups` argument.

Top Stations for members to start their trips:

all_trip %>%
  filter(member_type == 'member') %>%
  group_by(start_station_name) %>%
  summarise(trip_count = n()) %>%
  arrange(desc(trip_count)) %>%
  top_n(5) %>%
  ggplot(aes(x=trip_count, y=reorder(start_station_name, trip_count))) + 
  geom_col(fill = "#33cccc") + 
  labs(title = "Top Start Stations by Members", x="Trips", y=" ") +
  scale_x_continuous(labels = scales::comma) +
  theme_minimal()
## Selecting by trip_count

Top Stations for casual riders to begin their trips:

all_trip %>%
  filter(member_type == 'casual') %>%
  group_by(start_station_name) %>%
  summarise(trip_count = n()) %>%
  arrange(desc(trip_count)) %>%
  top_n(5) %>%
  ggplot(aes(x=trip_count, y=reorder(start_station_name, trip_count))) + 
  geom_col(fill = "sky blue") + 
  labs(title = "Top Start Stations by Casuals", x="Trips", y=" ") +
  scale_x_continuous(labels = scales::comma) +
  theme_minimal()
## Selecting by trip_count

Member types and their bike preferences

all_trip %>%
  group_by(member_type, bike_type) %>%
  summarise(trip_count = n()) %>%
  ggplot(aes(member_type,trip_count, fill = member_type)) +
  geom_col(position = "dodge") +  
  labs(title = "Member Type & Bike Preferences", x=" ", y = "Trips") +
  theme(legend.position = "none") +
  scale_y_continuous(labels = scales::comma) +
  facet_wrap(~bike_type)
## `summarise()` has grouped output by 'member_type'. You can override using the `.groups` argument.

Key Findings:

Recommendations:

Export cleaned data into csv for further analysis in Tableau

all_trip %>%
  select(-c(weekday, month, end_at, start_hour, date)) %>%
  write.csv(file = "all_trip.csv")

View this analysis visualization on my Tableau Dashboard here