I am a junior data analyst working in the marketing analyst team at Cyclistic, a bike-share company in Chicago. The director of marketing believes the company’s future success depends on maximizing the number of annual memberships. Therefore, my team wants to understand how casual riders and annual members use Cyclistic bikes differently. From these insights, my team will design a new marketing strategy to convert casual riders into annual members. But first, Cyclistic executives must approve my recommendations, so they must be backed up with compelling data insights and professional data visualizations.
library(tidyverse)
library(lubridate)
library(ggplot2)
Upload Divvy Datasets:
q2_2019 <- read.csv("Dataset/Raw/Divvy_Trips_2019_Q2.csv")
q3_2019 <- read.csv("Dataset/Raw/Divvy_Trips_2019_Q3.csv")
q4_2019 <- read.csv("Dataset/Raw/Divvy_Trips_2019_Q4.csv")
q1_2020 <- read.csv("Dataset/Raw/Divvy_Trips_2020_Q1.csv")
Compare column names of each data set:
colnames(q1_2020)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(q4_2019)
## [1] "trip_id" "start_time" "end_time"
## [4] "bikeid" "tripduration" "from_station_id"
## [7] "from_station_name" "to_station_id" "to_station_name"
## [10] "usertype" "gender" "birthyear"
colnames(q3_2019)
## [1] "trip_id" "start_time" "end_time"
## [4] "bikeid" "tripduration" "from_station_id"
## [7] "from_station_name" "to_station_id" "to_station_name"
## [10] "usertype" "gender" "birthyear"
colnames(q2_2019)
## [1] "X01...Rental.Details.Rental.ID"
## [2] "X01...Rental.Details.Local.Start.Time"
## [3] "X01...Rental.Details.Local.End.Time"
## [4] "X01...Rental.Details.Bike.ID"
## [5] "X01...Rental.Details.Duration.In.Seconds.Uncapped"
## [6] "X03...Rental.Start.Station.ID"
## [7] "X03...Rental.Start.Station.Name"
## [8] "X02...Rental.End.Station.ID"
## [9] "X02...Rental.End.Station.Name"
## [10] "User.Type"
## [11] "Member.Gender"
## [12] "X05...Member.Details.Member.Birthday.Year"
There is inconsistency in the column names across each data set.
Rename column names to make it consistent with q1_2020:
q2_2019 <- rename(q2_2019
,ride_id = "X01...Rental.Details.Rental.ID"
,rideable_type = "X01...Rental.Details.Bike.ID"
,started_at = "X01...Rental.Details.Local.Start.Time"
,ended_at = "X01...Rental.Details.Local.End.Time"
,start_station_name = "X03...Rental.Start.Station.Name"
,start_station_id = "X03...Rental.Start.Station.ID"
,end_station_name = "X02...Rental.End.Station.Name"
,end_station_id = "X02...Rental.End.Station.ID"
,member_casual = "User.Type"
,tripduration = "X01...Rental.Details.Duration.In.Seconds.Uncapped"
,gender = "Member.Gender"
,birthyear = "X05...Member.Details.Member.Birthday.Year")
q3_2019 <- rename(q3_2019
,ride_id = trip_id
,rideable_type = bikeid
,started_at = start_time
,ended_at = end_time
,start_station_name = from_station_name
,start_station_id = from_station_id
,end_station_name = to_station_name
,end_station_id = to_station_id
,member_casual = usertype)
q4_2019 <- rename(q4_2019
,ride_id = trip_id
,rideable_type = bikeid
,started_at = start_time
,ended_at = end_time
,start_station_name = from_station_name
,start_station_id = from_station_id
,end_station_name = to_station_name
,end_station_id = to_station_id
,member_casual = usertype)
Compare column names of each data frame again:
colnames(q1_2020)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
colnames(q4_2019)
## [1] "ride_id" "started_at" "ended_at"
## [4] "rideable_type" "tripduration" "start_station_id"
## [7] "start_station_name" "end_station_id" "end_station_name"
## [10] "member_casual" "gender" "birthyear"
colnames(q3_2019)
## [1] "ride_id" "started_at" "ended_at"
## [4] "rideable_type" "tripduration" "start_station_id"
## [7] "start_station_name" "end_station_id" "end_station_name"
## [10] "member_casual" "gender" "birthyear"
colnames(q2_2019)
## [1] "ride_id" "started_at" "ended_at"
## [4] "rideable_type" "tripduration" "start_station_id"
## [7] "start_station_name" "end_station_id" "end_station_name"
## [10] "member_casual" "gender" "birthyear"
Column names of are consistent now.
Inspect the data frames and look for inconsistency:
str(q1_2020)
## 'data.frame': 426887 obs. of 13 variables:
## $ ride_id : chr "EACB19130B0CDA4A" "8FED874C809DC021" "789F3C21E472CA96" "C9A388DAC6ABF313" ...
## $ rideable_type : chr "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
## $ started_at : chr "2020-01-21 20:06:59" "2020-01-30 14:22:39" "2020-01-09 19:29:26" "2020-01-06 16:17:07" ...
## $ ended_at : chr "2020-01-21 20:14:30" "2020-01-30 14:26:22" "2020-01-09 19:32:17" "2020-01-06 16:25:56" ...
## $ start_station_name: chr "Western Ave & Leland Ave" "Clark St & Montrose Ave" "Broadway & Belmont Ave" "Clark St & Randolph St" ...
## $ start_station_id : int 239 234 296 51 66 212 96 96 212 38 ...
## $ end_station_name : chr "Clark St & Leland Ave" "Southport Ave & Irving Park Rd" "Wilton Ave & Belmont Ave" "Fairbanks Ct & Grand Ave" ...
## $ end_station_id : int 326 318 117 24 212 96 212 212 96 100 ...
## $ start_lat : num 42 42 41.9 41.9 41.9 ...
## $ start_lng : num -87.7 -87.7 -87.6 -87.6 -87.6 ...
## $ end_lat : num 42 42 41.9 41.9 41.9 ...
## $ end_lng : num -87.7 -87.7 -87.7 -87.6 -87.6 ...
## $ member_casual : chr "member" "member" "member" "member" ...
str(q4_2019)
## 'data.frame': 704054 obs. of 12 variables:
## $ ride_id : int 25223640 25223641 25223642 25223643 25223644 25223645 25223646 25223647 25223648 25223649 ...
## $ started_at : chr "2019-10-01 00:01:39" "2019-10-01 00:02:16" "2019-10-01 00:04:32" "2019-10-01 00:04:32" ...
## $ ended_at : chr "2019-10-01 00:17:20" "2019-10-01 00:06:34" "2019-10-01 00:18:43" "2019-10-01 00:43:43" ...
## $ rideable_type : int 2215 6328 3003 3275 5294 1891 1061 1274 6011 2957 ...
## $ tripduration : chr "940.0" "258.0" "850.0" "2,350.0" ...
## $ start_station_id : int 20 19 84 313 210 156 84 156 156 336 ...
## $ start_station_name: chr "Sheffield Ave & Kingsbury St" "Throop (Loomis) St & Taylor St" "Milwaukee Ave & Grand Ave" "Lakeview Ave & Fullerton Pkwy" ...
## $ end_station_id : int 309 241 199 290 382 226 142 463 463 336 ...
## $ end_station_name : chr "Leavitt St & Armitage Ave" "Morgan St & Polk St" "Wabash Ave & Grand Ave" "Kedzie Ave & Palmer Ct" ...
## $ member_casual : chr "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ gender : chr "Male" "Male" "Female" "Male" ...
## $ birthyear : int 1987 1998 1991 1990 1987 1994 1991 1995 1993 NA ...
str(q3_2019)
## 'data.frame': 1640718 obs. of 12 variables:
## $ ride_id : int 23479388 23479389 23479390 23479391 23479392 23479393 23479394 23479395 23479396 23479397 ...
## $ started_at : chr "2019-07-01 00:00:27" "2019-07-01 00:01:16" "2019-07-01 00:01:48" "2019-07-01 00:02:07" ...
## $ ended_at : chr "2019-07-01 00:20:41" "2019-07-01 00:18:44" "2019-07-01 00:27:42" "2019-07-01 00:27:10" ...
## $ rideable_type : int 3591 5353 6180 5540 6014 4941 3770 5442 2957 6091 ...
## $ tripduration : chr "1,214.0" "1,048.0" "1,554.0" "1,503.0" ...
## $ start_station_id : int 117 381 313 313 168 300 168 313 43 43 ...
## $ start_station_name: chr "Wilton Ave & Belmont Ave" "Western Ave & Monroe St" "Lakeview Ave & Fullerton Pkwy" "Lakeview Ave & Fullerton Pkwy" ...
## $ end_station_id : int 497 203 144 144 62 232 62 144 195 195 ...
## $ end_station_name : chr "Kimball Ave & Belmont Ave" "Western Ave & 21st St" "Larrabee St & Webster Ave" "Larrabee St & Webster Ave" ...
## $ member_casual : chr "Subscriber" "Customer" "Customer" "Customer" ...
## $ gender : chr "Male" "" "" "" ...
## $ birthyear : int 1992 NA NA NA NA 1990 NA NA NA NA ...
str(q2_2019)
## 'data.frame': 1108163 obs. of 12 variables:
## $ ride_id : int 22178529 22178530 22178531 22178532 22178533 22178534 22178535 22178536 22178537 22178538 ...
## $ started_at : chr "2019-04-01 00:02:22" "2019-04-01 00:03:02" "2019-04-01 00:11:07" "2019-04-01 00:13:01" ...
## $ ended_at : chr "2019-04-01 00:09:48" "2019-04-01 00:20:30" "2019-04-01 00:15:19" "2019-04-01 00:18:58" ...
## $ rideable_type : int 6251 6226 5649 4151 3270 3123 6418 4513 3280 5534 ...
## $ tripduration : chr "446.0" "1,048.0" "252.0" "357.0" ...
## $ start_station_id : int 81 317 283 26 202 420 503 260 211 211 ...
## $ start_station_name: chr "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
## $ end_station_id : int 56 59 174 133 129 426 500 499 211 211 ...
## $ end_station_name : chr "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
## $ member_casual : chr "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
## $ gender : chr "Male" "Female" "Male" "Male" ...
## $ birthyear : int 1975 1984 1990 1993 1992 1999 1969 1991 NA NA ...
Convert ‘ride_id’ and ‘rideable_type’ to character so that they are consistent across data frames:
q4_2019 <- mutate(q4_2019, ride_id = as.character(ride_id)
,rideable_type = as.character(rideable_type))
q3_2019 <- mutate(q3_2019, ride_id = as.character(ride_id)
,rideable_type = as.character(rideable_type))
q2_2019 <- mutate(q2_2019, ride_id = as.character(ride_id)
,rideable_type = as.character(rideable_type))
divvy_trips <- bind_rows(q2_2019, q3_2019, q4_2019, q1_2020)
head(divvy_trips)
## ride_id started_at ended_at rideable_type tripduration
## 1 22178529 2019-04-01 00:02:22 2019-04-01 00:09:48 6251 446.0
## 2 22178530 2019-04-01 00:03:02 2019-04-01 00:20:30 6226 1,048.0
## 3 22178531 2019-04-01 00:11:07 2019-04-01 00:15:19 5649 252.0
## 4 22178532 2019-04-01 00:13:01 2019-04-01 00:18:58 4151 357.0
## 5 22178533 2019-04-01 00:19:26 2019-04-01 00:36:13 3270 1,007.0
## 6 22178534 2019-04-01 00:19:39 2019-04-01 00:23:56 3123 257.0
## start_station_id start_station_name end_station_id
## 1 81 Daley Center Plaza 56
## 2 317 Wood St & Taylor St 59
## 3 283 LaSalle St & Jackson Blvd 174
## 4 26 McClurg Ct & Illinois St 133
## 5 202 Halsted St & 18th St 129
## 6 420 Ellis Ave & 55th St 426
## end_station_name member_casual gender birthyear start_lat start_lng
## 1 Desplaines St & Kinzie St Subscriber Male 1975 NA NA
## 2 Wabash Ave & Roosevelt Rd Subscriber Female 1984 NA NA
## 3 Canal St & Madison St Subscriber Male 1990 NA NA
## 4 Kingsbury St & Kinzie St Subscriber Male 1993 NA NA
## 5 Blue Island Ave & 18th St Subscriber Male 1992 NA NA
## 6 Ellis Ave & 60th St Subscriber Male 1999 NA NA
## end_lat end_lng
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
Remove ‘birthyear’, ‘gender’ and ‘tripduration’ as this data was not included in q1 2020. Remove lat and long as it was only included in q1 2020:
divvy_trips <- divvy_trips %>%
select(-c(birthyear, gender, tripduration
, start_lat, start_lng, end_lat, end_lng))
colnames(divvy_trips)
## [1] "ride_id" "started_at" "ended_at"
## [4] "rideable_type" "start_station_id" "start_station_name"
## [7] "end_station_id" "end_station_name" "member_casual"
Inspect the new table that has been created:
# column names
colnames(divvy_trips)
## [1] "ride_id" "started_at" "ended_at"
## [4] "rideable_type" "start_station_id" "start_station_name"
## [7] "end_station_id" "end_station_name" "member_casual"
# number of rows
nrow(divvy_trips)
## [1] 3879822
# dimension of data frame
dim(divvy_trips)
## [1] 3879822 9
# first/last 6 rows of data fram
head(divvy_trips)
## ride_id started_at ended_at rideable_type
## 1 22178529 2019-04-01 00:02:22 2019-04-01 00:09:48 6251
## 2 22178530 2019-04-01 00:03:02 2019-04-01 00:20:30 6226
## 3 22178531 2019-04-01 00:11:07 2019-04-01 00:15:19 5649
## 4 22178532 2019-04-01 00:13:01 2019-04-01 00:18:58 4151
## 5 22178533 2019-04-01 00:19:26 2019-04-01 00:36:13 3270
## 6 22178534 2019-04-01 00:19:39 2019-04-01 00:23:56 3123
## start_station_id start_station_name end_station_id
## 1 81 Daley Center Plaza 56
## 2 317 Wood St & Taylor St 59
## 3 283 LaSalle St & Jackson Blvd 174
## 4 26 McClurg Ct & Illinois St 133
## 5 202 Halsted St & 18th St 129
## 6 420 Ellis Ave & 55th St 426
## end_station_name member_casual
## 1 Desplaines St & Kinzie St Subscriber
## 2 Wabash Ave & Roosevelt Rd Subscriber
## 3 Canal St & Madison St Subscriber
## 4 Kingsbury St & Kinzie St Subscriber
## 5 Blue Island Ave & 18th St Subscriber
## 6 Ellis Ave & 60th St Subscriber
tail(divvy_trips)
## ride_id started_at ended_at rideable_type
## 3879817 6F4D221BDDFD943F 2020-03-10 10:40:27 2020-03-10 10:40:29 docked_bike
## 3879818 ADDAA33CEBCAE733 2020-03-10 10:40:06 2020-03-10 10:40:07 docked_bike
## 3879819 82B10FA3994BC66A 2020-03-07 15:25:55 2020-03-07 16:14:03 docked_bike
## 3879820 AA0D5AAA0B59C8AA 2020-03-01 13:12:38 2020-03-01 13:38:29 docked_bike
## 3879821 3296360A7BC20FB8 2020-03-07 18:02:45 2020-03-07 18:13:18 docked_bike
## 3879822 064EC7698E4FF9B3 2020-03-08 13:03:57 2020-03-08 13:32:27 docked_bike
## start_station_id start_station_name end_station_id
## 3879817 675 HQ QR 675
## 3879818 675 HQ QR 675
## 3879819 161 Rush St & Superior St 240
## 3879820 141 Clark St & Lincoln Ave 210
## 3879821 672 Franklin St & Illinois St 264
## 3879822 110 Dearborn St & Erie St 85
## end_station_name member_casual
## 3879817 HQ QR casual
## 3879818 HQ QR casual
## 3879819 Sheridan Rd & Irving Park Rd member
## 3879820 Ashland Ave & Division St casual
## 3879821 Stetson Ave & South Water St member
## 3879822 Michigan Ave & Oak St casual
#See list of columns and data types (numeric, character, etc)
str(divvy_trips)
## 'data.frame': 3879822 obs. of 9 variables:
## $ ride_id : chr "22178529" "22178530" "22178531" "22178532" ...
## $ started_at : chr "2019-04-01 00:02:22" "2019-04-01 00:03:02" "2019-04-01 00:11:07" "2019-04-01 00:13:01" ...
## $ ended_at : chr "2019-04-01 00:09:48" "2019-04-01 00:20:30" "2019-04-01 00:15:19" "2019-04-01 00:18:58" ...
## $ rideable_type : chr "6251" "6226" "5649" "4151" ...
## $ start_station_id : int 81 317 283 26 202 420 503 260 211 211 ...
## $ start_station_name: chr "Daley Center Plaza" "Wood St & Taylor St" "LaSalle St & Jackson Blvd" "McClurg Ct & Illinois St" ...
## $ end_station_id : int 56 59 174 133 129 426 500 499 211 211 ...
## $ end_station_name : chr "Desplaines St & Kinzie St" "Wabash Ave & Roosevelt Rd" "Canal St & Madison St" "Kingsbury St & Kinzie St" ...
## $ member_casual : chr "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
1. In the “member_casual” column, there are two names for members (“member” and “Subscriber”) and two names for casual riders (“Customer” and “casual”). I will need to consolidate that from four to two labels.
In the “member_casual” column, replace “Subscriber” with “member” and “Customer” with “casual”:
# Begin by seeing how many observations fall under each usertype
table(divvy_trips$member_casual)
##
## casual Customer member Subscriber
## 48480 857474 378407 2595461
# Reassign member_casual to the desired values
divvy_trips <- divvy_trips %>%
mutate(member_casual = recode(member_casual
, "Subscriber" = "member"
, "Customer" = "casual"))
# Check to make sure the proper number of observations were reassigned
table(divvy_trips$member_casual)
##
## casual member
## 905954 2973868
2. Add columns that list the date, month, day, and year of each ride. This will allow us to aggregate ride data for each month, day, or year:
# Date in yyyy-mm-dd
divvy_trips$date <- as.Date(divvy_trips$started_at)
# year, month, day, day of week
divvy_trips$year <- format(as.Date(divvy_trips$date), "%Y")
divvy_trips$month <- format(as.Date(divvy_trips$date), "%m")
divvy_trips$day <- format(as.Date(divvy_trips$date), "%d")
divvy_trips$day_of_week <- format(as.Date(divvy_trips$date), "%A")
Order ‘day_of_week’ column from ‘Sunday’ to ‘Saturday’:
# order day_of_week
divvy_trips$day_of_week <- ordered(divvy_trips$day_of_week
, levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
3. Add ‘ride_duration’ column for duration of the rides
divvy_trips$ride_duration = difftime(divvy_trips$ended_at, divvy_trips$started_at)
Inspect structure of ‘ride_duration’ column:
str(divvy_trips$ride_duration)
## 'difftime' num [1:3879822] 446 1048 252 357 ...
## - attr(*, "units")= chr "secs"
Convert “ride_duration” to numeric so that calculation can be performed on it:
divvy_trips$ride_duration <- as.numeric(divvy_trips$ride_duration)
is.numeric(divvy_trips$ride_duration)
## [1] TRUE
4. Clean the “bad” data
Number of negative ride duration:
length(divvy_trips$ride_id[divvy_trips$ride_duration < 0])
## [1] 130
Number of bikes taken out of docks and checked for quality by Divvy:
length(divvy_trips$ride_id[divvy_trips$start_station_name == "HQ QR"])
## [1] 3767
Remove rows with bikes taken out of docks and negative ride duration:
divvy_trips_cleaned <- divvy_trips[!(divvy_trips$start_station_name == "HQ QR"
| divvy_trips$ride_duration < 0), ]
head(divvy_trips_cleaned)
## ride_id started_at ended_at rideable_type
## 1 22178529 2019-04-01 00:02:22 2019-04-01 00:09:48 6251
## 2 22178530 2019-04-01 00:03:02 2019-04-01 00:20:30 6226
## 3 22178531 2019-04-01 00:11:07 2019-04-01 00:15:19 5649
## 4 22178532 2019-04-01 00:13:01 2019-04-01 00:18:58 4151
## 5 22178533 2019-04-01 00:19:26 2019-04-01 00:36:13 3270
## 6 22178534 2019-04-01 00:19:39 2019-04-01 00:23:56 3123
## start_station_id start_station_name end_station_id
## 1 81 Daley Center Plaza 56
## 2 317 Wood St & Taylor St 59
## 3 283 LaSalle St & Jackson Blvd 174
## 4 26 McClurg Ct & Illinois St 133
## 5 202 Halsted St & 18th St 129
## 6 420 Ellis Ave & 55th St 426
## end_station_name member_casual date year month day day_of_week
## 1 Desplaines St & Kinzie St member 2019-04-01 2019 04 01 Monday
## 2 Wabash Ave & Roosevelt Rd member 2019-04-01 2019 04 01 Monday
## 3 Canal St & Madison St member 2019-04-01 2019 04 01 Monday
## 4 Kingsbury St & Kinzie St member 2019-04-01 2019 04 01 Monday
## 5 Blue Island Ave & 18th St member 2019-04-01 2019 04 01 Monday
## 6 Ellis Ave & 60th St member 2019-04-01 2019 04 01 Monday
## ride_duration
## 1 446
## 2 1048
## 3 252
## 4 357
## 5 1007
## 6 257
dim(divvy_trips_cleaned)
## [1] 3876042 15
Statistical summary on “ride_duration” (secs):
summary(divvy_trips_cleaned$ride_duration)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 412 712 1479 1289 9387024
setNames(aggregate(divvy_trips_cleaned$ride_duration ~ divvy_trips_cleaned$member_casual
, FUN = mean), c("member/casual", "ride_duration_mean"))
## member/casual ride_duration_mean
## 1 casual 3552.7502
## 2 member 850.0662
setNames(aggregate(divvy_trips_cleaned$ride_duration ~ divvy_trips_cleaned$member_casual
, FUN = median), c("member/casual", "ride_duration_median"))
## member/casual ride_duration_median
## 1 casual 1546
## 2 member 589
setNames(aggregate(divvy_trips_cleaned$ride_duration ~ divvy_trips_cleaned$member_casual
, FUN = max), c("member/casual", "ride_duration_max"))
## member/casual ride_duration_max
## 1 casual 9387024
## 2 member 9056634
setNames(aggregate(divvy_trips_cleaned$ride_duration ~ divvy_trips_cleaned$member_casual
, FUN = min), c("member/casual", "ride_duration_min"))
## member/casual ride_duration_min
## 1 casual 2
## 2 member 1
res1 <- divvy_trips_cleaned %>%
group_by(member_casual) %>%
summarize(number_of_rides = n()
, mean_ride_duration = mean(ride_duration))
res1 %>%
ggplot(aes(x = member_casual, y = number_of_rides)) +
geom_col(aes(fill = number_of_rides)) +
scale_fill_gradient2(low = "orange",
high = "darkblue",
midpoint = median(res1$number_of_rides)) +
theme(legend.position = "none") +
xlab("") +
ylab("Number of Rides") +
labs(title = "Number of Rides", subtitle = "Casual vs Member")
res1 %>%
ggplot(aes(x = member_casual, y = mean_ride_duration)) +
geom_col(aes(fill = mean_ride_duration)) +
scale_fill_gradient2(low = "darkblue",
high = "orange",
midpoint = median(res1$mean_ride_duration)) +
theme(legend.position = "none") +
xlab("") +
ylab("Average Ride Duration") +
labs(title = "Average Ride Duration", subtitle = "Casual vs Member")
setNames(aggregate(divvy_trips_cleaned$ride_duration
~ divvy_trips_cleaned$member_casual
+ divvy_trips_cleaned$day_of_week
, FUN = mean)
, c("member/casual", "day_of_week", "ride_duration_mean"))
## member/casual day_of_week ride_duration_mean
## 1 casual Sunday 3581.4054
## 2 member Sunday 919.9746
## 3 casual Monday 3372.2869
## 4 member Monday 842.5726
## 5 casual Tuesday 3596.3599
## 6 member Tuesday 826.1427
## 7 casual Wednesday 3718.6619
## 8 member Wednesday 823.9996
## 9 casual Thursday 3682.9847
## 10 member Thursday 823.9278
## 11 casual Friday 3773.8351
## 12 member Friday 824.5305
## 13 casual Saturday 3331.9138
## 14 member Saturday 968.9337
res2 <- divvy_trips_cleaned %>%
group_by(member_casual, day_of_week) %>%
summarize(number_of_rides = n()
, mean_ride_duration = mean(ride_duration)) %>%
arrange(member_casual, day_of_week)
## `summarise()` has grouped output by 'member_casual'. You can override using the `.groups` argument.
res2 %>%
ggplot(aes(x = day_of_week, y = mean_ride_duration, fill = member_casual)) +
geom_col(position = "dodge") +
xlab("") +
ylab("Average Ride Duration") +
labs(title = "Average Ride Duration Throughout the Week"
, subtitle = "Casual vs Member")
res2 %>%
ggplot(aes(x = day_of_week, y = number_of_rides, fill = member_casual)) +
geom_col(position = "dodge") +
xlab("") +
ylab("Number of Rides") +
labs(title = "Number of Rides Throughout the Week"
, subtitle = "Casual vs Member")
res2 %>%
filter(member_casual == "casual") %>%
ggplot(aes(x = day_of_week, y = number_of_rides, group = 1)) +
geom_point(color = "orange") +
geom_line(color = "orange") +
xlab("") +
ylab("Number of Rides") +
labs(title = "Number of Rides of Casual Riders")
res2 %>%
filter(member_casual == "member") %>%
ggplot(aes(x = day_of_week, y = number_of_rides, group = 1)) +
geom_point(color = "darkblue") +
geom_line(color = "darkblue") +
xlab("") +
ylab("Number of Rides") +
labs(title = "Number of Rides of Members")
res3 <- divvy_trips_cleaned %>%
group_by(member_casual, month) %>%
summarize(number_of_rides = n()
, mean_ride_duration = mean(ride_duration)) %>%
arrange(member_casual, month)
## `summarise()` has grouped output by 'member_casual'. You can override using the `.groups` argument.
res3 %>%
ggplot(aes(x = month, y = mean_ride_duration, color = member_casual)) +
geom_col() +
xlab("Months") +
ylab("Average Ride Duration") +
labs(title = "Average Ride Duration (Months)")
res3 %>%
filter(member_casual == "casual") %>%
ggplot(aes(x = month, y = mean_ride_duration)) +
geom_col(fill = "orange") +
xlab("Months") +
ylab("Average Ride Duration") +
labs(title = "Average Ride Duration of Casual Riders (Months)")
res3 %>%
filter(member_casual == "member") %>%
ggplot(aes(x = month, y = mean_ride_duration)) +
geom_col(fill = "darkblue") +
xlab("Months") +
ylab("Average Ride Duration") +
labs(title = "Average Ride Duration of Members (Months)")
write.csv(divvy_trips_cleaned, 'Dataset/divvy_trips_cleaned.csv')
write.csv(res1, 'Dataset/res1.csv')
write.csv(res2, 'Dataset/res2.csv')
write.csv(res3, 'Dataset/res3.csv')