Now that the data is formatted so that each row is the single latest inspection record for a restaurant, let’s see what we can find out!
#05- play-with-data
#### Playing with the Data ####
# Take a quick look at the filtered inspection data before summarising it.
glimpse(rest_df_filtered)

# How do the inspection scores vary by city?
# Per-city summary: average score, maximum score, and number of rows
# (one row per restaurant).
# NOTE(review): mean()/max() are called without na.rm, so a missing score
# would make that city's summary NA -- confirm SCORE_INSPECTION has no NAs.
score_by_city <- rest_df_filtered %>%
  group_by(CITY) %>%
  summarise(
    avg_score = mean(SCORE_INSPECTION),
    max_score = max(SCORE_INSPECTION),
    n_rests = n()
  )
score_by_city
# Some cities have very few restaurants; keep only the cities with more
# than 50 restaurants.
score_city_sufficient <- filter(score_by_city, n_rests > 50)
# dplyr output plugs straight into ggplot: bar chart of the average score
# per city, with the city labels rotated 90 degrees on the x axis.
ggplot(score_city_sufficient, aes(x = CITY, y = avg_score)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(
    title = "Average Restaurant Inspection Score by City",
    x = "City",
    y = "Average Restaurant Score"
  )
# Does the average score simply track the number of restaurants in a city?
# Scatter plot of average score against restaurant count.
ggplot(score_city_sufficient, aes(x = n_rests, y = avg_score)) +
  geom_point() +
  labs(
    title = "Average Restaurant Inspection Score by Number of Restaurants",
    x = "Number of Restaurants",
    y = "Average Restaurant Score"
  )
# Seattle throws the previous plot off, so additionally drop cities with
# 100 or more restaurants and redraw the same scatter plot.
score_city_sufficient_not_too_many <- filter(
  score_city_sufficient,
  n_rests < 100
)
ggplot(score_city_sufficient_not_too_many, aes(x = n_rests, y = avg_score)) +
  geom_point() +
  labs(
    title = "Average Restaurant Inspection Score by Number of Restaurants",
    x = "Number of Restaurants",
    y = "Average Restaurant Score"
  )
# Which restaurant has the worst score in each city?
# Within each city, keep the row(s) with the largest SCORE_INSPECTION; the
# result is still grouped by CITY.
# NOTE(review): this treats the highest score as the worst -- confirm that
# a higher SCORE_INSPECTION means a worse inspection result.
worst_by_city <- rest_df_filtered %>%
  group_by(CITY) %>%
  top_n(n = 1, wt = SCORE_INSPECTION)
worst_by_city
# Look up the record(s) for a restaurant I liked: rows whose NAME matches
# the pattern "BIZZARRO".
bizzaro_df_filtered <- dplyr::filter(
  rest_df_filtered,
  str_detect(NAME, "BIZZARRO")
)
bizzaro_df_filtered
Exercises