# Seattle Pet Names

#Seattle Pet Names
#12/15/2020

#At the beginning of the project, usually you load the libraries you are planning to use

#install.packages("Package Name")
#library(Package Name)

#Where does our analysis Live?
#need to check working directory
# Print the current working directory; the relative "data/..." path used below
# is resolved against it.
getwd()
# Set working directory to your desired directory
# Example of how to change directory: setwd("C:/Users/pbutrina/Documents/")

# Load the pet license data
# Reading a locally stored .csv file. stringsAsFactors = FALSE is spelled out
# because it only became the read.csv() default in R 4.0.0; the string
# functions applied to the pet names later in this script (startsWith(),
# endsWith(), nchar()) do not accept factors.
seattle_pet_data <- read.csv(
  file = "data/Seattle_Pet_Licenses.csv",
  stringsAsFactors = FALSE
)

# Let's look at the data - per-column summary
summary(seattle_pet_data)

# Column types and the first few values of each column
str(seattle_pet_data)

# Number of rows and columns
dim(seattle_pet_data)

# First six rows
head(seattle_pet_data)

# Column names
names(seattle_pet_data)

# Let's find out how many species we have in our data by
# counting the rows in each category of the Species column

table(seattle_pet_data$Species)

#Today we will try to answer the following questions:
# 1. What is the frequency of pet registrations per year?
# 2. What is the most popular dog and cat name?
# 3. What zip codes have the biggest number of pets per capita?

# 1. What is the frequency of pet registrations per year? ----------------------

# Convert the issue date text (full month name, day, four-digit year) into
# Date values so that we can work with it
seattle_pet_data$License.Issue.Date <- as.Date(
  seattle_pet_data$License.Issue.Date,
  format = "%B %d %Y"
)

# Frequency of pet registrations per year - were there more pet registrations
# in 2020 compared with other years? A frequency bar chart can show this.
seattle_pet_data$year <- format(seattle_pet_data$License.Issue.Date, "%Y")
# Count licenses per year, then drop the first entry of the table (table()
# orders the year labels, so that is the earliest year present)
year_freq <- table(seattle_pet_data$year)[-1]

barplot(
  year_freq,
  main = "Number of Pet registrations in Seattle by year",
  xlab = "Year",
  ylab = "Number of Registrations",
  ylim = c(0, 25000)
)


# Registrations broken down by year (rows) and species (columns): shown once
# as a table and once in CSV form (write.csv() without a file argument writes
# to the console)
registrations_by_year_species <- table(
  seattle_pet_data$year,
  seattle_pet_data$Species
)
registrations_by_year_species
write.csv(registrations_by_year_species)

# 2. What is the most popular dog and cat name? ----------------------

# Pet Names
# Subset data table
# you can find your pet here!
# one way of subsetting: a logical row index inside [ ].
# %in% is used rather than == because == yields NA when the name is missing,
# and an NA row index inserts an all-NA row into the result; %in% never
# returns NA.
seattle_pet_data[seattle_pet_data$Animal.s.Name %in% "Brynza", ]

# the second way of subsetting: subset() evaluates the condition inside the
# data frame and already drops rows where the condition is NA
subset(seattle_pet_data, Animal.s.Name == "Brynza")

# multiple categories
seattle_pet_data[
  seattle_pet_data$Animal.s.Name %in% "Jack" &
    seattle_pet_data$ZIP.Code %in% "98109",
]

subset(
  seattle_pet_data,
  Animal.s.Name == "Jasper" & grepl("Retr", Primary.Breed) & ZIP.Code == "98105"
)

# the most popular dog name, cat name, species?

# let's separate Dogs and Cats (again NA-safe, so the row counts printed by
# dim() are not inflated by rows with a missing Species)
seattle_dogs <- seattle_pet_data[seattle_pet_data$Species %in% "Dog", ]
dim(seattle_dogs)

seattle_cats <- seattle_pet_data[seattle_pet_data$Species %in% "Cat", ]
dim(seattle_cats)

# Ten most frequent names per species.
# Blank / whitespace-only names are excluded first: read.csv() keeps an empty
# text field as "", which table() would otherwise count as if it were a name
# (NOTE(review): presumably some licenses have no name recorded - confirm).
# head() is used instead of [1:10] so that fewer than 10 distinct names does
# not pad the result with NA entries.

# dog names
dog_names <- seattle_dogs$Animal.s.Name
head(sort(table(dog_names[nzchar(trimws(dog_names))]), decreasing = TRUE), 10)

# cat names
cat_names <- seattle_cats$Animal.s.Name
head(sort(table(cat_names[nzchar(trimws(cat_names))]), decreasing = TRUE), 10)


# 3. What zip codes have the biggest number of pets per capita? -----------------

# zip code with the most pets per capita - need to join the population of each
# zip code with the pet license counts
# link to the original data https://www.ofm.wa.gov/washington-data-research/population-demographics/population-estimates/small-area-estimates-program
population <- read.csv(file = "data/OFM_Population_data.csv")
str(population)

# need to remove "," from the population numbers and transform them to the
# integer data type
population$Estimated.Total.Population.2020 <- as.integer(
  gsub(",", "", population$Estimated.Total.Population.2020)
)

# count pet licenses per zip code.
# The population table is keyed by a 5-digit ZIP Code Tabulation Area, so only
# the first five characters of the license ZIP code are kept; a longer value
# (e.g. ZIP+4) or one with stray spaces would otherwise never match in the
# merge below.
license_zip <- substr(trimws(as.character(seattle_pet_data$ZIP.Code)), 1, 5)
licenses_by_zipcode <- as.data.frame(table(license_zip))

# rename the two columns produced by as.data.frame(table(...))
colnames(licenses_by_zipcode)[1] <- "zipcode"
colnames(licenses_by_zipcode)[2] <- "number_of_pets"

# rename columns in population table
names(population)[names(population) == "ZIP.Code.Tabulation.Area..5.Digit."] <- "zipcode"
names(population)[names(population) == "Estimated.Total.Population.2020"] <- "population"

# checking data types
str(population)
str(licenses_by_zipcode)

# the merge key must have the same type on both sides: convert the factor
# zipcode column of the license table, and the population zipcode, to character
licenses_by_zipcode$zipcode <- as.character(licenses_by_zipcode$zipcode)
population$zipcode <- as.character(population$zipcode)

# merge population and pet licenses (keeps only zip codes present in both)
pets_and_population <- merge(licenses_by_zipcode, population, by = "zipcode")

str(pets_and_population)

# licenses per resident. A zip code with a population of 0 has no meaningful
# ratio, so it gets NA instead of Inf/NaN (which would otherwise top the
# ranking below).
pets_and_population$pets_per_capita <- ifelse(
  pets_and_population$population > 0,
  pets_and_population$number_of_pets / pets_and_population$population,
  NA_real_
)
# print small ratios in fixed notation rather than scientific notation
options(scipen = 999)

# share of pets of total population in the zip code, in percent
pets_and_population$pets_share <- pets_and_population$pets_per_capita * 100

# answer to question 3: the ten zip codes with the most pets per capita,
# largest first (zip codes with an NA ratio are sorted to the end)
per_capita_rank <- order(pets_and_population$pets_per_capita, decreasing = TRUE)
head(pets_and_population[per_capita_rank, ], 10)


# Working with strings
# Checking pandemic-related names

# as.character() so that the string functions below also work if the names
# were read in as a factor
pet_names <- as.character(seattle_pet_data$Animal.s.Name)

# which() drops missing names (startsWith()/endsWith() return NA for them)
unique(pet_names[which(startsWith(pet_names, "Cov"))])

unique(pet_names[which(endsWith(pet_names, "rona"))])

# the longest name
# Name lengths are computed once and reused. allowNA = TRUE makes nchar()
# return NA instead of stopping on a name that is not a valid string in the
# current encoding; na.rm = TRUE keeps such names, and missing names, from
# turning the maximum into NA.
name_lengths <- nchar(pet_names, allowNA = TRUE)
longest_char <- max(name_lengths, na.rm = TRUE)
pet_names[which(name_lengths == longest_char)]

# let's look at the top-10 longest pet names in our dataset
# sort() drops NA lengths; head() is used instead of [1:10] so that fewer than
# 10 names does not pad the result with NA
longest_name_length <- head(sort(name_lengths, decreasing = TRUE), 10)

# every name whose length is one of those top lengths (names tied on length
# are all included), shown longest first
top_rows <- which(name_lengths %in% longest_name_length)
pet_names[top_rows[order(name_lengths[top_rows], decreasing = TRUE)]]