# Loading packages
library(pacman)
p_load(
dplyr,
ggplot2,
magrittr,
tidyr,
stringr
)
Showing loaded packages and version
(.packages()) # To check packages
## [1] "stringr" "tidyr" "magrittr" "ggplot2" "dplyr" "pacman"
## [7] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [13] "base"
R.version$version.string
## [1] "R version 4.2.0 (2022-04-22 ucrt)"
# Importing the data
bdf1 = read.csv("pre_2_10_22_data_of_B.csv")
bdf1a = bdf1 %>% # Preserving original data frame
separate(time_out, c("hour_out","minute_out",sep = ":")) %>%
separate(time_in, c("hour_in","minute_in",sep = ":")) %>%
mutate(ind = as.factor(ind),
ind_0 = if_else(ind==0,T,F),
ind_1 = if_else(ind==1,T,F),
ind_2 = if_else(ind==2,T,F),
"Δ" = "--") %>%
rename(Urine = urine, Stool = shit, AM_PM = am_or_pm,
DATE = date, HOUR_IN = hour_in, MIN_IN = minute_in,
MIN_OUT = minute_out, HOUR_OUT = hour_out) %>% select(-c(":","diff"))
# Rearranging column ordering
bdf1a <- bdf1a[, c(1,2,3,4,5,6,14,8,7,11,12,13)]
# Importing the data
df2 = read.csv("2.10_thru_3.24.csv")
# Showing preview of data
# skimr::skim(df2)
bdf2a = df2 %>% # To preserve original data frame
filter(Which.Dog.!="Siobhan") %>% # only looking at one dog
separate(Timestamp,
c("Date", "Time"), # Separating time stamp at column spacing
sep = " ") %>%
rename("Stool" = "Shit") %>% # Renaming for civility's sake
mutate(both = if_else(`Which.Dog.`=="Both", TRUE, FALSE),
id = row_number()) %>%
select(-c("Time","Which.Dog.")) # No longer needing these columns
# Indicator value for urination and stool during outing
bdf2a$Urine <- substr(bdf2a$Urine,1,1)
bdf2a$Stool <- substr(bdf2a$Stool,1,1)
# Separating AM/PM from time stamp
am_pm_nchar = 2
bdf2a$AM_PM = substr(bdf2a$Time.In,
nchar(bdf2a$Time.In) - am_pm_nchar + 1,
nchar(bdf2a$Time.In))
bdf2a %<>% rename(ind = Indicator.Value, DATE = Date)
bdf2a$ind <- substr(bdf2a$ind,1,1)
bdf2a %<>% mutate(ind = as.factor(ind),
ind_0 = if_else(ind==0,T,F),
ind_1 = if_else(ind==1,T,F),
ind_2 = if_else(ind==2,T,F),
"Δ" = "--", # Placeholder for variable
id = as.character(id)) %>%
separate(Time.Out, c("HOUR_OUT","MIN_OUT",":"), remove = T) %>%
separate(Time.In, c("HOUR_IN","MIN_IN",":"), remove = T) %>%
select(-c(":"))
# Reordering columns for merging files
bdf2b <- bdf2a[, c(1,3,4,12,6,7,16,5,8,13,14,15)]
sdf2a = df2 %>% # To preserve original data frame
filter(Which.Dog.!="Braonin") %>% # only looking at one dog
separate(Timestamp,
c("Date", "Time"), # Separating time stamp at column spacing
sep = " ") %>%
rename("Stool" = "Shit") %>% # Renaming for civility's sake
mutate(both = if_else(`Which.Dog.`=="Both", TRUE, FALSE),
id = row_number()) %>%
select(-c("Time","Which.Dog.")) # No longer needing these columns
# Indicator value for urination and stool during outing
sdf2a$Urine <- substr(sdf2a$Urine,1,1)
sdf2a$Stool <- substr(sdf2a$Stool,1,1)
# Separating AM/PM from time stamp
sdf2a$AM_PM = substr(sdf2a$Time.In,
nchar(sdf2a$Time.In) - am_pm_nchar + 1,
nchar(sdf2a$Time.In))
sdf2a %<>% rename(ind = Indicator.Value, DATE = Date)
sdf2a$ind <- substr(sdf2a$ind,1,1)
sdf2a %<>% mutate(ind = as.factor(ind),
ind_0 = if_else(ind==0,T,F),
ind_1 = if_else(ind==1,T,F),
ind_2 = if_else(ind==2,T,F),
"Δ" = "--", # Placeholder for variable
id = as.character(id)) %>%
separate(Time.Out, c("HOUR_OUT","MIN_OUT",":"), remove = T) %>%
separate(Time.In, c("HOUR_IN","MIN_IN",":"), remove = T) %>%
select(-c(":"))
# Reordering columns for merging files
sdf2b <- sdf2a[, c(1,3,4,12,6,7,16,5,8,13,14,15,9)]
Parts of this section will be unnecessary in future versions since the data-input tool will provide a uniform structure of observations.
Verifying matching formatting before merging sets
str(bdf1a); str(bdf2b)
## 'data.frame': 256 obs. of 12 variables:
## $ DATE : chr "12/7/2022" "12/7/2022" "12/8/2022" "12/8/2022" ...
## $ HOUR_OUT: chr "1" "3" "12" "9" ...
## $ MIN_OUT : chr "18" "00" "24" "18" ...
## $ AM_PM : chr "pm" "pm" "am" "am" ...
## $ HOUR_IN : chr "1" "3" "12" "9" ...
## $ MIN_IN : chr "30" "20" "32" "25" ...
## $ Δ : chr "--" "--" "--" "--" ...
## $ Urine : chr "0" "0" "0" "1" ...
## $ Stool : chr "1" "0" "1" "1" ...
## $ ind_0 : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ind_1 : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ind_2 : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## 'data.frame': 98 obs. of 12 variables:
## $ DATE : chr "2/10/2022" "2/11/2022" "2/11/2022" "2/11/2022" ...
## $ HOUR_OUT: chr "9" "9" "10" "1" ...
## $ MIN_OUT : chr "10" "34" "28" "02" ...
## $ AM_PM : chr "PM" "AM" "AM" "PM" ...
## $ HOUR_IN : chr "9" "9" "10" "1" ...
## $ MIN_IN : chr "22" "40" "35" "13" ...
## $ Δ : chr "--" "--" "--" "--" ...
## $ Urine : chr "1" "1" "0" "1" ...
## $ Stool : chr "0" "0" "1" "0" ...
## $ ind_0 : logi TRUE FALSE FALSE FALSE TRUE TRUE ...
## $ ind_1 : logi FALSE TRUE FALSE TRUE FALSE FALSE ...
## $ ind_2 : logi FALSE FALSE TRUE FALSE FALSE FALSE ...
Full joining into single file
df_merged = full_join(bdf1a,bdf2b)
## Joining, by = c("DATE", "HOUR_OUT", "MIN_OUT", "AM_PM", "HOUR_IN", "MIN_IN",
## "Δ", "Urine", "Stool", "ind_0", "ind_1", "ind_2")
For a total of 354 observations of data from outings for Bucky.
# Replacing non-numeric values with NA
df_merged$HOUR_OUT <- as.numeric(as.character(df_merged$HOUR_OUT))
df_merged$HOUR_IN <- as.numeric(as.character(df_merged$HOUR_IN))
df_merged$MIN_OUT <- as.numeric(as.character(df_merged$MIN_OUT))
df_merged$MIN_IN <- as.numeric(as.character(df_merged$MIN_IN))
df_merged$Urine <- as.numeric(as.character(df_merged$Urine))
df_merged$Stool <- as.numeric(as.character(df_merged$Stool))
df_merged$`Δ` <- as.numeric(as.character(df_merged$`Δ`))
df_merged %<>% mutate(Δ = NA,
AM_PM = toupper(AM_PM))
df_merged$AM_PM[df_merged$AM_PM != "AM" & df_merged$AM_PM != "PM"] <- NA_character_
# Converting variables to factor variables
df_merged$Urine <- as.factor(df_merged$Urine)
df_merged$Stool <- as.factor(df_merged$Stool)
df_merged$AM_PM <- as.factor(df_merged$AM_PM)
Calculating value of the Δ
variable (the elapsed time of
an outing), calculated as the difference between the
IN
/OUT
values of the
HOUR
/MIN
of an observation
# First converting time to 24-hour decimal format
df_merged2 = df_merged %>%
mutate(
MIN_IN = (MIN_IN/60),
MIN_OUT = (MIN_OUT/60),
HOUR_OUT = if_else(AM_PM=="PM", (HOUR_OUT+12),HOUR_OUT),
HOUR_IN = if_else(AM_PM=="PM", (HOUR_IN+12),HOUR_IN),
RETURN = (if_else(HOUR_OUT > HOUR_IN,round(((HOUR_IN+12)+MIN_IN),digits=2),round((HOUR_IN+MIN_IN),digits=2))),
EGRESS = round((HOUR_OUT+MIN_OUT),digits=2))
# Replacing non-numeric values with NA
df_merged2$RETURN <- round(as.numeric(as.character(df_merged2$RETURN)), digits = 2)
df_merged2$EGRESS <- round(as.numeric(as.character(df_merged2$EGRESS)), digits = 2)
# Calculating Δ
df_merged2 %<>% mutate(
Δ = abs(RETURN - EGRESS),
Δ_num = Δ,
Δ_min = (Δ*60))
# Rearranging order of variables
df_merged2 <- df_merged2[, c(1,14,13,7,16,8:12)]
Checking structure/formatting before proceeding
str(df_merged2)
## 'data.frame': 354 obs. of 10 variables:
## $ DATE : chr "12/7/2022" "12/7/2022" "12/8/2022" "12/8/2022" ...
## $ EGRESS: num 13.3 15 12.4 9.3 NA ...
## $ RETURN: num 13.5 15.33 12.53 9.42 NA ...
## $ Δ : num 0.2 0.33 0.13 0.12 NA ...
## $ Δ_min : num 12 19.8 7.8 7.2 NA ...
## $ Urine : Factor w/ 2 levels "0","1": 1 1 1 2 NA 1 1 2 1 1 ...
## $ Stool : Factor w/ 2 levels "0","1": 2 1 2 2 NA 2 2 1 1 1 ...
## $ ind_0 : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ind_1 : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ind_2 : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
The missing values from the formatted version of our data as shown
above are the result of the human error component from data
collection. As we can see in the data-overview below, the number of
missing values for RETURN
is higher than the number for
EGRESS
- this is because there were several times that I
forgot to provide a complete account of data for an outing sometime
after recording the time of departure.
skimr::skim(df_merged2)
Name | df_merged2 |
Number of rows | 354 |
Number of columns | 10 |
_______________________ | |
Column type frequency: | |
character | 1 |
factor | 2 |
logical | 3 |
numeric | 4 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
DATE | 0 | 1 | 8 | 10 | 0 | 86 | 0 |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
Urine | 24 | 0.93 | FALSE | 2 | 1: 285, 0: 45 |
Stool | 25 | 0.93 | FALSE | 2 | 0: 230, 1: 99 |
Variable type: logical
skim_variable | n_missing | complete_rate | mean | count |
---|---|---|---|---|
ind_0 | 0 | 1 | 0.25 | FAL: 267, TRU: 87 |
ind_1 | 0 | 1 | 0.44 | FAL: 197, TRU: 157 |
ind_2 | 0 | 1 | 0.18 | FAL: 291, TRU: 63 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
EGRESS | 11 | 0.97 | 15.40 | 5.52 | 1.00 | 10.80 | 14.93 | 20.26 | 24.87 | ▁▅▇▆▇ |
RETURN | 34 | 0.90 | 15.56 | 5.49 | 1.17 | 11.13 | 15.04 | 20.25 | 25.12 | ▁▅▇▆▆ |
Δ | 34 | 0.90 | 0.17 | 0.09 | 0.00 | 0.12 | 0.17 | 0.21 | 0.99 | ▇▅▁▁▁ |
Δ_min | 34 | 0.90 | 10.42 | 5.27 | 0.00 | 7.20 | 10.20 | 12.60 | 59.40 | ▇▅▁▁▁ |
No worries! The purpose of The Scoop is to understand that there is often a human error component to the data collection process. Rather than allow that error to skew our results or be forced to remove those observations from out calculations entirely (again, skewing out results), we can work wit the data that we do have to make predictions on the data that we don’t have.
Since we have over 300 observations without missing values, we can use some of that information to predict what the missing parts of the given observation would have likely been if I had remembered to complete the rest of the entry.
We can use Imputation of the Mean of the
Δ
values to complete missing observations of the
EGRESS
variable. Since we know the time of departure for
the outing and we know the average time of duration of an outing from
the rest of the data, we can impute the NA
values as shown
below:
# Preserving original data
df_merged3 <- df_merged2
# Replacing NA values for the Δ variables
df_merged3$Δ[is.na(df_merged3$Δ)] <- mean(df_merged3$Δ, na.rm = TRUE)
df_merged3$Δ_min[is.na(df_merged3$Δ_min)] <- mean(df_merged3$Δ_min, na.rm = TRUE)
Taking a look at a visual representation of the merged and cleaned data
freq_df = as.data.frame(plyr::count(df_merged3, "DATE"))
freq_plot = ggplot(
freq_df, aes(x=DATE, y=freq))+
geom_bar(stat = "identity", fill = "purple")+
theme(plot.title = element_text(hjust = 0.5, face="bold.italic"),
axis.title.x = element_text(size = 15),
axis.text.x = element_text(size = 6, angle = 90, vjust = .7),
axis.title.y = element_text(size = 15))+
ylab("Outings Per Day")+
xlab("Date")
freq_plot