# Loading packages
library(pacman)
p_load(
   dplyr,
   ggplot2,
   magrittr,
   tidyr,
   stringr
)

Showing loaded packages and version

(.packages()) # To check packages
##  [1] "stringr"   "tidyr"     "magrittr"  "ggplot2"   "dplyr"     "pacman"   
##  [7] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [13] "base"
R.version$version.string
## [1] "R version 4.2.0 (2022-04-22 ucrt)"

Dog No. 1 - The Scoop on Bucky

Part 1 of Data

# Importing the data
bdf1 = read.csv("pre_2_10_22_data_of_B.csv")
bdf1a = bdf1 %>% # Preserving original data frame
       separate(time_out, c("hour_out","minute_out",sep = ":")) %>%
       separate(time_in, c("hour_in","minute_in",sep = ":")) %>%
       mutate(ind = as.factor(ind),
              ind_0 = if_else(ind==0,T,F),
              ind_1 = if_else(ind==1,T,F),
              ind_2 = if_else(ind==2,T,F),
              "Δ" = "--") %>%
       rename(Urine = urine, Stool = shit, AM_PM = am_or_pm,
              DATE = date, HOUR_IN = hour_in, MIN_IN = minute_in,
              MIN_OUT = minute_out, HOUR_OUT = hour_out) %>% select(-c(":","diff"))

# Rearranging column ordering
bdf1a <- bdf1a[, c(1,2,3,4,5,6,14,8,7,11,12,13)]

Part 2 of Data

# Importing the data
df2 = read.csv("2.10_thru_3.24.csv")

# Showing preview of data
# skimr::skim(df2)
bdf2a = df2 %>% # To preserve original data frame
   filter(Which.Dog.!="Siobhan") %>% # only looking at one dog
   separate(Timestamp,
            c("Date", "Time"), # Separating time stamp at column spacing
            sep = " ") %>% 
   rename("Stool" = "Shit") %>% # Renaming for civility's sake
   mutate(both = if_else(`Which.Dog.`=="Both", TRUE, FALSE),
          id = row_number()) %>%
   select(-c("Time","Which.Dog.")) # No longer needing these columns
   
# Indicator value for urination and stool during outing
bdf2a$Urine <- substr(bdf2a$Urine,1,1)
bdf2a$Stool <- substr(bdf2a$Stool,1,1)

# Separating AM/PM from time stamp
am_pm_nchar = 2                               
bdf2a$AM_PM = substr(bdf2a$Time.In,
       nchar(bdf2a$Time.In) - am_pm_nchar + 1,
       nchar(bdf2a$Time.In))

bdf2a %<>% rename(ind = Indicator.Value, DATE = Date)
bdf2a$ind <- substr(bdf2a$ind,1,1)
bdf2a %<>% mutate(ind = as.factor(ind),
                  ind_0 = if_else(ind==0,T,F),
                  ind_1 = if_else(ind==1,T,F),
                  ind_2 = if_else(ind==2,T,F),
                  "Δ" = "--", # Placeholder for variable
                  id = as.character(id)) %>%
           separate(Time.Out, c("HOUR_OUT","MIN_OUT",":"), remove = T) %>%
           separate(Time.In, c("HOUR_IN","MIN_IN",":"), remove = T) %>%
           select(-c(":"))

# Reordering columns for merging files
bdf2b <- bdf2a[, c(1,3,4,12,6,7,16,5,8,13,14,15)]
sdf2a = df2 %>% # To preserve original data frame
   filter(Which.Dog.!="Braonin") %>% # only looking at one dog
   separate(Timestamp,
            c("Date", "Time"), # Separating time stamp at column spacing
            sep = " ") %>% 
   rename("Stool" = "Shit") %>% # Renaming for civility's sake
   mutate(both = if_else(`Which.Dog.`=="Both", TRUE, FALSE),
          id = row_number()) %>%
   select(-c("Time","Which.Dog.")) # No longer needing these columns
   
# Indicator value for urination and stool during outing
sdf2a$Urine <- substr(sdf2a$Urine,1,1)
sdf2a$Stool <- substr(sdf2a$Stool,1,1)

# Separating AM/PM from time stamp
sdf2a$AM_PM = substr(sdf2a$Time.In,
       nchar(sdf2a$Time.In) - am_pm_nchar + 1,
       nchar(sdf2a$Time.In))

sdf2a %<>% rename(ind = Indicator.Value, DATE = Date)
sdf2a$ind <- substr(sdf2a$ind,1,1)
sdf2a %<>% mutate(ind = as.factor(ind),
                  ind_0 = if_else(ind==0,T,F),
                  ind_1 = if_else(ind==1,T,F),
                  ind_2 = if_else(ind==2,T,F),
                  "Δ" = "--", # Placeholder for variable
                  id = as.character(id)) %>%
           separate(Time.Out, c("HOUR_OUT","MIN_OUT",":"), remove = T) %>%
           separate(Time.In, c("HOUR_IN","MIN_IN",":"), remove = T) %>%
           select(-c(":"))

# Reordering columns for merging files
sdf2b <- sdf2a[, c(1,3,4,12,6,7,16,5,8,13,14,15,9)]

Merging Data

Parts of this section will be unnecessary in future versions since the data-input tool will provide a uniform structure of observations.

Verifying matching formatting before merging sets

str(bdf1a); str(bdf2b)
## 'data.frame':    256 obs. of  12 variables:
##  $ DATE    : chr  "12/7/2022" "12/7/2022" "12/8/2022" "12/8/2022" ...
##  $ HOUR_OUT: chr  "1" "3" "12" "9" ...
##  $ MIN_OUT : chr  "18" "00" "24" "18" ...
##  $ AM_PM   : chr  "pm" "pm" "am" "am" ...
##  $ HOUR_IN : chr  "1" "3" "12" "9" ...
##  $ MIN_IN  : chr  "30" "20" "32" "25" ...
##  $ Δ       : chr  "--" "--" "--" "--" ...
##  $ Urine   : chr  "0" "0" "0" "1" ...
##  $ Stool   : chr  "1" "0" "1" "1" ...
##  $ ind_0   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ ind_1   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ ind_2   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
## 'data.frame':    98 obs. of  12 variables:
##  $ DATE    : chr  "2/10/2022" "2/11/2022" "2/11/2022" "2/11/2022" ...
##  $ HOUR_OUT: chr  "9" "9" "10" "1" ...
##  $ MIN_OUT : chr  "10" "34" "28" "02" ...
##  $ AM_PM   : chr  "PM" "AM" "AM" "PM" ...
##  $ HOUR_IN : chr  "9" "9" "10" "1" ...
##  $ MIN_IN  : chr  "22" "40" "35" "13" ...
##  $ Δ       : chr  "--" "--" "--" "--" ...
##  $ Urine   : chr  "1" "1" "0" "1" ...
##  $ Stool   : chr  "0" "0" "1" "0" ...
##  $ ind_0   : logi  TRUE FALSE FALSE FALSE TRUE TRUE ...
##  $ ind_1   : logi  FALSE TRUE FALSE TRUE FALSE FALSE ...
##  $ ind_2   : logi  FALSE FALSE TRUE FALSE FALSE FALSE ...

Full joining into single file

df_merged = full_join(bdf1a,bdf2b)
## Joining, by = c("DATE", "HOUR_OUT", "MIN_OUT", "AM_PM", "HOUR_IN", "MIN_IN",
## "Δ", "Urine", "Stool", "ind_0", "ind_1", "ind_2")

For a total of 354 observations of data from outings for Bucky.

Formatting Variables

# Replacing non-numeric values with NA
df_merged$HOUR_OUT <- as.numeric(as.character(df_merged$HOUR_OUT))
df_merged$HOUR_IN  <- as.numeric(as.character(df_merged$HOUR_IN))
df_merged$MIN_OUT  <- as.numeric(as.character(df_merged$MIN_OUT))
df_merged$MIN_IN   <- as.numeric(as.character(df_merged$MIN_IN))
df_merged$Urine    <- as.numeric(as.character(df_merged$Urine))
df_merged$Stool    <- as.numeric(as.character(df_merged$Stool))
df_merged$`Δ`      <- as.numeric(as.character(df_merged$`Δ`))

df_merged %<>% mutate(Δ = NA,
                      AM_PM = toupper(AM_PM)) 
df_merged$AM_PM[df_merged$AM_PM != "AM" & df_merged$AM_PM != "PM"] <- NA_character_

# Converting variables to factor variables
df_merged$Urine <- as.factor(df_merged$Urine)
df_merged$Stool <- as.factor(df_merged$Stool)
df_merged$AM_PM <- as.factor(df_merged$AM_PM)

Calculating value of the Δ variable (the elapsed time of an outing), calculated as the difference between the IN/OUT values of the HOUR/MIN of an observation

# First converting time to 24-hour decimal format
df_merged2 = df_merged %>%
   mutate(
      MIN_IN   = (MIN_IN/60),
      MIN_OUT  = (MIN_OUT/60),
      HOUR_OUT = if_else(AM_PM=="PM", (HOUR_OUT+12),HOUR_OUT),
      HOUR_IN  = if_else(AM_PM=="PM", (HOUR_IN+12),HOUR_IN),
      RETURN   = (if_else(HOUR_OUT > HOUR_IN,round(((HOUR_IN+12)+MIN_IN),digits=2),round((HOUR_IN+MIN_IN),digits=2))),
      EGRESS   = round((HOUR_OUT+MIN_OUT),digits=2))

# Replacing non-numeric values with NA
df_merged2$RETURN <- round(as.numeric(as.character(df_merged2$RETURN)), digits = 2)
df_merged2$EGRESS <- round(as.numeric(as.character(df_merged2$EGRESS)), digits = 2)

# Calculating Δ
df_merged2 %<>% mutate(
   Δ      = abs(RETURN - EGRESS),
   Δ_num  = Δ,
   Δ_min  = (Δ*60))

# Rearranging order of variables
df_merged2 <- df_merged2[, c(1,14,13,7,16,8:12)]

Checking structure/formatting before proceeding

str(df_merged2)
## 'data.frame':    354 obs. of  10 variables:
##  $ DATE  : chr  "12/7/2022" "12/7/2022" "12/8/2022" "12/8/2022" ...
##  $ EGRESS: num  13.3 15 12.4 9.3 NA ...
##  $ RETURN: num  13.5 15.33 12.53 9.42 NA ...
##  $ Δ     : num  0.2 0.33 0.13 0.12 NA ...
##  $ Δ_min : num  12 19.8 7.8 7.2 NA ...
##  $ Urine : Factor w/ 2 levels "0","1": 1 1 1 2 NA 1 1 2 1 1 ...
##  $ Stool : Factor w/ 2 levels "0","1": 2 1 2 2 NA 2 2 1 1 1 ...
##  $ ind_0 : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ ind_1 : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ ind_2 : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...

Dealing with Missing Values

The missing values from the formatted version of our data as shown above are the result of the human error component from data collection. As we can see in the data-overview below, the number of missing values for RETURN is higher than the number for EGRESS - this is because there were several times that I forgot to provide a complete account of data for an outing sometime after recording the time of departure.

skimr::skim(df_merged2)
Data summary
Name df_merged2
Number of rows 354
Number of columns 10
_______________________
Column type frequency:
character 1
factor 2
logical 3
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
DATE 0 1 8 10 0 86 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Urine 24 0.93 FALSE 2 1: 285, 0: 45
Stool 25 0.93 FALSE 2 0: 230, 1: 99

Variable type: logical

skim_variable n_missing complete_rate mean count
ind_0 0 1 0.25 FAL: 267, TRU: 87
ind_1 0 1 0.44 FAL: 197, TRU: 157
ind_2 0 1 0.18 FAL: 291, TRU: 63

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
EGRESS 11 0.97 15.40 5.52 1.00 10.80 14.93 20.26 24.87 ▁▅▇▆▇
RETURN 34 0.90 15.56 5.49 1.17 11.13 15.04 20.25 25.12 ▁▅▇▆▆
Δ 34 0.90 0.17 0.09 0.00 0.12 0.17 0.21 0.99 ▇▅▁▁▁
Δ_min 34 0.90 10.42 5.27 0.00 7.20 10.20 12.60 59.40 ▇▅▁▁▁

No worries! The purpose of The Scoop is to understand that there is often a human error component to the data collection process. Rather than allow that error to skew our results or be forced to remove those observations from out calculations entirely (again, skewing out results), we can work wit the data that we do have to make predictions on the data that we don’t have.

Since we have over 300 observations without missing values, we can use some of that information to predict what the missing parts of the given observation would have likely been if I had remembered to complete the rest of the entry.

We can use Imputation of the Mean of the Δ values to complete missing observations of the EGRESS variable. Since we know the time of departure for the outing and we know the average time of duration of an outing from the rest of the data, we can impute the NA values as shown below:

# Preserving original data
df_merged3 <- df_merged2

# Replacing NA values for the Δ variables
df_merged3$Δ[is.na(df_merged3$Δ)] <- mean(df_merged3$Δ, na.rm = TRUE)
df_merged3$Δ_min[is.na(df_merged3$Δ_min)] <- mean(df_merged3$Δ_min, na.rm = TRUE)

Taking a look at a visual representation of the merged and cleaned data

freq_df = as.data.frame(plyr::count(df_merged3, "DATE"))

freq_plot = ggplot(
  freq_df, aes(x=DATE, y=freq))+
  geom_bar(stat = "identity", fill = "purple")+
  theme(plot.title = element_text(hjust = 0.5, face="bold.italic"), 
        axis.title.x = element_text(size = 15),
        axis.text.x = element_text(size = 6, angle = 90, vjust = .7),
        axis.title.y = element_text(size = 15))+
  ylab("Outings Per Day")+
  xlab("Date")

freq_plot