# Loading packages
library(pacman)
p_load(
   dplyr,
   ggplot2,
   magrittr,
   tidyr,
   stringr
)

Showing loaded packages and version

(.packages()) # To check packages

##  [1] "stringr"   "tidyr"     "magrittr"  "ggplot2"   "dplyr"     "pacman"   
##  [7] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [13] "base"

R.version$version.string

## [1] "R version 4.2.0 (2022-04-22 ucrt)"

Dog No. 1 - The Scoop on Bucky

Part 1 of Data

# Importing the data
bdf1 = read.csv("pre_2_10_22_data_of_B.csv")

bdf1a = bdf1 %>% # Preserving original data frame
       separate(time_out, c("hour_out","minute_out",sep = ":")) %>%
       separate(time_in, c("hour_in","minute_in",sep = ":")) %>%
       mutate(ind = as.factor(ind),
              ind_0 = if_else(ind==0,T,F),
              ind_1 = if_else(ind==1,T,F),
              ind_2 = if_else(ind==2,T,F),
              "Δ" = "--") %>%
       rename(Urine = urine, Stool = shit, AM_PM = am_or_pm,
              DATE = date, HOUR_IN = hour_in, MIN_IN = minute_in,
              MIN_OUT = minute_out, HOUR_OUT = hour_out) %>% select(-c(":","diff"))

# Rearranging column ordering
bdf1a <- bdf1a[, c(1,2,3,4,5,6,14,8,7,11,12,13)]

Part 2 of Data

# Importing the data
df2 = read.csv("2.10_thru_3.24.csv")

# Showing preview of data
# skimr::skim(df2)

bdf2a = df2 %>% # To preserve original data frame
   filter(Which.Dog.!="Siobhan") %>% # only looking at one dog
   separate(Timestamp,
            c("Date", "Time"), # Separating time stamp at column spacing
            sep = " ") %>% 
   rename("Stool" = "Shit") %>% # Renaming for civility's sake
   mutate(both = if_else(`Which.Dog.`=="Both", TRUE, FALSE),
          id = row_number()) %>%
   select(-c("Time","Which.Dog.")) # No longer needing these columns
   
# Indicator value for urination and stool during outing
bdf2a$Urine <- substr(bdf2a$Urine,1,1)
bdf2a$Stool <- substr(bdf2a$Stool,1,1)

# Separating AM/PM from time stamp
am_pm_nchar = 2                               
bdf2a$AM_PM = substr(bdf2a$Time.In,
       nchar(bdf2a$Time.In) - am_pm_nchar + 1,
       nchar(bdf2a$Time.In))

bdf2a %<>% rename(ind = Indicator.Value, DATE = Date)
bdf2a$ind <- substr(bdf2a$ind,1,1)
bdf2a %<>% mutate(ind = as.factor(ind),
                  ind_0 = if_else(ind==0,T,F),
                  ind_1 = if_else(ind==1,T,F),
                  ind_2 = if_else(ind==2,T,F),
                  "Δ" = "--", # Placeholder for variable
                  id = as.character(id)) %>%
           separate(Time.Out, c("HOUR_OUT","MIN_OUT",":"), remove = T) %>%
           separate(Time.In, c("HOUR_IN","MIN_IN",":"), remove = T) %>%
           select(-c(":"))

# Reordering columns for merging files
bdf2b <- bdf2a[, c(1,3,4,12,6,7,16,5,8,13,14,15)]

sdf2a = df2 %>% # To preserve original data frame
   filter(Which.Dog.!="Braonin") %>% # only looking at one dog
   separate(Timestamp,
            c("Date", "Time"), # Separating time stamp at column spacing
            sep = " ") %>% 
   rename("Stool" = "Shit") %>% # Renaming for civility's sake
   mutate(both = if_else(`Which.Dog.`=="Both", TRUE, FALSE),
          id = row_number()) %>%
   select(-c("Time","Which.Dog.")) # No longer needing these columns
   
# Indicator value for urination and stool during outing
sdf2a$Urine <- substr(sdf2a$Urine,1,1)
sdf2a$Stool <- substr(sdf2a$Stool,1,1)

# Separating AM/PM from time stamp
sdf2a$AM_PM = substr(sdf2a$Time.In,
       nchar(sdf2a$Time.In) - am_pm_nchar + 1,
       nchar(sdf2a$Time.In))

sdf2a %<>% rename(ind = Indicator.Value, DATE = Date)
sdf2a$ind <- substr(sdf2a$ind,1,1)
sdf2a %<>% mutate(ind = as.factor(ind),
                  ind_0 = if_else(ind==0,T,F),
                  ind_1 = if_else(ind==1,T,F),
                  ind_2 = if_else(ind==2,T,F),
                  "Δ" = "--", # Placeholder for variable
                  id = as.character(id)) %>%
           separate(Time.Out, c("HOUR_OUT","MIN_OUT",":"), remove = T) %>%
           separate(Time.In, c("HOUR_IN","MIN_IN",":"), remove = T) %>%
           select(-c(":"))

# Reordering columns for merging files
sdf2b <- sdf2a[, c(1,3,4,12,6,7,16,5,8,13,14,15,9)]

Merging Data

Parts of this section will be unnecessary in future versions since the data-input tool will provide a uniform structure of observations.

Verifying matching formatting before merging sets

str(bdf1a); str(bdf2b)

## 'data.frame':    256 obs. of  12 variables:
##  $ DATE    : chr  "12/7/2022" "12/7/2022" "12/8/2022" "12/8/2022" ...
##  $ HOUR_OUT: chr  "1" "3" "12" "9" ...
##  $ MIN_OUT : chr  "18" "00" "24" "18" ...
##  $ AM_PM   : chr  "pm" "pm" "am" "am" ...
##  $ HOUR_IN : chr  "1" "3" "12" "9" ...
##  $ MIN_IN  : chr  "30" "20" "32" "25" ...
##  $ Δ       : chr  "--" "--" "--" "--" ...
##  $ Urine   : chr  "0" "0" "0" "1" ...
##  $ Stool   : chr  "1" "0" "1" "1" ...
##  $ ind_0   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ ind_1   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ ind_2   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...

## 'data.frame':    98 obs. of  12 variables:
##  $ DATE    : chr  "2/10/2022" "2/11/2022" "2/11/2022" "2/11/2022" ...
##  $ HOUR_OUT: chr  "9" "9" "10" "1" ...
##  $ MIN_OUT : chr  "10" "34" "28" "02" ...
##  $ AM_PM   : chr  "PM" "AM" "AM" "PM" ...
##  $ HOUR_IN : chr  "9" "9" "10" "1" ...
##  $ MIN_IN  : chr  "22" "40" "35" "13" ...
##  $ Δ       : chr  "--" "--" "--" "--" ...
##  $ Urine   : chr  "1" "1" "0" "1" ...
##  $ Stool   : chr  "0" "0" "1" "0" ...
##  $ ind_0   : logi  TRUE FALSE FALSE FALSE TRUE TRUE ...
##  $ ind_1   : logi  FALSE TRUE FALSE TRUE FALSE FALSE ...
##  $ ind_2   : logi  FALSE FALSE TRUE FALSE FALSE FALSE ...

Full joining into single file

df_merged = full_join(bdf1a,bdf2b)

## Joining, by = c("DATE", "HOUR_OUT", "MIN_OUT", "AM_PM", "HOUR_IN", "MIN_IN",
## "Δ", "Urine", "Stool", "ind_0", "ind_1", "ind_2")

For a total of 354 observations of data from outings for Bucky.

Formatting Variables

# Replacing non-numeric values with NA
df_merged$HOUR_OUT <- as.numeric(as.character(df_merged$HOUR_OUT))
df_merged$HOUR_IN  <- as.numeric(as.character(df_merged$HOUR_IN))
df_merged$MIN_OUT  <- as.numeric(as.character(df_merged$MIN_OUT))
df_merged$MIN_IN   <- as.numeric(as.character(df_merged$MIN_IN))
df_merged$Urine    <- as.numeric(as.character(df_merged$Urine))
df_merged$Stool    <- as.numeric(as.character(df_merged$Stool))
df_merged$`Δ`      <- as.numeric(as.character(df_merged$`Δ`))

df_merged %<>% mutate(Δ = NA,
                      AM_PM = toupper(AM_PM)) 
df_merged$AM_PM[df_merged$AM_PM != "AM" & df_merged$AM_PM != "PM"] <- NA_character_

# Converting variables to factor variables
df_merged$Urine <- as.factor(df_merged$Urine)
df_merged$Stool <- as.factor(df_merged$Stool)
df_merged$AM_PM <- as.factor(df_merged$AM_PM)

Calculating value of the Δ variable (the elapsed time of an outing), calculated as the difference between the IN/OUT values of the HOUR/MIN of an observation

# First converting time to 24-hour decimal format
df_merged2 = df_merged %>%
   mutate(
      MIN_IN   = (MIN_IN/60),
      MIN_OUT  = (MIN_OUT/60),
      HOUR_OUT = if_else(AM_PM=="PM", (HOUR_OUT+12),HOUR_OUT),
      HOUR_IN  = if_else(AM_PM=="PM", (HOUR_IN+12),HOUR_IN),
      RETURN   = (if_else(HOUR_OUT > HOUR_IN,round(((HOUR_IN+12)+MIN_IN),digits=2),round((HOUR_IN+MIN_IN),digits=2))),
      EGRESS   = round((HOUR_OUT+MIN_OUT),digits=2))

# Replacing non-numeric values with NA
df_merged2$RETURN <- round(as.numeric(as.character(df_merged2$RETURN)), digits = 2)
df_merged2$EGRESS <- round(as.numeric(as.character(df_merged2$EGRESS)), digits = 2)

# Calculating Δ
df_merged2 %<>% mutate(
   Δ      = abs(RETURN - EGRESS),
   Δ_num  = Δ,
   Δ_min  = (Δ*60))

# Rearranging order of variables
df_merged2 <- df_merged2[, c(1,14,13,7,16,8:12)]

Checking structure/formatting before proceeding

str(df_merged2)

## 'data.frame':    354 obs. of  10 variables:
##  $ DATE  : chr  "12/7/2022" "12/7/2022" "12/8/2022" "12/8/2022" ...
##  $ EGRESS: num  13.3 15 12.4 9.3 NA ...
##  $ RETURN: num  13.5 15.33 12.53 9.42 NA ...
##  $ Δ     : num  0.2 0.33 0.13 0.12 NA ...
##  $ Δ_min : num  12 19.8 7.8 7.2 NA ...
##  $ Urine : Factor w/ 2 levels "0","1": 1 1 1 2 NA 1 1 2 1 1 ...
##  $ Stool : Factor w/ 2 levels "0","1": 2 1 2 2 NA 2 2 1 1 1 ...
##  $ ind_0 : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ ind_1 : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ ind_2 : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...

Dealing with Missing Values

The missing values from the formatted version of our data as shown above are the result of the human error component from data collection. As we can see in the data-overview below, the number of missing values for RETURN is higher than the number for EGRESS - this is because there were several times that I forgot to provide a complete account of data for an outing sometime after recording the time of departure.

skimr::skim(df_merged2)

Data summary
Name	df_merged2
Number of rows	354
Number of columns	10
_______________________
Column type frequency:
character	1
factor	2
logical	3
numeric	4
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
DATE	0	1	8	10	0	86	0

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
Urine	24	0.93	FALSE	2	1: 285, 0: 45
Stool	25	0.93	FALSE	2	0: 230, 1: 99

Variable type: logical

skim_variable	complete_rate	mean	count
ind_0	1	0.25	FAL: 267, TRU: 87
ind_1	1	0.44	FAL: 197, TRU: 157
ind_2	1	0.18	FAL: 291, TRU: 63

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
EGRESS	11	0.97	15.40	5.52	1.00	10.80	14.93	20.26	24.87	▁▅▇▆▇
RETURN	34	0.90	15.56	5.49	1.17	11.13	15.04	20.25	25.12	▁▅▇▆▆
Δ	34	0.90	0.17	0.09	0.00	0.12	0.17	0.21	0.99	▇▅▁▁▁
Δ_min	34	0.90	10.42	5.27	0.00	7.20	10.20	12.60	59.40	▇▅▁▁▁

No worries! The purpose of The Scoop is to understand that there is often a human error component to the data collection process. Rather than allow that error to skew our results or be forced to remove those observations from out calculations entirely (again, skewing out results), we can work wit the data that we do have to make predictions on the data that we don’t have.

Since we have over 300 observations without missing values, we can use some of that information to predict what the missing parts of the given observation would have likely been if I had remembered to complete the rest of the entry.

We can use Imputation of the Mean of the Δ values to complete missing observations of the EGRESS variable. Since we know the time of departure for the outing and we know the average time of duration of an outing from the rest of the data, we can impute the NA values as shown below:

# Preserving original data
df_merged3 <- df_merged2

# Replacing NA values for the Δ variables
df_merged3$Δ[is.na(df_merged3$Δ)] <- mean(df_merged3$Δ, na.rm = TRUE)
df_merged3$Δ_min[is.na(df_merged3$Δ_min)] <- mean(df_merged3$Δ_min, na.rm = TRUE)

Taking a look at a visual representation of the merged and cleaned data