Mutating detection data into binary

Question

Currently I have a dataframe of bear detections that I want to convert into a binary detection history (14 columns of day1, day2, day3, etc. where:

actual_date_out = the date the camera was deployed, binary detection history should start on this date
start_date = every date there's a detection of a bear, this is used to determine which of the 14 days should get a 1 for the detection history (e.g. actual_date_out = 12/1, start_date values for 12/3, 12/5, would yield 00101...)

What I'm trying to do is start a 14 day detection history (e.g. 001010001...) for each plot, starting with "actual_date_out" and ending 14 days after that. The problem is that sometimes the first detection (start_time) is e.g. 3 days after the deployment date and some dates don't have data, so I would have to fill in a 0 for some plots starting at the first deployment date (actual_date_out) and then for every preceding date that doesn't have a "start_time" column value (only dates that have start time row/presence have a 1). Some plots have 1 detection the whole time, others multiple for each day. The idea is each day just gets 0 or 1. Some plots may have multiple start_time values but each plot will only have one actual_date_out (each camera is only deployed once.)

Currently I have code that kind of works except it's putting a 1 for the first column and only 0s for all the preceding instead of searching for a start_time and assigning a 1. Only dates that also have a start_date get a value of 1.

library(tidyverse)
d <- read_csv(file.choose())

#Change actual_date_out to date format
d$actual_date_out <- as.Date(d$actual_date_out, format = "%m/%d/%Y")
d$retrieval_date <- as.Date(d$retrieval_date, format = "%m/%d/%Y")
d$start_time <- as.Date(d$start_time, format = "%m/%d/%Y")

#For each plot, find the earliest date in actual_date_out and then add the next 13 days. For each plot, each of the remaining 13 days should be added with a pres_abs = 0
d1 <- d |>
  group_by(plot) |>
  mutate(start_time = min(actual_date_out)) |>
  complete(actual_date_out = seq.Date(first(start_time), first(start_time) + 20, by = "day")) |>
  mutate(pres_abs = if_else(is.na(pres_abs), 0, pres_abs)) |>
  ungroup()

# Removes duplicate entries by taking max of pres_abs
d2 <- d1 |>
  group_by(plot, actual_date_out) |>
  slice_max(pres_abs, with_ties = FALSE) |>
  ungroup()

# Create news days variable as a sequence
d2 <- d2 |>
  group_by(plot) |>
  arrange(actual_date_out) |>
  mutate(day = paste0("day_", row_number())) |>
  ungroup()

# For each plot, take the actual_lat and actual_long from the first entry and propogate it to all the other entries for that plot
# For each plot, record the very first day in the 14 day period as actual_date_out
d2 <- d2 |>
  group_by(plot) |>
  mutate(actual_lat = first(actual_lat),
         actual_long = first(actual_long),
         first_day = first(actual_date_out)) |>
  ungroup()

# Pivots to wider so each of the 14 days is a binarized variable
d3 <- d2 |>
  select(-c(retrieval_date,start_time, actual_date_out)) |>
  pivot_wider(id_cols = c(plot, actual_lat, actual_long, first_day),
              names_from = day,
              values_from = pres_abs)

> dput(d)
structure(list(plot = c("d22142", "d22142", "d22489", "d22489", 
"d23081", "d23081", "d23081", "d23302", "d23544", "d23544", "d23544", 
"d23544", "d23544", "d23544", "d23569", "d23569", "d23579", "d23647"
), actual_date_out = structure(c(17158, 17158, 17229, 17229, 
17273, 17273, 17273, 17272, 17326, 17326, 17326, 17326, 17326, 
17326, 17303, 17303, 17303, 17309), class = "Date"), retrieval_date = structure(c(17178, 
17178, 17250, 17250, 17293, 17293, 17293, 17291, 17349, 17349, 
17349, 17349, 17349, 17349, 17324, 17324, 17327, 17324), class = "Date"), 
    actual_lat = c(35.5767, 35.5767, 35.5901, 35.5901, 35.2851, 
    35.2851, 35.2851, 35.3086, 35.9439, 35.9439, 35.9439, 35.9439, 
    35.9439, 35.9439, 35.0581, 35.0581, 35.1264, 35.3453), actual_long = c(-82.4956, 
    -82.4956, -82.5901, -82.5901, -83.1089, -83.1089, -83.1089, 
    -82.5258, -82.6275, -82.6275, -82.6275, -82.6275, -82.6275, 
    -82.6275, -83.4274, -83.4274, -83.0983, -82.781), start_time = structure(c(17161, 
    17161, 17248, 17248, 17281, 17283, 17281, 17273, 17336, 17336, 
    17347, 17349, 17336, 17336, 17309, 17315, 17316, 17311), class = "Date"), 
    pres_abs = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
    1, 1, 1)), row.names = c(NA, -18L), spec = structure(list(
    cols = list(plot = structure(list(), class = c("collector_character", 
    "collector")), actual_date_out = structure(list(), class = c("collector_character", 
    "collector")), retrieval_date = structure(list(), class = c("collector_character", 
    "collector")), actual_lat = structure(list(), class = c("collector_double", 
    "collector")), actual_long = structure(list(), class = c("collector_double", 
    "collector")), start_time = structure(list(), class = c("collector_character", 
    "collector")), pres_abs = structure(list(), class = c("collector_double", 
    "collector"))), default = structure(list(), class = c("collector_guess", 
    "collector")), delim = ","), class = "col_spec"), class = c("spec_tbl_df", 
"tbl_df", "tbl", "data.frame"))

"actual_date_out = 12/1, start_date values for 12/3, 12/5, would yield 00101...": remember that you're asking an international community. Not everyone in the world uses m/d[/y] date formats... — Limey
– Limey, Commented Aug 12 at 6:42

M-- · Accepted Answer · 2025-08-12 08:24:15Z

We can simply do a summarize() and find the dates that have data in the 14-day window after actual_date_out.

library(tidyverse)

d_binary <- d %>%
  summarize(
    actual_date_out = first(actual_date_out),
    actual_lat = first(actual_lat),
    actual_long = first(actual_long),
    detection_history = paste0(as.numeric((actual_date_out + 0:13) 
                                          %in% start_time), collapse = ""),
    .by = plot
  )

#> # A tibble: 8 × 5
#>   plot   actual_date_out actual_lat actual_long detection_history
#>   <chr>  <date>               <dbl>       <dbl> <chr>            
#> 1 d22142 2016-12-23            35.6       -82.5 00010000000000   
#> 2 d22489 2017-03-04            35.6       -82.6 00000000000000   
#> 3 d23081 2017-04-17            35.3       -83.1 00000000101000   
#> 4 d23302 2017-04-16            35.3       -82.5 01000000000000   
#> 5 d23544 2017-06-09            35.9       -82.6 00000000001000   
#> 6 d23569 2017-05-17            35.1       -83.4 00000010000010   
#> 7 d23579 2017-05-17            35.1       -83.1 00000000000001   
#> 8 d23647 2017-05-23            35.3       -82.8 00100000000000

Here, I have done a simple visualization of the results;

d_binary %>%
  rowwise() %>%
  mutate(
    days = list(1:14),
    detections = list(as.numeric(strsplit(detection_history, "")[[1]])),
    dates = list(actual_date_out + 0:13)
  ) %>%
  unnest(c(days, detections, dates)) %>%
  ungroup() %>%
  mutate(
    date_label = if_else(
      detections == 1, 
      str_glue("{month(dates)}/{day(dates)}/{year(dates)}"), "")
  ) %>% 
  ggplot(., aes(x = days, y = reorder(plot, desc(plot)), 
                fill = factor(detections))) +
  geom_tile(color = "white", linewidth = 0.8, width = 0.9, height = 0.8) +
  geom_text(aes(label = date_label), 
            size = 2.5, color = "white", fontface = "bold") +
  scale_fill_manual(
    values = c("0" = "#d73027", "1" = "#1a9850"), 
    name = "Detection Status", 
    labels = c("0" = "Not Detected", "1" = "Detected")
  ) +
  scale_x_continuous(
    breaks = 1:14, 
    expand = c(0, 0)
  ) +
  scale_y_discrete(expand = c(0, 0)) +
  labs(
    title = "Bear Detection History",
    x = "Study Day",
    y = "Plot ID"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(hjust = 1, size = 9),
    axis.text.y = element_text(size = 9),
    panel.grid = element_blank(),
    panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    legend.position = "bottom",
    legend.title = element_text(face = "bold")
  )

^{Created on 2025-08-12 with reprex v2.1.1}

FJCC · Accepted Answer · 2025-08-12 01:09:19Z

Here is a different approach. I calculate the number of days from the actual_day_out to the start_time, use as the exponent of 2, make a binary number, and split the character representation of the binary number to fill the new columns.

I limited the data to the first 14 days, but you could go out to 32 days.

library(tidyverse)
d <- structure(list(plot = c("d22142", "d22142", "d22489", "d22489", 
                             "d23081", "d23081", "d23081", "d23302", "d23544", "d23544", "d23544", 
                             "d23544", "d23544", "d23544", "d23569", "d23569", "d23579", "d23647"), 
                    actual_date_out = structure(c(17158, 17158, 17229, 17229, 
                                                  17273, 17273, 17273, 17272, 17326, 17326, 17326, 17326, 17326, 
                                                  17326, 17303, 17303, 17303, 17309), class = "Date"), 
                    retrieval_date = structure(c(17178, 17178, 17250, 17250, 17293, 17293, 17293, 17291, 17349, 17349, 
                                                 17349, 17349, 17349, 17349, 17324, 17324, 17327, 17324), class = "Date"), 
                    actual_lat = c(35.5767, 35.5767, 35.5901, 35.5901, 35.2851, 
                                   35.2851, 35.2851, 35.3086, 35.9439, 35.9439, 35.9439, 35.9439, 
                                   35.9439, 35.9439, 35.0581, 35.0581, 35.1264, 35.3453), 
                    actual_long = c(-82.4956, -82.4956, -82.5901, -82.5901, -83.1089, -83.1089, -83.1089, 
                                    -82.5258, -82.6275, -82.6275, -82.6275, -82.6275, -82.6275, 
                                    -82.6275, -83.4274, -83.4274, -83.0983, -82.781), 
                    start_time = structure(c(17161, 17161, 17248, 17248, 17281, 17283, 17281, 17273, 17336, 17336, 
                                             17347, 17349, 17336, 17336, 17309, 17315, 17316, 17311), class = "Date"), 
                    pres_abs = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), 
               row.names = c(NA, -18L), spec = structure(list(
                 cols = list(plot = structure(list(), class = c("collector_character", 
                                                                "collector")), 
                             actual_date_out = structure(list(), class = c("collector_character", "collector")), 
                             retrieval_date = structure(list(), class = c("collector_character", "collector")), 
                             actual_lat = structure(list(), class = c("collector_double", "collector")), 
                             actual_long = structure(list(), class = c("collector_double", "collector")), 
                             start_time = structure(list(), class = c("collector_character", "collector")), 
                             pres_abs = structure(list(), class = c("collector_double", "collector"))), 
                 default = structure(list(), class = c("collector_guess", "collector")), delim = ","), class = "col_spec"), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"))

d$actual_date_out <- as.Date(d$actual_date_out, format = "%m/%d/%Y")
d$retrieval_date <- as.Date(d$retrieval_date, format = "%m/%d/%Y")
d$start_time <- as.Date(d$start_time, format = "%m/%d/%Y")

d1 <- d |>
  group_by(plot) |>
  mutate(start_date = min(actual_date_out))
d1 <- d1 |> mutate(CntDays = as.numeric(start_time - start_date))
d2 <- d1 |> filter(CntDays <=13)
d2 <- d2 |> mutate(forBinary = 2 ^ CntDays)
d3 <- d2 |> rowwise() |> 
     mutate(binary = paste(intToBits(forBinary)[1:14], collapse = ','))
d4 <- d3 |> separate_wider_delim(binary, delim = ',', names = paste("D", 0:13, sep = "_"))
d5 <- d4 |> mutate(across(D_0:D_13, as.numeric))

margusl · Accepted Answer · 2025-08-12 10:52:10Z

Assuming that cameras were deployed just once at each plot (i.e. one distinct actual_date_out value for each plot, as in example data), you could modify your current complete()-based approach a bit:

reduce number of rows and columns early (e.g. distinct() to handle multiple detections per plot per day)
calculate numeric day offsets for existing records (day)
complete day groupwise, generating ranges from 1 to group's max
fill introduced NAs (groupwise, using downup direction)
( keep or drop records where day offset is over 14 )
pivot, fill missing values

library(dplyr)
library(tidyr)

# input
d 
#> # A tibble: 18 × 7
#>    plot   actual_date_out retrieval_date actual_lat actual_long start_time pres_abs
#>    <chr>  <date>          <date>              <dbl>       <dbl> <date>        <dbl>
#>  1 d22142 2016-12-23      2017-01-12           35.6       -82.5 2016-12-26        1
#>  2 d22142 2016-12-23      2017-01-12           35.6       -82.5 2016-12-26        1
#>  3 d22489 2017-03-04      2017-03-25           35.6       -82.6 2017-03-23        1
#>  4 d22489 2017-03-04      2017-03-25           35.6       -82.6 2017-03-23        1
#>  5 d23081 2017-04-17      2017-05-07           35.3       -83.1 2017-04-25        1
#>  6 d23081 2017-04-17      2017-05-07           35.3       -83.1 2017-04-27        1
#>  7 d23081 2017-04-17      2017-05-07           35.3       -83.1 2017-04-25        1
#>  8 d23302 2017-04-16      2017-05-05           35.3       -82.5 2017-04-17        1
#>  9 d23544 2017-06-09      2017-07-02           35.9       -82.6 2017-06-19        1
#> 10 d23544 2017-06-09      2017-07-02           35.9       -82.6 2017-06-19        1
#> 11 d23544 2017-06-09      2017-07-02           35.9       -82.6 2017-06-30        1
#> 12 d23544 2017-06-09      2017-07-02           35.9       -82.6 2017-07-02        1
#> 13 d23544 2017-06-09      2017-07-02           35.9       -82.6 2017-06-19        1
#> 14 d23544 2017-06-09      2017-07-02           35.9       -82.6 2017-06-19        1
#> 15 d23569 2017-05-17      2017-06-07           35.1       -83.4 2017-05-23        1
#> 16 d23569 2017-05-17      2017-06-07           35.1       -83.4 2017-05-29        1
#> 17 d23579 2017-05-17      2017-06-10           35.1       -83.1 2017-05-30        1
#> 18 d23647 2017-05-23      2017-06-07           35.3       -82.8 2017-05-25        1

d |> 
  distinct(plot, start_time, .keep_all = TRUE) |> 
  mutate(
    day = (difftime(start_time, actual_date_out, units = "days") + 1) |> as.integer(),
    pres_abs = as.integer(pres_abs)
  ) |> 
  select(plot, starts_with("actual_"), day, pres_abs) |> 
  group_by(plot) |>  
  complete(day = 1:max(day), fill = list(pres_abs = 0)) |>
  fill(everything(), .direction = "downup") |> 
  ungroup() |> 
  # trim to 14 days, if needed:
  # filter(day <= 14) |> 
  pivot_wider(names_from = day, values_from = pres_abs, values_fill = 0, names_prefix = "day_")
#> # A tibble: 8 × 28
#>   plot   actual_date_out actual_lat actual_long day_1 day_2 day_3 day_4 day_5 day_6 day_7 day_8 day_9 day_10 day_11 day_12 day_13 day_14 day_15 day_16 day_17 day_18 day_19 day_20 day_21 day_22 day_23 day_24
#>   <chr>  <date>               <dbl>       <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>
#> 1 d22142 2016-12-23            35.6       -82.5     0     0     0     1     0     0     0     0     0      0      0      0      0      0      0      0      0      0      0      0      0      0      0      0
#> 2 d22489 2017-03-04            35.6       -82.6     0     0     0     0     0     0     0     0     0      0      0      0      0      0      0      0      0      0      0      1      0      0      0      0
#> 3 d23081 2017-04-17            35.3       -83.1     0     0     0     0     0     0     0     0     1      0      1      0      0      0      0      0      0      0      0      0      0      0      0      0
#> 4 d23302 2017-04-16            35.3       -82.5     0     1     0     0     0     0     0     0     0      0      0      0      0      0      0      0      0      0      0      0      0      0      0      0
#> 5 d23544 2017-06-09            35.9       -82.6     0     0     0     0     0     0     0     0     0      0      1      0      0      0      0      0      0      0      0      0      0      1      0      1
#> 6 d23569 2017-05-17            35.1       -83.4     0     0     0     0     0     0     1     0     0      0      0      0      1      0      0      0      0      0      0      0      0      0      0      0
#> 7 d23579 2017-05-17            35.1       -83.1     0     0     0     0     0     0     0     0     0      0      0      0      0      1      0      0      0      0      0      0      0      0      0      0
#> 8 d23647 2017-05-23            35.3       -82.8     0     0     1     0     0     0     0     0     0      0      0      0      0      0      0      0      0      0      0      0      0      0      0      0

Post-complete():

#> # A tibble: 91 × 6
#> # Groups:   plot [8]
#>    plot     day actual_date_out actual_lat actual_long pres_abs
#>    <chr>  <int> <date>               <dbl>       <dbl>    <int>
#>  1 d22142     1 NA                    NA          NA          0
#>  2 d22142     2 NA                    NA          NA          0
#>  3 d22142     3 NA                    NA          NA          0
#>  4 d22142     4 2016-12-23            35.6       -82.5        1
#>  5 d22489     1 NA                    NA          NA          0
#>  6 d22489     2 NA                    NA          NA          0
#>  7 d22489     3 NA                    NA          NA          0
#>  8 d22489     4 NA                    NA          NA          0
#>  9 d22489     5 NA                    NA          NA          0
#> 10 d22489     6 NA                    NA          NA          0
#> 11 d22489     7 NA                    NA          NA          0
#> 12 d22489     8 NA                    NA          NA          0
#> 13 d22489     9 NA                    NA          NA          0
#> 14 d22489    10 NA                    NA          NA          0
#> 15 d22489    11 NA                    NA          NA          0
#> 16 d22489    12 NA                    NA          NA          0
#> 17 d22489    13 NA                    NA          NA          0
#> 18 d22489    14 NA                    NA          NA          0
#> 19 d22489    15 NA                    NA          NA          0
#> 20 d22489    16 NA                    NA          NA          0
#> 21 d22489    17 NA                    NA          NA          0
#> 22 d22489    18 NA                    NA          NA          0
#> 23 d22489    19 NA                    NA          NA          0
#> 24 d22489    20 2017-03-04            35.6       -82.6        1
#> # ℹ 67 more rows

Post-fill():

#> # A tibble: 91 × 6
#> # Groups:   plot [8]
#>    plot     day actual_date_out actual_lat actual_long pres_abs
#>    <chr>  <int> <date>               <dbl>       <dbl>    <dbl>
#>  1 d22142     1 2016-12-23            35.6       -82.5        0
#>  2 d22142     2 2016-12-23            35.6       -82.5        0
#>  3 d22142     3 2016-12-23            35.6       -82.5        0
#>  4 d22142     4 2016-12-23            35.6       -82.5        1
#>  5 d22489     1 2017-03-04            35.6       -82.6        0
#>  6 d22489     2 2017-03-04            35.6       -82.6        0
#>  7 d22489     3 2017-03-04            35.6       -82.6        0
#>  8 d22489     4 2017-03-04            35.6       -82.6        0
#>  9 d22489     5 2017-03-04            35.6       -82.6        0
#> 10 d22489     6 2017-03-04            35.6       -82.6        0
#> 11 d22489     7 2017-03-04            35.6       -82.6        0
#> 12 d22489     8 2017-03-04            35.6       -82.6        0
#> 13 d22489     9 2017-03-04            35.6       -82.6        0
#> 14 d22489    10 2017-03-04            35.6       -82.6        0
#> 15 d22489    11 2017-03-04            35.6       -82.6        0
#> 16 d22489    12 2017-03-04            35.6       -82.6        0
#> 17 d22489    13 2017-03-04            35.6       -82.6        0
#> 18 d22489    14 2017-03-04            35.6       -82.6        0
#> 19 d22489    15 2017-03-04            35.6       -82.6        0
#> 20 d22489    16 2017-03-04            35.6       -82.6        0
#> 21 d22489    17 2017-03-04            35.6       -82.6        0
#> 22 d22489    18 2017-03-04            35.6       -82.6        0
#> 23 d22489    19 2017-03-04            35.6       -82.6        0
#> 24 d22489    20 2017-03-04            35.6       -82.6        1
#> # ℹ 67 more rows

Collectives™ on Stack Overflow

Mutating detection data into binary

3 Answers 3

Comments

Comments

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

3 Answers 3

Comments

Comments

Comments

Your Answer

Sign up or log in

Post as a guest

Related