Initial, intermediate work to allow multiple IDVAR referring to one QNAM

pharmaverse · statasaurus · Apr 19, 2024 · Apr 12, 2024 · Apr 12, 2024 · Apr 12, 2024
commit 5a56713e4a31b98a660206f3a2876198843d2b4f
diff --git a/R/supp.R b/R/supp.R
@@ -156,22 +156,112 @@ combine_supp <- function(dataset, supp){
                      "")
       stop(paste0(mess, ext, mis))
    }
-   by <- names(dataset) %>%
-      discard(~ . %in% supp$QNAM) # Don't want any variables in our by statement
+
+   by <- setdiff(names(dataset), supp$QNAM) # Don't want any variables in our by statement
 
    # In order to prevent issues when there are multiple IDVARS we need to merge
    # each IDVAR into the domain seperately (otherwise there is problems when the
    # two IDVARS don't overlap)
 
-   supp %>%
-      select(-any_of(c("QLABEL", "QORIG", "QEVAL"))) %>% #Removing columns not for the main dataset
-      rename(DOMAIN = RDOMAIN) %>%
-      group_by(IDVAR) %>% #For when there are multiple IDs
-      group_split() %>%
-      map(~combine_supp_by_idvar(dataset, .)) %>%
-      reduce(full_join, by= by)
+   browser()
+   stop()
+   supp_wides_prep <-
+     supp %>%
+     select(-any_of(c("QLABEL", "QORIG", "QEVAL"))) %>% #Removing columns not for the main dataset
+     rename(DOMAIN = RDOMAIN) %>%
+     group_by(IDVAR, QNAM) %>% #For when there are multiple IDs
+     group_split()
+
+   supp_wides <- purrr::pmap(.l = list(supp = supp_wides_prep), .f = combine_supp_make_wide)
+   ret <- reduce(.x = append(list(dataset), supp_wides), .f = combine_supp_join)
+   ret
+}
+
+# Create a wide version of `supp` for merging into the source dataset.
+combine_supp_make_wide <- function(supp) {
+  stopifnot(length(unique(supp$IDVAR)) == 1)
+  stopifnot(length(unique(supp$QNAM)) == 1)
+  # Get the IDVAR value to allow for renaming of IDVARVAL
+  id_var <- unique(supp$IDVAR)
+
+  wide_x <-
+    supp %>%
+    pivot_wider(
+      names_from = QNAM,
+      values_from = QVAL
+    )
+  wide_x$QNAM <- unique(supp$QNAM)
+  if (!is.na(id_var) && id_var != "") {
+    wide_x <-
+      wide_x %>%
+      mutate(IDVARVAL = str_trim(as.character(IDVARVAL)))
+  } else {
+    wide_x$IDVARVAL <- NULL
+  }
+  wide_x
 }
 
+combine_supp_join <- function(dataset, supp) {
+  current_idvar <- unique(supp$IDVAR)
+  current_qnam <- unique(supp$QNAM)
+  stopifnot(length(current_idvar) == 1)
+  stopifnot(length(current_qnam) == 1)
+
+  browser()
+  stop()
+  by <- intersect(names(supp), c("STUDYID", "DOMAIN", "USUBJID", "IDVARVAL"))
+  supp_prep <- supp %>% select(-QNAM, -IDVAR)
+  new_column <- setdiff(names(supp_prep), by)
+  stopifnot(length(new_column) == 1)
+
+  # Prepare IDVARVAL
+  ret <- dataset
+  if ("IDVARVAL" %in% by) {
+    # Match the IDVARVAL column in supp
+    ret$IDVARVAL <- str_trim(as.character(ret[[current_idvar]]))
+  } else {
+    # A dummy column that can be removed later
+    ret$IDVARVAL <- FALSE
+  }
+
+  # Put the new data in
+  if (new_column %in% names(dataset)) {
+    # Patch the data
+    mask_na_ret_before <- is.na(ret[[new_column]])
+    ret_orig <- ret
+    browser()
+    stop()
+    ret <- dplyr::rows_patch(x = ret, y = supp_prep, by = by)
+    mask_na_ret_after <- is.na(ret[[new_column]])
+
+    expected_na_difference <- sum(!is.na(y[[new_column]]))
+    actual_na_difference <- sum(!mask_na_ret_after) - sum(!mask_na_ret_before)
+    if (expected_na_difference != actual_na_difference) {
+      stop("An unexpected number of rows were replaced while merging QNAM ", current_qnam, " and IDVAR ", current_idvar)
+    }
+  } else {
+    # Verify that nothing will be missed
+    missing <- anti_join(supp_prep, ret, by = by)
+
+    # Add message for when there are rows in the supp that didn't get merged
+    if(nrow(missing) > 0) {
+      missing_txt <-
+        capture.output(
+          missing %>%
+            select(USUBJID, all_of(current_idvar)) %>%
+            print()
+        ) %>%
+        paste0(collapse = "\n")
+      stop(paste0("Not all rows of the Supp were merged. The following rows are missing:\n",
+                  missing_txt),
+           call. = FALSE)
+    }
+
+    # join the data
+    ret <- left_join(dataset, supp_prep, by = by)
+  }
+  ret
+}
 
 #' Handles the combining of datasets and supps for a single IDVAR
 #'
@@ -185,18 +275,15 @@ combine_supp <- function(dataset, supp){
 #' @importFrom stringr str_trim
 combine_supp_by_idvar <- function(dataset, supp){
    # Get the IDVAR value to allow for renaming of IDVARVAL
-   id_var <- supp %>%
-      pull(IDVAR) %>%
-      unique()
+   id_var <- unique(supp$IDVAR)
 
    wide_x <- supp %>%
       pivot_wider(
          names_from = QNAM,
          values_from = QVAL) %>%
       select(-IDVAR)
 
-
-   if(!is.na(id_var) && id_var  != ""){
+   if(!is.na(id_var) && id_var != ""){
       id_var_sym <- sym(id_var)
 
       by <- c("STUDYID", "DOMAIN", "USUBJID", "IDVARVAL")
@@ -211,10 +298,10 @@ combine_supp_by_idvar <- function(dataset, supp){
       out <- left_join(dataset_chr, wide_x,
                        by = by) %>%
          select(-IDVARVAL)
-      missing<- anti_join(wide_x,dataset_chr, by = by)
+      missing <- anti_join(wide_x,dataset_chr, by = by)
 
       # Add message for when there are rows in the supp that didn't get merged
-      if(nrow(missing) > 0){
+      if(nrow(missing) > 0) {
          missing_txt <- capture.output(missing %>%
                                           select(USUBJID, !!sym(id_var)) %>%
                                           print()) %>%
@@ -233,3 +320,34 @@ combine_supp_by_idvar <- function(dataset, supp){
    out
 }
 
+# full_join should be used when there are only new columns, but rows_patch
+# should be used when there are existing columns.
+combine_supp_join_or_patch <- function(x, y, by) {
+  other_cols_x <- setdiff(names(y), by)
+  other_cols_y <- setdiff(names(y), by)
+  column_overlap <- intersect(other_cols_x, other_cols_y)
+  columns_unique_y <- setdiff(other_cols_y, column_overlap)
+  ret <- x
+  if (length(columns_unique_y) > 0) {
+    y_no_overlap <- y[, setdiff(names(y), column_overlap)]
+    ret <- dplyr::full_join(x = ret, y = y_no_overlap, by = by)
+  }
+  for (current_nm in column_overlap) {
+    # update columns one-by-one, verifying that only NA values are replaced
+    mask_na_ret_before <- is.na(ret[[current_nm]])
+    ret_orig <- ret
+    browser()
+    stop()
+    ret <- dplyr::rows_patch(x = ret, y = y[, c(by, current_nm)], by = by)
+    mask_na_ret_after <- is.na(ret[[current_nm]])
+
+    expected_na_difference <- sum(!is.na(y[[current_nm]]))
+    actual_na_difference <- sum(!mask_na_ret_after) - sum(!mask_na_ret_before)
+    if (expected_na_difference != actual_na_difference) {
+      # TODO: Try to give the specific location of the error, but this would be
+      # invalid SDTM, and there is not a simple trace to the row with the issue.
+      stop("An unexpected number of rows were replaced while patching the data in column ", current_nm)
+    }
+  }
+  ret
+}
diff --git a/tests/testthat/test-supp.R b/tests/testthat/test-supp.R
@@ -226,3 +226,26 @@ test_that("zero-row supp returns data unchanged with a warning (#45)", {
   )
   expect_equal(result, safetyData::sdtm_ae)
 })
+
+test_that("multiple different IDVAR map to the same QNAM works", {
+  simple_ae <-
+    safetyData::sdtm_ae |>
+    filter(USUBJID %in% c("01-701-1015", "01-701-1023"))
+  simple_suppae <- safetyData::sdtm_suppae[c(1, 4), ]
+  simple_suppae$IDVAR[2] <- "AEDTC"
+  simple_suppae$IDVARVAL[2] <- "2012-09-02"
+  expect_equal(
+    combine_supp(simple_ae, supp = simple_suppae)$AETRTEM,
+    c("Y", NA, NA, NA, NA, NA, "Y")
+  )
+
+  # Replace the value in error
+  simple_suppae <- safetyData::sdtm_suppae[c(1, 4, 7), ]
+  simple_suppae$IDVAR[2] <- "AEDTC"
+  simple_suppae$IDVARVAL[2] <- "2012-09-02"
+
+  expect_error(
+    combine_supp(simple_ae, supp = simple_suppae)$AETRTEM,
+    c("Y", NA, NA, NA, NA, NA, "Y")
+  )
+})