Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

combine_supp() now allows multiple QNAM values to go to the same IDVAR #66

Merged
merged 8 commits into from
Apr 19, 2024
Prev Previous commit
Next Next commit
Initial, intermediate work to allow multiple IDVAR referring to one QNAM
  • Loading branch information
billdenney committed Apr 12, 2024
commit 5a56713e4a31b98a660206f3a2876198843d2b4f
150 changes: 134 additions & 16 deletions R/supp.R
Original file line number Diff line number Diff line change
Expand Up @@ -156,22 +156,112 @@ combine_supp <- function(dataset, supp){
"")
stop(paste0(mess, ext, mis))
}
by <- names(dataset) %>%
discard(~ . %in% supp$QNAM) # Don't want any variables in our by statement

by <- setdiff(names(dataset), supp$QNAM) # Don't want any variables in our by statement

# In order to prevent issues when there are multiple IDVARS we need to merge
# each IDVAR into the domain seperately (otherwise there is problems when the
# two IDVARS don't overlap)

supp %>%
select(-any_of(c("QLABEL", "QORIG", "QEVAL"))) %>% #Removing columns not for the main dataset
rename(DOMAIN = RDOMAIN) %>%
group_by(IDVAR) %>% #For when there are multiple IDs
group_split() %>%
map(~combine_supp_by_idvar(dataset, .)) %>%
reduce(full_join, by= by)
browser()
stop()
supp_wides_prep <-
supp %>%
select(-any_of(c("QLABEL", "QORIG", "QEVAL"))) %>% #Removing columns not for the main dataset
rename(DOMAIN = RDOMAIN) %>%
group_by(IDVAR, QNAM) %>% #For when there are multiple IDs
group_split()

supp_wides <- purrr::pmap(.l = list(supp = supp_wides_prep), .f = combine_supp_make_wide)
ret <- reduce(.x = append(list(dataset), supp_wides), .f = combine_supp_join)
ret
}

# Create a wide version of `supp` for merging into the source dataset.
combine_supp_make_wide <- function(supp) {
stopifnot(length(unique(supp$IDVAR)) == 1)
stopifnot(length(unique(supp$QNAM)) == 1)
# Get the IDVAR value to allow for renaming of IDVARVAL
id_var <- unique(supp$IDVAR)

wide_x <-
supp %>%
pivot_wider(
names_from = QNAM,
values_from = QVAL
)
wide_x$QNAM <- unique(supp$QNAM)
if (!is.na(id_var) && id_var != "") {
wide_x <-
wide_x %>%
mutate(IDVARVAL = str_trim(as.character(IDVARVAL)))
} else {
wide_x$IDVARVAL <- NULL
}
wide_x
}

combine_supp_join <- function(dataset, supp) {
current_idvar <- unique(supp$IDVAR)
current_qnam <- unique(supp$QNAM)
stopifnot(length(current_idvar) == 1)
stopifnot(length(current_qnam) == 1)

browser()
stop()
by <- intersect(names(supp), c("STUDYID", "DOMAIN", "USUBJID", "IDVARVAL"))
supp_prep <- supp %>% select(-QNAM, -IDVAR)
new_column <- setdiff(names(supp_prep), by)
stopifnot(length(new_column) == 1)

# Prepare IDVARVAL
ret <- dataset
if ("IDVARVAL" %in% by) {
# Match the IDVARVAL column in supp
ret$IDVARVAL <- str_trim(as.character(ret[[current_idvar]]))
} else {
# A dummy column that can be removed later
ret$IDVARVAL <- FALSE
}

# Put the new data in
if (new_column %in% names(dataset)) {
# Patch the data
mask_na_ret_before <- is.na(ret[[new_column]])
ret_orig <- ret
browser()
stop()
ret <- dplyr::rows_patch(x = ret, y = supp_prep, by = by)
mask_na_ret_after <- is.na(ret[[new_column]])

expected_na_difference <- sum(!is.na(y[[new_column]]))
actual_na_difference <- sum(!mask_na_ret_after) - sum(!mask_na_ret_before)
if (expected_na_difference != actual_na_difference) {
stop("An unexpected number of rows were replaced while merging QNAM ", current_qnam, " and IDVAR ", current_idvar)
}
} else {
# Verify that nothing will be missed
missing <- anti_join(supp_prep, ret, by = by)

# Add message for when there are rows in the supp that didn't get merged
if(nrow(missing) > 0) {
missing_txt <-
capture.output(
missing %>%
select(USUBJID, all_of(current_idvar)) %>%
print()
) %>%
paste0(collapse = "\n")
stop(paste0("Not all rows of the Supp were merged. The following rows are missing:\n",
missing_txt),
call. = FALSE)
}

# join the data
ret <- left_join(dataset, supp_prep, by = by)
}
ret
}

#' Handles the combining of datasets and supps for a single IDVAR
#'
Expand All @@ -185,18 +275,15 @@ combine_supp <- function(dataset, supp){
#' @importFrom stringr str_trim
combine_supp_by_idvar <- function(dataset, supp){
# Get the IDVAR value to allow for renaming of IDVARVAL
id_var <- supp %>%
pull(IDVAR) %>%
unique()
id_var <- unique(supp$IDVAR)

wide_x <- supp %>%
pivot_wider(
names_from = QNAM,
values_from = QVAL) %>%
select(-IDVAR)


if(!is.na(id_var) && id_var != ""){
if(!is.na(id_var) && id_var != ""){
id_var_sym <- sym(id_var)

by <- c("STUDYID", "DOMAIN", "USUBJID", "IDVARVAL")
Expand All @@ -211,10 +298,10 @@ combine_supp_by_idvar <- function(dataset, supp){
out <- left_join(dataset_chr, wide_x,
by = by) %>%
select(-IDVARVAL)
missing<- anti_join(wide_x,dataset_chr, by = by)
missing <- anti_join(wide_x,dataset_chr, by = by)

# Add message for when there are rows in the supp that didn't get merged
if(nrow(missing) > 0){
if(nrow(missing) > 0) {
missing_txt <- capture.output(missing %>%
select(USUBJID, !!sym(id_var)) %>%
print()) %>%
Expand All @@ -233,3 +320,34 @@ combine_supp_by_idvar <- function(dataset, supp){
out
}

# full_join should be used when there are only new columns, but rows_patch
# should be used when there are existing columns.
combine_supp_join_or_patch <- function(x, y, by) {
other_cols_x <- setdiff(names(y), by)
other_cols_y <- setdiff(names(y), by)
column_overlap <- intersect(other_cols_x, other_cols_y)
columns_unique_y <- setdiff(other_cols_y, column_overlap)
ret <- x
if (length(columns_unique_y) > 0) {
y_no_overlap <- y[, setdiff(names(y), column_overlap)]
ret <- dplyr::full_join(x = ret, y = y_no_overlap, by = by)
}
for (current_nm in column_overlap) {
# update columns one-by-one, verifying that only NA values are replaced
mask_na_ret_before <- is.na(ret[[current_nm]])
ret_orig <- ret
browser()
stop()
ret <- dplyr::rows_patch(x = ret, y = y[, c(by, current_nm)], by = by)
mask_na_ret_after <- is.na(ret[[current_nm]])

expected_na_difference <- sum(!is.na(y[[current_nm]]))
actual_na_difference <- sum(!mask_na_ret_after) - sum(!mask_na_ret_before)
if (expected_na_difference != actual_na_difference) {
# TODO: Try to give the specific location of the error, but this would be
# invalid SDTM, and there is not a simple trace to the row with the issue.
stop("An unexpected number of rows were replaced while patching the data in column ", current_nm)
}
}
ret
}
23 changes: 23 additions & 0 deletions tests/testthat/test-supp.R
Original file line number Diff line number Diff line change
Expand Up @@ -226,3 +226,26 @@ test_that("zero-row supp returns data unchanged with a warning (#45)", {
)
expect_equal(result, safetyData::sdtm_ae)
})

test_that("multiple different IDVAR map to the same QNAM works", {
simple_ae <-
safetyData::sdtm_ae |>
filter(USUBJID %in% c("01-701-1015", "01-701-1023"))
simple_suppae <- safetyData::sdtm_suppae[c(1, 4), ]
simple_suppae$IDVAR[2] <- "AEDTC"
simple_suppae$IDVARVAL[2] <- "2012-09-02"
expect_equal(
combine_supp(simple_ae, supp = simple_suppae)$AETRTEM,
c("Y", NA, NA, NA, NA, NA, "Y")
)

# Replace the value in error
simple_suppae <- safetyData::sdtm_suppae[c(1, 4, 7), ]
simple_suppae$IDVAR[2] <- "AEDTC"
simple_suppae$IDVARVAL[2] <- "2012-09-02"

expect_error(
combine_supp(simple_ae, supp = simple_suppae)$AETRTEM,
c("Y", NA, NA, NA, NA, NA, "Y")
)
})