Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
3ad441f
#51 bd setdt initially
zhizhongpu Oct 29, 2024
58ee554
#51 bd add warning for duplicated ids
zhizhongpu Oct 29, 2024
d981920
#51 bd ComputeFirstDifferences
zhizhongpu Oct 29, 2024
a054ed4
#51 doc time insensitive questions
zhizhongpu Oct 29, 2024
da84fb6
#51 bd temp file for dev and tests
zhizhongpu Oct 29, 2024
e034f5f
#51 bd adapt to dt grammar
zhizhongpu Oct 29, 2024
ed23634
#51 bd adapt Addmeans.R
zhizhongpu Oct 29, 2024
b2b0273
#51 bd adapt test-AddMeans.R
zhizhongpu Oct 29, 2024
c4e20e5
#51 bd MRE for data.table problem
zhizhongpu Oct 29, 2024
a241985
#51 doc benchmark solutions
zhizhongpu Oct 30, 2024
3b8945b
#51 bd default banning duplicated rows
zhizhongpu Mar 8, 2025
c92ab2c
#51 doc
zhizhongpu Mar 8, 2025
82232a8
#51 bd shallow copy
zhizhongpu Mar 8, 2025
a6407d7
#51 bd df->dt in ComputeFirstDifferences
zhizhongpu Mar 8, 2025
d354f30
#51 bd improve checking input col types in EventStudy()
zhizhongpu Mar 8, 2025
9b01b2b
#51 fx bugs
zhizhongpu Mar 8, 2025
7f8d398
#51 doc
zhizhongpu Mar 8, 2025
9039a9e
#51 bd change test-ComputeFirstDifferences.R input to dt
zhizhongpu Mar 8, 2025
92f71c8
#51 fx bug
zhizhongpu Mar 8, 2025
2fa1ee2
#51 bd modify benchmarking copying solutions
zhizhongpu Mar 8, 2025
c588170
#51 bd test for data invariance
zhizhongpu Mar 8, 2025
2511786
#51 doc
zhizhongpu Mar 8, 2025
15ef116
#52 fx attempt to fix GitHub action failures by not modifying example…
zhizhongpu Jun 5, 2025
5aa2b5a
#51 bd no need to convert input data into DF in test
zhizhongpu Jun 5, 2025
d79d069
#51 fx bug
zhizhongpu Jun 5, 2025
de4e7cd
#51 cl add namespace
zhizhongpu Jun 5, 2025
6feda61
#54 cl
zhizhongpu Jun 7, 2025
ba82321
#54 cl
zhizhongpu Jun 7, 2025
69d635a
#51 aes
zhizhongpu Jul 18, 2025
4db6aae
#51 cl
zhizhongpu Jul 18, 2025
da88346
#51 cl drop ComputeFirstDifferences(return_df)
zhizhongpu Jul 18, 2025
804bc6a
#51 cl simplify checking logical args
zhizhongpu Jul 18, 2025
5d3c26d
#51 cl
zhizhongpu Jul 18, 2025
5b27c2e
#54 drop return_df from ComputeFirstDifferences
zhizhongpu Aug 1, 2025
91ef3c4
#54 drop return_df from ComputeFirstDifferences
zhizhongpu Aug 1, 2025
4c2a7a4
#51 bd add namespace dt
zhizhongpu Aug 1, 2025
936cc0b
#54 fx drop vacuous check
zhizhongpu Aug 1, 2025
ced837d
#54 cl drop `return_df`
zhizhongpu Aug 1, 2025
45f12b2
#51 fx bug in test
zhizhongpu Aug 1, 2025
9e3934b
#54 cl trash
zhizhongpu Aug 1, 2025
6bffa52
#54 bd make ComputeShifts take in data.table
zhizhongpu Aug 1, 2025
2c0b82c
#54 bd setDT -> as.data.table()
zhizhongpu Aug 1, 2025
835b6cc
#54 bd avoid_internal_copy
zhizhongpu Aug 1, 2025
b1a5dbf
#54 fx missing data.table::copy
zhizhongpu Aug 2, 2025
51b0ac3
#54 fx copy
zhizhongpu Aug 2, 2025
63d4d1c
#54 fx namespace bug
zhizhongpu Aug 2, 2025
08852e6
#54 bd test for avoid_internal_copy
zhizhongpu Aug 2, 2025
faf7efe
#54 bd update roxygen documentation
zhizhongpu Aug 2, 2025
d6c1c4a
Merge branch 'main' into 51-switch-to-datatable-throughout
santiagohermo Aug 3, 2025
cce9e84
phrasing
santiagohermo Aug 3, 2025
38b4448
#29 fx "subtraction is peformed on the correct column" test
zhizhongpu Aug 4, 2025
02384ef
#54 fx non-vacuous test for [subtraction is peformed on the correct c…
zhizhongpu Aug 4, 2025
2616590
#54 bd drop vacuous test
zhizhongpu Aug 6, 2025
77806a4
#54 bd increase version
zhizhongpu Aug 6, 2025
b36d841
#54 cl drop issue folder
zhizhongpu Aug 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
!data/*.RData
!data/*.rda
*.csv


inst/doc
revdep/
*.Rproj
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: eventstudyr
Title: Estimation and Visualization of Linear Panel Event Studies
Version: 1.1.4
Version: 1.1.5
Authors@R:
c(person(given = "Simon",
family = "Freyaldenhoven",
Expand Down
3 changes: 2 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ importFrom(data.table,":=")
importFrom(data.table,.SD)
importFrom(data.table,CJ)
importFrom(data.table,as.data.table)
importFrom(data.table,setDT)
importFrom(data.table,copy)
importFrom(data.table,is.data.table)
importFrom(data.table,setnames)
importFrom(data.table,setorder)
importFrom(data.table,setorderv)
Expand Down
9 changes: 4 additions & 5 deletions R/AddMeans.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,11 @@ AddMeans <- function(df, normalization_column, policyvar, outcomevar,

if (is.null(custom_scalar)) {
if (grepl(paste0(policyvar, "_lead"), normalization_column)) {
df_change <- df[df[, normalization_column] == 0 & !is.na(df[, normalization_column]), ]
}else {
df_change <- df[df[, normalization_column] != 0 & !is.na(df[, normalization_column]), ]
df_change <- df[get(normalization_column) == 0 & !is.na(get(normalization_column))]
} else {
df_change <- df[get(normalization_column) != 0 & !is.na(get(normalization_column))]
}
mean <- mean(df_change[[outcomevar]], na.rm = T)

mean <- df_change[, mean(get(outcomevar), na.rm = TRUE)]
} else {
mean <- custom_scalar
}
Expand Down
45 changes: 18 additions & 27 deletions R/ComputeFirstDifferences.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
#' Adds first differences of a variable, robustly to gaps in time variable, as new columns in a panel dataset
#'
#' @param df Data frame that will be modified.
#' @param dt Data frame that will be modified.
#' @param idvar Character indicating column of units.
#' @param timevar Character indicating column of time periods.
#' @param diffvar Character indicating column of variable whose first difference will be taken.
#' @param timevar_holes Logical indicating whether the panel contains gaps in the time variable. Defaults to FALSE.
#' @param return_df Logical indicating whether the function should return a data frame (TRUE) or data.table (FALSE). Defaults to TRUE.
#'
#' @return The passed dataset augmented with columns that reflect the desired first differences
#'
Expand All @@ -19,59 +18,51 @@
#' diffvar = "z"
#' )
#'
#' @importFrom data.table setDT setorderv setnames shift := CJ .SD
#' @importFrom data.table setorderv setnames shift := CJ .SD
#'
#' @keywords internal
#' @noRd

ComputeFirstDifferences <- function(df, idvar, timevar, diffvar,
timevar_holes = FALSE, return_df = TRUE) {
if (! is.data.frame(df)) {
stop("df should be a data frame.")
ComputeFirstDifferences <- function(dt, idvar, timevar, diffvar,
timevar_holes = FALSE) {
if (! data.table::is.data.table(dt)) {
stop("Input data should be a data.table.")
}
for (var in c(idvar, timevar, diffvar)) {
if ((! is.character(var))) {
stop(paste0(var, " should be a character."))
}
if (! var %in% colnames(df)) {
if (! var %in% colnames(dt)) {
stop(paste0(var, " should be the name of a variable in the dataset."))
}
}
if (! is.logical(timevar_holes)) {
stop("timevar_holes should be logical.")
}
if (! is.logical(return_df)) {
stop("return_df should be logical.")
}

data.table::setDT(df)
data.table::setorderv(df, cols = c(idvar, timevar))
data.table::setorderv(dt, cols = c(idvar, timevar))

if (!timevar_holes) {
df[, paste0(diffvar, "_fd") := get(diffvar) - shift((get(diffvar))),
dt[, paste0(diffvar, "_fd") := get(diffvar) - data.table::shift((get(diffvar))),
by = idvar]
} else {
## Create dataset with all combinations to compute first differences
all_combinations <- CJ(unique(df[[idvar]]),
min(df[[timevar]]):max(df[[timevar]]))
setnames(all_combinations, new = c(idvar, timevar))
all_combinations <- data.table::CJ(unique(dt[[idvar]]),
min(dt[[timevar]]):max(dt[[timevar]]))
data.table::setnames(all_combinations, new = c(idvar, timevar))

df_all <- merge(df, all_combinations,
by = c(idvar, timevar), all = TRUE)
dt_all <- data.table::merge.data.table(dt, all_combinations,
by = c(idvar, timevar), all = TRUE)

df_all[, paste0(diffvar, "_fd") := get(diffvar) - shift((get(diffvar))),
dt_all[, paste0(diffvar, "_fd") := get(diffvar) - data.table::shift((get(diffvar))),
by = idvar]

## Bring first differences back to the original dataset
vars_to_keep <- c(idvar, timevar, paste0(diffvar, "_fd"))

df <- merge(df, df_all[, .SD, .SDcols = vars_to_keep],
by = c(idvar, timevar), all.x = TRUE)
}

if (return_df) {
df <- as.data.frame(df)
dt <- data.table::merge.data.table(dt, dt_all[, .SD, .SDcols = vars_to_keep],
by = c(idvar, timevar), all.x = TRUE)
}

return(df)
return(dt)
}
18 changes: 4 additions & 14 deletions R/ComputeShifts.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
#' @param shiftvar Character indicating column of variable that will be shifted.
#' @param shiftvalues Numeric vector specifying the leads/lags to be computed. For example, c(-1, 0, 1) will compute the lead, current, and lag values.
#' @param timevar_holes Logical indicating whether the panel contains gaps in the time variable. Defaults to FALSE.
#' @param return_df Logical indicating whether the function should return a data frame (TRUE) or data.table (FALSE). Defaults to TRUE.
#'
#' @return The passed dataset augmented with columns that reflect the desired shift values

Expand All @@ -22,14 +21,14 @@
#' )
#'
#'
#' @importFrom data.table setDT setorderv shift := CJ .SD
#' @importFrom data.table setorderv shift := CJ .SD
#' @keywords internal
#' @noRd

ComputeShifts <- function(df, idvar, timevar, shiftvar, shiftvalues,
timevar_holes = FALSE, return_df = TRUE) {
if (! is.data.frame(df)) {
stop("df should be a data frame.")
timevar_holes = FALSE) {
if (! data.table::is.data.table(df)) {
stop("df should be a data.table.")
}
for (var in c(idvar, timevar, shiftvar)) {
if ((! is.character(var))) {
Expand All @@ -48,11 +47,7 @@ ComputeShifts <- function(df, idvar, timevar, shiftvar, shiftvalues,
if (! is.logical(timevar_holes)) {
stop("timevar_holes should be logical.")
}
if (! is.logical(return_df)) {
stop("return_df should be logical.")
}

data.table::setDT(df)
data.table::setorderv(df, cols = c(idvar, timevar))

lags <- shiftvalues[shiftvalues > 0]
Expand Down Expand Up @@ -92,10 +87,5 @@ ComputeShifts <- function(df, idvar, timevar, shiftvar, shiftvalues,
df <- merge(df, df_all,
by = c(idvar, timevar, shiftvar), all.x = TRUE)
}

if (return_df) {
df <- as.data.frame(df)
}

return(df)
}
55 changes: 40 additions & 15 deletions R/EventStudy.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,14 @@
#' @param normalize Specifies the event-time coefficient to be normalized. Defaults to - pre - 1.
#' @param anticipation_effects_normalization If set to TRUE, runs the default process and switches coefficient to be normalized to 0
#' when there are anticipation effects. If set to FALSE, does not make the switch. Defaults to TRUE.
#' @param allow_duplicate_id If TRUE, the function estimates a regression where duplicated ID-time rows are weighted by their duplication count. If FALSE, the function raises an error if duplicate unit-time keys exist in the input data. Default is FALSE.
#' @param avoid_internal_copy If TRUE, the function avoids making an internal deep copy of the input data, and instead directly modifies the input data.table. Default is FALSE.
#'
#' @return A list that contains, under "output", the estimation output as an lm_robust object, and under "arguments", the arguments passed to the function.
#' @import dplyr
#' @import estimatr
#' @importFrom stats reformulate
#' @importFrom data.table setorderv as.data.table .SD
#' @importFrom data.table setorderv as.data.table is.data.table .SD copy
#' @export
#'
#' @examples
Expand Down Expand Up @@ -142,15 +144,20 @@

EventStudy <- function(estimator, data, outcomevar, policyvar, idvar, timevar, controls = NULL,
proxy = NULL, proxyIV = NULL, FE = TRUE, TFE = TRUE, post, overidpost = 1, pre, overidpre = post + pre,
normalize = -1 * (pre + 1), cluster = TRUE, anticipation_effects_normalization = TRUE) {
normalize = -1 * (pre + 1), cluster = TRUE, anticipation_effects_normalization = TRUE,
allow_duplicate_id = FALSE, avoid_internal_copy = FALSE) {

# Check for errors in arguments
if (! estimator %in% c("OLS", "FHS")) {stop("estimator should be either 'OLS' or 'FHS'.")}
if (! is.data.frame(data)) {stop("data should be a data frame.")}
if (! is.character(outcomevar)) {stop("outcomevar should be a character.")}
if (! is.character(policyvar)) {stop("policyvar should be a character.")}
if (! is.character(idvar)) {stop("idvar should be a character.")}
if (! is.character(timevar)) {stop("timevar should be a character.")}
for (var in c(idvar, timevar, outcomevar, policyvar)) {
if ((! is.character(var))) {
stop(paste0(var, " should be a character."))
}
if (! var %in% colnames(data)) {
stop(paste0(var, " should be the name of a variable in the dataset."))
}
}
if (! (is.null(controls) | is.character(controls))) {stop("controls should be either NULL or a character.")}

if ((estimator == "OLS" & ! is.null(proxy))) {stop("proxy should only be specified when estimator = 'FHS'.")}
Expand All @@ -162,11 +169,13 @@ EventStudy <- function(estimator, data, outcomevar, policyvar, idvar, timevar, c
stop("When estimator is 'FHS' and there are no leads in the model, proxyIV must be specified explicitly.")
}

if (! is.logical(FE)) {stop("FE should be either TRUE or FALSE.")}
if (! is.logical(TFE)) {stop("TFE should be either TRUE or FALSE.")}
if (! is.logical(cluster)) {stop("cluster should be either TRUE or FALSE.")}
for (var in c(FE, TFE, cluster, anticipation_effects_normalization, allow_duplicate_id, avoid_internal_copy)) {
if (! is.logical(var)) {
stop(paste0(var, " should be either TRUE or FALSE."))
}
}

if (FE & !cluster) {stop("cluster=TRUE is required when FE=TRUE.")}
if (! is.logical(anticipation_effects_normalization)) {stop("anticipation_effects_normalization should be either TRUE or FALSE.")}

if (! (is.numeric(post) & post >= 0 & post %% 1 == 0)) {stop("post should be a whole number.")}
if (! (is.numeric(overidpost) & overidpost >= 0 & overidpost %% 1 == 0)) {stop("overidpost should be a whole number.")}
Expand All @@ -177,15 +186,27 @@ EventStudy <- function(estimator, data, outcomevar, policyvar, idvar, timevar, c
& normalize >= -(pre + overidpre + 1) & normalize <= post + overidpost)) {
stop("normalize should be an integer between -(pre + overidpre + 1) and (post + overidpost).")
}
if (avoid_internal_copy & ! data.table::is.data.table(data)) {
warning("`avoid_internal_copy` has no effect because dataset passed to `data` is not a `data.table`.")
}

# Check for errors in data
if (! is.numeric(data[[timevar]])) {stop("timevar column in dataset should be numeric.")}
if (! all(data[[timevar]] %% 1 == 0)) {
stop("timevar column in dataset should be a vector of integers.")
}

data_ids <- as.data.frame(data)[, c(idvar, timevar)]
if (data.table::is.data.table(data)) {
if (!avoid_internal_copy) {
data <- data.table::copy(data)
}
} else {
data <- data.table::as.data.table(data)
}
data.table::setorderv(data, c(idvar, timevar))
data_ids <- data[, .SD, .SDcols = c(idvar, timevar)]

# Check panel balance and unique keys
n_units <- length(base::unique(data[[idvar]]))
n_periods <- length(base::unique(data[[timevar]]))
n_unique_rows <- nrow(data[!base::duplicated(data_ids),])
Expand All @@ -195,11 +216,15 @@ EventStudy <- function(estimator, data, outcomevar, policyvar, idvar, timevar, c
} else {
unbalanced <- FALSE
}

data.table::setorderv(data, c(idvar, timevar))
if (n_unique_rows != nrow(data)) {
if (allow_duplicate_id == TRUE) {
warning("idvar-timevar pairs do not uniquely identify all rows in the data.")
} else if (allow_duplicate_id == FALSE) {
stop("idvar-timevar pairs do not uniquely identify all rows in the data. Turn on allow_duplicate_id if you want to proceed with weighted duplicated rows.")
}
}

detect_holes <- function(dt, idvar, timevar) {
dt <- data.table::as.data.table(dt)
holes_per_id <- dt[, .SD[!is.na(base::get(timevar))], by = c(idvar)
][, list(holes = any(base::diff(base::get(timevar)) != 1)),
by = c(idvar)]
Expand Down Expand Up @@ -264,7 +289,7 @@ EventStudy <- function(estimator, data, outcomevar, policyvar, idvar, timevar, c
timevar_holes = timevar_holes)

lead_endpoint_var <- paste0(policyvar, "_lead", num_fd_leads)
data[lead_endpoint_var] <- 1 - data[lead_endpoint_var]
data[, (lead_endpoint_var) := 1 - get(lead_endpoint_var)]
}

if (pre != 0 & normalize == -1 & anticipation_effects_normalization) {
Expand Down
8 changes: 7 additions & 1 deletion man/EventStudy.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion tests/testthat/test-AddCIs.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
test_that("correctly calculates conf_level at 0.95", {
df_test <- as.data.frame(data.table::fread("./input/df_test_AddCI.csv"))
df_test <- data.table::fread("./input/df_test_AddCI.csv")

policyvar <- "z"
eventstudy_coefficients <- c("z_fd_lag1", "z_fd_lead1")
Expand Down
6 changes: 2 additions & 4 deletions tests/testthat/test-AddMeans.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,7 @@ test_that("means are computed correctly when a first differenced variable is nor
sample_estimation[[2]]$policyvar,
sample_estimation[[2]]$outcomevar)

mean_manual <- mean(df_estimation[df_estimation[,"z_fd_lead1"] != 0, ]$y_base,
na.rm = T)
mean_manual <- df_estimation[z_fd_lead1 != 0, mean(y_base, na.rm = TRUE)]

expect_equal(mean_function, mean_manual)

Expand All @@ -33,8 +32,7 @@ test_that("means are computed correctly when the furthest lead is normalized", {
sample_estimation[[2]]$policyvar,
sample_estimation[[2]]$outcomevar)

mean_manual <- mean(df_estimation[df_estimation[,"z_lead4"] == 0, ]$y_base,
na.rm = T)
mean_manual <- df_estimation[z_lead4 == 0, mean(y_base, na.rm = TRUE)]

expect_equal(mean_function, mean_manual)
})
Expand Down
18 changes: 10 additions & 8 deletions tests/testthat/test-ComputeFirstDifferences.R
Original file line number Diff line number Diff line change
@@ -1,25 +1,27 @@
library(data.table)

test_that("correctly recognizes wrong variable type for grouping variable", {
df_test <- read.csv("./input/df_test.csv")
dt_test <- data.table::fread("./input/df_test.csv")

expect_error(ComputeFirstDifferences(df_test, country, "periods", "values"))
expect_error(ComputeFirstDifferences(dt_test, country, "periods", "values"))
})

test_that("correctly recognizes wrong variable type for diffvar variable", {
df_test <- read.csv("./input/df_test.csv")
dt_test <- data.table::fread("./input/df_test.csv")

expect_error(ComputeFirstDifferences(df_test, "country", "periods", values))
expect_error(ComputeFirstDifferences(dt_test, "country", "periods", values))
})

test_that("a column with _fd suffix is added", {
df_test <- read.csv("./input/df_test.csv")
dt_test <- data.table::fread("./input/df_test.csv")

df_diff <- ComputeFirstDifferences(df_test, "country", "periods", "values")
df_diff <- ComputeFirstDifferences(dt_test, "country", "periods", "values")

expect_true("values_fd" %in% colnames(df_diff))
})

test_that("correctly computes differences with a balanced dataset", {
df <- data.frame(
df <- data.table(
id = rep(c("A", "B"), each = 4),
time = rep(1:4, times = 2),
policy = c(10, 12, 11, 13, 8, 9, 10, 11),
Expand All @@ -32,7 +34,7 @@ test_that("correctly computes differences with a balanced dataset", {
})

test_that("correctly computes differences with a dataset that has holes in time var", {
df <- data.frame(
df <- data.table(
id = c(rep("A", 4), rep("B", 2), rep("C", 3)),
time = c(1, 2, 4, 5, 2, 3, 2, 3, 4),
policy = c(10, 12, 13, 14, 8, 9, 10, 11, 12),
Expand Down
Loading
Loading