Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
3ad441f
#51 bd setdt initially
zhizhongpu Oct 29, 2024
58ee554
#51 bd add warning for duplicated ids
zhizhongpu Oct 29, 2024
d981920
#51 bd ComputeFirstDifferences
zhizhongpu Oct 29, 2024
a054ed4
#51 doc time insensitive questions
zhizhongpu Oct 29, 2024
da84fb6
#51 bd temp file for dev and tests
zhizhongpu Oct 29, 2024
e034f5f
#51 bd adapt to dt grammar
zhizhongpu Oct 29, 2024
ed23634
#51 bd adapt Addmeans.R
zhizhongpu Oct 29, 2024
b2b0273
#51 bd adapt test-AddMeans.R
zhizhongpu Oct 29, 2024
c4e20e5
#51 bd MRE for data.table problem
zhizhongpu Oct 29, 2024
a241985
#51 doc benchmark solutions
zhizhongpu Oct 30, 2024
3b8945b
#51 bd default banning duplicated rows
zhizhongpu Mar 8, 2025
c92ab2c
#51 doc
zhizhongpu Mar 8, 2025
82232a8
#51 bd shallow copy
zhizhongpu Mar 8, 2025
a6407d7
#51 bd df->dt in ComputeFirstDifferences
zhizhongpu Mar 8, 2025
d354f30
#51 bd improve checking input col types in EventStudy()
zhizhongpu Mar 8, 2025
9b01b2b
#51 fx bugs
zhizhongpu Mar 8, 2025
7f8d398
#51 doc
zhizhongpu Mar 8, 2025
9039a9e
#51 bd change test-ComputeFirstDifferences.R input to dt
zhizhongpu Mar 8, 2025
92f71c8
#51 fx bug
zhizhongpu Mar 8, 2025
2fa1ee2
#51 bd modify benchmarking copying solutions
zhizhongpu Mar 8, 2025
c588170
#51 bd test for data invariance
zhizhongpu Mar 8, 2025
2511786
#51 doc
zhizhongpu Mar 8, 2025
15ef116
#52 fx attempt to fix GitHub action failures by not modifying example…
zhizhongpu Jun 5, 2025
5aa2b5a
#51 bd no need to convert input data into DF in test
zhizhongpu Jun 5, 2025
d79d069
#51 fx bug
zhizhongpu Jun 5, 2025
de4e7cd
#51 cl add namespace
zhizhongpu Jun 5, 2025
6feda61
#54 cl
zhizhongpu Jun 7, 2025
ba82321
#54 cl
zhizhongpu Jun 7, 2025
69d635a
#51 aes
zhizhongpu Jul 18, 2025
4db6aae
#51 cl
zhizhongpu Jul 18, 2025
da88346
#51 cl drop ComputeFirstDifferences(return_df)
zhizhongpu Jul 18, 2025
804bc6a
#51 cl simplify checking logical args
zhizhongpu Jul 18, 2025
5d3c26d
#51 cl
zhizhongpu Jul 18, 2025
5b27c2e
#54 drop return_df from ComputeFirstDifferences
zhizhongpu Aug 1, 2025
91ef3c4
#54 drop return_df from ComputeFirstDifferences
zhizhongpu Aug 1, 2025
4c2a7a4
#51 bd add namespace dt
zhizhongpu Aug 1, 2025
936cc0b
#54 fx drop vacuous check
zhizhongpu Aug 1, 2025
ced837d
#54 cl drop `return_df`
zhizhongpu Aug 1, 2025
45f12b2
#51 fx bug in test
zhizhongpu Aug 1, 2025
9e3934b
#54 cl trash
zhizhongpu Aug 1, 2025
6bffa52
#54 bd make ComputeShifts take in data.table
zhizhongpu Aug 1, 2025
2c0b82c
#54 bd setDT -> as.data.table()
zhizhongpu Aug 1, 2025
835b6cc
#54 bd avoid_internal_copy
zhizhongpu Aug 1, 2025
b1a5dbf
#54 fx missing data.table::copy
zhizhongpu Aug 2, 2025
51b0ac3
#54 fx copy
zhizhongpu Aug 2, 2025
63d4d1c
#54 fx namespace bug
zhizhongpu Aug 2, 2025
08852e6
#54 bd test for avoid_internal_copy
zhizhongpu Aug 2, 2025
faf7efe
#54 bd update roxygen documentation
zhizhongpu Aug 2, 2025
d6c1c4a
Merge branch 'main' into 51-switch-to-datatable-throughout
santiagohermo Aug 3, 2025
cce9e84
phrasing
santiagohermo Aug 3, 2025
38b4448
#29 fx "subtraction is peformed on the correct column" test
zhizhongpu Aug 4, 2025
02384ef
#54 fx non-vacuous test for [subtraction is peformed on the correct c…
zhizhongpu Aug 4, 2025
2616590
#54 bd drop vacuous test
zhizhongpu Aug 6, 2025
77806a4
#54 bd increase version
zhizhongpu Aug 6, 2025
b36d841
#54 cl drop issue folder
zhizhongpu Aug 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
!data/*.RData
!data/*.rda
*.csv


inst/doc
revdep/
*.Rproj
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: eventstudyr
Title: Estimation and Visualization of Linear Panel Event Studies
Version: 1.1.4
Version: 1.1.5
Authors@R:
c(person(given = "Simon",
family = "Freyaldenhoven",
Expand Down
3 changes: 2 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ importFrom(data.table,":=")
importFrom(data.table,.SD)
importFrom(data.table,CJ)
importFrom(data.table,as.data.table)
importFrom(data.table,setDT)
importFrom(data.table,copy)
importFrom(data.table,is.data.table)
importFrom(data.table,setnames)
importFrom(data.table,setorder)
importFrom(data.table,setorderv)
Expand Down
9 changes: 4 additions & 5 deletions R/AddMeans.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,11 @@ AddMeans <- function(df, normalization_column, policyvar, outcomevar,

if (is.null(custom_scalar)) {
if (grepl(paste0(policyvar, "_lead"), normalization_column)) {
df_change <- df[df[, normalization_column] == 0 & !is.na(df[, normalization_column]), ]
}else {
df_change <- df[df[, normalization_column] != 0 & !is.na(df[, normalization_column]), ]
df_change <- df[get(normalization_column) == 0 & !is.na(get(normalization_column))]
} else {
df_change <- df[get(normalization_column) != 0 & !is.na(get(normalization_column))]
}
mean <- mean(df_change[[outcomevar]], na.rm = T)

mean <- df_change[, mean(get(outcomevar), na.rm = TRUE)]
} else {
mean <- custom_scalar
}
Expand Down
45 changes: 18 additions & 27 deletions R/ComputeFirstDifferences.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
#' Adds first differences of a variable, robustly to gaps in time variable, as new columns in a panel dataset
#'
#' @param df Data frame that will be modified.
#' @param dt Data frame that will be modified.
#' @param idvar Character indicating column of units.
#' @param timevar Character indicating column of time periods.
#' @param diffvar Character indicating column of variable whose first difference will be taken.
#' @param timevar_holes Logical indicating whether the panel contains gaps in the time variable. Defaults to FALSE.
#' @param return_df Logical indicating whether the function should return a data frame (TRUE) or data.table (FALSE). Defaults to TRUE.
#'
#' @return The passed dataset augmented with columns that reflect the desired first differences
#'
Expand All @@ -19,59 +18,51 @@
#' diffvar = "z"
#' )
#'
#' @importFrom data.table setDT setorderv setnames shift := CJ .SD
#' @importFrom data.table setorderv setnames shift := CJ .SD
#'
#' @keywords internal
#' @noRd

ComputeFirstDifferences <- function(df, idvar, timevar, diffvar,
timevar_holes = FALSE, return_df = TRUE) {
if (! is.data.frame(df)) {
stop("df should be a data frame.")
ComputeFirstDifferences <- function(dt, idvar, timevar, diffvar,
timevar_holes = FALSE) {
if (! data.table::is.data.table(dt)) {
stop("Input data should be a data.table.")
}
for (var in c(idvar, timevar, diffvar)) {
if ((! is.character(var))) {
stop(paste0(var, " should be a character."))
}
if (! var %in% colnames(df)) {
if (! var %in% colnames(dt)) {
stop(paste0(var, " should be the name of a variable in the dataset."))
}
}
if (! is.logical(timevar_holes)) {
stop("timevar_holes should be logical.")
}
if (! is.logical(return_df)) {
stop("return_df should be logical.")
}

data.table::setDT(df)
data.table::setorderv(df, cols = c(idvar, timevar))
data.table::setorderv(dt, cols = c(idvar, timevar))

if (!timevar_holes) {
df[, paste0(diffvar, "_fd") := get(diffvar) - shift((get(diffvar))),
dt[, paste0(diffvar, "_fd") := get(diffvar) - data.table::shift((get(diffvar))),
by = idvar]
} else {
## Create dataset with all combinations to compute first differences
all_combinations <- CJ(unique(df[[idvar]]),
min(df[[timevar]]):max(df[[timevar]]))
setnames(all_combinations, new = c(idvar, timevar))
all_combinations <- data.table::CJ(unique(dt[[idvar]]),
min(dt[[timevar]]):max(dt[[timevar]]))
data.table::setnames(all_combinations, new = c(idvar, timevar))

df_all <- merge(df, all_combinations,
by = c(idvar, timevar), all = TRUE)
dt_all <- data.table::merge.data.table(dt, all_combinations,
by = c(idvar, timevar), all = TRUE)

df_all[, paste0(diffvar, "_fd") := get(diffvar) - shift((get(diffvar))),
dt_all[, paste0(diffvar, "_fd") := get(diffvar) - data.table::shift((get(diffvar))),
by = idvar]

## Bring first differences back to the original dataset
vars_to_keep <- c(idvar, timevar, paste0(diffvar, "_fd"))

df <- merge(df, df_all[, .SD, .SDcols = vars_to_keep],
by = c(idvar, timevar), all.x = TRUE)
}

if (return_df) {
df <- as.data.frame(df)
dt <- data.table::merge.data.table(dt, dt_all[, .SD, .SDcols = vars_to_keep],
by = c(idvar, timevar), all.x = TRUE)
}

return(df)
return(dt)
}
18 changes: 4 additions & 14 deletions R/ComputeShifts.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
#' @param shiftvar Character indicating column of variable that will be shifted.
#' @param shiftvalues Numeric vector specifying the leads/lags to be computed. For example, c(-1, 0, 1) will compute the lead, current, and lag values.
#' @param timevar_holes Logical indicating whether the panel contains gaps in the time variable. Defaults to FALSE.
#' @param return_df Logical indicating whether the function should return a data frame (TRUE) or data.table (FALSE). Defaults to TRUE.
#'
#' @return The passed dataset augmented with columns that reflect the desired shift values

Expand All @@ -22,14 +21,14 @@
#' )
#'
#'
#' @importFrom data.table setDT setorderv shift := CJ .SD
#' @importFrom data.table setorderv shift := CJ .SD
#' @keywords internal
#' @noRd

ComputeShifts <- function(df, idvar, timevar, shiftvar, shiftvalues,
timevar_holes = FALSE, return_df = TRUE) {
if (! is.data.frame(df)) {
stop("df should be a data frame.")
timevar_holes = FALSE) {
if (! data.table::is.data.table(df)) {
stop("df should be a data.table.")
}
for (var in c(idvar, timevar, shiftvar)) {
if ((! is.character(var))) {
Expand All @@ -48,11 +47,7 @@ ComputeShifts <- function(df, idvar, timevar, shiftvar, shiftvalues,
if (! is.logical(timevar_holes)) {
stop("timevar_holes should be logical.")
}
if (! is.logical(return_df)) {
stop("return_df should be logical.")
}

data.table::setDT(df)
data.table::setorderv(df, cols = c(idvar, timevar))

lags <- shiftvalues[shiftvalues > 0]
Expand Down Expand Up @@ -92,10 +87,5 @@ ComputeShifts <- function(df, idvar, timevar, shiftvar, shiftvalues,
df <- merge(df, df_all,
by = c(idvar, timevar, shiftvar), all.x = TRUE)
}

if (return_df) {
df <- as.data.frame(df)
}

return(df)
}
55 changes: 40 additions & 15 deletions R/EventStudy.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,14 @@
#' @param normalize Specifies the event-time coefficient to be normalized. Defaults to - pre - 1.
#' @param anticipation_effects_normalization If set to TRUE, runs the default process and switches coefficient to be normalized to 0
#' when there are anticipation effects. If set to FALSE, does not make the switch. Defaults to TRUE.
#' @param allow_duplicate_id If TRUE, the function estimates a regression where duplicated ID-time rows are weighted by their duplication count. If FALSE, the function raises an error if duplicate unit-time keys exist in the input data. Default is FALSE.
#' @param avoid_internal_copy If TRUE, the function avoids making an internal deep copy of the input data, and instead directly modifies the input data.table. Default is FALSE.
#'
#' @return A list that contains, under "output", the estimation output as an lm_robust object, and under "arguments", the arguments passed to the function.
#' @import dplyr
#' @import estimatr
#' @importFrom stats reformulate
#' @importFrom data.table setorderv as.data.table .SD
#' @importFrom data.table setorderv as.data.table is.data.table .SD copy
#' @export
#'
#' @examples
Expand Down Expand Up @@ -142,15 +144,20 @@

EventStudy <- function(estimator, data, outcomevar, policyvar, idvar, timevar, controls = NULL,
proxy = NULL, proxyIV = NULL, FE = TRUE, TFE = TRUE, post, overidpost = 1, pre, overidpre = post + pre,
normalize = -1 * (pre + 1), cluster = TRUE, anticipation_effects_normalization = TRUE) {
normalize = -1 * (pre + 1), cluster = TRUE, anticipation_effects_normalization = TRUE,
allow_duplicate_id = FALSE, avoid_internal_copy = FALSE) {

# Check for errors in arguments
if (! estimator %in% c("OLS", "FHS")) {stop("estimator should be either 'OLS' or 'FHS'.")}
if (! is.data.frame(data)) {stop("data should be a data frame.")}
if (! is.character(outcomevar)) {stop("outcomevar should be a character.")}
if (! is.character(policyvar)) {stop("policyvar should be a character.")}
if (! is.character(idvar)) {stop("idvar should be a character.")}
if (! is.character(timevar)) {stop("timevar should be a character.")}
for (var in c(idvar, timevar, outcomevar, policyvar)) {
if ((! is.character(var))) {
stop(paste0(var, " should be a character."))
}
if (! var %in% colnames(data)) {
stop(paste0(var, " should be the name of a variable in the dataset."))
}
}
if (! (is.null(controls) | is.character(controls))) {stop("controls should be either NULL or a character.")}

if ((estimator == "OLS" & ! is.null(proxy))) {stop("proxy should only be specified when estimator = 'FHS'.")}
Expand All @@ -162,11 +169,13 @@ EventStudy <- function(estimator, data, outcomevar, policyvar, idvar, timevar, c
stop("When estimator is 'FHS' and there are no leads in the model, proxyIV must be specified explicitly.")
}

if (! is.logical(FE)) {stop("FE should be either TRUE or FALSE.")}
if (! is.logical(TFE)) {stop("TFE should be either TRUE or FALSE.")}
if (! is.logical(cluster)) {stop("cluster should be either TRUE or FALSE.")}
for (var in c(FE, TFE, cluster, anticipation_effects_normalization, allow_duplicate_id, avoid_internal_copy)) {
if (! is.logical(var)) {
stop(paste0(var, " should be either TRUE or FALSE."))
}
}

if (FE & !cluster) {stop("cluster=TRUE is required when FE=TRUE.")}
if (! is.logical(anticipation_effects_normalization)) {stop("anticipation_effects_normalization should be either TRUE or FALSE.")}

if (! (is.numeric(post) & post >= 0 & post %% 1 == 0)) {stop("post should be a whole number.")}
if (! (is.numeric(overidpost) & overidpost >= 0 & overidpost %% 1 == 0)) {stop("overidpost should be a whole number.")}
Expand All @@ -177,15 +186,27 @@ EventStudy <- function(estimator, data, outcomevar, policyvar, idvar, timevar, c
& normalize >= -(pre + overidpre + 1) & normalize <= post + overidpost)) {
stop("normalize should be an integer between -(pre + overidpre + 1) and (post + overidpost).")
}
if (avoid_internal_copy & ! data.table::is.data.table(data)) {
warning("`avoid_internal_copy` has no effect because dataset passed to `data` is not a `data.table`.")
}

# Check for errors in data
if (! is.numeric(data[[timevar]])) {stop("timevar column in dataset should be numeric.")}
if (! all(data[[timevar]] %% 1 == 0)) {
stop("timevar column in dataset should be a vector of integers.")
}

data_ids <- as.data.frame(data)[, c(idvar, timevar)]
if (data.table::is.data.table(data)) {
if (!avoid_internal_copy) {
data <- data.table::copy(data)
}
} else {
data <- data.table::as.data.table(data)
}
data.table::setorderv(data, c(idvar, timevar))
data_ids <- data[, .SD, .SDcols = c(idvar, timevar)]

# Check panel balance and unique keys
n_units <- length(base::unique(data[[idvar]]))
n_periods <- length(base::unique(data[[timevar]]))
n_unique_rows <- nrow(data[!base::duplicated(data_ids),])
Expand All @@ -195,11 +216,15 @@ EventStudy <- function(estimator, data, outcomevar, policyvar, idvar, timevar, c
} else {
unbalanced <- FALSE
}

data.table::setorderv(data, c(idvar, timevar))
if (n_unique_rows != nrow(data)) {
if (allow_duplicate_id == TRUE) {
warning("idvar-timevar pairs do not uniquely identify all rows in the data.")
} else if (allow_duplicate_id == FALSE) {
stop("idvar-timevar pairs do not uniquely identify all rows in the data. Turn on allow_duplicate_id if you want to proceed with weighted duplicated rows.")
}
}

detect_holes <- function(dt, idvar, timevar) {
dt <- data.table::as.data.table(dt)
holes_per_id <- dt[, .SD[!is.na(base::get(timevar))], by = c(idvar)
][, list(holes = any(base::diff(base::get(timevar)) != 1)),
by = c(idvar)]
Expand Down Expand Up @@ -264,7 +289,7 @@ EventStudy <- function(estimator, data, outcomevar, policyvar, idvar, timevar, c
timevar_holes = timevar_holes)

lead_endpoint_var <- paste0(policyvar, "_lead", num_fd_leads)
data[lead_endpoint_var] <- 1 - data[lead_endpoint_var]
data[, (lead_endpoint_var) := 1 - get(lead_endpoint_var)]
}

if (pre != 0 & normalize == -1 & anticipation_effects_normalization) {
Expand Down
8 changes: 7 additions & 1 deletion man/EventStudy.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion tests/testthat/test-AddCIs.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
test_that("correctly calculates conf_level at 0.95", {
df_test <- as.data.frame(data.table::fread("./input/df_test_AddCI.csv"))
df_test <- data.table::fread("./input/df_test_AddCI.csv")

policyvar <- "z"
eventstudy_coefficients <- c("z_fd_lag1", "z_fd_lead1")
Expand Down
6 changes: 2 additions & 4 deletions tests/testthat/test-AddMeans.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,7 @@ test_that("means are computed correctly when a first differenced variable is nor
sample_estimation[[2]]$policyvar,
sample_estimation[[2]]$outcomevar)

mean_manual <- mean(df_estimation[df_estimation[,"z_fd_lead1"] != 0, ]$y_base,
na.rm = T)
mean_manual <- df_estimation[z_fd_lead1 != 0, mean(y_base, na.rm = TRUE)]

expect_equal(mean_function, mean_manual)

Expand All @@ -33,8 +32,7 @@ test_that("means are computed correctly when the furthest lead is normalized", {
sample_estimation[[2]]$policyvar,
sample_estimation[[2]]$outcomevar)

mean_manual <- mean(df_estimation[df_estimation[,"z_lead4"] == 0, ]$y_base,
na.rm = T)
mean_manual <- df_estimation[z_lead4 == 0, mean(y_base, na.rm = TRUE)]

expect_equal(mean_function, mean_manual)
})
Expand Down
18 changes: 10 additions & 8 deletions tests/testthat/test-ComputeFirstDifferences.R
Original file line number Diff line number Diff line change
@@ -1,25 +1,27 @@
library(data.table)

test_that("correctly recognizes wrong variable type for grouping variable", {
df_test <- read.csv("./input/df_test.csv")
dt_test <- data.table::fread("./input/df_test.csv")

expect_error(ComputeFirstDifferences(df_test, country, "periods", "values"))
expect_error(ComputeFirstDifferences(dt_test, country, "periods", "values"))
})

test_that("correctly recognizes wrong variable type for diffvar variable", {
df_test <- read.csv("./input/df_test.csv")
dt_test <- data.table::fread("./input/df_test.csv")

expect_error(ComputeFirstDifferences(df_test, "country", "periods", values))
expect_error(ComputeFirstDifferences(dt_test, "country", "periods", values))
})

test_that("a column with _fd suffix is added", {
df_test <- read.csv("./input/df_test.csv")
dt_test <- data.table::fread("./input/df_test.csv")

df_diff <- ComputeFirstDifferences(df_test, "country", "periods", "values")
df_diff <- ComputeFirstDifferences(dt_test, "country", "periods", "values")

expect_true("values_fd" %in% colnames(df_diff))
})

test_that("correctly computes differences with a balanced dataset", {
df <- data.frame(
df <- data.table(
id = rep(c("A", "B"), each = 4),
time = rep(1:4, times = 2),
policy = c(10, 12, 11, 13, 8, 9, 10, 11),
Expand All @@ -32,7 +34,7 @@ test_that("correctly computes differences with a balanced dataset", {
})

test_that("correctly computes differences with a dataset that has holes in time var", {
df <- data.frame(
df <- data.table(
id = c(rep("A", 4), rep("B", 2), rep("C", 3)),
time = c(1, 2, 4, 5, 2, 3, 2, 3, 4),
policy = c(10, 12, 13, 14, 8, 9, 10, 11, 12),
Expand Down
Loading
Loading