Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions benchmarks/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -691,6 +691,7 @@ run_tpch() {

debug_run env BENCH_NAME=tpch \
BENCH_SIZE="${SCALE_FACTOR}" \
DATA_DIR="${DATA_DIR}" \
PREFER_HASH_JOIN="${PREFER_HASH_JOIN}" \
TPCH_FILE_TYPE="${FORMAT}" \
SIMULATE_LATENCY="${SIMULATE_LATENCY}" \
Expand All @@ -709,6 +710,7 @@ run_tpch_mem() {

debug_run env BENCH_NAME=tpch \
BENCH_SIZE="${SCALE_FACTOR}" \
DATA_DIR="${DATA_DIR}" \
TPCH_FILE_TYPE="mem" \
PREFER_HASH_JOIN="${PREFER_HASH_JOIN}" \
SIMULATE_LATENCY="${SIMULATE_LATENCY}" \
Expand Down
3 changes: 2 additions & 1 deletion benchmarks/sql_benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,10 @@ The SQL benchmarking tool uses the following environment variables:
| BENCH_QUERY | A query number to run. |
| BENCH_PERSIST_RESULTS | true/false to persist benchmark results. Results are persisted in CSV format, so be mindful of their size. |
| BENCH_VALIDATE | true/false to validate benchmark results against persisted results or `result_query` entries. If both `BENCH_PERSIST_RESULTS` and `BENCH_VALIDATE` are true, persist mode runs and validation is skipped. |
| DATA_DIR | Root directory for benchmark data loaded by SQL benchmark files. When unset, uses `data` (relative to the benchmarks/ directory). |
| SIMULATE_LATENCY | Simulate object store latency to mimic remote storage (e.g. S3). Adds random latency in the range 20-200ms to each object store operation. |
| MEM_POOL_TYPE | The memory pool type to use, should be one of "fair" or "greedy". |
| MEMORY_LIMIT | Memory limit (e.g. '100M', '1.5G'). If not specified, run all pre-defined memory limits for given query if there's any, otherwise run with no memory limit. | |
| MEMORY_LIMIT | Memory limit (e.g. '100M', '1.5G'). If not specified, run all pre-defined memory limits for given query if there's any, otherwise run with no memory limit. |

Example – Run the H2O window benchmarks on the 'small' sized CSV data files:

Expand Down
16 changes: 8 additions & 8 deletions benchmarks/sql_benchmarks/tpch/init/load_csv.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@ CREATE EXTERNAL TABLE nation
n_regionkey INT,
n_comment VARCHAR(152),
PRIMARY KEY (n_nationkey)
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/nation/nation.1.csv';
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/nation/nation.1.csv';

CREATE EXTERNAL TABLE region
(
r_regionkey INT,
r_name CHAR(25),
r_comment VARCHAR(152),
PRIMARY KEY (r_regionkey)
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/region/region.1.csv';
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/region/region.1.csv';

CREATE EXTERNAL TABLE supplier
(
Expand All @@ -25,7 +25,7 @@ CREATE EXTERNAL TABLE supplier
s_acctbal DECIMAL(15, 2),
s_comment VARCHAR(101),
PRIMARY KEY (s_suppkey)
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/supplier/supplier.1.csv';
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/supplier/supplier.1.csv';

CREATE EXTERNAL TABLE customer
(
Expand All @@ -38,7 +38,7 @@ CREATE EXTERNAL TABLE customer
c_mktsegment CHAR(10),
c_comment VARCHAR(117),
PRIMARY KEY (c_custkey)
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/customer/customer.1.csv';
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/customer/customer.1.csv';

CREATE EXTERNAL TABLE part
(
Expand All @@ -52,7 +52,7 @@ CREATE EXTERNAL TABLE part
p_retailprice DECIMAL(15, 2),
p_comment VARCHAR(23),
PRIMARY KEY (p_partkey)
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/part/part.1.csv';
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/part/part.1.csv';

CREATE EXTERNAL TABLE partsupp
(
Expand All @@ -62,7 +62,7 @@ CREATE EXTERNAL TABLE partsupp
ps_supplycost DECIMAL(15, 2),
ps_comment VARCHAR(199),
PRIMARY KEY (ps_partkey)
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/partsupp/partsupp.1.csv';
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/partsupp/partsupp.1.csv';

CREATE EXTERNAL TABLE orders
(
Expand All @@ -76,7 +76,7 @@ CREATE EXTERNAL TABLE orders
o_shippriority INT,
o_comment VARCHAR(79),
PRIMARY KEY (o_orderkey)
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/orders/orders.1.csv';
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/orders/orders.1.csv';

CREATE EXTERNAL TABLE lineitem
(
Expand All @@ -96,4 +96,4 @@ CREATE EXTERNAL TABLE lineitem
l_shipinstruct CHAR(25),
l_shipmode CHAR(10),
l_comment VARCHAR(44)
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/lineitem/lineitem.1.csv';
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/lineitem/lineitem.1.csv';
16 changes: 8 additions & 8 deletions benchmarks/sql_benchmarks/tpch/init/load_mem.sql
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
CREATE EXTERNAL TABLE nation_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/nation/nation.1.parquet';
CREATE EXTERNAL TABLE nation_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/nation/nation.1.parquet';

CREATE EXTERNAL TABLE region_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/region/region.1.parquet';
CREATE EXTERNAL TABLE region_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/region/region.1.parquet';

CREATE EXTERNAL TABLE supplier_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/supplier/supplier.1.parquet';
CREATE EXTERNAL TABLE supplier_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/supplier/supplier.1.parquet';

CREATE EXTERNAL TABLE customer_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/customer/customer.1.parquet';
CREATE EXTERNAL TABLE customer_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/customer/customer.1.parquet';

CREATE EXTERNAL TABLE part_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/part/part.1.parquet';
CREATE EXTERNAL TABLE part_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/part/part.1.parquet';

CREATE EXTERNAL TABLE partsupp_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/partsupp/partsupp.1.parquet';
CREATE EXTERNAL TABLE partsupp_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/partsupp/partsupp.1.parquet';

CREATE EXTERNAL TABLE orders_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/orders/orders.1.parquet';
CREATE EXTERNAL TABLE orders_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/orders/orders.1.parquet';

CREATE EXTERNAL TABLE lineitem_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/lineitem/lineitem.1.parquet';
CREATE EXTERNAL TABLE lineitem_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/lineitem/lineitem.1.parquet';

CREATE TABLE nation as SELECT * FROM nation_raw;

Expand Down
16 changes: 8 additions & 8 deletions benchmarks/sql_benchmarks/tpch/init/load_parquet.sql
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
CREATE EXTERNAL TABLE nation STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/nation/nation.1.parquet';
CREATE EXTERNAL TABLE nation STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/nation/nation.1.parquet';

CREATE EXTERNAL TABLE region STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/region/region.1.parquet';
CREATE EXTERNAL TABLE region STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/region/region.1.parquet';

CREATE EXTERNAL TABLE supplier STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/supplier/supplier.1.parquet';
CREATE EXTERNAL TABLE supplier STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/supplier/supplier.1.parquet';

CREATE EXTERNAL TABLE customer STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/customer/customer.1.parquet';
CREATE EXTERNAL TABLE customer STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/customer/customer.1.parquet';

CREATE EXTERNAL TABLE part STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/part/part.1.parquet';
CREATE EXTERNAL TABLE part STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/part/part.1.parquet';

CREATE EXTERNAL TABLE partsupp STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/partsupp/partsupp.1.parquet';
CREATE EXTERNAL TABLE partsupp STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/partsupp/partsupp.1.parquet';

CREATE EXTERNAL TABLE orders STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/orders/orders.1.parquet';
CREATE EXTERNAL TABLE orders STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/orders/orders.1.parquet';

CREATE EXTERNAL TABLE lineitem STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/lineitem/lineitem.1.parquet';
CREATE EXTERNAL TABLE lineitem STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/lineitem/lineitem.1.parquet';
68 changes: 68 additions & 0 deletions benchmarks/src/sql_benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2230,6 +2230,74 @@ NULL|(empty)
);
}

/// Checks that `${DATA_DIR:-data}` placeholders in a SQL file referenced by a
/// `load` line are replaced with the `DATA_DIR` value given in the benchmark
/// file, and that the rewritten query can then load and read the CSV data.
#[tokio::test]
async fn parser_applies_data_dir_replacement_in_load_query_file() {
    // Build a CSV fixture under a data root that is not the `data` default.
    let temp_dir = tempdir().expect("failed to create benchmark test directory");
    let data_dir = temp_dir.path().join("non_default_data");
    let csv_dir = data_dir.join("tpch_sf1/csv/generated");
    let csv_file = csv_dir.join("generated.1.csv");
    fs::create_dir_all(&csv_dir).expect("failed to create generated data directory");
    fs::write(csv_file, "value\n42\n").expect("failed to write generated csv file");

    // SQL load file whose LOCATION uses both placeholders. BENCH_SIZE is not
    // set by the driver below, so the expected path assumes its `:-1` default.
    let load_sql = "CREATE EXTERNAL TABLE generated(value INT) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/generated/generated.1.csv' OPTIONS ('format.has_header' 'true');\n";
    let load_path = write_test_file(&temp_dir, "load_generated_csv.sql", load_sql);

    // Template benchmark that points at the SQL load file and queries the table.
    let template_body = format!(
        "load {}\n\nrun\nSELECT value FROM generated;\n",
        load_path.display()
    );
    let template_path =
        write_test_file(&temp_dir, "load_file_template.benchmark", &template_body);

    // Driver benchmark that references the template and supplies DATA_DIR.
    let driver_body = format!(
        "template {}\nDATA_DIR={}\n",
        template_path.display(),
        data_dir.display()
    );
    let benchmark_path =
        write_test_file(&temp_dir, "load_file_driver.benchmark", &driver_body);

    let driver_path = benchmark_path.to_string_lossy().into_owned();
    let ctx = SessionContext::new();
    let mut benchmark = SqlBenchmark::new(&ctx, &driver_path, "/tmp")
        .await
        .expect("benchmark should parse");

    // Exactly one load query is expected, with no placeholder left behind.
    let load_queries = benchmark
        .queries()
        .get(&QueryDirective::Load)
        .expect("load queries");
    assert_eq!(load_queries.len(), 1);
    let has_leftover_placeholder = load_queries.iter().any(|query| query.contains("${"));
    assert!(
        !has_leftover_placeholder,
        "all placeholders should be replaced: {load_queries:?}"
    );

    // The rewritten LOCATION must point into the non-default data root.
    let expected_location = format!(
        "LOCATION '{}/tpch_sf1/csv/generated/generated.1.csv'",
        data_dir.display()
    );
    let first_query = &load_queries[0];
    assert!(
        first_query.contains(&expected_location),
        "all load locations should use the non-default DATA_DIR: {load_queries:?}"
    );

    // End to end: the substituted path must be loadable and queryable.
    benchmark
        .initialize(&ctx)
        .await
        .expect("benchmark should load generated csv file");
    benchmark
        .run(&ctx, true)
        .await
        .expect("benchmark should read generated csv file");

    assert_eq!(formatted_last_results(&benchmark), vec![vec!["42"]]);
}

#[tokio::test]
async fn parser_rejects_inline_sql_when_query_file_is_provided() {
let temp_dir = tempdir().expect("failed to create benchmark test directory");
Expand Down
Loading