From f419d4c7abc0051cbb05f474d0546d70efd96265 Mon Sep 17 00:00:00 2001 From: Gaurav Sharma Date: Fri, 17 Apr 2026 15:32:36 +0530 Subject: [PATCH 1/8] Revamp perf benchmarks: insertmanyvalues scenario, baseline comparison, CI artifact pipeline - Add insertmanyvalues scenario (100K rows, 1000/batch, GH #500 repro) - Add --baseline flag for 3-way comparison (main vs PR vs pyodbc) - Add --json flag to save results for pipeline artifact consumption - Publish benchmark results as CI artifact on main merge - Download baseline artifact on PR runs for automatic regression detection - Fix timing: time.time() -> time.perf_counter() - Move connection setup outside timing window (measure execute+fetch only) --- benchmarks/perf-benchmarking.py | 663 +++++++++++++---------- eng/pipelines/pr-validation-pipeline.yml | 27 +- 2 files changed, 410 insertions(+), 280 deletions(-) diff --git a/benchmarks/perf-benchmarking.py b/benchmarks/perf-benchmarking.py index a00a3f6fe..ba407c56c 100644 --- a/benchmarks/perf-benchmarking.py +++ b/benchmarks/perf-benchmarking.py @@ -1,24 +1,26 @@ """ -Performance Benchmarking Script for mssql-python vs pyodbc +Performance Benchmarking: mssql-python vs pyodbc -This script runs comprehensive performance tests comparing mssql-python with pyodbc -across multiple query types and scenarios. Each test is run multiple times to calculate -average execution times, minimum, maximum, and standard deviation. +Runs scenarios (fetch queries + insertmanyvalues), compares mssql-python against pyodbc, +and optionally against a baseline JSON from the main branch. Usage: - python benchmarks/perf-benchmarking.py + python benchmarks/perf-benchmarking.py # 2-col: PR vs pyodbc + python benchmarks/perf-benchmarking.py --baseline baseline.json # 3-col: main vs PR vs pyodbc + python benchmarks/perf-benchmarking.py --json results.json # save results to JSON -Requirements: - - pyodbc - - mssql_python - - Valid SQL Server connection +Environment: + DB_CONNECTION_STRING — required, e.g. Server=localhost;Database=...;Uid=sa;Pwd=...;TrustServerCertificate=yes """ +import argparse +import json import os import sys import time import statistics -from typing import List, Tuple +from datetime import datetime, timezone +from typing import List, Optional # Add parent directory to path to import local mssql_python sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) @@ -28,339 +30,444 @@ # Configuration CONN_STR = os.getenv("DB_CONNECTION_STRING") +CONN_STR_PYODBC = None -if not CONN_STR: - print( - "Error: The environment variable DB_CONNECTION_STRING is not set. Please set it to a valid SQL Server connection string and try again." - ) - sys.exit(1) - -# Ensure pyodbc connection string has ODBC driver specified -if CONN_STR and "Driver=" not in CONN_STR: - CONN_STR_PYODBC = f"Driver={{ODBC Driver 18 for SQL Server}};{CONN_STR}" -else: - CONN_STR_PYODBC = CONN_STR - -NUM_ITERATIONS = 10 # Number of times to run each test for averaging - -# SQL Queries -COMPLEX_JOIN_AGGREGATION = """ - SELECT - p.ProductID, - p.Name AS ProductName, - pc.Name AS Category, - psc.Name AS Subcategory, - COUNT(sod.SalesOrderDetailID) AS TotalOrders, - SUM(sod.OrderQty) AS TotalQuantity, - SUM(sod.LineTotal) AS TotalRevenue, - AVG(sod.UnitPrice) AS AvgPrice - FROM Sales.SalesOrderDetail sod - INNER JOIN Production.Product p ON sod.ProductID = p.ProductID - INNER JOIN Production.ProductSubcategory psc ON p.ProductSubcategoryID = psc.ProductSubcategoryID - INNER JOIN Production.ProductCategory pc ON psc.ProductCategoryID = pc.ProductCategoryID - GROUP BY p.ProductID, p.Name, pc.Name, psc.Name - HAVING SUM(sod.LineTotal) > 10000 - ORDER BY TotalRevenue DESC; -""" +NUM_ITERATIONS = 10 +INSERTMANY_ROWS = 100_000 +INSERTMANY_BATCH_SIZE = 1000 -LARGE_DATASET = """ - SELECT - soh.SalesOrderID, - soh.OrderDate, - soh.DueDate, - soh.ShipDate, - soh.Status, - soh.SubTotal, - soh.TaxAmt, - soh.Freight, - soh.TotalDue, - c.CustomerID, - p.FirstName, - p.LastName, - a.AddressLine1, - a.City, - sp.Name AS StateProvince, - cr.Name AS Country - FROM Sales.SalesOrderHeader soh - INNER JOIN Sales.Customer c ON soh.CustomerID = c.CustomerID - INNER JOIN Person.Person p ON c.PersonID = p.BusinessEntityID - INNER JOIN Person.BusinessEntityAddress bea ON p.BusinessEntityID = bea.BusinessEntityID - INNER JOIN Person.Address a ON bea.AddressID = a.AddressID - INNER JOIN Person.StateProvince sp ON a.StateProvinceID = sp.StateProvinceID - INNER JOIN Person.CountryRegion cr ON sp.CountryRegionCode = cr.CountryRegionCode - WHERE soh.OrderDate >= '2013-01-01'; -""" - -VERY_LARGE_DATASET = """ -SELECT - sod.SalesOrderID, - sod.SalesOrderDetailID, - sod.ProductID, - sod.OrderQty, - sod.UnitPrice, - sod.LineTotal, - p.Name AS ProductName, - p.ProductNumber, - p.Color, - p.ListPrice, - n1.number AS RowMultiplier1 -FROM Sales.SalesOrderDetail sod -CROSS JOIN (SELECT TOP 10 ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) AS number - FROM Sales.SalesOrderDetail) n1 -INNER JOIN Production.Product p ON sod.ProductID = p.ProductID; -""" -SUBQUERY_WITH_CTE = """ - WITH SalesSummary AS ( - SELECT - soh.SalesPersonID, - YEAR(soh.OrderDate) AS OrderYear, - SUM(soh.TotalDue) AS YearlyTotal - FROM Sales.SalesOrderHeader soh - WHERE soh.SalesPersonID IS NOT NULL - GROUP BY soh.SalesPersonID, YEAR(soh.OrderDate) - ), - RankedSales AS ( - SELECT - SalesPersonID, - OrderYear, - YearlyTotal, - RANK() OVER (PARTITION BY OrderYear ORDER BY YearlyTotal DESC) AS SalesRank - FROM SalesSummary - ) - SELECT - rs.SalesPersonID, - p.FirstName, - p.LastName, - rs.OrderYear, - rs.YearlyTotal, - rs.SalesRank - FROM RankedSales rs - INNER JOIN Person.Person p ON rs.SalesPersonID = p.BusinessEntityID - WHERE rs.SalesRank <= 10 - ORDER BY rs.OrderYear DESC, rs.SalesRank; -""" +def _init_conn_strings(): + global CONN_STR, CONN_STR_PYODBC + if not CONN_STR: + print( + "Error: The environment variable DB_CONNECTION_STRING is not set. " + "Please set it to a valid SQL Server connection string and try again." + ) + sys.exit(1) + if "Driver=" not in CONN_STR: + CONN_STR_PYODBC = f"Driver={{ODBC Driver 18 for SQL Server}};{CONN_STR}" + else: + CONN_STR_PYODBC = CONN_STR class BenchmarkResult: - """Class to store and calculate benchmark statistics""" - def __init__(self, name: str): self.name = name self.times: List[float] = [] self.row_count: int = 0 def add_time(self, elapsed: float, rows: int = 0): - """Add a timing result""" self.times.append(elapsed) if rows > 0: self.row_count = rows @property - def avg_time(self) -> float: - """Calculate average time""" + def avg(self) -> float: return statistics.mean(self.times) if self.times else 0.0 @property - def min_time(self) -> float: - """Get minimum time""" + def min(self) -> float: return min(self.times) if self.times else 0.0 @property - def max_time(self) -> float: - """Get maximum time""" + def max(self) -> float: return max(self.times) if self.times else 0.0 @property - def std_dev(self) -> float: - """Calculate standard deviation""" + def stddev(self) -> float: return statistics.stdev(self.times) if len(self.times) > 1 else 0.0 - def __str__(self) -> str: - """Format results as string""" - return ( - f"{self.name}:\n" - f" Avg: {self.avg_time:.4f}s | Min: {self.min_time:.4f}s | " - f"Max: {self.max_time:.4f}s | StdDev: {self.std_dev:.4f}s | " - f"Rows: {self.row_count}" - ) + def to_dict(self) -> dict: + return { + "avg": round(self.avg, 6), + "min": round(self.min, 6), + "max": round(self.max, 6), + "stddev": round(self.stddev, 6), + "rows": self.row_count, + "iterations": len(self.times), + } -def run_benchmark_pyodbc(query: str, name: str, iterations: int) -> BenchmarkResult: - """Run a benchmark using pyodbc""" - result = BenchmarkResult(f"{name} (pyodbc)") +# --------------------------------------------------------------------------- +# Fetch scenario runners +# --------------------------------------------------------------------------- - for i in range(iterations): +def run_fetch_pyodbc(query: str, name: str, iterations: int) -> BenchmarkResult: + result = BenchmarkResult(name) + for _ in range(iterations): try: - start_time = time.time() conn = pyodbc.connect(CONN_STR_PYODBC) cursor = conn.cursor() + start = time.perf_counter() cursor.execute(query) rows = cursor.fetchall() - elapsed = time.time() - start_time - + elapsed = time.perf_counter() - start result.add_time(elapsed, len(rows)) - cursor.close() conn.close() except Exception as e: - print(f" Error in iteration {i+1}: {e}") - continue - + print(f" pyodbc error: {e}") return result -def run_benchmark_mssql_python(query: str, name: str, iterations: int) -> BenchmarkResult: - """Run a benchmark using mssql-python""" - result = BenchmarkResult(f"{name} (mssql-python)") - - for i in range(iterations): +def run_fetch_mssql(query: str, name: str, iterations: int) -> BenchmarkResult: + result = BenchmarkResult(name) + for _ in range(iterations): try: - start_time = time.time() conn = connect(CONN_STR) cursor = conn.cursor() + start = time.perf_counter() cursor.execute(query) rows = cursor.fetchall() - elapsed = time.time() - start_time - + elapsed = time.perf_counter() - start result.add_time(elapsed, len(rows)) - cursor.close() conn.close() except Exception as e: - print(f" Error in iteration {i+1}: {e}") - continue - + print(f" mssql-python error: {e}") return result -def calculate_speedup( - pyodbc_result: BenchmarkResult, mssql_python_result: BenchmarkResult -) -> float: - """Calculate speedup factor""" - if mssql_python_result.avg_time == 0: - return 0.0 - return pyodbc_result.avg_time / mssql_python_result.avg_time - - -def print_comparison(pyodbc_result: BenchmarkResult, mssql_python_result: BenchmarkResult): - """Print detailed comparison of results""" - speedup = calculate_speedup(pyodbc_result, mssql_python_result) - - print(f"\n{'='*80}") - print(f"BENCHMARK: {pyodbc_result.name.split(' (')[0]}") - print(f"{'='*80}") - print(f"\npyodbc:") - print(f" Avg: {pyodbc_result.avg_time:.4f}s") - print(f" Min: {pyodbc_result.min_time:.4f}s") - print(f" Max: {pyodbc_result.max_time:.4f}s") - print(f" StdDev: {pyodbc_result.std_dev:.4f}s") - print(f" Rows: {pyodbc_result.row_count}") - - print(f"\nmssql-python:") - print(f" Avg: {mssql_python_result.avg_time:.4f}s") - print(f" Min: {mssql_python_result.min_time:.4f}s") - print(f" Max: {mssql_python_result.max_time:.4f}s") - print(f" StdDev: {mssql_python_result.std_dev:.4f}s") - print(f" Rows: {mssql_python_result.row_count}") - - print(f"\nPerformance:") - if speedup > 1: - print(f" mssql-python is {speedup:.2f}x FASTER than pyodbc") - elif speedup < 1 and speedup > 0: - print(f" mssql-python is {1/speedup:.2f}x SLOWER than pyodbc") - else: - print(f" Unable to calculate speedup") +# --------------------------------------------------------------------------- +# Insertmanyvalues scenario +# --------------------------------------------------------------------------- - print(f" Time difference: {(pyodbc_result.avg_time - mssql_python_result.avg_time):.4f}s") +def _build_batch_sql(batch_size: int) -> str: + placeholders = ",".join(["(?,?)"] * batch_size) + return f"INSERT INTO #bench_insert VALUES {placeholders}" -def main(): - """Main benchmark runner""" - print("=" * 80) - print("PERFORMANCE BENCHMARKING: mssql-python vs pyodbc") - print("=" * 80) - print(f"\nConfiguration:") - print(f" Iterations per test: {NUM_ITERATIONS}") - print(f" Database: AdventureWorks2022") - print(f"\n") - - # Define benchmarks - benchmarks = [ - (COMPLEX_JOIN_AGGREGATION, "Complex Join Aggregation"), - (LARGE_DATASET, "Large Dataset Retrieval"), - (VERY_LARGE_DATASET, "Very Large Dataset (1.2M rows)"), - (SUBQUERY_WITH_CTE, "Subquery with CTE"), - ] - - # Store all results for summary - all_results: List[Tuple[BenchmarkResult, BenchmarkResult]] = [] - - # Run each benchmark - for query, name in benchmarks: - print(f"\nRunning: {name}") - print(f" Testing with pyodbc... ", end="", flush=True) - pyodbc_result = run_benchmark_pyodbc(query, name, NUM_ITERATIONS) - print(f"OK (avg: {pyodbc_result.avg_time:.4f}s)") - - print(f" Testing with mssql-python... ", end="", flush=True) - mssql_python_result = run_benchmark_mssql_python(query, name, NUM_ITERATIONS) - print(f"OK (avg: {mssql_python_result.avg_time:.4f}s)") - - all_results.append((pyodbc_result, mssql_python_result)) - - # Print detailed comparisons - print("\n\n" + "=" * 80) - print("DETAILED RESULTS") - print("=" * 80) - - for pyodbc_result, mssql_python_result in all_results: - print_comparison(pyodbc_result, mssql_python_result) - - # Print summary table - print("\n\n" + "=" * 80) - print("SUMMARY TABLE") - print("=" * 80) - print(f"\n{'Benchmark':<35} {'pyodbc (s)':<15} {'mssql-python (s)':<20} {'Speedup'}") - print("-" * 80) - - total_pyodbc = 0.0 - total_mssql_python = 0.0 - - for pyodbc_result, mssql_python_result in all_results: - name = pyodbc_result.name.split(" (")[0] - speedup = calculate_speedup(pyodbc_result, mssql_python_result) - - total_pyodbc += pyodbc_result.avg_time - total_mssql_python += mssql_python_result.avg_time +def _generate_rows(total: int) -> list: + return [(i, f"value_{i}") for i in range(total)] + + +def _run_insertmany(conn_factory, conn_str, name: str, iterations: int) -> BenchmarkResult: + result = BenchmarkResult(name) + batch_sql = _build_batch_sql(INSERTMANY_BATCH_SIZE) + all_rows = _generate_rows(INSERTMANY_ROWS) + + # Pre-build flat param lists per batch + batches = [] + for start in range(0, INSERTMANY_ROWS, INSERTMANY_BATCH_SIZE): + chunk = all_rows[start : start + INSERTMANY_BATCH_SIZE] + flat = [] + for row in chunk: + flat.extend(row) + batches.append(flat) + + for _ in range(iterations): + try: + conn = conn_factory(conn_str) + cursor = conn.cursor() + cursor.execute( + "IF OBJECT_ID('tempdb..#bench_insert') IS NOT NULL DROP TABLE #bench_insert; " + "CREATE TABLE #bench_insert (id INT, val VARCHAR(100))" + ) + + start = time.perf_counter() + for flat_params in batches: + cursor.execute(batch_sql, flat_params) + elapsed = time.perf_counter() - start + + result.add_time(elapsed, INSERTMANY_ROWS) + cursor.close() + conn.close() + except Exception as e: + print(f" {name} error: {e}") + return result - print( - f"{name:<35} {pyodbc_result.avg_time:<15.4f} {mssql_python_result.avg_time:<20.4f} {speedup:.2f}x" - ) - print("-" * 80) - print( - f"{'TOTAL':<35} {total_pyodbc:<15.4f} {total_mssql_python:<20.4f} " - f"{total_pyodbc/total_mssql_python if total_mssql_python > 0 else 0:.2f}x" +def run_insertmany_pyodbc(iterations: int) -> BenchmarkResult: + return _run_insertmany( + lambda cs: pyodbc.connect(cs), CONN_STR_PYODBC, + "Insertmanyvalues (100K rows)", iterations, ) - # Overall conclusion - overall_speedup = total_pyodbc / total_mssql_python if total_mssql_python > 0 else 0 - print(f"\n{'='*80}") - print("OVERALL CONCLUSION") - print("=" * 80) - if overall_speedup > 1: - print(f"\nmssql-python is {overall_speedup:.2f}x FASTER than pyodbc on average") - print( - f"Total time saved: {total_pyodbc - total_mssql_python:.4f}s ({((total_pyodbc - total_mssql_python)/total_pyodbc*100):.1f}%)" + +def run_insertmany_mssql(iterations: int) -> BenchmarkResult: + return _run_insertmany( + lambda cs: connect(cs), CONN_STR, + "Insertmanyvalues (100K rows)", iterations, + ) + + +# --------------------------------------------------------------------------- +# Output formatting +# --------------------------------------------------------------------------- + +def _ratio_str(a: float, b: float) -> str: + """Return 'Nx faster/slower' comparing a to b (lower is better).""" + if a == 0 or b == 0: + return "N/A" + if a <= b: + factor = b / a + return f"{factor:.1f}x faster" + else: + factor = a / b + return f"{factor:.1f}x slower" + + +def print_results( + results: List[tuple], + baseline: Optional[dict], +): + has_baseline = baseline is not None + + # Header + print("\n" + "=" * 100) + if has_baseline: + print("RESULTS: main (baseline) vs this PR vs pyodbc") + else: + print("RESULTS: this PR vs pyodbc") + print("=" * 100) + + if has_baseline: + hdr = ( + f"\n{'Scenario':<40} {'main':<10} {'this PR':<10} {'pyodbc':<10} " + f"{'vs main':<16} {'vs pyodbc':<16}" ) - elif overall_speedup < 1 and overall_speedup > 0: - print(f"\nmssql-python is {1/overall_speedup:.2f}x SLOWER than pyodbc on average") - print( - f"Total time difference: {total_mssql_python - total_pyodbc:.4f}s ({((total_mssql_python - total_pyodbc)/total_mssql_python*100):.1f}%)" + else: + hdr = f"\n{'Scenario':<40} {'this PR':<10} {'pyodbc':<10} {'vs pyodbc':<16}" + print(hdr) + print("-" * 100) + + regressions = [] + highlights = [] + + for name, mssql_result, pyodbc_result in results: + pr_avg = mssql_result.avg + py_avg = pyodbc_result.avg + + if has_baseline and name in baseline: + main_avg = baseline[name]["avg"] + vs_main = _ratio_str(pr_avg, main_avg) + vs_pyodbc = _ratio_str(pr_avg, py_avg) + print( + f"{name:<40} {main_avg:<10.4f} {pr_avg:<10.4f} {py_avg:<10.4f} " + f"{vs_main:<16} {vs_pyodbc:<16}" + ) + if main_avg > 0 and pr_avg > main_avg * 1.05: + regressions.append((name, main_avg, pr_avg)) + if main_avg > 0 and pr_avg < main_avg * 0.90: + highlights.append((name, main_avg, pr_avg)) + else: + vs_pyodbc = _ratio_str(pr_avg, py_avg) + if has_baseline: + print( + f"{name:<40} {'N/A':<10} {pr_avg:<10.4f} {py_avg:<10.4f} " + f"{'N/A':<16} {vs_pyodbc:<16}" + ) + else: + print(f"{name:<40} {pr_avg:<10.4f} {py_avg:<10.4f} {vs_pyodbc:<16}") + + print("-" * 100) + + if has_baseline: + print(f"\n{'='*100}") + if regressions: + print("REGRESSIONS (>5% slower than main)") + print("=" * 100) + for name, main_avg, pr_avg in regressions: + factor = pr_avg / main_avg + print(f" {name}: {main_avg:.4f}s -> {pr_avg:.4f}s ({factor:.1f}x slower)") + else: + print("REGRESSIONS (>5% slower than main): None detected") + + print(f"\n{'='*100}") + if highlights: + print("HIGHLIGHTS (>10% faster than main)") + print("=" * 100) + for name, main_avg, pr_avg in highlights: + factor = main_avg / pr_avg + print(f" {name}: {main_avg:.4f}s -> {pr_avg:.4f}s ({factor:.1f}x faster)") + else: + print("HIGHLIGHTS (>10% faster than main): None") + + print(f"\n{'='*100}\n") + + +# --------------------------------------------------------------------------- +# JSON I/O +# --------------------------------------------------------------------------- + +def save_json(results: List[tuple], path: str): + data = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "iterations": NUM_ITERATIONS, + "scenarios": {}, + } + for name, mssql_result, pyodbc_result in results: + data["scenarios"][name] = { + "mssql_python": mssql_result.to_dict(), + "pyodbc": pyodbc_result.to_dict(), + } + # For baseline consumption, also store flat avg per scenario at top level + data["baseline"] = {name: mssql_result.to_dict() for name, mssql_result, _ in results} + with open(path, "w") as f: + json.dump(data, f, indent=2) + print(f"Results saved to {path}") + + +def load_baseline(path: str) -> Optional[dict]: + if not path or not os.path.exists(path): + return None + try: + with open(path) as f: + data = json.load(f) + return data.get("baseline", {}) + except (json.JSONDecodeError, KeyError) as e: + print(f" Warning: could not parse baseline {path}: {e}") + return None + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +FETCH_SCENARIOS = [ + ( + "Complex Join Aggregation", + """ + SELECT p.ProductID, p.Name AS ProductName, pc.Name AS Category, + psc.Name AS Subcategory, COUNT(sod.SalesOrderDetailID) AS TotalOrders, + SUM(sod.OrderQty) AS TotalQuantity, SUM(sod.LineTotal) AS TotalRevenue, + AVG(sod.UnitPrice) AS AvgPrice + FROM Sales.SalesOrderDetail sod + INNER JOIN Production.Product p ON sod.ProductID = p.ProductID + INNER JOIN Production.ProductSubcategory psc ON p.ProductSubcategoryID = psc.ProductSubcategoryID + INNER JOIN Production.ProductCategory pc ON psc.ProductCategoryID = pc.ProductCategoryID + GROUP BY p.ProductID, p.Name, pc.Name, psc.Name + HAVING SUM(sod.LineTotal) > 10000 + ORDER BY TotalRevenue DESC + """, + ), + ( + "Large Dataset Retrieval", + """ + SELECT soh.SalesOrderID, soh.OrderDate, soh.DueDate, soh.ShipDate, soh.Status, + soh.SubTotal, soh.TaxAmt, soh.Freight, soh.TotalDue, c.CustomerID, + p.FirstName, p.LastName, a.AddressLine1, a.City, + sp.Name AS StateProvince, cr.Name AS Country + FROM Sales.SalesOrderHeader soh + INNER JOIN Sales.Customer c ON soh.CustomerID = c.CustomerID + INNER JOIN Person.Person p ON c.PersonID = p.BusinessEntityID + INNER JOIN Person.BusinessEntityAddress bea ON p.BusinessEntityID = bea.BusinessEntityID + INNER JOIN Person.Address a ON bea.AddressID = a.AddressID + INNER JOIN Person.StateProvince sp ON a.StateProvinceID = sp.StateProvinceID + INNER JOIN Person.CountryRegion cr ON sp.CountryRegionCode = cr.CountryRegionCode + WHERE soh.OrderDate >= '2013-01-01' + """, + ), + ( + "Very Large Dataset (1.2M rows)", + """ + SELECT sod.SalesOrderID, sod.SalesOrderDetailID, sod.ProductID, + sod.OrderQty, sod.UnitPrice, sod.LineTotal, + p.Name AS ProductName, p.ProductNumber, p.Color, p.ListPrice, + n1.number AS RowMultiplier1 + FROM Sales.SalesOrderDetail sod + CROSS JOIN (SELECT TOP 10 ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) AS number + FROM Sales.SalesOrderDetail) n1 + INNER JOIN Production.Product p ON sod.ProductID = p.ProductID + """, + ), + ( + "Subquery with CTE", + """ + WITH SalesSummary AS ( + SELECT soh.SalesPersonID, YEAR(soh.OrderDate) AS OrderYear, + SUM(soh.TotalDue) AS YearlyTotal + FROM Sales.SalesOrderHeader soh + WHERE soh.SalesPersonID IS NOT NULL + GROUP BY soh.SalesPersonID, YEAR(soh.OrderDate) + ), + RankedSales AS ( + SELECT SalesPersonID, OrderYear, YearlyTotal, + RANK() OVER (PARTITION BY OrderYear ORDER BY YearlyTotal DESC) AS SalesRank + FROM SalesSummary ) + SELECT rs.SalesPersonID, p.FirstName, p.LastName, + rs.OrderYear, rs.YearlyTotal, rs.SalesRank + FROM RankedSales rs + INNER JOIN Person.Person p ON rs.SalesPersonID = p.BusinessEntityID + WHERE rs.SalesRank <= 10 + ORDER BY rs.OrderYear DESC, rs.SalesRank + """, + ), +] + + +def main(): + parser = argparse.ArgumentParser(description="mssql-python performance benchmarks") + parser.add_argument("--json", metavar="PATH", help="Save results to JSON file") + parser.add_argument("--baseline", metavar="PATH", help="Load baseline JSON from main branch") + args = parser.parse_args() + + _init_conn_strings() + + baseline = load_baseline(args.baseline) + + print("=" * 100) + if baseline: + print("PERFORMANCE BENCHMARKING: mssql-python PR vs main vs pyodbc") + else: + if args.baseline: + print("PERFORMANCE BENCHMARKING: mssql-python vs pyodbc") + print(" (baseline file not found — showing 2-column comparison)") + else: + print("PERFORMANCE BENCHMARKING: mssql-python vs pyodbc") + print("=" * 100) + print(f" Iterations: {NUM_ITERATIONS}") + if baseline: + print(f" Baseline: {args.baseline}") + print() + + all_results: List[tuple] = [] + + # Fetch scenarios (require AdventureWorks) + for name, query in FETCH_SCENARIOS: + print(f"Running: {name}") + print(f" pyodbc... ", end="", flush=True) + py_result = run_fetch_pyodbc(query, name, NUM_ITERATIONS) + if py_result.times: + print(f"OK ({py_result.avg:.4f}s)") + else: + print("FAILED") + + print(f" mssql-python... ", end="", flush=True) + ms_result = run_fetch_mssql(query, name, NUM_ITERATIONS) + if ms_result.times: + print(f"OK ({ms_result.avg:.4f}s)") + else: + print("FAILED") + + all_results.append((name, ms_result, py_result)) + + # Insertmanyvalues scenario (uses temp table, no AdventureWorks needed) + insert_name = "Insertmanyvalues (100K rows)" + print(f"\nRunning: {insert_name}") + print(f" pyodbc... ", end="", flush=True) + py_insert = run_insertmany_pyodbc(NUM_ITERATIONS) + if py_insert.times: + print(f"OK ({py_insert.avg:.4f}s)") + else: + print("FAILED") + + print(f" mssql-python... ", end="", flush=True) + ms_insert = run_insertmany_mssql(NUM_ITERATIONS) + if ms_insert.times: + print(f"OK ({ms_insert.avg:.4f}s)") + else: + print("FAILED") + + all_results.append((insert_name, ms_insert, py_insert)) + + # Output + print_results(all_results, baseline) - print(f"\n{'='*80}\n") + if args.json: + save_json(all_results, args.json) if __name__ == "__main__": diff --git a/eng/pipelines/pr-validation-pipeline.yml b/eng/pipelines/pr-validation-pipeline.yml index a76370840..6b3e9655a 100644 --- a/eng/pipelines/pr-validation-pipeline.yml +++ b/eng/pipelines/pr-validation-pipeline.yml @@ -383,13 +383,36 @@ jobs: pip install pyodbc Write-Host "`nRunning performance benchmarks..." - python benchmarks/perf-benchmarking.py + python benchmarks/perf-benchmarking.py --baseline benchmark_baseline.json --json benchmark_results.json displayName: 'Run performance benchmarks on SQL Server 2022/2025' condition: or(eq(variables['sqlVersion'], 'SQL2022'), eq(variables['sqlVersion'], 'SQL2025')) continueOnError: true env: DB_CONNECTION_STRING: 'Server=localhost;Database=AdventureWorks2022;Uid=sa;Pwd=$(DB_PASSWORD);TrustServerCertificate=yes' + # Publish benchmark results as artifact (consumed as baseline by future PR runs) + - task: PublishPipelineArtifact@1 + inputs: + targetPath: benchmark_results.json + artifact: 'perf-baseline-$(sqlVersion)' + displayName: 'Publish benchmark baseline' + condition: and(succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main'), or(eq(variables['sqlVersion'], 'SQL2022'), eq(variables['sqlVersion'], 'SQL2025'))) + continueOnError: true + + # Download baseline from latest main run (for PR comparison) + - task: DownloadPipelineArtifact@2 + inputs: + source: specific + project: $(System.TeamProjectId) + pipeline: $(System.DefinitionId) + runVersion: latestFromBranch + runBranch: refs/heads/main + artifact: 'perf-baseline-$(sqlVersion)' + path: $(Build.SourcesDirectory) + displayName: 'Download baseline from main' + condition: and(ne(variables['Build.SourceBranch'], 'refs/heads/main'), or(eq(variables['sqlVersion'], 'SQL2022'), eq(variables['sqlVersion'], 'SQL2025'))) + continueOnError: true + - task: CopyFiles@2 inputs: SourceFolder: 'mssql_python' @@ -808,7 +831,7 @@ jobs: echo 'Installing pyodbc for benchmarking...' pip install pyodbc echo 'Running performance benchmarks on $(distroName)' - python benchmarks/perf-benchmarking.py || echo 'Performance benchmark failed or database not available' + python benchmarks/perf-benchmarking.py --baseline benchmark_baseline.json --json benchmark_results.json || echo 'Performance benchmark failed or database not available' " else echo "Skipping performance benchmarks on $(distroName) (only runs on Ubuntu with local SQL Server)" From 422767b692efbb73fd8a8cf4cda19c1ed36a54e5 Mon Sep 17 00:00:00 2001 From: Gaurav Sharma Date: Mon, 20 Apr 2026 11:49:47 +0530 Subject: [PATCH 2/8] Add benchmark runs to macOS CI pipeline - Download and restore AdventureWorks2022 via docker cp + sqlcmd - Install pyodbc and run perf-benchmarking.py with --baseline/--json - Publish/download baseline artifacts (perf-baseline-macOS-SQL2022/SQL2025) - Same pattern as Windows, adapted for Colima Docker --- eng/pipelines/pr-validation-pipeline.yml | 62 ++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/eng/pipelines/pr-validation-pipeline.yml b/eng/pipelines/pr-validation-pipeline.yml index 6b3e9655a..9b2c3e4ec 100644 --- a/eng/pipelines/pr-validation-pipeline.yml +++ b/eng/pipelines/pr-validation-pipeline.yml @@ -542,6 +542,68 @@ jobs: testResultsFiles: '**/test-results.xml' testRunTitle: 'Publish pytest results on macOS $(sqlVersion)' + # Download and restore AdventureWorks2022 database for benchmarking + - script: | + echo "Downloading AdventureWorks2022.bak..." + curl -sSL -o /tmp/AdventureWorks2022.bak \ + https://github.com/Microsoft/sql-server-samples/releases/download/adventureworks/AdventureWorks2022.bak + + echo "Copying backup into SQL Server container..." + docker cp /tmp/AdventureWorks2022.bak sqlserver:/tmp/AdventureWorks2022.bak + + echo "Restoring AdventureWorks2022 database..." + docker exec sqlserver /opt/mssql-tools18/bin/sqlcmd \ + -S localhost -U SA -P "$DB_PASSWORD" -C \ + -Q "RESTORE DATABASE AdventureWorks2022 FROM DISK = '/tmp/AdventureWorks2022.bak' WITH MOVE 'AdventureWorks2022' TO '/var/opt/mssql/data/AdventureWorks2022.mdf', MOVE 'AdventureWorks2022_log' TO '/var/opt/mssql/data/AdventureWorks2022_log.ldf', REPLACE" + + if [ $? -eq 0 ]; then + echo "AdventureWorks2022 database restored successfully" + else + echo "Failed to restore AdventureWorks2022 database" + fi + + rm -f /tmp/AdventureWorks2022.bak + docker exec sqlserver rm -f /tmp/AdventureWorks2022.bak || true + displayName: 'Download and restore AdventureWorks2022 database on macOS' + condition: or(eq(variables['sqlVersion'], 'SQL2022'), eq(variables['sqlVersion'], 'SQL2025')) + continueOnError: true + env: + DB_PASSWORD: $(DB_PASSWORD) + + # Run performance benchmarks on macOS + - script: | + pip install pyodbc + echo "Running performance benchmarks..." + python benchmarks/perf-benchmarking.py --baseline benchmark_baseline.json --json benchmark_results.json + displayName: 'Run performance benchmarks on macOS $(sqlVersion)' + condition: or(eq(variables['sqlVersion'], 'SQL2022'), eq(variables['sqlVersion'], 'SQL2025')) + continueOnError: true + env: + DB_CONNECTION_STRING: 'Server=localhost;Database=AdventureWorks2022;Uid=SA;Pwd=$(DB_PASSWORD);TrustServerCertificate=yes' + + # Publish benchmark results as artifact on main merges + - task: PublishPipelineArtifact@1 + inputs: + targetPath: benchmark_results.json + artifact: 'perf-baseline-macOS-$(sqlVersion)' + displayName: 'Publish macOS benchmark baseline' + condition: and(succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main'), or(eq(variables['sqlVersion'], 'SQL2022'), eq(variables['sqlVersion'], 'SQL2025'))) + continueOnError: true + + # Download baseline from latest main run (for PR comparison) + - task: DownloadPipelineArtifact@2 + inputs: + source: specific + project: $(System.TeamProjectId) + pipeline: $(System.DefinitionId) + runVersion: latestFromBranch + runBranch: refs/heads/main + artifact: 'perf-baseline-macOS-$(sqlVersion)' + path: $(Build.SourcesDirectory) + displayName: 'Download macOS baseline from main' + condition: and(ne(variables['Build.SourceBranch'], 'refs/heads/main'), or(eq(variables['sqlVersion'], 'SQL2022'), eq(variables['sqlVersion'], 'SQL2025'))) + continueOnError: true + - job: PytestOnLinux displayName: 'Linux x86_64' pool: From c5098e17e67a3ee245d929dcd656e3960c6d4f53 Mon Sep 17 00:00:00 2001 From: Gaurav Sharma Date: Mon, 20 Apr 2026 11:52:02 +0530 Subject: [PATCH 3/8] Add PERF: as allowed PR title prefix --- .github/PULL_REQUEST_TEMPLATE.MD | 3 +++ .github/workflows/pr-format-check.yml | 2 +- CONTRIBUTING.md | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.MD b/.github/PULL_REQUEST_TEMPLATE.MD index 2625d3416..61fa0e65b 100644 --- a/.github/PULL_REQUEST_TEMPLATE.MD +++ b/.github/PULL_REQUEST_TEMPLATE.MD @@ -38,6 +38,9 @@ STYLE: (short-description) > For Refactor, without any feature changes REFACTOR: (short-description) +> For performance improvements +PERF: (short-description) + > For release related changes, without any feature changes RELEASE: # (short-description) diff --git a/.github/workflows/pr-format-check.yml b/.github/workflows/pr-format-check.yml index 55c3129d6..0d1772c3c 100644 --- a/.github/workflows/pr-format-check.yml +++ b/.github/workflows/pr-format-check.yml @@ -22,7 +22,7 @@ jobs: // Validate title prefix for all contributors const validTitlePrefixes = [ 'FEAT:', 'CHORE:', 'FIX:', 'DOC:', 'STYLE:', 'REFACTOR:', - 'RELEASE:' + 'PERF:', 'RELEASE:' ]; const hasValidPrefix = validTitlePrefixes.some(prefix => diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4288fcb5a..c23bcb56f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -35,7 +35,7 @@ If you are a Microsoft organization member (internal contributor): All pull requests must include: -- **Valid Title Prefix**: Your PR title must start with one of: `FEAT:`, `CHORE:`, `FIX:`, `DOC:`, `STYLE:`, `REFACTOR:`, or `RELEASE:` +- **Valid Title Prefix**: Your PR title must start with one of: `FEAT:`, `CHORE:`, `FIX:`, `DOC:`, `STYLE:`, `REFACTOR:`, `PERF:`, or `RELEASE:` - **Meaningful Summary**: Include a clear description of your changes under the "### Summary" section in the PR description (minimum 10 characters) - **Issue/Work Item Link** (only one required): - External contributors: Link to a GitHub issue From a77fd5d2d508328a461fd0ec506be75112d97882 Mon Sep 17 00:00:00 2001 From: Gaurav Sharma Date: Mon, 20 Apr 2026 13:29:57 +0530 Subject: [PATCH 4/8] Fix macOS benchmark: install ODBC driver for pyodbc, use tcp:127.0.0.1 for Colima --- eng/pipelines/pr-validation-pipeline.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/eng/pipelines/pr-validation-pipeline.yml b/eng/pipelines/pr-validation-pipeline.yml index 9b2c3e4ec..d785616ed 100644 --- a/eng/pipelines/pr-validation-pipeline.yml +++ b/eng/pipelines/pr-validation-pipeline.yml @@ -572,6 +572,8 @@ jobs: # Run performance benchmarks on macOS - script: | + echo "Installing ODBC Driver 18 for pyodbc..." + HOMEBREW_ACCEPT_EULA=Y brew install msodbcsql18 || echo "ODBC Driver 18 install failed — pyodbc benchmarks will be skipped" pip install pyodbc echo "Running performance benchmarks..." python benchmarks/perf-benchmarking.py --baseline benchmark_baseline.json --json benchmark_results.json @@ -579,7 +581,7 @@ jobs: condition: or(eq(variables['sqlVersion'], 'SQL2022'), eq(variables['sqlVersion'], 'SQL2025')) continueOnError: true env: - DB_CONNECTION_STRING: 'Server=localhost;Database=AdventureWorks2022;Uid=SA;Pwd=$(DB_PASSWORD);TrustServerCertificate=yes' + DB_CONNECTION_STRING: 'Server=tcp:127.0.0.1,1433;Database=AdventureWorks2022;Uid=SA;Pwd=$(DB_PASSWORD);TrustServerCertificate=yes' # Publish benchmark results as artifact on main merges - task: PublishPipelineArtifact@1 From a3eb8e331fa45427d911626640c2274ccdf5d6d1 Mon Sep 17 00:00:00 2001 From: Gaurav Sharma Date: Mon, 20 Apr 2026 15:10:38 +0530 Subject: [PATCH 5/8] Fix benchmark baseline: download before run, rename to expected path, handle empty baseline --- benchmarks/perf-benchmarking.py | 3 +- eng/pipelines/pr-validation-pipeline.yml | 79 +++++++++++++++--------- 2 files changed, 53 insertions(+), 29 deletions(-) diff --git a/benchmarks/perf-benchmarking.py b/benchmarks/perf-benchmarking.py index ba407c56c..66a5a2e58 100644 --- a/benchmarks/perf-benchmarking.py +++ b/benchmarks/perf-benchmarking.py @@ -315,7 +315,8 @@ def load_baseline(path: str) -> Optional[dict]: try: with open(path) as f: data = json.load(f) - return data.get("baseline", {}) + baseline = data.get("baseline") + return baseline if baseline else None except (json.JSONDecodeError, KeyError) as e: print(f" Warning: could not parse baseline {path}: {e}") return None diff --git a/eng/pipelines/pr-validation-pipeline.yml b/eng/pipelines/pr-validation-pipeline.yml index d785616ed..40b18a3ac 100644 --- a/eng/pipelines/pr-validation-pipeline.yml +++ b/eng/pipelines/pr-validation-pipeline.yml @@ -296,6 +296,32 @@ jobs: env: DB_PASSWORD: $(DB_PASSWORD) + # Download baseline from latest main run (for PR comparison) + - task: DownloadPipelineArtifact@2 + inputs: + source: specific + project: $(System.TeamProjectId) + pipeline: $(System.DefinitionId) + runVersion: latestFromBranch + runBranch: refs/heads/main + artifact: 'perf-baseline-$(sqlVersion)' + path: $(Build.SourcesDirectory) + displayName: 'Download baseline from main' + condition: and(ne(variables['Build.SourceBranch'], 'refs/heads/main'), or(eq(variables['sqlVersion'], 'SQL2022'), eq(variables['sqlVersion'], 'SQL2025'))) + continueOnError: true + + # Rename downloaded baseline so the script finds it + - powershell: | + if (Test-Path "benchmark_results.json") { + Move-Item "benchmark_results.json" "benchmark_baseline.json" -Force + Write-Host "Baseline file ready: benchmark_baseline.json" + } else { + Write-Host "No baseline file downloaded (first run or artifact missing)" + } + displayName: 'Prepare baseline file' + condition: and(ne(variables['Build.SourceBranch'], 'refs/heads/main'), or(eq(variables['sqlVersion'], 'SQL2022'), eq(variables['sqlVersion'], 'SQL2025'))) + continueOnError: true + # Run performance benchmarks on SQL Server 2022 - powershell: | Write-Host "Checking and installing ODBC Driver 18 for SQL Server..." @@ -399,20 +425,6 @@ jobs: condition: and(succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main'), or(eq(variables['sqlVersion'], 'SQL2022'), eq(variables['sqlVersion'], 'SQL2025'))) continueOnError: true - # Download baseline from latest main run (for PR comparison) - - task: DownloadPipelineArtifact@2 - inputs: - source: specific - project: $(System.TeamProjectId) - pipeline: $(System.DefinitionId) - runVersion: latestFromBranch - runBranch: refs/heads/main - artifact: 'perf-baseline-$(sqlVersion)' - path: $(Build.SourcesDirectory) - displayName: 'Download baseline from main' - condition: and(ne(variables['Build.SourceBranch'], 'refs/heads/main'), or(eq(variables['sqlVersion'], 'SQL2022'), eq(variables['sqlVersion'], 'SQL2025'))) - continueOnError: true - - task: CopyFiles@2 inputs: SourceFolder: 'mssql_python' @@ -570,6 +582,31 @@ jobs: env: DB_PASSWORD: $(DB_PASSWORD) + # Download macOS baseline from latest main run (for PR comparison) + - task: DownloadPipelineArtifact@2 + inputs: + source: specific + project: $(System.TeamProjectId) + pipeline: $(System.DefinitionId) + runVersion: latestFromBranch + runBranch: refs/heads/main + artifact: 'perf-baseline-macOS-$(sqlVersion)' + path: $(Build.SourcesDirectory) + displayName: 'Download macOS baseline from main' + condition: and(ne(variables['Build.SourceBranch'], 'refs/heads/main'), or(eq(variables['sqlVersion'], 'SQL2022'), eq(variables['sqlVersion'], 'SQL2025'))) + continueOnError: true + + - script: | + if [ -f benchmark_results.json ]; then + mv benchmark_results.json benchmark_baseline.json + echo "Baseline file ready: benchmark_baseline.json" + else + echo "No baseline file downloaded (first run or artifact missing)" + fi + displayName: 'Prepare macOS baseline file' + condition: and(ne(variables['Build.SourceBranch'], 'refs/heads/main'), or(eq(variables['sqlVersion'], 'SQL2022'), eq(variables['sqlVersion'], 'SQL2025'))) + continueOnError: true + # Run performance benchmarks on macOS - script: | echo "Installing ODBC Driver 18 for pyodbc..." @@ -592,20 +629,6 @@ jobs: condition: and(succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main'), or(eq(variables['sqlVersion'], 'SQL2022'), eq(variables['sqlVersion'], 'SQL2025'))) continueOnError: true - # Download baseline from latest main run (for PR comparison) - - task: DownloadPipelineArtifact@2 - inputs: - source: specific - project: $(System.TeamProjectId) - pipeline: $(System.DefinitionId) - runVersion: latestFromBranch - runBranch: refs/heads/main - artifact: 'perf-baseline-macOS-$(sqlVersion)' - path: $(Build.SourcesDirectory) - displayName: 'Download macOS baseline from main' - condition: and(ne(variables['Build.SourceBranch'], 'refs/heads/main'), or(eq(variables['sqlVersion'], 'SQL2022'), eq(variables['sqlVersion'], 'SQL2025'))) - continueOnError: true - - job: PytestOnLinux displayName: 'Linux x86_64' pool: From 1670065b3af824c948255bb75857d0d9b752b982 Mon Sep 17 00:00:00 2001 From: Gaurav Sharma Date: Mon, 20 Apr 2026 15:29:12 +0530 Subject: [PATCH 6/8] Fix review comments: resource cleanup, baseline subfolder search, Linux conditional baseline --- benchmarks/perf-benchmarking.py | 27 ++++++++++++++++++------ eng/pipelines/pr-validation-pipeline.yml | 22 ++++++++++++------- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/benchmarks/perf-benchmarking.py b/benchmarks/perf-benchmarking.py index 66a5a2e58..1d72b45b8 100644 --- a/benchmarks/perf-benchmarking.py +++ b/benchmarks/perf-benchmarking.py @@ -96,6 +96,7 @@ def to_dict(self) -> dict: def run_fetch_pyodbc(query: str, name: str, iterations: int) -> BenchmarkResult: result = BenchmarkResult(name) for _ in range(iterations): + conn = None try: conn = pyodbc.connect(CONN_STR_PYODBC) cursor = conn.cursor() @@ -104,16 +105,21 @@ def run_fetch_pyodbc(query: str, name: str, iterations: int) -> BenchmarkResult: rows = cursor.fetchall() elapsed = time.perf_counter() - start result.add_time(elapsed, len(rows)) - cursor.close() - conn.close() except Exception as e: print(f" pyodbc error: {e}") + finally: + if conn: + try: + conn.close() + except Exception: + pass return result def run_fetch_mssql(query: str, name: str, iterations: int) -> BenchmarkResult: result = BenchmarkResult(name) for _ in range(iterations): + conn = None try: conn = connect(CONN_STR) cursor = conn.cursor() @@ -122,10 +128,14 @@ def run_fetch_mssql(query: str, name: str, iterations: int) -> BenchmarkResult: rows = cursor.fetchall() elapsed = time.perf_counter() - start result.add_time(elapsed, len(rows)) - cursor.close() - conn.close() except Exception as e: print(f" mssql-python error: {e}") + finally: + if conn: + try: + conn.close() + except Exception: + pass return result @@ -157,6 +167,7 @@ def _run_insertmany(conn_factory, conn_str, name: str, iterations: int) -> Bench batches.append(flat) for _ in range(iterations): + conn = None try: conn = conn_factory(conn_str) cursor = conn.cursor() @@ -171,10 +182,14 @@ def _run_insertmany(conn_factory, conn_str, name: str, iterations: int) -> Bench elapsed = time.perf_counter() - start result.add_time(elapsed, INSERTMANY_ROWS) - cursor.close() - conn.close() except Exception as e: print(f" {name} error: {e}") + finally: + if conn: + try: + conn.close() + except Exception: + pass return result diff --git a/eng/pipelines/pr-validation-pipeline.yml b/eng/pipelines/pr-validation-pipeline.yml index 40b18a3ac..8ffcff9fc 100644 --- a/eng/pipelines/pr-validation-pipeline.yml +++ b/eng/pipelines/pr-validation-pipeline.yml @@ -310,11 +310,12 @@ jobs: condition: and(ne(variables['Build.SourceBranch'], 'refs/heads/main'), or(eq(variables['sqlVersion'], 'SQL2022'), eq(variables['sqlVersion'], 'SQL2025'))) continueOnError: true - # Rename downloaded baseline so the script finds it + # Rename downloaded baseline so the script finds it (artifact may be in a subfolder) - powershell: | - if (Test-Path "benchmark_results.json") { - Move-Item "benchmark_results.json" "benchmark_baseline.json" -Force - Write-Host "Baseline file ready: benchmark_baseline.json" + $found = Get-ChildItem -Path "$(Build.SourcesDirectory)" -Filter "benchmark_results.json" -Recurse -File -ErrorAction SilentlyContinue | Select-Object -First 1 + if ($null -ne $found) { + Copy-Item $found.FullName "benchmark_baseline.json" -Force + Write-Host "Baseline file ready: benchmark_baseline.json (from $($found.FullName))" } else { Write-Host "No baseline file downloaded (first run or artifact missing)" } @@ -597,9 +598,10 @@ jobs: continueOnError: true - script: | - if [ -f benchmark_results.json ]; then - mv benchmark_results.json benchmark_baseline.json - echo "Baseline file ready: benchmark_baseline.json" + found=$(find "$(Build.SourcesDirectory)" -name benchmark_results.json -type f 2>/dev/null | head -1) + if [ -n "$found" ]; then + cp "$found" benchmark_baseline.json + echo "Baseline file ready: benchmark_baseline.json (from $found)" else echo "No baseline file downloaded (first run or artifact missing)" fi @@ -918,7 +920,11 @@ jobs: echo 'Installing pyodbc for benchmarking...' pip install pyodbc echo 'Running performance benchmarks on $(distroName)' - python benchmarks/perf-benchmarking.py --baseline benchmark_baseline.json --json benchmark_results.json || echo 'Performance benchmark failed or database not available' + if [ -f benchmark_baseline.json ]; then + python benchmarks/perf-benchmarking.py --baseline benchmark_baseline.json --json benchmark_results.json || echo 'Performance benchmark failed or database not available' + else + python benchmarks/perf-benchmarking.py --json benchmark_results.json || echo 'Performance benchmark failed or database not available' + fi " else echo "Skipping performance benchmarks on $(distroName) (only runs on Ubuntu with local SQL Server)" From ebf8934e2d8b17ae122b623101d468f31c67a637 Mon Sep 17 00:00:00 2001 From: Gaurav Sharma Date: Mon, 20 Apr 2026 16:07:46 +0530 Subject: [PATCH 7/8] Fix macOS benchmark: add Microsoft brew tap before installing msodbcsql18 --- eng/pipelines/pr-validation-pipeline.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/eng/pipelines/pr-validation-pipeline.yml b/eng/pipelines/pr-validation-pipeline.yml index 8ffcff9fc..899b11824 100644 --- a/eng/pipelines/pr-validation-pipeline.yml +++ b/eng/pipelines/pr-validation-pipeline.yml @@ -612,6 +612,7 @@ jobs: # Run performance benchmarks on macOS - script: | echo "Installing ODBC Driver 18 for pyodbc..." + brew tap microsoft/mssql-release https://github.com/Microsoft/homebrew-mssql-release HOMEBREW_ACCEPT_EULA=Y brew install msodbcsql18 || echo "ODBC Driver 18 install failed — pyodbc benchmarks will be skipped" pip install pyodbc echo "Running performance benchmarks..." From 970ce4d42dc922bf9da1b20f5967c8dfa10e4577 Mon Sep 17 00:00:00 2001 From: Gaurav Sharma Date: Fri, 17 Apr 2026 16:12:58 +0530 Subject: [PATCH 8/8] Optimize execute() hot path for repeated parameterized inserts - Add _soft_reset_cursor: SQL_CLOSE + SQL_RESET_PARAMS instead of full HSTMT free/realloc on each execute() call - Add DDBCSQLResetStmt C++ wrapper exposing lightweight reset via pybind11 - Skip SQLPrepare when re-executing the same SQL (prepare caching) - Skip detect_and_convert_parameters on repeated same-SQL calls - Guard DDBCSQLGetAllDiagRecords behind SQL_SUCCESS_WITH_INFO check - Guard per-parameter debug logging behind logger.isEnabledFor(DEBUG) - Fix benchmark script to strip Driver= from mssql-python connection string --- benchmarks/perf-benchmarking.py | 4 ++ mssql_python/cursor.py | 83 +++++++++++++++++---------- mssql_python/pybind/ddbc_bindings.cpp | 20 +++++++ 3 files changed, 76 insertions(+), 31 deletions(-) diff --git a/benchmarks/perf-benchmarking.py b/benchmarks/perf-benchmarking.py index 1d72b45b8..f8ce3aa8b 100644 --- a/benchmarks/perf-benchmarking.py +++ b/benchmarks/perf-benchmarking.py @@ -49,6 +49,10 @@ def _init_conn_strings(): CONN_STR_PYODBC = f"Driver={{ODBC Driver 18 for SQL Server}};{CONN_STR}" else: CONN_STR_PYODBC = CONN_STR + # mssql-python manages its own driver — strip Driver= from its connection string + import re + CONN_STR = re.sub(r"Driver=[^;]*;?", "", CONN_STR, flags=re.IGNORECASE).strip(";") + class BenchmarkResult: diff --git a/mssql_python/cursor.py b/mssql_python/cursor.py index ba5065d56..1fac2401f 100644 --- a/mssql_python/cursor.py +++ b/mssql_python/cursor.py @@ -12,6 +12,7 @@ # pylint: disable=too-many-lines # Large file due to comprehensive DB-API 2.0 implementation import decimal +import logging import uuid import datetime import warnings @@ -140,6 +141,7 @@ def __init__(self, connection: "Connection", timeout: int = 0) -> None: False ] # Indicates if last_executed_stmt was prepared by ddbc shim. # Is a list instead of a bool coz bools in Python are immutable. + self._cached_param_types = None # Cached ParamInfo list for prepare-cache reuse # Initialize attributes that may be defined later to avoid pylint warnings # Note: _original_fetch* methods are not initialized here as they need to be @@ -747,6 +749,18 @@ def _reset_cursor(self) -> None: # Reinitialize the statement handle self._initialize_cursor() + self.is_stmt_prepared = [False] + self._cached_param_types = None + + def _soft_reset_cursor(self) -> None: + """Lightweight reset: close cursor and unbind params without freeing the HSTMT. + + Preserves the prepared statement plan on the server so repeated + executions of the same SQL skip SQLPrepare entirely. + """ + if self.hstmt: + ddbc_bindings.DDBCSQLResetStmt(self.hstmt) + self._clear_rownumber() def close(self) -> None: """ @@ -1349,8 +1363,10 @@ def execute( # pylint: disable=too-many-locals,too-many-branches,too-many-state self._check_closed() # Check if the cursor is closed if reset_cursor: - logger.debug("execute: Resetting cursor state") - self._reset_cursor() + if self.hstmt: + self._soft_reset_cursor() + else: + self._reset_cursor() # Clear any previous messages self.messages = [] @@ -1385,8 +1401,6 @@ def execute( # pylint: disable=too-many-locals,too-many-branches,too-many-state # Check if single parameter is a nested container that should be unwrapped # e.g., execute("SELECT ?", (value,)) vs execute("SELECT ?, ?", ((1, 2),)) if isinstance(parameters, tuple) and len(parameters) == 1: - # Could be either (value,) for single param or ((tuple),) for nested - # Check if it's a nested container if isinstance(parameters[0], (tuple, list, dict)): actual_params = parameters[0] else: @@ -1394,11 +1408,17 @@ def execute( # pylint: disable=too-many-locals,too-many-branches,too-many-state else: actual_params = parameters - # Convert parameters based on detected style - operation, converted_params = detect_and_convert_parameters(operation, actual_params) - - # Convert back to list format expected by the binding code - parameters = list(converted_params) + # Skip detect_and_convert_parameters when re-executing the same SQL — + # the parameter style (qmark vs pyformat) won't change between calls. + if operation == self.last_executed_stmt and isinstance(actual_params, (tuple, list)): + parameters = ( + list(actual_params) if not isinstance(actual_params, list) else actual_params + ) + else: + operation, converted_params = detect_and_convert_parameters( + operation, actual_params + ) + parameters = list(converted_params) else: parameters = [] @@ -1426,27 +1446,28 @@ def execute( # pylint: disable=too-many-locals,too-many-branches,too-many-state paraminfo = self._create_parameter_types_list(param, param_info, parameters, i) parameters_type.append(paraminfo) - # TODO: Use a more sophisticated string compare that handles redundant spaces etc. - # Also consider storing last query's hash instead of full query string. This will help - # in low-memory conditions - # (Ex: huge number of parallel queries with huge query string sizes) - if operation != self.last_executed_stmt: - # Executing a new statement. Reset is_stmt_prepared to false + # Prepare caching: skip SQLPrepare when re-executing the same SQL + # with parameters. The HSTMT is reused via _soft_reset_cursor, so the + # server-side plan from the previous SQLPrepare is still valid. + same_sql = parameters and operation == self.last_executed_stmt and self.is_stmt_prepared[0] + if not same_sql: self.is_stmt_prepared = [False] + effective_use_prepare = use_prepare and not same_sql - for i, param in enumerate(parameters): - logger.debug( - """Parameter number: %s, Parameter: %s, - Param Python Type: %s, ParamInfo: %s, %s, %s, %s, %s""", - i + 1, - param, - str(type(param)), - parameters_type[i].paramSQLType, - parameters_type[i].paramCType, - parameters_type[i].columnSize, - parameters_type[i].decimalDigits, - parameters_type[i].inputOutputType, - ) + if logger.isEnabledFor(logging.DEBUG): + for i, param in enumerate(parameters): + logger.debug( + """Parameter number: %s, Parameter: %s, + Param Python Type: %s, ParamInfo: %s, %s, %s, %s, %s""", + i + 1, + param, + str(type(param)), + parameters_type[i].paramSQLType, + parameters_type[i].paramCType, + parameters_type[i].columnSize, + parameters_type[i].decimalDigits, + parameters_type[i].inputOutputType, + ) ret = ddbc_bindings.DDBCSQLExecute( self.hstmt, @@ -1454,7 +1475,7 @@ def execute( # pylint: disable=too-many-locals,too-many-branches,too-many-state parameters, parameters_type, self.is_stmt_prepared, - use_prepare, + effective_use_prepare, encoding_settings, ) # Check return code @@ -1467,8 +1488,8 @@ def execute( # pylint: disable=too-many-locals,too-many-branches,too-many-state self._reset_cursor() raise - # Capture any diagnostic messages (SQL_SUCCESS_WITH_INFO, etc.) - if self.hstmt: + # Capture diagnostic messages only when the driver signalled info. + if ret == ddbc_sql_const.SQL_SUCCESS_WITH_INFO.value and self.hstmt: self.messages.extend(ddbc_bindings.DDBCSQLGetAllDiagRecords(self.hstmt)) self.last_executed_stmt = operation diff --git a/mssql_python/pybind/ddbc_bindings.cpp b/mssql_python/pybind/ddbc_bindings.cpp index c7537cbe8..d124e1662 100644 --- a/mssql_python/pybind/ddbc_bindings.cpp +++ b/mssql_python/pybind/ddbc_bindings.cpp @@ -1362,6 +1362,25 @@ void SqlHandle::free() { } } +SQLRETURN SQLResetStmt_wrap(SqlHandlePtr statementHandle) { + if (!SQLFreeStmt_ptr) { + DriverLoader::getInstance().loadDriver(); + } + SQLHANDLE hStmt = statementHandle->get(); + if (!hStmt) { + return SQL_INVALID_HANDLE; + } + + SQLRETURN rc = SQLFreeStmt_ptr(hStmt, SQL_CLOSE); + if (SQL_SUCCEEDED(rc)) { + rc = SQLFreeStmt_ptr(hStmt, SQL_RESET_PARAMS); + } + if (SQL_SUCCEEDED(rc) && SQLSetStmtAttr_ptr) { + SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_PARAMSET_SIZE, (SQLPOINTER)1, 0); + } + return rc; +} + SQLRETURN SQLGetTypeInfo_Wrapper(SqlHandlePtr StatementHandle, SQLSMALLINT DataType) { if (!SQLGetTypeInfo_ptr) { ThrowStdException("SQLGetTypeInfo function not loaded"); @@ -5783,6 +5802,7 @@ PYBIND11_MODULE(ddbc_bindings, m) { py::arg("wcharEncoding") = "utf-16le"); m.def("DDBCSQLFetchArrowBatch", &FetchArrowBatch_wrap, "Fetch an arrow batch of given length from the result set"); m.def("DDBCSQLFreeHandle", &SQLFreeHandle_wrap, "Free a handle"); + m.def("DDBCSQLResetStmt", &SQLResetStmt_wrap, "Close cursor and unbind params without freeing HSTMT"); m.def("DDBCSQLCheckError", &SQLCheckError_Wrap, "Check for driver errors"); m.def("DDBCSQLGetAllDiagRecords", &SQLGetAllDiagRecords, "Get all diagnostic records for a handle", py::arg("handle"));