Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions clojure/src/pgloader/core.clj
Original file line number Diff line number Diff line change
Expand Up @@ -589,6 +589,28 @@
(map :name)
(type-names-in-schema
pg-conn {:schema schema :names names})))))]
;; After apply-identifier-case, check whether any table has two columns
;; whose names become identical after PostgreSQL's 63-char truncation.
;; Accumulate every instance and report them all before aborting.
(let [collisions (ddl/check-identifier-collisions cat)]
(when (seq collisions)
(doseq [c collisions]
(log/error (str (:schema c) "." (:table c)
": column name collision — "
(str/join ", " (map pr-str (:columns c)))
" all truncate to " (pr-str (:effective-name c)))))
(throw (ex-info
(str (count collisions)
" column name collision(s) found in source catalog.\n\n"
"PostgreSQL limits identifier names to "
ddl/pg-max-identifier-length
" bytes (NAMEDATALEN-1). The tables listed above contain "
"multiple columns whose names become identical after "
"truncation. This would cause CREATE TABLE to fail or COPY "
"to load data into the wrong column.\n\n"
"Please rename the affected columns in the source database "
"before migrating.")
{:collisions collisions}))))
;; Truncate tables if requested — each table its own transaction
(when (get with-options :truncate)
(log/debug "Truncating tables")
Expand Down
23 changes: 22 additions & 1 deletion clojure/src/pgloader/ddl/common.clj
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,9 @@
on-upd on-del ";\n")))
fkeys))

(def ^:private pg-max-identifier-length 63)
(def pg-max-identifier-length
"PostgreSQL's NAMEDATALEN-1: the maximum number of bytes in an identifier."
63)

(defn- snake-case-transform
"Convert an identifier to snake_case:
Expand Down Expand Up @@ -403,6 +405,25 @@
fks))))))
catalog))))

(defn check-identifier-collisions
"Return a seq of collision maps for every (table, truncated-name) pair in
CATALOG where two or more columns map to the same PostgreSQL identifier
after 63-character truncation. Each map has keys :schema :table
:effective-name :columns. Returns an empty seq when no collisions exist."
[catalog]
(for [t catalog
:let [names (mapv :column-name (:columns t))
eff-names (mapv #(subs % 0 (min pg-max-identifier-length (count %))) names)
freqs (frequencies eff-names)
coll-effs (into #{} (keep (fn [[k v]] (when (> v 1) k)) freqs))]
:when (seq coll-effs)
eff coll-effs]
{:schema (:schema t)
:table (:table-name t)
:effective-name eff
:columns (filterv #(= eff (subs % 0 (min pg-max-identifier-length (count %))))
names)}))

(defn apply-alter-schema
"Apply ALTER SCHEMA ... RENAME TO ... rules from the load command.
Each rule: {:source-name s :target-name t}
Expand Down
57 changes: 57 additions & 0 deletions clojure/test/pgloader/ddl_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,63 @@
{:snake-case-ids true})]
(is (= "xml_parser" (:table-name t))))))

;; ── Identifier collision detection (#353) ────────────────────────────────────

(def ^:private col-prefix-63
"col_very_long_name_that_exceeds_postgresql_identifier_limit_aaa")

(deftest test-check-identifier-collisions
(testing "no collisions: empty catalog"
(is (empty? (ddl/check-identifier-collisions []))))

(testing "no collisions: short column names"
(let [cat [(make-table "t" ["id" "name" "value"] [])]]
(is (empty? (ddl/check-identifier-collisions cat)))))

(testing "no collisions: names exactly 63 chars (no truncation)"
(let [cat [(make-table "t" [col-prefix-63 "other_col"] [])]]
(is (empty? (ddl/check-identifier-collisions cat)))))

(testing "no collisions: 64-char names that differ within first 63 chars"
(let [col-a (str "aaaa_" col-prefix-63) ;; first 63 chars differ from col-b
col-b (str "bbbb_" col-prefix-63)
cat [(make-table "t" [col-a col-b] [])]]
(is (empty? (ddl/check-identifier-collisions cat)))))

(testing "collision: two columns sharing first 63 chars"
(let [col-a (str col-prefix-63 "x")
col-b (str col-prefix-63 "y")
cat [(make-table "t" [col-a col-b "short_col"] [])]]
(let [cs (ddl/check-identifier-collisions cat)]
(is (= 1 (count cs)))
(is (= col-prefix-63 (:effective-name (first cs))))
(is (= #{col-a col-b} (set (:columns (first cs)))))
(is (= "t" (:table (first cs)))))))

(testing "collision: three columns sharing first 63 chars"
(let [col-a (str col-prefix-63 "x")
col-b (str col-prefix-63 "y")
col-c (str col-prefix-63 "z")
cat [(make-table "t" [col-a col-b col-c] [])]]
(let [cs (ddl/check-identifier-collisions cat)]
(is (= 1 (count cs)))
(is (= #{col-a col-b col-c} (set (:columns (first cs))))))))

(testing "collision: distinct collisions in two different tables — both reported"
;; All four names share the same 63-char prefix so both tables get a
;; collision entry — total must be 2, one per table.
(let [a1 (str col-prefix-63 "x") a2 (str col-prefix-63 "y")
b1 (str col-prefix-63 "p") b2 (str col-prefix-63 "q")
cat [(make-table "table_a" [a1 a2] [])
(make-table "table_b" [b1 b2] [])]]
(is (= 2 (count (ddl/check-identifier-collisions cat))))))

(testing "no cross-table collision: same long name in two different tables is fine"
(let [col-a (str col-prefix-63 "x")
cat [(make-table "t1" [col-a "short"] [])
(make-table "t2" [col-a "other"] [])]]
(is (empty? (ddl/check-identifier-collisions cat))))))

;; ── FK filter (#1216) ─────────────────────────────────────────────────────────

(deftest test-create-fkeys-sql-filters-missing-tables
Expand Down
25 changes: 25 additions & 0 deletions src/load/migrate-database.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,31 @@
;; cast the catalog into something PostgreSQL can work on
(cast catalog)

;; After casting, apply-identifier-case has been applied to every column
;; name. Check now whether any table has two columns whose names become
;; identical after PostgreSQL's 63-character identifier truncation. All
;; collisions are accumulated and reported together so users can fix
;; everything in one pass before re-running the migration.
(let ((collisions (check-catalog-identifier-collisions catalog)))
(when collisions
(dolist (c collisions)
(log-message :error
"~a.~a: column name collision — ~{~s~^, ~} all truncate to ~s"
(getf c :schema)
(getf c :table)
(getf c :columns)
(getf c :effective-name)))
(error "~d column name collision~:p found in source catalog.

PostgreSQL limits identifier names to ~d bytes (NAMEDATALEN-1). The tables
listed above contain multiple columns whose names become identical after
truncation. This would cause CREATE TABLE to fail or COPY to load data into
the wrong column.

Please rename the affected columns in the source database before migrating."
(length collisions)
+pg-max-identifier-length+)))

;; support code for index filters (where clauses)
(process-index-definitions catalog :sql-dialect (class-name (class-of copy)))

Expand Down
6 changes: 5 additions & 1 deletion src/package.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,11 @@
#:citus-format-sql-select
#:citus-backfill-table-p

#:format-table-name))
#:format-table-name

#:+pg-max-identifier-length+
#:pg-effective-name
#:check-catalog-identifier-collisions))

(defpackage #:pgloader.state
(:use #:cl #:pgloader.params #:pgloader.catalog)
Expand Down
39 changes: 39 additions & 0 deletions src/utils/catalog.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,45 @@
(defmethod field-name ((column column) &key)
(column-name column))

;;;
;;; PostgreSQL limits identifier names to NAMEDATALEN-1 = 63 bytes. Two
;;; source columns in the same table whose names map to the same truncated
;;; identifier cause CREATE TABLE to fail with a duplicate-column error, or
;;; COPY to silently load data into the wrong column. Detect these
;;; collisions after the catalog has been cast (apply-identifier-case
;;; already applied) so the names here are exactly what pgloader will send
;;; to PostgreSQL.
;;;
(defconstant +pg-max-identifier-length+ 63)

(defun pg-effective-name (identifier)
"Return the name PostgreSQL stores in pg_attribute for IDENTIFIER.
Strips outer double-quotes (from :quote identifier-case mode) and
truncates to +pg-max-identifier-length+ characters."
(let ((inner (ensure-unquoted identifier)))
(subseq inner 0 (min +pg-max-identifier-length+ (length inner)))))

(defun check-catalog-identifier-collisions (catalog)
"Return a plist for every (table, truncated-name) pair in CATALOG where
two or more columns map to the same PostgreSQL identifier after 63-char
truncation. Each entry has keys :schema :table :effective-name :columns.
Returns NIL when no collisions are found."
(loop :for schema :in (catalog-schema-list catalog)
:append
(loop :for table :in (schema-table-list schema)
:append
(let ((name-map (make-hash-table :test #'equal)))
(dolist (col (table-column-list table))
(push (column-name col)
(gethash (pg-effective-name (column-name col)) name-map)))
(loop :for eff :being :the :hash-keys :of name-map
:using (hash-value cols)
:when (> (length cols) 1)
:collect (list :schema (schema-name schema)
:table (table-name table)
:effective-name eff
:columns (nreverse cols)))))))

;;;
;;; There's no simple equivalent to array_agg() in MS SQL, so the index and
;;; fkey queries return a row per index|fkey column rather than per
Expand Down
10 changes: 10 additions & 0 deletions test/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ REGRESS= allcols.load \
overflow.load \
partial.load \
serial.load \
sqlite-collision.load \
sqlite-on-error-stop.load \
udc.load \
xzero.load
Expand Down Expand Up @@ -85,6 +86,15 @@ errors.out: errors.load
# sqlite LOAD DATABASE is incompatible with --regress (parse-target-pg-db-uri
# returns nil for LOAD DATABASE commands). Run without --regress; any hang
# would be caught by the CI job timeout.
sqlite/collision.db:
python3 sqlite/create-collision.py

# pgloader must detect the 63-char identifier collision and exit non-zero.
# LOAD DATABASE is incompatible with --regress; run without it.
regress/out/sqlite-collision.out: sqlite-collision.load sqlite/collision.db
-$(PGLOADER) $<
touch $@

sqlite/type-mismatch.db:
python3 sqlite/create-type-mismatch.py

Expand Down
31 changes: 31 additions & 0 deletions test/mysql-collision.load
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/*
* Manual test for GitHub issue #353 (column name identifier collision).
*
* Requires a running MySQL server with the mysql_collision database set up:
*
* CREATE DATABASE mysql_collision;
* USE mysql_collision;
* CREATE TABLE products (
* id INT PRIMARY KEY,
* name VARCHAR(100) NOT NULL,
* col_very_long_name_that_exceeds_postgresql_identifier_limit_aaax INT,
* col_very_long_name_that_exceeds_postgresql_identifier_limit_aaay INT
* );
*
* Expected outcome: pgloader exits non-zero, logs the collision, touches no
* PostgreSQL tables.
*
* Run manually (not in REGRESS — requires a MySQL server):
* pgloader test/mysql-collision.load
*/

load database
from mysql://root@localhost/mysql_collision
into postgresql:///pgloader

with include drop, create tables, create indexes, reset sequences

set search_path to 'mysql_collision_test'

before load do
$$ create schema if not exists mysql_collision_test; $$;
31 changes: 31 additions & 0 deletions test/sqlite-collision.load
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/*
* Regression test for GitHub issue #353 (column name identifier collision).
*
* The source SQLite database has a table "products" with two columns whose
* names share the same first 63 characters:
*
* col_very_long_name_that_exceeds_postgresql_identifier_limit_aaax
* col_very_long_name_that_exceeds_postgresql_identifier_limit_aaay
*
* Both truncate to the same PostgreSQL identifier (NAMEDATALEN-1 = 63).
*
* Expected outcome:
* - pgloader exits with a non-zero exit code.
* - pgloader does NOT hang.
* - pgloader logs an error naming the colliding columns.
* - pgloader does NOT attempt to create any table or copy any data.
*
* To regenerate the database:
* python3 test/sqlite/create-collision.py
*/

load database
from 'sqlite/collision.db'
into postgresql:///pgloader

with include drop, create tables, create indexes, reset sequences

set search_path to 'collision_test'

before load do
$$ create schema if not exists collision_test; $$;
Binary file added test/sqlite/collision.db
Binary file not shown.
43 changes: 43 additions & 0 deletions test/sqlite/create-collision.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env python3
"""
Regenerate test/sqlite/collision.db for the identifier collision test
(GitHub issue #353).

Two columns in the same table share the same first 63 characters so they
would both truncate to the same PostgreSQL identifier name. pgloader must
detect this early and exit with a clear error rather than letting PostgreSQL
silently lose data.

Usage:
python3 test/sqlite/create-collision.py
"""
import os
import sqlite3

PREFIX = "col_very_long_name_that_exceeds_postgresql_identifier_limit_aaa"
assert len(PREFIX) == 63, f"prefix should be 63 chars, got {len(PREFIX)}"
COL_A = PREFIX + "x" # 64 chars, truncates to PREFIX
COL_B = PREFIX + "y" # 64 chars, truncates to PREFIX — collision!
assert len(COL_A) == 64
assert COL_A[:63] == COL_B[:63]

out = os.path.join(os.path.dirname(__file__), "collision.db")
if os.path.exists(out):
os.remove(out)

conn = sqlite3.connect(out)
conn.execute(
f"""CREATE TABLE products (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
{COL_A} INTEGER,
{COL_B} INTEGER
)"""
)
conn.execute("INSERT INTO products VALUES (1, 'apple', 10, 20)")
conn.commit()
conn.close()
print(f"Created {out}")
print(f" colliding columns: {COL_A!r}")
print(f" {COL_B!r}")
print(f" both truncate to: {PREFIX!r}")
Loading