diff --git a/sqlmesh/core/model/definition.py b/sqlmesh/core/model/definition.py index a059fa4e87..6f3b012bd8 100644 --- a/sqlmesh/core/model/definition.py +++ b/sqlmesh/core/model/definition.py @@ -1588,7 +1588,7 @@ def is_breaking_change(self, previous: Model) -> t.Optional[bool]: for edit in edits: if not isinstance(edit, Insert): - return None + return _additive_projection_change(previous_query, this_query, self.dialect) expr = edit.expression if isinstance(expr, exp.UDTF): @@ -1602,7 +1602,7 @@ def is_breaking_change(self, previous: Model) -> t.Optional[bool]: expr = parent if not _is_projection(expr) and expr.parent not in inserted_expressions: - return None + return _additive_projection_change(previous_query, this_query, self.dialect) return False @@ -2907,6 +2907,98 @@ def _is_projection(expr: exp.Expr) -> bool: return isinstance(parent, exp.Select) and expr.arg_key == "expressions" +def _has_ordinal_references(query: exp.Select) -> bool: + order = query.args.get("order") + if order and any( + isinstance(ob.this, exp.Literal) and ob.this.is_number for ob in order.expressions + ): + return True + group = query.args.get("group") + return bool( + group and any(isinstance(gb, exp.Literal) and gb.is_number for gb in group.expressions) + ) + + +def _additive_projection_change( + previous_query: exp.Query, + this_query: exp.Query, + dialect: DialectType, +) -> t.Optional[bool]: + """Fallback for when SQLGlot's tree diff can't express an additive projection change. + + SQLGlot's diff matches nodes by structural similarity, so interchangeable leaves (e.g. two + identical ``CAST(... AS T)`` target types) can be cross-matched. Inserting a same-type cast + above an existing one therefore yields spurious ``Move`` / ``Update`` edits even though a + column was simply added to the SELECT list. In that case the edit-based check above is + inconclusive, so we verify additivity directly against the output projections. + + Returns ``False`` (non-breaking) only when the change is provably additive: + * both queries are simple ``SELECT`` statements, + * everything other than the projection list is structurally identical, + * no added projection is a (potentially cardinality-changing) ``UDTF``, + * every previous projection is preserved, in order, within the new projection list, and + * no mid-list insert shifts ordinal ``ORDER BY`` / ``GROUP BY`` references. + + Otherwise returns ``None`` (undetermined), preserving the conservative default. + """ + # UNIONs or other query expressions, are left to the caller's conservative diff result. + if not isinstance(previous_query, exp.Select) or not isinstance(this_query, exp.Select): + return None + + previous_projections = previous_query.expressions + this_projections = this_query.expressions + # If the new query has not gained any projections, this cannot be an additive projection-only + # change, so there is nothing for this fallback to prove. + if len(this_projections) <= len(previous_projections): + return None + + # Adding a UDTF projection (e.g. EXPLODE / UNNEST) can change row cardinality, so such a + # change is not safely non-breaking even when it appears as an extra SELECT item. + for projection in this_projections: + bare = projection.this if isinstance(projection, exp.Alias) else projection + if isinstance(bare, exp.UDTF): + return None + + # Everything other than the projection list must be structurally identical. Replacing each + # SELECT list with the same dummy literal lets the expression equality check focus on the + # FROM / WHERE / GROUP BY / ORDER BY / etc. parts of the query. + previous_skeleton = previous_query.copy() + this_skeleton = this_query.copy() + previous_skeleton.set("expressions", [exp.Literal.number(1)]) + this_skeleton.set("expressions", [exp.Literal.number(1)]) + if previous_skeleton != this_skeleton: + return None + + # Every previous projection must appear, in order, within the new projection list. Comparing + # dialect-normalized SQL makes semantically equivalent projection nodes match even when the + # parser built distinct object identities. + this_projection_sql = [p.sql(dialect=dialect, comments=False) for p in this_projections] + search_start = 0 + matched_at: list[int] = [] + for projection in previous_projections: + target_sql = projection.sql(dialect=dialect, comments=False) + # Continue after the previous match so added columns can appear before, between, or after + # the original projections, but existing projections cannot be reordered or rewritten. + for index in range(search_start, len(this_projection_sql)): + if this_projection_sql[index] == target_sql: + matched_at.append(index) + search_start = index + 1 + break + else: + return None + + # Mid-list inserts shift ordinal references in ORDER BY / GROUP BY clauses. + if _has_ordinal_references(this_query): + matched_set = set(matched_at) + last_matched = matched_at[-1] + if any(i < last_matched for i in range(len(this_projections)) if i not in matched_set): + return None + + # At this point the query shape is unchanged and all prior outputs are preserved, so the only + # remaining difference is one or more additional, non-UDTF projections. + return False + + def _single_expr_or_tuple(values: t.Sequence[exp.Expr]) -> exp.Expr | exp.Tuple: return values[0] if len(values) == 1 else exp.Tuple(expressions=values) diff --git a/tests/core/test_snapshot.py b/tests/core/test_snapshot.py index 1acc6cc265..2edf8b7383 100644 --- a/tests/core/test_snapshot.py +++ b/tests/core/test_snapshot.py @@ -1633,6 +1633,183 @@ def test_categorize_change_sql(make_snapshot): ) +def test_categorize_change_sql_redundant_cast(make_snapshot): + # Adding a column with a redundant CAST above an existing same-type cast makes SQLGlot's tree + # diff emit spurious Move/Update edits (interchangeable DataType leaves get cross-matched), + # even though the change is purely an added projection. These must remain NON_BREAKING. + config = CategorizerConfig(sql=AutoCategorizationMode.SEMI) + + old_snapshot = make_snapshot( + SqlModel(name="a", query=parse_one("SELECT a::DATE, s::TEXT FROM t")) + ) + + # A same-type cast column has been inserted mid-list, above the existing one. + assert ( + categorize_change( + new=make_snapshot( + SqlModel(name="a", query=parse_one("SELECT a::DATE, x::TEXT, s::TEXT FROM t")) + ), + old=old_snapshot, + config=config, + ) + == SnapshotChangeCategory.NON_BREAKING + ) + + # Multiple same-type cast columns have been inserted mid-list. + assert ( + categorize_change( + new=make_snapshot( + SqlModel( + name="a", query=parse_one("SELECT a::DATE, x::TEXT, s::TEXT, y::INT FROM t") + ) + ), + old=old_snapshot, + config=config, + ) + == SnapshotChangeCategory.NON_BREAKING + ) + + # The type of an existing projection has changed (in addition to a new column): undetermined. + assert ( + categorize_change( + new=make_snapshot( + SqlModel(name="a", query=parse_one("SELECT a::INT, x::TEXT, s::TEXT FROM t")) + ), + old=old_snapshot, + config=config, + ) + is None + ) + + # Existing projections have been reordered: undetermined. + assert ( + categorize_change( + new=make_snapshot( + SqlModel(name="a", query=parse_one("SELECT s::TEXT, a::DATE FROM t")) + ), + old=old_snapshot, + config=config, + ) + is None + ) + + # An existing projection has been removed: undetermined. + assert ( + categorize_change( + new=make_snapshot(SqlModel(name="a", query=parse_one("SELECT s::TEXT FROM t"))), + old=old_snapshot, + config=config, + ) + is None + ) + + # A WHERE clause changed alongside the added cast column: undetermined. + assert ( + categorize_change( + new=make_snapshot( + SqlModel( + name="a", + query=parse_one("SELECT a::DATE, x::TEXT, s::TEXT FROM t WHERE a = 2"), + ) + ), + old=make_snapshot( + SqlModel(name="a", query=parse_one("SELECT a::DATE, s::TEXT FROM t WHERE a = 1")) + ), + config=config, + ) + is None + ) + + # Mid-list insert with ORDER BY ordinal shifts the referenced projection: undetermined. + assert ( + categorize_change( + new=make_snapshot( + SqlModel( + name="a", + query=parse_one("SELECT a::DATE, x::TEXT, s::TEXT FROM t ORDER BY 2"), + ) + ), + old=make_snapshot( + SqlModel(name="a", query=parse_one("SELECT a::DATE, s::TEXT FROM t ORDER BY 2")) + ), + config=config, + ) + is None + ) + + # Append at end with ORDER BY ordinal leaves existing ordinal bindings unchanged. + assert ( + categorize_change( + new=make_snapshot( + SqlModel( + name="a", + query=parse_one("SELECT a::DATE, s::TEXT, x::TEXT FROM t ORDER BY 2"), + ) + ), + old=make_snapshot( + SqlModel(name="a", query=parse_one("SELECT a::DATE, s::TEXT FROM t ORDER BY 2")) + ), + config=config, + ) + == SnapshotChangeCategory.NON_BREAKING + ) + + # Mid-list insert with GROUP BY ordinal shifts the referenced projection: undetermined. + assert ( + categorize_change( + new=make_snapshot( + SqlModel( + name="a", + query=parse_one("SELECT a::DATE, x::TEXT, s::TEXT FROM t GROUP BY 2"), + ) + ), + old=make_snapshot( + SqlModel(name="a", query=parse_one("SELECT a::DATE, s::TEXT FROM t GROUP BY 2")) + ), + config=config, + ) + is None + ) + + # Aliased UDTF projection via fallback path: undetermined. + assert ( + categorize_change( + new=make_snapshot( + SqlModel( + name="a", + query=parse_one( + "SELECT a::DATE AS a, x::TEXT AS x, EXPLODE(y) AS y, s::TEXT AS s FROM t" + ), + ) + ), + old=make_snapshot( + SqlModel(name="a", query=parse_one("SELECT a::DATE AS a, s::TEXT AS s FROM t")) + ), + config=config, + ) + is None + ) + + # UDTF inside aliased scalar subquery remains non-breaking. + assert ( + categorize_change( + new=make_snapshot( + SqlModel( + name="a", + query=parse_one( + "SELECT a::DATE AS a, (SELECT x FROM unnest(b) x) AS sub, s::TEXT AS s FROM t" + ), + ) + ), + old=make_snapshot( + SqlModel(name="a", query=parse_one("SELECT a::DATE AS a, s::TEXT AS s FROM t")) + ), + config=config, + ) + == SnapshotChangeCategory.NON_BREAKING + ) + + def test_categorize_change_seed(make_snapshot, tmp_path): config = CategorizerConfig(seed=AutoCategorizationMode.SEMI) model_name = "test_db.test_seed_model"