@@ -91,19 +91,31 @@ class BracketedTableDialect(TSQL):
9191
9292
9393class DialectParser :
94- """Detect the appropriate sqlglot dialect and parse SQL into an AST."""
94+ """Detect the appropriate sqlglot dialect and parse SQL into an AST.
95+
96+ SQL varies across database engines — back-ticks (MySQL), square
97+ brackets (TSQL), ``#temp`` tables (MSSQL), ``LATERAL VIEW`` (Hive),
98+ etc. A single sqlglot dialect cannot handle all of them, so this
99+ class first inspects the raw SQL for dialect markers, then tries
100+ candidate dialects in order and picks the first result that passes
101+ quality checks.
102+ """
95103
def parse(self, clean_sql: str) -> tuple[exp.Expression, DialectType]:
    """Parse *clean_sql* into a sqlglot AST, returning ``(ast, dialect)``.

    Two-phase entry point: :meth:`_detect_dialects` produces the
    priority-ordered candidate dialects for the SQL, then
    :meth:`_try_dialects` attempts those candidates; whatever it
    returns is handed back to the caller unchanged.

    :param clean_sql: Preprocessed SQL string produced by
        :class:`~sql_metadata.sql_cleaner.SqlCleaner` (comments
        stripped, outer parentheses removed, CTE names normalised).
    :returns: 2-tuple of ``(ast_root_node, winning_dialect)``.
    :raises InvalidQueryDefinition: Propagated from
        :meth:`_try_dialects` when no candidate dialect produces a
        usable AST.
    """
    candidates = self._detect_dialects(clean_sql)
    ast_and_dialect = self._try_dialects(clean_sql, candidates)
    return ast_and_dialect
@@ -112,20 +124,30 @@ def parse(self, clean_sql: str) -> tuple[exp.Expression, DialectType]:
112124
113125 @staticmethod
114126 def _detect_dialects (sql : str ) -> list [Any ]:
115- """Choose an ordered list of sqlglot dialects to try for *sql*.
116-
117- Heuristics:
118-
119- * ``#WORD`` → :class:`HashVarDialect` (MSSQL temp tables).
120- * Back-ticks → ``"mysql"``.
121- * Square brackets or ``TOP`` → :class:`BracketedTableDialect`.
122- * ``UNIQUE`` → try default, MySQL, Oracle.
123- * ``LATERAL VIEW`` → ``"spark"`` (Hive).
127+ """Build a priority-ordered list of sqlglot dialects for *sql*.
128+
129+ Scans the SQL string for syntactic markers that reveal which
130+ database engine produced it and returns the most likely dialect
131+ first. Every list includes at least one fallback so that the
132+ subsequent :meth:`_try_dialects` loop always has alternatives.
133+
134+ Heuristics (checked in order, first match wins):
135+
136+ * ``#WORD`` patterns → :class:`HashVarDialect` (MSSQL ``#temp``
137+ tables or ``#VAR#`` template placeholders).
138+ * Back-tick quoting → ``"mysql"`` (MySQL-style identifiers).
139+ * ``LATERAL VIEW`` → ``"spark"`` (Hive/Spark explode syntax).
140+ * Square brackets or ``TOP`` keyword →
141+ :class:`BracketedTableDialect` (TSQL bracket-quoted names).
142+ * ``UNIQUE`` keyword → default, ``"mysql"``, ``"oracle"``
143+ (ambiguous across engines).
144+ * ``APPEND FROM`` → :class:`RedshiftAppendDialect` (Redshift
145+ ``ALTER TABLE … APPEND FROM`` not natively supported).
146+ * No markers → default dialect with ``"mysql"`` fallback.
124147
125148 :param sql: Cleaned SQL string.
126- :type sql: str
127- :returns: Ordered list of dialects to attempt.
128- :rtype: list
149+ :returns: Ordered list of dialect identifiers or classes to
150+ attempt.
129151 """
130152 upper = sql .upper ()
131153 if _has_hash_variables (sql ):
@@ -147,20 +169,25 @@ def _detect_dialects(sql: str) -> list[Any]:
147169 def _try_dialects (
148170 self , clean_sql : str , dialects : list [Any ]
149171 ) -> tuple [exp .Expression , DialectType ]:
150- """Try parsing *clean_sql* with each dialect, returning the best.
151-
152- :returns: 2-tuple of ``(ast_node, winning_dialect)``.
153- :raises ValueError: If all dialect attempts fail.
172+ """Try each candidate dialect in order and return the first good result.
173+
174+ Iterates over *dialects*, calling :meth:`_parse_with_dialect` for
175+ each. A result is accepted immediately if it is the last dialect
176+ in the list (best-effort) or if :meth:`_is_degraded` reports no
177+ quality issues. Degraded results from non-last dialects are
178+ skipped so the next candidate gets a chance.
179+
180+ :param clean_sql: Preprocessed SQL string.
181+ :param dialects: Priority-ordered list from :meth:`_detect_dialects`.
182+ :returns: 2-tuple of ``(ast_root_node, winning_dialect)``.
183+ :raises InvalidQueryDefinition: If the last dialect raises a
184+ parse error, or if no dialect produces a usable AST.
154185 """
155- last_result = None
156- winning_dialect = None
157186 for dialect in dialects :
158187 try :
159188 result = self ._parse_with_dialect (clean_sql , dialect )
160189 if result is None :
161190 continue
162- last_result = result
163- winning_dialect = dialect
164191 is_last = dialect == dialects [- 1 ]
165192 if not is_last and self ._is_degraded (result , clean_sql ):
166193 continue
@@ -172,16 +199,32 @@ def _try_dialects(
172199 )
173200 continue
174201
175- # TODO: revisit if sqlglot starts returning None from parse for last dialect
176- if last_result is not None : # pragma: no cover
177- return last_result , winning_dialect
178202 raise InvalidQueryDefinition (
179203 "Query could not be parsed — no dialect could handle this SQL"
180204 )
181205
182206 @staticmethod
183207 def _parse_with_dialect (clean_sql : str , dialect : Any ) -> exp .Expression | None :
184- """Parse *clean_sql* with a single dialect, suppressing warnings."""
208+ """Parse *clean_sql* with a single sqlglot dialect.
209+
210+ Uses ``ErrorLevel.WARN`` so that sqlglot returns a best-effort
211+ AST instead of raising on the first syntax problem — the caller
212+ decides whether the result is good enough via
213+ :meth:`_is_degraded`.
214+
215+ The sqlglot logger is temporarily raised to ``CRITICAL`` during
216+ the parse call because ``WARN`` mode emits noisy warnings for
217+ every token it cannot handle. Since :meth:`_try_dialects`
218+ intentionally tries multiple dialects expecting some to produce
219+ degraded results, those warnings are expected and would mislead
220+ end-users if left visible.
221+
222+ :param clean_sql: Preprocessed SQL string.
223+ :param dialect: A sqlglot dialect identifier, class, or ``None``
224+ for the default dialect.
225+ :returns: The root AST node, or ``None`` if sqlglot could not
226+ produce any result.
227+ """
185228 logger = logging .getLogger ("sqlglot" )
186229 old_level = logger .level
187230 logger .setLevel (logging .CRITICAL )
@@ -196,35 +239,58 @@ def _parse_with_dialect(clean_sql: str, dialect: Any) -> exp.Expression | None:
196239
197240 if not results or results [0 ] is None :
198241 return None
199- result = results [0 ]
200- assert result is not None # guaranteed by check above
201- # TODO: revisit if sqlglot returns top-level Subquery
202- if isinstance (result , exp .Subquery ) and not result .alias : # pragma: no cover
203- inner = result .this
204- if isinstance (inner , exp .Expression ):
205- return inner
206- return result # type: ignore[return-value]
242+ return results [0 ] # type: ignore[return-value]
207243
208244 # -- quality checks -----------------------------------------------------
209245
def _is_degraded(self, result: exp.Expression, clean_sql: str) -> bool:
    """Return ``True`` when *result* should be rejected as low quality.

    Two checks are applied in order:

    1. *result* is an ``exp.Command`` although
       :meth:`_is_expected_command` does not whitelist *clean_sql*.
    2. :meth:`_has_parse_issues` flags the AST.

    :param result: Root AST node from :meth:`_parse_with_dialect`.
    :param clean_sql: Cleaned SQL the AST was parsed from; consulted
        only when *result* is an ``exp.Command``.
    :returns: ``True`` if the result should be discarded in favour of
        the next dialect.
    """
    unexpected_command = isinstance(result, exp.Command) and not (
        self._is_expected_command(clean_sql)
    )
    # ``or`` short-circuits, so the AST walk is skipped once the result
    # is already known to be an unexpected Command.
    return unexpected_command or self._has_parse_issues(result)
215265
216266 @staticmethod
217267 def _is_expected_command (sql : str ) -> bool :
218- """Check whether *sql* legitimately parses as ``exp.Command``."""
268+ """Return ``True`` when *sql* legitimately parses as ``exp.Command``.
269+
270+ Some dialect-specific DDL (e.g. Hive ``CREATE FUNCTION … USING
271+ JAR … WITH SERDEPROPERTIES``) is not supported by any sqlglot
272+ dialect and always degrades to ``exp.Command``. This method
273+ whitelists those known cases so :meth:`_is_degraded` does not
274+ reject them.
275+
276+ :param sql: Cleaned SQL string.
277+ :returns: ``True`` if ``exp.Command`` is the expected result.
278+ """
219279 upper = sql .strip ().upper ()
220280 return upper .startswith ("CREATE FUNCTION" )
221281
222282 @staticmethod
223283 def _has_parse_issues (ast : exp .Expression ) -> bool :
224- """Detect signs of a degraded or incorrect parse.
284+ """Walk the AST looking for signs of a degraded or incorrect parse.
285+
286+ When sqlglot misinterprets a query it often places SQL keywords
287+ (``UNIQUE``, ``DISTINCT``, etc.) into column or table name
288+ positions, or produces table nodes with empty names. This
289+ method scans all :class:`~sqlglot.exp.Table` and
290+ :class:`~sqlglot.exp.Column` nodes for those telltale patterns.
225291
226- Checks for table nodes with empty/keyword-like names and column
227- nodes whose name is a SQL keyword without a table qualifier .
292+ :param ast: Root AST node to inspect.
293+ :returns: ``True`` if suspicious nodes were found.
228294 """
229295 for table in ast .find_all (exp .Table ):
230296 if table .name in _BAD_TABLE_NAMES :
0 commit comments