Fix Google Search Console indexing issues for spdlearn.org

bruAristimunha · bruAristimunha · commit c7ee6b7435fd · 2026-02-26T11:10:57.000+01:00
Two root causes were preventing proper indexing: 1. Sitemap URLs had a broken `en/0.1/` prefix (all 179 pages returned 404). sphinx_sitemap defaults to the URL scheme `{lang}{version}{link}`, which doesn't match our single-version deployment at the site root. Fixed by setting `sitemap_url_scheme = "{link}"`. 2. The homepage canonical tag included `index.html` (`https://spdlearn.org/index.html`), causing Google to override it with `https://spdlearn.org/`. Added a Sphinx `html-page-context` hook that strips the trailing `index.html` from any canonical URL ending in `/index.html`.
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -303,6 +303,10 @@
 # Point 6: SEO - Add canonical URLs
 html_baseurl = "https://spdlearn.org/"
 
+# Fix sitemap URLs: sphinx_sitemap defaults to "{lang}{version}{link}", but this
+# site is deployed single-version at the root, so emit bare page links instead.
+sitemap_url_scheme = "{link}"
+
 # Point 7: Copy root-level site files (robots.txt, BingSiteAuth.xml, etc.)
 html_extra_path = ["_extra"]
 
@@ -549,3 +553,14 @@ def linkcode_resolve(domain, info):
         pass
 
     return f"https://github.com/{github_user}/{github_repo}/blob/{github_version}/{relpath}{linespec}"
+
+
+def _fix_index_canonical_url(app, pagename, templatename, context, doctree):
+    """Rewrite a canonical ``pageurl`` ending in ``/index.html`` to its bare directory URL."""
+    canonical = context.get("pageurl") or ""
+    if canonical.endswith("/index.html"):
+        context["pageurl"] = canonical[: canonical.rfind("/") + 1]
+
+
+def setup(app):  # register conf.py-level event handlers on the Sphinx app
+    app.connect("html-page-context", _fix_index_canonical_url)  # strip index.html from canonical URLs