fix(edge): let AI crawlers bypass botMeta OG interception for /raw/*.md

Wayne Sutton
2025-12-23 23:19:40 -08:00
parent e8c8218c81
commit 6a49583da5
3 changed files with 119 additions and 28 deletions

CHANGELOG.md

@@ -4,6 +4,24 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
+## [1.24.5] - 2025-12-23
+### Fixed
+- AI crawlers (ChatGPT, Perplexity) can now fetch raw markdown from `/raw/*.md` URLs
+- Added explicit `/raw/*` redirect passthrough in `netlify.toml` before SPA fallback
+- Expanded `excludedPath` array to cover all static file patterns
+- Refactored `botMeta.ts` edge function:
+  - Added hard bypass at top of handler for static file paths
+  - Separated social preview bots from AI crawlers
+  - AI crawlers (GPTBot, ClaudeBot, PerplexityBot, etc.) now bypass OG interception
+  - Only social preview bots (Facebook, Twitter, LinkedIn, etc.) receive OG metadata HTML
+### Technical
+- `netlify.toml`: Added `force = true` to `/raw/*` redirect, expanded `excludedPath` array
+- `botMeta.ts`: Complete refactor with `SOCIAL_PREVIEW_BOTS` and `AI_CRAWLERS` lists, hard path bypass
## [1.24.4] - 2025-12-23
### Added
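
A quick way to verify the behavior this entry describes is to request a raw markdown URL with an AI-crawler user agent and confirm the response is markdown rather than the OG preview HTML. A minimal sketch; the site URL and slug below are placeholders, not taken from this repo:

```ts
// Hypothetical smoke test; URL and slug are placeholders, not from this repo.
// Run with: deno run --allow-net check-raw.ts
const res = await fetch("https://example.com/raw/some-post.md", {
  headers: {
    "user-agent": "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)",
  },
});
const body = await res.text();
console.log(res.status, res.headers.get("content-type"));
// Expect 200 with text/markdown and no injected OG tags in the body.
if (body.includes('property="og:')) {
  throw new Error("OG interception still active for AI crawlers");
}
```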

netlify.toml

@@ -5,6 +5,20 @@
[build.environment]
NODE_VERSION = "20"
+# Raw markdown passthrough - explicit rule prevents SPA fallback from intercepting
+[[redirects]]
+from = "/raw/*"
+to = "/raw/:splat"
+status = 200
+force = true
+# Static files passthrough
+[[redirects]]
+from = "/assets/*"
+to = "/assets/:splat"
+status = 200
+force = true
# SPA fallback for client-side routing (must be last)
[[redirects]]
from = "/*"
@@ -44,11 +58,23 @@
path = "/api/geo"
function = "geo"
-# Open Graph bot detection (excludes raw markdown directory)
+# Open Graph bot detection for social preview cards only
+# Excludes raw markdown, static assets, and AI-consumable files
[[edge_functions]]
path = "/*"
function = "botMeta"
-excludedPath = "/raw/*"
+excludedPath = [
+  "/raw/*",
+  "/assets/*",
+  "/api/*",
+  "/.netlify/*",
+  "/favicon.ico",
+  "/favicon.svg",
+  "/robots.txt",
+  "/sitemap.xml",
+  "/llms.txt",
+  "/openapi.yaml"
+]
# Security and SEO headers
[[headers]]
@@ -60,12 +86,13 @@
Referrer-Policy = "strict-origin-when-cross-origin"
Link = "</llms.txt>; rel=\"author\""
-# Raw markdown files with proper content-type
+# Raw markdown files - AI friendly headers
[[headers]]
-for = "/raw/*.md"
+for = "/raw/*"
[headers.values]
Content-Type = "text/markdown; charset=utf-8"
+Access-Control-Allow-Origin = "*"
+Cache-Control = "public, max-age=3600"
X-Robots-Tag = "noindex"
[context.production.environment]
@@ -76,4 +103,3 @@
[context.branch-deploy.environment]
NODE_ENV = "development"
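
The header rules above can be checked the same way; a minimal sketch with a placeholder URL, where the expected values come straight from the `[headers]` block in this diff:

```ts
// Hypothetical header check for the /raw/* rules above (placeholder URL).
const res = await fetch("https://example.com/raw/some-post.md");
const expected: Record<string, string> = {
  "content-type": "text/markdown; charset=utf-8",
  "access-control-allow-origin": "*",
  "cache-control": "public, max-age=3600",
  "x-robots-tag": "noindex",
};
for (const [name, want] of Object.entries(expected)) {
  const got = res.headers.get(name);
  console.log(name, got === want ? "OK" : `got ${got}, expected ${want}`);
}
```

One ordering note on the redirects: Netlify evaluates `[[redirects]]` rules top to bottom with first match winning, so the forced `/raw/*` and `/assets/*` passthroughs must appear before the catch-all `/*` SPA fallback, or the fallback would serve `index.html` for raw markdown requests.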

botMeta.ts

@@ -1,7 +1,8 @@
import type { Context } from "@netlify/edge-functions";
-// List of known social media and search engine bots
-const BOTS = [
+// Social preview bots that need OG metadata HTML
+// These bots cannot render JavaScript and need pre-rendered OG tags
+const SOCIAL_PREVIEW_BOTS = [
"facebookexternalhit",
"twitterbot",
"linkedinbot",
@@ -12,24 +13,45 @@ const BOTS = [
"pinterest",
"opengraph",
"opengraphbot",
-"bot ",
-"crawler",
"embedly",
"vkshare",
"quora link preview",
"redditbot",
"rogerbot",
"showyoubot",
-"google",
-"bingbot",
-"baiduspider",
-"duckduckbot",
];
-function isBot(userAgent: string | null): boolean {
+// AI crawlers that should get raw content, not OG previews
+const AI_CRAWLERS = [
+"gptbot",
+"chatgpt",
+"chatgpt-user",
+"oai-searchbot",
+"claude-web",
+"claudebot",
+"anthropic",
+"anthropic-ai",
+"ccbot",
+"perplexitybot",
+"perplexity",
+"cohere-ai",
+"bytespider",
+"googleother",
+"google-extended",
+];
+// Check if user agent is a social preview bot
+function isSocialPreviewBot(userAgent: string | null): boolean {
if (!userAgent) return false;
const ua = userAgent.toLowerCase();
-return BOTS.some((bot) => ua.includes(bot));
+return SOCIAL_PREVIEW_BOTS.some((bot) => ua.includes(bot));
}
+// Check if user agent is an AI crawler
+function isAICrawler(userAgent: string | null): boolean {
+if (!userAgent) return false;
+const ua = userAgent.toLowerCase();
+return AI_CRAWLERS.some((bot) => ua.includes(bot));
+}
@@ -37,28 +59,53 @@ export default async function handler(
context: Context,
): Promise<Response> {
const url = new URL(request.url);
-const userAgent = request.headers.get("user-agent");
-// Only intercept post pages for bots
-const pathParts = url.pathname.split("/").filter(Boolean);
-// Skip if it's the home page, static assets, API routes, or raw markdown files
+// HARD BYPASS: Never intercept these paths regardless of user agent
+// This is the first check to guarantee static files are served directly
if (
-pathParts.length === 0 ||
-pathParts[0].includes(".") ||
-pathParts[0] === "api" ||
-pathParts[0] === "_next" ||
-pathParts[0] === "raw"
+url.pathname.startsWith("/raw/") ||
+url.pathname.startsWith("/assets/") ||
+url.pathname.startsWith("/api/") ||
+url.pathname.startsWith("/.netlify/") ||
+url.pathname.endsWith(".md") ||
+url.pathname.endsWith(".xml") ||
+url.pathname.endsWith(".txt") ||
+url.pathname.endsWith(".yaml") ||
+url.pathname.endsWith(".json") ||
+url.pathname.endsWith(".svg") ||
+url.pathname.endsWith(".ico") ||
+url.pathname.endsWith(".png") ||
+url.pathname.endsWith(".jpg") ||
+url.pathname.endsWith(".jpeg") ||
+url.pathname.endsWith(".gif") ||
+url.pathname.endsWith(".webp") ||
+url.pathname.endsWith(".css") ||
+url.pathname.endsWith(".js")
) {
return context.next();
}
-// If not a bot, continue to the SPA
-if (!isBot(userAgent)) {
+const userAgent = request.headers.get("user-agent");
+// Let AI crawlers through to normal content - they need raw data, not OG previews
+if (isAICrawler(userAgent)) {
return context.next();
}
-// For bots, fetch the Open Graph metadata from Convex
+// Only intercept post pages for bots
+const pathParts = url.pathname.split("/").filter(Boolean);
+// Skip home page and any path with a file extension
+if (pathParts.length === 0 || pathParts[0].includes(".")) {
+return context.next();
+}
+// Only serve OG metadata to social preview bots, not search engines or AI
+if (!isSocialPreviewBot(userAgent)) {
+return context.next();
+}
+// For social preview bots, fetch the Open Graph metadata from Convex
const slug = pathParts[0];
const convexUrl =
Deno.env.get("VITE_CONVEX_URL") || Deno.env.get("CONVEX_URL");
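
To see how the two lists partition traffic in the refactored handler, here is a self-contained sketch of the matching logic above, run against a few sample user agents. The shortened lists and sample strings are illustrative only, not taken from the repo:

```ts
// Standalone restatement of the matchers from botMeta.ts, for local testing.
// The shortened lists and sample user agents are illustrative only.
const SOCIAL_PREVIEW_BOTS = ["facebookexternalhit", "twitterbot", "linkedinbot"];
const AI_CRAWLERS = ["gptbot", "claudebot", "perplexitybot"];

const matches = (list: string[], ua: string | null): boolean =>
  ua !== null && list.some((bot) => ua.toLowerCase().includes(bot));

const samples = [
  "facebookexternalhit/1.1",                    // social preview bot
  "Mozilla/5.0 (compatible; GPTBot/1.0)",       // AI crawler
  "Mozilla/5.0 (Windows NT 10.0) Chrome/120.0", // regular browser
];
for (const ua of samples) {
  // Mirror the handler's order: the AI-crawler check runs before the social check.
  const route = matches(AI_CRAWLERS, ua)
    ? "context.next() - raw content"
    : matches(SOCIAL_PREVIEW_BOTS, ua)
      ? "pre-rendered OG HTML"
      : "context.next() - SPA";
  console.log(`${ua} -> ${route}`);
}
```

Because matching is a case-insensitive substring check, order within each list does not matter, but the handler's check order does: a user agent that matched both lists would be treated as an AI crawler and bypass OG interception.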