From 6a49583da58d500ceb00e20f497376ca8fc08815 Mon Sep 17 00:00:00 2001 From: Wayne Sutton Date: Tue, 23 Dec 2025 23:19:40 -0800 Subject: [PATCH] fix(edge): let AI crawlers bypass botMeta OG interception for /raw/*.md --- changelog.md | 18 ++++++ netlify.toml | 36 ++++++++++-- netlify/edge-functions/botMeta.ts | 93 +++++++++++++++++++++++-------- 3 files changed, 119 insertions(+), 28 deletions(-) diff --git a/changelog.md b/changelog.md index 2e41225..81f8da1 100644 --- a/changelog.md +++ b/changelog.md @@ -4,6 +4,24 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [1.24.5] - 2025-12-23 + +### Fixed + +- AI crawlers (ChatGPT, Perplexity) can now fetch raw markdown from `/raw/*.md` URLs + - Added explicit `/raw/*` redirect passthrough in `netlify.toml` before SPA fallback + - Expanded `excludedPath` array to cover all static file patterns + - Refactored `botMeta.ts` edge function: + - Added hard bypass at top of handler for static file paths + - Separated social preview bots from AI crawlers + - AI crawlers (GPTBot, ClaudeBot, PerplexityBot, etc.) now bypass OG interception + - Only social preview bots (Facebook, Twitter, LinkedIn, etc.) 
receive OG metadata HTML + +### Technical + +- `netlify.toml`: Added `force = true` to `/raw/*` redirect, expanded `excludedPath` array +- `botMeta.ts`: Complete refactor with `SOCIAL_PREVIEW_BOTS` and `AI_CRAWLERS` lists, hard path bypass + ## [1.24.4] - 2025-12-23 ### Added diff --git a/netlify.toml b/netlify.toml index 86719c0..ae777be 100644 --- a/netlify.toml +++ b/netlify.toml @@ -5,6 +5,20 @@ [build.environment] NODE_VERSION = "20" +# Raw markdown passthrough - explicit rule prevents SPA fallback from intercepting +[[redirects]] + from = "/raw/*" + to = "/raw/:splat" + status = 200 + force = true + +# Static files passthrough +[[redirects]] + from = "/assets/*" + to = "/assets/:splat" + status = 200 + force = true + # SPA fallback for client-side routing (must be last) [[redirects]] from = "/*" @@ -44,11 +58,23 @@ path = "/api/geo" function = "geo" -# Open Graph bot detection (excludes raw markdown directory) +# Open Graph bot detection for social preview cards only +# Excludes raw markdown, static assets, and AI-consumable files [[edge_functions]] path = "/*" function = "botMeta" - excludedPath = "/raw/*" + excludedPath = [ + "/raw/*", + "/assets/*", + "/api/*", + "/.netlify/*", + "/favicon.ico", + "/favicon.svg", + "/robots.txt", + "/sitemap.xml", + "/llms.txt", + "/openapi.yaml" + ] # Security and SEO headers [[headers]] @@ -60,12 +86,13 @@ Referrer-Policy = "strict-origin-when-cross-origin" Link = "; rel=\"author\"" -# Raw markdown files with proper content-type +# Raw markdown files - AI friendly headers [[headers]] - for = "/raw/*.md" + for = "/raw/*" [headers.values] Content-Type = "text/markdown; charset=utf-8" Access-Control-Allow-Origin = "*" + Cache-Control = "public, max-age=3600" X-Robots-Tag = "noindex" [context.production.environment] @@ -76,4 +103,3 @@ [context.branch-deploy.environment] NODE_ENV = "development" - diff --git a/netlify/edge-functions/botMeta.ts b/netlify/edge-functions/botMeta.ts index fbf2140..0b1b3c8 100644 --- 
a/netlify/edge-functions/botMeta.ts +++ b/netlify/edge-functions/botMeta.ts @@ -1,7 +1,8 @@ import type { Context } from "@netlify/edge-functions"; -// List of known social media and search engine bots -const BOTS = [ +// Social preview bots that need OG metadata HTML +// These bots cannot render JavaScript and need pre-rendered OG tags +const SOCIAL_PREVIEW_BOTS = [ "facebookexternalhit", "twitterbot", "linkedinbot", @@ -12,24 +13,45 @@ const BOTS = [ "pinterest", "opengraph", "opengraphbot", - "bot ", - "crawler", "embedly", "vkshare", "quora link preview", "redditbot", "rogerbot", "showyoubot", - "google", - "bingbot", - "baiduspider", - "duckduckbot", ]; -function isBot(userAgent: string | null): boolean { +// AI crawlers that should get raw content, not OG previews +const AI_CRAWLERS = [ + "gptbot", + "chatgpt", + "chatgpt-user", + "oai-searchbot", + "claude-web", + "claudebot", + "anthropic", + "anthropic-ai", + "ccbot", + "perplexitybot", + "perplexity", + "cohere-ai", + "bytespider", + "googleother", + "google-extended", +]; + +// Check if user agent is a social preview bot +function isSocialPreviewBot(userAgent: string | null): boolean { if (!userAgent) return false; const ua = userAgent.toLowerCase(); - return BOTS.some((bot) => ua.includes(bot)); + return SOCIAL_PREVIEW_BOTS.some((bot) => ua.includes(bot)); +} + +// Check if user agent is an AI crawler +function isAICrawler(userAgent: string | null): boolean { + if (!userAgent) return false; + const ua = userAgent.toLowerCase(); + return AI_CRAWLERS.some((bot) => ua.includes(bot)); } export default async function handler( @@ -37,28 +59,53 @@ request: Request, context: Context, ): Promise<Response> { const url = new URL(request.url); - const userAgent = request.headers.get("user-agent"); - // Only intercept post pages for bots - const pathParts = url.pathname.split("/").filter(Boolean); - - // Skip if it's the home page, static assets, API routes, or raw markdown files + // HARD BYPASS: Never
intercept these paths regardless of user agent + // This is the first check to guarantee static files are served directly if ( - pathParts.length === 0 || - pathParts[0].includes(".") || - pathParts[0] === "api" || - pathParts[0] === "_next" || - pathParts[0] === "raw" + url.pathname.startsWith("/raw/") || + url.pathname.startsWith("/assets/") || + url.pathname.startsWith("/api/") || + url.pathname.startsWith("/.netlify/") || + url.pathname.endsWith(".md") || + url.pathname.endsWith(".xml") || + url.pathname.endsWith(".txt") || + url.pathname.endsWith(".yaml") || + url.pathname.endsWith(".json") || + url.pathname.endsWith(".svg") || + url.pathname.endsWith(".ico") || + url.pathname.endsWith(".png") || + url.pathname.endsWith(".jpg") || + url.pathname.endsWith(".jpeg") || + url.pathname.endsWith(".gif") || + url.pathname.endsWith(".webp") || + url.pathname.endsWith(".css") || + url.pathname.endsWith(".js") ) { return context.next(); } - // If not a bot, continue to the SPA - if (!isBot(userAgent)) { + const userAgent = request.headers.get("user-agent"); + + // Let AI crawlers through to normal content - they need raw data, not OG previews + if (isAICrawler(userAgent)) { return context.next(); } - // For bots, fetch the Open Graph metadata from Convex + // Only intercept post pages for bots + const pathParts = url.pathname.split("/").filter(Boolean); + + // Skip home page and any path with a file extension + if (pathParts.length === 0 || pathParts[0].includes(".")) { + return context.next(); + } + + // Only serve OG metadata to social preview bots, not search engines or AI + if (!isSocialPreviewBot(userAgent)) { + return context.next(); + } + + // For social preview bots, fetch the Open Graph metadata from Convex const slug = pathParts[0]; const convexUrl = Deno.env.get("VITE_CONVEX_URL") || Deno.env.get("CONVEX_URL");