fix(edge): let AI crawlers bypass botMeta OG interception for /raw/*.md

This commit is contained in:
Wayne Sutton
2025-12-23 23:19:40 -08:00
parent e8c8218c81
commit 6a49583da5
3 changed files with 119 additions and 28 deletions

View File

@@ -1,7 +1,8 @@
import type { Context } from "@netlify/edge-functions";
// List of known social media and search engine bots
const BOTS = [
// Social preview bots that need OG metadata HTML
// These bots cannot render JavaScript and need pre-rendered OG tags
const SOCIAL_PREVIEW_BOTS = [
"facebookexternalhit",
"twitterbot",
"linkedinbot",
@@ -12,24 +13,45 @@ const BOTS = [
"pinterest",
"opengraph",
"opengraphbot",
"bot ",
"crawler",
"embedly",
"vkshare",
"quora link preview",
"redditbot",
"rogerbot",
"showyoubot",
"google",
"bingbot",
"baiduspider",
"duckduckbot",
];
function isBot(userAgent: string | null): boolean {
// AI crawlers that should get raw content, not OG previews
const AI_CRAWLERS = [
"gptbot",
"chatgpt",
"chatgpt-user",
"oai-searchbot",
"claude-web",
"claudebot",
"anthropic",
"anthropic-ai",
"ccbot",
"perplexitybot",
"perplexity",
"cohere-ai",
"bytespider",
"googleother",
"google-extended",
];
// Check if user agent is a social preview bot.
// These are link unfurlers (Facebook, Twitter, LinkedIn, etc.) that cannot
// execute JavaScript and therefore need pre-rendered OG metadata HTML.
// Returns false when the User-Agent header is missing or empty.
function isSocialPreviewBot(userAgent: string | null): boolean {
  if (!userAgent) return false;
  const ua = userAgent.toLowerCase();
  // Case-insensitive substring match against the known preview-bot tokens.
  return SOCIAL_PREVIEW_BOTS.some((bot) => ua.includes(bot));
}
// Check if user agent is an AI crawler. These clients are let through to
// the normal response (raw content) rather than the OG-preview HTML.
// Returns false when the User-Agent header is missing or empty.
function isAICrawler(userAgent: string | null): boolean {
  if (!userAgent) return false;
  const normalized = userAgent.toLowerCase();
  for (const token of AI_CRAWLERS) {
    if (normalized.includes(token)) {
      return true;
    }
  }
  return false;
}
export default async function handler(
@@ -37,28 +59,53 @@ export default async function handler(
context: Context,
): Promise<Response> {
const url = new URL(request.url);
const userAgent = request.headers.get("user-agent");
// Only intercept post pages for bots
const pathParts = url.pathname.split("/").filter(Boolean);
// Skip if it's the home page, static assets, API routes, or raw markdown files
// HARD BYPASS: Never intercept these paths regardless of user agent
// This is the first check to guarantee static files are served directly
if (
pathParts.length === 0 ||
pathParts[0].includes(".") ||
pathParts[0] === "api" ||
pathParts[0] === "_next" ||
pathParts[0] === "raw"
url.pathname.startsWith("/raw/") ||
url.pathname.startsWith("/assets/") ||
url.pathname.startsWith("/api/") ||
url.pathname.startsWith("/.netlify/") ||
url.pathname.endsWith(".md") ||
url.pathname.endsWith(".xml") ||
url.pathname.endsWith(".txt") ||
url.pathname.endsWith(".yaml") ||
url.pathname.endsWith(".json") ||
url.pathname.endsWith(".svg") ||
url.pathname.endsWith(".ico") ||
url.pathname.endsWith(".png") ||
url.pathname.endsWith(".jpg") ||
url.pathname.endsWith(".jpeg") ||
url.pathname.endsWith(".gif") ||
url.pathname.endsWith(".webp") ||
url.pathname.endsWith(".css") ||
url.pathname.endsWith(".js")
) {
return context.next();
}
// If not a bot, continue to the SPA
if (!isBot(userAgent)) {
const userAgent = request.headers.get("user-agent");
// Let AI crawlers through to normal content - they need raw data, not OG previews
if (isAICrawler(userAgent)) {
return context.next();
}
// For bots, fetch the Open Graph metadata from Convex
// Only intercept post pages for bots
const pathParts = url.pathname.split("/").filter(Boolean);
// Skip home page and any path with a file extension
if (pathParts.length === 0 || pathParts[0].includes(".")) {
return context.next();
}
// Only serve OG metadata to social preview bots, not search engines or AI
if (!isSocialPreviewBot(userAgent)) {
return context.next();
}
// For social preview bots, fetch the Open Graph metadata from Convex
const slug = pathParts[0];
const convexUrl =
Deno.env.get("VITE_CONVEX_URL") || Deno.env.get("CONVEX_URL");