From 6a49583da58d500ceb00e20f497376ca8fc08815 Mon Sep 17 00:00:00 2001 From: Wayne Sutton Date: Tue, 23 Dec 2025 23:19:40 -0800 Subject: [PATCH] fix(edge): let AI crawlers bypass botMeta OG interception for /raw/*.md --- changelog.md | 18 ++++++ netlify.toml | 36 ++++++++++-- netlify/edge-functions/botMeta.ts | 93 +++++++++++++++++++++++-------- 3 files changed, 119 insertions(+), 28 deletions(-) diff --git a/changelog.md b/changelog.md index 2e41225..81f8da1 100644 --- a/changelog.md +++ b/changelog.md @@ -4,6 +4,24 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [1.24.5] - 2025-12-23 + +### Fixed + +- AI crawlers (ChatGPT, Perplexity) can now fetch raw markdown from `/raw/*.md` URLs + - Added explicit `/raw/*` redirect passthrough in `netlify.toml` before SPA fallback + - Expanded `excludedPath` array to cover all static file patterns + - Refactored `botMeta.ts` edge function: + - Added hard bypass at top of handler for static file paths + - Separated social preview bots from AI crawlers + - AI crawlers (GPTBot, ClaudeBot, PerplexityBot, etc.) now bypass OG interception + - Only social preview bots (Facebook, Twitter, LinkedIn, etc.) 
receive OG metadata HTML + +### Technical + +- `netlify.toml`: Added `force = true` to `/raw/*` redirect, expanded `excludedPath` array +- `botMeta.ts`: Complete refactor with `SOCIAL_PREVIEW_BOTS` and `AI_CRAWLERS` lists, hard path bypass + ## [1.24.4] - 2025-12-23 ### Added diff --git a/netlify.toml b/netlify.toml index 86719c0..ae777be 100644 --- a/netlify.toml +++ b/netlify.toml @@ -5,6 +5,20 @@ [build.environment] NODE_VERSION = "20" +# Raw markdown passthrough - explicit rule prevents SPA fallback from intercepting +[[redirects]] + from = "/raw/*" + to = "/raw/:splat" + status = 200 + force = true + +# Static files passthrough +[[redirects]] + from = "/assets/*" + to = "/assets/:splat" + status = 200 + force = true + # SPA fallback for client-side routing (must be last) [[redirects]] from = "/*" @@ -44,11 +58,23 @@ path = "/api/geo" function = "geo" -# Open Graph bot detection (excludes raw markdown directory) +# Open Graph bot detection for social preview cards only +# Excludes raw markdown, static assets, and AI-consumable files [[edge_functions]] path = "/*" function = "botMeta" - excludedPath = "/raw/*" + excludedPath = [ + "/raw/*", + "/assets/*", + "/api/*", + "/.netlify/*", + "/favicon.ico", + "/favicon.svg", + "/robots.txt", + "/sitemap.xml", + "/llms.txt", + "/openapi.yaml" + ] # Security and SEO headers [[headers]] @@ -60,12 +86,13 @@ Referrer-Policy = "strict-origin-when-cross-origin" Link = "; rel=\"author\"" -# Raw markdown files with proper content-type +# Raw markdown files - AI friendly headers [[headers]] - for = "/raw/*.md" + for = "/raw/*" [headers.values] Content-Type = "text/markdown; charset=utf-8" Access-Control-Allow-Origin = "*" + Cache-Control = "public, max-age=3600" X-Robots-Tag = "noindex" [context.production.environment] @@ -76,4 +103,3 @@ [context.branch-deploy.environment] NODE_ENV = "development" - diff --git a/netlify/edge-functions/botMeta.ts b/netlify/edge-functions/botMeta.ts index fbf2140..0b1b3c8 100644 --- 
a/netlify/edge-functions/botMeta.ts +++ b/netlify/edge-functions/botMeta.ts @@ -1,7 +1,8 @@ import type { Context } from "@netlify/edge-functions"; -// List of known social media and search engine bots -const BOTS = [ +// Social preview bots that need OG metadata HTML +// These bots cannot render JavaScript and need pre-rendered OG tags +const SOCIAL_PREVIEW_BOTS = [ "facebookexternalhit", "twitterbot", "linkedinbot", @@ -12,24 +13,45 @@ const BOTS = [ "pinterest", "opengraph", "opengraphbot", - "bot ", - "crawler", "embedly", "vkshare", "quora link preview", "redditbot", "rogerbot", "showyoubot", - "google", - "bingbot", - "baiduspider", - "duckduckbot", ]; -function isBot(userAgent: string | null): boolean { +// AI crawlers that should get raw content, not OG previews +const AI_CRAWLERS = [ + "gptbot", + "chatgpt", + "chatgpt-user", + "oai-searchbot", + "claude-web", + "claudebot", + "anthropic", + "anthropic-ai", + "ccbot", + "perplexitybot", + "perplexity", + "cohere-ai", + "bytespider", + "googleother", + "google-extended", +]; + +// Check if user agent is a social preview bot +function isSocialPreviewBot(userAgent: string | null): boolean { if (!userAgent) return false; const ua = userAgent.toLowerCase(); - return BOTS.some((bot) => ua.includes(bot)); + return SOCIAL_PREVIEW_BOTS.some((bot) => ua.includes(bot)); +} + +// Check if user agent is an AI crawler +function isAICrawler(userAgent: string | null): boolean { + if (!userAgent) return false; + const ua = userAgent.toLowerCase(); + return AI_CRAWLERS.some((bot) => ua.includes(bot)); } export default async function handler( @@ -37,28 +59,53 @@ request: Request, context: Context, ): Promise<Response> { const url = new URL(request.url); - const userAgent = request.headers.get("user-agent"); - // Only intercept post pages for bots - const pathParts = url.pathname.split("/").filter(Boolean); - - // Skip if it's the home page, static assets, API routes, or raw markdown files + // HARD BYPASS: Never
intercept these paths regardless of user agent + // This is the first check to guarantee static files are served directly if ( - pathParts.length === 0 || - pathParts[0].includes(".") || - pathParts[0] === "api" || - pathParts[0] === "_next" || - pathParts[0] === "raw" + url.pathname.startsWith("/raw/") || + url.pathname.startsWith("/assets/") || + url.pathname.startsWith("/api/") || + url.pathname.startsWith("/.netlify/") || + url.pathname.endsWith(".md") || + url.pathname.endsWith(".xml") || + url.pathname.endsWith(".txt") || + url.pathname.endsWith(".yaml") || + url.pathname.endsWith(".json") || + url.pathname.endsWith(".svg") || + url.pathname.endsWith(".ico") || + url.pathname.endsWith(".png") || + url.pathname.endsWith(".jpg") || + url.pathname.endsWith(".jpeg") || + url.pathname.endsWith(".gif") || + url.pathname.endsWith(".webp") || + url.pathname.endsWith(".css") || + url.pathname.endsWith(".js") ) { return context.next(); } - // If not a bot, continue to the SPA - if (!isBot(userAgent)) { + const userAgent = request.headers.get("user-agent"); + + // Let AI crawlers through to normal content - they need raw data, not OG previews + if (isAICrawler(userAgent)) { return context.next(); } - // For bots, fetch the Open Graph metadata from Convex + // Only intercept post pages for bots + const pathParts = url.pathname.split("/").filter(Boolean); + + // Skip home page and any path with a file extension + if (pathParts.length === 0 || pathParts[0].includes(".")) { + return context.next(); + } + + // Only serve OG metadata to social preview bots, not search engines or AI + if (!isSocialPreviewBot(userAgent)) { + return context.next(); + } + + // For social preview bots, fetch the Open Graph metadata from Convex const slug = pathParts[0]; const convexUrl = Deno.env.get("VITE_CONVEX_URL") || Deno.env.get("CONVEX_URL");