fix(edge): let AI crawlers bypass botMeta OG interception for /raw/*.md

Wayne Sutton
2025-12-23 23:19:40 -08:00
parent e8c8218c81
commit 6a49583da5
3 changed files with 119 additions and 28 deletions

CHANGELOG.md

@@ -4,6 +4,24 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
+## [1.24.5] - 2025-12-23
+### Fixed
+- AI crawlers (ChatGPT, Perplexity) can now fetch raw markdown from `/raw/*.md` URLs
+- Added explicit `/raw/*` redirect passthrough in `netlify.toml` before SPA fallback
+- Expanded `excludedPath` array to cover all static file patterns
+- Refactored `botMeta.ts` edge function:
+  - Added hard bypass at top of handler for static file paths
+  - Separated social preview bots from AI crawlers
+  - AI crawlers (GPTBot, ClaudeBot, PerplexityBot, etc.) now bypass OG interception
+  - Only social preview bots (Facebook, Twitter, LinkedIn, etc.) receive OG metadata HTML
+### Technical
+- `netlify.toml`: Added `force = true` to `/raw/*` redirect, expanded `excludedPath` array
+- `botMeta.ts`: Complete refactor with `SOCIAL_PREVIEW_BOTS` and `AI_CRAWLERS` lists, hard path bypass
## [1.24.4] - 2025-12-23
### Added
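
A quick way to verify the behavior this entry describes is to request a raw markdown URL with an AI-crawler user agent and confirm the response is markdown rather than the OG preview HTML. A minimal sketch; the site URL and slug below are placeholders, not taken from this repo:

```ts
// Hypothetical smoke test; URL and slug are placeholders, not from this repo.
// Run with: deno run --allow-net check-raw.ts
const res = await fetch("https://example.com/raw/some-post.md", {
  headers: {
    "user-agent": "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)",
  },
});
const body = await res.text();
console.log(res.status, res.headers.get("content-type"));
// Expect 200 with text/markdown and no injected OG tags in the body.
if (body.includes('property="og:')) {
  throw new Error("OG interception still active for AI crawlers");
}
```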

netlify.toml

@@ -5,6 +5,20 @@
[build.environment]
NODE_VERSION = "20"
+# Raw markdown passthrough - explicit rule prevents SPA fallback from intercepting
+[[redirects]]
+from = "/raw/*"
+to = "/raw/:splat"
+status = 200
+force = true
+# Static files passthrough
+[[redirects]]
+from = "/assets/*"
+to = "/assets/:splat"
+status = 200
+force = true
# SPA fallback for client-side routing (must be last)
[[redirects]]
from = "/*"
@@ -44,11 +58,23 @@
path = "/api/geo"
function = "geo"
-# Open Graph bot detection (excludes raw markdown directory)
+# Open Graph bot detection for social preview cards only
+# Excludes raw markdown, static assets, and AI-consumable files
[[edge_functions]]
path = "/*"
function = "botMeta"
-excludedPath = "/raw/*"
+excludedPath = [
+  "/raw/*",
+  "/assets/*",
+  "/api/*",
+  "/.netlify/*",
+  "/favicon.ico",
+  "/favicon.svg",
+  "/robots.txt",
+  "/sitemap.xml",
+  "/llms.txt",
+  "/openapi.yaml"
+]
# Security and SEO headers
[[headers]]
@@ -60,12 +86,13 @@
Referrer-Policy = "strict-origin-when-cross-origin"
Link = "</llms.txt>; rel=\"author\""
-# Raw markdown files with proper content-type
+# Raw markdown files - AI friendly headers
[[headers]]
-for = "/raw/*.md"
+for = "/raw/*"
[headers.values]
Content-Type = "text/markdown; charset=utf-8"
+Access-Control-Allow-Origin = "*"
+Cache-Control = "public, max-age=3600"
X-Robots-Tag = "noindex"
[context.production.environment]
@@ -76,4 +103,3 @@
[context.branch-deploy.environment]
NODE_ENV = "development"
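
The header rules above can be checked the same way; a minimal sketch with a placeholder URL, where the expected values come straight from the `[headers]` block in this diff:

```ts
// Hypothetical header check for the /raw/* rules above (placeholder URL).
const res = await fetch("https://example.com/raw/some-post.md");
const expected: Record<string, string> = {
  "content-type": "text/markdown; charset=utf-8",
  "access-control-allow-origin": "*",
  "cache-control": "public, max-age=3600",
  "x-robots-tag": "noindex",
};
for (const [name, want] of Object.entries(expected)) {
  const got = res.headers.get(name);
  console.log(name, got === want ? "OK" : `got ${got}, expected ${want}`);
}
```

One ordering note on the redirects: Netlify evaluates `[[redirects]]` rules top to bottom with first match winning, so the forced `/raw/*` and `/assets/*` passthroughs must appear before the catch-all `/*` SPA fallback, or the fallback would serve `index.html` for raw markdown requests.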

botMeta.ts

@@ -1,7 +1,8 @@
import type { Context } from "@netlify/edge-functions";
-// List of known social media and search engine bots
-const BOTS = [
+// Social preview bots that need OG metadata HTML
+// These bots cannot render JavaScript and need pre-rendered OG tags
+const SOCIAL_PREVIEW_BOTS = [
"facebookexternalhit",
"twitterbot",
"linkedinbot",
@@ -12,24 +13,45 @@ const BOTS = [
"pinterest",
"opengraph",
"opengraphbot",
-"bot ",
-"crawler",
"embedly",
"vkshare",
"quora link preview",
"redditbot",
"rogerbot",
"showyoubot",
-"google",
-"bingbot",
-"baiduspider",
-"duckduckbot",
];
-function isBot(userAgent: string | null): boolean {
+// AI crawlers that should get raw content, not OG previews
+const AI_CRAWLERS = [
+"gptbot",
+"chatgpt",
+"chatgpt-user",
+"oai-searchbot",
+"claude-web",
+"claudebot",
+"anthropic",
+"anthropic-ai",
+"ccbot",
+"perplexitybot",
+"perplexity",
+"cohere-ai",
+"bytespider",
+"googleother",
+"google-extended",
+];
+// Check if user agent is a social preview bot
+function isSocialPreviewBot(userAgent: string | null): boolean {
if (!userAgent) return false;
const ua = userAgent.toLowerCase();
-return BOTS.some((bot) => ua.includes(bot));
+return SOCIAL_PREVIEW_BOTS.some((bot) => ua.includes(bot));
}
+// Check if user agent is an AI crawler
+function isAICrawler(userAgent: string | null): boolean {
+if (!userAgent) return false;
+const ua = userAgent.toLowerCase();
+return AI_CRAWLERS.some((bot) => ua.includes(bot));
+}
@@ -37,28 +59,53 @@ export default async function handler(
context: Context,
): Promise<Response> {
const url = new URL(request.url);
-const userAgent = request.headers.get("user-agent");
-// Only intercept post pages for bots
-const pathParts = url.pathname.split("/").filter(Boolean);
-// Skip if it's the home page, static assets, API routes, or raw markdown files
+// HARD BYPASS: Never intercept these paths regardless of user agent
+// This is the first check to guarantee static files are served directly
if (
-pathParts.length === 0 ||
-pathParts[0].includes(".") ||
-pathParts[0] === "api" ||
-pathParts[0] === "_next" ||
-pathParts[0] === "raw"
+url.pathname.startsWith("/raw/") ||
+url.pathname.startsWith("/assets/") ||
+url.pathname.startsWith("/api/") ||
+url.pathname.startsWith("/.netlify/") ||
+url.pathname.endsWith(".md") ||
+url.pathname.endsWith(".xml") ||
+url.pathname.endsWith(".txt") ||
+url.pathname.endsWith(".yaml") ||
+url.pathname.endsWith(".json") ||
+url.pathname.endsWith(".svg") ||
+url.pathname.endsWith(".ico") ||
+url.pathname.endsWith(".png") ||
+url.pathname.endsWith(".jpg") ||
+url.pathname.endsWith(".jpeg") ||
+url.pathname.endsWith(".gif") ||
+url.pathname.endsWith(".webp") ||
+url.pathname.endsWith(".css") ||
+url.pathname.endsWith(".js")
) {
return context.next();
}
-// If not a bot, continue to the SPA
-if (!isBot(userAgent)) {
+const userAgent = request.headers.get("user-agent");
+// Let AI crawlers through to normal content - they need raw data, not OG previews
+if (isAICrawler(userAgent)) {
return context.next();
}
-// For bots, fetch the Open Graph metadata from Convex
+// Only intercept post pages for bots
+const pathParts = url.pathname.split("/").filter(Boolean);
+// Skip home page and any path with a file extension
+if (pathParts.length === 0 || pathParts[0].includes(".")) {
+return context.next();
+}
+// Only serve OG metadata to social preview bots, not search engines or AI
+if (!isSocialPreviewBot(userAgent)) {
+return context.next();
+}
+// For social preview bots, fetch the Open Graph metadata from Convex
const slug = pathParts[0];
const convexUrl =
Deno.env.get("VITE_CONVEX_URL") || Deno.env.get("CONVEX_URL");
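
To see how the two lists partition traffic in the refactored handler, here is a self-contained sketch of the matching logic above, run against a few sample user agents. The shortened lists and sample strings are illustrative only, not taken from the repo:

```ts
// Standalone restatement of the matchers from botMeta.ts, for local testing.
// The shortened lists and sample user agents are illustrative only.
const SOCIAL_PREVIEW_BOTS = ["facebookexternalhit", "twitterbot", "linkedinbot"];
const AI_CRAWLERS = ["gptbot", "claudebot", "perplexitybot"];

const matches = (list: string[], ua: string | null): boolean =>
  ua !== null && list.some((bot) => ua.toLowerCase().includes(bot));

const samples = [
  "facebookexternalhit/1.1",                    // social preview bot
  "Mozilla/5.0 (compatible; GPTBot/1.0)",       // AI crawler
  "Mozilla/5.0 (Windows NT 10.0) Chrome/120.0", // regular browser
];
for (const ua of samples) {
  // Mirror the handler's order: the AI-crawler check runs before the social check.
  const route = matches(AI_CRAWLERS, ua)
    ? "context.next() - raw content"
    : matches(SOCIAL_PREVIEW_BOTS, ua)
      ? "pre-rendered OG HTML"
      : "context.next() - SPA";
  console.log(`${ua} -> ${route}`);
}
```

Because matching is a case-insensitive substring check, order within each list does not matter, but the handler's check order does: a user agent that matched both lists would be treated as an AI crawler and bypass OG interception.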