fix(edge): let AI crawlers bypass botMeta OG interception for /raw/*.md

This commit is contained in:
Wayne Sutton
2025-12-23 23:19:40 -08:00
parent e8c8218c81
commit 6a49583da5
3 changed files with 119 additions and 28 deletions

View File

@@ -1,7 +1,8 @@
import type { Context } from "@netlify/edge-functions";
// List of known social media and search engine bots
const BOTS = [
// Social preview bots that need OG metadata HTML
// These bots cannot render JavaScript and need pre-rendered OG tags
const SOCIAL_PREVIEW_BOTS = [
"facebookexternalhit",
"twitterbot",
"linkedinbot",
@@ -12,24 +13,45 @@ const BOTS = [
"pinterest",
"opengraph",
"opengraphbot",
"bot ",
"crawler",
"embedly",
"vkshare",
"quora link preview",
"redditbot",
"rogerbot",
"showyoubot",
"google",
"bingbot",
"baiduspider",
"duckduckbot",
];
function isBot(userAgent: string | null): boolean {
// AI crawlers that should get raw content, not OG previews
const AI_CRAWLERS = [
"gptbot",
"chatgpt",
"chatgpt-user",
"oai-searchbot",
"claude-web",
"claudebot",
"anthropic",
"anthropic-ai",
"ccbot",
"perplexitybot",
"perplexity",
"cohere-ai",
"bytespider",
"googleother",
"google-extended",
];
// Check if user agent is a social preview bot.
// These are link unfurlers (Facebook, Twitter, LinkedIn, etc.) that cannot
// execute JavaScript and therefore need pre-rendered OG metadata HTML.
// Returns false when the User-Agent header is missing or empty.
function isSocialPreviewBot(userAgent: string | null): boolean {
  if (!userAgent) return false;
  const ua = userAgent.toLowerCase();
  // Case-insensitive substring match against the known preview-bot tokens.
  return SOCIAL_PREVIEW_BOTS.some((bot) => ua.includes(bot));
}
// Check if user agent is an AI crawler. These clients are let through to
// the normal response (raw content) rather than the OG-preview HTML.
// Returns false when the User-Agent header is missing or empty.
function isAICrawler(userAgent: string | null): boolean {
  if (!userAgent) return false;
  const normalized = userAgent.toLowerCase();
  for (const token of AI_CRAWLERS) {
    if (normalized.includes(token)) {
      return true;
    }
  }
  return false;
}
export default async function handler(
@@ -37,28 +59,53 @@ export default async function handler(
context: Context,
): Promise<Response> {
const url = new URL(request.url);
const userAgent = request.headers.get("user-agent");
// Only intercept post pages for bots
const pathParts = url.pathname.split("/").filter(Boolean);
// Skip if it's the home page, static assets, API routes, or raw markdown files
// HARD BYPASS: Never intercept these paths regardless of user agent
// This is the first check to guarantee static files are served directly
if (
pathParts.length === 0 ||
pathParts[0].includes(".") ||
pathParts[0] === "api" ||
pathParts[0] === "_next" ||
pathParts[0] === "raw"
url.pathname.startsWith("/raw/") ||
url.pathname.startsWith("/assets/") ||
url.pathname.startsWith("/api/") ||
url.pathname.startsWith("/.netlify/") ||
url.pathname.endsWith(".md") ||
url.pathname.endsWith(".xml") ||
url.pathname.endsWith(".txt") ||
url.pathname.endsWith(".yaml") ||
url.pathname.endsWith(".json") ||
url.pathname.endsWith(".svg") ||
url.pathname.endsWith(".ico") ||
url.pathname.endsWith(".png") ||
url.pathname.endsWith(".jpg") ||
url.pathname.endsWith(".jpeg") ||
url.pathname.endsWith(".gif") ||
url.pathname.endsWith(".webp") ||
url.pathname.endsWith(".css") ||
url.pathname.endsWith(".js")
) {
return context.next();
}
// If not a bot, continue to the SPA
if (!isBot(userAgent)) {
const userAgent = request.headers.get("user-agent");
// Let AI crawlers through to normal content - they need raw data, not OG previews
if (isAICrawler(userAgent)) {
return context.next();
}
// For bots, fetch the Open Graph metadata from Convex
// Only intercept post pages for bots
const pathParts = url.pathname.split("/").filter(Boolean);
// Skip home page and any path with a file extension
if (pathParts.length === 0 || pathParts[0].includes(".")) {
return context.next();
}
// Only serve OG metadata to social preview bots, not search engines or AI
if (!isSocialPreviewBot(userAgent)) {
return context.next();
}
// For social preview bots, fetch the Open Graph metadata from Convex
const slug = pathParts[0];
const convexUrl =
Deno.env.get("VITE_CONVEX_URL") || Deno.env.get("CONVEX_URL");