mirror of
https://github.com/waynesutton/markdown-site.git
synced 2026-01-12 04:09:14 +00:00
fix(edge): let AI crawlers bypass botMeta OG interception for /raw/*.md
This commit is contained in:
@@ -1,7 +1,8 @@
|
||||
import type { Context } from "@netlify/edge-functions";
|
||||
|
||||
// List of known social media and search engine bots
|
||||
const BOTS = [
|
||||
// Social preview bots that need OG metadata HTML
|
||||
// These bots cannot render JavaScript and need pre-rendered OG tags
|
||||
const SOCIAL_PREVIEW_BOTS = [
|
||||
"facebookexternalhit",
|
||||
"twitterbot",
|
||||
"linkedinbot",
|
||||
@@ -12,24 +13,45 @@ const BOTS = [
|
||||
"pinterest",
|
||||
"opengraph",
|
||||
"opengraphbot",
|
||||
"bot ",
|
||||
"crawler",
|
||||
"embedly",
|
||||
"vkshare",
|
||||
"quora link preview",
|
||||
"redditbot",
|
||||
"rogerbot",
|
||||
"showyoubot",
|
||||
"google",
|
||||
"bingbot",
|
||||
"baiduspider",
|
||||
"duckduckbot",
|
||||
];
|
||||
|
||||
function isBot(userAgent: string | null): boolean {
|
||||
// AI crawlers that should get raw content, not OG previews.
// Entries are lowercase substrings: isAICrawler() lowercases the User-Agent
// and tests it with String.prototype.includes, so every entry must stay
// lowercase to be able to match.
// NOTE(review): because matching is by substring, "chatgpt" already covers
// "chatgpt-user", "anthropic" covers "anthropic-ai", and "perplexity" covers
// "perplexitybot"; the longer entries are redundant but harmless.
|
||||
const AI_CRAWLERS = [
|
||||
"gptbot",
|
||||
"chatgpt",
|
||||
"chatgpt-user",
|
||||
"oai-searchbot",
|
||||
"claude-web",
|
||||
"claudebot",
|
||||
"anthropic",
|
||||
"anthropic-ai",
|
||||
"ccbot",
|
||||
"perplexitybot",
|
||||
"perplexity",
|
||||
"cohere-ai",
|
||||
"bytespider",
|
||||
"googleother",
|
||||
"google-extended",
|
||||
];
|
||||
|
||||
// Check if user agent is a social preview bot.
// Returns true when the lowercased User-Agent contains any entry of
// SOCIAL_PREVIEW_BOTS as a substring. A null or empty User-Agent is never
// treated as a bot.
function isSocialPreviewBot(userAgent: string | null): boolean {
  if (!userAgent) return false;
  // List entries are lowercase, so normalise the header before comparing.
  const ua = userAgent.toLowerCase();
  // Match against the social-preview list only. The earlier
  // `return BOTS.some(...)` used the pre-rename list name and made this
  // return unreachable.
  return SOCIAL_PREVIEW_BOTS.some((bot) => ua.includes(bot));
}
|
||||
|
||||
// Check if user agent is an AI crawler.
// Returns true when the lowercased User-Agent contains any AI_CRAWLERS entry
// as a substring; returns false for a null or empty User-Agent.
|
||||
function isAICrawler(userAgent: string | null): boolean {
|
||||
// Missing or empty header: nothing to match against.
if (!userAgent) return false;
|
||||
// AI_CRAWLERS entries are lowercase, so compare against a lowercased copy.
const ua = userAgent.toLowerCase();
|
||||
return AI_CRAWLERS.some((bot) => ua.includes(bot));
|
||||
}
|
||||
|
||||
export default async function handler(
|
||||
@@ -37,28 +59,53 @@ export default async function handler(
|
||||
context: Context,
|
||||
): Promise<Response> {
|
||||
const url = new URL(request.url);
|
||||
const userAgent = request.headers.get("user-agent");
|
||||
|
||||
// Only intercept post pages for bots
|
||||
const pathParts = url.pathname.split("/").filter(Boolean);
|
||||
|
||||
// Skip if it's the home page, static assets, API routes, or raw markdown files
|
||||
// HARD BYPASS: Never intercept these paths regardless of user agent
|
||||
// This is the first check to guarantee static files are served directly
|
||||
if (
|
||||
pathParts.length === 0 ||
|
||||
pathParts[0].includes(".") ||
|
||||
pathParts[0] === "api" ||
|
||||
pathParts[0] === "_next" ||
|
||||
pathParts[0] === "raw"
|
||||
url.pathname.startsWith("/raw/") ||
|
||||
url.pathname.startsWith("/assets/") ||
|
||||
url.pathname.startsWith("/api/") ||
|
||||
url.pathname.startsWith("/.netlify/") ||
|
||||
url.pathname.endsWith(".md") ||
|
||||
url.pathname.endsWith(".xml") ||
|
||||
url.pathname.endsWith(".txt") ||
|
||||
url.pathname.endsWith(".yaml") ||
|
||||
url.pathname.endsWith(".json") ||
|
||||
url.pathname.endsWith(".svg") ||
|
||||
url.pathname.endsWith(".ico") ||
|
||||
url.pathname.endsWith(".png") ||
|
||||
url.pathname.endsWith(".jpg") ||
|
||||
url.pathname.endsWith(".jpeg") ||
|
||||
url.pathname.endsWith(".gif") ||
|
||||
url.pathname.endsWith(".webp") ||
|
||||
url.pathname.endsWith(".css") ||
|
||||
url.pathname.endsWith(".js")
|
||||
) {
|
||||
return context.next();
|
||||
}
|
||||
|
||||
// If not a bot, continue to the SPA
|
||||
if (!isBot(userAgent)) {
|
||||
const userAgent = request.headers.get("user-agent");
|
||||
|
||||
// Let AI crawlers through to normal content - they need raw data, not OG previews
|
||||
if (isAICrawler(userAgent)) {
|
||||
return context.next();
|
||||
}
|
||||
|
||||
// For bots, fetch the Open Graph metadata from Convex
|
||||
// Only intercept post pages for bots
|
||||
const pathParts = url.pathname.split("/").filter(Boolean);
|
||||
|
||||
// Skip home page and any path with a file extension
|
||||
if (pathParts.length === 0 || pathParts[0].includes(".")) {
|
||||
return context.next();
|
||||
}
|
||||
|
||||
// Only serve OG metadata to social preview bots, not search engines or AI
|
||||
if (!isSocialPreviewBot(userAgent)) {
|
||||
return context.next();
|
||||
}
|
||||
|
||||
// For social preview bots, fetch the Open Graph metadata from Convex
|
||||
const slug = pathParts[0];
|
||||
const convexUrl =
|
||||
Deno.env.get("VITE_CONVEX_URL") || Deno.env.get("CONVEX_URL");
|
||||
|
||||
Reference in New Issue
Block a user