Files
wiki/netlify/edge-functions/botMeta.ts

180 lines
5.2 KiB
TypeScript

import type { Context } from "@netlify/edge-functions";
// =============================================================================
// BOT DETECTION CONFIGURATION
// =============================================================================
// Customize these arrays to control which bots receive pre-rendered HTML
// with correct SEO meta tags (canonical URLs, Open Graph, etc.)
//
// - SOCIAL_PREVIEW_BOTS: Bots that generate link previews (Twitter, Slack, etc.)
// - SEARCH_ENGINE_BOTS: Search engine crawlers for SEO (Google, Bing, etc.)
// - AI_CRAWLERS: AI agents that should get the raw SPA (can render JavaScript)
//
// How it works:
// - Social + Search bots -> Pre-rendered HTML with correct canonical/meta tags
// - AI crawlers -> Normal SPA (they can render JavaScript and want raw content)
// - Regular browsers -> Normal SPA (React updates meta tags client-side)
// =============================================================================
/**
 * User-agent tokens of link-preview bots (social networks, chat apps).
 *
 * These bots are served pre-rendered HTML carrying Open Graph metadata,
 * because they cannot render JavaScript. Tokens are lower-case and are
 * matched as substrings of the lower-cased User-Agent header, so a short
 * token such as "opengraph" also covers longer ones such as "opengraphbot".
 */
const SOCIAL_PREVIEW_BOTS = [
  "facebookexternalhit",
  "twitterbot",
  "linkedinbot",
  "slackbot",
  "discordbot",
  "telegrambot",
  "whatsapp",
  "pinterest",
  "opengraph",
  "opengraphbot",
  "embedly",
  "vkshare",
  "quora link preview",
  "redditbot",
  "rogerbot",
  "showyoubot",
];
/**
 * User-agent tokens of search engine crawlers.
 *
 * These crawlers are served pre-rendered HTML so that the canonical URL and
 * meta tags are already correct in the raw markup; they may not execute
 * JavaScript, or may inspect the raw HTML before rendering. Tokens are
 * lower-case and are matched as substrings of the lower-cased User-Agent.
 */
const SEARCH_ENGINE_BOTS = [
  "googlebot",
  "bingbot",
  "yandexbot",
  "duckduckbot",
  "baiduspider",
  "sogou",
  "yahoo! slurp",
  "applebot",
];
/**
 * User-agent tokens of AI crawlers.
 *
 * These agents are passed through to the normal site content instead of the
 * Open Graph preview HTML. The request handler checks this list before the
 * social and search lists, so a user agent matching here is never served the
 * pre-rendered page. Tokens are lower-case and are matched as substrings of
 * the lower-cased User-Agent.
 */
const AI_CRAWLERS = [
  "gptbot",
  "chatgpt",
  "chatgpt-user",
  "oai-searchbot",
  "claude-web",
  "claudebot",
  "anthropic",
  "anthropic-ai",
  "ccbot",
  "perplexitybot",
  "perplexity",
  "cohere-ai",
  "bytespider",
  "googleother",
  "google-extended",
];
/**
 * Reports whether a User-Agent header belongs to a social link-preview bot.
 *
 * @param userAgent - Raw User-Agent header value, or null when absent.
 * @returns true when the lower-cased header contains any token from
 *   SOCIAL_PREVIEW_BOTS; false for a null or empty header.
 */
function isSocialPreviewBot(userAgent: string | null): boolean {
  if (userAgent === null || userAgent === "") {
    return false;
  }

  // Tokens are stored lower-case, so normalise the header once up front.
  const haystack = userAgent.toLowerCase();
  for (const token of SOCIAL_PREVIEW_BOTS) {
    if (haystack.includes(token)) {
      return true;
    }
  }
  return false;
}
/**
 * Reports whether a User-Agent header belongs to an AI crawler.
 *
 * @param userAgent - Raw User-Agent header value, or null when absent.
 * @returns true when the lower-cased header contains any token from
 *   AI_CRAWLERS; false for a null or empty header.
 */
function isAICrawler(userAgent: string | null): boolean {
  if (userAgent === null || userAgent === "") {
    return false;
  }

  // Tokens are stored lower-case, so normalise the header once up front.
  const haystack = userAgent.toLowerCase();
  for (const token of AI_CRAWLERS) {
    if (haystack.includes(token)) {
      return true;
    }
  }
  return false;
}
/**
 * Reports whether a User-Agent header belongs to a search engine crawler.
 *
 * @param userAgent - Raw User-Agent header value, or null when absent.
 * @returns true when the lower-cased header contains any token from
 *   SEARCH_ENGINE_BOTS; false for a null or empty header.
 */
function isSearchEngineBot(userAgent: string | null): boolean {
  if (userAgent === null || userAgent === "") {
    return false;
  }

  // Tokens are stored lower-case, so normalise the header once up front.
  const haystack = userAgent.toLowerCase();
  for (const token of SEARCH_ENGINE_BOTS) {
    if (haystack.includes(token)) {
      return true;
    }
  }
  return false;
}
// Path prefixes that are always passed straight through, whatever the user agent.
const BOT_META_BYPASS_PREFIXES = ["/raw/", "/assets/", "/api/", "/.netlify/"];

// File extensions that mark static files; such paths are never intercepted.
const BOT_META_BYPASS_EXTENSIONS = [
  ".md",
  ".xml",
  ".txt",
  ".yaml",
  ".json",
  ".svg",
  ".ico",
  ".png",
  ".jpg",
  ".jpeg",
  ".gif",
  ".webp",
  ".css",
  ".js",
];

/** Returns true when the path must be served directly, regardless of user agent. */
function isBotMetaBypassPath(pathname: string): boolean {
  return (
    BOT_META_BYPASS_PREFIXES.some((prefix) => pathname.startsWith(prefix)) ||
    BOT_META_BYPASS_EXTENSIONS.some((extension) => pathname.endsWith(extension))
  );
}

/**
 * Derives the base URL of the Convex HTTP endpoint from the deployment URL.
 *
 * Only a ".cloud" suffix of the hostname is rewritten to ".site"; a ".cloud"
 * occurring inside a host label (e.g. ".cloudapp") or in the path is left
 * alone. Trailing slashes are dropped so a path can be appended directly.
 *
 * @throws TypeError when convexUrl is not a valid absolute URL.
 */
function toConvexSiteBaseUrl(convexUrl: string): string {
  const cloudSuffix = ".cloud";
  const base = new URL(convexUrl);
  if (base.hostname.endsWith(cloudSuffix)) {
    base.hostname = `${base.hostname.slice(0, -cloudSuffix.length)}.site`;
  }
  return `${base.origin}${base.pathname.replace(/\/+$/, "")}`;
}

/**
 * Edge function entry point: serves pre-rendered meta HTML to social preview
 * and search engine bots, and passes every other request through unchanged.
 *
 * A request is passed through via `context.next()` when any of these holds:
 * - the path is a static/API path (see isBotMetaBypassPath);
 * - the user agent is an AI crawler;
 * - the path is the home page or its first segment contains a ".";
 * - the user agent is neither a social preview bot nor a search engine bot;
 * - no Convex URL is configured in the environment;
 * - the meta endpoint responds with a non-OK status or the lookup throws.
 *
 * @param request - Incoming request.
 * @param context - Netlify edge context; `next()` continues normal handling.
 * @returns The pre-rendered HTML response for matching bots, otherwise the
 *   result of `context.next()`.
 */
export default async function handler(
  request: Request,
  context: Context,
): Promise<Response> {
  const url = new URL(request.url);

  // HARD BYPASS: checked first so static files are always served directly.
  if (isBotMetaBypassPath(url.pathname)) {
    return context.next();
  }

  const userAgent = request.headers.get("user-agent");

  // AI crawlers get the normal content rather than the OG preview HTML.
  if (isAICrawler(userAgent)) {
    return context.next();
  }

  // Only the first path segment is used as the post slug.
  // NOTE(review): a nested path such as /a/b looks up slug "a" - confirm intended.
  const [slug] = url.pathname.split("/").filter(Boolean);

  // Skip the home page and any first segment that looks like a file name.
  if (slug === undefined || slug.includes(".")) {
    return context.next();
  }

  // Everyone except social preview and search engine bots gets the normal site.
  if (!isSocialPreviewBot(userAgent) && !isSearchEngineBot(userAgent)) {
    return context.next();
  }

  // `||` (not `??`) on purpose: an empty VITE_CONVEX_URL falls back to CONVEX_URL.
  const convexUrl =
    Deno.env.get("VITE_CONVEX_URL") || Deno.env.get("CONVEX_URL");
  if (!convexUrl) {
    return context.next();
  }

  try {
    const metaUrl = `${toConvexSiteBaseUrl(convexUrl)}/meta/post?slug=${encodeURIComponent(slug)}`;
    const response = await fetch(metaUrl, {
      headers: {
        Accept: "text/html",
      },
    });

    if (!response.ok) {
      // Release the unread body before falling back to the normal site.
      await response.body?.cancel();
      return context.next();
    }

    const html = await response.text();
    return new Response(html, {
      headers: {
        "Content-Type": "text/html; charset=utf-8",
        "Cache-Control": "public, max-age=60, s-maxage=300",
        // The body depends on the User-Agent, so shared caches must key on it
        // and never hand this bot-only HTML to a regular browser.
        Vary: "User-Agent",
      },
    });
  } catch (error: unknown) {
    // Best effort: record the failure, then fall back to the normal site.
    console.warn(`botMeta: meta lookup failed for slug "${slug}"`, error);
    return context.next();
  }
}