import type { Context } from "@netlify/edge-functions";

// =============================================================================
// BOT DETECTION CONFIGURATION
// =============================================================================
// Customize these arrays to control which bots receive pre-rendered HTML
// with correct SEO meta tags (canonical URLs, Open Graph, etc.)
//
// - SOCIAL_PREVIEW_BOTS: Bots that generate link previews (Twitter, Slack, etc.)
// - SEARCH_ENGINE_BOTS: Search engine crawlers for SEO (Google, Bing, etc.)
// - AI_CRAWLERS: AI agents that should get the raw SPA (can render JavaScript)
//
// How it works:
// - Social + Search bots -> Pre-rendered HTML with correct canonical/meta tags
// - AI crawlers -> Normal SPA (they can render JavaScript and want raw content)
// - Regular browsers -> Normal SPA (React updates meta tags client-side)
// =============================================================================

// Social preview bots that need OG metadata HTML
// These bots cannot render JavaScript and need pre-rendered OG tags
const SOCIAL_PREVIEW_BOTS = [
  "facebookexternalhit",
  "twitterbot",
  "linkedinbot",
  "slackbot",
  "discordbot",
  "telegrambot",
  "whatsapp",
  "pinterest",
  "opengraph",
  "opengraphbot",
  "embedly",
  "vkshare",
  "quora link preview",
  "redditbot",
  "rogerbot",
  "showyoubot",
];

// Search engine crawlers that need correct canonical URLs in raw HTML
// These bots may not render JavaScript, or may check the raw HTML first
const SEARCH_ENGINE_BOTS = [
  "googlebot",
  "bingbot",
  "yandexbot",
  "duckduckbot",
  "baiduspider",
  "sogou",
  "yahoo! slurp",
  "applebot",
];

// AI crawlers that should get raw content, not OG previews
const AI_CRAWLERS = [
  "gptbot",
  "chatgpt",
  "chatgpt-user",
  "oai-searchbot",
  "claude-web",
  "claudebot",
  "anthropic",
  "anthropic-ai",
  "ccbot",
  "perplexitybot",
  "perplexity",
  "cohere-ai",
  "bytespider",
  "googleother",
  "google-extended",
];
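
// Note: matching is by substring (see the helpers below), so some entries
// above overlap ("anthropic" already covers "anthropic-ai", "perplexity"
// covers "perplexitybot"); the duplicates are harmless.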

// Check if user agent is a social preview bot
function isSocialPreviewBot(userAgent: string | null): boolean {
  if (!userAgent) return false;
  const ua = userAgent.toLowerCase();
  return SOCIAL_PREVIEW_BOTS.some((bot) => ua.includes(bot));
}

// Check if user agent is an AI crawler
function isAICrawler(userAgent: string | null): boolean {
  if (!userAgent) return false;
  const ua = userAgent.toLowerCase();
  return AI_CRAWLERS.some((bot) => ua.includes(bot));
}

// Check if user agent is a search engine bot
function isSearchEngineBot(userAgent: string | null): boolean {
  if (!userAgent) return false;
  const ua = userAgent.toLowerCase();
  return SEARCH_ENGINE_BOTS.some((bot) => ua.includes(bot));
}
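
// Quick sanity check of the matchers (illustrative examples, not part of the
// original file; matching is substring-based and case-insensitive):
//   isSocialPreviewBot("Twitterbot/1.0")                          // true
//   isSearchEngineBot("Mozilla/5.0 (compatible; Googlebot/2.1)")  // true
//   isAICrawler("GPTBot/1.0")                                     // true
//   isSocialPreviewBot(null)                                      // false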

export default async function handler(
  request: Request,
  context: Context,
): Promise<Response> {
  const url = new URL(request.url);

  // HARD BYPASS: Never intercept these paths regardless of user agent
  // This is the first check to guarantee static files are served directly
  if (
    url.pathname.startsWith("/raw/") ||
    url.pathname.startsWith("/assets/") ||
    url.pathname.startsWith("/api/") ||
    url.pathname.startsWith("/.netlify/") ||
    url.pathname.endsWith(".md") ||
    url.pathname.endsWith(".xml") ||
    url.pathname.endsWith(".txt") ||
    url.pathname.endsWith(".yaml") ||
    url.pathname.endsWith(".json") ||
    url.pathname.endsWith(".svg") ||
    url.pathname.endsWith(".ico") ||
    url.pathname.endsWith(".png") ||
    url.pathname.endsWith(".jpg") ||
    url.pathname.endsWith(".jpeg") ||
    url.pathname.endsWith(".gif") ||
    url.pathname.endsWith(".webp") ||
    url.pathname.endsWith(".css") ||
    url.pathname.endsWith(".js")
  ) {
    return context.next();
  }

  const userAgent = request.headers.get("user-agent");

  // Let AI crawlers through to normal content - they need raw data, not OG previews
  if (isAICrawler(userAgent)) {
    return context.next();
  }

  // Only intercept post pages for bots
  const pathParts = url.pathname.split("/").filter(Boolean);

  // Skip home page and any path with a file extension
  if (pathParts.length === 0 || pathParts[0].includes(".")) {
    return context.next();
  }

  // Only social preview and search engine bots get pre-rendered HTML with
  // correct canonical/meta tags; everyone else falls through to the SPA
  if (!isSocialPreviewBot(userAgent) && !isSearchEngineBot(userAgent)) {
    return context.next();
  }

  // For social preview and search engine bots, fetch the Open Graph metadata from Convex
  const slug = pathParts[0];
  const convexUrl =
    Deno.env.get("VITE_CONVEX_URL") || Deno.env.get("CONVEX_URL");

  if (!convexUrl) {
    return context.next();
  }

  try {
    // Construct the Convex site URL for the HTTP endpoint
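    // (Convex serves HTTP endpoints from the deployment's .convex.site domain,
    // while the client URL ends in .convex.cloud)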
    const convexSiteUrl = convexUrl.replace(".cloud", ".site");
    const metaUrl = `${convexSiteUrl}/meta/post?slug=${encodeURIComponent(slug)}`;
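    // e.g. https://<deployment>.convex.site/meta/post?slug=my-post
    // (the deployment name and slug shown are hypothetical)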

    const response = await fetch(metaUrl, {
      headers: {
        Accept: "text/html",
      },
    });

    if (response.ok) {
      const html = await response.text();
      return new Response(html, {
        headers: {
          "Content-Type": "text/html; charset=utf-8",
          "Cache-Control": "public, max-age=60, s-maxage=300",
        },
      });
    }

    // If meta endpoint fails, fall back to SPA
    return context.next();
  } catch {
    return context.next();
  }
}
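
// Not part of the original file: a hedged sketch of how this edge function
// could be scoped. Netlify Edge Functions are registered either in
// netlify.toml or via an inline config export like the one below; the "/*"
// path pattern is an assumption, and this repo may declare the function in
// netlify.toml instead.
//
// import type { Config } from "@netlify/edge-functions";
//
// export const config: Config = {
//   path: "/*",
// };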