Mirror of https://github.com/waynesutton/markdown-site.git
Synced 2026-01-12 04:09:14 +00:00
fix(edge): let AI crawlers bypass botMeta OG interception for /raw/*.md
changelog.md (18 lines changed)
@@ -4,6 +4,24 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
+## [1.24.5] - 2025-12-23
+
+### Fixed
+
+- AI crawlers (ChatGPT, Perplexity) can now fetch raw markdown from `/raw/*.md` URLs
+- Added explicit `/raw/*` redirect passthrough in `netlify.toml` before SPA fallback
+- Expanded `excludedPath` array to cover all static file patterns
+- Refactored `botMeta.ts` edge function:
+  - Added hard bypass at top of handler for static file paths
+  - Separated social preview bots from AI crawlers
+  - AI crawlers (GPTBot, ClaudeBot, PerplexityBot, etc.) now bypass OG interception
+  - Only social preview bots (Facebook, Twitter, LinkedIn, etc.) receive OG metadata HTML
+
+### Technical
+
+- `netlify.toml`: Added `force = true` to `/raw/*` redirect, expanded `excludedPath` array
+- `botMeta.ts`: Complete refactor with `SOCIAL_PREVIEW_BOTS` and `AI_CRAWLERS` lists, hard path bypass
+
 ## [1.24.4] - 2025-12-23
 
 ### Added
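The split described above is straightforward to smoke-test once deployed: an AI crawler requesting raw markdown should pass straight through, while a social preview bot requesting a post page should still get the pre-rendered OG document. A minimal sketch in TypeScript (Deno-style top-level await; the site URL and post slug are placeholders, not values from this repo):

```ts
// Placeholder deployment URL and slug - substitute real values before running.
const SITE = "https://example.com";

async function check(path: string, userAgent: string): Promise<void> {
  const res = await fetch(`${SITE}${path}`, {
    headers: { "user-agent": userAgent },
  });
  console.log(`${userAgent} ${path} -> ${res.status} ${res.headers.get("content-type")}`);
}

// An AI crawler fetching raw markdown should get text/markdown, not OG HTML.
await check("/raw/hello-world.md", "GPTBot/1.0");
// A social preview bot on a post page should still get the OG metadata HTML.
await check("/hello-world", "Twitterbot/1.0");
```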
netlify.toml (36 lines changed)
@@ -5,6 +5,20 @@
 [build.environment]
 NODE_VERSION = "20"
 
+# Raw markdown passthrough - explicit rule prevents SPA fallback from intercepting
+[[redirects]]
+from = "/raw/*"
+to = "/raw/:splat"
+status = 200
+force = true
+
+# Static files passthrough
+[[redirects]]
+from = "/assets/*"
+to = "/assets/:splat"
+status = 200
+force = true
+
 # SPA fallback for client-side routing (must be last)
 [[redirects]]
 from = "/*"
@@ -44,11 +58,23 @@
 path = "/api/geo"
 function = "geo"
 
-# Open Graph bot detection (excludes raw markdown directory)
+# Open Graph bot detection for social preview cards only
+# Excludes raw markdown, static assets, and AI-consumable files
 [[edge_functions]]
 path = "/*"
 function = "botMeta"
-excludedPath = "/raw/*"
+excludedPath = [
+  "/raw/*",
+  "/assets/*",
+  "/api/*",
+  "/.netlify/*",
+  "/favicon.ico",
+  "/favicon.svg",
+  "/robots.txt",
+  "/sitemap.xml",
+  "/llms.txt",
+  "/openapi.yaml"
+]
 
 # Security and SEO headers
 [[headers]]
@@ -60,12 +86,13 @@
 Referrer-Policy = "strict-origin-when-cross-origin"
 Link = "</llms.txt>; rel=\"author\""
 
-# Raw markdown files with proper content-type
+# Raw markdown files - AI friendly headers
 [[headers]]
-for = "/raw/*.md"
+for = "/raw/*"
 [headers.values]
 Content-Type = "text/markdown; charset=utf-8"
+Access-Control-Allow-Origin = "*"
 Cache-Control = "public, max-age=3600"
 X-Robots-Tag = "noindex"
 
 [context.production.environment]
@@ -76,4 +103,3 @@
 
 [context.branch-deploy.environment]
 NODE_ENV = "development"
-
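The `/raw/*` headers block above is what makes the raw files AI-friendly: a markdown content type, open CORS, an hour of caching, and a noindex hint. A hedged check that a deployment actually returns those headers (placeholder URL again):

```ts
// Placeholder URL - point this at a real deployment's /raw/ path.
const res = await fetch("https://example.com/raw/hello-world.md");

// Expected values per the [[headers]] block above.
const expected = [
  "content-type", // text/markdown; charset=utf-8
  "access-control-allow-origin", // *
  "cache-control", // public, max-age=3600
  "x-robots-tag", // noindex
];

for (const name of expected) {
  console.log(`${name}: ${res.headers.get(name)}`);
}
```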
botMeta.ts
@@ -1,7 +1,8 @@
 import type { Context } from "@netlify/edge-functions";
 
-// List of known social media and search engine bots
-const BOTS = [
+// Social preview bots that need OG metadata HTML
+// These bots cannot render JavaScript and need pre-rendered OG tags
+const SOCIAL_PREVIEW_BOTS = [
   "facebookexternalhit",
   "twitterbot",
   "linkedinbot",
@@ -12,24 +13,45 @@ const BOTS = [
   "pinterest",
   "opengraph",
   "opengraphbot",
-  "bot ",
-  "crawler",
   "embedly",
   "vkshare",
   "quora link preview",
   "redditbot",
   "rogerbot",
   "showyoubot",
-  "google",
-  "bingbot",
-  "baiduspider",
-  "duckduckbot",
 ];
 
-function isBot(userAgent: string | null): boolean {
+// AI crawlers that should get raw content, not OG previews
+const AI_CRAWLERS = [
+  "gptbot",
+  "chatgpt",
+  "chatgpt-user",
+  "oai-searchbot",
+  "claude-web",
+  "claudebot",
+  "anthropic",
+  "anthropic-ai",
+  "ccbot",
+  "perplexitybot",
+  "perplexity",
+  "cohere-ai",
+  "bytespider",
+  "googleother",
+  "google-extended",
+];
+
+// Check if user agent is a social preview bot
+function isSocialPreviewBot(userAgent: string | null): boolean {
   if (!userAgent) return false;
   const ua = userAgent.toLowerCase();
-  return BOTS.some((bot) => ua.includes(bot));
+  return SOCIAL_PREVIEW_BOTS.some((bot) => ua.includes(bot));
 }
 
+// Check if user agent is an AI crawler
+function isAICrawler(userAgent: string | null): boolean {
+  if (!userAgent) return false;
+  const ua = userAgent.toLowerCase();
+  return AI_CRAWLERS.some((bot) => ua.includes(bot));
+}
+
 export default async function handler(
@@ -37,28 +59,53 @@ export default async function handler(
   context: Context,
 ): Promise<Response> {
   const url = new URL(request.url);
-  const userAgent = request.headers.get("user-agent");
 
-  // Only intercept post pages for bots
-  const pathParts = url.pathname.split("/").filter(Boolean);
-
-  // Skip if it's the home page, static assets, API routes, or raw markdown files
+  // HARD BYPASS: Never intercept these paths regardless of user agent
+  // This is the first check to guarantee static files are served directly
   if (
-    pathParts.length === 0 ||
-    pathParts[0].includes(".") ||
-    pathParts[0] === "api" ||
-    pathParts[0] === "_next" ||
-    pathParts[0] === "raw"
+    url.pathname.startsWith("/raw/") ||
+    url.pathname.startsWith("/assets/") ||
+    url.pathname.startsWith("/api/") ||
+    url.pathname.startsWith("/.netlify/") ||
+    url.pathname.endsWith(".md") ||
+    url.pathname.endsWith(".xml") ||
+    url.pathname.endsWith(".txt") ||
+    url.pathname.endsWith(".yaml") ||
+    url.pathname.endsWith(".json") ||
+    url.pathname.endsWith(".svg") ||
+    url.pathname.endsWith(".ico") ||
+    url.pathname.endsWith(".png") ||
+    url.pathname.endsWith(".jpg") ||
+    url.pathname.endsWith(".jpeg") ||
+    url.pathname.endsWith(".gif") ||
+    url.pathname.endsWith(".webp") ||
+    url.pathname.endsWith(".css") ||
+    url.pathname.endsWith(".js")
   ) {
     return context.next();
   }
 
-  // If not a bot, continue to the SPA
-  if (!isBot(userAgent)) {
+  const userAgent = request.headers.get("user-agent");
+
+  // Let AI crawlers through to normal content - they need raw data, not OG previews
+  if (isAICrawler(userAgent)) {
     return context.next();
   }
 
-  // For bots, fetch the Open Graph metadata from Convex
+  // Only intercept post pages for bots
+  const pathParts = url.pathname.split("/").filter(Boolean);
+
+  // Skip home page and any path with a file extension
+  if (pathParts.length === 0 || pathParts[0].includes(".")) {
+    return context.next();
+  }
+
+  // Only serve OG metadata to social preview bots, not search engines or AI
+  if (!isSocialPreviewBot(userAgent)) {
+    return context.next();
+  }
+
+  // For social preview bots, fetch the Open Graph metadata from Convex
   const slug = pathParts[0];
   const convexUrl =
     Deno.env.get("VITE_CONVEX_URL") || Deno.env.get("CONVEX_URL");
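Two details of the refactor are easy to miss: both helpers do lowercase substring matching, and the handler checks `isAICrawler` before `isSocialPreviewBot`, so a user agent matching both lists takes the AI path (and the hard path bypass runs before either check). A self-contained sketch of the matching rule, with shortened lists and illustrative user-agent strings rather than real captured traffic:

```ts
// Shortened copies of the lists above, just to demonstrate the matching rule.
const AI_CRAWLERS = ["gptbot", "chatgpt", "perplexitybot", "claudebot"];
const SOCIAL_PREVIEW_BOTS = ["twitterbot", "facebookexternalhit", "linkedinbot"];

const matches = (list: string[], userAgent: string): boolean =>
  list.some((bot) => userAgent.toLowerCase().includes(bot));

const samples = [
  "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)",
  "Twitterbot/1.0",
  "Mozilla/5.0 (compatible; PerplexityBot/1.0)",
];

for (const ua of samples) {
  const ai = matches(AI_CRAWLERS, ua); // true -> handler returns context.next()
  const social = matches(SOCIAL_PREVIEW_BOTS, ua); // true -> handler serves OG HTML
  console.log(`${ua}\n  ai=${ai} social=${social}`);
}
```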