Files
wiki/scripts/import-url.ts

153 lines
4.4 KiB
TypeScript
Raw Normal View History

import fs from "fs";
import path from "path";
import FirecrawlApp from "@mendable/firecrawl-js";
import dotenv from "dotenv";
// Load environment variables
dotenv.config({ path: ".env.local" });
const FIRECRAWL_API_KEY = process.env.FIRECRAWL_API_KEY;
if (!FIRECRAWL_API_KEY) {
console.error("Error: FIRECRAWL_API_KEY not found in .env.local");
console.log("\nTo set up Firecrawl:");
console.log("1. Get an API key from https://firecrawl.dev");
console.log("2. Add FIRECRAWL_API_KEY=fc-xxx to your .env.local file");
process.exit(1);
}
const firecrawl = new FirecrawlApp({ apiKey: FIRECRAWL_API_KEY });
// Generate a URL-safe slug from a title
function generateSlug(title: string): string {
return title
.toLowerCase()
.replace(/[^a-z0-9\s-]/g, "") // Remove special characters
.replace(/\s+/g, "-") // Replace spaces with hyphens
.replace(/-+/g, "-") // Remove consecutive hyphens
.replace(/^-|-$/g, "") // Remove leading/trailing hyphens
.substring(0, 60); // Limit length
}
// Clean up markdown content
function cleanMarkdown(content: string): string {
return content
.replace(/^\s+|\s+$/g, "") // Trim whitespace
.replace(/\n{3,}/g, "\n\n"); // Remove excessive newlines
}
async function importFromUrl(url: string) {
console.log(`\nScraping: ${url}`);
console.log("This may take a moment...\n");
try {
const result = await firecrawl.scrapeUrl(url, {
formats: ["markdown"],
});
if (!result.success) {
console.error("Failed to scrape URL");
console.error("Error:", result.error || "Unknown error");
process.exit(1);
}
const title = result.metadata?.title || "Imported Post";
const description = result.metadata?.description || "";
const content = cleanMarkdown(result.markdown || "");
if (!content) {
console.error("No content found at URL");
process.exit(1);
}
// Generate slug from title
const baseSlug = generateSlug(title);
const slug = baseSlug || `imported-${Date.now()}`;
// Get today's date
const today = new Date().toISOString().split("T")[0];
// Create markdown file with frontmatter
const markdown = `---
title: "${title.replace(/"/g, '\\"')}"
description: "${description.replace(/"/g, '\\"')}"
date: "${today}"
slug: "${slug}"
published: false
tags: ["imported"]
---
${content}
---
*Originally published at [${new URL(url).hostname}](${url})*
`;
// Ensure content/blog directory exists
const blogDir = path.join(process.cwd(), "content", "blog");
if (!fs.existsSync(blogDir)) {
fs.mkdirSync(blogDir, { recursive: true });
}
// Write the file
const filePath = path.join(blogDir, `${slug}.md`);
// Check if file already exists
if (fs.existsSync(filePath)) {
console.warn(`Warning: File already exists at ${filePath}`);
console.warn("Adding timestamp to filename to avoid overwrite.");
const newSlug = `${slug}-${Date.now()}`;
const newFilePath = path.join(blogDir, `${newSlug}.md`);
fs.writeFileSync(
newFilePath,
markdown.replace(`slug: "${slug}"`, `slug: "${newSlug}"`),
);
console.log(`\nCreated: ${newFilePath}`);
console.log(`Slug: ${newSlug}`);
} else {
fs.writeFileSync(filePath, markdown);
console.log(`\nCreated: ${filePath}`);
console.log(`Slug: ${slug}`);
}
console.log(`Title: ${title}`);
console.log(`Status: Draft (published: false)`);
console.log("\nNext steps:");
console.log("1. Review and edit the imported content");
console.log("2. Set published: true when ready");
console.log("3. Run: npm run sync");
} catch (error) {
console.error("Error importing URL:", error);
process.exit(1);
}
}
// Parse command line arguments
const url = process.argv[2];
if (!url) {
console.log("Firecrawl Content Importer");
console.log("==========================\n");
console.log("Usage: npm run import <url>\n");
console.log("Example:");
console.log(" npm run import https://example.com/article\n");
console.log("This will:");
console.log(" 1. Scrape the URL and convert to markdown");
console.log(" 2. Create a draft post in content/blog/");
console.log(" 3. You can then review, edit, and sync\n");
process.exit(0);
}
// Validate URL
try {
new URL(url);
} catch {
console.error("Error: Invalid URL provided");
console.log("Please provide a valid URL starting with http:// or https://");
process.exit(1);
}
importFromUrl(url);