import fs from 'fs';
import axios from 'axios';
import * as cheerio from 'cheerio';
import path from 'path';
import { setTimeout } from 'timers/promises';
import { fileURLToPath } from 'url';
import JSZip from 'jszip';
import { v4 as uuidv4 } from 'uuid';
import { JSDOM } from 'jsdom';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/** Origin of the crawled site; relative chapter links are resolved against it. */
const SITE_BASE_URL = 'https://novelhi.com';

/** Selector that matches bare "Chapter N" labels which carry no link of their own. */
const SPAN_CHAPTER_SELECTOR = 'span:contains("Chapter")';

/** Candidate selectors for the chapter list, tried in order until one yields chapters. */
const CHAPTER_LIST_SELECTORS = [
  '.chapter-list li',
  '.book-catalog-list a',
  '.catalog-list li',
  '.chapter-item',
  '.chapter a',
  'ul.chapters li',
  '.book-chapters a',
  '.novel-chapters a',
  SPAN_CHAPTER_SELECTOR,
];

/** Candidate selectors for the chapter body, tried in order. */
const CHAPTER_CONTENT_SELECTORS = [
  '.chapter-content',
  '.article-content',
  '.content',
  '#content',
  '.text-content',
  '.chapter-text',
  '.novel-content',
];

/** Cover image extensions the EPUB builder recognises, mapped to their media types. */
const COVER_MEDIA_TYPES = {
  '.jpg': 'image/jpeg',
  '.jpeg': 'image/jpeg',
  '.png': 'image/png',
  '.gif': 'image/gif',
};

/** Minimal stylesheet used when epub_styles.css is not found next to this script. */
const DEFAULT_EPUB_CSS = [
  'body { margin: 1em; line-height: 1.6; }',
  'h1, h2 { text-align: center; margin: 1.5em 0 1em; }',
  'p { margin: 0; text-indent: 2em; }',
  '.cover { text-align: center; margin: 0; padding: 0; }',
  '.cover img { max-width: 100%; max-height: 100%; }',
  '',
].join('\n');

/**
 * Escape a value for safe interpolation into XML/XHTML text or attribute values.
 * @param {unknown} value - Value to escape (converted with String()).
 * @returns {string} The escaped text.
 */
function escapeXml(value) {
  return String(value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}

/**
 * Format raw chapter text as HTML: "Chapter N" becomes an <h2> heading and
 * blank-line separated blocks become <p> paragraphs.
 * @param {string} content - Raw text content.
 * @returns {Promise<string>} The formatted HTML.
 */
async function formatChapterContent(content) {
  const withHeadings = content.replace(/Chapter \d+/g, (match) => `<h2>${match}</h2>`);
  const withParagraphs = withHeadings.replace(/\n\n/g, '</p><p>');
  return `<p>${withParagraphs}</p>`;
}

/**
 * Convert a merged plain-text book into simple HTML: every chunk between
 * "Chapter N" markers gets a sequentially numbered <h2> heading, and every
 * blank-line separated paragraph is wrapped in <p>.
 * Errors are logged, not thrown (best-effort step of the pipeline).
 * @param {string} filePath - Path of the text file to read.
 * @param {string} outputPath - Path of the HTML file to write.
 * @returns {Promise<void>}
 */
async function formatTextFile(filePath, outputPath) {
  try {
    const content = await fs.promises.readFile(filePath, 'utf-8');
    console.log(`已读取文件: ${filePath}`);

    // Split on the chapter markers; empty chunks (e.g. before the first marker) are dropped.
    const chapters = content.split(/Chapter \d+/).filter(Boolean);
    console.log(`检测到 ${chapters.length} 个章节内容块`);

    let formattedContent = '';
    let chapterIndex = 1;

    for (const chapter of chapters) {
      formattedContent += `<h2>Chapter ${chapterIndex}</h2>\n`;

      const paragraphs = chapter.trim().split(/\n\s*\n/);
      for (const paragraph of paragraphs) {
        const text = paragraph.trim();
        if (text) {
          formattedContent += `<p>${text}</p>\n`;
        }
      }
      chapterIndex++;
    }

    await fs.promises.writeFile(outputPath, formattedContent);
    console.log(`格式化完成! 结果已保存到: ${outputPath}`);
    console.log(`共处理了 ${chapters.length} 个章节`);
  } catch (error) {
    console.error('处理文本文件时出错:', error);
  }
}

/**
 * Numeric sort key taken from a chapter file name such as "0012_Chapter 12.html".
 * Files without a numeric prefix sort last instead of producing NaN comparisons.
 * @param {string} fileName - Chapter file name.
 * @returns {number} The sort key.
 */
function chapterOrder(fileName) {
  const order = Number.parseInt(fileName.split('_')[0], 10);
  return Number.isNaN(order) ? Number.MAX_SAFE_INTEGER : order;
}

/**
 * Merge the per-chapter HTML files of a directory into one plain-text file.
 * Each chapter contributes its title (taken from the file name), its text with
 * all markup removed, and a separator line.
 * Errors are logged, not thrown (best-effort step of the pipeline).
 * @param {string} sourceDir - Directory containing "<index>_<title>.html" files.
 * @param {string} outputFile - Path of the merged text file to write.
 * @returns {Promise<void>}
 */
async function mergeChapterFiles(sourceDir, outputFile) {
  try {
    const SEPARATOR = '\n******************\n';
    const { readdir, readFile, writeFile } = fs.promises;

    // Collect the HTML files and order them by their numeric prefix.
    const files = (await readdir(sourceDir))
      .filter((file) => file.endsWith('.html'))
      .sort((a, b) => chapterOrder(a) - chapterOrder(b) || a.localeCompare(b));

    if (files.length === 0) {
      console.error('未找到任何章节文件');
      return;
    }

    let mergedContent = '';

    for (const file of files) {
      const filePath = path.join(sourceDir, file);
      const html = await readFile(filePath, 'utf-8');

      // Title is everything between the numeric prefix and ".html" (titles may contain "." or "_").
      const titleMatch = file.match(/^\d+_(.+)\.html$/);
      const title = titleMatch ? titleMatch[1] : file;

      // Prefer the content wrapper written by runScript, matched greedily so nested
      // <div>s inside the scraped body do not truncate the chapter; otherwise fall
      // back to the first <div> in the file.
      const contentMatch =
        html.match(/<div class="content"[^>]*>([\s\S]*)<\/div>\s*<\/body>/) ??
        html.match(/<div[^>]*>([\s\S]*?)<\/div>/);
      if (!contentMatch) continue;

      const content = contentMatch[1]
        .replace(/<sent[^>]*>/g, '')
        .replace(/<\/sent>/g, '')
        .replace(/<br\s*\/?>/gi, '\n')
        .replace(/<[^>]+>/g, '')
        // Tag stripping leaves inline ad-script text behind; remove it.
        .replace(/\(adsbygoogle\s*=\s*window\.adsbygoogle\s*\|\|\s*\[\]\)\.push\(\{\}\);/g, '')
        .replace(/\n{3,}/g, '\n\n');

      mergedContent += `${title}\n\n${content.trim()}${SEPARATOR}`;
      console.log(`已处理: ${file}`);
    }

    await writeFile(outputFile, mergedContent);
    console.log(`\n合并完成! 结果已保存到: ${outputFile}`);
    console.log(`共合并了 ${files.length} 个章节`);
  } catch (error) {
    console.error('合并章节时出错:', error);
  }
}

/**
 * Download one chapter page and extract the HTML of its body.
 * Known content selectors are tried first; if none matches, the element with
 * the longest text (over 500 characters) is used.
 * @param {string} baseUrl - Site base URL used to resolve relative chapter links.
 * @param {string} chapterUrl - Absolute or relative chapter URL.
 * @param {object} headers - HTTP request headers.
 * @param {boolean} debug - Whether to print diagnostic output.
 * @returns {Promise<string|null>} Inner HTML of the chapter body, or null on failure.
 */
async function fetchChapterContent(baseUrl, chapterUrl, headers, debug = false) {
  try {
    // URL resolution also handles relative links that lack a leading slash.
    const fullUrl = new URL(chapterUrl, baseUrl).href;

    if (debug) {
      console.log(`爬取章节内容: ${fullUrl}`);
    }

    const response = await axios.get(fullUrl, { headers });

    if (response.status !== 200) {
      console.error(`获取章节内容失败,状态码: ${response.status}`);
      return null;
    }

    const $ = cheerio.load(response.data);

    let content = null;
    for (const selector of CHAPTER_CONTENT_SELECTORS) {
      const element = $(selector);
      if (element.length > 0) {
        content = element.html();
        if (debug) {
          console.log(`使用选择器 ${selector} 成功获取章节内容`);
        }
        break;
      }
    }

    // No known selector matched: fall back to the element holding the most text.
    if (!content) {
      let maxTextLength = 0;
      let maxTextElement = null;

      $('div, article, section, p').each((_, element) => {
        const text = $(element).text().trim();
        if (text.length > maxTextLength && text.length > 500) {
          maxTextLength = text.length;
          maxTextElement = element;
        }
      });

      if (maxTextElement) {
        content = $(maxTextElement).html();
        if (debug) {
          console.log(`使用最长文本元素获取章节内容,长度: ${maxTextLength}`);
        }
      }
    }

    return content;
  } catch (error) {
    console.error(`爬取章节内容出错: ${error.message}`);
    return null;
  }
}

/**
 * Extract the chapter list from a parsed index page.
 * @param {object} $ - Cheerio document of the index page.
 * @param {string} title - Novel slug, used to build URLs for link-less chapter labels.
 * @returns {Array<{title: string, url: string, index: number}>} Chapters found ('#' marks a missing URL).
 */
function extractChapters($, title) {
  const chapters = [];

  for (const selector of CHAPTER_LIST_SELECTORS) {
    console.log(`尝试选择器: ${selector}`);
    const elements = $(selector);
    console.log(`找到 ${elements.length} 个元素`);
    if (elements.length === 0) continue;

    elements.each((index, element) => {
      const node = $(element);
      let chapterTitle;
      let chapterUrl;

      if (selector === SPAN_CHAPTER_SELECTOR) {
        chapterTitle = node.text().trim();
        // These labels have no link, so the chapter URL is built from the chapter number.
        chapterUrl = `/s/${title}/${chapterTitle.replace('Chapter ', '')}`;
      } else {
        // Decide by the matched element itself, not by the selector text: almost every
        // selector string contains the letter "a", which hid the links inside <li> items.
        const link = node.is('a') ? node : node.find('a').first();
        chapterTitle = (link.length > 0 ? link : node).text().trim();
        chapterUrl = link.attr('href');
      }

      if (chapterTitle) {
        chapters.push({ title: chapterTitle, url: chapterUrl || '#', index: index + 1 });
      }
    });

    if (chapters.length > 0) {
      console.log(`使用选择器 ${selector} 成功找到章节`);
      break;
    }
  }

  // Last resort: scan every element for text that looks like a standalone chapter heading.
  if (chapters.length === 0) {
    console.log('尝试查找所有可能的章节链接...');
    $('*').each((_, element) => {
      const text = $(element).text().trim();
      if (!text || !(text.includes('Chapter') || text.includes('第') || text.includes('章'))) {
        return;
      }
      if (
        text.match(/^Chapter \d+$/) ||
        text.match(/^第[一二三四五六七八九十百千万]+章/) ||
        text.match(/^\d+\.\s+.+$/)
      ) {
        // Number chapters by discovery order rather than by DOM position.
        chapters.push({ title: text, url: '#', index: chapters.length + 1 });
      }
    });
  }

  return chapters;
}

/**
 * Build the standalone HTML page stored for one chapter. The body is wrapped
 * in <div class="content"> so mergeChapterFiles can locate it again.
 * @param {string} chapterTitle - Chapter title (escaped before insertion).
 * @param {string} content - Chapter body HTML as scraped.
 * @returns {string} The complete HTML document.
 */
function buildChapterPage(chapterTitle, content) {
  const safeTitle = escapeXml(chapterTitle);
  return [
    '<!DOCTYPE html>',
    '<html>',
    '<head>',
    '  <meta charset="UTF-8">',
    `  <title>${safeTitle}</title>`,
    '</head>',
    '<body>',
    `  <h1>${safeTitle}</h1>`,
    `  <div class="content">${content}</div>`,
    '</body>',
    '</html>',
    '',
  ].join('\n');
}

/**
 * Crawl the chapter list of a novel and, optionally, the content of every chapter.
 * Writes "<title>_chapters.json" to outputDir; with fetchContent it also writes one
 * HTML file per chapter plus "<title>_all_contents.json" and
 * "<title>_chapters_with_content.json".
 * @param {string} title - Novel slug as used in the site URL.
 * @param {boolean} debug - Whether to print diagnostics and save the raw index page.
 * @param {string} outputDir - Directory receiving all output files (created if missing).
 * @param {boolean} fetchContent - Whether to download chapter contents as well.
 * @returns {Promise<Array<object>|{errcode: number, errStr: string}>} The chapter list, or an error object.
 */
async function runScript(title, debug = false, outputDir = 'src/web_crawler', fetchContent = false) {
  const url = `https://novelhi.com/s/index/${title}`;
  console.log(`正在爬取网址: ${url}`);

  try {
    // Browser-like headers so the site serves the regular page.
    const headers = {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.5',
      'Connection': 'keep-alive',
      'Upgrade-Insecure-Requests': '1',
      'Cache-Control': 'max-age=0',
    };

    // Callers other than the CLI may pass a directory that does not exist yet.
    fs.mkdirSync(outputDir, { recursive: true });

    const response = await axios.get(url, { headers });
    console.log(`请求状态码: ${response.status}`);

    if (debug) {
      const debugFilePath = path.join(outputDir, 'debug_page.html');
      fs.writeFileSync(debugFilePath, response.data);
      console.log(`已保存HTML内容到 ${debugFilePath} 文件`);
    }

    const $ = cheerio.load(response.data);

    if (debug) {
      console.log('页面标题:', $('title').text());
      console.log('页面主要容器数量:');
      console.log('- div 元素数量:', $('div').length);
      console.log('- ul 元素数量:', $('ul').length);
      console.log('- li 元素数量:', $('li').length);
      console.log('- a 元素数量:', $('a').length);
    }

    const chapters = extractChapters($, title);
    console.log(`共找到 ${chapters.length} 个章节`);

    const outputFilePath = path.join(outputDir, `${title}_chapters.json`);
    fs.writeFileSync(outputFilePath, JSON.stringify(chapters, null, 2));
    console.log(`已将章节目录保存到 ${outputFilePath} 文件`);

    // Print the full list only when it is short or when debugging.
    if (debug || chapters.length <= 20) {
      console.log("章节目录:");
      console.log(JSON.stringify(chapters, null, 2));
    } else {
      console.log("前10个章节:");
      console.log(JSON.stringify(chapters.slice(0, 10), null, 2));
      console.log("...");
      console.log("后10个章节:");
      console.log(JSON.stringify(chapters.slice(-10), null, 2));
    }

    if (fetchContent && chapters.length > 0) {
      console.log(`开始爬取章节内容...`);

      const contentDir = path.join(outputDir, `${title}_contents`);
      fs.mkdirSync(contentDir, { recursive: true });

      // Chapter title -> chapter HTML, for the combined JSON dump.
      const contentsObj = {};

      console.log(`将爬取 ${chapters.length}/${chapters.length} 个章节的内容`);

      for (let i = 0; i < chapters.length; i++) {
        const chapter = chapters[i];

        // Pause after every 10 chapters to avoid hammering the site.
        if (i > 0 && i % 10 === 0) {
          console.log(`已爬取 ${i}/${chapters.length} 章节,暂停 2 秒...`);
          await setTimeout(2000);
        }

        // '#' is the placeholder for "no link found"; requesting it would fetch the site home page.
        if (!chapter.url || chapter.url === '#') {
          console.error(`章节 ${chapter.title} 没有可用的URL,已跳过`);
          continue;
        }

        const content = await fetchChapterContent(SITE_BASE_URL, chapter.url, headers, debug);
        if (!content) {
          console.error(`获取章节 ${chapter.title} 内容失败`);
          continue;
        }

        chapter.content = content;
        contentsObj[chapter.title] = content;

        // Characters that are invalid in file names are replaced with "_".
        const chapterFileName = `${String(chapter.index).padStart(4, '0')}_${chapter.title.replace(/[\\/:*?"<>|]/g, '_')}.html`;
        const chapterFilePath = path.join(contentDir, chapterFileName);
        fs.writeFileSync(chapterFilePath, buildChapterPage(chapter.title, content));

        if (debug) {
          console.log(`已保存章节 ${chapter.title} 到 ${chapterFilePath}`);
        } else if (i % 10 === 0 || i === chapters.length - 1) {
          console.log(`已保存 ${i + 1}/${chapters.length} 章节`);
        }
      }

      const allContentsPath = path.join(outputDir, `${title}_all_contents.json`);
      fs.writeFileSync(allContentsPath, JSON.stringify(contentsObj, null, 2));
      console.log(`已将所有章节内容保存到 ${allContentsPath} 文件`);

      const chaptersWithContentPath = path.join(outputDir, `${title}_chapters_with_content.json`);
      fs.writeFileSync(chaptersWithContentPath, JSON.stringify(chapters, null, 2));
      console.log(`已将包含内容的章节目录保存到 ${chaptersWithContentPath} 文件`);
    }

    return chapters;
  } catch (err) {
    console.error("爬取过程中出错:", err.message);
    return { errcode: 101, errStr: err.message };
  } finally {
    console.log("完成");
  }
}

/**
 * Build one XHTML document for the EPUB.
 * @param {string} pageTitle - Already-escaped text for the <title> element.
 * @param {string} body - Markup placed inside <body>.
 * @param {string} [bodyAttributes] - Extra attributes for <body>, including the leading space.
 * @returns {string} The XHTML document.
 */
function buildXhtmlPage(pageTitle, body, bodyAttributes = '') {
  return [
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<!DOCTYPE html>',
    '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">',
    '<head>',
    '  <meta charset="UTF-8"/>',
    `  <title>${pageTitle}</title>`,
    '  <link rel="stylesheet" type="text/css" href="../Styles/stylesheet.css"/>',
    '</head>',
    `<body${bodyAttributes}>`,
    body,
    '</body>',
    '</html>',
    '',
  ].join('\n');
}

/**
 * Generate an EPUB e-book from a flat HTML file in which every <h2> starts a
 * chapter and the <p> elements that follow it form the chapter text.
 * Errors are logged, not thrown (best-effort step of the pipeline).
 * @param {string} contentFilePath - Path of the HTML body file.
 * @param {string} coverImagePath - Path of the cover image (jpg, jpeg, png or gif).
 * @param {string} outputPath - Path of the EPUB file to write.
 * @param {string} bookTitle - Book title.
 * @param {string} author - Author name.
 * @returns {Promise<void>}
 */
async function generateEpub(contentFilePath, coverImagePath, outputPath, bookTitle, author) {
  try {
    const htmlContent = fs.readFileSync(contentFilePath, 'utf-8');
    const dom = new JSDOM(htmlContent);
    const document = dom.window.document;

    // Every <h2> opens a chapter; its following siblings up to the next <h2> are the chapter body.
    const chapters = [];
    document.querySelectorAll('h2').forEach((chapterElement) => {
      const title = chapterElement.textContent.trim();
      let content = '';

      let currentElement = chapterElement.nextElementSibling;
      while (currentElement && currentElement.tagName.toLowerCase() !== 'h2') {
        if (currentElement.tagName.toLowerCase() === 'p') {
          // The HTML serializer emits &nbsp;, which is not a defined entity in XHTML.
          const paragraphHtml = currentElement.innerHTML.replace(/&nbsp;/g, '&#160;');
          // NOTE(review): the original class name was lost from the source; confirm
          // "paragraph" against the selectors used in epub_styles.css.
          content += `<p class="paragraph">${paragraphHtml}</p>\n`;
        }
        currentElement = currentElement.nextElementSibling;
      }

      chapters.push({ title, content });
    });

    const coverImage = fs.readFileSync(coverImagePath);
    const coverExtension = path.extname(coverImagePath).toLowerCase();
    const hasKnownCoverType = Object.hasOwn(COVER_MEDIA_TYPES, coverExtension);
    // Unknown extensions are treated as JPEG, which was the only type supported before.
    const coverMediaType = hasKnownCoverType ? COVER_MEDIA_TYPES[coverExtension] : 'image/jpeg';
    const coverHref = `Images/cover${hasKnownCoverType ? coverExtension : '.jpg'}`;

    const safeBookTitle = escapeXml(bookTitle);
    const safeAuthor = escapeXml(author);

    const zip = new JSZip();

    // The mimetype entry has to be the first file in the archive and stored uncompressed.
    zip.file('mimetype', 'application/epub+zip', { compression: 'STORE' });

    zip.folder('META-INF').file('container.xml', [
      '<?xml version="1.0" encoding="UTF-8"?>',
      '<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">',
      '  <rootfiles>',
      '    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>',
      '  </rootfiles>',
      '</container>',
      '',
    ].join('\n'));

    const oebps = zip.folder('OEBPS');
    oebps.file(coverHref, coverImage);

    const textFolder = oebps.folder('Text');
    textFolder.file(
      'cover.xhtml',
      buildXhtmlPage('封面', `  <div class="cover"><img src="../${coverHref}" alt="封面"/></div>`),
    );

    const spineItems = [{ idref: 'cover', linear: 'no' }];
    const manifestItems = [
      { id: 'cover', href: 'Text/cover.xhtml', mediaType: 'application/xhtml+xml' },
      // "cover-image" marks the image resource itself, not the XHTML page that shows it.
      { id: 'cover-image', href: coverHref, mediaType: coverMediaType, properties: 'cover-image' },
      { id: 'ncx', href: 'toc.ncx', mediaType: 'application/x-dtbncx+xml' },
    ];

    chapters.forEach(({ title, content }, index) => {
      const chapterId = `chapter_${index}`;
      const chapterFileName = `${chapterId}.xhtml`;
      const safeTitle = escapeXml(title);

      textFolder.file(
        chapterFileName,
        buildXhtmlPage(safeTitle, `  <h2>${safeTitle}</h2>\n${content}`),
      );
      manifestItems.push({
        id: chapterId,
        href: `Text/${chapterFileName}`,
        mediaType: 'application/xhtml+xml',
      });
      spineItems.push({ idref: chapterId });
    });

    // Table of contents page; it doubles as the EPUB 3 navigation document.
    const tocEntries = chapters
      .map((chapter, index) => `      <li><a href="chapter_${index}.xhtml">${escapeXml(chapter.title)}</a></li>`)
      .join('\n');
    textFolder.file(
      'toc.xhtml',
      buildXhtmlPage('目录', [
        '  <nav epub:type="toc" id="toc">',
        '    <h1>目录</h1>',
        '    <ol>',
        tocEntries,
        '    </ol>',
        '  </nav>',
      ].join('\n')),
    );
    manifestItems.push({
      id: 'toc',
      href: 'Text/toc.xhtml',
      mediaType: 'application/xhtml+xml',
      properties: 'nav',
    });

    // Stylesheet: use epub_styles.css next to this script, or a minimal default if it is missing.
    const cssPath = path.join(__dirname, 'epub_styles.css');
    let cssContent = DEFAULT_EPUB_CSS;
    if (fs.existsSync(cssPath)) {
      cssContent = fs.readFileSync(cssPath, 'utf-8');
    } else {
      console.warn(`未找到样式文件 ${cssPath},使用默认样式`);
    }
    oebps.folder('Styles').file('stylesheet.css', cssContent);
    manifestItems.push({ id: 'stylesheet', href: 'Styles/stylesheet.css', mediaType: 'text/css' });

    const bookUUID = uuidv4();

    // NCX table of contents, kept for reading systems that only understand EPUB 2 navigation.
    const chapterNavPoints = chapters
      .map((chapter, index) => [
        `    <navPoint id="navpoint-${index + 3}" playOrder="${index + 3}">`,
        `      <navLabel><text>${escapeXml(chapter.title)}</text></navLabel>`,
        `      <content src="Text/chapter_${index}.xhtml"/>`,
        '    </navPoint>',
      ].join('\n'))
      .join('\n');
    oebps.file('toc.ncx', [
      '<?xml version="1.0" encoding="UTF-8"?>',
      '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">',
      '  <head>',
      `    <meta name="dtb:uid" content="urn:uuid:${bookUUID}"/>`,
      '    <meta name="dtb:depth" content="1"/>',
      '    <meta name="dtb:totalPageCount" content="0"/>',
      '    <meta name="dtb:maxPageNumber" content="0"/>',
      '  </head>',
      `  <docTitle><text>${safeBookTitle}</text></docTitle>`,
      '  <navMap>',
      '    <navPoint id="navpoint-1" playOrder="1">',
      '      <navLabel><text>封面</text></navLabel>',
      '      <content src="Text/cover.xhtml"/>',
      '    </navPoint>',
      '    <navPoint id="navpoint-2" playOrder="2">',
      '      <navLabel><text>目录</text></navLabel>',
      '      <content src="Text/toc.xhtml"/>',
      '    </navPoint>',
      chapterNavPoints,
      '  </navMap>',
      '</ncx>',
      '',
    ].join('\n'));

    // Package document.
    const timestamp = new Date().toISOString();
    const manifestXml = manifestItems
      .map((item) => {
        const properties = item.properties ? ` properties="${item.properties}"` : '';
        return `    <item id="${item.id}" href="${item.href}" media-type="${item.mediaType}"${properties}/>`;
      })
      .join('\n');
    const spineXml = spineItems
      .map((item) => `    <itemref idref="${item.idref}"${item.linear ? ` linear="${item.linear}"` : ''}/>`)
      .join('\n');
    oebps.file('content.opf', [
      '<?xml version="1.0" encoding="UTF-8"?>',
      '<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="BookId">',
      '  <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">',
      `    <dc:identifier id="BookId">urn:uuid:${bookUUID}</dc:identifier>`,
      `    <dc:title>${safeBookTitle}</dc:title>`,
      `    <dc:creator>${safeAuthor}</dc:creator>`,
      '    <dc:language>zh-CN</dc:language>',
      `    <dc:date>${timestamp.split('T')[0]}</dc:date>`,
      // dcterms:modified must not carry fractional seconds.
      `    <meta property="dcterms:modified">${timestamp.replace(/\.\d+Z$/, 'Z')}</meta>`,
      '    <meta name="cover" content="cover-image"/>',
      '  </metadata>',
      '  <manifest>',
      manifestXml,
      '  </manifest>',
      '  <spine toc="ncx">',
      spineXml,
      '  </spine>',
      '</package>',
      '',
    ].join('\n'));

    const epubContent = await zip.generateAsync({
      type: 'nodebuffer',
      compression: 'DEFLATE',
      mimeType: 'application/epub+zip',
    });

    fs.writeFileSync(outputPath, epubContent);
    console.log(`EPUB 电子书已生成: ${outputPath}`);
  } catch (error) {
    console.error('生成 EPUB 电子书时出错:', error);
  }
}

/**
 * Print command-line usage.
 * @returns {void}
 */
function showHelp() {
  console.log([
    `用法: node ${path.basename(__filename)} [选项]`,
    '',
    '选项:',
    '  --title=<slug>    小说标题(网址中的名称)',
    '  --author=<name>   作者名称',
    '  --output=<dir>    输出目录(默认与小说标题相同)',
    '  --debug           开启调试模式',
    '  --skip-crawl      跳过爬取步骤',
    '  --skip-merge      跳过合并与格式化步骤',
    '  -h, --help        显示帮助信息',
  ].join('\n'));
}

/**
 * Read the value of a "--name=value" command-line option.
 * @param {string[]} argv - Command-line arguments.
 * @param {string} name - Option name without the leading dashes.
 * @returns {string|undefined} The option value, or undefined when absent or empty.
 */
function readOption(argv, name) {
  const prefix = `--${name}=`;
  const raw = argv.find((arg) => arg.startsWith(prefix));
  // Slice instead of split so values that contain "=" survive intact.
  return raw ? raw.slice(prefix.length) || undefined : undefined;
}

/*
 * Command-line entry point.
 *
 * NOTE(review): this pipeline runs at module load, so importing this file for its
 * default export also starts a crawl and may call process.exit(). That import-time
 * behaviour is kept unchanged here; consider guarding it with a "run directly" check.
 *
 * Slugs used previously: "Zhui-Xu" (author "Angry Banana"), "Release-that-Witch",
 * "Strange-Life-of-a-Cat", "Hidden-Assassin".
 */
const DEFAULT_TITLE = "Throne-of-Magical-Arcana";
const DEFAULT_AUTHOR = "Cuttlefish That Loves Diving";
const coverName = "cover.jpg";

const args = process.argv.slice(2);

if (args.includes("--help") || args.includes("-h")) {
  showHelp();
  process.exit(0);
}

const title = readOption(args, 'title') ?? DEFAULT_TITLE;
const author = readOption(args, 'author') ?? DEFAULT_AUTHOR;
const debug = args.includes("--debug");
const outputDir = readOption(args, 'output') ?? title;
const fetchContent = true;
const skipCrawl = args.includes("--skip-crawl");
const skipMerge = args.includes("--skip-merge");

console.log(`开始爬取小说: ${title}`);
console.log(`调试模式: ${debug ? "开启" : "关闭"}`);
console.log(`输出目录: ${outputDir}`);
console.log(`爬取章节内容: ${fetchContent ? "是" : "否"}`);

fs.mkdirSync(outputDir, { recursive: true });

if (!skipCrawl) {
  try {
    const result = await runScript(title, debug, outputDir, fetchContent);
    if (result?.errcode) {
      console.error(`爬取失败: ${result.errStr}`);
      process.exit(1);
    }
    console.log("爬取任务完成!");
  } catch (err) {
    console.error("程序执行出错:", err);
    process.exit(1);
  }
}

// Every later step works inside the directory the crawl wrote to, so a custom
// --output or a different working directory no longer splits the pipeline.
const workDir = path.resolve(outputDir);
const sourceDir = path.join(workDir, `${title}_contents`);
const outputFile = path.join(workDir, `${title}.txt`);
const outputFile2 = path.join(workDir, `${title}.html`);
const coverFile = path.join(workDir, coverName);
const epubFile = path.join(workDir, `${title}.epub`);

if (!skipMerge) {
  try {
    await mergeChapterFiles(sourceDir, outputFile);
    console.log('合并操作完成');
  } catch (err) {
    console.error('合并操作失败:', err);
  }
  await formatTextFile(outputFile, outputFile2);
}

await generateEpub(outputFile2, coverFile, epubFile, title, author);

// Exported so that other modules can reuse the crawler.
export default runScript;