месяцев назад: 2 · a0aefa52be
--- a/.gitignore
+++ b/.gitignore
@@ -12,4 +12,5 @@ src/web_crawler/Zhui-Xu/
 
				 src/web_crawler/Hidden-Assassin/
			
 
				 src/web_crawler/Release-that-Witch/
			
 
				 src/web_crawler/Strange-Life-of-a-Cat/
			
 
				-src/web_crawler/Throne-of-Magical-Arcana/
			
 
				+src/web_crawler/Throne-of-Magical-Arcana/
			
 
				+src/web_crawler/The-Legendary-Mechanic/
			
--- a/src/web_crawler/crawle_english.js
+++ b/src/web_crawler/crawle_english.js
@@ -0,0 +1,917 @@
 
				+import fs from 'fs';
			
 
				+import axios from 'axios';
			
 
				+import * as cheerio from 'cheerio';
			
 
				+import path from 'path';
			
 
				+import { setTimeout } from 'timers/promises';
			
 
				+import { fileURLToPath } from 'url';
			
 
				+import JSZip from 'jszip';
			
 
				+import { v4 as uuidv4 } from 'uuid';
			
 
				+import { JSDOM } from 'jsdom';
			
 
				+
			
 
				+const __filename = fileURLToPath(import.meta.url);
			
 
				+const __dirname = path.dirname(__filename);
			
 
				+
			
 
				+
			
 
				+
			
 
				/**
				 * Resolve a chapter link against the site origin.
				 *
				 * @param {string} baseUrl - Site origin, e.g. "https://novelhi.com".
				 * @param {string} chapterUrl - Absolute or relative chapter link.
				 * @returns {string|null} Absolute http(s) URL, or null when the link is missing,
				 *     empty, a bare fragment such as the '#' placeholder, or not an http(s) URL.
				 */
				function resolveChapterUrl(baseUrl, chapterUrl) {
				    if (typeof chapterUrl !== 'string') {
				        return null;
				    }
				    const trimmed = chapterUrl.trim();
				    if (trimmed === '' || trimmed.startsWith('#')) {
				        return null;
				    }
				    try {
				        // new URL() handles absolute URLs, "/path" and "path" alike, so a relative
				        // link without a leading slash no longer gets glued onto the host name.
				        const resolved = new URL(trimmed, baseUrl);
				        return resolved.protocol === 'http:' || resolved.protocol === 'https:' ? resolved.href : null;
				    } catch {
				        return null;
				    }
				}

				/**
				 * Pick the chapter text container out of a loaded page.
				 *
				 * @param {Function} $ - Cheerio instance returned by cheerio.load().
				 * @returns {string|null} Inner HTML of the container, or a falsy value when none is found.
				 */
				function extractChapterHtml($) {
				    // Known container selectors, tried in order; the first one present wins.
				    const contentSelectors = [
				        '.chapter-content',
				        '.article-content',
				        '.content',
				        '#content',
				        '.text-content',
				        '.chapter-text',
				        '.novel-content'
				    ];

				    let content = null;
				    for (const selector of contentSelectors) {
				        const element = $(selector);
				        if (element.length > 0) {
				            content = element.html();
				            break;
				        }
				    }
				    if (content) {
				        return content;
				    }

				    // Fallback heuristic: take the block element holding the most text, provided
				    // it has more than 500 characters.
				    let longestLength = 500;
				    let longestElement = null;
				    $('div, article, section, p').each((_, element) => {
				        const length = $(element).text().trim().length;
				        if (length > longestLength) {
				            longestLength = length;
				            longestElement = element;
				        }
				    });

				    return longestElement ? $(longestElement).html() : null;
				}

				/**
				 * Download one chapter page and extract the HTML of its main text container.
				 *
				 * Never throws: every failure is logged with console.error and reported through
				 * the `error` field of the result instead.
				 *
				 * @param {string} baseUrl - Site origin used to resolve relative chapter URLs.
				 * @param {string} chapterUrl - Absolute or site-relative URL of the chapter page.
				 * @param {object} headers - HTTP request headers forwarded to axios.
				 * @returns {Promise<{content: string|null, error: string|null}>} The chapter's
				 *     inner HTML on success, otherwise a human-readable error message.
				 */
				async function fetchChapterContent(baseUrl, chapterUrl, headers) {
				    // axios applies no timeout by default; bound each request so a single stalled
				    // connection cannot hang the whole crawl.
				    const requestTimeoutMs = 30000;

				    try {
				        const fullUrl = resolveChapterUrl(baseUrl, chapterUrl);
				        if (fullUrl === null) {
				            // A '#' placeholder or missing href would otherwise be requested as the
				            // base URL itself and its page stored as if it were chapter text.
				            const errorMsg = `无效的章节URL: ${chapterUrl}`;
				            console.error(errorMsg);
				            return { content: null, error: errorMsg };
				        }

				        const response = await axios.get(fullUrl, { headers, timeout: requestTimeoutMs });

				        if (response.status !== 200) {
				            const errorMsg = `获取章节内容失败，状态码: ${response.status}`;
				            console.error(errorMsg);
				            return { content: null, error: errorMsg };
				        }

				        const content = extractChapterHtml(cheerio.load(response.data));
				        if (!content) {
				            return { content: null, error: "未能找到章节内容" };
				        }

				        return { content, error: null };
				    } catch (error) {
				        const errorMsg = `爬取章节内容出错: ${error.message}`;
				        console.error(errorMsg);
				        return { content: null, error: errorMsg };
				    }
				}
			
 
				+
			
 
				/**
				 * Read the title and link of one chapter-list element.
				 *
				 * The element may itself be the anchor or merely contain one (e.g. an <li>), so
				 * the anchor is located per element instead of being guessed from the selector text.
				 *
				 * @param {Function} $ - Cheerio instance for the index page.
				 * @param {object} element - One element matched by a chapter-list selector.
				 * @returns {{title: string, url: (string|undefined)}} Trimmed title and raw href, if any.
				 */
				function readChapterLink($, element) {
				    const node = $(element);
				    const anchor = node.is('a') ? node : node.find('a').first();
				    if (anchor.length > 0) {
				        return {
				            title: anchor.text().trim() || node.text().trim(),
				            url: anchor.attr('href')
				        };
				    }
				    return { title: node.text().trim(), url: undefined };
				}

				/**
				 * Extract the chapter index from a loaded index page.
				 *
				 * @param {Function} $ - Cheerio instance for the index page.
				 * @param {string} title - Novel slug, used to build URLs for span-only listings.
				 * @returns {Array<{title: string, url: string, index: number}>} Chapters in page order.
				 */
				function collectChapters($, title) {
				    const spanSelector = 'span:contains("Chapter")';
				    // Candidate list selectors, tried in order; the first one yielding chapters wins.
				    const selectors = [
				        '.chapter-list li',
				        '.book-catalog-list a',
				        '.catalog-list li',
				        '.chapter-item',
				        '.chapter a',
				        'ul.chapters li',
				        '.book-chapters a',
				        '.novel-chapters a',
				        spanSelector
				    ];
				    const chapters = [];

				    for (const selector of selectors) {
				        console.log(`尝试选择器: ${selector}`);
				        const elements = $(selector);
				        console.log(`找到 ${elements.length} 个元素`);
				        if (elements.length === 0) {
				            continue;
				        }

				        elements.each((index, element) => {
				            let link;
				            if (selector === spanSelector) {
				                // Span listings carry no href, so the URL is built from the chapter number.
				                const chapterTitle = $(element).text().trim();
				                link = {
				                    title: chapterTitle,
				                    url: `/s/${title}/${chapterTitle.replace('Chapter ', '')}`
				                };
				            } else {
				                link = readChapterLink($, element);
				            }

				            if (link.title) {
				                chapters.push({ title: link.title, url: link.url || '#', index: index + 1 });
				            }
				        });

				        if (chapters.length > 0) {
				            console.log(`使用选择器 ${selector} 成功找到章节`);
				            return chapters;
				        }
				    }

				    // Last resort: scan every element for text that looks like a standalone chapter title.
				    console.log('尝试查找所有可能的章节链接...');
				    $('*').each((index, element) => {
				        const text = $(element).text().trim();
				        const mentionsChapter = text && (text.includes('Chapter') || text.includes('第') || text.includes('章'));
				        if (!mentionsChapter) {
				            return;
				        }
				        if (text.match(/^Chapter \d+$/) || text.match(/^第[一二三四五六七八九十百千万]+章/) || text.match(/^\d+\.\s+.+$/)) {
				            // No link is available on this path; '#' marks the URL as unknown.
				            chapters.push({ title: text, url: '#', index: index + 1 });
				        }
				    });
				    return chapters;
				}

				/**
				 * Log the chapter index: everything when it is short, else the first and last ten.
				 *
				 * @param {Array<object>} chapters - Chapter index to preview.
				 */
				function printChapterPreview(chapters) {
				    if (chapters.length <= 20) {
				        console.log("章节目录:");
				        console.log(JSON.stringify(chapters, null, 2));
				        return;
				    }
				    console.log("前10个章节:");
				    console.log(JSON.stringify(chapters.slice(0, 10), null, 2));
				    console.log("...");
				    console.log("后10个章节:");
				    console.log(JSON.stringify(chapters.slice(-10), null, 2));
				}

				/**
				 * Wrap chapter body HTML in a minimal standalone HTML document.
				 *
				 * @param {string} chapterTitle - Text for <title> and the <h1> heading.
				 * @param {string} bodyHtml - Chapter content HTML, inserted verbatim.
				 * @returns {string} Complete HTML document.
				 */
				function buildChapterHtml(chapterTitle, bodyHtml) {
				    return [
				        '<!DOCTYPE html>',
				        '<html>',
				        '<head>',
				        '    <meta charset="UTF-8">',
				        `    <title>${chapterTitle}</title>`,
				        '    <style>',
				        '        body {',
				        '            font-family: Arial, sans-serif;',
				        '            line-height: 1.6;',
				        '            margin: 0 auto;',
				        '            max-width: 800px;',
				        '            padding: 20px;',
				        '        }',
				        '        h1 {',
				        '            text-align: center;',
				        '            margin-bottom: 30px;',
				        '        }',
				        '        p {',
				        '            text-indent: 2em;',
				        '            margin-bottom: 1em;',
				        '        }',
				        '    </style>',
				        '</head>',
				        '<body>',
				        `    <h1>${chapterTitle}</h1>`,
				        `    ${bodyHtml}`,
				        '</body>',
				        '</html>'
				    ].join('\n');
				}

				/**
				 * Write one chapter to `<contentDir>/NNNN_<title>.html`.
				 *
				 * @param {string} contentDir - Directory receiving the chapter files.
				 * @param {{title: string, index: number}} chapter - Chapter being saved.
				 * @param {string} bodyHtml - Chapter content HTML.
				 */
				function saveChapterHtml(contentDir, chapter, bodyHtml) {
				    // Characters that are illegal in file names on common file systems become '_'.
				    const safeTitle = chapter.title.replace(/[\\/:*?"<>|]/g, '_');
				    const fileName = `${String(chapter.index).padStart(4, '0')}_${safeTitle}.html`;
				    fs.writeFileSync(path.join(contentDir, fileName), buildChapterHtml(chapter.title, bodyHtml));
				}

				/**
				 * Download every chapter's content, retrying failures up to three times.
				 *
				 * On success each chapter object gains a `content` property and an HTML file is
				 * written; failure reports and the combined JSON files are written to `outputDir`.
				 *
				 * @param {string} title - Novel slug, used as the prefix of all output file names.
				 * @param {string} outputDir - Existing directory that receives the output.
				 * @param {Array<object>} chapters - Chapter index; mutated in place.
				 * @param {object} headers - HTTP headers forwarded to fetchChapterContent.
				 * @returns {Promise<Array<{index: number, chapter: object, error: string}>>}
				 *     Chapters that still failed after all retries (empty when all succeeded).
				 */
				async function downloadChapterContents(title, outputDir, chapters, headers) {
				    console.log(`开始爬取章节内容...`);

				    const contentDir = path.join(outputDir, `${title}_contents`);
				    fs.mkdirSync(contentDir, { recursive: true });

				    const baseUrl = 'https://novelhi.com';
				    const contentsObj = {};
				    console.log(`将爬取 ${chapters.length}/${chapters.length} 个章节的内容`);

				    // Shared by the first pass and the retries: fetch one chapter and persist it.
				    const fetchAndStore = async (position) => {
				        const chapter = chapters[position];
				        const result = await fetchChapterContent(baseUrl, chapter.url, headers);
				        if (!result.content) {
				            return { ok: false, error: result.error };
				        }
				        chapter.content = result.content;
				        contentsObj[chapter.title] = result.content;
				        saveChapterHtml(contentDir, chapter, result.content);
				        return { ok: true, error: null };
				    };

				    let failedChapters = [];
				    for (let i = 0; i < chapters.length; i++) {
				        // Pause after every 10 chapters to avoid hammering the site.
				        if (i > 0 && i % 10 === 0) {
				            console.log(`已爬取 ${i}/${chapters.length} 章节，暂停 2 秒...`);
				            await setTimeout(2000);
				        }

				        const outcome = await fetchAndStore(i);
				        if (outcome.ok) {
				            if (i % 10 === 0 || i === chapters.length - 1) {
				                console.log(`已保存 ${i + 1}/${chapters.length} 章节`);
				            }
				        } else {
				            console.error(`获取章节 ${chapters[i].title} 内容失败: ${outcome.error}`);
				            failedChapters.push({ index: i, chapter: chapters[i], error: outcome.error });
				        }
				    }

				    if (failedChapters.length > 0) {
				        console.log(`首次爬取完成，有 ${failedChapters.length} 个章节失败，开始重试...`);

				        const failedChaptersPath = path.join(outputDir, `${title}_failed_chapters.json`);
				        fs.writeFileSync(failedChaptersPath, JSON.stringify(failedChapters, null, 2));
				        console.log(`已将失败章节信息保存到 ${failedChaptersPath}`);

				        for (let retry = 0; retry < 3 && failedChapters.length > 0; retry++) {
				            console.log(`第 ${retry + 1} 次重试，剩余 ${failedChapters.length} 个失败章节`);
				            await setTimeout(5000);

				            const stillFailedChapters = [];
				            for (const { index, chapter } of failedChapters) {
				                console.log(`重试章节: ${chapter.title}`);
				                const outcome = await fetchAndStore(index);
				                if (outcome.ok) {
				                    console.log(`重试成功: ${chapter.title}`);
				                } else {
				                    console.error(`重试失败: ${chapter.title}, 错误: ${outcome.error}`);
				                    stillFailedChapters.push({ index, chapter, error: outcome.error });
				                }
				                await setTimeout(2000);
				            }

				            failedChapters = stillFailedChapters;
				            fs.writeFileSync(failedChaptersPath, JSON.stringify(failedChapters, null, 2));
				            console.log(`第 ${retry + 1} 次重试后，还有 ${failedChapters.length} 个章节失败`);
				        }

				        if (failedChapters.length > 0) {
				            console.warn(`警告: 经过多次重试后，仍有 ${failedChapters.length} 个章节未能成功爬取`);
				            const finalFailedPath = path.join(outputDir, `${title}_final_failed_chapters.json`);
				            fs.writeFileSync(finalFailedPath, JSON.stringify(failedChapters, null, 2));
				            console.log(`已将最终失败章节信息保存到 ${finalFailedPath}`);
				        } else {
				            console.log(`所有章节都已成功爬取!`);
				        }
				    }

				    const allContentsPath = path.join(outputDir, `${title}_all_contents.json`);
				    fs.writeFileSync(allContentsPath, JSON.stringify(contentsObj, null, 2));
				    console.log(`已将所有章节内容保存到 ${allContentsPath} 文件`);

				    const chaptersWithContentPath = path.join(outputDir, `${title}_chapters_with_content.json`);
				    fs.writeFileSync(chaptersWithContentPath, JSON.stringify(chapters, null, 2));
				    console.log(`已将包含内容的章节目录保存到 ${chaptersWithContentPath} 文件`);

				    return failedChapters;
				}

				/**
				 * Crawl a novel's chapter index from novelhi.com and, optionally, every chapter's content.
				 *
				 * Always writes `<title>_chapters.json` into `outputDir` (created if missing). With
				 * `fetchContent`, also writes one HTML file per chapter under `<title>_contents/`,
				 * the combined `<title>_all_contents.json` and `<title>_chapters_with_content.json`,
				 * and failure reports when chapters could not be fetched.
				 *
				 * The outcome reflects this run only; failure reports left on disk by earlier runs
				 * are not consulted.
				 *
				 * @param {string} title - Novel slug as it appears in the site URL.
				 * @param {string} [outputDir='src/web_crawler'] - Directory receiving all output files.
				 * @param {boolean} [fetchContent=false] - Whether to download chapter contents too.
				 * @returns {Promise<object>} `{ chapters, failedChapters, success, message }`, or
				 *     `{ errcode: 101, errStr }` when the crawl aborted with an exception.
				 */
				async function crawleWeb(title, outputDir = 'src/web_crawler', fetchContent = false) {
				    const url = `https://novelhi.com/s/index/` + title;
				    console.log(`正在爬取网址: ${url}`);

				    try {
				        // Browser-like request headers.
				        const headers = {
				            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
				            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
				            'Accept-Language': 'en-US,en;q=0.5',
				            'Connection': 'keep-alive',
				            'Upgrade-Insecure-Requests': '1',
				            'Cache-Control': 'max-age=0'
				        };

				        const response = await axios.get(url, { headers });
				        console.log(`请求状态码: ${response.status}`);

				        const chapters = collectChapters(cheerio.load(response.data), title);
				        console.log(`共找到 ${chapters.length} 个章节`);

				        // Make sure the target directory exists before the first write.
				        fs.mkdirSync(outputDir, { recursive: true });

				        const outputFilePath = path.join(outputDir, `${title}_chapters.json`);
				        fs.writeFileSync(outputFilePath, JSON.stringify(chapters, null, 2));
				        console.log(`已将章节目录保存到 ${outputFilePath} 文件`);

				        printChapterPreview(chapters);

				        let failedChapters = [];
				        if (fetchContent && chapters.length > 0) {
				            failedChapters = await downloadChapterContents(title, outputDir, chapters, headers);
				        }

				        if (failedChapters.length > 0) {
				            return {
				                chapters,
				                failedChapters,
				                success: false,
				                message: `有 ${failedChapters.length} 个章节未能成功爬取`
				            };
				        }

				        return {
				            chapters,
				            failedChapters: [],
				            success: true,
				            message: "所有章节爬取成功"
				        };
				    } catch (err) {
				        console.error("爬取过程中出错:", err.message);
				        return { errcode: 101, errStr: err.message };
				    } finally {
				        console.log("完成");
				    }
				}
			
 
				+
			
 
				+
			
 
				/**
				 * Merge the per-chapter HTML files of a directory into one plain-text file.
				 *
				 * Files are ordered by their numeric `NNNN_` prefix. For each file the text inside
				 * `<div id="showReading">` is stripped of markup and appended as
				 * `<title>\n\n<text>` followed by a separator line. Files without that container
				 * are skipped with a warning. Errors are logged, not thrown.
				 *
				 * NOTE: the container is matched with a non-greedy regex, so extraction stops at
				 * the first `</div>`; content with nested <div> elements would be truncated.
				 *
				 * @param {string} sourceDir - Directory containing the `NNNN_<title>.html` files.
				 * @param {string} outputFile - Path of the merged text file to write.
				 * @returns {Promise<void>}
				 */
				async function mergeChapterFiles(sourceDir, outputFile) {
				    try {
				        const SEPARATOR = '\n******************\n';
				        const { readdir, readFile, writeFile } = fs.promises;

				        // Leading chapter number of a file name; names without one sort last.
				        const chapterNumber = (fileName) => {
				            const parsed = Number.parseInt(fileName, 10);
				            return Number.isNaN(parsed) ? Number.POSITIVE_INFINITY : parsed;
				        };

				        const files = (await readdir(sourceDir))
				            .filter((file) => file.endsWith('.html'))
				            .sort((a, b) => {
				                const left = chapterNumber(a);
				                const right = chapterNumber(b);
				                if (left !== right) {
				                    return left < right ? -1 : 1;
				                }
				                // Equal (or missing) numbers: fall back to name order so the sort stays deterministic.
				                if (a === b) {
				                    return 0;
				                }
				                return a < b ? -1 : 1;
				            });

				        if (files.length === 0) {
				            console.error('未找到任何章节文件');
				            return;
				        }

				        const sections = [];

				        for (const file of files) {
				            const html = await readFile(path.join(sourceDir, file), 'utf-8');

				            // Title = file name without the ".html" suffix and the "NNNN_" prefix, so
				            // titles that contain dots or underscores survive intact.
				            const title = file.replace(/\.html$/, '').replace(/^\d+_/, '');

				            const contentMatch = html.match(/<div id="showReading"[^>]*>([\s\S]*?)<\/div>/);
				            if (!contentMatch) {
				                console.warn(`跳过(未找到正文): ${file}`);
				                continue;
				            }

				            const content = contentMatch[1]
				                .replace(/<sent[^>]*>/g, '')
				                .replace(/<\/sent>/g, '')
				                // Accept <br>, <br/> and <br /> in any letter case.
				                .replace(/<br\s*\/?>/gi, '\n')
				                .replace(/<[^>]+>/g, '')
				                .replace(/\(adsbygoogle\s*=\s*window\.adsbygoogle\s*\|\|\s*\[\]\).push\(\{\}\);/g, '')
				                .replace(/\n{3,}/g, '\n\n');

				            sections.push(`${title}\n\n${content.trim()}${SEPARATOR}`);
				            console.log(`已处理: ${file}`);
				        }

				        await writeFile(outputFile, sections.join(''));
				        console.log(`\n合并完成! 结果已保存到: ${outputFile}`);
				        // Report the chapters actually merged, not the number of files found.
				        console.log(`共合并了 ${sections.length} 个章节`);

				    } catch (error) {
				        console.error('合并章节时出错:', error);
				    }
				}
			
 
				+
			
 
				/**
				 * Turn plain chapter text into simple HTML markup.
				 *
				 * Every "Chapter <number>" occurrence is wrapped in <h2>, each pair of consecutive
				 * newlines becomes a paragraph boundary, and the whole result is wrapped in one
				 * outer <p>...</p>.
				 *
				 * @param {string} content - Raw text content.
				 * @returns {Promise<string>} The HTML-formatted content.
				 */
				async function formatChapterContent(content) {
				    // "$&" re-inserts the matched heading text inside the <h2> tags.
				    const withHeadings = content.replace(/Chapter \d+/g, '<h2>$&</h2>');
				    const paragraphs = withHeadings.split('\n\n');
				    return `<p>${paragraphs.join('</p><p>')}</p>`;
				}
			
 
				+
			
 
				/**
				 * Convert a plain-text novel file into simple HTML: chapter headings become <h2>
				 * elements and blank-line-separated paragraphs become <p> elements.
				 *
				 * A heading is a line that starts with "Chapter <number>"; the whole line is kept
				 * as the heading text, so chapter numbers and titles come from the file rather
				 * than from a running counter, and "Chapter N" mentioned inside a sentence does
				 * not split the text. Text before the first heading is emitted as paragraphs
				 * without a heading. A file with no heading line at all is emitted as a single
				 * "Chapter 1". Errors are logged, not thrown.
				 *
				 * @param {string} filePath - Path of the text file to read.
				 * @param {string} outputPath - Path of the HTML file to write.
				 * @returns {Promise<void>}
				 */
				async function formatTextFile(filePath, outputPath) {
				    try {
				        const content = await fs.promises.readFile(filePath, 'utf-8');
				        console.log(`已读取文件: ${filePath}`);

				        // Splitting on a capturing pattern keeps the heading lines in the result:
				        // [preamble, heading 1, body 1, heading 2, body 2, ...].
				        const parts = content.split(/^[ \t]*(Chapter \d+.*)$/m);

				        const toParagraphs = (text) => text
				            .trim()
				            .split(/\n\s*\n/)
				            .map((paragraph) => paragraph.trim())
				            .filter(Boolean)
				            .map((paragraph) => `<p>${paragraph}</p>\n`)
				            .join('');

				        const sections = [];
				        for (let i = 1; i < parts.length; i += 2) {
				            sections.push({ heading: parts[i].trim(), body: parts[i + 1] });
				        }
				        console.log(`检测到 ${sections.length} 个章节内容块`);

				        let formattedContent = '';
				        if (sections.length === 0) {
				            // No heading lines: treat the whole text as one chapter.
				            if (content.trim() !== '') {
				                sections.push({ heading: 'Chapter 1', body: content });
				            }
				        } else {
				            formattedContent += toParagraphs(parts[0]);
				        }

				        for (const { heading, body } of sections) {
				            formattedContent += `<h2>${heading}</h2>\n`;
				            formattedContent += toParagraphs(body);
				        }

				        await fs.promises.writeFile(outputPath, formattedContent);
				        console.log(`格式化完成! 结果已保存到: ${outputPath}`);
				        console.log(`共处理了 ${sections.length} 个章节`);

				    } catch (error) {
				        console.error('处理文本文件时出错:', error);
				    }
				}
			
 
				+/**
			
 
				+ * 生成 EPUB 电子书
			
 
				+ * @param {string} contentFilePath - HTML 格式的正文文件路径
			
 
				+ * @param {string} coverImagePath - 封面图片路径
			
 
				+ * @param {string} outputPath - 输出 EPUB 文件路径
			
 
				+ * @param {string} bookTitle - 电子书标题
			
 
				+ * @param {string} author - 作者名称
			
 
				+ * @returns {Promise<void>}
			
 
				+ */
			
 
				+async function generateEpub(contentFilePath, coverImagePath, outputPath, bookTitle, author) {
			
 
				+    try {
			
 
				+        // 读取 HTML 内容
			
 
				+        const htmlContent = fs.readFileSync(contentFilePath, 'utf-8');
			
 
				+        const dom = new JSDOM(htmlContent);
			
 
				+        const document = dom.window.document;
			
 
				+
			
 
				+        // 提取章节 (h2 标签)
			
 
				+        const chapterElements = document.querySelectorAll('h2');
			
 
				+        const chapters = [];
			
 
				+
			
 
				+        // 处理每个章节
			
 
				+        chapterElements.forEach((chapterElement, index) => {
			
 
				+            const title = chapterElement.textContent.trim();
			
 
				+            let content = '';
			
 
				+
			
 
				+            // 收集当前章节的所有段落，直到下一个 h2 或文档结束
			
 
				+            let currentElement = chapterElement.nextElementSibling;
			
 
				+            while (currentElement && currentElement.tagName.toLowerCase() !== 'h2') {
			
 
				+                if (currentElement.tagName.toLowerCase() === 'p') {
			
 
				+                    // 每个 p 标签作为独立段落，用 <p> 标签包裹并添加样式类
			
 
				+                    content += `<p>${currentElement.innerHTML}</p>\n`;
			
 
				+                }
			
 
				+                currentElement = currentElement.nextElementSibling;
			
 
				+            }
			
 
				+
			
 
				+            chapters.push({ title, content });
			
 
				+        });
			
 
				+
			
 
				+        // 读取封面图片
			
 
				+        const coverImage = fs.readFileSync(coverImagePath);
			
 
				+
			
 
				+        // 创建 EPUB 容器
			
 
				+        const zip = new JSZip();
			
 
				+
			
 
				+        // 添加 mimetype 文件（必须是第一个文件，且不压缩）
			
 
				+        zip.file('mimetype', 'application/epub+zip', { compression: 'STORE' });
			
 
				+
			
 
				+        // 创建 META-INF 目录
			
 
				+        const metaInf = zip.folder('META-INF');
			
 
				+        metaInf.file('container.xml', `<?xml version="1.0"?>
			
 
				+<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
			
 
				+    <rootfiles>
			
 
				+        <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
			
 
				+    </rootfiles>
			
 
				+</container>`);
			
 
				+
			
 
				+        // 创建 OEBPS 目录
			
 
				+        const oebps = zip.folder('OEBPS');
			
 
				+
			
 
				+        // 添加封面图片
			
 
				+        oebps.file('Images/cover.jpg', coverImage);
			
 
				+
			
 
				+        // 生成封面页 XHTML
			
 
				+        const coverXhtml = `<?xml version="1.0" encoding="UTF-8"?>
			
 
				+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
			
 
				+<html xmlns="http://www.w3.org/1999/xhtml">
			
 
				+<head>
			
 
				+    <title>封面</title>
			
 
				+    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
			
 
				+    <meta name="calibre:cover" content="true"/>
			
 
				+    <link rel="stylesheet" type="text/css" href="../Styles/stylesheet.css"/>
			
 
				+    <style type="text/css">
			
 
				+        body {
			
 
				+            margin: 0;
			
 
				+            padding: 0;
			
 
				+            text-align: center;
			
 
				+        }
			
 
				+        img {
			
 
				+            max-width: 100%;
			
 
				+            height: auto;
			
 
				+            margin: 0;
			
 
				+            padding: 0;
			
 
				+        }
			
 
				+    </style>
			
 
				+</head>
			
 
				+<body>
			
 
				+    <div class="cover-container">
			
 
				+        <img src="../Images/cover.jpg" alt="封面"/>
			
 
				+    </div>
			
 
				+</body>
			
 
				+</html>`;
			
 
				+
			
 
				+        const textFolder = oebps.folder('Text');
			
 
				+        textFolder.file('cover.xhtml', coverXhtml);
			
 
				+
			
 
				+        // 生成章节 HTML 文件
			
 
				+        const spineItems = [
			
 
				+            { idref: 'cover', linear: 'no' }
			
 
				+        ];
			
 
				+        const manifestItems = [
			
 
				+            { id: 'cover', href: 'Text/cover.xhtml', mediaType: 'application/xhtml+xml', properties: 'cover-image' },
			
 
				+            { id: 'cover-image', href: 'Images/cover.jpg', mediaType: 'image/jpeg' },
			
 
				+            { id: 'ncx', href: 'toc.ncx', mediaType: 'application/x-dtbncx+xml' }
			
 
				+        ];
			
 
				+
			
 
				+        chapters.forEach((chapter, index) => {
			
 
				+            const { title, content } = chapter;
			
 
				+            const chapterId = `chapter_${index}`;
			
 
				+            const chapterFileName = `${chapterId}.xhtml`;
			
 
				+
			
 
				+            const chapterHtml = `<?xml version="1.0" encoding="UTF-8"?>
			
 
				+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
			
 
				+<html xmlns="http://www.w3.org/1999/xhtml">
			
 
				+<head>
			
 
				+    <title>${title}</title>
			
 
				+    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
			
 
				+    <link rel="stylesheet" type="text/css" href="../Styles/stylesheet.css"/>
			
 
				+</head>
			
 
				+<body>
			
 
				+    <h2 class="chapter-title">${title}</h2>
			
 
				+    <div class="chapter-content">
			
 
				+        ${content}
			
 
				+    </div>
			
 
				+</body>
			
 
				+</html>`;
			
 
				+
			
 
				+            textFolder.file(chapterFileName, chapterHtml);
			
 
				+            manifestItems.push({ id: chapterId, href: `Text/${chapterFileName}`, mediaType: 'application/xhtml+xml' });
			
 
				+            spineItems.push({ idref: chapterId });
			
 
				+        });
			
 
				+
			
 
				+        // 生成目录文件
			
 
				+        const tocHtml = `<?xml version="1.0" encoding="UTF-8"?>
			
 
				+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
			
 
				+<html xmlns="http://www.w3.org/1999/xhtml">
			
 
				+<head>
			
 
				+    <title>目录</title>
			
 
				+    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
			
 
				+    <link rel="stylesheet" type="text/css" href="../Styles/stylesheet.css"/>
			
 
				+</head>
			
 
				+<body>
			
 
				+    <h2 class="toc-title">目录</h2>
			
 
				+    <ol class="toc-list">
			
 
				+        ${chapters.map((chapter, index) => {
			
 
				+            return `<li class="toc-item"><a href="chapter_${index}.xhtml">${chapter.title}</a></li>`;
			
 
				+        }).join('\n')}
			
 
				+    </ol>
			
 
				+</body>
			
 
				+</html>`;
			
 
				+
			
 
				+        textFolder.file('toc.xhtml', tocHtml);
			
 
				+        manifestItems.push({ id: 'toc', href: 'Text/toc.xhtml', mediaType: 'application/xhtml+xml' });
			
 
				+
			
 
				+        // 添加 CSS 文件
			
 
				+        const cssFolder = oebps.folder('Styles');
			
 
				+        const csspath = path.join(__dirname, 'epub_styles.css');
			
 
				+        console.log("🚀 ~ generateEpub ~ csspath:", csspath)
			
 
				+
			
 
				+        const cssContent = fs.readFileSync(csspath, 'utf-8');
			
 
				+        cssFolder.file('stylesheet.css', cssContent);
			
 
				+        manifestItems.push({ id: 'stylesheet', href: 'Styles/stylesheet.css', mediaType: 'text/css' });
			
 
				+
			
 
				+        // 生成唯一标识符
			
 
				+        const bookUUID = uuidv4();
			
 
				+
			
 
				+        // 生成 toc.ncx 文件
			
 
				+        const tocNcx = `<?xml version='1.0' encoding='utf-8'?>
			
 
				+<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="zh-CN">
			
 
				+  <head>
			
 
				+    <meta content="${bookUUID}" name="dtb:uid"/>
			
 
				+    <meta content="2" name="dtb:depth"/>
			
 
				+    <meta content="0" name="dtb:totalPageCount"/>
			
 
				+    <meta content="0" name="dtb:maxPageNumber"/>
			
 
				+  </head>
			
 
				+  <docTitle>
			
 
				+    <text>${bookTitle}</text>
			
 
				+  </docTitle>
			
 
				+  <navMap>
			
 
				+    <navPoint id="navpoint-0" playOrder="0">
			
 
				+      <navLabel>
			
 
				+        <text>封面</text>
			
 
				+      </navLabel>
			
 
				+      <content src="Text/cover.xhtml"/>
			
 
				+    </navPoint>
			
 
				+    <navPoint id="navpoint-1" playOrder="1">
			
 
				+      <navLabel>
			
 
				+        <text>目录</text>
			
 
				+      </navLabel>
			
 
				+      <content src="Text/toc.xhtml"/>
			
 
				+    </navPoint>
			
 
				+    ${chapters.map((chapter, index) => {
			
 
				+            return `<navPoint id="navpoint-${index + 2}" playOrder="${index + 2}">
			
 
				+      <navLabel>
			
 
				+        <text>${chapter.title}</text>
			
 
				+      </navLabel>
			
 
				+      <content src="Text/chapter_${index}.xhtml"/>
			
 
				+    </navPoint>`;
			
 
				+        }).join('\n')}
			
 
				+  </navMap>
			
 
				+</ncx>`;
			
 
				+
			
 
				+        oebps.file('toc.ncx', tocNcx);
			
 
				+
			
 
				+        // 生成 content.opf 文件
			
 
				+        const contentOpf = `<?xml version="1.0" encoding="UTF-8"?>
			
 
				+<package xmlns="http://www.idpf.org/2007/opf" version="2.0" unique-identifier="uuid_id">
			
 
				+    <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
			
 
				+        <dc:identifier id="uuid_id">urn:uuid:${bookUUID}</dc:identifier>
			
 
				+        <dc:title>${bookTitle}</dc:title>
			
 
				+        <dc:creator>${author}</dc:creator>
			
 
				+        <dc:language>zh-CN</dc:language>
			
 
				+        <dc:date>${new Date().toISOString().split('T')[0]}</dc:date>
			
 
				+        <meta name="cover" content="cover-image"/>
			
 
				+    </metadata>
			
 
				+    <manifest>
			
 
				+        ${manifestItems.map(item => `<item id="${item.id}" href="${item.href}" media-type="${item.mediaType}"/>`).join('\n')}
			
 
				+    </manifest>
			
 
				+    <spine toc="ncx">
			
 
				+        <itemref idref="cover"/>
			
 
				+        <itemref idref="toc"/>
			
 
				+        ${spineItems.map(item => `<itemref idref="${item.idref}"/>`).join('\n')}
			
 
				+    </spine>
			
 
				+    <guide>
			
 
				+        <reference type="cover" title="封面" href="Text/cover.xhtml"/>
			
 
				+    </guide>
			
 
				+</package>`;
			
 
				+
			
 
				+        oebps.file('content.opf', contentOpf);
			
 
				+
			
 
				+        // 生成 EPUB 文件
			
 
				+        const epubContent = await zip.generateAsync({
			
 
				+            type: 'nodebuffer',
			
 
				+            compression: 'DEFLATE',
			
 
				+            mimeType: 'application/epub+zip'
			
 
				+        });
			
 
				+        fs.writeFileSync(outputPath, epubContent);
			
 
				+
			
 
				+        console.log(`EPUB 电子书已生成: ${outputPath}`);
			
 
				+    } catch (error) {
			
 
				+        console.error('生成 EPUB 电子书时出错:', error);
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+const title = "The-Legendary-Mechanic";
			
 
				+const author = "Qi Peijia";
			
 
				+const coverName = "cover.jpg";
			
 
				+
			
 
				+let sourceDir = path.join(__dirname, title + '/' + title + '_contents');
			
 
				+let outputFile = path.join(__dirname, title + '/' + title + '.txt');
			
 
				+let outputFile2 = path.join(__dirname, title + '/' + title + '.html');
			
 
				+let coverFile = path.join(__dirname, title + '/' + coverName);
			
 
				+let epubFile = path.join(__dirname, title + '/' + title + '.epub');
			
 
				+
			
 
				+console.log(`开始爬取小说: ${title}`);
			
 
				+console.log(`输出目录: ${title}`);
			
 
				+
			
 
				+// 确保输出目录存在
			
 
				+if (!fs.existsSync(title)) {
			
 
				+    fs.mkdirSync(title, { recursive: true });
			
 
				+}
			
 
				+
			
 
				+// 爬取小说内容
			
 
				+let crawlResult;
			
 
				+try {
			
 
				+    crawlResult = await crawleWeb(title, title, true);
			
 
				+    
			
 
				+    if (crawlResult && crawlResult.errcode) {
			
 
				+        console.error(`爬取失败: ${crawlResult.errStr}`);
			
 
				+        process.exit(1);
			
 
				+    } else if (crawlResult && !crawlResult.success) {
			
 
				+        console.error(`爬取完成但存在问题: ${crawlResult.message}`);
			
 
				+        console.error(`有 ${crawlResult.failedChapters.length} 个章节未能成功爬取`);
			
 
				+        process.exit(1);
			
 
				+    } else {
			
 
				+        console.log("爬取任务完成！所有章节爬取成功");
			
 
				+    }
			
 
				+} catch (err) {
			
 
				+    console.error("程序执行出错:", err);
			
 
				+    process.exit(1);
			
 
				+}
			
 
				+
			
 
				+// 检查是否有最终失败的章节
			
 
				+const finalFailedPath = path.join(title, `${title}_final_failed_chapters.json`);
			
 
				+if (fs.existsSync(finalFailedPath)) {
			
 
				+    try {
			
 
				+        const failedChapters = JSON.parse(fs.readFileSync(finalFailedPath, 'utf-8'));
			
 
				+        if (failedChapters && failedChapters.length > 0) {
			
 
				+            console.error(`警告: 有 ${failedChapters.length} 个章节未能成功爬取，不进行电子书生成`);
			
 
				+            console.error('失败的章节:');
			
 
				+            failedChapters.forEach(item => {
			
 
				+                console.error(`- 章节 ${item.chapter.title} (索引: ${item.index}): ${item.error}`);
			
 
				+            });
			
 
				+            process.exit(1);
			
 
				+        }
			
 
				+    } catch (err) {
			
 
				+        console.error("读取失败章节文件出错:", err);
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+// 检查内容目录是否存在
			
 
				+if (!fs.existsSync(sourceDir)) {
			
 
				+    console.error(`错误: 内容目录 ${sourceDir} 不存在，无法生成电子书`);
			
 
				+    process.exit(1);
			
 
				+}
			
 
				+
			
 
				+// 检查内容目录中的文件数量
			
 
				+const contentFiles = fs.readdirSync(sourceDir).filter(file => file.endsWith('.html'));
			
 
				+if (contentFiles.length === 0) {
			
 
				+    console.error(`错误: 内容目录 ${sourceDir} 中没有HTML文件，无法生成电子书`);
			
 
				+    process.exit(1);
			
 
				+}
			
 
				+
			
 
				+console.log(`开始生成电子书，共有 ${contentFiles.length} 个章节文件`);
			
 
				+
			
 
				+// 合并章节文件
			
 
				+try {
			
 
				+    await mergeChapterFiles(sourceDir, outputFile);
			
 
				+    console.log('合并操作完成');
			
 
				+} catch (err) {
			
 
				+    console.error('合并操作失败:', err);
			
 
				+    process.exit(1);
			
 
				+}
			
 
				+
			
 
				+// 格式化文本文件
			
 
				+try {
			
 
				+    await formatTextFile(outputFile, outputFile2);
			
 
				+} catch (err) {
			
 
				+    console.error('格式化文本文件失败:', err);
			
 
				+    process.exit(1);
			
 
				+}
			
 
				+
			
 
				+// 检查封面文件是否存在
			
 
				+if (!fs.existsSync(coverFile)) {
			
 
				+    console.warn(`警告: 封面文件 ${coverFile} 不存在，将使用默认封面`);
			
 
				+    // 创建一个简单的默认封面
			
 
				+    const defaultCoverPath = path.join(__dirname, 'default_cover.jpg');
			
 
				+    if (fs.existsSync(defaultCoverPath)) {
			
 
				+        coverFile = defaultCoverPath;
			
 
				+    } else {
			
 
				+        console.error('错误: 默认封面文件也不存在，无法生成电子书');
			
 
				+        process.exit(1);
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+// 生成EPUB电子书
			
 
				+try {
			
 
				+    await generateEpub(outputFile2, coverFile, epubFile, title, author);
			
 
				+    console.log(`电子书生成成功: ${epubFile}`);
			
 
				+} catch (err) {
			
 
				+    console.error('生成EPUB电子书失败:', err);
			
 
				+    process.exit(1);
			
 
				+}