6 months ago · 6cc84ce9cb
--- a/src/web_crawler/crawle_english.js
+++ b/src/web_crawler/crawle_english.js
@@ -14,6 +14,53 @@ const __dirname = path.dirname(__filename);
 
				 const WEB_URL=`https://novelhi.com`;
			
 
				 const WEB_URL_INDEX=`/s/index/`;
			
 
				 
			
 
				+const title = "Throne-of-Magical-Arcana"; // Book identifier; used further down as the directory name and file-name stem of the generated outputs.
			
 
				+const author = "Cuttlefish That Loves Diving"; // NOTE(review): presumably the author written into the generated book metadata; usage is not visible here, confirm.
			
 
				+const coverName = "cover.jpg"; // NOTE(review): presumably the cover image file name; usage is not visible here, confirm.
			
 
				+
			
 
				/**
				 * Check whether a usable chapter file has already been saved.
				 *
				 * The file is looked up inside `contentDir` under the name
				 * `<index zero-padded to 4 digits>_<title with \ / : * ? " < > | replaced by _>.html`.
				 * Only a non-empty regular file counts: a directory with that name, or a
				 * zero-byte file left behind by an interrupted write, is reported as missing
				 * so that the caller fetches the chapter again instead of skipping it.
				 *
				 * @param {string} contentDir - Directory holding the chapter HTML files.
				 * @param {object} chapter - Chapter object; its `index` and `title` are read.
				 * @returns {boolean} True when a non-empty chapter file exists; false when it
				 *     does not, or when the check itself fails (the error is logged).
				 */
				function isChapterFileExists(contentDir, chapter) {
				    try {
				        const paddedIndex = String(chapter.index).padStart(4, '0');
				        const safeTitle = chapter.title.replace(/[\\/:*?"<>|]/g, '_');
				        const chapterFilePath = path.join(contentDir, `${paddedIndex}_${safeTitle}.html`);

				        // throwIfNoEntry: false makes a missing file yield undefined rather than
				        // an exception, so the common "not downloaded yet" case stays quiet.
				        const stats = fs.statSync(chapterFilePath, { throwIfNoEntry: false });
				        return stats !== undefined && stats.isFile() && stats.size > 0;
				    } catch (error) {
				        console.error(`检查章节文件是否存在时出错: ${error.message}`);
				        return false;
				    }
				}
			
 
				+
			
 
				/**
				 * Load the body content of a chapter from its previously saved HTML file.
				 *
				 * The file is looked up inside `contentDir` under the name
				 * `<index zero-padded to 4 digits>_<title with \ / : * ? " < > | replaced by _>.html`.
				 * The returned text is whatever sits between the heading (`<h1>...</h1>`) that
				 * directly follows the opening `<body>` tag and the closing `</body>` tag,
				 * without the surrounding whitespace.
				 *
				 * @param {string} contentDir - Directory holding the chapter HTML files.
				 * @param {object} chapter - Chapter object; its `index` and `title` are read.
				 * @returns {string|null} The extracted content, or null when the file is
				 *     missing, cannot be read, or does not have the expected structure.
				 */
				function loadExistingChapterContent(contentDir, chapter) {
				    // Tolerates attributes on <body>, upper-case tags and a heading that wraps
				    // over several lines; `.` would stop at a newline, hence [\s\S].
				    const bodyContentPattern = /<body(?:\s[^>]*)?>\s*<h1[^>]*>[\s\S]*?<\/h1>\s*([\s\S]*?)\s*<\/body>/i;

				    try {
				        const paddedIndex = String(chapter.index).padStart(4, '0');
				        const safeTitle = chapter.title.replace(/[\\/:*?"<>|]/g, '_');
				        const chapterFilePath = path.join(contentDir, `${paddedIndex}_${safeTitle}.html`);

				        // Read directly instead of checking existence first: one system call
				        // less and no window in which the file can vanish between the two.
				        const htmlContent = fs.readFileSync(chapterFilePath, 'utf-8');
				        const contentMatch = htmlContent.match(bodyContentPattern);
				        return contentMatch ? contentMatch[1] : null;
				    } catch (error) {
				        // A missing file is the expected "nothing saved yet" case, not an error.
				        if (error.code !== 'ENOENT') {
				            console.error(`加载已存在章节内容时出错: ${error.message}`);
				        }
				        return null;
				    }
				}
			
 
				+
			
 
				 /**
			
 
				  * 爬取单个章节内容
			
 
				  * @param {string} baseUrl - 网站基础URL
			
@@ -241,6 +288,19 @@ async function crawleWeb(title, outputDir = 'src/web_crawler', fetchContent = fa
 
				                     console.log(`已爬取 ${i}/${chaptersToFetch.length} 章节，暂停 2 秒...`);
			
 
				                     await setTimeout(2000);
			
 
				                 }
			
 
				+                
			
 
				+                // 检查章节文件是否已存在
			
 
				+                if (isChapterFileExists(contentDir, chapter)) {
			
 
				+                    console.log(`章节 ${chapter.title} 已存在，跳过爬取`);
			
 
				+                    // 加载已存在的章节内容
			
 
				+                    const existingContent = loadExistingChapterContent(contentDir, chapter);
			
 
				+                    if (existingContent) {
			
 
				+                        chapters[i].content = existingContent;
			
 
				+                        contentsObj[chapter.title] = existingContent;
			
 
				+                        console.log(`已加载章节 ${chapter.title} 的现有内容`);
			
 
				+                    }
			
 
				+                    continue;
			
 
				+                }
			
 
				 
			
 
				                 // 爬取章节内容
			
 
				                 const result = await fetchChapterContent(baseUrl, chapter.url, headers);
			
@@ -323,6 +383,19 @@ async function crawleWeb(title, outputDir = 'src/web_crawler', fetchContent = fa
 
				                         const { index, chapter } = failedItem;
			
 
				                         console.log(`重试章节: ${chapter.title}`);
			
 
				                         
			
 
				+                        // 检查章节文件是否已存在
			
 
				+                        if (isChapterFileExists(contentDir, chapter)) {
			
 
				+                            console.log(`章节 ${chapter.title} 已存在，跳过重试爬取`);
			
 
				+                            // 加载已存在的章节内容
			
 
				+                            const existingContent = loadExistingChapterContent(contentDir, chapter);
			
 
				+                            if (existingContent) {
			
 
				+                                chapters[index].content = existingContent;
			
 
				+                                contentsObj[chapter.title] = existingContent;
			
 
				+                                console.log(`已加载章节 ${chapter.title} 的现有内容`);
			
 
				+                                continue;
			
 
				+                            }
			
 
				+                        }
			
 
				+                        
			
 
				                         // 重新爬取章节内容
			
 
				                         const result = await fetchChapterContent(baseUrl, chapter.url, headers);
			
 
				                         
			
@@ -807,10 +880,6 @@ async function generateEpub(contentFilePath, coverImagePath, outputPath, bookTit
 
				 
			
 
				 
			
 
				 
			
 
				-const title = "The-Legendary-Mechanic";
			
 
				-const author = "Qi Peijia";
			
 
				-const coverName = "cover.jpg";
			
 
				-
			
 
				 let sourceDir = path.join(__dirname, title + '/' + title + '_contents');
			
 
				 let outputFile = path.join(__dirname, title + '/' + title + '.txt');
			
 
				 let outputFile2 = path.join(__dirname, title + '/' + title + '.html');