chengjie 2 months ago
parent
commit
6cc84ce9cb
1 changed file with 73 additions and 4 deletions
  1. 73 4
      src/web_crawler/crawle_english.js

+ 73 - 4
src/web_crawler/crawle_english.js

@@ -14,6 +14,53 @@ const __dirname = path.dirname(__filename);
14 14
 const WEB_URL=`https://novelhi.com`;
15 15
 const WEB_URL_INDEX=`/s/index/`;
16 16
 
17
+const title = "Throne-of-Magical-Arcana";
18
+const author = "Cuttlefish That Loves Diving";
19
+const coverName = "cover.jpg";
20
+
21
/**
 * Report whether a usable chapter file has already been saved to disk.
 *
 * The expected file name is the chapter index zero-padded to four digits,
 * an underscore, the chapter title with the characters \ / : * ? " < > |
 * each replaced by an underscore, and the ".html" extension.
 *
 * @param {string} contentDir - Directory that holds the saved chapter files.
 * @param {object} chapter - Chapter descriptor; `index` and `title` are read.
 * @returns {boolean} True only when the path is a regular, non-empty file;
 *     false when it is missing, is not a regular file, is empty, or when the
 *     check itself throws (the error is logged).
 */
function isChapterFileExists(contentDir, chapter) {
    try {
        const safeTitle = chapter.title.replace(/[\\/:*?"<>|]/g, '_');
        const chapterFileName = `${String(chapter.index).padStart(4, '0')}_${safeTitle}.html`;
        const chapterFilePath = path.join(contentDir, chapterFileName);

        // Callers use a true result to skip fetching the chapter, so a
        // directory with the same name or a zero-byte file (presumably left
        // by an interrupted write) must not count as an existing chapter.
        const stats = fs.statSync(chapterFilePath, { throwIfNoEntry: false });
        return stats !== undefined && stats.isFile() && stats.size > 0;
    } catch (error) {
        console.error(`检查章节文件是否存在时出错: ${error.message}`);
        return false;
    }
}
37
+
38
/**
 * Read a previously saved chapter file and return the markup that follows
 * its heading.
 *
 * The file name is built the same way as when checking for existence: the
 * chapter index zero-padded to four digits, an underscore, the title with
 * the characters \ / : * ? " < > | replaced by underscores, then ".html".
 * The file is expected to contain a <body> whose first element is an <h1>;
 * everything between that heading and </body> (surrounding whitespace
 * trimmed) is returned.
 *
 * @param {string} contentDir - Directory that holds the saved chapter files.
 * @param {object} chapter - Chapter descriptor; `index` and `title` are read.
 * @returns {string|null} The chapter markup, or null when the file is
 *     missing, cannot be read (the error is logged), or does not have the
 *     expected <body>/<h1> layout.
 */
function loadExistingChapterContent(contentDir, chapter) {
    try {
        const safeTitle = chapter.title.replace(/[\\/:*?"<>|]/g, '_');
        const chapterFileName = `${String(chapter.index).padStart(4, '0')}_${safeTitle}.html`;
        const chapterFilePath = path.join(contentDir, chapterFileName);

        // Read directly instead of checking existence first: a missing file
        // is reported by the read itself, which avoids a check-then-use gap.
        let htmlContent;
        try {
            htmlContent = fs.readFileSync(chapterFilePath, 'utf-8');
        } catch (readError) {
            if (readError.code === 'ENOENT') {
                return null;
            }
            throw readError;
        }

        // Tolerate attributes on <body>/<h1>, headings that span several
        // lines ([\s\S] instead of '.'), and upper-case tag names.
        const contentMatch = htmlContent.match(
            /<body\b[^>]*>\s*<h1\b[^>]*>[\s\S]*?<\/h1>\s*([\s\S]*?)\s*<\/body>/i
        );
        return contentMatch ? contentMatch[1] : null;
    } catch (error) {
        console.error(`加载已存在章节内容时出错: ${error.message}`);
        return null;
    }
}
63
+
17 64
 /**
18 65
  * 爬取单个章节内容
19 66
  * @param {string} baseUrl - 网站基础URL
@@ -241,6 +288,19 @@ async function crawleWeb(title, outputDir = 'src/web_crawler', fetchContent = fa
241 288
                     console.log(`已爬取 ${i}/${chaptersToFetch.length} 章节,暂停 2 秒...`);
242 289
                     await setTimeout(2000);
243 290
                 }
291
+                
292
+                // 检查章节文件是否已存在
293
+                if (isChapterFileExists(contentDir, chapter)) {
294
+                    console.log(`章节 ${chapter.title} 已存在,跳过爬取`);
295
+                    // 加载已存在的章节内容
296
+                    const existingContent = loadExistingChapterContent(contentDir, chapter);
297
+                    if (existingContent) {
298
+                        chapters[i].content = existingContent;
299
+                        contentsObj[chapter.title] = existingContent;
300
+                        console.log(`已加载章节 ${chapter.title} 的现有内容`);
301
+                    }
302
+                    continue;
303
+                }
244 304
 
245 305
                 // 爬取章节内容
246 306
                 const result = await fetchChapterContent(baseUrl, chapter.url, headers);
@@ -323,6 +383,19 @@ async function crawleWeb(title, outputDir = 'src/web_crawler', fetchContent = fa
323 383
                         const { index, chapter } = failedItem;
324 384
                         console.log(`重试章节: ${chapter.title}`);
325 385
                         
386
+                        // 检查章节文件是否已存在
387
+                        if (isChapterFileExists(contentDir, chapter)) {
388
+                            console.log(`章节 ${chapter.title} 已存在,跳过重试爬取`);
389
+                            // 加载已存在的章节内容
390
+                            const existingContent = loadExistingChapterContent(contentDir, chapter);
391
+                            if (existingContent) {
392
+                                chapters[index].content = existingContent;
393
+                                contentsObj[chapter.title] = existingContent;
394
+                                console.log(`已加载章节 ${chapter.title} 的现有内容`);
395
+                                continue;
396
+                            }
397
+                        }
398
+                        
326 399
                         // 重新爬取章节内容
327 400
                         const result = await fetchChapterContent(baseUrl, chapter.url, headers);
328 401
                         
@@ -807,10 +880,6 @@ async function generateEpub(contentFilePath, coverImagePath, outputPath, bookTit
807 880
 
808 881
 
809 882
 
810
-const title = "The-Legendary-Mechanic";
811
-const author = "Qi Peijia";
812
-const coverName = "cover.jpg";
813
-
814 883
 let sourceDir = path.join(__dirname, title + '/' + title + '_contents');
815 884
 let outputFile = path.join(__dirname, title + '/' + title + '.txt');
816 885
 let outputFile2 = path.join(__dirname, title + '/' + title + '.html');