|
|
@@ -14,6 +14,53 @@ const __dirname = path.dirname(__filename);
|
|
14
|
14
|
const WEB_URL=`https://novelhi.com`;
|
|
15
|
15
|
const WEB_URL_INDEX=`/s/index/`;
|
|
16
|
16
|
|
|
|
17
|
+const title = "Throne-of-Magical-Arcana";
|
|
|
18
|
+const author = "Cuttlefish That Loves Diving";
|
|
|
19
|
+const coverName = "cover.jpg";
|
|
|
20
|
+
|
|
|
21
|
/**
 * Build the on-disk path of a chapter's HTML file.
 *
 * The file name is the chapter index zero-padded to 4 digits, an underscore,
 * the title with the characters \ / : * ? " < > | each replaced by "_", and
 * the ".html" extension. Both the existence check and the loader derive the
 * path here so the naming rule lives in exactly one place.
 *
 * @param {string} contentDir - Directory that holds the chapter files.
 * @param {object} chapter - Chapter object; `index` and `title` are read.
 * @returns {string} Path of the chapter file inside `contentDir`.
 * @throws {TypeError} If `chapter` or `chapter.title` cannot be used as expected.
 */
function chapterHtmlFilePath(contentDir, chapter) {
  const paddedIndex = String(chapter.index).padStart(4, '0');
  const safeTitle = chapter.title.replace(/[\\/:*?"<>|]/g, '_');
  return path.join(contentDir, `${paddedIndex}_${safeTitle}.html`);
}

/**
 * Check whether the HTML file for a chapter already exists.
 *
 * Best effort: any error raised while building or probing the path is logged
 * and reported as "does not exist".
 *
 * @param {string} contentDir - Directory that holds the chapter files.
 * @param {object} chapter - Chapter object; `index` and `title` are read.
 * @returns {boolean} `true` when something exists at the chapter file path.
 */
function isChapterFileExists(contentDir, chapter) {
  try {
    return fs.existsSync(chapterHtmlFilePath(contentDir, chapter));
  } catch (error) {
    console.error(`检查章节文件是否存在时出错: ${error.message}`);
    return false;
  }
}

/**
 * Load the body text of a chapter from its previously saved HTML file.
 *
 * The returned text is whatever sits between the first `</h1>` that follows
 * the opening body tag and the closing `</body>`, with surrounding whitespace
 * trimmed by the pattern itself.
 *
 * @param {string} contentDir - Directory that holds the chapter files.
 * @param {object} chapter - Chapter object; `index` and `title` are read.
 * @returns {string|null} The extracted content, or `null` when the file is
 *   missing, does not have the expected body/heading layout, or cannot be read
 *   (read failures other than a missing file are logged).
 */
function loadExistingChapterContent(contentDir, chapter) {
  // Tolerates attributes on <body>, any letter case in the tags, and a heading
  // that wraps over several lines, so such files are not treated as empty.
  const bodyAfterHeading =
    /<body\b[^>]*>\s*<h1\b[^>]*>[\s\S]*?<\/h1>\s*([\s\S]*?)\s*<\/body>/i;

  try {
    const chapterFilePath = chapterHtmlFilePath(contentDir, chapter);

    let htmlContent;
    try {
      // Read directly instead of existsSync + read: one syscall fewer and no
      // window in which the file can vanish between the check and the read.
      htmlContent = fs.readFileSync(chapterFilePath, 'utf-8');
    } catch (readError) {
      if (readError.code === 'ENOENT') {
        return null;
      }
      throw readError;
    }

    const contentMatch = htmlContent.match(bodyAfterHeading);
    return contentMatch ? contentMatch[1] : null;
  } catch (error) {
    console.error(`加载已存在章节内容时出错: ${error.message}`);
    return null;
  }
}
|
|
|
63
|
+
|
|
17
|
64
|
/**
|
|
18
|
65
|
* 爬取单个章节内容
|
|
19
|
66
|
* @param {string} baseUrl - 网站基础URL
|
|
|
@@ -241,6 +288,19 @@ async function crawleWeb(title, outputDir = 'src/web_crawler', fetchContent = fa
|
|
241
|
288
|
console.log(`已爬取 ${i}/${chaptersToFetch.length} 章节,暂停 2 秒...`);
|
|
242
|
289
|
await setTimeout(2000);
|
|
243
|
290
|
}
|
|
|
291
|
+
|
|
|
292
|
+ // 检查章节文件是否已存在
|
|
|
293
|
+ if (isChapterFileExists(contentDir, chapter)) {
|
|
|
294
|
+ console.log(`章节 ${chapter.title} 已存在,跳过爬取`);
|
|
|
295
|
+ // 加载已存在的章节内容
|
|
|
296
|
+ const existingContent = loadExistingChapterContent(contentDir, chapter);
|
|
|
297
|
+ if (existingContent) {
|
|
|
298
|
+ chapters[i].content = existingContent;
|
|
|
299
|
+ contentsObj[chapter.title] = existingContent;
|
|
|
300
|
+ console.log(`已加载章节 ${chapter.title} 的现有内容`);
|
|
|
301
|
+ }
|
|
|
302
|
+ continue;
|
|
|
303
|
+ }
|
|
244
|
304
|
|
|
245
|
305
|
// 爬取章节内容
|
|
246
|
306
|
const result = await fetchChapterContent(baseUrl, chapter.url, headers);
|
|
|
@@ -323,6 +383,19 @@ async function crawleWeb(title, outputDir = 'src/web_crawler', fetchContent = fa
|
|
323
|
383
|
const { index, chapter } = failedItem;
|
|
324
|
384
|
console.log(`重试章节: ${chapter.title}`);
|
|
325
|
385
|
|
|
|
386
|
+ // 检查章节文件是否已存在
|
|
|
387
|
+ if (isChapterFileExists(contentDir, chapter)) {
|
|
|
388
|
+ console.log(`章节 ${chapter.title} 已存在,跳过重试爬取`);
|
|
|
389
|
+ // 加载已存在的章节内容
|
|
|
390
|
+ const existingContent = loadExistingChapterContent(contentDir, chapter);
|
|
|
391
|
+ if (existingContent) {
|
|
|
392
|
+ chapters[index].content = existingContent;
|
|
|
393
|
+ contentsObj[chapter.title] = existingContent;
|
|
|
394
|
+ console.log(`已加载章节 ${chapter.title} 的现有内容`);
|
|
|
395
|
+ continue;
|
|
|
396
|
+ }
|
|
|
397
|
+ }
|
|
|
398
|
+
|
|
326
|
399
|
// 重新爬取章节内容
|
|
327
|
400
|
const result = await fetchChapterContent(baseUrl, chapter.url, headers);
|
|
328
|
401
|
|
|
|
@@ -807,10 +880,6 @@ async function generateEpub(contentFilePath, coverImagePath, outputPath, bookTit
|
|
807
|
880
|
|
|
808
|
881
|
|
|
809
|
882
|
|
|
810
|
|
-const title = "The-Legendary-Mechanic";
|
|
811
|
|
-const author = "Qi Peijia";
|
|
812
|
|
-const coverName = "cover.jpg";
|
|
813
|
|
-
|
|
814
|
883
|
let sourceDir = path.join(__dirname, title + '/' + title + '_contents');
|
|
815
|
884
|
let outputFile = path.join(__dirname, title + '/' + title + '.txt');
|
|
816
|
885
|
let outputFile2 = path.join(__dirname, title + '/' + title + '.html');
|