|
|
@@ -12,10 +12,53 @@ import iconv from 'iconv-lite';
|
|
12
|
12
|
const __filename = fileURLToPath(import.meta.url);
|
|
13
|
13
|
const __dirname = path.dirname(__filename);
|
|
14
|
14
|
|
|
15
|
|
-const WEB_URL="https://www.piaotia.com/html/3/3759/";
|
|
16
|
|
-const title = "奥术神座";
|
|
17
|
|
-const author = "爱潜水的乌贼";
|
|
18
|
|
-const coverName = "cover.jpeg";
|
|
|
15
|
+const WEB_URL="https://www.piaotia.com/html/5/5150/";
|
|
|
16
|
+const title = "回到过去变成猫";
|
|
|
17
|
+const author = "陈词懒调";
|
|
|
18
|
+const coverName = "cover.png";
|
|
|
19
|
+
|
|
|
20
|
/**
 * Checks whether a chapter has already been saved to disk.
 *
 * The expected file name is `<index zero-padded to 4 digits>_<title>.html`,
 * where the characters \ / : * ? " < > | in the title are replaced with `_`.
 *
 * Only a regular, non-empty file counts as "existing". A directory with the
 * same name, or a zero-byte file, is reported as missing so that callers do
 * not mistake it for a saved chapter.
 *
 * @param {string} contentDir - Directory that holds the chapter files.
 * @param {object} chapter - Chapter object; its `index` and `title` are read.
 * @returns {boolean} `true` when a non-empty chapter file exists; `false`
 *   otherwise, including when any error occurs while checking.
 */
function isChapterFileExists(contentDir, chapter) {
  try {
    const safeTitle = chapter.title.replace(/[\\/:*?"<>|]/g, '_');
    const chapterFileName = `${String(chapter.index).padStart(4, '0')}_${safeTitle}.html`;
    const chapterFilePath = path.join(contentDir, chapterFileName);

    // statSync (rather than existsSync) lets us reject directories and
    // zero-byte files, which would otherwise be reported as saved chapters.
    const stats = fs.statSync(chapterFilePath);
    return stats.isFile() && stats.size > 0;
  } catch (error) {
    // A missing path is the normal "not downloaded yet" case, not a failure,
    // so it is reported as `false` without logging.
    if (error.code === 'ENOENT' || error.code === 'ENOTDIR') {
      return false;
    }
    console.error(`检查章节文件是否存在时出错: ${error.message}`);
    return false;
  }
}
|
|
|
36
|
+
|
|
|
37
|
/**
 * Loads the body content of a chapter file that was saved earlier.
 *
 * The file name is `<index zero-padded to 4 digits>_<title>.html`, where the
 * characters \ / : * ? " < > | in the title are replaced with `_`. The file
 * is read as UTF-8 and everything between the first `</h1>` inside `<body>`
 * and the closing `</body>` is returned, with surrounding whitespace trimmed.
 *
 * @param {string} contentDir - Directory that holds the chapter files.
 * @param {object} chapter - Chapter object; its `index` and `title` are read.
 * @returns {string|null} The extracted chapter content, or `null` when the
 *   file does not exist, does not have the expected structure, or cannot be
 *   read.
 */
function loadExistingChapterContent(contentDir, chapter) {
  try {
    const safeTitle = chapter.title.replace(/[\\/:*?"<>|]/g, '_');
    const chapterFileName = `${String(chapter.index).padStart(4, '0')}_${safeTitle}.html`;
    const chapterFilePath = path.join(contentDir, chapterFileName);

    if (!fs.existsSync(chapterFilePath)) {
      return null;
    }

    const htmlContent = fs.readFileSync(chapterFilePath, 'utf-8');

    // Tolerate attributes on <body>, any tag case, and a heading that spans
    // several lines; the capture group is the text after the heading.
    const contentMatch = htmlContent.match(
      /<body\b[^>]*>\s*<h1[^>]*>[\s\S]*?<\/h1>\s*([\s\S]*?)\s*<\/body>/i
    );
    if (!contentMatch) {
      // The file exists but nothing could be extracted: say so instead of
      // returning null silently, so the missing content can be diagnosed.
      console.warn(`章节文件结构不符合预期,无法提取正文: ${chapterFilePath}`);
      return null;
    }

    return contentMatch[1];
  } catch (error) {
    console.error(`加载已存在章节内容时出错: ${error.message}`);
    return null;
  }
}
|
|
19
|
62
|
|
|
20
|
63
|
/**
|
|
21
|
64
|
* 爬取单个章节内容
|
|
|
@@ -393,6 +436,19 @@ async function crawleWeb(outputDir = 'src/web_crawler', fetchContent = false, ma
|
|
393
|
436
|
await setTimeout(3000);
|
|
394
|
437
|
}
|
|
395
|
438
|
|
|
|
439
|
+ // 检查章节文件是否已存在
|
|
|
440
|
+ if (isChapterFileExists(contentDir, chapter)) {
|
|
|
441
|
+ console.log(`章节 ${chapter.title} 已存在,跳过爬取`);
|
|
|
442
|
+ // 加载已存在的章节内容
|
|
|
443
|
+ const existingContent = loadExistingChapterContent(contentDir, chapter);
|
|
|
444
|
+ if (existingContent) {
|
|
|
445
|
+ chapters[i].content = existingContent;
|
|
|
446
|
+ contentsObj[chapter.title] = existingContent;
|
|
|
447
|
+ console.log(`已加载章节 ${chapter.title} 的现有内容`);
|
|
|
448
|
+ }
|
|
|
449
|
+ continue;
|
|
|
450
|
+ }
|
|
|
451
|
+
|
|
396
|
452
|
// 检查章节URL是否有效
|
|
397
|
453
|
if (!chapter.url || !chapter.url.startsWith('http')) {
|
|
398
|
454
|
console.error(`章节 ${chapter.title} 的URL无效: ${chapter.url}`);
|
|
|
@@ -500,6 +556,19 @@ async function crawleWeb(outputDir = 'src/web_crawler', fetchContent = false, ma
|
|
500
|
556
|
const { index, chapter } = failedItem;
|
|
501
|
557
|
console.log(`重试章节: ${chapter.title}`);
|
|
502
|
558
|
|
|
|
559
|
+ // 检查章节文件是否已存在
|
|
|
560
|
+ if (isChapterFileExists(contentDir, chapter)) {
|
|
|
561
|
+ console.log(`章节 ${chapter.title} 已存在,跳过重试`);
|
|
|
562
|
+ // 加载已存在的章节内容
|
|
|
563
|
+ const existingContent = loadExistingChapterContent(contentDir, chapter);
|
|
|
564
|
+ if (existingContent) {
|
|
|
565
|
+ chapters[index].content = existingContent;
|
|
|
566
|
+ contentsObj[chapter.title] = existingContent;
|
|
|
567
|
+ console.log(`已加载章节 ${chapter.title} 的现有内容`);
|
|
|
568
|
+ }
|
|
|
569
|
+ continue;
|
|
|
570
|
+ }
|
|
|
571
|
+
|
|
503
|
572
|
// 检查章节URL是否有效
|
|
504
|
573
|
if (!chapter.url || !chapter.url.startsWith('http')) {
|
|
505
|
574
|
console.error(`重试章节 ${chapter.title} 的URL无效: ${chapter.url}`);
|