chengjie месяцев назад: 2
Родитель
Сommit
a0aefa52be
2 измененных файлов с 919 добавлено и 1 удалено
  1. 2 1
      .gitignore
  2. 917 0
      src/web_crawler/crawle_english.js

+ 2 - 1
.gitignore

@@ -12,4 +12,5 @@ src/web_crawler/Zhui-Xu/
12 12
 src/web_crawler/Hidden-Assassin/
13 13
 src/web_crawler/Release-that-Witch/
14 14
 src/web_crawler/Strange-Life-of-a-Cat/
15
-src/web_crawler/Throne-of-Magical-Arcana/
15
+src/web_crawler/Throne-of-Magical-Arcana/
16
+src/web_crawler/The-Legendary-Mechanic/

+ 917 - 0
src/web_crawler/crawle_english.js

@@ -0,0 +1,917 @@
1
+import fs from 'fs';
2
+import axios from 'axios';
3
+import * as cheerio from 'cheerio';
4
+import path from 'path';
5
+import { setTimeout } from 'timers/promises';
6
+import { fileURLToPath } from 'url';
7
+import JSZip from 'jszip';
8
+import { v4 as uuidv4 } from 'uuid';
9
+import { JSDOM } from 'jsdom';
10
+
11
+const __filename = fileURLToPath(import.meta.url);
12
+const __dirname = path.dirname(__filename);
13
+
14
+
15
+
16
/**
 * Downloads a single chapter page and extracts the HTML of its main content element.
 *
 * The chapter URL is resolved against `baseUrl` with the WHATWG URL parser, so
 * root-relative (`/s/x/1`), path-relative (`x/1`) and protocol-relative
 * (`//host/x`) links all resolve correctly. Absolute http(s) URLs are used as-is.
 * Empty or fragment-only URLs (such as the `'#'` placeholder used when a chapter
 * has no link) are rejected instead of silently fetching the site's home page.
 *
 * This function never rejects: every failure is reported through the `error` field.
 *
 * @param {string} baseUrl - Site base URL used to resolve relative chapter links.
 * @param {string} chapterUrl - Absolute or relative URL of the chapter page.
 * @param {object} headers - HTTP request headers passed to axios.
 * @returns {Promise<{content: string|null, error: string|null}>} Extracted inner HTML, or an error message.
 */
async function fetchChapterContent(baseUrl, chapterUrl, headers) {
    try {
        const rawUrl = typeof chapterUrl === 'string' ? chapterUrl.trim() : '';
        if (rawUrl === '' || rawUrl.startsWith('#')) {
            const errorMsg = `章节URL无效: ${chapterUrl}`;
            console.error(errorMsg);
            return { content: null, error: errorMsg };
        }

        // Absolute URLs are kept untouched; everything else is resolved against baseUrl.
        const fullUrl = /^https?:\/\//i.test(rawUrl) ? rawUrl : new URL(rawUrl, baseUrl).href;

        const response = await axios.get(fullUrl, { headers });

        if (response.status !== 200) {
            const errorMsg = `获取章节内容失败,状态码: ${response.status}`;
            console.error(errorMsg);
            return { content: null, error: errorMsg };
        }

        const $ = cheerio.load(response.data);

        // Known content containers, tried in order; the first one present on the page wins.
        const contentSelectors = [
            '.chapter-content',
            '.article-content',
            '.content',
            '#content',
            '.text-content',
            '.chapter-text',
            '.novel-content'
        ];

        let content = null;
        for (const selector of contentSelectors) {
            const element = $(selector);
            if (element.length > 0) {
                content = element.html();
                break;
            }
        }

        // Fallback: take the element holding the most text (more than 500 characters).
        if (!content) {
            let maxTextLength = 0;
            let maxTextElement = null;

            $('div, article, section, p').each((_, element) => {
                const textLength = $(element).text().trim().length;
                if (textLength > maxTextLength && textLength > 500) {
                    maxTextLength = textLength;
                    maxTextElement = element;
                }
            });

            if (maxTextElement) {
                content = $(maxTextElement).html();
            }
        }

        if (!content) {
            return { content: null, error: "未能找到章节内容" };
        }

        return { content, error: null };
    } catch (error) {
        const errorMsg = `爬取章节内容出错: ${error.message}`;
        console.error(errorMsg);
        return { content: null, error: errorMsg };
    }
}
88
+
89
/**
 * Builds the standalone HTML page that is written to disk for one chapter.
 *
 * @param {string} chapterTitle - Chapter title, used for <title> and <h1>.
 * @param {string} bodyHtml - Chapter content HTML as returned by the crawler.
 * @returns {string} Complete HTML document.
 */
function buildChapterHtmlPage(chapterTitle, bodyHtml) {
    return `<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>${chapterTitle}</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            line-height: 1.6;
            margin: 0 auto;
            max-width: 800px;
            padding: 20px;
        }
        h1 {
            text-align: center;
            margin-bottom: 30px;
        }
        p {
            text-indent: 2em;
            margin-bottom: 1em;
        }
    </style>
</head>
<body>
    <h1>${chapterTitle}</h1>
    ${bodyHtml}
</body>
</html>`;
}

/**
 * Writes one chapter as `<4-digit index>_<sanitized title>.html` inside `contentDir`.
 *
 * @param {string} contentDir - Directory receiving the chapter files.
 * @param {{title: string, index: number}} chapter - Chapter record.
 * @param {string} bodyHtml - Chapter content HTML.
 */
function saveChapterHtmlFile(contentDir, chapter, bodyHtml) {
    // Replace characters that are not allowed in file names.
    const safeTitle = chapter.title.replace(/[\\/:*?"<>|]/g, '_');
    const fileName = `${String(chapter.index).padStart(4, '0')}_${safeTitle}.html`;
    fs.writeFileSync(path.join(contentDir, fileName), buildChapterHtmlPage(chapter.title, bodyHtml));
}

/**
 * Extracts the chapter list from a loaded index page.
 *
 * Tries a list of known selectors in order and stops at the first one that
 * yields chapters; if none match, scans every element for chapter-like text.
 *
 * @param {Function} $ - Cheerio instance loaded with the index page.
 * @param {string} title - Novel slug, used to construct URLs for span-based lists.
 * @returns {Array<{title: string, url: string, index: number}>} Chapter records.
 */
function extractChapterList($, title) {
    const chapters = [];
    const spanSelector = 'span:contains("Chapter")';
    const selectors = [
        '.chapter-list li',
        '.book-catalog-list a',
        '.catalog-list li',
        '.chapter-item',
        '.chapter a',
        'ul.chapters li',
        '.book-chapters a',
        '.novel-chapters a',
        spanSelector
    ];

    for (const selector of selectors) {
        console.log(`尝试选择器: ${selector}`);
        const elements = $(selector);
        console.log(`找到 ${elements.length} 个元素`);

        if (elements.length === 0) {
            continue;
        }

        elements.each((index, element) => {
            const $element = $(element);
            let chapterTitle;
            let chapterUrl;

            if (selector === spanSelector) {
                chapterTitle = $element.text().trim();
                // Span entries carry no link, so the chapter URL is constructed from the number.
                chapterUrl = `/s/${title}/${chapterTitle.replace('Chapter ', '')}`;
            } else {
                // Decide by the matched element itself, not by the selector text:
                // every selector string contains the letter "a", so a substring
                // test cannot tell link selectors from container selectors.
                const $link = $element.is('a') ? $element : $element.find('a').first();
                chapterTitle = $link.text().trim() || $element.text().trim();
                chapterUrl = $link.attr('href');
            }

            if (chapterTitle) {
                chapters.push({
                    title: chapterTitle,
                    url: chapterUrl || '#',
                    index: index + 1
                });
            }
        });

        if (chapters.length > 0) {
            console.log(`使用选择器 ${selector} 成功找到章节`);
            break;
        }
    }

    // Last resort: any element whose whole text looks like a chapter heading.
    if (chapters.length === 0) {
        console.log('尝试查找所有可能的章节链接...');

        $('*').each((index, element) => {
            const text = $(element).text().trim();
            const mentionsChapter = text && (text.includes('Chapter') || text.includes('第') || text.includes('章'));
            if (!mentionsChapter) {
                return;
            }

            const looksLikeHeading = text.match(/^Chapter \d+$/)
                || text.match(/^第[一二三四五六七八九十百千万]+章/)
                || text.match(/^\d+\.\s+.+$/);
            if (looksLikeHeading) {
                chapters.push({
                    title: text,
                    url: '#',  // no link available, placeholder only
                    index: index + 1
                });
            }
        });
    }

    return chapters;
}

/**
 * Crawls the chapter index of a novel on novelhi.com and, optionally, every chapter's content.
 *
 * Files written under `outputDir`:
 *   - `<title>_chapters.json`               chapter list
 *   - `<title>_contents/NNNN_<title>.html`  one page per chapter (fetchContent only)
 *   - `<title>_failed_chapters.json`        chapters still failing, updated after each retry round
 *   - `<title>_final_failed_chapters.json`  chapters that failed after all retries
 *   - `<title>_all_contents.json`, `<title>_chapters_with_content.json`
 *
 * A `_final_failed_chapters.json` left over from an earlier run is removed when
 * the current run succeeds, so it cannot make a clean run look failed.
 *
 * @param {string} title - Novel slug as used in the site URL.
 * @param {string} outputDir - Output directory (created if missing).
 * @param {boolean} fetchContent - Whether to download every chapter's content.
 * @returns {Promise<{chapters: Array, failedChapters: Array, success: boolean, message: string}|{errcode: number, errStr: string}>}
 *          Result object, or `{errcode, errStr}` when the crawl itself threw.
 */
async function crawleWeb(title, outputDir = 'src/web_crawler', fetchContent = false) {
    const url = `https://novelhi.com/s/index/` + title;
    console.log(`正在爬取网址: ${url}`);

    try {
        // Browser-like headers.
        const headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0'
        };

        const response = await axios.get(url, { headers });
        console.log(`请求状态码: ${response.status}`);

        const $ = cheerio.load(response.data);
        const chapters = extractChapterList($, title);

        console.log(`共找到 ${chapters.length} 个章节`);

        // Make sure the target directory exists before the first write.
        fs.mkdirSync(outputDir, { recursive: true });

        const outputFilePath = path.join(outputDir, `${title}_chapters.json`);
        fs.writeFileSync(outputFilePath, JSON.stringify(chapters, null, 2));
        console.log(`已将章节目录保存到 ${outputFilePath} 文件`);

        // Print a preview: everything for short lists, otherwise first and last 10.
        if (chapters.length <= 20) {
            console.log("章节目录:");
            console.log(JSON.stringify(chapters, null, 2));
        } else {
            console.log("前10个章节:");
            console.log(JSON.stringify(chapters.slice(0, 10), null, 2));

            console.log("...");

            console.log("后10个章节:");
            console.log(JSON.stringify(chapters.slice(-10), null, 2));
        }

        let failedChapters = [];

        if (fetchContent && chapters.length > 0) {
            console.log(`开始爬取章节内容...`);

            const contentDir = path.join(outputDir, `${title}_contents`);
            fs.mkdirSync(contentDir, { recursive: true });

            // Map of chapter title -> content HTML, saved as one JSON file at the end.
            const contentsObj = {};
            const baseUrl = 'https://novelhi.com';

            console.log(`将爬取 ${chapters.length}/${chapters.length} 个章节的内容`);

            // First pass over every chapter.
            for (let i = 0; i < chapters.length; i++) {
                const chapter = chapters[i];

                // Pause after every 10 chapters to avoid hammering the site.
                if (i > 0 && i % 10 === 0) {
                    console.log(`已爬取 ${i}/${chapters.length} 章节,暂停 2 秒...`);
                    await setTimeout(2000);
                }

                const result = await fetchChapterContent(baseUrl, chapter.url, headers);

                if (result.content) {
                    chapter.content = result.content;
                    contentsObj[chapter.title] = result.content;
                    saveChapterHtmlFile(contentDir, chapter, result.content);

                    if (i % 10 === 0 || i === chapters.length - 1) {
                        console.log(`已保存 ${i + 1}/${chapters.length} 章节`);
                    }
                } else {
                    console.error(`获取章节 ${chapter.title} 内容失败: ${result.error}`);
                    failedChapters.push({ index: i, chapter, error: result.error });
                }
            }

            // Retry failed chapters, at most 3 rounds.
            if (failedChapters.length > 0) {
                console.log(`首次爬取完成,有 ${failedChapters.length} 个章节失败,开始重试...`);

                const failedChaptersPath = path.join(outputDir, `${title}_failed_chapters.json`);
                fs.writeFileSync(failedChaptersPath, JSON.stringify(failedChapters, null, 2));
                console.log(`已将失败章节信息保存到 ${failedChaptersPath}`);

                for (let retry = 0; retry < 3 && failedChapters.length > 0; retry++) {
                    console.log(`第 ${retry + 1} 次重试,剩余 ${failedChapters.length} 个失败章节`);

                    // Back off before each retry round.
                    await setTimeout(5000);

                    const stillFailedChapters = [];

                    for (const { index, chapter } of failedChapters) {
                        console.log(`重试章节: ${chapter.title}`);

                        const result = await fetchChapterContent(baseUrl, chapter.url, headers);

                        if (result.content) {
                            chapters[index].content = result.content;
                            contentsObj[chapter.title] = result.content;
                            saveChapterHtmlFile(contentDir, chapter, result.content);
                            console.log(`重试成功: ${chapter.title}`);
                        } else {
                            console.error(`重试失败: ${chapter.title}, 错误: ${result.error}`);
                            stillFailedChapters.push({ index, chapter, error: result.error });
                        }

                        // Short pause between individual retries.
                        await setTimeout(2000);
                    }

                    failedChapters = stillFailedChapters;

                    fs.writeFileSync(failedChaptersPath, JSON.stringify(failedChapters, null, 2));
                    console.log(`第 ${retry + 1} 次重试后,还有 ${failedChapters.length} 个章节失败`);
                }
            }

            // Record the final outcome; drop any stale failure record from a previous run.
            const finalFailedPath = path.join(outputDir, `${title}_final_failed_chapters.json`);
            if (failedChapters.length > 0) {
                console.warn(`警告: 经过多次重试后,仍有 ${failedChapters.length} 个章节未能成功爬取`);
                fs.writeFileSync(finalFailedPath, JSON.stringify(failedChapters, null, 2));
                console.log(`已将最终失败章节信息保存到 ${finalFailedPath}`);
            } else {
                fs.rmSync(finalFailedPath, { force: true });
                console.log(`所有章节都已成功爬取!`);
            }

            const allContentsPath = path.join(outputDir, `${title}_all_contents.json`);
            fs.writeFileSync(allContentsPath, JSON.stringify(contentsObj, null, 2));
            console.log(`已将所有章节内容保存到 ${allContentsPath} 文件`);

            const chaptersWithContentPath = path.join(outputDir, `${title}_chapters_with_content.json`);
            fs.writeFileSync(chaptersWithContentPath, JSON.stringify(chapters, null, 2));
            console.log(`已将包含内容的章节目录保存到 ${chaptersWithContentPath} 文件`);
        }

        // Success is decided from this run's in-memory state, not from files on disk.
        if (failedChapters.length > 0) {
            return {
                chapters,
                failedChapters,
                success: false,
                message: `有 ${failedChapters.length} 个章节未能成功爬取`
            };
        }

        return {
            chapters,
            failedChapters: [],
            success: true,
            message: "所有章节爬取成功"
        };
    } catch (err) {
        console.error("爬取过程中出错:", err.message);
        return { errcode: 101, errStr: err.message };
    } finally {
        console.log("完成");
    }
}
445
+
446
+
447
/**
 * Merges the per-chapter HTML files of a directory into one plain-text file.
 *
 * Files are ordered by the numeric prefix before the first underscore
 * (`0001_Title.html`); files without a numeric prefix sort last, by name.
 * For each file the content of `<div id="showReading">` is taken (up to the
 * first closing `</div>`), tags are stripped, `<br>` variants become newlines,
 * and the chapter is appended as `title`, blank line, text, separator line.
 * Files without a `showReading` div are skipped.
 *
 * @param {string} sourceDir - Directory containing the chapter `.html` files.
 * @param {string} outputFile - Path of the merged text file to write.
 * @returns {Promise<void>} Resolves without writing anything when no `.html` file exists.
 * @throws Re-throws any file-system error after logging it, so callers can abort.
 */
async function mergeChapterFiles(sourceDir, outputFile) {
    const SEPARATOR = '\n******************\n';
    const { readdir, readFile, writeFile } = fs.promises;

    // Numeric prefix of a chapter file name; non-numeric prefixes sort last.
    const orderOf = (file) => {
        const prefix = Number.parseInt(file.split('_')[0], 10);
        return Number.isNaN(prefix) ? Number.POSITIVE_INFINITY : prefix;
    };

    try {
        const files = (await readdir(sourceDir))
            .filter((file) => file.endsWith('.html'))
            .sort((a, b) => {
                const orderA = orderOf(a);
                const orderB = orderOf(b);
                if (orderA !== orderB) {
                    return orderA < orderB ? -1 : 1;
                }
                return a.localeCompare(b);
            });

        if (files.length === 0) {
            console.error('未找到任何章节文件');
            return;
        }

        let mergedContent = '';
        let mergedCount = 0;

        for (const file of files) {
            const html = await readFile(path.join(sourceDir, file), 'utf-8');

            // Title is everything after the first underscore, so titles containing "." survive.
            const titleMatch = file.match(/^[^_]*_(.+)\.html$/);
            const title = titleMatch ? titleMatch[1] : file;

            const contentMatch = html.match(/<div id="showReading"[^>]*>([\s\S]*?)<\/div>/);
            if (!contentMatch) {
                console.warn(`跳过(未找到正文): ${file}`);
                continue;
            }

            const content = contentMatch[1]
                .replace(/<sent[^>]*>/g, '')
                .replace(/<\/sent>/g, '')
                // Accept <br>, <br/> and <br /> so line breaks are not lost when tags are stripped.
                .replace(/<br\s*\/?>/gi, '\n')
                .replace(/<[^>]+>/g, '')
                .replace(/\(adsbygoogle\s*=\s*window\.adsbygoogle\s*\|\|\s*\[\]\)\.push\(\{\}\);/g, '')
                .replace(/\n{3,}/g, '\n\n');

            mergedContent += `${title}\n\n${content.trim()}${SEPARATOR}`;
            mergedCount++;
            console.log(`已处理: ${file}`);
        }

        await writeFile(outputFile, mergedContent);
        console.log(`\n合并完成! 结果已保存到: ${outputFile}`);
        console.log(`共合并了 ${mergedCount} 个章节`);
    } catch (error) {
        console.error('合并章节时出错:', error);
        // Propagate so the caller does not continue with a missing or partial file.
        throw error;
    }
}
502
+
503
/**
 * Converts raw chapter text into simple HTML.
 *
 * Every occurrence of "Chapter <number>" is wrapped in <h2>, each blank-line
 * boundary ("\n\n") becomes a paragraph break, and the whole result is wrapped
 * in a single outer <p>…</p> pair.
 *
 * @param {string} content - Raw text content.
 * @returns {Promise<string>} The formatted HTML string.
 */
async function formatChapterContent(content) {
    // "$&" inserts the matched heading text itself.
    const withHeadings = content.replace(/Chapter \d+/g, '<h2>$&</h2>');
    const pieces = withHeadings.split('\n\n');
    return `<p>${pieces.join('</p><p>')}</p>`;
}
518
+
519
/**
 * Converts a merged plain-text novel into simple HTML: one <h2> per chapter
 * heading and one <p> per blank-line-separated paragraph.
 *
 * A chapter heading is a line that starts with "Chapter <number>"; the heading
 * line is kept verbatim, so real chapter numbers and subtitles are preserved
 * and in-text mentions such as "see Chapter 9" do not start a new chapter.
 * Text before the first heading is emitted as paragraphs without a heading.
 * If the file contains no heading at all, the whole text is emitted under a
 * single "Chapter 1" heading.
 *
 * @param {string} filePath - Path of the text file to read.
 * @param {string} outputPath - Path of the HTML file to write.
 * @returns {Promise<void>}
 * @throws Re-throws any file-system error after logging it, so callers can abort.
 */
async function formatTextFile(filePath, outputPath) {
    try {
        const content = await fs.promises.readFile(filePath, 'utf-8');
        console.log(`已读取文件: ${filePath}`);

        // Heading lines only: anchored to the start of a line.
        const headings = [...content.matchAll(/^Chapter \d+.*$/gm)];

        // Each section is an optional heading plus the raw text that follows it.
        const sections = [];
        if (headings.length === 0) {
            if (content.trim()) {
                sections.push({ heading: 'Chapter 1', body: content });
            }
        } else {
            const preamble = content.slice(0, headings[0].index);
            if (preamble.trim()) {
                sections.push({ heading: null, body: preamble });
            }
            headings.forEach((match, i) => {
                const bodyStart = match.index + match[0].length;
                const bodyEnd = i + 1 < headings.length ? headings[i + 1].index : content.length;
                sections.push({ heading: match[0].trim(), body: content.slice(bodyStart, bodyEnd) });
            });
        }

        const chapterCount = sections.filter((section) => section.heading !== null).length;
        console.log(`检测到 ${chapterCount} 个章节内容块`);

        let formattedContent = '';
        for (const { heading, body } of sections) {
            if (heading !== null) {
                formattedContent += `<h2>${heading}</h2>\n`;
            }

            // Paragraphs are separated by at least one blank line.
            for (const paragraph of body.trim().split(/\n\s*\n/)) {
                const text = paragraph.trim();
                if (text) {
                    formattedContent += `<p>${text}</p>\n`;
                }
            }
        }

        await fs.promises.writeFile(outputPath, formattedContent);
        console.log(`格式化完成! 结果已保存到: ${outputPath}`);
        console.log(`共处理了 ${chapterCount} 个章节`);
    } catch (error) {
        console.error('处理文本文件时出错:', error);
        // Propagate so the caller does not continue with a missing output file.
        throw error;
    }
}
563
/**
 * Generates an EPUB 2 e-book from a single HTML file.
 *
 * The input HTML is split into chapters at every <h2>; the <p> siblings that
 * follow a heading (up to the next <h2>) form that chapter's body. The archive
 * contains a cover page, an HTML table of contents, one XHTML file per chapter,
 * the stylesheet `epub_styles.css` read from this script's directory, `toc.ncx`
 * and `content.opf`.
 *
 * Titles and author are XML-escaped before being written into the XHTML, NCX
 * and OPF documents, and every spine entry is emitted exactly once.
 *
 * @param {string} contentFilePath - Path of the HTML content file.
 * @param {string} coverImagePath - Path of the cover image (stored as Images/cover.jpg).
 * @param {string} outputPath - Path of the EPUB file to write.
 * @param {string} bookTitle - Book title.
 * @param {string} author - Author name.
 * @returns {Promise<void>}
 * @throws Re-throws any error after logging it, so callers do not report success on failure.
 */
async function generateEpub(contentFilePath, coverImagePath, outputPath, bookTitle, author) {
    // Escapes text for use in XML element content and attribute values.
    const escapeXml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;');

    try {
        const htmlContent = fs.readFileSync(contentFilePath, 'utf-8');
        const dom = new JSDOM(htmlContent);
        const document = dom.window.document;

        // Collect chapters: each <h2> plus the <p> siblings up to the next <h2>.
        const chapters = [];
        document.querySelectorAll('h2').forEach((chapterElement) => {
            const title = chapterElement.textContent.trim();
            let content = '';

            let currentElement = chapterElement.nextElementSibling;
            while (currentElement && currentElement.tagName.toLowerCase() !== 'h2') {
                if (currentElement.tagName.toLowerCase() === 'p') {
                    content += `<p>${currentElement.innerHTML}</p>\n`;
                }
                currentElement = currentElement.nextElementSibling;
            }

            chapters.push({ title, content });
        });

        const coverImage = fs.readFileSync(coverImagePath);

        const zip = new JSZip();

        // The mimetype entry must be the first file in the archive and stored uncompressed.
        zip.file('mimetype', 'application/epub+zip', { compression: 'STORE' });

        const metaInf = zip.folder('META-INF');
        metaInf.file('container.xml', `<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
    <rootfiles>
        <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
    </rootfiles>
</container>`);

        const oebps = zip.folder('OEBPS');
        oebps.file('Images/cover.jpg', coverImage);

        const coverXhtml = `<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
    <title>封面</title>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
    <meta name="calibre:cover" content="true"/>
    <link rel="stylesheet" type="text/css" href="../Styles/stylesheet.css"/>
    <style type="text/css">
        body {
            margin: 0;
            padding: 0;
            text-align: center;
        }
        img {
            max-width: 100%;
            height: auto;
            margin: 0;
            padding: 0;
        }
    </style>
</head>
<body>
    <div class="cover-container">
        <img src="../Images/cover.jpg" alt="封面"/>
    </div>
</body>
</html>`;

        const textFolder = oebps.folder('Text');
        textFolder.file('cover.xhtml', coverXhtml);

        // Reading order: cover, table of contents, then the chapters.
        const spineItems = [
            { idref: 'cover' },
            { idref: 'toc' }
        ];
        const manifestItems = [
            { id: 'cover', href: 'Text/cover.xhtml', mediaType: 'application/xhtml+xml' },
            { id: 'cover-image', href: 'Images/cover.jpg', mediaType: 'image/jpeg' },
            { id: 'ncx', href: 'toc.ncx', mediaType: 'application/x-dtbncx+xml' }
        ];

        chapters.forEach((chapter, index) => {
            const safeTitle = escapeXml(chapter.title);
            const chapterId = `chapter_${index}`;
            const chapterFileName = `${chapterId}.xhtml`;

            const chapterHtml = `<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
    <title>${safeTitle}</title>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
    <link rel="stylesheet" type="text/css" href="../Styles/stylesheet.css"/>
</head>
<body>
    <h2 class="chapter-title">${safeTitle}</h2>
    <div class="chapter-content">
        ${chapter.content}
    </div>
</body>
</html>`;

            textFolder.file(chapterFileName, chapterHtml);
            manifestItems.push({ id: chapterId, href: `Text/${chapterFileName}`, mediaType: 'application/xhtml+xml' });
            spineItems.push({ idref: chapterId });
        });

        // HTML table of contents.
        const tocEntries = chapters
            .map((chapter, index) => `<li class="toc-item"><a href="chapter_${index}.xhtml">${escapeXml(chapter.title)}</a></li>`)
            .join('\n');
        const tocHtml = `<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
    <title>目录</title>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
    <link rel="stylesheet" type="text/css" href="../Styles/stylesheet.css"/>
</head>
<body>
    <h2 class="toc-title">目录</h2>
    <ol class="toc-list">
        ${tocEntries}
    </ol>
</body>
</html>`;

        textFolder.file('toc.xhtml', tocHtml);
        manifestItems.push({ id: 'toc', href: 'Text/toc.xhtml', mediaType: 'application/xhtml+xml' });

        // Stylesheet shipped next to this script.
        const cssFolder = oebps.folder('Styles');
        const csspath = path.join(__dirname, 'epub_styles.css');
        console.log("🚀 ~ generateEpub ~ csspath:", csspath)

        const cssContent = fs.readFileSync(csspath, 'utf-8');
        cssFolder.file('stylesheet.css', cssContent);
        manifestItems.push({ id: 'stylesheet', href: 'Styles/stylesheet.css', mediaType: 'text/css' });

        // One identifier shared by toc.ncx (dtb:uid) and content.opf (dc:identifier).
        const bookUUID = uuidv4();

        // NCX navigation: cover and table of contents take navpoints 0 and 1, chapters follow.
        const chapterNavPoints = chapters
            .map((chapter, index) => `<navPoint id="navpoint-${index + 2}" playOrder="${index + 2}">
      <navLabel>
        <text>${escapeXml(chapter.title)}</text>
      </navLabel>
      <content src="Text/chapter_${index}.xhtml"/>
    </navPoint>`)
            .join('\n');
        const tocNcx = `<?xml version='1.0' encoding='utf-8'?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="zh-CN">
  <head>
    <meta content="${bookUUID}" name="dtb:uid"/>
    <meta content="2" name="dtb:depth"/>
    <meta content="0" name="dtb:totalPageCount"/>
    <meta content="0" name="dtb:maxPageNumber"/>
  </head>
  <docTitle>
    <text>${escapeXml(bookTitle)}</text>
  </docTitle>
  <navMap>
    <navPoint id="navpoint-0" playOrder="0">
      <navLabel>
        <text>封面</text>
      </navLabel>
      <content src="Text/cover.xhtml"/>
    </navPoint>
    <navPoint id="navpoint-1" playOrder="1">
      <navLabel>
        <text>目录</text>
      </navLabel>
      <content src="Text/toc.xhtml"/>
    </navPoint>
    ${chapterNavPoints}
  </navMap>
</ncx>`;

        oebps.file('toc.ncx', tocNcx);

        // Package document: metadata, manifest of every file, and the spine (reading order).
        const manifestXml = manifestItems
            .map((item) => `<item id="${item.id}" href="${item.href}" media-type="${item.mediaType}"/>`)
            .join('\n');
        const spineXml = spineItems
            .map((item) => `<itemref idref="${item.idref}"/>`)
            .join('\n');
        const contentOpf = `<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="2.0" unique-identifier="uuid_id">
    <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
        <dc:identifier id="uuid_id">urn:uuid:${bookUUID}</dc:identifier>
        <dc:title>${escapeXml(bookTitle)}</dc:title>
        <dc:creator>${escapeXml(author)}</dc:creator>
        <dc:language>zh-CN</dc:language>
        <dc:date>${new Date().toISOString().split('T')[0]}</dc:date>
        <meta name="cover" content="cover-image"/>
    </metadata>
    <manifest>
        ${manifestXml}
    </manifest>
    <spine toc="ncx">
        ${spineXml}
    </spine>
    <guide>
        <reference type="cover" title="封面" href="Text/cover.xhtml"/>
    </guide>
</package>`;

        oebps.file('content.opf', contentOpf);

        const epubContent = await zip.generateAsync({
            type: 'nodebuffer',
            compression: 'DEFLATE',
            mimeType: 'application/epub+zip'
        });
        fs.writeFileSync(outputPath, epubContent);

        console.log(`EPUB 电子书已生成: ${outputPath}`);
    } catch (error) {
        console.error('生成 EPUB 电子书时出错:', error);
        // Propagate so the caller does not announce success for a file that was never written.
        throw error;
    }
}
805
+
806
+
807
+
808
+
809
// ---------------------------------------------------------------------------
// Script entry point: crawl the novel, merge the chapters into one text file,
// convert it to HTML and package the result as an EPUB.
// Every step aborts the process with exit code 1 on failure.
// ---------------------------------------------------------------------------
const title = "The-Legendary-Mechanic";
const author = "Qi Peijia";
const coverName = "cover.jpg";

// All artifacts live under <script directory>/<title>, independent of the
// current working directory, so the crawl output and the later merge/EPUB
// steps always look at the same place.
const outputDir = path.join(__dirname, title);
const sourceDir = path.join(outputDir, `${title}_contents`);
const outputFile = path.join(outputDir, `${title}.txt`);
const outputFile2 = path.join(outputDir, `${title}.html`);
let coverFile = path.join(outputDir, coverName);
const epubFile = path.join(outputDir, `${title}.epub`);

console.log(`开始爬取小说: ${title}`);
console.log(`输出目录: ${outputDir}`);

// Make sure the output directory exists.
if (!fs.existsSync(outputDir)) {
    fs.mkdirSync(outputDir, { recursive: true });
}

// Step 1: crawl the chapter list and every chapter's content.
let crawlResult;
try {
    crawlResult = await crawleWeb(title, outputDir, true);

    if (crawlResult && crawlResult.errcode) {
        console.error(`爬取失败: ${crawlResult.errStr}`);
        process.exit(1);
    } else if (crawlResult && !crawlResult.success) {
        console.error(`爬取完成但存在问题: ${crawlResult.message}`);
        console.error(`有 ${crawlResult.failedChapters.length} 个章节未能成功爬取`);
        process.exit(1);
    } else {
        console.log("爬取任务完成!所有章节爬取成功");
    }
} catch (err) {
    console.error("程序执行出错:", err);
    process.exit(1);
}

// Refuse to build the e-book when a failed-chapter record exists and is non-empty.
const finalFailedPath = path.join(outputDir, `${title}_final_failed_chapters.json`);
if (fs.existsSync(finalFailedPath)) {
    try {
        const failedChapters = JSON.parse(fs.readFileSync(finalFailedPath, 'utf-8'));
        if (failedChapters && failedChapters.length > 0) {
            console.error(`警告: 有 ${failedChapters.length} 个章节未能成功爬取,不进行电子书生成`);
            console.error('失败的章节:');
            failedChapters.forEach(item => {
                console.error(`- 章节 ${item.chapter.title} (索引: ${item.index}): ${item.error}`);
            });
            process.exit(1);
        }
    } catch (err) {
        // An unreadable record is reported but does not block the build.
        console.error("读取失败章节文件出错:", err);
    }
}

// The chapter directory must exist...
if (!fs.existsSync(sourceDir)) {
    console.error(`错误: 内容目录 ${sourceDir} 不存在,无法生成电子书`);
    process.exit(1);
}

// ...and contain at least one chapter file.
const contentFiles = fs.readdirSync(sourceDir).filter(file => file.endsWith('.html'));
if (contentFiles.length === 0) {
    console.error(`错误: 内容目录 ${sourceDir} 中没有HTML文件,无法生成电子书`);
    process.exit(1);
}

console.log(`开始生成电子书,共有 ${contentFiles.length} 个章节文件`);

// Step 2: merge the chapter files into one text file.
try {
    await mergeChapterFiles(sourceDir, outputFile);
    console.log('合并操作完成');
} catch (err) {
    console.error('合并操作失败:', err);
    process.exit(1);
}

// Step 3: convert the merged text into HTML with <h2>/<p> markup.
try {
    await formatTextFile(outputFile, outputFile2);
} catch (err) {
    console.error('格式化文本文件失败:', err);
    process.exit(1);
}

// Fall back to the shared default cover when the book has no cover of its own.
if (!fs.existsSync(coverFile)) {
    console.warn(`警告: 封面文件 ${coverFile} 不存在,将使用默认封面`);
    const defaultCoverPath = path.join(__dirname, 'default_cover.jpg');
    if (fs.existsSync(defaultCoverPath)) {
        coverFile = defaultCoverPath;
    } else {
        console.error('错误: 默认封面文件也不存在,无法生成电子书');
        process.exit(1);
    }
}

// Step 4: build the EPUB.
try {
    await generateEpub(outputFile2, coverFile, epubFile, title, author);
    console.log(`电子书生成成功: ${epubFile}`);
} catch (err) {
    console.error('生成EPUB电子书失败:', err);
    process.exit(1);
}